├── .python-version
├── src
│   └── swe_care
│       ├── __init__.py
│       ├── collect
│       │   ├── __init__.py
│       │   ├── get_top_repos.py
│       │   └── __main__.py
│       ├── harness
│       │   ├── __init__.py
│       │   ├── evaluators
│       │   │   ├── utils.py
│       │   │   ├── __init__.py
│       │   │   └── repo_level.py
│       │   ├── __main__.py
│       │   └── code_review_eval.py
│       ├── schema
│       │   ├── __init__.py
│       │   ├── inference.py
│       │   ├── evaluation.py
│       │   ├── collect.py
│       │   └── dataset.py
│       ├── utils
│       │   ├── __init__.py
│       │   ├── github_graphql
│       │   │   ├── GetLabels.graphql
│       │   │   ├── GetIssueLabels.graphql
│       │   │   ├── GetThreadComments.graphql
│       │   │   ├── GetIssueComments.graphql
│       │   │   ├── GetClosingIssues.graphql
│       │   │   ├── GetCommits.graphql
│       │   │   ├── GetReviewThreads.graphql
│       │   │   ├── GetReviewComments.graphql
│       │   │   ├── GetReviews.graphql
│       │   │   ├── __init__.py
│       │   │   ├── GetSpecificPullRequest.graphql
│       │   │   └── GetMergedPullRequests.graphql
│       │   ├── prompt_loader.py
│       │   ├── read_file.py
│       │   ├── patch.py
│       │   ├── file_source_retrieval.py
│       │   ├── load.py
│       │   ├── llm_models
│       │   │   ├── __init__.py
│       │   │   └── clients.py
│       │   ├── estimate.py
│       │   └── github.py
│       ├── inference
│       │   ├── __init__.py
│       │   └── __main__.py
│       └── templates
│           ├── rm_sample.yaml
│           ├── questionnaire
│           │   ├── llm_respondent.yaml
│           │   └── questionnaire.md.j2
│           ├── classify_review_effort.yaml
│           ├── classify_problem_domain.yaml
│           ├── repo_level_evaluation.yaml
│           ├── estimate_difficulty.yaml
│           ├── classify_relevant_review_comment.yaml
│           ├── code_review_text_prompt.yaml
│           └── code_review_llm_evaluation.yaml
├── .pre-commit-config.yaml
├── LEGAL.md
├── AGENTS.md
├── pyproject.toml
├── .gitignore
├── CLAUDE.md
├── scripts
│   └── README.md
├── LICENSE
└── docs
    └── questionnaire-demo.md

/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 | 
--------------------------------------------------------------------------------
/src/swe_care/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/swe_care/collect/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/swe_care/harness/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/swe_care/schema/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/swe_care/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/src/swe_care/inference/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 |   - repo: https://github.com/astral-sh/ruff-pre-commit
3 |     # Ruff version.
4 |     rev: v0.12.0
5 |     hooks:
6 |       # Run the linter.
7 |       - id: ruff-check
8 |         args: [--fix]
9 |       # Run the formatter.
10 | - id: ruff-format 11 | -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetLabels.graphql: -------------------------------------------------------------------------------- 1 | query GetLabels($prId: ID!, $cursor: String) { 2 | node(id: $prId) { 3 | ... on PullRequest { 4 | labels(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | name 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetIssueLabels.graphql: -------------------------------------------------------------------------------- 1 | query GetIssueLabels($issueId: ID!, $cursor: String) { 2 | node(id: $issueId) { 3 | ... on Issue { 4 | labels(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | name 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | Legal Disclaimer 2 | 3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 4 | 5 | 法律免责声明 6 | 7 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetThreadComments.graphql: -------------------------------------------------------------------------------- 1 | query GetThreadComments($threadId: ID!, $cursor: String) { 2 | node(id: $threadId) { 3 | ... on PullRequestReviewThread { 4 | comments(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | id 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/swe_care/schema/inference.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from dataclasses_json import dataclass_json 4 | 5 | from swe_care.schema.dataset import CodeReviewTaskInstance 6 | 7 | 8 | @dataclass_json 9 | @dataclass 10 | class CodeReviewInferenceInstance(CodeReviewTaskInstance): 11 | """Schema for code review inference instances.""" 12 | 13 | text: str 14 | """The input text including instructions, the "Oracle"/RAG retrieved file, and an example of the patch format for output.""" 15 | -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetIssueComments.graphql: -------------------------------------------------------------------------------- 1 | query GetIssueComments($issueId: ID!, $cursor: String) { 2 | node(id: $issueId) { 3 | ... 
on Issue { 4 | comments(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | id 12 | author { 13 | login 14 | } 15 | body 16 | createdAt 17 | updatedAt 18 | } 19 | } 20 | } 21 | } 22 | } -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetClosingIssues.graphql: -------------------------------------------------------------------------------- 1 | query GetClosingIssues($prId: ID!, $cursor: String) { 2 | node(id: $prId) { 3 | ... on PullRequest { 4 | closingIssuesReferences(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | id 12 | number 13 | url 14 | title 15 | body 16 | state 17 | labels(first: 10) { 18 | totalCount 19 | pageInfo { 20 | hasNextPage 21 | endCursor 22 | } 23 | nodes { 24 | name 25 | } 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /src/swe_care/templates/rm_sample.yaml: -------------------------------------------------------------------------------- 1 | text: |- 2 | {% if relevant_files -%} 3 | 4 | {% for file_path, file_content in relevant_files.items() -%} 5 | [start of {{ file_path }}] 6 | {% if add_line_numbers -%} 7 | {% if file_content -%} 8 | {% for line in file_content.split('\n') -%} 9 | {{ "{:4d}".format(loop.index) }} {{ line }} 10 | {% endfor -%} 11 | {% endif -%} 12 | {% else -%} 13 | {{ file_content }} 14 | {% endif -%} 15 | [end of {{ file_path }}] 16 | {% endfor -%} 17 | 18 | {% endif -%} 19 | {% if diff_hunk -%} 20 | 21 | {{ diff_hunk }} 22 | 23 | {% endif -%} 24 | {{ path }} 25 | {{ line }} 26 | 27 | {{ review_comment }} 28 | 29 | -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetCommits.graphql: -------------------------------------------------------------------------------- 1 | query GetCommits($prId: ID!, $cursor: String) { 2 | node(id: $prId) { 3 | ... on PullRequest { 4 | commits(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | commit { 12 | oid 13 | message 14 | changedFilesIfAvailable 15 | authoredDate 16 | author { 17 | user { 18 | login 19 | } 20 | } 21 | committedDate 22 | committer { 23 | user { 24 | login 25 | } 26 | } 27 | parents(first: 2) { 28 | nodes { 29 | oid 30 | } 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetReviewThreads.graphql: -------------------------------------------------------------------------------- 1 | query GetReviewThreads($prId: ID!, $cursor: String) { 2 | node(id: $prId) { 3 | ... 
on PullRequest { 4 | reviewThreads(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | id 12 | isResolved 13 | isOutdated 14 | isCollapsed 15 | path 16 | startLine 17 | originalLine 18 | diffSide 19 | startDiffSide 20 | resolvedBy { 21 | login 22 | } 23 | comments(first: 10) { 24 | totalCount 25 | pageInfo { 26 | hasNextPage 27 | endCursor 28 | } 29 | nodes { 30 | id 31 | } 32 | } 33 | } 34 | } 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetReviewComments.graphql: -------------------------------------------------------------------------------- 1 | query GetReviewComments($reviewId: ID!, $cursor: String) { 2 | node(id: $reviewId) { 3 | ... on PullRequestReview { 4 | comments(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | id 12 | author { 13 | login 14 | } 15 | body 16 | createdAt 17 | updatedAt 18 | path 19 | diffHunk 20 | line 21 | startLine 22 | originalLine 23 | originalStartLine 24 | replyTo { 25 | id 26 | } 27 | isMinimized, 28 | minimizedReason, 29 | commit { 30 | oid 31 | } 32 | originalCommit { 33 | oid 34 | } 35 | } 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /src/swe_care/schema/evaluation.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | from dataclasses_json import dataclass_json 5 | 6 | 7 | @dataclass_json 8 | @dataclass 9 | class CodeReviewPrediction: 10 | """Schema for code review prediction instances.""" 11 | 12 | instance_id: str 13 | """The instance ID of the code review task""" 14 | review_text: str 15 | """The prediction of the code review""" 16 | review_trajectory: list[str] | None = None 17 | """The trajectory of the code review, including the intermediate steps""" 18 | 19 | 20 | @dataclass_json 21 | @dataclass 22 | class EvaluatorResult: 23 | """Schema for evaluator result instances.""" 24 | 25 | evaluator: str 26 | """The type of the evaluator""" 27 | evaluation: dict[str, Any] 28 | """The evaluation of the code review""" 29 | 30 | 31 | @dataclass_json 32 | @dataclass 33 | class CodeReviewEvaluationResult: 34 | """Schema for code review evaluation instances.""" 35 | 36 | instance_id: str 37 | """The instance ID of the code review task""" 38 | score: float 39 | """The score of the code review""" 40 | evaluations: list[EvaluatorResult] 41 | """The evaluation of the code review""" 42 | -------------------------------------------------------------------------------- /src/swe_care/templates/questionnaire/llm_respondent.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | You are an expert code reviewer with deep expertise in software engineering, code quality assessment, and security best practices. You will evaluate a code review task by completing a structured questionnaire. 3 | 4 | Your role is to: 5 | 1. Thoroughly analyze the provided code review context 6 | 2. Make objective, evidence-based assessments 7 | 3. Complete the questionnaire with precision and detail 8 | 4. 
Focus on technical accuracy rather than speculation 9 | 10 | Key principles: 11 | - Be thorough but concise in your explanations 12 | - Reference specific files, functions, and line numbers when discussing code 13 | - Consider both the immediate changes and their broader impact 14 | - Evaluate security, performance, maintainability, and correctness aspects 15 | - Be critical but fair in your assessments 16 | - For questions with multiple choice options, select the most appropriate answer 17 | - For text fields, provide clear and concise explanations with specific references to the code when relevant 18 | 19 | user: |- 20 | Here is the context for the code review task: 21 | 22 | {{ context }} 23 | 24 | Now, please complete {{ section_title }} of the questionnaire. 25 | -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetReviews.graphql: -------------------------------------------------------------------------------- 1 | query GetReviews($prId: ID!, $cursor: String) { 2 | node(id: $prId) { 3 | ... on PullRequest { 4 | reviews(first: 100, after: $cursor) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | id 12 | author { 13 | login 14 | } 15 | state 16 | body 17 | submittedAt 18 | updatedAt 19 | commit { 20 | oid 21 | } 22 | comments(first: 100) { 23 | totalCount 24 | pageInfo { 25 | hasNextPage 26 | endCursor 27 | } 28 | nodes { 29 | id 30 | author { 31 | login 32 | } 33 | body 34 | createdAt 35 | updatedAt 36 | path 37 | diffHunk 38 | line 39 | startLine 40 | originalLine 41 | originalStartLine 42 | replyTo { 43 | id 44 | } 45 | isMinimized, 46 | minimizedReason, 47 | commit { 48 | oid 49 | } 50 | originalCommit { 51 | oid 52 | } 53 | } 54 | } 55 | } 56 | } 57 | } 58 | } 59 | } -------------------------------------------------------------------------------- /src/swe_care/templates/classify_review_effort.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | You are an experienced software engineer responsible for estimating code review effort. Your task is to estimate how much effort would be required to review a code change on a scale of 1 to 5, where: 3 | 4 | 1 = Very Low Effort: Simple changes like typo fixes, minor documentation updates, or trivial formatting changes 5 | 2 = Low Effort: Small bug fixes, minor feature additions, or straightforward code changes affecting a few lines 6 | 3 = Medium Effort: Moderate complexity changes involving multiple files, standard feature implementations, or routine refactoring 7 | 4 = High Effort: Complex changes affecting multiple components, significant new features, or architectural modifications requiring careful review 8 | 5 = Very High Effort: Major architectural changes, complex algorithms, security-critical modifications, or changes requiring domain expertise 9 | 10 | Consider factors like: 11 | - Size and scope of the change 12 | - Complexity of the code modifications 13 | - Number of files affected 14 | - Potential impact on system behavior 15 | - Risk level of the changes 16 | 17 | Please respond with ONLY a single number from 1 to 5. 
18 | 19 | user: |- 20 | Please estimate the review effort (1-5) for the following code change: 21 | 22 | **Pull Request Title:** 23 | {{ title }} 24 | 25 | **Pull Request Description:** 26 | {{ body }} 27 | 28 | **Commit Message:** 29 | {{ commit_message }} 30 | 31 | **Code Changes (Patch):** 32 | {{ patch }} -------------------------------------------------------------------------------- /src/swe_care/templates/classify_problem_domain.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | You are an expert software engineer responsible for classifying software development tasks. Your task is to analyze a problem statement and classify it into exactly one of the following categories: 3 | 4 | 1. Bug Fixes: Resolving functional errors, crashes, incorrect outputs 5 | 2. New Feature Additions: Adding new functionality or features to the application 6 | 3. Code Refactoring / Architectural Improvement: Improving code structure, readability, maintainability without changing external behavior 7 | 4. Documentation Updates: Changes related to code comments or external documentation 8 | 5. Test Suite / CI Enhancements: Improving test coverage, test quality, or continuous integration processes 9 | 6. Performance Optimizations: Improving application speed, response time, or resource usage efficiency 10 | 7. Security Patches / Vulnerability Fixes: Fixing code defects that could lead to security issues 11 | 8. Dependency Updates & Env Compatibility: Updating third-party library dependencies or ensuring compatibility across different environments 12 | 9. Code Style, Linting, Formatting Fixes: Ensuring code complies with team coding standards and consistency 13 | 14 | Please respond with ONLY the category name exactly as listed above (e.g., "Bug Fixes", "New Feature Additions", etc.). 15 | 16 | user: |- 17 | Please classify the following problem statement into one of the predefined categories: 18 | 19 | Problem Statement: 20 | {{ problem_statement }} 21 | 22 | Category: -------------------------------------------------------------------------------- /src/swe_care/templates/repo_level_evaluation.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | [ROLE] You are a senior code reviewer evaluating comment quality. 3 | 4 | [OBJECTIVE] Classify comments as high-quality (1) or low-quality (0) based on impact. 5 | 6 | [CONTEXT] 7 | - Problem statement, diff, and file context are provided. 8 | - Focus on whether the comment references changed code and suggests actionable fixes. 9 | - Think like a developer: would you feel compelled to modify your code in response? If yes, it's likely positive. 10 | 11 | [CRITERIA for label=1] 12 | ✓ Identifies concrete issue (bug, design, etc.) in the diff 13 | ✓ Provides specific, actionable fix suggestion 14 | ✓ References line numbers or code snippets 15 | ✓ Helps solve the PR's goal 16 | 17 | [CRITERIA for label=0] 18 | ✗ Generic ("LGTM"), vague, or social 19 | ✗ Off-topic or nitpicking without impact 20 | ✗ Incorrect or unhelpful 21 | ✗ Asks question without identifying flaw 22 | 23 | [OUTPUT] 24 | Return ONLY valid JSON: 25 | ```json 26 | { 27 | "label": <1 for positive/useful, 0 for negative/not-useful>, 28 | "reason": "" 29 | } 30 | ``` 31 | 32 | user: |- 33 | Please analyze the following code review comment and classify it as positive (useful/high-quality) or negative (not useful/low-quality). 
34 | 35 | 36 | {{ problem_statement }} 37 | 38 | 39 | 40 | {{ patch_to_review }} 41 | 42 | 43 | 44 | {{ formatted_review }} 45 | 46 | -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | # --- GraphQL Queries Directory --- 4 | GRAPHQL_QUERIES_DIR = Path(__file__).parent 5 | 6 | # --- GraphQL Queries Mapping --- 7 | GRAPHQL_QUERIES = { 8 | # Main query for fetching PRs with first page of nested data 9 | "merged_pull_requests": ( 10 | GRAPHQL_QUERIES_DIR / "GetMergedPullRequests.graphql" 11 | ).read_text(), 12 | # Query for fetching additional pages of labels 13 | "labels": (GRAPHQL_QUERIES_DIR / "GetLabels.graphql").read_text(), 14 | # Query for fetching additional pages of commits 15 | "commits": (GRAPHQL_QUERIES_DIR / "GetCommits.graphql").read_text(), 16 | # Query for fetching additional pages of reviews 17 | "reviews": (GRAPHQL_QUERIES_DIR / "GetReviews.graphql").read_text(), 18 | # Query for fetching additional pages of review comments 19 | "review_comments": (GRAPHQL_QUERIES_DIR / "GetReviewComments.graphql").read_text(), 20 | # Query for fetching additional pages of closing issues 21 | "closing_issues": (GRAPHQL_QUERIES_DIR / "GetClosingIssues.graphql").read_text(), 22 | # Query for fetching additional pages of issue labels 23 | "issue_labels": (GRAPHQL_QUERIES_DIR / "GetIssueLabels.graphql").read_text(), 24 | # Query for fetching additional pages of issue comments 25 | "issue_comments": (GRAPHQL_QUERIES_DIR / "GetIssueComments.graphql").read_text(), 26 | # Query for fetching additional pages of review threads 27 | "review_threads": (GRAPHQL_QUERIES_DIR / "GetReviewThreads.graphql").read_text(), 28 | # Query for fetching additional pages of thread comments 29 | "thread_comments": (GRAPHQL_QUERIES_DIR / "GetThreadComments.graphql").read_text(), 30 | # Query for fetching additional pages of specific PR 31 | "specific_pr": (GRAPHQL_QUERIES_DIR / "GetSpecificPullRequest.graphql").read_text(), 32 | } 33 | -------------------------------------------------------------------------------- /src/swe_care/templates/estimate_difficulty.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | You are an experienced software engineer responsible for estimating implementation difficulty. Your task is to estimate how difficult it would be to implement a pull request from scratch on a scale of 1 to 3, where: 3 | 4 | 1 = Low Difficulty: Simple implementations like typo fixes, minor configuration changes, straightforward bug fixes with clear solutions, basic feature additions using existing patterns, or routine maintenance tasks. 5 | 6 | 2 = Medium Difficulty: Moderate implementations requiring some problem-solving, such as bug fixes requiring investigation, feature additions involving multiple components, refactoring existing code, integration with existing APIs, or changes requiring understanding of business logic. 7 | 8 | 3 = High Difficulty: Complex implementations requiring significant technical expertise, such as architectural changes, performance optimizations, complex algorithms, new integrations with external systems, security-related fixes, or features requiring deep domain knowledge. 9 | 10 | Consider factors like: 11 | - Technical complexity of the problem being solved 12 | - Amount of new code vs. 
modifications to existing code 13 | - Number of systems/components involved 14 | - Domain knowledge required 15 | - Algorithm complexity 16 | - Architectural impact 17 | - Dependencies and integrations needed 18 | - Research or investigation required 19 | 20 | Please respond with ONLY a single number from 1 to 3. 21 | 22 | user: |- 23 | Please estimate the implementation difficulty level (1, 2, 3) for the following pull request: 24 | 25 | **Pull Request Title:** 26 | {{ title }} 27 | 28 | **Pull Request Description:** 29 | {{ body }} 30 | 31 | **Commit Message:** 32 | {{ commit_message }} 33 | 34 | **Changed Files:** 35 | {{ changed_files }} 36 | 37 | **Code Changes (Patch):** 38 | {{ patch }} 39 | 40 | Implementation Difficulty (1, 2, 3): -------------------------------------------------------------------------------- /src/swe_care/templates/classify_relevant_review_comment.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | You are an expert code reviewer who determines whether review comments are likely to result in code changes. Your task is to classify each review comment as either "relevant" (likely to lead to code changes) or "irrelevant" (unlikely to lead to code changes). 3 | 4 | A comment is RELEVANT if it: 5 | - Requests specific code changes (e.g., "make this method static", "change variable name to be more descriptive") 6 | - Points out bugs, errors, or potential issues that need fixing 7 | - Suggests improvements to logic, algorithms, or data structures 8 | - Identifies missing functionality or edge cases that need to be handled 9 | - Requests changes to method signatures, parameters, or return types 10 | - Points out code that violates best practices or design patterns 11 | 12 | A comment is IRRELEVANT if it: 13 | - Is a simple approval or praise (e.g., "looks good", "LGTM", "nice", "thanks") 14 | - Only requests formatting changes with no impact on code logic (e.g., "fix indentation", "add spaces") 15 | - Requests adding tests (these don't change the code under review) 16 | - Asks for clarification or explanation without suggesting changes (e.g., "please explain", "what does this do?") 17 | - References previous comments that cannot be identified (e.g., "same as before", "see above") 18 | - Only requests adding comments or documentation (e.g., "add Javadoc", "document this method") 19 | - Is too vague to result in specific code changes 20 | 21 | Important: Focus on whether the comment would likely result in changes to the actual code logic, not just cosmetic changes or additions to test files. 22 | 23 | Respond with ONLY "relevant" or "irrelevant". 24 | 25 | user: |- 26 | Please classify the following code review comment: 27 | 28 | **Comment:** 29 | {{ comment_text }} 30 | {% if diff_hunk %} 31 | 32 | **Context (diff hunk):** 33 | ```diff 34 | {{ diff_hunk }} 35 | ``` 36 | {% endif %} 37 | {% if path %} 38 | **File path:** {{ path }} 39 | {% if line %}**Line:** {{ line }}{% endif %} 40 | {% endif %} 41 | 42 | **Classification:** 43 | -------------------------------------------------------------------------------- /src/swe_care/templates/code_review_text_prompt.yaml: -------------------------------------------------------------------------------- 1 | text: |- 2 | {% if files -%} 3 | You will be provided with 1) a partial code base, 2) an issue statement explaining a problem to resolve, and 3) a patch addressing the issue. 
4 | {% else -%} 5 | You will be provided with 1) an issue statement explaining a problem to resolve, and 2) a patch addressing the issue. 6 | {% endif -%} 7 | 8 | {{ problem_statement }} 9 | 10 | {% if files -%} 11 | 12 | {% for file_path, file_content in files.items() -%} 13 | [start of {{ file_path }}] 14 | {% if add_line_numbers -%} 15 | {% if file_content -%} 16 | {% for line in file_content.split('\n') -%} 17 | {{ "{:4d}".format(loop.index) }} {{ line }} 18 | {% endfor -%} 19 | {% endif -%} 20 | {% else -%} 21 | {{ file_content }} 22 | {% endif -%} 23 | [end of {{ file_path }}] 24 | {% endfor -%} 25 | 26 | {% endif -%} 27 | 28 | {{ patch }} 29 | 30 | I need you to comprehensively review the above patch with respect to the given issue from multiple dimensions such as functional implementation, code quality, and defects. Generate a report in the following format. 31 | 32 | ## Function 33 | Briefly describe the main purpose and implemented functionality of this patch. Specifically, does this patch address the issue? 34 | 35 | ## Complexity 36 | Is the patch more complex than it should be? Check it at every level - lines, functions, classes. 37 | 38 | ## Style 39 | Does the patch follow the programming conventions of the original code? Does it pick good names for newly introduced variables, functions, classes, and files? 40 | 41 | ## Documentation 42 | Does the patch provide clear and necessary comments? If the patch changes how users build, test, interact with, or release code, does it also update associated documentation? 43 | 44 | ## Defects 45 | If there are any defects in the patch, point out their locations (file path and line number) and provide improvement suggestions in the following format: 46 | 47 | file_path: file path of defect 1 in the patch 48 | line: line number of defect 1 in the patch 49 | suggestion: suggestion for defect 1 50 | 51 | 52 | file_path: file path of defect 2 in the patch 53 | line: line number of defect 2 in the patch 54 | suggestion: suggestion for defect 2 55 | 56 | ... 57 | Note: 58 | - If there is no defect, output "None" in this section. 59 | 60 | -------------------------------------------------------------------------------- /src/swe_care/templates/questionnaire/questionnaire.md.j2: -------------------------------------------------------------------------------- 1 | {#- 2 | This template assumes a Jinja2 rendering context variable named `instance` 3 | whose schema conforms to `CodeReviewTaskInstance` defined in `src/swe_care/schema/dataset.py`. 
4 | -#} 5 | 6 | ### Code Review Annotation Questionnaire 7 | 8 | **Instance ID**: `{{ instance.instance_id }}` 9 | **Repository**: `{{ instance.repo }}` 10 | **Pull Request**: [`#{{ instance.pull_number }}`](https://github.com/{{ instance.repo }}/pull/{{ instance.pull_number }}) 11 | **Language**: `{{ instance.language }}` 12 | **Created at**: `{{ instance.created_at }}` 13 | **Base commit**: `{{ instance.base_commit }}` 14 | **Commit to review**: `{{ instance.commit_to_review.head_commit }}` 15 | **Commit message**: 16 | > {{ instance.commit_to_review.head_commit_message | default('', true) | trim | replace('\n', '\n> ') }} 17 | 18 | --- 19 | 20 | ### Context 21 | 22 | #### Problem Statement 23 | 24 | > {{ instance.problem_statement | default('', true) | trim | replace('\n', '\n> ') }} 25 | 26 | #### Hints (pre-PR comments or linked issues context) 27 | 28 | > {{ instance.hints_text | default('', true) | trim | replace('\n', '\n> ') }} 29 | 30 | #### Resolved Issues 31 | 32 | {% if instance.resolved_issues %} 33 | {% for ri in instance.resolved_issues %} 34 | 35 | ##### Issue #{{ ri.number }} 36 | 37 | - Title: 38 | 39 | > {{ ri.title | default('', true) | trim | replace('\n', '\n> ') }} 40 | 41 | - Body: 42 | 43 | > {{ ri.body | default('', true) | trim | replace('\n', '\n> ') }} 44 | {% endfor %} 45 | {% else %} 46 | 47 | - None listed 48 | {% endif %} 49 | 50 | #### Patch to Review (diff) 51 | 52 | ```diff 53 | {{ instance.commit_to_review.patch_to_review }} 54 | ``` 55 | 56 | #### Reference Review Comments (from the PR) 57 | 58 | Count: {{ instance.reference_review_comments | length }} 59 | 60 | {% for c in instance.reference_review_comments %} 61 | --- 62 | 63 | ##### Comment {{ loop.index }} 64 | 65 | - Review comment: 66 | 67 | > {{ c.text | default('', true) | trim | replace('\n', '\n> ') }} 68 | 69 | - Diff hunk the review comment applied to: 70 | 71 | ```diff 72 | {{ c.diff_hunk | default('', true) }} 73 | ``` 74 | 75 | - Path: `{{ c.path }}` 76 | - Line: {{ c.line if c.line is not none else 'n/a' }} | Start line: {{ c.start_line if c.start_line is not none else 'n/a' }} 77 | - Original line: {{ c.original_line if c.original_line is not none else 'n/a' }} | Original start line: {{ c.original_start_line if c.original_start_line is not none else 'n/a' }} 78 | {% endfor %} 79 | {% if schema_md %} 80 | --- 81 | 82 | {{ schema_md }} 83 | {%- endif -%} 84 | -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # Repository Guidelines 2 | 3 | ## Project Structure & Module Organization 4 | - `src/swe_care/`: source package 5 | - `collect/` (GitHub data + dataset build CLIs) 6 | - `inference/` (text generation + API inference) 7 | - `harness/` (evaluation runners and evaluators) 8 | - `schema/`, `utils/`, `templates/` (data models, helpers, prompts) 9 | - `scripts/`: orchestration utilities (e.g., `run_eval_pipeline.py`) 10 | - `docs/`: documentation and demos 11 | - `results/`: local outputs created by commands (not tracked) 12 | 13 | ## Build, Test, and Development Commands 14 | - Install deps: `pip install uv && uv sync` (or `pip install -e .`) 15 | - Pre-commit hooks: `pre-commit install` • Run all: `pre-commit run --all-files` 16 | - Lint/format: `ruff check .` • `ruff format .` 17 | - Quick pipeline: 18 | `python scripts/run_eval_pipeline.py --dataset-file results/dataset/code_review_task_instances.jsonl --output-dir results/pipeline_output --model gpt-4o 
--model-provider openai --file-source oracle` 19 | - Module CLIs: `python -m swe_care.collect ...` | `python -m swe_care.inference ...` | `python -m swe_care.harness ...` 20 | 21 | ## Coding Style & Naming Conventions 22 | - Python ≥ 3.10 with type hints; prefer dataclasses for schemas. 23 | - Ruff (Black-like): 4-space indent, line length 88, double quotes (`pyproject.toml`). 24 | - snake_case for modules/functions/options; PascalCase for classes. 25 | - Keep functions cohesive; shared helpers in `utils/`; prompts in `templates/`. 26 | - Output naming examples: `___graphql_prs_data.jsonl`, `___rm_samples.jsonl`. 27 | 28 | ## Testing Guidelines 29 | - No traditional unit tests; validate via small, reproducible runs that write to `results/`. 30 | - Verify interfaces with help: `python -m swe_care.collect -h`. 31 | - Aim for determinism (fixed params, stable sorting). Include sample outputs in PRs when logic changes. 32 | 33 | ## Commit & Pull Request Guidelines 34 | - Conventional Commits: `feat:`, `fix:`, `docs:`, `refactor:`, `chore:`, `test:`; add scope (e.g., `inference:`). 35 | - PRs include: purpose, summary, example commands, sample outputs (paths under `results/`), and linked issues (e.g., `Closes #123`). 36 | - Keep PRs focused; document new flags/env vars in README and `--help`. Ensure pre-commit passes. 37 | 38 | ## Security & Configuration Tips 39 | - Never commit secrets. Use env vars: `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`; optional `OPENAI_BASE_URL`, `ANTHROPIC_BASE_URL`. 40 | - GitHub access via `--tokens`; watch rate limits. 41 | - Retrieval with Pyserini may require Java 21; set `--retrieval-output-dir` for temporary work. 42 | -------------------------------------------------------------------------------- /src/swe_care/utils/prompt_loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for loading and rendering YAML-based prompt templates. 3 | """ 4 | 5 | from pathlib import Path 6 | from typing import Any 7 | 8 | import yaml 9 | from jinja2 import Environment, FileSystemLoader 10 | 11 | # Initialize Jinja2 environment 12 | _template_dir = Path(__file__).parent.parent / "templates" 13 | _jinja_env = Environment(loader=FileSystemLoader(_template_dir)) 14 | 15 | 16 | def load_prompt(template_name: str, **context: Any) -> str | tuple[str, str]: 17 | """ 18 | Load a YAML-based prompt template and render it with the provided context. 
19 | 20 | Args: 21 | template_name: Name of the YAML template file (without extension) in the templates directory 22 | **context: Context variables to pass to the Jinja2 template 23 | 24 | Returns: 25 | - str: Rendered text if the template has only a 'text' key 26 | - tuple[str, str]: (system_prompt, user_prompt) if the template has 'system' and 'user' keys 27 | - str: Rendered user_prompt if the template has only a 'user' key 28 | 29 | Raises: 30 | FileNotFoundError: If the template file doesn't exist 31 | ValueError: If the template structure is invalid 32 | """ 33 | template_path = _template_dir / f"{template_name}.yaml" 34 | 35 | if not template_path.exists(): 36 | raise FileNotFoundError(f"Template file not found: {template_path}") 37 | 38 | # Load YAML content 39 | with open(template_path) as f: 40 | template_data = yaml.safe_load(f) 41 | 42 | if not isinstance(template_data, dict): 43 | raise ValueError(f"Invalid template structure in {template_name}.yaml") 44 | 45 | # Handle different template structures 46 | if "text" in template_data: 47 | # Simple text template 48 | template = _jinja_env.from_string(template_data["text"]) 49 | return template.render(**context) 50 | 51 | elif "user" in template_data: 52 | # User prompt with optional system prompt 53 | user_template = _jinja_env.from_string(template_data["user"]) 54 | user_prompt = user_template.render(**context) 55 | 56 | if "system" in template_data: 57 | # Both system and user prompts 58 | system_template = _jinja_env.from_string(template_data["system"]) 59 | system_prompt = system_template.render(**context) 60 | return (system_prompt, user_prompt) 61 | else: 62 | # Only user prompt 63 | return user_prompt 64 | 65 | else: 66 | raise ValueError( 67 | f"Template {template_name}.yaml must contain either 'text' or 'user' key" 68 | ) 69 | -------------------------------------------------------------------------------- /src/swe_care/utils/read_file.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Tuple 3 | 4 | import chardet 5 | 6 | default_encoding = "utf-8" 7 | common_encodings = ( 8 | "utf-8", 9 | "utf-16", 10 | "latin-1", 11 | "ascii", 12 | "windows-1252", 13 | "cp1251", 14 | "cp1253", 15 | "cp1254", 16 | "cp1255", 17 | "cp1256", 18 | "shift_jis", 19 | "big5", 20 | "gb2312", 21 | ) 22 | 23 | 24 | def detect_file_encoding(file_path: Path) -> str: 25 | """Function to detect encoding""" 26 | # Read the file as binary data 27 | raw_data = file_path.read_bytes() 28 | # Detect encoding 29 | detected = chardet.detect(raw_data) 30 | encoding = detected["encoding"] 31 | return encoding 32 | 33 | 34 | def read_file_with_encodings(file_path: Path, encodings: Tuple[str]) -> Tuple[str, str]: 35 | """Attempt to read a file using various encodings, return content if successful""" 36 | for encoding in encodings: 37 | try: 38 | content = file_path.read_text(encoding=encoding) 39 | return content, encoding 40 | except (UnicodeDecodeError, TypeError, ValueError, UnicodeError): 41 | continue 42 | raise ValueError( 43 | f"Could not read file with any of the provided encodings: {encodings}" 44 | ) 45 | 46 | 47 | def read_file_with_limit(content: str, max_lines_to_read: int = None) -> str: 48 | """Helper function to return a limited number of lines from the content.""" 49 | if max_lines_to_read is None: 50 | return content 51 | else: 52 | return "\n".join(content.splitlines()[:max_lines_to_read]) 53 | 54 | 55 | def read_file_to_string(file_path: Path | str, *, 
max_lines_to_read: int = None) -> str: 56 | """Function to detect encoding and read file to string with an optional line limit.""" 57 | if isinstance(file_path, str): 58 | file_path = Path(file_path) 59 | 60 | try: 61 | content, _ = read_file_with_encodings(file_path, (default_encoding,)) 62 | return read_file_with_limit(content, max_lines_to_read) 63 | except ValueError: 64 | pass 65 | 66 | try: 67 | detected_encoding = detect_file_encoding(file_path) 68 | # Read the file with the detected encoding 69 | content, _ = read_file_with_encodings(file_path, (detected_encoding,)) 70 | return read_file_with_limit(content, max_lines_to_read) 71 | except ValueError: 72 | pass 73 | 74 | try: 75 | content, _ = read_file_with_encodings(file_path, common_encodings) 76 | return read_file_with_limit(content, max_lines_to_read) 77 | except ValueError: 78 | pass 79 | 80 | raise ValueError(f"Could not read file: {file_path}") 81 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "swe-care" 3 | version = "0.2.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | authors = [] 7 | requires-python = ">=3.10" 8 | dependencies = [ 9 | "anthropic>=0.55.0", 10 | "dataclasses-json>=0.6.7", 11 | "jinja2>=3.1.0", 12 | "loguru>=0.7.2", 13 | "openai>=1.90.0", 14 | "requests[socks]>=2.31.0", 15 | "tenacity>=9.0.0", 16 | "tiktoken>=0.9.0", 17 | "tqdm>=4.66.0", 18 | "unidiff>=0.7.5", 19 | "nltk>=3.9.1", 20 | "rank_bm25>=0.2.2", 21 | "gitpython>=3.1.45", 22 | "jedi>=0.19.2", 23 | "pyserini>=1.2.0", 24 | "httpx[socks]>=0.28.1", 25 | "chardet>=5.2.0", 26 | "jinja2>=3.1.6", 27 | "tree-sitter>=0.25.1", 28 | "tree-sitter-python>=0.25.0", 29 | "datasets>=4.2.0", 30 | ] 31 | 32 | [project.scripts] 33 | swe-care = "swe_care.collect:main" 34 | 35 | [build-system] 36 | requires = ["hatchling"] 37 | build-backend = "hatchling.build" 38 | 39 | [dependency-groups] 40 | dev = [ 41 | "pre-commit", 42 | "ruff", 43 | ] 44 | 45 | [tool.ruff] 46 | # Exclude a variety of commonly ignored directories. 47 | exclude = [ 48 | ".bzr", 49 | ".direnv", 50 | ".eggs", 51 | ".git", 52 | ".git-rewrite", 53 | ".hg", 54 | ".ipynb_checkpoints", 55 | ".mypy_cache", 56 | ".nox", 57 | ".pants.d", 58 | ".pyenv", 59 | ".pytest_cache", 60 | ".pytype", 61 | ".ruff_cache", 62 | ".svn", 63 | ".tox", 64 | ".venv", 65 | ".vscode", 66 | "__pypackages__", 67 | "_build", 68 | "buck-out", 69 | "build", 70 | "dist", 71 | "node_modules", 72 | "site-packages", 73 | "venv", 74 | ] 75 | 76 | # Same as Black. 77 | line-length = 88 78 | indent-width = 4 79 | 80 | # Assume Python 3.10 81 | target-version = "py310" 82 | 83 | [tool.ruff.lint] 84 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 85 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 86 | # McCabe complexity (`C901`) by default. 87 | select = ["E4", "E7", "E9", "F"] 88 | ignore = [] 89 | 90 | # Allow fix for all enabled rules (when `--fix`) is provided. 91 | fixable = ["ALL"] 92 | unfixable = [] 93 | 94 | # Allow unused variables when underscore-prefixed. 95 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 96 | 97 | [tool.ruff.format] 98 | # Like Black, use double quotes for strings. 99 | quote-style = "double" 100 | 101 | # Like Black, indent with spaces, rather than tabs. 102 | indent-style = "space" 103 | 104 | # Like Black, respect magic trailing commas. 
105 | skip-magic-trailing-comma = false 106 | 107 | # Like Black, automatically detect the appropriate line ending. 108 | line-ending = "auto" 109 | 110 | [[tool.uv.index]] 111 | name = "default" 112 | url = "https://pypi.tuna.tsinghua.edu.cn/simple" 113 | -------------------------------------------------------------------------------- /src/swe_care/templates/code_review_llm_evaluation.yaml: -------------------------------------------------------------------------------- 1 | system: |- 2 | You are a code review evaluator. Your task is to evaluate the quality of the code review. You need to evaluate the code reviews based on the quality attributes of a standard code review, which are shown below: 3 | 4 | - Functionality: An evaluation of whether the main purpose of the patch, its functionality, and any potential functional or security defects have been described. 5 | - Quality: An evaluation of the accuracy of code quality descriptions, including patch complexity (line-level, function-level, class-level, file-level), code readability, optimization status, maintainability, etc. 6 | - Style: An evaluation of whether the patch follows the programming conventions of the original code, e.g. the naming of variables and functions. 7 | - Documentation: An evaluation of whether the patch provides clear and necessary comments, as well as documentation. 8 | 9 | For each field, you should analyze: 10 | 11 | - Correctness: whether the review is technically correct and contains no factual errors with regard to the provided issue, code base, and patch. 12 | - Relevance: whether the review is targeted at the issue and the code patch. 13 | - Clarity: whether the review is clear and without redundant information. 14 | - Consistency: whether the review is logically consistent with the issue, code base, patch, and other fields in the review. 15 | - Language: whether the review uses professional language and contains no grammatical errors. Whether it facilitates knowledge transfer, is expressed in a kind way, and provides positive feedback. 16 | 17 | Give a score between 0 and 1 (inclusive) to each of these five dimensions, and output your final evaluation in nested JSON format: 18 | ``` 19 | { 20 | "function": {"correctness": score, "relevance": score, "clarity": score, "consistency": score, "language": score}, 21 | "quality": {"correctness": score, "relevance": score, "clarity": score, "consistency": score, "language": score}, 22 | "style": {"correctness": score, "relevance": score, "clarity": score, "consistency": score, "language": score}, 23 | "documentation": {"correctness": score, "relevance": score, "clarity": score, "consistency": score, "language": score} 24 | } 25 | ``` 26 | 27 | If you cannot identify a certain field from the review, give a 0 score to all dimensions in this field. 28 | 29 | user: |- 30 | Please evaluate the following code review based on the provided context and review text.
31 | 32 | 33 | Title: {{ input.title }} 34 | Repository: {{ input.repo }} 35 | 36 | 37 | 38 | {{ input.problem_statement }} 39 | 40 | 41 | 42 | Commit: {{ input.commit_to_review.head_commit }} 43 | Commit Message: {{ input.commit_to_review.head_commit_message }} 44 | 45 | 46 | 47 | {{ input.commit_to_review.patch_to_review }} 48 | 49 | 50 | 51 | {{ prediction.review_text }} 52 | 53 | -------------------------------------------------------------------------------- /src/swe_care/collect/get_top_repos.py: -------------------------------------------------------------------------------- 1 | """ 2 | Fetch top repositories for a given language and save to JSONL file. 3 | """ 4 | 5 | import json 6 | from pathlib import Path 7 | from typing import Optional 8 | 9 | from loguru import logger 10 | 11 | from swe_care.utils.github import GitHubAPI 12 | 13 | 14 | def get_top_repos( 15 | language: str, 16 | top_n: int, 17 | output_dir: Path | str, 18 | tokens: Optional[list[str]] = None, 19 | ) -> None: 20 | """ 21 | Fetch top repositories for a given language and save to JSONL file. 22 | 23 | Args: 24 | language: Programming language to search for 25 | top_n: Number of top repositories to fetch 26 | output_dir: Directory to save the output file 27 | tokens: Optional list of GitHub tokens for API requests 28 | """ 29 | if isinstance(output_dir, str): 30 | output_dir = Path(output_dir) 31 | output_dir.mkdir(parents=True, exist_ok=True) 32 | output_file = output_dir / f"repos_top_{top_n}_{language}.jsonl" 33 | 34 | # Create GitHub API instance 35 | github_api = GitHubAPI(tokens=tokens) 36 | 37 | repos_collected = 0 38 | page = 1 39 | 40 | with open(output_file, "w") as f: 41 | while repos_collected < top_n: 42 | remaining_results = top_n - repos_collected 43 | per_page = min(100, remaining_results) # GitHub API max is 100 per page 44 | 45 | params = { 46 | "q": f"language:{language}", 47 | "sort": "stars", 48 | "order": "desc", 49 | "per_page": per_page, 50 | "page": page, 51 | } 52 | 53 | logger.info(f"Fetching page {page} with {per_page} results...") 54 | 55 | try: 56 | response = github_api.call_api("search/repositories", params=params) 57 | data = response.json() 58 | items = data.get("items", []) 59 | 60 | if not items: 61 | logger.info( 62 | f"No more repositories found. Collected {repos_collected} repositories." 63 | ) 64 | break 65 | 66 | for repo in items: 67 | if repos_collected >= top_n: 68 | break 69 | 70 | repo_data = { 71 | "name": repo["full_name"], 72 | "stars": repo["stargazers_count"], 73 | "url": repo["html_url"], 74 | "description": repo.get("description", ""), 75 | "owner": repo["owner"]["login"], 76 | "language": repo.get("language", ""), 77 | } 78 | 79 | f.write(json.dumps(repo_data) + "\n") 80 | repos_collected += 1 81 | 82 | logger.info(f"Collected {repos_collected}/{top_n} repositories") 83 | 84 | page += 1 85 | 86 | except Exception as e: 87 | logger.error(f"Error fetching repositories: {e}") 88 | break 89 | 90 | logger.success( 91 | f"Successfully saved {repos_collected} repositories to {output_file}" 92 | ) 93 | -------------------------------------------------------------------------------- /src/swe_care/harness/evaluators/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for evaluators. 
3 | """ 4 | 5 | import re 6 | from typing import Optional 7 | 8 | import unidiff 9 | 10 | from swe_care.schema.dataset import ( 11 | CodeReviewTaskInstance, 12 | ReferenceReviewComment, 13 | ) 14 | 15 | 16 | def _parse_diff_hunks(diff_text: str) -> list[dict]: 17 | """Parse diff hunks from a patch text.""" 18 | patch = unidiff.PatchSet(diff_text) 19 | results = [] 20 | 21 | for patched_file in patch: 22 | old_path = patched_file.path 23 | for hunk in patched_file: 24 | hunk_info = { 25 | "file": old_path, 26 | "old_start": hunk.source_start, 27 | "old_lines": hunk.source_length, 28 | "old_end": hunk.source_start + hunk.source_length - 1, 29 | "new_start": hunk.target_start, 30 | "new_lines": hunk.target_length, 31 | "new_end": hunk.target_start + hunk.target_length - 1, 32 | "hunk": str(hunk), 33 | } 34 | results.append(hunk_info) 35 | return results 36 | 37 | 38 | def extract_defects_from_review( 39 | review_text: str, input: Optional[CodeReviewTaskInstance] = None 40 | ) -> list[ReferenceReviewComment]: 41 | """Extract defects from review text that follows the REVIEW_PROMPT format. 42 | 43 | Args: 44 | review_text: The code review text containing defects in tags 45 | input: Optional input task instance for extracting diff hunks 46 | 47 | Returns: 48 | List of ReferenceReviewComment objects extracted from the review text 49 | """ 50 | defects = [] 51 | 52 | # Pattern to match blocks 53 | defect_pattern = r"\s*(.*?)\s*" 54 | defect_matches = re.findall(defect_pattern, review_text, re.DOTALL) 55 | 56 | for defect_content in defect_matches: 57 | # Extract file_path, line, and suggestion from defect content 58 | file_path_match = re.search(r"file_path:\s*(.+)", defect_content) 59 | line_match = re.search(r"line:\s*(\d+)", defect_content) 60 | suggestion_match = re.search(r"suggestion:\s*(.+)", defect_content, re.DOTALL) 61 | 62 | if file_path_match and suggestion_match: 63 | file_path = file_path_match.group(1).strip() 64 | line_num = int(line_match.group(1)) if line_match else None 65 | suggestion = suggestion_match.group(1).strip() 66 | 67 | # Extract diff hunk based on patch_to_review, file path line number 68 | # if the line number falls within a hunk, use that hunk. 69 | patch = ( 70 | input.commit_to_review.patch_to_review 71 | if input and input.commit_to_review 72 | else None 73 | ) 74 | diff_hunks = _parse_diff_hunks(patch) if patch else [] 75 | diff_hunk = None 76 | 77 | for hunk in diff_hunks: 78 | if ( 79 | hunk["file"] == file_path 80 | and line_num is not None 81 | and hunk["new_start"] <= line_num <= hunk["new_end"] 82 | ): 83 | diff_hunk = hunk["hunk"] 84 | break 85 | 86 | defect = ReferenceReviewComment( 87 | text=suggestion, 88 | path=file_path, 89 | diff_hunk=diff_hunk, 90 | line=line_num, 91 | start_line=None, 92 | original_line=None, 93 | original_start_line=None, 94 | ) 95 | defects.append(defect) 96 | 97 | return defects 98 | -------------------------------------------------------------------------------- /src/swe_care/schema/collect.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Literal 3 | 4 | from dataclasses_json import dataclass_json 5 | 6 | from swe_care.schema.dataset import ReferenceReviewComment 7 | 8 | 9 | @dataclass_json 10 | @dataclass 11 | class ReviewCommentLabels: 12 | """Labels for a review comment.""" 13 | 14 | referenced_line_changed_in_merged_commit: bool 15 | """Whether the referenced line was changed in the merged commit. 
If True, the review comment was more likely to address real issues that got fixed.""" 16 | is_resolved: bool 17 | """Whether the review thread was resolved""" 18 | is_outdated: bool 19 | """Whether the review thread is outdated""" 20 | is_collapsed: bool 21 | """Whether the review thread is collapsed""" 22 | marked_as_dismissed: bool 23 | """Whether the review comment was marked as dismissed (minimized for reasons other than being resolved)""" 24 | 25 | 26 | @dataclass_json 27 | @dataclass 28 | class LabeledReviewComment(ReferenceReviewComment): 29 | """Schema for labeled review comment instances.""" 30 | 31 | labels: ReviewCommentLabels 32 | """Labels for the review comment""" 33 | 34 | 35 | @dataclass_json 36 | @dataclass 37 | class CommitClassificationResult: 38 | """Schema combining commit evaluation and labeled review comments.""" 39 | 40 | commit_sha: str 41 | """The commit SHA""" 42 | labeled_review_comments: list[LabeledReviewComment] 43 | """List of labeled review comments for this commit""" 44 | total_score: float 45 | """Total evaluation score for the commit""" 46 | rule_results: dict[str, bool | float] 47 | """Results from evaluation rules""" 48 | patch: str 49 | """Patch content between base commit and this commit""" 50 | 51 | 52 | @dataclass_json 53 | @dataclass 54 | class PRClassification: 55 | """Schema for PR with combined commit classification data.""" 56 | 57 | repo_owner: str 58 | """Repository owner""" 59 | repo_name: str 60 | """Repository name""" 61 | pr_number: int 62 | """Pull request number""" 63 | url: str 64 | """Pull request URL""" 65 | commits: list[CommitClassificationResult] 66 | """List of commits with classification data (evaluation + labeled review comments)""" 67 | 68 | 69 | @dataclass_json 70 | @dataclass 71 | class RewardModelTrainingSampleMetadata: 72 | """Metadata for reward model training sample instances.""" 73 | 74 | repo: str 75 | """Repository in format 'owner/name'""" 76 | pr_number: int 77 | """Pull request number""" 78 | url: str 79 | """Pull request URL""" 80 | commit_to_review: str 81 | """Commit SHA being reviewed""" 82 | file_source: Literal[ 83 | "none", 84 | "base_changed_files", 85 | "reviewed_file", 86 | "retrieved_base_changed_files", 87 | "retrieved_all_files", 88 | ] 89 | """Source for file content ('none', 'base_changed_files', 'reviewed_file', 'retrieved_base_changed_files', or 'retrieved_all_files')""" 90 | 91 | 92 | @dataclass_json 93 | @dataclass 94 | class RewardModelTrainingSample: 95 | """Schema for reward model training sample instances.""" 96 | 97 | problem_statement: str 98 | """The problem statement extracted from closing issues or PR description""" 99 | patch_to_review: str 100 | """The patch content to be reviewed""" 101 | pos_review: list[str] 102 | """List of positive review comments (referenced_line_changed_in_merged_commit=True and is_resolved=True)""" 103 | neg_review: list[str] 104 | """List of negative review comments (all others)""" 105 | metadata: RewardModelTrainingSampleMetadata 106 | """Metadata about the sample""" 107 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 
23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | # nuclear option because steven uses PyCharm. 161 | .idea/ 162 | 163 | # some project specific stuff 164 | results/ 165 | logs/ 166 | dev/ 167 | 168 | # data files 169 | *.jsonl 170 | *.json 171 | !src/**/*.json 172 | 173 | # Jupyter Notebooks 174 | *.ipynb 175 | 176 | .DS_Store 177 | 178 | .serena/ 179 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # CLAUDE.md 2 | 3 | This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 4 | 5 | ## Project Overview 6 | 7 | SWE-CARE is a comprehensive benchmark for evaluating Large Language Models (LLMs) on software engineering tasks, with a focus on code analysis, review, and issue-resolving capabilities. The project currently supports Python and Java. 8 | 9 | The benchmark features two main task types: 10 | 11 | 1. **Issue Resolving**: Generate code patches to fix GitHub issues 12 | 2. **Code Review**: Generate comprehensive code review reports for code diffs 13 | 14 | ## Commands 15 | 16 | ### Development Setup 17 | 18 | ```bash 19 | # Install dependencies using uv (recommended) 20 | pip install uv 21 | uv sync 22 | 23 | # Or using pip 24 | pip install -e . 25 | 26 | # Install pre-commit hooks (for development) 27 | pre-commit install 28 | ``` 29 | 30 | ### Linting 31 | 32 | ```bash 33 | # Run ruff linter (configured in pyproject.toml) 34 | ruff check . 35 | 36 | # Run ruff formatter 37 | ruff format . 38 | 39 | # Pre-commit runs both automatically 40 | pre-commit run --all-files 41 | ``` 42 | 43 | ### Running Tests 44 | 45 | Note: This project doesn't have traditional unit tests. Instead, it focuses on data collection, inference, and evaluation scripts. 46 | 47 | ## High-Level Architecture 48 | 49 | ### Core Modules 50 | 51 | 1. **`src/swe_care/collect/`** - Data collection pipeline 52 | - `get_top_repos.py` - Find most starred repos by language 53 | - `get_graphql_prs_data.py` - Fetch PR data via GitHub GraphQL API 54 | - `classify_prs_data.py` - Analyze commits and label review comments 55 | - `build_code_review_dataset.py` - Build final dataset with LLM-classified metadata 56 | - `convert_to_rm_samples.py` - Convert to reward model training samples 57 | 58 | 2. **`src/swe_care/inference/`** - LLM inference pipeline 59 | - `create_code_review_text.py` - Generate text datasets with different context strategies 60 | - `run_api.py` - Run LLM inference on code review tasks 61 | 62 | 3. **`src/swe_care/harness/`** - Evaluation framework 63 | - `code_review_eval.py` - Evaluate model predictions using rule-based or LLM-based evaluators 64 | 65 | 4. **`src/swe_care/schema/`** - Data models 66 | - `dataset.py` - Core task instance schemas (IssueResolvingTaskInstance, CodeReviewTaskInstance) 67 | - `collect.py` - GitHub PR data schemas 68 | - `inference.py` - Inference input/output schemas 69 | - `evaluation.py` - Evaluation result schemas 70 | 71 | 5. **`src/swe_care/utils/`** - Utility functions 72 | - `github.py` - GitHub API interactions 73 | - `llm_models/clients.py` - LLM API clients (OpenAI, Anthropic, etc.) 
74 | - `bm25_retrieval.py` - BM25-based file retrieval 75 | - `patch.py` - Patch file manipulation 76 | 77 | ### Key Patterns 78 | 79 | - **Modular CLI**: Each module (`collect`, `inference`, `harness`) has its own `__main__.py` with subcommands 80 | - **Schema-driven**: All data structures use dataclasses with JSON serialization 81 | - **Parallel Processing**: Most operations support `--jobs` for concurrent execution 82 | - **GitHub API Token Management**: Supports multiple tokens for rate limit handling 83 | 84 | ### Data Flow 85 | 86 | 1. **Collection**: GitHub repos → PR data → Classified PRs → Code review dataset 87 | 2. **Inference**: Dataset → Text generation → LLM predictions 88 | 3. **Evaluation**: Predictions + Dataset → Evaluation results 89 | 90 | ## Important Considerations 91 | 92 | - **GitHub API Rate Limits**: Always provide GitHub tokens via `--tokens` parameter 93 | - **LLM API Keys**: Set environment variables (OPENAI_API_KEY, ANTHROPIC_API_KEY, etc.) 94 | - **Large Files**: Be careful with retrieval operations on large repositories 95 | - **Parallel Jobs**: Adjust `--jobs` based on API rate limits and system resources 96 | 97 | ## Environment Variables 98 | 99 | - `OPENAI_API_KEY` - OpenAI API key for GPT models 100 | - `ANTHROPIC_API_KEY` - Anthropic API key for Claude models 101 | - `OPENAI_BASE_URL` - Custom OpenAI-compatible API endpoint 102 | - `ANTHROPIC_BASE_URL` - Custom Anthropic-compatible API endpoint 103 | -------------------------------------------------------------------------------- /src/swe_care/harness/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Optional 3 | 4 | from loguru import logger 5 | 6 | 7 | class Evaluator(ABC): 8 | """Grade, tag, or otherwise evaluate predictions relative to their inputs 9 | and/or reference labels.""" 10 | 11 | @property 12 | def evaluation_name(self) -> str: 13 | """The name of the evaluation.""" 14 | return self.__class__.__name__ 15 | 16 | @property 17 | def requires_reference(self) -> bool: 18 | """Whether this evaluator requires a reference label.""" 19 | return False 20 | 21 | @property 22 | def requires_input(self) -> bool: 23 | """Whether this evaluator requires an input.""" 24 | return False 25 | 26 | @property 27 | def _skip_input_warning(self) -> str: 28 | """Warning to show when input is ignored.""" 29 | return f"Ignoring input in {self.__class__.__name__}, as it is not expected." 30 | 31 | @property 32 | def _skip_reference_warning(self) -> str: 33 | """Warning to show when reference is ignored.""" 34 | return ( 35 | f"Ignoring reference in {self.__class__.__name__}, as it is not expected." 36 | ) 37 | 38 | def _check_evaluation_args( 39 | self, 40 | reference: Optional[Any] = None, 41 | input: Optional[Any] = None, 42 | ) -> None: 43 | """Check if the evaluation arguments are valid. 44 | 45 | Args: 46 | reference (Optional[Any], optional): The reference label. 47 | input (Optional[Any], optional): The input. 48 | Raises: 49 | ValueError: If the evaluator requires an input but none is provided, 50 | or if the evaluator requires a reference label but none is provided. 
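        Example (illustrative sketch only; ``MyInputEvaluator`` is a hypothetical
        subclass that overrides ``requires_input`` to return True):

            evaluator = MyInputEvaluator()
            evaluator._check_evaluation_args(input=None)
            # raises ValueError("MyInputEvaluator requires an input.")
            evaluator._check_evaluation_args(input="some diff", reference="ref")
            # logs the skip-reference warning, since this evaluator does not expect one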
51 | """ 52 | if self.requires_input and input is None: 53 | raise ValueError(f"{self.__class__.__name__} requires an input.") 54 | elif input is not None and not self.requires_input: 55 | logger.warning(self._skip_input_warning) 56 | if self.requires_reference and reference is None: 57 | raise ValueError(f"{self.__class__.__name__} requires a reference.") 58 | elif reference is not None and not self.requires_reference: 59 | logger.warning(self._skip_reference_warning) 60 | 61 | @abstractmethod 62 | def _evaluate( 63 | self, 64 | *, 65 | prediction: Any, 66 | reference: Optional[Any] = None, 67 | input: Optional[Any] = None, 68 | **kwargs: Any, 69 | ) -> dict: 70 | """Evaluate Chain or LLM output, based on optional input and label. 71 | 72 | Args: 73 | prediction (Any): The LLM or chain prediction to evaluate. 74 | reference (Optional[Any], optional): The reference label to evaluate against. 75 | input (Optional[Any], optional): The input to consider during evaluation. 76 | kwargs: Additional keyword arguments, including callbacks, tags, etc. 77 | Returns: 78 | dict: The evaluation results containing the score or value. 79 | It is recommended that the dictionary contain the following keys: 80 | - score: the score of the evaluation, if applicable. 81 | - value: the string value of the evaluation, if applicable. 82 | - reasoning: the reasoning for the evaluation, if applicable. 83 | """ # noqa: E501 84 | 85 | def evaluate( 86 | self, 87 | *, 88 | prediction: Any, 89 | reference: Optional[Any] = None, 90 | input: Optional[Any] = None, 91 | **kwargs: Any, 92 | ) -> dict: 93 | """Evaluate Chain or LLM output, based on optional input and label. 94 | 95 | Args: 96 | prediction (Any): The LLM or chain prediction to evaluate. 97 | reference (Optional[Any], optional): The reference label to evaluate against. 98 | input (Optional[Any], optional): The input to consider during evaluation. 99 | kwargs: Additional keyword arguments, including callbacks, tags, etc. 100 | Returns: 101 | dict: The evaluation results containing the score or value. 102 | """ # noqa: E501 103 | self._check_evaluation_args(reference=reference, input=input) 104 | return self._evaluate( 105 | prediction=prediction, reference=reference, input=input, **kwargs 106 | ) 107 | -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetSpecificPullRequest.graphql: -------------------------------------------------------------------------------- 1 | query GetSpecificPullRequest($owner: String!, $name: String!, $prNumber: Int!) 
{ 2 | repository(owner: $owner, name: $name) { 3 | nameWithOwner 4 | pullRequest( 5 | number: $prNumber 6 | ) { 7 | id 8 | title 9 | body 10 | number 11 | url 12 | author { 13 | login 14 | } 15 | createdAt 16 | mergedAt 17 | mergedBy { 18 | login 19 | } 20 | baseRefOid 21 | baseRefName 22 | headRefOid 23 | headRefName 24 | changedFiles 25 | labels(first: 10) { 26 | totalCount 27 | pageInfo { 28 | hasNextPage 29 | endCursor 30 | } 31 | nodes { 32 | name 33 | } 34 | } 35 | commits(first: 10) { 36 | totalCount 37 | pageInfo { 38 | hasNextPage 39 | endCursor 40 | } 41 | nodes { 42 | commit { 43 | oid 44 | message 45 | changedFilesIfAvailable 46 | authoredDate 47 | author { 48 | user { 49 | login 50 | } 51 | } 52 | committedDate 53 | committer { 54 | user { 55 | login 56 | } 57 | } 58 | parents(first: 2) { 59 | nodes { 60 | oid 61 | } 62 | } 63 | } 64 | } 65 | } 66 | reviews(first: 10) { 67 | totalCount 68 | pageInfo { 69 | hasNextPage 70 | endCursor 71 | } 72 | nodes { 73 | id 74 | author { 75 | login 76 | } 77 | state 78 | body 79 | submittedAt 80 | updatedAt 81 | commit { 82 | oid 83 | } 84 | comments(first: 10) { 85 | totalCount 86 | pageInfo { 87 | hasNextPage 88 | endCursor 89 | } 90 | nodes { 91 | id 92 | author { 93 | login 94 | } 95 | body 96 | createdAt 97 | updatedAt 98 | path 99 | diffHunk 100 | line 101 | startLine 102 | originalLine 103 | originalStartLine 104 | replyTo { 105 | id 106 | } 107 | isMinimized, 108 | minimizedReason, 109 | commit { 110 | oid 111 | } 112 | originalCommit { 113 | oid 114 | } 115 | } 116 | } 117 | } 118 | } 119 | reviewThreads(first: 10) { 120 | totalCount 121 | pageInfo { 122 | hasNextPage 123 | endCursor 124 | } 125 | nodes { 126 | id 127 | isResolved 128 | isOutdated 129 | isCollapsed 130 | path 131 | startLine 132 | originalLine 133 | diffSide 134 | startDiffSide 135 | resolvedBy { 136 | login 137 | } 138 | comments(first: 10) { 139 | totalCount 140 | pageInfo { 141 | hasNextPage 142 | endCursor 143 | } 144 | nodes { 145 | id 146 | } 147 | } 148 | } 149 | } 150 | closingIssuesReferences(first: 10) { 151 | totalCount 152 | pageInfo { 153 | hasNextPage 154 | endCursor 155 | } 156 | nodes { 157 | id 158 | number 159 | url 160 | title 161 | body 162 | state 163 | labels(first: 10) { 164 | totalCount 165 | pageInfo { 166 | hasNextPage 167 | endCursor 168 | } 169 | nodes { 170 | name 171 | } 172 | } 173 | comments(first: 10) { 174 | totalCount 175 | pageInfo { 176 | hasNextPage 177 | endCursor 178 | } 179 | nodes { 180 | id 181 | author { 182 | login 183 | } 184 | body 185 | createdAt 186 | updatedAt 187 | } 188 | } 189 | } 190 | } 191 | } 192 | } 193 | } -------------------------------------------------------------------------------- /src/swe_care/utils/github_graphql/GetMergedPullRequests.graphql: -------------------------------------------------------------------------------- 1 | query GetMergedPullRequests($owner: String!, $name: String!, $prCursor: String, $maxNumber: Int!) 
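# Pagination note (assumed usage; the query itself does not enforce it): callers
# are expected to page through the merged pull requests by passing the previous
# response's pageInfo.endCursor back in as $prCursor until hasNextPage is false.
# Nested connections (commits, reviews, reviewThreads, closingIssuesReferences)
# only fetch the first 10 nodes here and expose their own pageInfo for follow-up
# pagination.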
{ 2 | repository(owner: $owner, name: $name) { 3 | nameWithOwner 4 | pullRequests(states: [MERGED], first: $maxNumber, after: $prCursor, orderBy: {field: CREATED_AT, direction: DESC}) { 5 | totalCount 6 | pageInfo { 7 | hasNextPage 8 | endCursor 9 | } 10 | nodes { 11 | id 12 | title 13 | body 14 | number 15 | url 16 | author { 17 | login 18 | } 19 | createdAt 20 | mergedAt 21 | mergedBy { 22 | login 23 | } 24 | baseRefOid 25 | baseRefName 26 | headRefOid 27 | headRefName 28 | changedFiles 29 | 30 | labels(first: 10) { 31 | totalCount 32 | pageInfo { 33 | hasNextPage 34 | endCursor 35 | } 36 | nodes { 37 | name 38 | } 39 | } 40 | 41 | commits(first: 10) { 42 | totalCount 43 | pageInfo { 44 | hasNextPage 45 | endCursor 46 | } 47 | nodes { 48 | commit { 49 | oid 50 | message 51 | changedFilesIfAvailable 52 | authoredDate 53 | author { 54 | user { 55 | login 56 | } 57 | } 58 | committedDate 59 | committer { 60 | user { 61 | login 62 | } 63 | } 64 | parents(first: 2) { 65 | nodes { 66 | oid 67 | } 68 | } 69 | } 70 | } 71 | } 72 | 73 | reviews(first: 10) { 74 | totalCount 75 | pageInfo { 76 | hasNextPage 77 | endCursor 78 | } 79 | nodes { 80 | id 81 | author { 82 | login 83 | } 84 | state 85 | body 86 | submittedAt 87 | updatedAt 88 | commit { 89 | oid 90 | } 91 | comments(first: 10) { 92 | totalCount 93 | pageInfo { 94 | hasNextPage 95 | endCursor 96 | } 97 | nodes { 98 | id 99 | author { 100 | login 101 | } 102 | body 103 | createdAt 104 | updatedAt 105 | path 106 | diffHunk 107 | line 108 | startLine 109 | originalLine 110 | originalStartLine 111 | replyTo { 112 | id 113 | } 114 | isMinimized, 115 | minimizedReason, 116 | commit { 117 | oid 118 | } 119 | originalCommit { 120 | oid 121 | } 122 | } 123 | } 124 | } 125 | } 126 | 127 | reviewThreads(first: 10) { 128 | totalCount 129 | pageInfo { 130 | hasNextPage 131 | endCursor 132 | } 133 | nodes { 134 | id 135 | isResolved 136 | isOutdated 137 | isCollapsed 138 | path 139 | startLine 140 | originalLine 141 | diffSide 142 | startDiffSide 143 | resolvedBy { 144 | login 145 | } 146 | comments(first: 10){ 147 | totalCount 148 | pageInfo { 149 | hasNextPage 150 | endCursor 151 | } 152 | nodes { 153 | id 154 | } 155 | } 156 | } 157 | } 158 | 159 | closingIssuesReferences(first: 10) { 160 | totalCount 161 | pageInfo { 162 | hasNextPage 163 | endCursor 164 | } 165 | nodes { 166 | id 167 | number 168 | url 169 | title 170 | body 171 | state 172 | labels(first: 10) { 173 | totalCount 174 | pageInfo { 175 | hasNextPage 176 | endCursor 177 | } 178 | nodes { 179 | name 180 | } 181 | } 182 | comments(first: 10) { 183 | totalCount 184 | pageInfo { 185 | hasNextPage 186 | endCursor 187 | } 188 | nodes { 189 | id 190 | author { 191 | login 192 | } 193 | body 194 | createdAt 195 | updatedAt 196 | } 197 | } 198 | } 199 | } 200 | } 201 | } 202 | } 203 | } -------------------------------------------------------------------------------- /src/swe_care/utils/patch.py: -------------------------------------------------------------------------------- 1 | import unidiff 2 | from loguru import logger 3 | 4 | 5 | def get_changed_file_paths(patch_content: str) -> list[str]: 6 | """ 7 | Extract file paths that are changed in a patch. 
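    Illustrative example (hand-written minimal unified diff, not taken from any
    dataset):

        patch = (
            "--- a/foo.py\n"
            "+++ b/foo.py\n"
            "@@ -1 +1 @@\n"
            "-x = 1\n"
            "+x = 2\n"
        )
        get_changed_file_paths(patch)  # -> ["foo.py"]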
8 | 9 | Args: 10 | patch_content: The patch content as a string 11 | 12 | Returns: 13 | A list of file paths that are modified in the patch 14 | """ 15 | try: 16 | patch_set = unidiff.PatchSet(patch_content) 17 | changed_files = [] 18 | 19 | for patched_file in patch_set: 20 | file_path = patched_file.source_file 21 | if file_path.startswith("a/"): 22 | file_path = file_path[2:] # Remove 'a/' prefix 23 | 24 | if file_path == "/dev/null" or patched_file.is_added_file: 25 | logger.debug(f"Skipping newly created file: {patched_file.path}") 26 | continue 27 | 28 | changed_files.append(file_path) 29 | 30 | return changed_files 31 | except Exception: 32 | # If parsing fails, return empty list 33 | return [] 34 | 35 | 36 | def get_changed_lines_in_file(patch_content: str, file_path: str) -> set[int]: 37 | """ 38 | Extract line numbers that were changed in a specific file from a patch. 39 | 40 | Args: 41 | patch_content: The patch content as a string 42 | file_path: The file path to analyze 43 | 44 | Returns: 45 | A set of line numbers that were modified in the file 46 | """ 47 | try: 48 | patch_set = unidiff.PatchSet(patch_content) 49 | changed_lines = set() 50 | 51 | for patched_file in patch_set: 52 | # Check if this is the file we're interested in 53 | source_file = patched_file.source_file 54 | target_file = patched_file.target_file 55 | 56 | # Remove 'a/' and 'b/' prefixes 57 | if source_file.startswith("a/"): 58 | source_file = source_file[2:] 59 | if target_file.startswith("b/"): 60 | target_file = target_file[2:] 61 | 62 | if source_file == file_path or target_file == file_path: 63 | # Analyze hunks to find changed lines 64 | for hunk in patched_file: 65 | for line in hunk: 66 | if line.is_added or line.is_removed: 67 | # For removed lines, use source line number 68 | # For added lines, use target line number 69 | if line.is_removed and line.source_line_no: 70 | changed_lines.add(line.source_line_no) 71 | elif line.is_added and line.target_line_no: 72 | changed_lines.add(line.target_line_no) 73 | 74 | return changed_lines 75 | except Exception as e: 76 | logger.warning(f"Failed to parse patch for file {file_path}: {e}") 77 | return set() 78 | 79 | 80 | def is_line_changed_in_patch( 81 | patch_content: str, file_path: str, line_number: int 82 | ) -> bool: 83 | """ 84 | Check if a specific line was changed in the patch. 85 | 86 | Args: 87 | patch_content: The patch content as a string 88 | file_path: The file path to check 89 | line_number: The line number to check 90 | 91 | Returns: 92 | True if the line was changed, False otherwise 93 | """ 94 | changed_lines = get_changed_lines_in_file(patch_content, file_path) 95 | return line_number in changed_lines 96 | 97 | 98 | def extract_new_file_content_from_patch( 99 | patch_content: str, file_path: str 100 | ) -> str | None: 101 | """ 102 | Extract the full content of a newly created file from a patch. 
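    Illustrative example (hand-written patch that creates a new file; the path
    is hypothetical):

        patch = (
            "--- /dev/null\n"
            "+++ b/new_module.py\n"
            "@@ -0,0 +1,2 @@\n"
            "+def hello():\n"
            "+    return 'hi'\n"
        )
        extract_new_file_content_from_patch(patch, "new_module.py")
        # -> "def hello():\n    return 'hi'\n"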
103 | 104 | Args: 105 | patch_content: The patch content as a string 106 | file_path: The file path to extract content for 107 | 108 | Returns: 109 | The full content of the newly created file if found, None otherwise 110 | """ 111 | try: 112 | patch_set = unidiff.PatchSet(patch_content) 113 | 114 | for patched_file in patch_set: 115 | # Check if this is the file we're interested in 116 | source_file = patched_file.source_file 117 | target_file = patched_file.target_file 118 | 119 | # Remove 'a/' and 'b/' prefixes 120 | if source_file.startswith("a/"): 121 | source_file = source_file[2:] 122 | if target_file.startswith("b/"): 123 | target_file = target_file[2:] 124 | 125 | # Check if this is a newly created file (source is /dev/null) 126 | if target_file == file_path and ( 127 | patched_file.source_file == "/dev/null" or patched_file.is_added_file 128 | ): 129 | # Extract all added lines to reconstruct the file content 130 | file_lines = [] 131 | for hunk in patched_file: 132 | for line in hunk: 133 | if line.is_added: 134 | # Remove the '+' prefix and keep the content 135 | content = line.value 136 | # line.value includes the newline character at the end for most lines 137 | file_lines.append(content) 138 | 139 | # Join all lines to form the complete file content 140 | return "".join(file_lines) 141 | 142 | return None 143 | except Exception as e: 144 | logger.warning( 145 | f"Failed to extract new file content for {file_path} from patch: {e}" 146 | ) 147 | return None 148 | -------------------------------------------------------------------------------- /src/swe_care/schema/dataset.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Any 3 | 4 | from dataclasses_json import dataclass_json 5 | 6 | 7 | @dataclass_json 8 | @dataclass 9 | class ResolvedIssue: 10 | """Schema for resolved issue instances.""" 11 | 12 | number: int 13 | title: str 14 | body: str 15 | 16 | 17 | @dataclass_json 18 | @dataclass 19 | class IssueResolvingTaskMetadata: 20 | """Schema for metadata instances.""" 21 | 22 | problem_domain: str | None 23 | """The problem domain""" 24 | difficulty: str | None 25 | """The task difficulty of the pull request""" 26 | 27 | 28 | @dataclass_json 29 | @dataclass 30 | class IssueResolvingTaskInstance: 31 | """Schema for Issue Resolving task instances.""" 32 | 33 | instance_id: str 34 | repo: str 35 | language: str 36 | pull_number: int 37 | title: str 38 | body: str 39 | created_at: str 40 | problem_statement: str 41 | hints_text: str 42 | resolved_issues: list[ResolvedIssue] 43 | base_commit: str 44 | patch: str 45 | test_patch: str 46 | env_setup_config: dict[str, Any] 47 | FAIL_TO_PASS: list[str] 48 | PASS_TO_PASS: list[str] 49 | version: str 50 | metadata: IssueResolvingTaskMetadata 51 | 52 | 53 | @dataclass_json 54 | @dataclass 55 | class CodeReviewTaskMetadata: 56 | """Schema for code review metadata instances.""" 57 | 58 | problem_domain: str | None 59 | """The problem domain""" 60 | difficulty: str | None 61 | """The task difficulty of the pull request""" 62 | estimated_review_effort: int | None 63 | """The estimated review effort of the pull request between 1 and 5""" 64 | 65 | 66 | @dataclass_json 67 | @dataclass 68 | class ReferenceReviewComment: 69 | """Schema for reference review comment instances.""" 70 | 71 | text: str 72 | """The text of the review comment""" 73 | path: str 74 | """The path of the review comment""" 75 | diff_hunk: str | None 76 | """The diff hunk of the 
review comment""" 77 | line: int | None 78 | """The line number of the review comment""" 79 | start_line: int | None 80 | """The start line number of the review comment""" 81 | original_line: int | None 82 | """The original line number of the review comment""" 83 | original_start_line: int | None 84 | """The original start line number of the review comment""" 85 | 86 | 87 | @dataclass_json 88 | @dataclass 89 | class CommitToReview: 90 | """Schema for commit to review instances.""" 91 | 92 | head_commit: str 93 | head_commit_message: str 94 | patch_to_review: str 95 | 96 | 97 | @dataclass_json 98 | @dataclass 99 | class CodeReviewTaskInstance: 100 | """Schema for Code Review task instances.""" 101 | 102 | instance_id: str 103 | """A formatted instance identifier, usually as repo_owner__repo_name-PR-number@commit_sha_short""" 104 | repo: str 105 | """The repository owner/name identifier from GitHub""" 106 | language: str 107 | """The main programming language of the repo""" 108 | pull_number: int 109 | """Number of PR this task instance is from""" 110 | title: str 111 | """The PR title""" 112 | body: str 113 | """The PR body""" 114 | created_at: str 115 | """The creation date of the pull request""" 116 | problem_statement: str 117 | """The issue(s) title and body""" 118 | hints_text: str 119 | """Comments made on the issue(s) prior to the creation of the commit to review of the solution PR.""" 120 | resolved_issues: list[ResolvedIssue] 121 | """The resolved issues of the pull request""" 122 | base_commit: str 123 | """The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.""" 124 | commit_to_review: CommitToReview 125 | """The commit to review of the pull request""" 126 | reference_review_comments: list[ReferenceReviewComment] 127 | """The reference review comments of the commit to review""" 128 | merged_commit: str 129 | """The merged commit of the pull request""" 130 | merged_patch: str 131 | """The gold patch, the patch generated by the PR, that resolved the issue""" 132 | metadata: CodeReviewTaskMetadata 133 | 134 | @classmethod 135 | def generate_instance_id(cls, repo: str, pull_number: int, commit: str) -> str: 136 | """Generate an instance ID from repo, pull number, and commit. 137 | 138 | Args: 139 | repo: The repository owner/name identifier from GitHub 140 | pull_number: Number of PR this task instance is from 141 | commit: The commit hash (will be shortened to first 7 characters) 142 | 143 | Returns: 144 | A formatted instance identifier as repo_owner__repo_name-PR-number@commit_sha_short 145 | """ 146 | repo_formatted = repo.replace("/", "__") 147 | commit_short = commit[:7] 148 | return f"{repo_formatted}-{pull_number}@{commit_short}" 149 | 150 | @classmethod 151 | def parse_instance_id(cls, instance_id: str) -> tuple[str, int, str]: 152 | """Parse an instance ID into repo, pull number, and commit. 153 | 154 | Args: 155 | instance_id: The instance ID to parse 156 | 157 | Returns: 158 | A tuple of repo, pull number, and commit 159 | 160 | Raises: 161 | ValueError: If the instance ID format is invalid 162 | """ 163 | if "@" not in instance_id: 164 | raise ValueError( 165 | f"Invalid instance ID format: {instance_id}. Expected format: repo_owner__repo_name-PR-number@commit_sha_short" 166 | ) 167 | 168 | repo_pr_part, commit = instance_id.split("@", 1) 169 | 170 | if "-" not in repo_pr_part: 171 | raise ValueError( 172 | f"Invalid instance ID format: {instance_id}. 
Expected format: repo_owner__repo_name-PR-number@commit_sha_short" 173 | ) 174 | 175 | repo_formatted, pull_number_str = repo_pr_part.rsplit("-", 1) 176 | 177 | try: 178 | pull_number = int(pull_number_str) 179 | except ValueError: 180 | raise ValueError(f"Invalid pull number in instance ID: {instance_id}") 181 | 182 | repo = repo_formatted.replace("__", "/") 183 | 184 | return repo, pull_number, commit 185 | -------------------------------------------------------------------------------- /src/swe_care/utils/file_source_retrieval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for retrieving relevant files based on different strategies. 3 | """ 4 | 5 | import string 6 | from pathlib import Path 7 | from typing import Literal, Optional 8 | 9 | import nltk 10 | from loguru import logger 11 | from nltk.tokenize import word_tokenize 12 | from rank_bm25 import BM25Okapi 13 | 14 | from swe_care.schema.collect import LabeledReviewComment 15 | from swe_care.schema.dataset import ReferenceReviewComment 16 | from swe_care.utils.extract_prs_data import ( 17 | fetch_repo_file_content, 18 | fetch_repo_files_content_by_retrieval, 19 | ) 20 | from swe_care.utils.patch import get_changed_file_paths 21 | 22 | try: 23 | nltk.download("punkt_tab") 24 | except Exception: 25 | logger.error( 26 | "Failed to download punkt_tab, maybe you need to use VPN to download it?" 27 | ) 28 | raise 29 | 30 | 31 | def get_relevant_files( 32 | review_comment: ReferenceReviewComment | LabeledReviewComment, 33 | file_source: Literal[ 34 | "none", 35 | "base_changed_files", 36 | "reviewed_file", 37 | "retrieved_base_changed_files", 38 | "retrieved_all_files", 39 | ], 40 | repo: str, 41 | base_commit: str, 42 | patch_to_review: str, 43 | tokens: Optional[list[str]] = None, 44 | retrieval_max_files: int = 5, 45 | retrieval_output_dir: Optional[Path | str] = None, 46 | changed_files: Optional[dict[str, str]] = None, 47 | ) -> dict[str, str]: 48 | """Get relevant files based on file_source strategy. 
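    Illustrative call (hypothetical repo, commit, and token values, shown only to
    sketch the expected input/output shapes):

        files = get_relevant_files(
            review_comment=comment,          # a ReferenceReviewComment
            file_source="reviewed_file",
            repo="octocat/hello-world",
            base_commit="abc1234",
            patch_to_review=patch_to_review,
            tokens=["ghp_..."],
        )
        # -> {"path/of/reviewed_file.py": "<file content at base_commit>"}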
49 | 50 | Args: 51 | review_comment: The review comment (can be ReferenceReviewComment or LabeledReviewComment) 52 | file_source: Source for file content 53 | repo: Repository in format 'owner/name' 54 | base_commit: Base commit SHA 55 | patch_to_review: Patch content to review 56 | tokens: GitHub API tokens 57 | retrieval_max_files: Maximum number of files for retrieval 58 | retrieval_output_dir: Output directory for retrieval operations 59 | changed_files: Pre-computed changed files (optional, to avoid re-fetching) 60 | 61 | Returns: 62 | Dictionary mapping file paths to their contents 63 | """ 64 | if file_source == "none": 65 | return {} 66 | 67 | if file_source == "reviewed_file" and review_comment.path: 68 | try: 69 | content = fetch_repo_file_content( 70 | repo, base_commit, review_comment.path, tokens, patch_to_review 71 | ) 72 | return {review_comment.path: content} 73 | except Exception as e: 74 | logger.warning(f"Failed to fetch content for {review_comment.path}: {e}") 75 | return {} 76 | 77 | elif file_source == "base_changed_files": 78 | if changed_files is not None: 79 | return changed_files 80 | else: 81 | return get_changed_files_in_patch( 82 | repo, base_commit, patch_to_review, tokens 83 | ) 84 | 85 | elif file_source == "retrieved_base_changed_files": 86 | if not review_comment.diff_hunk: 87 | return {} 88 | 89 | # Get changed files first 90 | if changed_files is None: 91 | changed_files = get_changed_files_in_patch( 92 | repo, base_commit, patch_to_review, tokens 93 | ) 94 | 95 | if not changed_files: 96 | return {} 97 | 98 | # Use BM25 to retrieve relevant files 99 | def preprocess(text): 100 | text = text.lower() 101 | text = text.translate(str.maketrans("", "", string.punctuation)) 102 | return word_tokenize(text) 103 | 104 | try: 105 | query_tokens = preprocess(review_comment.diff_hunk) 106 | bm25 = BM25Okapi( 107 | [preprocess(content) for content in changed_files.values()] 108 | ) 109 | doc_scores = bm25.get_scores(query_tokens) 110 | 111 | top_indices = sorted( 112 | range(len(doc_scores)), 113 | key=lambda i: doc_scores[i], 114 | reverse=True, 115 | )[: min(retrieval_max_files, len(changed_files))] 116 | 117 | file_paths = list(changed_files.keys()) 118 | relevant_files = {} 119 | for idx in top_indices: 120 | file_path = file_paths[idx] 121 | relevant_files[file_path] = changed_files[file_path] 122 | return relevant_files 123 | except Exception as e: 124 | logger.warning(f"Failed to retrieve content for diff hunk: {e}") 125 | return {} 126 | 127 | elif file_source == "retrieved_all_files": 128 | if not review_comment.diff_hunk: 129 | return {} 130 | 131 | if retrieval_output_dir is None: 132 | raise ValueError( 133 | "retrieval_output_dir is required when file_source is 'retrieved_all_files'" 134 | ) 135 | 136 | # Convert Path to str if needed 137 | if isinstance(retrieval_output_dir, Path): 138 | retrieval_output_dir = str(retrieval_output_dir) 139 | 140 | return fetch_repo_files_content_by_retrieval( 141 | repo=repo, 142 | commit=base_commit, 143 | query=review_comment.diff_hunk, 144 | retrieval_output_dir=retrieval_output_dir, 145 | tokens=tokens, 146 | max_files=retrieval_max_files, 147 | ) 148 | 149 | return {} 150 | 151 | 152 | def get_changed_files_in_patch( 153 | repo: str, 154 | base_commit: str, 155 | patch_to_review: str, 156 | tokens: list[str] | None = None, 157 | ) -> dict[str, str]: 158 | """ 159 | Get file path and file content that are changed in the patch. 
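    Illustrative behaviour (hypothetical paths): for a patch touching
    ``src/app.py`` and ``README.md``, the result is a mapping such as
    ``{"src/app.py": "<content at base_commit>", "README.md": "<content at base_commit>"}``;
    files whose content could not be fetched are kept with an empty string as value.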
160 | """ 161 | changed_files = {} 162 | 163 | logger.debug(f"Getting changed file paths from {base_commit} to commit_to_review") 164 | changed_file_paths = get_changed_file_paths(patch_to_review) 165 | logger.debug(f"Changed file paths: {changed_file_paths}") 166 | 167 | # Fetch file contents 168 | for file_path in changed_file_paths: 169 | try: 170 | logger.debug(f"Fetching content for {file_path}") 171 | content = fetch_repo_file_content( 172 | repo, base_commit, file_path, tokens, patch_to_review 173 | ) 174 | changed_files[file_path] = content 175 | except Exception as e: 176 | logger.warning(f"Failed to fetch content for {file_path}: {e}") 177 | changed_files[file_path] = "" 178 | 179 | # Filter out files without content and return only the files we fetched 180 | result = { 181 | path: content for path, content in changed_files.items() if content is not None 182 | } 183 | 184 | logger.info(f"Retrieved {len(result)} changed files") 185 | return result 186 | -------------------------------------------------------------------------------- /src/swe_care/utils/load.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from datasets import load_dataset 4 | from loguru import logger 5 | 6 | from swe_care.schema.dataset import CodeReviewTaskInstance 7 | from swe_care.schema.evaluation import CodeReviewEvaluationResult, CodeReviewPrediction 8 | from swe_care.schema.inference import CodeReviewInferenceInstance 9 | 10 | 11 | def load_code_review_dataset( 12 | dataset_name_or_path: Path | str = "inclusionAI/SWE-CARE", 13 | ) -> list[CodeReviewTaskInstance]: 14 | """Load the code review dataset instances from a JSONL file or Hugging Face dataset. 15 | 16 | Args: 17 | dataset_name_or_path: Either a file path to a local JSONL file, or a Hugging Face dataset name. 18 | Defaults to "inclusionAI/SWE-CARE". 
19 | 20 | Returns: 21 | List of CodeReviewTaskInstance objects 22 | 23 | Raises: 24 | FileNotFoundError: If a file path is provided but the file doesn't exist 25 | Exception: If there's an error parsing the file or loading from Hugging Face 26 | """ 27 | # Check if dataset_name_or_path is a file path 28 | if isinstance(dataset_name_or_path, str): 29 | path = Path(dataset_name_or_path) 30 | else: 31 | path = dataset_name_or_path 32 | 33 | # If it's an existing file, load from file 34 | if path.exists(): 35 | logger.info("Loading dataset instances from file...") 36 | 37 | dataset_instances: list[CodeReviewTaskInstance] = [] 38 | 39 | with open(path, "r") as f: 40 | for line_num, line in enumerate(f, 1): 41 | try: 42 | instance = CodeReviewTaskInstance.from_json(line.strip()) 43 | dataset_instances.append(instance) 44 | except Exception as e: 45 | logger.error(f"Error processing line {line_num}: {e}") 46 | raise e 47 | 48 | logger.success(f"Loaded {len(dataset_instances)} dataset instances from file") 49 | return dataset_instances 50 | 51 | # Otherwise, load from Hugging Face 52 | else: 53 | logger.info(f"Loading dataset from Hugging Face: {dataset_name_or_path}") 54 | 55 | # Load the test split from Hugging Face 56 | dataset = load_dataset( 57 | str(dataset_name_or_path), split="test", revision="0.2.0" 58 | ) 59 | 60 | # Convert Hugging Face dataset to list of CodeReviewTaskInstance 61 | dataset_instances: list[CodeReviewTaskInstance] = [] 62 | 63 | logger.info("Processing test split") 64 | for idx, item in enumerate(dataset): 65 | try: 66 | # Convert the Hugging Face dataset item to CodeReviewTaskInstance 67 | # Using from_dict method provided by dataclass_json 68 | instance = CodeReviewTaskInstance.from_dict(item) 69 | dataset_instances.append(instance) 70 | except Exception as e: 71 | logger.error(f"Error converting item {idx} in test split: {e}") 72 | raise e 73 | 74 | logger.success( 75 | f"Loaded {len(dataset_instances)} dataset instances from Hugging Face test split" 76 | ) 77 | return dataset_instances 78 | 79 | 80 | def load_code_review_predictions( 81 | predictions_path: Path | str, 82 | ) -> list[CodeReviewPrediction]: 83 | """Load the code review predictions from the JSONL file.""" 84 | if isinstance(predictions_path, str): 85 | predictions_path = Path(predictions_path) 86 | logger.info("Loading predictions...") 87 | 88 | if not predictions_path.exists(): 89 | raise FileNotFoundError(f"Predictions file not found: {predictions_path}") 90 | 91 | predictions: list[CodeReviewPrediction] = [] 92 | 93 | with open(predictions_path, "r") as f: 94 | for line_num, line in enumerate(f, 1): 95 | try: 96 | prediction = CodeReviewPrediction.from_json(line.strip()) 97 | predictions.append(prediction) 98 | except Exception as e: 99 | logger.error(f"Error processing line {line_num}: {e}") 100 | raise e 101 | logger.success(f"Loaded {len(predictions)} predictions") 102 | return predictions 103 | 104 | 105 | def load_code_review_text( 106 | dataset_file: Path | str, 107 | ) -> list[CodeReviewInferenceInstance]: 108 | """Load the code review text dataset instances from the JSONL file. 
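    Example (illustrative; the file name is hypothetical):

        instances = load_code_review_text("results/code_review_text/oracle.jsonl")
        print(instances[0].instance_id, len(instances[0].text))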
109 | 110 | Args: 111 | dataset_file: Path to the input JSONL file containing CodeReviewInferenceInstance objects 112 | 113 | Returns: 114 | List of CodeReviewInferenceInstance objects 115 | 116 | Raises: 117 | FileNotFoundError: If the dataset file doesn't exist 118 | Exception: If there's an error parsing the file 119 | """ 120 | if isinstance(dataset_file, str): 121 | dataset_file = Path(dataset_file) 122 | logger.info("Loading inference text dataset instances...") 123 | 124 | if not dataset_file.exists(): 125 | raise FileNotFoundError(f"Dataset file not found: {dataset_file}") 126 | 127 | dataset_instances: list[CodeReviewInferenceInstance] = [] 128 | 129 | with open(dataset_file, "r") as f: 130 | for line_num, line in enumerate(f, 1): 131 | try: 132 | instance = CodeReviewInferenceInstance.from_json(line.strip()) 133 | dataset_instances.append(instance) 134 | except Exception as e: 135 | logger.error(f"Error processing line {line_num}: {e}") 136 | raise e 137 | 138 | logger.success(f"Loaded {len(dataset_instances)} inference text dataset instances") 139 | return dataset_instances 140 | 141 | 142 | def load_code_review_eval_result( 143 | eval_result_file: Path | str, 144 | ) -> list[CodeReviewEvaluationResult]: 145 | """Load the code review evaluation results from the JSONL file. 146 | 147 | Args: 148 | eval_result_file: Path to the evaluation result JSONL file 149 | 150 | Returns: 151 | List of CodeReviewEvaluationResult objects 152 | 153 | Raises: 154 | FileNotFoundError: If the evaluation result file doesn't exist 155 | Exception: If there's an error parsing the file 156 | """ 157 | if isinstance(eval_result_file, str): 158 | eval_result_file = Path(eval_result_file) 159 | logger.info(f"Loading evaluation results from {eval_result_file}...") 160 | 161 | if not eval_result_file.exists(): 162 | raise FileNotFoundError(f"Evaluation result file not found: {eval_result_file}") 163 | 164 | eval_results: list[CodeReviewEvaluationResult] = [] 165 | 166 | with open(eval_result_file, "r") as f: 167 | for line_num, line in enumerate(f, 1): 168 | try: 169 | result = CodeReviewEvaluationResult.from_json(line.strip()) 170 | eval_results.append(result) 171 | except Exception as e: 172 | logger.error(f"Error processing line {line_num}: {e}") 173 | raise e 174 | 175 | logger.success(f"Loaded {len(eval_results)} evaluation results") 176 | return eval_results 177 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # SWE-CARE Evaluation Pipeline Scripts 2 | 3 | This directory contains scripts to automate the SWE-CARE evaluation pipeline and analyze results. 4 | 5 | ## `run_eval_pipeline.py` 6 | 7 | A bootstrap script that runs the complete evaluation pipeline: 8 | 9 | 1. **Generate text datasets** from collected SWE-CARE data 10 | 2. **Run LLM inference** on code review tasks 11 | 3. 
**Evaluate predictions** using LLM evaluator (fixed to OpenAI o3) 12 | 13 | ### Prerequisites 14 | 15 | Set up the required environment variables: 16 | 17 | ```bash 18 | # Required 19 | export OPENAI_API_KEY="your-openai-api-key" 20 | export LLM_EVALUATOR_OPENAI_API_KEY="your-o3-evaluation-api-key" 21 | 22 | # Optional 23 | export ANTHROPIC_API_KEY="your-anthropic-api-key" 24 | export OPENAI_BASE_URL="https://your-custom-openai-endpoint" 25 | export LLM_EVALUATOR_OPENAI_BASE_URL="https://your-custom-o3-endpoint" 26 | ``` 27 | 28 | ### Usage 29 | 30 | Basic usage with no file context: 31 | 32 | ```bash 33 | # Using default Hugging Face dataset 34 | python scripts/run_eval_pipeline.py \ 35 | --output-dir results/pipeline_output \ 36 | --model gpt-4o \ 37 | --model-provider openai \ 38 | --file-source none 39 | 40 | # Using local dataset file 41 | python scripts/run_eval_pipeline.py \ 42 | --dataset-name-or-path results/dataset/code_review_task_instances.jsonl \ 43 | --output-dir results/pipeline_output \ 44 | --model gpt-4o \ 45 | --model-provider openai \ 46 | --file-source none 47 | ``` 48 | 49 | With oracle file source and custom model args: 50 | 51 | ```bash 52 | python scripts/run_eval_pipeline.py \ 53 | --dataset-name-or-path results/dataset/code_review_task_instances.jsonl \ 54 | --output-dir results/pipeline_output \ 55 | --model claude-3-5-sonnet-20241022 \ 56 | --model-provider anthropic \ 57 | --model-args "temperature=0.5,max_tokens=4096" \ 58 | --file-source oracle \ 59 | --github-tokens "token1" "token2" 60 | ``` 61 | 62 | With BM25 retrieval: 63 | 64 | ```bash 65 | python scripts/run_eval_pipeline.py \ 66 | --dataset-name-or-path results/dataset/code_review_task_instances.jsonl \ 67 | --output-dir results/pipeline_output \ 68 | --model "models/gemini-2.5-pro" \ 69 | --model-provider openai \ 70 | --file-source bm25 \ 71 | --k 10 \ 72 | --retrieval-output-dir results/retrieval_output 73 | ``` 74 | 75 | Using Tree-sitter skeletons for Python files (works with any file-source): 76 | 77 | ```bash 78 | python scripts/run_eval_pipeline.py \ 79 | --dataset-name-or-path results/dataset/code_review_task_instances.jsonl \ 80 | --output-dir results/pipeline_output \ 81 | --model gpt-4o \ 82 | --model-provider openai \ 83 | --file-source oracle \ 84 | --use-skeleton 85 | ``` 86 | 87 | ### Arguments 88 | 89 | **Required:** 90 | 91 | - `--output-dir`: Directory to save all pipeline outputs 92 | - `--model`: Model name to use for inference 93 | - `--model-provider`: Model provider (openai, anthropic, deepseek, qwen, moonshot, gemini) 94 | 95 | **Optional:** 96 | 97 | - `--dataset-name-or-path`: Path to the input SWE-CARE dataset file or Hugging Face dataset name (default: inclusionAI/SWE-CARE) 98 | - `--model-args`: Comma-separated model arguments (e.g., 'temperature=0.7,top_p=0.9') 99 | - `--file-source`: Source strategy for files (none, oracle, bm25, all) 100 | - `--k`: Maximum number of files to use (required for bm25/all) 101 | - `--retrieval-output-dir`: Output directory for retrieval operations (required for bm25/all) 102 | - `--github-tokens`: GitHub API token(s) for fetching data 103 | - `--jobs`: Number of parallel jobs (default: 2) 104 | - `--skip-existing`: Skip instances that already have predictions 105 | 106 | ### Output Structure 107 | 108 | The script creates the following directory structure: 109 | 110 | ``` 111 | 112 | / 113 | ├── pipeline_config_YYYYMMDD_HHMMSS.json # Complete pipeline configuration (timestamped) 114 | ├── pipeline_YYYYMMDD_HHMMSS.log # Detailed execution 
log (timestamped) 115 | ├── code_review_text/ # Generated text datasets 116 | │ └── __[**skeleton].jsonl 117 | │ └── ****k[**skeleton].jsonl # For bm25/all with k parameter 118 | ├── predictions/ # Model predictions organized by model 119 | │ └── / # Model-specific subdirectory 120 | │ └── **.jsonl 121 | └── evaluation/ # Evaluation results organized by model 122 | └── / # Model-specific subdirectory 123 | └── **_report_YYYYMMDD_HHMMSS.jsonl 124 | 125 | ``` 126 | 127 | ### Notes 128 | 129 | - The script automatically handles model names with slashes (e.g., `models/gemini-2.5-pro`) 130 | - Model predictions and evaluation results are organized in subdirectories by model name for better organization 131 | - LLM evaluation is fixed to use OpenAI o3 model with `temperature=1` 132 | - Use separate API keys for inference and evaluation via environment variables 133 | - All intermediate results are saved for debugging and analysis 134 | - The pipeline configuration and logs are timestamped for reproducibility 135 | - Evaluation report detection ensures only reports generated by the current run are used 136 | - Timestamps follow the format YYYYMMDD_HHMMSS for easy sorting and identification 137 | 138 | ## `eval_report.py` 139 | 140 | A comprehensive analysis script that generates detailed evaluation reports from pipeline results: 141 | 142 | 1. **Collects evaluation results** from multiple models and settings 143 | 2. **Aggregates performance metrics** across different dimensions 144 | 3. **Handles missing instances** by assigning score 0 for fair comparison 145 | 4. **Generates rankings** of model-setting configurations 146 | 147 | ### Prerequisites 148 | 149 | Ensure you have run the evaluation pipeline first using `run_eval_pipeline.py`. 150 | 151 | ### Usage 152 | 153 | Basic usage: 154 | 155 | ```bash 156 | # Using local dataset file 157 | python scripts/eval_report.py \ 158 | --dataset-name-or-path results/dataset/code_review_task_instances.jsonl \ 159 | --eval-output-dir results/pipeline_output/evaluation \ 160 | --report-output-file results/evaluation_report.json 161 | 162 | # Using default Hugging Face dataset 163 | python scripts/eval_report.py \ 164 | --eval-output-dir results/pipeline_output/evaluation \ 165 | --report-output-file results/evaluation_report.json 166 | ``` 167 | 168 | ### Arguments 169 | 170 | **Required:** 171 | 172 | - `--eval-output-dir`: Directory containing evaluation results (organized by model) 173 | - `--report-output-file`: Path for the output JSON report file 174 | 175 | **Optional:** 176 | 177 | - `--dataset-name-or-path`: Path to the dataset file or Hugging Face dataset name (default: inclusionAI/SWE-CARE) 178 | 179 | ### Notes 180 | 181 | - The script expects evaluation results to be organized in subdirectories by model name 182 | - Filename pattern must match: `{dataset}_____report_YYYYMMDD_HHMMSS.jsonl` 183 | - For bm25 settings: `{dataset}__bm25__k___report_YYYYMMDD_HHMMSS.jsonl` 184 | - All scores are averaged including zeros for missing instances to ensure fair comparison 185 | -------------------------------------------------------------------------------- /src/swe_care/harness/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from pathlib import Path 4 | from typing import Any 5 | 6 | from loguru import logger 7 | 8 | import swe_care.harness.code_review_eval 9 | from swe_care.harness.code_review_eval import EvaluatorType, code_review_eval 10 | from 
swe_care.utils.llm_models import ( 11 | get_available_models_and_providers, 12 | parse_model_args, 13 | ) 14 | 15 | 16 | def parse_evaluator_args(evaluator_args_str: str | None) -> dict[str, dict[str, Any]]: 17 | """Parse evaluator args string into a dictionary. 18 | 19 | Args: 20 | evaluator_args_str: String in format 'evaluator1:arg1=value1,arg2=value2;evaluator2:arg1=value1' 21 | 22 | Returns: 23 | Dictionary mapping evaluator types to their kwargs 24 | """ 25 | if not evaluator_args_str: 26 | return {} 27 | 28 | result = {} 29 | 30 | # Split by semicolon to get each evaluator's args 31 | evaluator_parts = evaluator_args_str.split(";") 32 | 33 | for part in evaluator_parts: 34 | if ":" not in part: 35 | continue 36 | 37 | evaluator_type, args_part = part.split(":", 1) 38 | evaluator_type = evaluator_type.strip() 39 | 40 | # Parse the args part using the existing parse_model_args function 41 | kwargs = parse_model_args(args_part) 42 | result[evaluator_type] = kwargs 43 | 44 | return result 45 | 46 | 47 | # Mapping of subcommands to their function names 48 | SUBCOMMAND_MAP = { 49 | "code_review_eval": { 50 | "function": code_review_eval, 51 | "help": swe_care.harness.code_review_eval.__doc__, 52 | }, 53 | } 54 | 55 | 56 | def create_global_parser(): 57 | """Create a parser with global arguments that can be used as a parent parser.""" 58 | global_parser = argparse.ArgumentParser(add_help=False) 59 | global_parser.add_argument( 60 | "--output-dir", 61 | type=Path, 62 | required=True, 63 | help="Path to output directory", 64 | ) 65 | return global_parser 66 | 67 | 68 | def get_args(): 69 | # Parse command line manually to handle flexible argument order 70 | args = sys.argv[1:] 71 | 72 | # Find the subcommand 73 | subcommands = list(SUBCOMMAND_MAP.keys()) 74 | subcommand = None 75 | subcommand_index = None 76 | 77 | for i, arg in enumerate(args): 78 | if arg in subcommands: 79 | subcommand = arg 80 | subcommand_index = i 81 | break 82 | 83 | # Create global parser 84 | global_parser = create_global_parser() 85 | 86 | if subcommand is None: 87 | # No subcommand found, use normal argparse 88 | parser = argparse.ArgumentParser( 89 | prog="swe_care.harness", 90 | description="Evaluation tools for SWE-CARE", 91 | parents=[global_parser], 92 | ) 93 | 94 | subparsers = parser.add_subparsers(dest="command", help="Available commands") 95 | for cmd, info in SUBCOMMAND_MAP.items(): 96 | subparsers.add_parser(cmd, help=info["help"]) 97 | 98 | return parser.parse_args(args) 99 | 100 | # Create the appropriate subcommand parser with global parser as parent 101 | match subcommand: 102 | case "code_review_eval": 103 | sub_parser = argparse.ArgumentParser( 104 | prog=f"swe_care.harness {subcommand}", 105 | parents=[global_parser], 106 | description=SUBCOMMAND_MAP[subcommand]["help"], 107 | ) 108 | sub_parser.add_argument( 109 | "--dataset-name-or-path", 110 | type=str, 111 | required=False, 112 | default="inclusionAI/SWE-CARE", 113 | help="Path to the dataset file or Hugging Face dataset name (default: inclusionAI/SWE-CARE)", 114 | ) 115 | sub_parser.add_argument( 116 | "--predictions-path", 117 | type=Path, 118 | required=True, 119 | help="Path to the predictions file or directory containing predictions", 120 | ) 121 | sub_parser.add_argument( 122 | "--evaluator", 123 | type=EvaluatorType, 124 | nargs="+", 125 | required=True, 126 | choices=[e.value for e in EvaluatorType], 127 | help="Evaluator type to use", 128 | ) 129 | 130 | available_providers, available_models = get_available_models_and_providers() 131 
| 132 | sub_parser.add_argument( 133 | "--model", 134 | type=str, 135 | required=False, 136 | help=f"Model name to use for LLM evaluation. Available models: {', '.join(available_models)}", 137 | ) 138 | sub_parser.add_argument( 139 | "--model-provider", 140 | type=str, 141 | required=False, 142 | choices=available_providers, 143 | help=f"Model provider for LLM evaluation. Available providers: {', '.join(available_providers)}", 144 | ) 145 | sub_parser.add_argument( 146 | "--model-args", 147 | type=str, 148 | required=False, 149 | default=None, 150 | help="Comma-separated model arguments for LLM evaluation (e.g., 'temperature=0.7,top_p=0.9')", 151 | ) 152 | sub_parser.add_argument( 153 | "--evaluator-args", 154 | type=str, 155 | required=False, 156 | default=None, 157 | help="Evaluator-specific arguments in format 'evaluator1:arg1=value1,arg2=value2;evaluator2:arg1=value1'", 158 | ) 159 | sub_parser.add_argument( 160 | "--jobs", 161 | type=int, 162 | default=2, 163 | help="Number of parallel jobs to run (default: 2)", 164 | ) 165 | 166 | # Parse all arguments with the subcommand parser 167 | # This will include both global and subcommand-specific arguments 168 | # Remove the subcommand itself from args 169 | args_without_subcommand = args[:subcommand_index] + args[subcommand_index + 1 :] 170 | final_namespace = sub_parser.parse_args(args_without_subcommand) 171 | final_namespace.command = subcommand 172 | 173 | return final_namespace 174 | 175 | 176 | def main(): 177 | args = get_args() 178 | 179 | if args.command in SUBCOMMAND_MAP: 180 | # Get the function from the mapping 181 | cmd_info = SUBCOMMAND_MAP[args.command] 182 | function = cmd_info["function"] 183 | 184 | # Prepare common arguments 185 | common_kwargs = {"output_dir": args.output_dir} 186 | 187 | # Add specific arguments based on subcommand 188 | match args.command: 189 | case "code_review_eval": 190 | # Parse evaluator args 191 | evaluator_kwargs = parse_evaluator_args(args.evaluator_args) 192 | 193 | function( 194 | dataset_name_or_path=args.dataset_name_or_path, 195 | predictions_path=args.predictions_path, 196 | evaluator_types=args.evaluator, 197 | model=args.model, 198 | model_provider=args.model_provider, 199 | model_args=args.model_args, 200 | evaluator_kwargs=evaluator_kwargs, 201 | jobs=args.jobs, 202 | **common_kwargs, 203 | ) 204 | else: 205 | logger.info("Please specify a command. 
Use --help for available commands.") 206 | 207 | 208 | if __name__ == "__main__": 209 | main() 210 | -------------------------------------------------------------------------------- /src/swe_care/utils/llm_models/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from loguru import logger 4 | 5 | from swe_care.utils.llm_models.clients import ( 6 | AnthropicClient, 7 | BaseModelClient, 8 | DeepSeekClient, 9 | GeminiClient, 10 | MoonshotClient, 11 | OpenAIClient, 12 | QwenClient, 13 | ) 14 | 15 | # Map of available LLM clients cited from https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json 16 | LLM_CLIENT_MAP = { 17 | "openai": { 18 | "client_class": OpenAIClient, 19 | "models": [ 20 | {"name": "gpt-4o", "max_input_tokens": 128000}, 21 | {"name": "gpt-4o-mini", "max_input_tokens": 128000}, 22 | {"name": "gpt-4.1", "max_input_tokens": 1047576}, 23 | {"name": "gpt-4.5-preview", "max_input_tokens": 128000}, 24 | {"name": "gpt-5", "max_input_tokens": 128000}, 25 | {"name": "gpt-5-chat", "max_input_tokens": 128000}, 26 | {"name": "o1", "max_input_tokens": 200000}, 27 | {"name": "o1-mini", "max_input_tokens": 128000}, 28 | {"name": "o3", "max_input_tokens": 200000}, 29 | {"name": "o3-mini", "max_input_tokens": 200000}, 30 | ], 31 | }, 32 | "anthropic": { 33 | "client_class": AnthropicClient, 34 | "models": [ 35 | {"name": "claude-opus-4", "max_input_tokens": 200000}, 36 | {"name": "claude-sonnet-4", "max_input_tokens": 200000}, 37 | {"name": "claude-3-7-sonnet", "max_input_tokens": 200000}, 38 | ], 39 | }, 40 | "deepseek": { 41 | "client_class": DeepSeekClient, 42 | "models": [ 43 | {"name": "deepseek-chat", "max_input_tokens": 128000}, # DeepSeek-V3.1 44 | {"name": "DeepSeek-V3.1", "max_input_tokens": 128000}, 45 | { 46 | "name": "deepseek-reasoner", 47 | "max_input_tokens": 128000, 48 | }, # DeepSeek-V3.1 thinking 49 | ], 50 | }, 51 | "qwen": { 52 | "client_class": QwenClient, 53 | "models": [ 54 | {"name": "qwen3-32b", "max_input_tokens": 128000}, 55 | {"name": "qwen3-30b-a3b", "max_input_tokens": 128000}, 56 | {"name": "qwen3-235b-a22b", "max_input_tokens": 128000}, 57 | ], 58 | }, 59 | "moonshot": { 60 | "client_class": MoonshotClient, 61 | "models": [ 62 | {"name": "kimi-k2-0711-preview", "max_input_tokens": 131072}, 63 | {"name": "kimi-k2-0905-preview", "max_input_tokens": 131072}, 64 | ], 65 | }, 66 | "gemini": { 67 | "client_class": GeminiClient, 68 | "models": [ 69 | {"name": "gemini-2.5-pro", "max_input_tokens": 1048576}, 70 | ], 71 | }, 72 | } 73 | 74 | 75 | def get_available_models_and_providers() -> tuple[list[str], list[str]]: 76 | """Get available models and providers from LLM_CLIENT_MAP.""" 77 | available_providers = list(LLM_CLIENT_MAP.keys()) 78 | available_models = [] 79 | for provider_info in LLM_CLIENT_MAP.values(): 80 | available_models.extend([model["name"] for model in provider_info["models"]]) 81 | return available_providers, available_models 82 | 83 | 84 | def init_llm_client( 85 | model: str, model_provider: str, **model_kwargs: Any 86 | ) -> BaseModelClient: 87 | """Initialize an LLM client. 
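    Example (illustrative; assumes OPENAI_API_KEY is set and that the underlying
    client accepts sampling kwargs such as ``temperature``):

        client = init_llm_client("gpt-4o", "openai", temperature=0.2)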
88 | 89 | Args: 90 | model: Model name 91 | model_provider: Provider name (openai, anthropic) 92 | **model_kwargs: Additional model arguments 93 | 94 | Returns: 95 | Initialized LLM client 96 | 97 | Raises: 98 | ValueError: If the model provider or model is not supported 99 | """ 100 | if model_provider not in LLM_CLIENT_MAP: 101 | raise ValueError( 102 | f"Unsupported model provider: {model_provider}. " 103 | f"Supported providers: {list(LLM_CLIENT_MAP.keys())}" 104 | ) 105 | 106 | provider_info = LLM_CLIENT_MAP[model_provider] 107 | 108 | _, model_list = get_available_models_and_providers() 109 | 110 | # if model not in model_list: 111 | # logger.warning( 112 | # f"Model {model} not in known models for {model_provider}. " 113 | # f"Known models: {provider_info['models']}. Proceeding anyway..." 114 | # ) 115 | 116 | client_class = provider_info["client_class"] 117 | return client_class(model, model_provider, **model_kwargs) 118 | 119 | 120 | def parse_model_args(model_args_str: str | None) -> dict[str, Any]: 121 | """Parse model arguments string into a dictionary. 122 | 123 | Args: 124 | model_args_str: Comma-separated string of key=value pairs 125 | 126 | Returns: 127 | Dictionary of parsed arguments 128 | 129 | Example: 130 | "top_p=0.95,temperature=0.70" -> {"top_p": 0.95, "temperature": 0.70} 131 | """ 132 | if not model_args_str: 133 | return {} 134 | 135 | args = {} 136 | for pair in model_args_str.split(","): 137 | if "=" not in pair: 138 | logger.warning(f"Skipping invalid model argument: {pair}") 139 | continue 140 | 141 | key, value = pair.split("=", 1) 142 | key = key.strip() 143 | value = value.strip() 144 | 145 | # Try to convert to appropriate type 146 | try: 147 | # Try int first 148 | if value.isdigit() or (value.startswith("-") and value[1:].isdigit()): 149 | args[key] = int(value) 150 | # Try float 151 | elif "." in value and value.replace(".", "").replace("-", "").isdigit(): 152 | args[key] = float(value) 153 | # Try boolean 154 | elif value.lower() in ("true", "false"): 155 | args[key] = value.lower() == "true" 156 | # Keep as string 157 | else: 158 | args[key] = value 159 | except ValueError: 160 | args[key] = value 161 | 162 | return args 163 | 164 | 165 | def get_model_info(model_name: str) -> tuple[str, int]: 166 | """Get normalized model name and max input tokens for a given model. 
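    Example (illustrative):

        get_model_info("gpt-4o-2024-08-06")   # -> ("gpt-4o", 128000)
        get_model_info("my-custom-model")     # -> ("my-custom-model", 128000), with a warning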
167 | 168 | Args: 169 | model_name: Model name that might be from different providers 170 | (e.g., 'us.anthropic.claude-sonnet-4-20250514-v1:0') 171 | 172 | Returns: 173 | Tuple of (normalized_model_name, max_input_tokens) 174 | 175 | Raises: 176 | ValueError: If model cannot be found 177 | """ 178 | # Normalize model name by checking for known patterns 179 | normalized_name = None 180 | max_tokens = None 181 | 182 | # Check each provider's models 183 | for provider_info in LLM_CLIENT_MAP.values(): 184 | for model_info in provider_info["models"]: 185 | model_canonical_name = model_info["name"] 186 | 187 | # Check if the canonical name is contained in the input model name 188 | if model_canonical_name in model_name: 189 | normalized_name = model_canonical_name 190 | max_tokens = model_info["max_input_tokens"] 191 | break 192 | 193 | # Also check for partial matches (e.g., "claude-sonnet-4" in "claude-sonnet-4-20250514") 194 | if "-" in model_canonical_name: 195 | parts = model_canonical_name.split("-") 196 | # Try different combinations of parts 197 | for i in range(len(parts), 0, -1): 198 | partial = "-".join(parts[:i]) 199 | if ( 200 | partial in model_name and len(partial) > 3 201 | ): # Avoid too short matches 202 | normalized_name = model_canonical_name 203 | max_tokens = model_info["max_input_tokens"] 204 | break 205 | 206 | if normalized_name: 207 | break 208 | 209 | if not normalized_name: 210 | # If no exact match found, log warning and return defaults 211 | logger.warning( 212 | f"Could not find exact match for model '{model_name}' in LLM_CLIENT_MAP. " 213 | "Using default token limit." 214 | ) 215 | # Return the original model name with a conservative default 216 | return model_name, 128000 # Conservative default 217 | 218 | return normalized_name, max_tokens 219 | -------------------------------------------------------------------------------- /src/swe_care/inference/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from pathlib import Path 4 | 5 | from loguru import logger 6 | 7 | import swe_care.inference.create_code_review_text 8 | import swe_care.inference.run_api 9 | from swe_care.inference.create_code_review_text import create_code_review_text 10 | from swe_care.inference.run_api import run_api 11 | from swe_care.utils.llm_models import get_available_models_and_providers 12 | 13 | # Mapping of subcommands to their function names 14 | SUBCOMMAND_MAP = { 15 | "create_code_review_text": { 16 | "function": create_code_review_text, 17 | "help": swe_care.inference.create_code_review_text.__doc__, 18 | }, 19 | "run_api": { 20 | "function": run_api, 21 | "help": swe_care.inference.run_api.__doc__, 22 | }, 23 | } 24 | 25 | 26 | def create_global_parser(): 27 | """Create a parser with global arguments that can be used as a parent parser.""" 28 | global_parser = argparse.ArgumentParser(add_help=False) 29 | global_parser.add_argument( 30 | "--output-dir", 31 | type=Path, 32 | required=True, 33 | help="Path to output directory", 34 | ) 35 | return global_parser 36 | 37 | 38 | def get_args(): 39 | # Parse command line manually to handle flexible argument order 40 | args = sys.argv[1:] 41 | 42 | # Find the subcommand 43 | subcommands = list(SUBCOMMAND_MAP.keys()) 44 | subcommand = None 45 | subcommand_index = None 46 | 47 | for i, arg in enumerate(args): 48 | if arg in subcommands: 49 | subcommand = arg 50 | subcommand_index = i 51 | break 52 | 53 | # Create global parser 54 | global_parser = 
create_global_parser() 55 | 56 | if subcommand is None: 57 | # No subcommand found, use normal argparse 58 | parser = argparse.ArgumentParser( 59 | prog="swe_care.inference", 60 | description="Inference tools for SWE-CARE", 61 | parents=[global_parser], 62 | ) 63 | 64 | subparsers = parser.add_subparsers(dest="command", help="Available commands") 65 | for cmd, info in SUBCOMMAND_MAP.items(): 66 | subparsers.add_parser(cmd, help=info["help"]) 67 | 68 | return parser.parse_args(args) 69 | 70 | # Create the appropriate subcommand parser with global parser as parent 71 | match subcommand: 72 | case "create_code_review_text": 73 | sub_parser = argparse.ArgumentParser( 74 | prog=f"swe_care.inference {subcommand}", 75 | parents=[global_parser], 76 | description=SUBCOMMAND_MAP[subcommand]["help"], 77 | ) 78 | sub_parser.add_argument( 79 | "--dataset-name-or-path", 80 | type=str, 81 | required=False, 82 | default="inclusionAI/SWE-CARE", 83 | help="Path to the input SWE-CARE dataset file, or Hugging Face dataset name (default: inclusionAI/SWE-CARE)", 84 | ) 85 | sub_parser.add_argument( 86 | "--file-source", 87 | type=str, 88 | choices=["none", "oracle", "bm25", "all"], 89 | required=True, 90 | help="Source strategy for files: 'none' (no files), 'oracle' (ground truth), 'bm25' (retrieval), or 'all' (all available)", 91 | ) 92 | sub_parser.add_argument( 93 | "--k", 94 | type=int, 95 | required=False, 96 | default=None, 97 | help="Maximum number of files to use (required when --file-source is 'bm25' or 'all')", 98 | ) 99 | sub_parser.add_argument( 100 | "--retrieval-output-dir", 101 | type=Path, 102 | default=None, 103 | help="Output directory for retrieval operations (required when --file-source is 'bm25' or 'all')", 104 | ) 105 | sub_parser.add_argument( 106 | "--tokens", 107 | type=str, 108 | nargs="*", 109 | default=None, 110 | help="GitHub API token(s) to be used randomly for fetching data", 111 | ) 112 | sub_parser.add_argument( 113 | "--jobs", 114 | type=int, 115 | default=2, 116 | help="Number of parallel jobs for multithreaded processing (default: 2)", 117 | ) 118 | sub_parser.add_argument( 119 | "--skip-existing", 120 | action="store_true", 121 | default=False, 122 | help="Skip existing instances in the output file based on instance_id (default: False)", 123 | ) 124 | sub_parser.add_argument( 125 | "--use-skeleton", 126 | action="store_true", 127 | default=False, 128 | help="Use TreeSitter-based Python stubs for file contents (default: False)", 129 | ) 130 | 131 | case "run_api": 132 | sub_parser = argparse.ArgumentParser( 133 | prog=f"swe_care.inference {subcommand}", 134 | parents=[global_parser], 135 | description=SUBCOMMAND_MAP[subcommand]["help"], 136 | ) 137 | sub_parser.add_argument( 138 | "--dataset-file", 139 | type=Path, 140 | required=True, 141 | help="Path to the input dataset file containing CodeReviewInferenceInstance objects", 142 | ) 143 | 144 | available_providers, available_models = get_available_models_and_providers() 145 | 146 | sub_parser.add_argument( 147 | "--model", 148 | type=str, 149 | required=True, 150 | help=f"Model name to use for inference. Available models: {', '.join(available_models)}", 151 | ) 152 | sub_parser.add_argument( 153 | "--model-provider", 154 | type=str, 155 | required=True, 156 | choices=available_providers, 157 | default="openai" if "openai" in available_providers else None, 158 | help=f"Model provider. 
Available providers: {', '.join(available_providers)}", 159 | ) 160 | sub_parser.add_argument( 161 | "--model-args", 162 | type=str, 163 | required=False, 164 | default=None, 165 | help="List of model arguments separated by commas (e.g., 'top_p=0.95,temperature=0.70')", 166 | ) 167 | sub_parser.add_argument( 168 | "--jobs", 169 | type=int, 170 | default=2, 171 | help="Number of parallel jobs for multithreaded inference (default: 2)", 172 | ) 173 | sub_parser.add_argument( 174 | "--skip-existing", 175 | action="store_true", 176 | default=False, 177 | help="Whether to skip existing predictions in the output file (default: False)", 178 | ) 179 | 180 | # Parse all arguments with the subcommand parser 181 | # This will include both global and subcommand-specific arguments 182 | # Remove the subcommand itself from args 183 | args_without_subcommand = args[:subcommand_index] + args[subcommand_index + 1 :] 184 | final_namespace = sub_parser.parse_args(args_without_subcommand) 185 | final_namespace.command = subcommand 186 | 187 | return final_namespace 188 | 189 | 190 | def main(): 191 | args = get_args() 192 | 193 | if args.command in SUBCOMMAND_MAP: 194 | # Get the function from the mapping 195 | cmd_info = SUBCOMMAND_MAP[args.command] 196 | function = cmd_info["function"] 197 | 198 | # Prepare common arguments 199 | common_kwargs = {"output_dir": args.output_dir} 200 | 201 | # Add specific arguments based on subcommand 202 | match args.command: 203 | case "create_code_review_text": 204 | function( 205 | dataset_name_or_path=args.dataset_name_or_path, 206 | file_source=args.file_source, 207 | k=args.k, 208 | retrieval_output_dir=args.retrieval_output_dir, 209 | tokens=args.tokens, 210 | jobs=args.jobs, 211 | skip_existing=args.skip_existing, 212 | use_skeleton=args.use_skeleton, 213 | **common_kwargs, 214 | ) 215 | case "run_api": 216 | function( 217 | dataset_file=args.dataset_file, 218 | model=args.model, 219 | model_provider=args.model_provider, 220 | model_args=args.model_args, 221 | jobs=args.jobs, 222 | skip_existing=args.skip_existing, 223 | **common_kwargs, 224 | ) 225 | else: 226 | logger.info("Please specify a command. 
Use --help for available commands.") 227 | 228 | 229 | if __name__ == "__main__": 230 | main() 231 | -------------------------------------------------------------------------------- /src/swe_care/utils/estimate.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | from loguru import logger 4 | from tenacity import retry, retry_if_exception_type, stop_after_attempt 5 | 6 | from swe_care.schema.dataset import ReferenceReviewComment 7 | from swe_care.utils.llm_models.clients import BaseModelClient 8 | from swe_care.utils.prompt_loader import load_prompt 9 | 10 | 11 | class InvalidResponseError(Exception): 12 | """Raised when LLM returns an invalid response that should trigger a retry.""" 13 | 14 | pass 15 | 16 | 17 | @retry( 18 | stop=stop_after_attempt(3), 19 | retry=retry_if_exception_type(InvalidResponseError), 20 | reraise=True, 21 | ) 22 | def classify_problem_domain(client: BaseModelClient, problem_statement: str) -> str: 23 | """Classify problem domain of a code review task.""" 24 | valid_categories = { 25 | "Bug Fixes", 26 | "New Feature Additions", 27 | "Code Refactoring / Architectural Improvement", 28 | "Documentation Updates", 29 | "Test Suite / CI Enhancements", 30 | "Performance Optimizations", 31 | "Security Patches / Vulnerability Fixes", 32 | "Dependency Updates & Env Compatibility", 33 | "Code Style, Linting, Formatting Fixes", 34 | } 35 | 36 | system_prompt, user_prompt = load_prompt( 37 | "classify_problem_domain", problem_statement=problem_statement 38 | ) 39 | 40 | messages = [ 41 | {"role": "system", "content": system_prompt}, 42 | {"role": "user", "content": user_prompt}, 43 | ] 44 | 45 | try: 46 | response = client.create_completion(messages).strip() 47 | if response in valid_categories: 48 | return response 49 | else: 50 | logger.warning(f"Invalid category response: '{response}'") 51 | raise InvalidResponseError(f"Invalid response: '{response}'") 52 | except Exception as e: 53 | if isinstance(e, InvalidResponseError): 54 | raise 55 | logger.warning(f"Error in classify_problem_domain: {e}") 56 | raise InvalidResponseError(f"LLM call failed: {e}") 57 | 58 | 59 | @retry( 60 | stop=stop_after_attempt(3), 61 | retry=retry_if_exception_type(InvalidResponseError), 62 | reraise=True, 63 | ) 64 | def estimate_difficulty( 65 | client: BaseModelClient, pr_data: dict[str, Any], commit_message: str, patch: str 66 | ) -> str: 67 | """Estimate the implementation difficulty for a pull request task.""" 68 | 69 | title = pr_data.get("title", "") 70 | body = pr_data.get("body", "") 71 | changed_files = pr_data.get("changedFiles", "") 72 | 73 | system_prompt, user_prompt = load_prompt( 74 | "estimate_difficulty", 75 | title=title, 76 | body=body, 77 | commit_message=commit_message, 78 | changed_files=str(changed_files), 79 | patch=patch, 80 | ) 81 | 82 | messages = [ 83 | {"role": "system", "content": system_prompt}, 84 | {"role": "user", "content": user_prompt}, 85 | ] 86 | 87 | try: 88 | response = client.create_completion(messages).strip() 89 | difficulty = int(response) 90 | if difficulty in [1, 2, 3]: 91 | if difficulty == 1: 92 | return "low" 93 | elif difficulty == 2: 94 | return "medium" 95 | elif difficulty == 3: 96 | return "high" 97 | else: 98 | logger.warning(f"Invalid difficulty response: '{response}'") 99 | raise InvalidResponseError( 100 | f"Invalid response: '{response}' (expected 1, 2, or 3)" 101 | ) 102 | except ValueError: 103 | logger.warning(f"Could not parse difficulty response: {response}") 104 | 
raise InvalidResponseError(f"Could not parse response: '{response}'") 105 | except Exception as e: 106 | if isinstance(e, InvalidResponseError): 107 | raise 108 | logger.warning(f"Error in estimate_difficulty: {e}") 109 | raise InvalidResponseError(f"LLM call failed: {e}") 110 | 111 | 112 | def classify_review_effort( 113 | client: BaseModelClient, pr_data: dict[str, Any], commit_message: str, patch: str 114 | ) -> int: 115 | """Estimate review effort for a code review task.""" 116 | labels = pr_data.get("labels", []) 117 | if "nodes" in labels: 118 | for node in labels["nodes"]: 119 | if "Review effort" in node.get("name"): 120 | try: 121 | effort = node.get("name").split("Review effort")[-1].strip() 122 | if "/" in effort: 123 | effort = effort.split("/")[0] 124 | elif ":" in effort: 125 | effort = effort.split(":")[-1].strip() 126 | return int(effort) 127 | except Exception as e: 128 | logger.warning( 129 | f"Error parsing review effort, fallback to LLM estimate: {e}" 130 | ) 131 | break 132 | 133 | title = pr_data.get("title", "") 134 | body = pr_data.get("body", "") 135 | 136 | system_prompt, user_prompt = load_prompt( 137 | "classify_review_effort", 138 | title=title, 139 | body=body, 140 | commit_message=commit_message, 141 | patch=patch, 142 | ) 143 | 144 | @retry( 145 | stop=stop_after_attempt(3), 146 | retry=retry_if_exception_type(InvalidResponseError), 147 | reraise=True, 148 | ) 149 | def _llm_call(): 150 | messages = [ 151 | {"role": "system", "content": system_prompt}, 152 | {"role": "user", "content": user_prompt}, 153 | ] 154 | 155 | try: 156 | response = client.create_completion(messages).strip() 157 | effort = int(response) 158 | if effort in [1, 2, 3, 4, 5]: 159 | return effort 160 | else: 161 | logger.warning(f"Invalid effort response: '{response}'") 162 | raise InvalidResponseError( 163 | f"Invalid response: '{response}' (expected 1-5)" 164 | ) 165 | except ValueError: 166 | logger.warning(f"Could not parse effort response: {response}") 167 | raise InvalidResponseError(f"Could not parse response: '{response}'") 168 | except Exception as e: 169 | if isinstance(e, InvalidResponseError): 170 | raise 171 | logger.warning(f"Error in classify_review_effort: {e}") 172 | raise InvalidResponseError(f"LLM call failed: {e}") 173 | 174 | try: 175 | return _llm_call() 176 | except InvalidResponseError: 177 | raise ValueError( 178 | "Failed to get valid review effort estimate (1-5) after 3 attempts" 179 | ) 180 | 181 | 182 | @retry( 183 | stop=stop_after_attempt(3), 184 | retry=retry_if_exception_type(InvalidResponseError), 185 | reraise=True, 186 | ) 187 | def classify_relevant_review_comment( 188 | client: BaseModelClient, review_comment: ReferenceReviewComment 189 | ) -> bool: 190 | """Classify if a review comment is relevant (likely to lead to code changes). 
191 | 192 | Args: 193 | client: The LLM client to use for classification 194 | review_comment: The review comment to classify 195 | 196 | Returns: 197 | True if the comment is relevant (likely to lead to code changes), 198 | False if irrelevant (unlikely to lead to code changes) 199 | """ 200 | 201 | # Determine the line number to check for line change detection 202 | line_to_check = None 203 | if review_comment.original_line is not None: 204 | line_to_check = review_comment.original_line 205 | elif review_comment.line is not None: 206 | line_to_check = review_comment.line 207 | elif review_comment.start_line is not None: 208 | line_to_check = review_comment.start_line 209 | elif review_comment.original_start_line is not None: 210 | line_to_check = review_comment.original_start_line 211 | 212 | system_prompt, user_prompt = load_prompt( 213 | "classify_relevant_review_comment", 214 | comment_text=review_comment.text, 215 | diff_hunk=review_comment.diff_hunk, 216 | path=review_comment.path, 217 | line=line_to_check, 218 | ) 219 | 220 | messages = [ 221 | {"role": "system", "content": system_prompt}, 222 | {"role": "user", "content": user_prompt}, 223 | ] 224 | 225 | try: 226 | response = client.create_completion(messages).strip().lower() 227 | if response == "relevant": 228 | return True 229 | elif response == "irrelevant": 230 | return False 231 | else: 232 | logger.warning( 233 | f"Invalid relevance classification response: '{response}' " 234 | f"(expected 'relevant' or 'irrelevant')" 235 | ) 236 | raise InvalidResponseError( 237 | f"Invalid response: '{response}' (expected 'relevant' or 'irrelevant')" 238 | ) 239 | except Exception as e: 240 | if isinstance(e, InvalidResponseError): 241 | raise 242 | logger.warning(f"Error in classify_relevant_review_comment: {e}") 243 | raise InvalidResponseError(f"LLM call failed: {e}") 244 | -------------------------------------------------------------------------------- /src/swe_care/harness/code_review_eval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run evaluation on code review predictions. 
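
A typical programmatic entry point looks like the sketch below (illustrative only;
paths and evaluator choices are placeholders):

    code_review_eval(
        predictions_path="predictions.jsonl",
        output_dir="results/",
        evaluator_types=[EvaluatorType.RULE_BASED_EVALUATOR],
    )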
3 | """ 4 | 5 | import threading 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | from datetime import datetime 8 | from enum import Enum 9 | from pathlib import Path 10 | from typing import Any, Optional 11 | 12 | from loguru import logger 13 | from tqdm import tqdm 14 | 15 | from swe_care.harness.evaluators import Evaluator 16 | from swe_care.harness.evaluators.code_review import ( 17 | LLMEvaluator, 18 | RuleBasedEvaluator, 19 | ) 20 | from swe_care.harness.evaluators.repo_level import ( 21 | RepoLevelLLMEvaluator, 22 | ) 23 | from swe_care.schema.evaluation import ( 24 | CodeReviewEvaluationResult, 25 | EvaluatorResult, 26 | ) 27 | from swe_care.utils.llm_models import init_llm_client, parse_model_args 28 | from swe_care.utils.llm_models.clients import BaseModelClient 29 | from swe_care.utils.load import load_code_review_dataset, load_code_review_predictions 30 | 31 | 32 | class EvaluatorType(str, Enum): 33 | """The types of the evaluators.""" 34 | 35 | LLM_EVALUATOR = "llm_evaluator" 36 | """The LLM evaluator.""" 37 | 38 | RULE_BASED_EVALUATOR = "rule_based_evaluator" 39 | """The rule-based evaluator.""" 40 | 41 | # REPO_LEVEL_LLM_EVALUATOR = "repo_level_llm_evaluator" 42 | # """The repo-level LLM evaluator.""" 43 | 44 | 45 | _EVALUATOR_MAP: dict[EvaluatorType, type[Evaluator]] = { 46 | EvaluatorType.LLM_EVALUATOR: LLMEvaluator, 47 | EvaluatorType.RULE_BASED_EVALUATOR: RuleBasedEvaluator, 48 | # EvaluatorType.REPO_LEVEL_LLM_EVALUATOR: RepoLevelLLMEvaluator, 49 | } 50 | 51 | 52 | def load_evaluator( 53 | evaluator_type: EvaluatorType, 54 | *, 55 | model_client: Optional[BaseModelClient] = None, 56 | **kwargs: Any, 57 | ) -> Evaluator: 58 | """Load the evaluator based on the type.""" 59 | if evaluator_type not in _EVALUATOR_MAP: 60 | raise ValueError( 61 | f"Unknown evaluator type: {evaluator_type}" 62 | f"\nValid types are: {list(_EVALUATOR_MAP.keys())}" 63 | ) 64 | evaluator_cls = _EVALUATOR_MAP[evaluator_type] 65 | if issubclass(evaluator_cls, (LLMEvaluator, RepoLevelLLMEvaluator)): 66 | if model_client is None: 67 | raise ValueError("LLM model client is required for LLM evaluator") 68 | evaluator = evaluator_cls(model_client=model_client, **kwargs) 69 | else: 70 | evaluator = evaluator_cls(**kwargs) 71 | 72 | logger.info(f"Loaded evaluator {evaluator_type} with kwargs: {kwargs}") 73 | return evaluator 74 | 75 | 76 | def code_review_eval_instance( 77 | instance, 78 | predictions: list, 79 | evaluators: list[Evaluator], 80 | ) -> CodeReviewEvaluationResult: 81 | """Process a single instance and return the evaluation result.""" 82 | prediction = [p for p in predictions if p.instance_id == instance.instance_id] 83 | if not prediction: 84 | raise ValueError(f"No prediction found for instance {instance.instance_id}") 85 | prediction = prediction[0] 86 | 87 | evaluation_results: list[EvaluatorResult] = [] 88 | for evaluator in evaluators: 89 | evaluation = None 90 | try: 91 | evaluation = evaluator.evaluate( 92 | prediction=prediction, 93 | reference=instance.reference_review_comments, 94 | input=instance, 95 | ) 96 | 97 | except Exception as e: 98 | raise ValueError( 99 | f"Error evaluating instance {instance.instance_id} with {evaluator.evaluation_name}: {e}" 100 | ) 101 | 102 | evaluation_results.append( 103 | EvaluatorResult( 104 | evaluator=evaluator.evaluation_name, 105 | evaluation=evaluation, 106 | ) 107 | ) 108 | 109 | score = sum( 110 | [ 111 | evaluation.evaluation.get("score", 0) 112 | for evaluation in evaluation_results 113 | if 
evaluation.evaluation.get("score", 0) is not None 114 | ] 115 | ) / len(evaluation_results) 116 | 117 | evaluation_result = CodeReviewEvaluationResult( 118 | instance_id=instance.instance_id, 119 | score=score, 120 | evaluations=evaluation_results, 121 | ) 122 | 123 | return evaluation_result 124 | 125 | 126 | def code_review_eval( 127 | predictions_path: Path | str, 128 | output_dir: Path | str, 129 | evaluator_types: list[EvaluatorType], 130 | dataset_name_or_path: Path | str = "inclusionAI/SWE-CARE", 131 | model: Optional[str] = None, 132 | model_provider: Optional[str] = None, 133 | model_args: Optional[str] = None, 134 | evaluator_kwargs: Optional[dict[str, dict[str, Any]]] = None, 135 | jobs: int = 2, 136 | ) -> None: 137 | """ 138 | Run evaluation on code review predictions. 139 | 140 | Args: 141 | predictions_path: Path to predictions file or directory containing predictions 142 | output_dir: Directory where the final_report.json will be saved 143 | evaluator_types: List of evaluator types to use 144 | dataset_name_or_path: Path to the dataset file or Hugging Face dataset name (default: inclusionAI/SWE-CARE) 145 | model: Model name to use for LLM evaluation (required if using LLM evaluator) 146 | model_provider: Model provider (required if using LLM evaluator) 147 | model_args: Comma-separated model arguments 148 | evaluator_kwargs: Dict mapping evaluator types to their kwargs 149 | jobs: Number of parallel jobs to run (default: 2) 150 | """ 151 | if isinstance(predictions_path, str): 152 | predictions_path = Path(predictions_path) 153 | if isinstance(output_dir, str): 154 | output_dir = Path(output_dir) 155 | 156 | instances = load_code_review_dataset(dataset_name_or_path) 157 | predictions = load_code_review_predictions(predictions_path) 158 | 159 | # Initialize LLM client if needed 160 | model_client = None 161 | llm_evaluator_types = [ 162 | EvaluatorType.LLM_EVALUATOR, 163 | # EvaluatorType.REPO_LEVEL_LLM_EVALUATOR, 164 | ] 165 | if any(et in evaluator_types for et in llm_evaluator_types): 166 | if not model or not model_provider: 167 | raise ValueError("Model and model provider are required for LLM evaluator") 168 | 169 | model_kwargs = parse_model_args(model_args) 170 | model_client = init_llm_client(model, model_provider, **model_kwargs) 171 | logger.info( 172 | f"Initialized {model_provider} client with model {model} using model arguments: {model_kwargs}" 173 | ) 174 | 175 | # Initialize evaluator_kwargs if not provided 176 | if evaluator_kwargs is None: 177 | evaluator_kwargs = {} 178 | 179 | evaluators = [] 180 | for evaluator_type in evaluator_types: 181 | # Get kwargs for this specific evaluator type 182 | kwargs = evaluator_kwargs.get(evaluator_type.value, {}) 183 | evaluator = load_evaluator( 184 | evaluator_type, 185 | model_client=model_client, 186 | **kwargs, 187 | ) 188 | evaluators.append(evaluator) 189 | 190 | # Create output directory if it doesn't exist 191 | output_dir.mkdir(parents=True, exist_ok=True) 192 | output_file = ( 193 | output_dir 194 | / f"{predictions_path.stem}_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl" 195 | ) 196 | 197 | # Thread-safe file writing 198 | write_lock = threading.Lock() 199 | 200 | # Initialize the output file (truncate if exists) 201 | with open(output_file, "w"): 202 | pass # Just create/truncate the file 203 | 204 | # Counters for tracking progress 205 | successful_evaluations = 0 206 | failed_evaluations = 0 207 | 208 | with ThreadPoolExecutor(max_workers=jobs) as executor: 209 | # Submit all tasks 210 | 
future_to_instance = { 211 | executor.submit( 212 | code_review_eval_instance, 213 | instance=instance, 214 | predictions=predictions, 215 | evaluators=evaluators, 216 | ): instance 217 | for instance in instances 218 | } 219 | 220 | # Process completed tasks with progress bar 221 | with tqdm( 222 | total=len(instances), 223 | desc=f"Evaluating instances with [{', '.join([e.value for e in evaluator_types])}] ({jobs} threads)", 224 | ) as pbar: 225 | for future in as_completed(future_to_instance): 226 | instance = future_to_instance[future] 227 | 228 | try: 229 | result = future.result() 230 | with write_lock: 231 | with open(output_file, "a") as f: 232 | f.write(result.to_json() + "\n") 233 | successful_evaluations += 1 234 | 235 | except Exception as e: 236 | failed_evaluations += 1 237 | logger.error( 238 | f"Exception evaluating instance {instance.instance_id}: {e}" 239 | ) 240 | 241 | pbar.update(1) 242 | pbar.set_postfix( 243 | { 244 | "success": successful_evaluations, 245 | "failed": failed_evaluations, 246 | } 247 | ) 248 | 249 | logger.info( 250 | f"Evaluation completed. Results saved to {output_file}. " 251 | f"Success: {successful_evaluations}, Failed: {failed_evaluations}" 252 | ) 253 | -------------------------------------------------------------------------------- /src/swe_care/utils/github.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import time 4 | import urllib.parse 5 | from typing import Any, Optional 6 | 7 | import requests 8 | from loguru import logger 9 | from tenacity import ( 10 | after_log, 11 | before_sleep_log, 12 | retry, 13 | retry_if_exception_type, 14 | stop_after_attempt, 15 | wait_exponential, 16 | ) 17 | 18 | 19 | class MaxNodeLimitExceededError(Exception): 20 | """Exception raised when the maximum node limit is exceeded.""" 21 | 22 | def __init__(self, error: dict): 23 | self.error = error 24 | super().__init__(f"Max node limit exceeded: {error}") 25 | 26 | 27 | class GitHubAPI: 28 | """GitHub API client with rate limiting, retries, and proper error handling.""" 29 | 30 | def __init__( 31 | self, 32 | max_retries: int = 5, 33 | timeout: int = 60, 34 | tokens: Optional[list[str]] = None, 35 | ): 36 | """ 37 | Initialize GitHub API client. 
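
        Illustrative construction (the token value is a placeholder; tokens are
        optional, but unauthenticated requests face much stricter GitHub rate limits):

            api = GitHubAPI(tokens=["ghp_your_token_here"])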
38 | 39 | Args: 40 | max_retries: Maximum number of retries for API calls 41 | tokens: Optional list of GitHub tokens for authentication 42 | """ 43 | self.max_retries = max_retries 44 | self.tokens = tokens or [] 45 | self.timeout = timeout 46 | self.graphql_endpoint = "https://api.github.com/graphql" 47 | self.rest_api_base = "https://api.github.com" 48 | 49 | def _get_token(self) -> Optional[str]: 50 | """Get a randomly selected token from the available tokens.""" 51 | if not self.tokens: 52 | return None 53 | return random.choice(self.tokens) 54 | 55 | def _get_headers(self, content_type: str = "application/json") -> dict[str, str]: 56 | """Get headers for API requests.""" 57 | headers = { 58 | "Content-Type": content_type, 59 | "Accept": "application/vnd.github+json", 60 | } 61 | 62 | token = self._get_token() 63 | if token: 64 | headers["Authorization"] = f"Bearer {token}" 65 | 66 | return headers 67 | 68 | def _handle_rate_limit(self, response: requests.Response) -> None: 69 | """Handle rate limiting based on response headers.""" 70 | if "X-RateLimit-Remaining" in response.headers: 71 | remaining = int(response.headers["X-RateLimit-Remaining"]) 72 | if remaining < 10: # If less than 10 requests remaining 73 | reset_time = int(response.headers.get("X-RateLimit-Reset", 0)) 74 | current_time = int(time.time()) 75 | wait_time = max(0, reset_time - current_time) 76 | if wait_time > 0: 77 | logger.info( 78 | f"Rate limit approaching. Waiting {wait_time} seconds..." 79 | ) 80 | time.sleep(wait_time) 81 | 82 | def _retry_wrapper(self, func): 83 | """Create a retry wrapper using the instance's max_retries setting.""" 84 | return retry( 85 | reraise=True, 86 | stop=stop_after_attempt(self.max_retries), 87 | wait=wait_exponential(multiplier=2, min=4, max=60), 88 | retry=retry_if_exception_type( 89 | ( 90 | requests.exceptions.RequestException, 91 | requests.exceptions.HTTPError, 92 | ) 93 | ), 94 | before_sleep=before_sleep_log(logger, "WARNING"), 95 | after=after_log(logger, "WARNING"), 96 | )(func) 97 | 98 | def execute_graphql_query( 99 | self, query: str, variables: dict[str, Any] 100 | ) -> dict[str, Any]: 101 | """ 102 | Execute a GraphQL query with retry mechanism. 
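
        Illustrative use (the query and the node id are placeholders; any
        query/variables pair accepted by the GitHub GraphQL API works):

            query = "query($id: ID!) { node(id: $id) { ... on PullRequest { title } } }"
            result = api.execute_graphql_query(query, {"id": pull_request_node_id})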
103 | 104 | Args: 105 | query: GraphQL query string 106 | variables: Variables for the GraphQL query 107 | 108 | Returns: 109 | JSON response from the GraphQL API 110 | 111 | Raises: 112 | ValueError: If GraphQL errors are returned 113 | requests.exceptions.RequestException: If the request fails 114 | """ 115 | 116 | def _execute_query(): 117 | headers = self._get_headers() 118 | payload = { 119 | "query": query, 120 | "variables": variables, 121 | } 122 | 123 | response = requests.post( 124 | self.graphql_endpoint, 125 | headers=headers, 126 | data=json.dumps(payload), 127 | timeout=self.timeout, 128 | ) 129 | response.raise_for_status() 130 | 131 | # Handle rate limiting 132 | self._handle_rate_limit(response) 133 | 134 | results = response.json() 135 | 136 | # Check for GraphQL errors 137 | if "errors" in results: 138 | # Check if it's a node limit error that we can handle 139 | for error in results["errors"]: 140 | if error.get("type") == "MAX_NODE_LIMIT_EXCEEDED": 141 | raise MaxNodeLimitExceededError(error) 142 | 143 | raise ValueError(f"GraphQL errors: {results['errors']}") 144 | 145 | return results 146 | 147 | return self._retry_wrapper(_execute_query)() 148 | 149 | def call_api( 150 | self, 151 | url: str, 152 | method: str = "GET", 153 | params: Optional[dict[str, Any]] = None, 154 | data: Optional[dict[str, Any]] = None, 155 | ) -> requests.Response: 156 | """ 157 | Call GitHub REST API endpoint with retry mechanism. 158 | 159 | Args: 160 | url: Full URL or path relative to GitHub API base 161 | method: HTTP method (GET, POST, etc.) 162 | params: Query parameters 163 | data: Request body data 164 | timeout: Request timeout in seconds 165 | 166 | Returns: 167 | Response object 168 | 169 | Raises: 170 | requests.exceptions.RequestException: If the request fails 171 | """ 172 | 173 | def _call_api(): 174 | # Handle both full URLs and relative paths 175 | full_url = url 176 | if not url.startswith("http"): 177 | full_url = f"{self.rest_api_base}/{url.lstrip('/')}" 178 | 179 | headers = self._get_headers() 180 | 181 | response = requests.request( 182 | method=method, 183 | url=full_url, 184 | headers=headers, 185 | params=params, 186 | json=data, 187 | timeout=self.timeout, 188 | ) 189 | response.raise_for_status() 190 | 191 | # Handle rate limiting 192 | self._handle_rate_limit(response) 193 | 194 | return response 195 | 196 | return self._retry_wrapper(_call_api)() 197 | 198 | def get_patch( 199 | self, 200 | repo: str, 201 | *, 202 | pr_number: Optional[int] = None, 203 | base_commit: Optional[str] = None, 204 | head_commit: Optional[str] = None, 205 | ) -> str: 206 | """ 207 | Get patch/diff for a PR or commit range. 
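
        Illustrative calls (repository name, PR number, and commit SHAs are placeholders):

            pr_diff = api.get_patch("octocat/hello-world", pr_number=42)
            range_diff = api.get_patch(
                "octocat/hello-world", base_commit="abc1234", head_commit="def5678"
            )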
208 | 209 | Args: 210 | repo: Repository in format 'owner/repo' 211 | pr_number: Pull request number (for PR patch) 212 | base_commit: Base commit for commit range 213 | head_commit: Head commit for commit range 214 | 215 | Returns: 216 | Patch/diff content as string 217 | 218 | Raises: 219 | ValueError: If neither pr_number nor commit range is provided 220 | requests.exceptions.RequestException: If the request fails 221 | """ 222 | 223 | def _get_patch(): 224 | if pr_number is not None: 225 | patch_url = f"https://github.com/{repo}/pull/{pr_number}.diff" 226 | elif base_commit and head_commit: 227 | patch_url = f"https://github.com/{repo}/compare/{base_commit}...{head_commit}.diff" 228 | else: 229 | raise ValueError( 230 | "Either pr_number or both base_commit and head_commit must be provided" 231 | ) 232 | 233 | headers = self._get_headers(content_type="text/plain") 234 | 235 | response = requests.get(patch_url, headers=headers, timeout=self.timeout) 236 | response.raise_for_status() 237 | 238 | # Handle rate limiting 239 | self._handle_rate_limit(response) 240 | 241 | return response.text 242 | 243 | return self._retry_wrapper(_get_patch)() 244 | 245 | def get_file_content(self, repo: str, commit: str, file_path: str) -> str: 246 | """ 247 | Get the content of a file at a specific commit. 248 | 249 | Args: 250 | repo: Repository in format 'owner/repo' 251 | commit: Commit SHA to fetch the file from 252 | file_path: Path to the file in the repository 253 | 254 | Returns: 255 | File content as string 256 | 257 | Raises: 258 | requests.exceptions.RequestException: If the request fails 259 | """ 260 | 261 | encoded_path = urllib.parse.quote(file_path, safe="") 262 | content_response = self.call_api( 263 | f"repos/{repo}/contents/{encoded_path}", params={"ref": commit} 264 | ) 265 | content_data = content_response.json() 266 | 267 | # Decode base64 content 268 | if "content" in content_data and content_data.get("encoding") == "base64": 269 | import base64 270 | 271 | content = base64.b64decode(content_data["content"]).decode("utf-8") 272 | return content 273 | else: 274 | logger.warning(f"Unable to decode content for {file_path}") 275 | return "" 276 | -------------------------------------------------------------------------------- /src/swe_care/harness/evaluators/repo_level.py: -------------------------------------------------------------------------------- 1 | """ 2 | Repo-level LLM evaluator for code review predictions. 
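
Sketch of typical use (illustrative; the client, prediction, and task instance are
placeholders created elsewhere, and "reviewed_file" is one of the supported
file_source options):

    evaluator = RepoLevelLLMEvaluator(model_client=some_client, file_source="reviewed_file")
    result = evaluator.evaluate(prediction=pred, reference=None, input=task_instance)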
3 | """ 4 | 5 | import json 6 | import re 7 | from typing import Any, Literal, Optional 8 | 9 | from loguru import logger 10 | 11 | from swe_care.harness.evaluators import Evaluator 12 | from swe_care.harness.evaluators.utils import extract_defects_from_review 13 | from swe_care.schema.dataset import ( 14 | CodeReviewTaskInstance, 15 | ReferenceReviewComment, 16 | ) 17 | from swe_care.schema.evaluation import CodeReviewPrediction 18 | from swe_care.utils.file_source_retrieval import get_relevant_files 19 | from swe_care.utils.llm_models.clients import BaseModelClient 20 | from swe_care.utils.prompt_loader import load_prompt 21 | 22 | 23 | class RepoLevelLLMEvaluator(Evaluator): 24 | """Evaluator that uses LLM to classify review comments as positive or negative.""" 25 | 26 | def __init__( 27 | self, 28 | model_client: BaseModelClient, 29 | file_source: Literal[ 30 | "none", 31 | "base_changed_files", 32 | "reviewed_file", 33 | "retrieved_base_changed_files", 34 | "retrieved_all_files", 35 | ] = "none", 36 | retrieval_max_files: int = 5, 37 | retrieval_output_dir: Optional[str] = None, 38 | tokens: Optional[list[str]] = None, 39 | **kwargs, 40 | ): 41 | """Initialize the Repo-level LLM evaluator. 42 | 43 | Args: 44 | model_client: The LLM client to use for evaluation 45 | file_source: Source for file content 46 | retrieval_max_files: Maximum number of files for retrieval 47 | retrieval_output_dir: Output directory for retrieval operations 48 | tokens: GitHub API tokens 49 | """ 50 | super().__init__(**kwargs) 51 | self.model_client = model_client 52 | self.file_source = file_source 53 | self.retrieval_max_files = retrieval_max_files 54 | self.retrieval_output_dir = retrieval_output_dir 55 | self.tokens = tokens 56 | 57 | # Validate retrieval_output_dir requirement 58 | if file_source == "retrieved_all_files" and retrieval_output_dir is None: 59 | raise ValueError( 60 | "retrieval_output_dir is required when file_source is 'retrieved_all_files'" 61 | ) 62 | 63 | @property 64 | def requires_input(self) -> bool: 65 | """Need the input to get problem statement and base commit""" 66 | return True 67 | 68 | @property 69 | def requires_reference(self) -> bool: 70 | """Reference is optional for this evaluator""" 71 | return False 72 | 73 | def _parse_json(self, text: str) -> dict: 74 | """Parse JSON from LLM response.""" 75 | # First, try to find JSON string within triple backticks 76 | # Handle both ```json and ``` formats 77 | json_pattern = r"```(?:json)?\s*\n?(.*?)\n?```" 78 | matches = re.findall(json_pattern, text, re.DOTALL | re.MULTILINE) 79 | 80 | for match in matches: 81 | try: 82 | cleaned_match = match.strip() 83 | if cleaned_match: 84 | return json.loads(cleaned_match) 85 | except json.JSONDecodeError: 86 | continue 87 | 88 | # If no backtick-wrapped JSON found, try to extract JSON object directly 89 | # Look for content between { and } 90 | json_object_pattern = r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}" 91 | json_matches = re.findall(json_object_pattern, text, re.DOTALL) 92 | 93 | for match in json_matches: 94 | try: 95 | return json.loads(match) 96 | except json.JSONDecodeError: 97 | continue 98 | 99 | # Last resort: try parsing the entire text 100 | try: 101 | return json.loads(text.strip()) 102 | except json.JSONDecodeError: 103 | # If all else fails, provide more detailed error 104 | logger.error(f"Failed to parse JSON from response: {text[:500]}...") 105 | raise ValueError("No valid JSON found in LLM response") 106 | 107 | def evaluate_on_reference_review_comment( 108 | self, 109 | 
review_comment: ReferenceReviewComment, 110 | input: CodeReviewTaskInstance, 111 | ) -> dict: 112 | """Evaluate a single reference review comment. 113 | 114 | Args: 115 | review_comment: The review comment to evaluate 116 | input: The input task instance 117 | 118 | Returns: 119 | Dictionary with label (0/1) and reason 120 | """ 121 | # Get file content based on file_source 122 | relevant_files = self._get_relevant_files(review_comment, input) 123 | 124 | # Format the review comment with context 125 | formatted_review = self._format_review_comment_with_context( 126 | review_comment, 127 | relevant_files, 128 | ) 129 | 130 | # Load prompts from YAML template 131 | system_prompt, user_prompt = load_prompt( 132 | "repo_level_evaluation", 133 | problem_statement=input.problem_statement, 134 | patch_to_review=input.commit_to_review.patch_to_review, 135 | formatted_review=formatted_review, 136 | ) 137 | 138 | messages = [ 139 | {"role": "system", "content": system_prompt}, 140 | {"role": "user", "content": user_prompt}, 141 | ] 142 | 143 | answer = self.model_client.create_completion(messages) 144 | result = self._parse_json(answer) 145 | 146 | # Validate result 147 | if "label" not in result or result["label"] not in [0, 1]: 148 | raise ValueError(f"Invalid label in result: {result}") 149 | if "reason" not in result: 150 | result["reason"] = "No reason provided" 151 | 152 | return result 153 | 154 | def _get_relevant_files( 155 | self, 156 | review_comment: ReferenceReviewComment, 157 | input: CodeReviewTaskInstance, 158 | ) -> dict[str, str]: 159 | """Get relevant files based on file_source strategy.""" 160 | return get_relevant_files( 161 | review_comment=review_comment, 162 | file_source=self.file_source, 163 | repo=input.repo, 164 | base_commit=input.base_commit, 165 | patch_to_review=input.commit_to_review.patch_to_review, 166 | tokens=self.tokens, 167 | retrieval_max_files=self.retrieval_max_files, 168 | retrieval_output_dir=self.retrieval_output_dir, 169 | ) 170 | 171 | def _format_review_comment_with_context( 172 | self, 173 | review_comment: ReferenceReviewComment, 174 | relevant_files: dict[str, str], 175 | ) -> str: 176 | """Format review comment with file context.""" 177 | # Load and render the template 178 | return load_prompt( 179 | "rm_sample", 180 | relevant_files=relevant_files, 181 | diff_hunk=review_comment.diff_hunk or "", 182 | path=review_comment.path or "", 183 | line=review_comment.line or "", 184 | review_comment=review_comment.text.strip(), 185 | add_line_numbers=True, 186 | ) 187 | 188 | def _evaluate( 189 | self, 190 | *, 191 | prediction: CodeReviewPrediction, 192 | reference: Any, # noqa: ARG002 193 | input: CodeReviewTaskInstance, 194 | ) -> dict: 195 | """Evaluate code review prediction by classifying extracted defects. 
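
        When defects are extracted from the prediction, the returned dictionary
        follows the shape built at the end of this method, e.g. (values illustrative):

            {"score": 0.5, "num_defects": 2, "num_positive": 1,
             "num_negative": 1, "classifications": [...]}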
196 | 197 | Args: 198 | prediction: The code review prediction 199 | reference: Not used in this evaluator 200 | input: The input task instance 201 | 202 | Returns: 203 | Dictionary containing evaluation metrics 204 | """ 205 | # Extract defects from the predicted review 206 | predicted_defects = extract_defects_from_review(prediction.review_text, input) 207 | 208 | if not predicted_defects: 209 | # No defects found means it's a positive review 210 | return { 211 | "score": 1.0, 212 | "num_defects": 0, 213 | "classifications": [], 214 | } 215 | 216 | # Classify each defect 217 | classifications = [] 218 | positive_count = 0 219 | negative_count = 0 220 | 221 | for defect in predicted_defects: 222 | try: 223 | result = self.evaluate_on_reference_review_comment(defect, input) 224 | classifications.append( 225 | { 226 | "defect_text": defect.text, 227 | "label": result["label"], 228 | "reason": result["reason"], 229 | } 230 | ) 231 | 232 | if result["label"] == 1: 233 | positive_count += 1 234 | else: 235 | negative_count += 1 236 | 237 | except Exception as e: 238 | logger.error(f"Failed to classify defect: {e}") 239 | classifications.append( 240 | { 241 | "defect_text": defect.text, 242 | "label": 0, # Default to negative on error 243 | "reason": f"Classification error: {str(e)}", 244 | } 245 | ) 246 | negative_count += 1 247 | 248 | # Calculate overall score 249 | total_defects = len(predicted_defects) 250 | if total_defects > 0: 251 | score = positive_count / total_defects 252 | else: 253 | score = 1.0 # No defects means positive 254 | 255 | return { 256 | "score": score, 257 | "num_defects": total_defects, 258 | "num_positive": positive_count, 259 | "num_negative": negative_count, 260 | "classifications": classifications, 261 | } 262 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /docs/questionnaire-demo.md: -------------------------------------------------------------------------------- 1 | ### Code Review Annotation Questionnaire 2 | 3 | **Instance ID**: `dbt-labs__dbt-core-3100@4f8c10c` 4 | **Repository**: `dbt-labs/dbt-core` 5 | **Pull Request**: [`#3100`](https://github.com/dbt-labs/dbt-core/pull/3100) 6 | **Language**: `Python` 7 | **Created at**: `2021-02-13T16:24:06Z` 8 | **Base commit**: `934c23bf392c4d5db9b129d0f8953c2b7872f5b0` 9 | **Commit to review**: `4f8c10c1aab3630f175611e87903b6051afdd8d8` 10 | **Commit message**: 11 | > default to get_columns_in_relation if not specified in config 12 | > 13 | > Co-authored-by: Jeremy Cohen 14 | 15 | --- 16 | 17 | ### Context 18 | 19 | #### Problem Statement 20 | 21 | > Specify Columns to update on incremental Models 22 | > ### Describe the feature 23 | > Allow model developer the ability to choose the set of columns that require updating on incremental loads 24 | > 25 | > ### Who will this benefit? 26 | > This will benefit all users that are performing an incremental update of a wide table with few columns that are mutable 27 | 28 | #### Hints (pre-PR comments or linked issues context) 29 | 30 | > hey @richmintz! Can you say more about this? I think it's a good idea - I'm curious if you're interested in this feature for performance reasons, or something else. 31 | > Hi @drewbanin! 32 | > My primary goal would be performance, minimizing writes and index rebuilds. However I also think it will generate more concise sql for large tables when a merge is required. 33 | > Got it! Sure, this makes a lot of sense in `merge` statements. Is this on Snowflake/BigQuery? 
I think a similar paradigm would apply to Redshift/Postgres, but we'd modify the `update` statement instead of the `merge` statement. 34 | > 35 | > I can imagine a config like `update_columns` which could be supplied like: 36 | > ``` 37 | > {{ 38 | > config( 39 | > materialized='incremental', 40 | > update_columns=['order_total', 'count_orders'] 41 | > ) 42 | > }} 43 | > ``` 44 | > 45 | > If `update_columns` is not provided, or if it is `null`, dbt will default to updating all of the columns in the model. 46 | > 47 | > You buy all of that? 48 | > 49 | > 50 | > As part of this proposed change, it would also be nice to be able to exclude the `when matched then update set` part of the merge altogether, as in some of my models I'm only interested in adding new rows since the source data is never updated (for event-based data for example or other append-only tables), and it makes the model execution faster (at least in BigQuery). 51 | > 52 | > It could be a separate config `no_update = True` or just a convention that doesn't include the SQL block when `update_columns` is empty. 53 | > 54 | > Please note I tested this and would work for BigQuery, other databases might need a different syntax to support no-op for updates. 55 | > 56 | > Any thoughts? 57 | > hey @bfil - if you do not supply a `unique_key` for your incremental model's config, then dbt should not inject a `when matched then update set....` clause to the merge statement. 58 | > 59 | > Check out the implementation here: 60 | > https://github.com/fishtown-analytics/dbt/blob/7a07017b96ac332c872725343833a94b49129c68/core/dbt/include/global_project/macros/materializations/common/merge.sql#L23-L48 61 | > @drewbanin I know, but I still want to compare and insert only the new data based on that unique key rather than merging on an always `FALSE` condition. Makes sense? 62 | > ah! Sure, that makes a lot of sense. What do you think about pushing this code down from the materialization layer and into the modeling layer. Could you do something like: 63 | > 64 | > ``` 65 | > {{ config(materialized='incremental') }} 66 | > 67 | > select id, col1, col2 from {{ ref('some_table') }} 68 | > {% if is_incremental() %} 69 | > where id not in (select id from {{ this }}) 70 | > {% endif %} 71 | > ``` 72 | > 73 | > This is a kind of funny adaptation of the [typical incremental modeling approach](https://docs.getdbt.com/docs/configuring-incremental-models), but it should serve to only select the _new_ records from your source table, inserting them directly into the destination table. 74 | > @drewbanin : any update on where we are with this feature...i have a wide table [Snowflake] but i only need to update very few columns in the incremental model. any ETA's on the feature ? 75 | > hey @ee07dazn - no movement on this issue on our end to report. I think the spec I mentioned above (pasted below) is still a good idea: 76 | > 77 | > ``` 78 | > {{ 79 | > config( 80 | > materialized='incremental', 81 | > update_columns=['order_total', 'count_orders'] 82 | > ) 83 | > } 84 | > ``` 85 | > 86 | > I think that this feature should only be supported on databases that support `merge` statements, as dbt's delete+insert approach doesn't really lend itself well to this approach. If anyone is interested in working on this one, I'd be happy to try to point you in the right direction. 
87 | > 88 | > The key change here will be to replace the call to [`adapter.get_columns_in_relation`](https://github.com/fishtown-analytics/dbt/blob/f3565f3f70062c5818395d03a52c89e87616f956/plugins/snowflake/dbt/include/snowflake/macros/materializations/incremental.sql#L59) with `config.get('update_columns')`. We'll want to implement this for both the Snowflake and BigQuery incremental materializations when the `merge` strategy is used. 89 | > Thanks @drewbanin for the information and a pointer. Apart from what you had suggested, i had to make change to the default__get_merge_sql macro to make that approach work. It works now but i am just not happy to make a change to something as low level as default__get_merge_sql. Probably i think i can update the snowflake_get_merge_sql to do this job. Thinking out loud...but thanks for the help. 90 | 91 | #### Resolved Issues 92 | 93 | 94 | 95 | 96 | ##### Issue #1862 97 | 98 | - Title: 99 | 100 | > Specify Columns to update on incremental Models 101 | 102 | - Body: 103 | 104 | > ### Describe the feature 105 | > Allow model developer the ability to choose the set of columns that require updating on incremental loads 106 | > 107 | > ### Who will this benefit? 108 | > This will benefit all users that are performing an incremental update of a wide table with few columns that are mutable 109 | 110 | 111 | 112 | #### Patch to Review (diff) 113 | 114 | ```diff 115 | diff --git a/plugins/bigquery/dbt/include/bigquery/macros/materializations/incremental.sql b/plugins/bigquery/dbt/include/bigquery/macros/materializations/incremental.sql 116 | index f4ad80d5a5c..a48178f88a3 100644 117 | --- a/plugins/bigquery/dbt/include/bigquery/macros/materializations/incremental.sql 118 | +++ b/plugins/bigquery/dbt/include/bigquery/macros/materializations/incremental.sql 119 | @@ -112,7 +112,10 @@ 120 | {% endif %} 121 | {% set build_sql = create_table_as(False, target_relation, sql) %} 122 | {% else %} 123 | - {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %} 124 | + {% set dest_columns = config.get('update_columns', none) %} 125 | + {% if dest_columns is none %} 126 | + {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %} 127 | + {% endif %} 128 | 129 | {#-- if partitioned, use BQ scripting to get the range of partition values to be updated --#} 130 | {% if strategy == 'insert_overwrite' %} 131 | diff --git a/plugins/snowflake/dbt/include/snowflake/macros/materializations/incremental.sql b/plugins/snowflake/dbt/include/snowflake/macros/materializations/incremental.sql 132 | index 188795b1537..6d0d3bf9805 100644 133 | --- a/plugins/snowflake/dbt/include/snowflake/macros/materializations/incremental.sql 134 | +++ b/plugins/snowflake/dbt/include/snowflake/macros/materializations/incremental.sql 135 | @@ -58,7 +58,7 @@ 136 | {% do adapter.expand_target_column_types( 137 | from_relation=tmp_relation, 138 | to_relation=target_relation) %} 139 | - {% set dest_columns = adapter.get_columns_in_relation(target_relation) %} 140 | + {% set dest_columns = config.get('update_columns') %} 141 | {% set build_sql = dbt_snowflake_get_incremental_sql(strategy, tmp_relation, target_relation, unique_key, dest_columns) %} 142 | {% endif %} 143 | 144 | 145 | ``` 146 | 147 | #### Reference Review Comments (from the PR) 148 | 149 | Count: 1 150 | 151 | 152 | --- 153 | 154 | ##### Comment 1 155 | 156 | - Review comment: 157 | 158 | > @user1: 159 | > As you did for BigQuery, this should be: 160 | > ```suggestion 161 | > {% set dest_columns = 
config.get('update_columns', none) %} 162 | > {% if dest_columns is none %} 163 | > {% set dest_columns = adapter.get_columns_in_relation(existing_relation) %} 164 | > {% endif %} 165 | > ``` 166 | > I believe this is what's causing the failing integration test: 167 | > ``` 168 | > 'NoneType' object is not iterable 169 | > ``` 170 | > 171 | > @author: 172 | > of course 🤦 173 | 174 | - Diff hunk the review comment applied to: 175 | 176 | ```diff 177 | @@ -58,7 +58,7 @@ 178 | {% do adapter.expand_target_column_types( 179 | from_relation=tmp_relation, 180 | to_relation=target_relation) %} 181 | - {% set dest_columns = adapter.get_columns_in_relation(target_relation) %} 182 | + {% set dest_columns = config.get('update_columns') %} 183 | ``` 184 | 185 | - Path: `plugins/snowflake/dbt/include/snowflake/macros/materializations/incremental.sql` 186 | - Line: n/a | Start line: n/a 187 | - Original line: 61 | Original start line: n/a 188 | 189 | 190 | --- 191 | 192 | ### Section 1 — Problem Statement and Patch Alignment 193 | 194 | - **Q1.1 — Is the problem statement sufficiently well-specified for reviewing this patch?** 195 | 196 | _Your notes:_ 197 | 198 | - **Q1.2 — Explain your choice above. Reference specific files/functions when relevant.** 199 | 200 | _Your notes:_ 201 | 202 | - **Q1.3 — Does the patch address the stated problem?** 203 | - [ ] Yes 204 | - [ ] Partially 205 | - [ ] No 206 | 207 | - **Why?** 208 | 209 | --- 210 | 211 | ### Section 2 — Review Scope and Comment Coverage 212 | 213 | - **Q2.1 — Do the provided reference review comments appropriately cover the patch (scope and correctness)?** 214 | 215 | _Your notes:_ 216 | 217 | - **Q2.2 — Explain your choice. Identify any mis-scoped or missing review topics.** 218 | 219 | _Your notes:_ 220 | 221 | - **Q2.3 — Reference Review Comments Assessment** 222 | 223 | For each reference comment, mark whether it is a true positive (valid issue), false positive (not actually an issue), or informational. Briefly justify. 224 | 225 | | Index | Category (Functionality/Correctness/Performance/Security/Maintainability/Style/Docs) | Verdict (TP/FP/Info) | Notes | 226 | | --- | --- | --- | --- | 227 | | 1 | | | | 228 | 229 | - **Q2.4 — Missing Review Points** 230 | 231 | List important review findings not covered by the reference comments. 232 | 233 | - [ ] Item 1: 234 | - [ ] Item 2: 235 | - [ ] Item 3: 236 | 237 | --- 238 | 239 | ### Section 3 — Defects Identified in the Patch 240 | 241 | Repeat the following block for each defect you identify. 
242 | 243 | ```text 244 | Defect N 245 | - Category: [ ] Functionality [ ] Correctness [ ] Performance [ ] Security [ ] Maintainability [ ] Style [ ] Documentation 246 | - Severity (1-5): 247 | - Files/Locations: 248 | - Short description: 249 | - Suggested fix (optional): 250 | ``` 251 | 252 | --- 253 | 254 | ### Section 4 — Difficulty and Review Effort 255 | 256 | - **Q4.1 — Difficulty to understand and correctly review the patch** 257 | - [ ] <15 min 258 | - [ ] 15 min - 1 hour 259 | - [ ] 1-4 hours 260 | - [ ] >4 hours 261 | 262 | - **Why?** 263 | 264 | - Time estimate: 265 | 266 | - **Q4.2 — Estimated review effort (1-5)** 267 | - [ ] 1: Trivial, almost no cognitive load 268 | - [ ] 2: Small effort, localized changes 269 | - [ ] 3: Moderate effort, multiple files/concerns 270 | - [ ] 4: High effort, complex interactions or risk 271 | - [ ] 5: Very high effort, wide impact or intricate logic 272 | 273 | - **Rationale:** 274 | 275 | - Effort level: 276 | 277 | --- 278 | 279 | ### Section 5 — Overall Patch Quality and Risk 280 | 281 | - **Q5.1 — Does the patch meet acceptance criteria for the stated problem?** 282 | - [ ] Yes 283 | - [ ] Yes, with minor issues 284 | - [ ] No 285 | 286 | - **Notes:** 287 | 288 | - **Q5.2 — Regression risk assessment** 289 | - [ ] Low 290 | - [ ] Medium 291 | - [ ] High 292 | 293 | - **Why?** 294 | 295 | - Risk level: 296 | 297 | - **Q5.3 — Additional observations (tests, docs, API changes, compatibility)** 298 | 299 | _Your notes:_ 300 | 301 | --- 302 | 303 | ### Section 6 — Dataset Suitability 304 | 305 | - **Q6.1 — Is this instance suitable for evaluating code review quality?** 306 | - [ ] Yes 307 | - [ ] No 308 | 309 | - If 'No', explain (e.g., ambiguous problem, non-diff artifacts, excessive binary/auto-generated changes, missing context): 310 | 311 | - **Q6.2 — Any compliance, privacy, or licensing concerns?** 312 | - [ ] No 313 | - [ ] Yes 314 | 315 | - If 'Yes' explain: 316 | 317 | --- 318 | 319 | ### Section 7 — Confidence 320 | 321 | - **Q7.1 — How confident are you in your annotations? (1 = very low, 5 = very high)** 322 | 323 | Select one: [ ] 1 [ ] 2 [ ] 3 [ ] 4 [ ] 5 -------------------------------------------------------------------------------- /src/swe_care/utils/llm_models/clients.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from abc import ABC, abstractmethod 4 | from typing import Any 5 | 6 | from loguru import logger 7 | 8 | try: 9 | import openai 10 | except ImportError: 11 | raise ImportError( 12 | "OpenAI package not found. Please install it with: pip install openai" 13 | ) 14 | 15 | try: 16 | import anthropic 17 | except ImportError: 18 | raise ImportError( 19 | "Anthropic package not found. Please install it with: pip install anthropic" 20 | ) 21 | 22 | try: 23 | import tiktoken 24 | except ImportError: 25 | raise ImportError( 26 | "Tiktoken package not found. 
Please install it with: pip install tiktoken" 27 | ) 28 | 29 | DEFAULT_MAX_RETRIES = 10 30 | 31 | 32 | class BaseModelClient(ABC): 33 | """Abstract base class for LLM model clients.""" 34 | 35 | client: Any 36 | model: str 37 | model_provider: str 38 | model_kwargs: dict[str, Any] 39 | max_retries: int 40 | 41 | def __init__( 42 | self, 43 | model: str, 44 | model_provider: str, 45 | max_retries: int = DEFAULT_MAX_RETRIES, 46 | **model_kwargs: Any, 47 | ): 48 | self.model = model 49 | self.model_provider = model_provider 50 | self.model_kwargs = model_kwargs 51 | self.max_retries = max_retries 52 | 53 | @abstractmethod 54 | def create_completion(self, messages: list[dict[str, str]]) -> str: 55 | """Create a completion using the LLM API. 56 | 57 | Args: 58 | messages: List of messages in OpenAI format [{"role": "user", "content": "..."}] 59 | 60 | Returns: 61 | The generated completion text 62 | """ 63 | pass 64 | 65 | @abstractmethod 66 | def create_completion_with_structured_output( 67 | self, messages: list[dict[str, str]], json_schema: dict 68 | ) -> dict: 69 | """Create a completion with structured output using the LLM API. 70 | 71 | Args: 72 | messages: List of messages in OpenAI format [{"role": "user", "content": "..."}] 73 | json_schema: JSON Schema that defines the expected output structure 74 | 75 | Returns: 76 | The generated completion as a dictionary matching the schema 77 | """ 78 | pass 79 | 80 | @abstractmethod 81 | def count_tokens_from_text(self, text: str) -> int: 82 | """Count the number of tokens in the text.""" 83 | pass 84 | 85 | @abstractmethod 86 | def count_tokens_from_messages(self, messages: list[dict[str, str]]) -> int: 87 | """Count the number of tokens in the messages.""" 88 | pass 89 | 90 | 91 | class OpenAIClient(BaseModelClient): 92 | """OpenAI API client.""" 93 | 94 | def __init__( 95 | self, 96 | model: str, 97 | model_provider: str, 98 | max_retries: int = DEFAULT_MAX_RETRIES, 99 | **model_kwargs: Any, 100 | ): 101 | super().__init__(model, model_provider, max_retries, **model_kwargs) 102 | 103 | # Initialize the OpenAI client 104 | api_key = os.getenv("OPENAI_API_KEY") 105 | if not api_key: 106 | raise ValueError("OPENAI_API_KEY environment variable not set") 107 | 108 | self.client: openai.OpenAI = openai.OpenAI( 109 | api_key=api_key, max_retries=self.max_retries 110 | ) 111 | 112 | def create_completion(self, messages: list[dict[str, str]]) -> str: 113 | """Create a completion using OpenAI API.""" 114 | try: 115 | response = self.client.chat.completions.create( 116 | model=self.model, messages=messages, **self.model_kwargs 117 | ) 118 | return response.choices[0].message.content 119 | except Exception as e: 120 | logger.error(f"Error creating OpenAI completion: {e}") 121 | raise e 122 | 123 | def create_completion_with_structured_output( 124 | self, messages: list[dict[str, str]], json_schema: dict, strict: bool = False 125 | ) -> dict: 126 | """Create a completion with structured output using OpenAI API.""" 127 | try: 128 | response = self.client.chat.completions.create( 129 | model=self.model, 130 | messages=messages, 131 | response_format={ 132 | "type": "json_schema", 133 | "json_schema": { 134 | "name": json_schema.get("name", "structured_response"), 135 | "strict": strict, 136 | "schema": json_schema, 137 | }, 138 | }, 139 | **self.model_kwargs, 140 | ) 141 | return json.loads(response.choices[0].message.content) 142 | except Exception as e: 143 | logger.error(f"Error creating OpenAI structured completion: {e}") 144 | raise e 145 | 146 | def 
count_tokens_from_text(self, text: str) -> int:
147 |         """Count the number of tokens in the text using tiktoken."""
148 |         try:
149 |             encoding = tiktoken.encoding_for_model(self.model)
150 |         except KeyError:
151 |             # Fall back to o200k_base encoding if model not found
152 |             encoding = tiktoken.get_encoding("o200k_base")
153 | 
154 |         return len(encoding.encode(text, disallowed_special=()))
155 | 
156 |     def count_tokens_from_messages(self, messages: list[dict[str, str]]) -> int:
157 |         """Count the number of tokens in the messages using tiktoken."""
158 |         tokens_per_message = 3
159 |         tokens_per_name = 1
160 | 
161 |         num_tokens = 0
162 |         for message in messages:
163 |             num_tokens += tokens_per_message
164 |             for key, value in message.items():
165 |                 num_tokens += self.count_tokens_from_text(value)
166 |                 if key == "name":
167 |                     num_tokens += tokens_per_name
168 |         num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
169 |         return num_tokens
170 | 
171 | 
172 | class DeepSeekClient(OpenAIClient):
173 |     """DeepSeek API client."""
174 | 
175 |     def __init__(
176 |         self,
177 |         model: str,
178 |         model_provider: str,
179 |         max_retries: int = DEFAULT_MAX_RETRIES,
180 |         **model_kwargs: Any,
181 |     ):
182 |         super().__init__(model, model_provider, max_retries, **model_kwargs)
183 | 
184 |         self.client = openai.OpenAI.copy(
185 |             self.client, base_url="https://api.deepseek.com/v1"
186 |         )
187 | 
188 | 
189 | class QwenClient(OpenAIClient):
190 |     """Qwen API client."""
191 | 
192 |     def __init__(self, model: str, model_provider: str, **model_kwargs: Any):
193 |         # Handle enable_thinking
194 |         if "enable_thinking" in model_kwargs:
195 |             enable_thinking = model_kwargs.pop("enable_thinking")
196 |             model_kwargs["extra_body"] = {"enable_thinking": enable_thinking}
197 |         else:
198 |             model_kwargs["extra_body"] = {"enable_thinking": False}
199 | 
200 |         super().__init__(model, model_provider, **model_kwargs)
201 | 
202 |         self.client = openai.OpenAI.copy(
203 |             self.client, base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
204 |         )
205 | 
206 |     def create_completion(self, messages: list[dict[str, str]]) -> str:
207 |         """Create a completion using Qwen API with streaming support for enable_thinking."""
208 |         # If enable_thinking is True, we need to use streaming
209 |         if self.model_kwargs.get("extra_body", {}).get("enable_thinking", False):
210 |             response = self.client.chat.completions.create(
211 |                 model=self.model,
212 |                 messages=messages,
213 |                 stream=True,
214 |                 **self.model_kwargs,
215 |             )
216 | 
217 |             # Collect the streamed response
218 |             content = ""
219 |             for chunk in response:
220 |                 if chunk.choices[0].delta.content is not None:
221 |                     content += chunk.choices[0].delta.content
222 | 
223 |             return content
224 |         else:
225 |             # Use parent implementation for non-thinking calls
226 |             return super().create_completion(messages)
227 | 
228 | 
229 | class MoonshotClient(OpenAIClient):
230 |     """Moonshot API client."""
231 | 
232 |     def __init__(
233 |         self,
234 |         model: str,
235 |         model_provider: str,
236 |         max_retries: int = DEFAULT_MAX_RETRIES,
237 |         **model_kwargs: Any,
238 |     ):
239 |         super().__init__(model, model_provider, max_retries, **model_kwargs)
240 | 
241 |         self.client = openai.OpenAI.copy(
242 |             self.client, base_url="https://api.moonshot.cn/v1"
243 |         )
244 | 
245 | 
246 | class GeminiClient(OpenAIClient):
247 |     """Gemini API client using Google's OpenAI-compatible API."""
248 | 
249 |     def __init__(
250 |         self,
251 |         model: str,
252 |         model_provider: str,
253 |         max_retries: int = DEFAULT_MAX_RETRIES,
254 |         
**model_kwargs: Any, 255 | ): 256 | super().__init__(model, model_provider, max_retries, **model_kwargs) 257 | 258 | self.client = openai.OpenAI.copy( 259 | self.client, 260 | base_url="https://generativelanguage.googleapis.com/v1beta/openai/", 261 | ) 262 | 263 | 264 | class AnthropicClient(BaseModelClient): 265 | """Anthropic API client.""" 266 | 267 | def __init__( 268 | self, 269 | model: str, 270 | model_provider: str, 271 | max_retries: int = DEFAULT_MAX_RETRIES, 272 | **model_kwargs: Any, 273 | ): 274 | super().__init__(model, model_provider, max_retries, **model_kwargs) 275 | 276 | # Initialize the Anthropic client 277 | api_key = os.getenv("ANTHROPIC_API_KEY") 278 | if not api_key: 279 | raise ValueError("ANTHROPIC_API_KEY environment variable not set") 280 | 281 | self.client: anthropic.Anthropic = anthropic.Anthropic( 282 | api_key=api_key, max_retries=self.max_retries 283 | ) 284 | 285 | def _convert_to_anthropic_format( 286 | self, messages: list[dict[str, str]] 287 | ) -> tuple[list[dict[str, str]], str | None]: 288 | """Convert messages from OpenAI format to Anthropic format. 289 | 290 | Args: 291 | messages: List of messages in OpenAI format 292 | 293 | Returns: 294 | Tuple of (anthropic_messages, system_message) 295 | """ 296 | system_message = None 297 | anthropic_messages = [] 298 | 299 | # Extract system message if present 300 | if messages and messages[0]["role"] == "system": 301 | system_message = messages[0]["content"] 302 | messages = messages[1:] 303 | 304 | # Format remaining messages 305 | for msg in messages: 306 | anthropic_messages.append({"role": msg["role"], "content": msg["content"]}) 307 | 308 | return anthropic_messages, system_message 309 | 310 | def create_completion(self, messages: list[dict[str, str]]) -> str: 311 | """Create a completion using Anthropic API.""" 312 | try: 313 | # Convert OpenAI format to Anthropic format 314 | anthropic_messages, system_message = self._convert_to_anthropic_format( 315 | messages 316 | ) 317 | 318 | kwargs = self.model_kwargs.copy() 319 | if system_message: 320 | kwargs["system"] = system_message 321 | 322 | response = self.client.messages.create( 323 | model=self.model, 324 | messages=anthropic_messages, 325 | max_tokens=kwargs.pop("max_tokens", 4096), 326 | **kwargs, 327 | ) 328 | return response.content[0].text 329 | except Exception as e: 330 | logger.error(f"Error creating Anthropic completion: {e}") 331 | raise e 332 | 333 | def create_completion_with_structured_output( 334 | self, messages: list[dict[str, str]], json_schema: dict 335 | ) -> dict: 336 | """Create a completion with structured output using Anthropic API.""" 337 | try: 338 | # Convert OpenAI format to Anthropic format 339 | anthropic_messages, system_message = self._convert_to_anthropic_format( 340 | messages 341 | ) 342 | 343 | kwargs = self.model_kwargs.copy() 344 | if system_message: 345 | kwargs["system"] = system_message 346 | 347 | # Create a tool definition from the JSON schema 348 | tool_name = json_schema.get("name", "record_output") 349 | tool = { 350 | "name": tool_name, 351 | "description": f"Record output using the schema: {json_schema.get('description', 'Structured output')}", 352 | "input_schema": json_schema, 353 | } 354 | 355 | response = self.client.messages.create( 356 | model=self.model, 357 | messages=anthropic_messages, 358 | max_tokens=kwargs.pop("max_tokens", 4096), 359 | tools=[tool], 360 | tool_choice={"type": "tool", "name": tool_name}, 361 | **kwargs, 362 | ) 363 | 364 | # Extract the tool use result 365 | for content in 
response.content: 366 | if content.type == "tool_use" and content.name == tool_name: 367 | return content.input 368 | 369 | raise ValueError("No tool use found in response") 370 | except Exception as e: 371 | logger.error(f"Error creating Anthropic structured completion: {e}") 372 | raise e 373 | 374 | def count_tokens_from_text(self, text: str) -> int: 375 | """Count the number of tokens in the text using Anthropic's API.""" 376 | try: 377 | # Wrap the text in a user message for token counting 378 | result = self.client.messages.count_tokens( 379 | model=self.model, messages=[{"role": "user", "content": text}] 380 | ) 381 | return result.usage.input_tokens 382 | except Exception as e: 383 | logger.error(f"Error counting tokens with Anthropic API: {e}") 384 | raise e 385 | 386 | def count_tokens_from_messages(self, messages: list[dict[str, str]]) -> int: 387 | """Count the number of tokens in the messages using Anthropic's API.""" 388 | try: 389 | # Convert OpenAI format to Anthropic format 390 | anthropic_messages, system_message = self._convert_to_anthropic_format( 391 | messages 392 | ) 393 | 394 | # Count tokens with Anthropic API 395 | kwargs = {} 396 | if system_message: 397 | kwargs["system"] = system_message 398 | 399 | result = self.client.messages.count_tokens( 400 | model=self.model, messages=anthropic_messages, **kwargs 401 | ) 402 | return result.usage.input_tokens 403 | except Exception as e: 404 | logger.error(f"Error counting tokens with Anthropic API: {e}") 405 | raise e 406 | -------------------------------------------------------------------------------- /src/swe_care/collect/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from pathlib import Path 4 | 5 | from loguru import logger 6 | 7 | import swe_care.collect.build_code_review_dataset 8 | import swe_care.collect.classify_prs_data 9 | import swe_care.collect.convert_to_rm_samples 10 | import swe_care.collect.get_graphql_prs_data 11 | import swe_care.collect.get_top_repos 12 | from swe_care.collect.build_code_review_dataset import build_code_review_dataset 13 | from swe_care.collect.classify_prs_data import classify_prs_data 14 | from swe_care.collect.convert_to_rm_samples import convert_to_rm_samples 15 | from swe_care.collect.get_graphql_prs_data import get_graphql_prs_data 16 | from swe_care.collect.get_top_repos import get_top_repos 17 | 18 | # Mapping of subcommands to their function names 19 | SUBCOMMAND_MAP = { 20 | "get_top_repos": { 21 | "function": get_top_repos, 22 | "help": swe_care.collect.get_top_repos.__doc__, 23 | }, 24 | "get_graphql_prs_data": { 25 | "function": get_graphql_prs_data, 26 | "help": swe_care.collect.get_graphql_prs_data.__doc__, 27 | }, 28 | "classify_prs_data": { 29 | "function": classify_prs_data, 30 | "help": swe_care.collect.classify_prs_data.__doc__, 31 | }, 32 | "build_code_review_dataset": { 33 | "function": build_code_review_dataset, 34 | "help": swe_care.collect.build_code_review_dataset.__doc__, 35 | }, 36 | "convert_to_rm_samples": { 37 | "function": convert_to_rm_samples, 38 | "help": swe_care.collect.convert_to_rm_samples.__doc__, 39 | }, 40 | } 41 | 42 | 43 | def create_global_parser(): 44 | """Create a parser with global arguments that can be used as a parent parser.""" 45 | global_parser = argparse.ArgumentParser(add_help=False) 46 | global_parser.add_argument( 47 | "--tokens", 48 | type=str, 49 | nargs="*", 50 | default=None, 51 | help="GitHub API token(s) to be used randomly for fetching 
data", 52 | ) 53 | global_parser.add_argument( 54 | "--output-dir", 55 | type=Path, 56 | required=True, 57 | help="Path to output directory", 58 | ) 59 | return global_parser 60 | 61 | 62 | def get_args(): 63 | # Parse command line manually to handle flexible argument order 64 | args = sys.argv[1:] 65 | 66 | # Find the subcommand 67 | subcommands = list(SUBCOMMAND_MAP.keys()) 68 | subcommand = None 69 | subcommand_index = None 70 | 71 | for i, arg in enumerate(args): 72 | if arg in subcommands: 73 | subcommand = arg 74 | subcommand_index = i 75 | break 76 | 77 | # Create global parser 78 | global_parser = create_global_parser() 79 | 80 | if subcommand is None: 81 | # No subcommand found, use normal argparse 82 | parser = argparse.ArgumentParser( 83 | prog="swe_care.collect", 84 | description="Data collection tools for SWE-CARE", 85 | parents=[global_parser], 86 | ) 87 | 88 | subparsers = parser.add_subparsers(dest="command", help="Available commands") 89 | for cmd, info in SUBCOMMAND_MAP.items(): 90 | subparsers.add_parser(cmd, help=info["help"]) 91 | 92 | return parser.parse_args(args) 93 | 94 | # Create the appropriate subcommand parser with global parser as parent 95 | match subcommand: 96 | case "get_top_repos": 97 | sub_parser = argparse.ArgumentParser( 98 | prog=f"swe_care.collect {subcommand}", 99 | parents=[global_parser], 100 | description=SUBCOMMAND_MAP[subcommand]["help"], 101 | ) 102 | sub_parser.add_argument( 103 | "--language", 104 | type=str, 105 | required=True, 106 | help="Programming language to search for", 107 | ) 108 | sub_parser.add_argument( 109 | "--top-n", 110 | type=int, 111 | required=True, 112 | help="Number of top repositories to fetch", 113 | ) 114 | 115 | case "get_graphql_prs_data": 116 | sub_parser = argparse.ArgumentParser( 117 | prog=f"swe_care.collect {subcommand}", 118 | parents=[global_parser], 119 | description=SUBCOMMAND_MAP[subcommand]["help"], 120 | ) 121 | repo_group = sub_parser.add_mutually_exclusive_group(required=True) 122 | repo_group.add_argument( 123 | "--repo-file", 124 | type=Path, 125 | help="Path to repository JSONL file containing repository information, each line should be a JSON object with at least a 'name' field in format 'owner/repo'. Optionally, include 'pr_cursor' field to resume fetching from a specific cursor for each repository.", 126 | default=None, 127 | ) 128 | repo_group.add_argument( 129 | "--repo", 130 | type=str, 131 | help="Repository in format 'owner/repo'", 132 | default=None, 133 | ) 134 | sub_parser.add_argument( 135 | "--max-number", 136 | type=int, 137 | default=None, 138 | help="Maximum number of PRs to fetch per page (ignored when specific_prs is provided). If not provided, all PRs will be fetched.", 139 | ) 140 | sub_parser.add_argument( 141 | "--jobs", 142 | type=int, 143 | default=2, 144 | help="Number of concurrent jobs/threads to use (default: 2)", 145 | ) 146 | sub_parser.add_argument( 147 | "--specific-prs", 148 | type=int, 149 | nargs="*", 150 | default=None, 151 | help="Specific PR numbers to fetch (if not specified, fetches all PRs with closing issues)", 152 | ) 153 | sub_parser.add_argument( 154 | "--after-pr-cursor", 155 | type=str, 156 | default=None, 157 | help="Resume fetching PRs after this cursor (useful for resuming interrupted runs). 
When used with --repo-file, acts as fallback for repositories without pr_cursor field in the file.", 158 | ) 159 | 160 | case "classify_prs_data": 161 | sub_parser = argparse.ArgumentParser( 162 | prog=f"swe_care.collect {subcommand}", 163 | parents=[global_parser], 164 | description=SUBCOMMAND_MAP[subcommand]["help"], 165 | ) 166 | sub_parser.add_argument( 167 | "--graphql-prs-data-file", 168 | type=Path, 169 | required=True, 170 | help="Path to GraphQL PRs data file or directory containing *_graphql_prs_data.jsonl files", 171 | ) 172 | sub_parser.add_argument( 173 | "--jobs", 174 | type=int, 175 | default=2, 176 | help="Number of concurrent jobs/threads to use (default: 2)", 177 | ) 178 | 179 | case "build_code_review_dataset": 180 | sub_parser = argparse.ArgumentParser( 181 | prog=f"swe_care.collect {subcommand}", 182 | parents=[global_parser], 183 | description=SUBCOMMAND_MAP[subcommand]["help"], 184 | ) 185 | sub_parser.add_argument( 186 | "--graphql-prs-data-file", 187 | type=Path, 188 | required=True, 189 | help="Path to GraphQL PRs data file or directory containing *_graphql_prs_data.jsonl files", 190 | ) 191 | sub_parser.add_argument( 192 | "--pr-classification-file", 193 | type=Path, 194 | required=True, 195 | help="Path to PR classification file or directory containing *_pr_classification.jsonl files", 196 | ) 197 | sub_parser.add_argument( 198 | "--model", 199 | type=str, 200 | required=True, 201 | help="Model name for metadata classification (e.g., gpt-4o, claude-3-5-sonnet-20241022)", 202 | ) 203 | sub_parser.add_argument( 204 | "--model-provider", 205 | type=str, 206 | required=True, 207 | help="Model provider for metadata classification (e.g., openai, anthropic)", 208 | ) 209 | sub_parser.add_argument( 210 | "--model-args", 211 | type=str, 212 | default=None, 213 | help="Comma-separated model arguments for metadata classification (e.g., temperature=0.7,top_p=0.9)", 214 | ) 215 | sub_parser.add_argument( 216 | "--skip-existing", 217 | action="store_true", 218 | default=False, 219 | help="Skip processing existing instance_id in the output file (default: False)", 220 | ) 221 | sub_parser.add_argument( 222 | "--jobs", 223 | type=int, 224 | default=2, 225 | help="Number of concurrent jobs/threads to use (default: 2)", 226 | ) 227 | sub_parser.add_argument( 228 | "--keep-empty-reference-review-comments", 229 | action="store_true", 230 | default=False, 231 | help="Keep dataset with empty reference review comments list (default: False)", 232 | ) 233 | 234 | case "convert_to_rm_samples": 235 | sub_parser = argparse.ArgumentParser( 236 | prog=f"swe_care.collect {subcommand}", 237 | parents=[global_parser], 238 | description=SUBCOMMAND_MAP[subcommand]["help"], 239 | ) 240 | sub_parser.add_argument( 241 | "--graphql-prs-data-file", 242 | type=Path, 243 | required=True, 244 | help="Path to GraphQL PRs data file or directory containing *_graphql_prs_data.jsonl files", 245 | ) 246 | sub_parser.add_argument( 247 | "--pr-classification-file", 248 | type=Path, 249 | required=True, 250 | help="Path to PR classification file or directory containing *_pr_classification.jsonl files", 251 | ) 252 | sub_parser.add_argument( 253 | "--file-source", 254 | type=str, 255 | choices=[ 256 | "none", 257 | "base_changed_files", 258 | "reviewed_file", 259 | "retrieved_base_changed_files", 260 | "retrieved_all_files", 261 | ], 262 | default="none", 263 | help="Source for file content in review samples. 
Choices: 'none' (no file content in review samples), 'base_changed_files' (include changed files contents between base commit and commit to review), 'reviewed_file' (include changed file content to the sample the review comment applied to), 'retrieved_base_changed_files' (use BM25 to retrieve relevant files from changed files based on diff_hunk), 'retrieved_all_files' (use BM25 to retrieve relevant files from entire repository based on diff_hunk). Default: none", 264 | ) 265 | sub_parser.add_argument( 266 | "--jobs", 267 | type=int, 268 | default=2, 269 | help="Number of concurrent jobs/threads to use (default: 2)", 270 | ) 271 | sub_parser.add_argument( 272 | "--retrieval-max-files", 273 | type=int, 274 | default=5, 275 | help="Maximum number of files to use for retrieval when file_source is 'retrieved_base_changed_files' or 'retrieved_all_files' (default: 5)", 276 | ) 277 | sub_parser.add_argument( 278 | "--retrieval-output-dir", 279 | type=Path, 280 | help="Output directory for retrieval operations when file_source is 'retrieved_all_files' (required when file_source is 'retrieved_all_files')", 281 | ) 282 | sub_parser.add_argument( 283 | "--skip-existing", 284 | action="store_true", 285 | default=False, 286 | help="Skip processing existing PR (identified by PR number) in existing repo", 287 | ) 288 | 289 | # Parse all arguments with the subcommand parser 290 | # This will include both global and subcommand-specific arguments 291 | # Remove the subcommand itself from args 292 | args_without_subcommand = args[:subcommand_index] + args[subcommand_index + 1 :] 293 | final_namespace = sub_parser.parse_args(args_without_subcommand) 294 | final_namespace.command = subcommand 295 | 296 | return final_namespace 297 | 298 | 299 | def main(): 300 | args = get_args() 301 | 302 | if args.command in SUBCOMMAND_MAP: 303 | # Get the function from the mapping 304 | cmd_info = SUBCOMMAND_MAP[args.command] 305 | function = cmd_info["function"] 306 | 307 | # Prepare common arguments 308 | common_kwargs = {"output_dir": args.output_dir, "tokens": args.tokens} 309 | 310 | # Add specific arguments based on subcommand 311 | match args.command: 312 | case "get_top_repos": 313 | function(language=args.language, top_n=args.top_n, **common_kwargs) 314 | case "get_graphql_prs_data": 315 | function( 316 | repo_file=args.repo_file, 317 | repo=args.repo, 318 | max_number=args.max_number, 319 | specific_prs=args.specific_prs, 320 | jobs=args.jobs, 321 | after_pr_cursor=args.after_pr_cursor, 322 | **common_kwargs, 323 | ) 324 | case "classify_prs_data": 325 | function( 326 | graphql_prs_data_file=args.graphql_prs_data_file, 327 | jobs=args.jobs, 328 | **common_kwargs, 329 | ) 330 | case "build_code_review_dataset": 331 | function( 332 | graphql_prs_data_file=args.graphql_prs_data_file, 333 | pr_classification_file=args.pr_classification_file, 334 | model=args.model, 335 | model_provider=args.model_provider, 336 | model_args=args.model_args, 337 | skip_existing=args.skip_existing, 338 | jobs=args.jobs, 339 | keep_empty_reference_review_comments=args.keep_empty_reference_review_comments, 340 | **common_kwargs, 341 | ) 342 | case "convert_to_rm_samples": 343 | function( 344 | graphql_prs_data_file=args.graphql_prs_data_file, 345 | pr_classification_file=args.pr_classification_file, 346 | file_source=args.file_source, 347 | jobs=args.jobs, 348 | retrieval_max_files=args.retrieval_max_files, 349 | retrieval_output_dir=args.retrieval_output_dir, 350 | skip_existing=args.skip_existing, 351 | **common_kwargs, 352 | ) 353 | 
else: 354 | logger.info("Please specify a command. Use --help for available commands.") 355 | 356 | 357 | if __name__ == "__main__": 358 | main() 359 | --------------------------------------------------------------------------------
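
A minimal usage sketch for the `swe_care.collect` entry point above (illustrative only; the subcommand name, flag names, and call signature come from `src/swe_care/collect/__main__.py`, while the language, top-N value, output directory, and token placeholder are assumed example values). `main()` forwards the global `--output-dir`/`--tokens` options plus the subcommand-specific flags to the matching function, so the `get_top_repos` subcommand reduces to a direct call:

```python
from pathlib import Path

from swe_care.collect.get_top_repos import get_top_repos

# Roughly equivalent to running:
#   python -m swe_care.collect get_top_repos --language Python --top-n 50 \
#       --output-dir ./collected --tokens <GITHUB_TOKEN>
get_top_repos(
    language="Python",               # --language (example value)
    top_n=50,                        # --top-n (example value)
    output_dir=Path("./collected"),  # --output-dir (example value)
    tokens=["<GITHUB_TOKEN>"],       # --tokens (placeholder); the parser's default is None
)
```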