├── src
    ├── testora
    │   ├── __init__.py
    │   ├── util
    │   │   ├── Exceptions.py
    │   │   ├── ClassificationResult.py
    │   │   ├── UndefinedRefsFinder.py
    │   │   ├── DocstringRetrieval.py
    │   │   ├── PythonLanguageServer.py
    │   │   ├── Logs.py
    │   │   └── ClonedRepoManager.py
    │   ├── execution
    │   │   ├── TestExecution.py
    │   │   ├── CoverageAnalyzer.py
    │   │   ├── DockerExecutor.py
    │   │   └── ProgramMerger.py
    │   ├── prompts
    │   │   ├── PromptCommon.py
    │   │   ├── UndefinedRefsFixingPrompt.py
    │   │   ├── SelectExpectedBehaviorPrompt.py
    │   │   ├── TemperatureExperiment.py
    │   │   ├── PRRegressionBugRanking.py
    │   │   ├── RegressionTestGeneratorPromptV1.py
    │   │   └── RegressionTestGeneratorPromptV2.py
    │   ├── evaluation
    │   │   ├── TargetPRs.py
    │   │   ├── ClassificationResultsInspector.py
    │   │   ├── TestFailureInspector.py
    │   │   ├── sql
    │   │   │   ├── tasks.sql
    │   │   │   └── classification_tasks.sql
    │   │   ├── CheckedPRsInspector.py
    │   │   ├── FindCandidateProjects.py
    │   │   ├── ResultsManager.py
    │   │   ├── PreparePRChunks.py
    │   │   └── ClassificationResultsSummarizer.py
    │   ├── llms
    │   │   ├── LLMCacheAnalyzer.py
    │   │   ├── LLMCache.py
    │   │   └── OpenAIGPT.py
    │   ├── Config.py
    │   └── webui
    │   │   └── WebUI.py
    └── multilspy
    │   ├── README.md
    │   ├── __init__.py
    │   ├── multilspy_exceptions.py
    │   ├── type_helpers.py
    │   ├── multilspy_settings.py
    │   ├── multilspy_config.py
    │   ├── language_servers
    │       ├── rust_analyzer
    │       │   └── runtime_dependencies.json
    │       ├── eclipse_jdtls
    │       │   └── runtime_dependencies.json
    │       ├── omnisharp
    │       │   └── workspace_did_change_configuration.json
    │       └── jedi_language_server
    │       │   └── jedi_server.py
    │   ├── multilspy_logger.py
    │   └── lsp_protocol_handler
    │       └── lsp_constants.py
├── .vscode
    ├── settings.json
    └── launch.json
├── requirements.txt
├── data
    ├── ground_truth
    │   ├── template.json
    │   ├── scipy
    │   │   ├── 20089.json
    │   │   ├── 19776.json
    │   │   ├── 21076.json
    │   │   ├── 21553.json
    │   │   ├── 20751.json
    │   │   ├── 19428.json
    │   │   ├── 21036.json
    │   │   ├── 19263.json
    │   │   ├── 20974.json
    │   │   ├── 21518.json
    │   │   ├── 21577.json
    │   │   ├── 21642.json
    │   │   ├── 19680.json
    │   │   ├── 19853.json
    │   │   ├── 21633.json
    │   │   ├── 19861.json
    │   │   ├── 21528.json
    │   │   ├── 21597.json
    │   │   ├── 21572.json
    │   │   └── 21629.json
    │   ├── pandas
    │   │   ├── 58376.json
    │   │   ├── 57205.json
    │   │   ├── 57034.json
    │   │   ├── 57399.json
    │   │   ├── 57046.json
    │   │   ├── 55108.json
    │   │   ├── 59810.json
    │   │   └── 59782.json
    │   ├── marshmallow
    │   │   ├── 1399.json
    │   │   ├── 2215.json
    │   │   ├── 2102.json
    │   │   ├── 2246.json
    │   │   ├── 2271.json
    │   │   ├── 2123.json
    │   │   ├── 1998.json
    │   │   ├── 2244.json
    │   │   └── 2022.json
    │   └── keras
    │   │   └── 19814.json
    └── DATA.md
├── .gitignore
├── pyproject.toml
├── .devcontainer
    ├── setup_numpy_to_run_in_container.sh
    ├── setup_scipy_to_run_in_container.sh
    ├── postCreateCommands.sh
    ├── setup_scapy.sh
    ├── devcontainer.json
    ├── setup_transformers.sh
    ├── setup_pytorch_geometric.sh
    ├── setup_keras.sh
    ├── setup_marshmallow.sh
    ├── setup_scipy.sh
    ├── setup_numpy.sh
    ├── setup_pandas.sh
    └── setup_scikit-learn.sh
├── LICENSE
├── templates
    ├── pr_result.html
    ├── index.html
    └── pr_log.html
└── README.md


/src/testora/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/testora/util/Exceptions.py:
--------------------------------------------------------------------------------
1 | class TestoraException(BaseException):
2 |     pass
3 | 


--------------------------------------------------------------------------------
/src/multilspy/README.md:
--------------------------------------------------------------------------------
1 | All code in this folder is adapted from https://github.com/microsoft/monitors4codegen


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mypy.runUsingActiveInterpreter": true,
3 |     "mypy.targets": [
4 |         "src"
5 |     ],
6 |     "files.watcherExclude": {
7 |         "data/**": true,
8 |     }
9 | }


--------------------------------------------------------------------------------
/src/multilspy/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains the multilspy API
3 | """
4 | 
5 | from . import multilspy_types as Types
6 | from .language_server import LanguageServer, SyncLanguageServer
7 | 
8 | __all__ = ["LanguageServer", "Types", "SyncLanguageServer"]
9 | 


--------------------------------------------------------------------------------
/src/testora/execution/TestExecution.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Optional
 3 | 
 4 | 
 5 | @dataclass
 6 | class TestExecution:
 7 |     code: str
 8 |     output: Optional[str] = None
 9 |     coverage_report: Optional[str] = None
10 | 


--------------------------------------------------------------------------------
/src/testora/prompts/PromptCommon.py:
--------------------------------------------------------------------------------
 1 | from testora.Config import model_version
 2 | 
 3 | # NOTE: when changing the system message, must remove the old cache
 4 | 
 5 | if model_version.startswith("gpt"):
 6 |     system_message = "You are an experienced Python developer."
 7 | elif model_version.startswith("deepseek"):
 8 |     system_message = ""
 9 | 
10 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | docker==7.0.0
 2 | Flask==3.0.3
 3 | GitPython==3.1.43
 4 | jedi-language-server==0.41.4
 5 | libcst==1.2.0
 6 | matplotlib==3.8.4
 7 | openai==1.55.3
 8 | PyCG==0.0.8
 9 | pydantic==2.7.1
10 | PyGithub==2.3.0
11 | Requests==2.31.0
12 | typing_extensions==4.11.0
13 | unidiff==0.7.5
14 | mypy
15 | mysql-connector-python
16 | requests
17 | coverage


--------------------------------------------------------------------------------
/data/ground_truth/template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 9999999,
 3 |     "log_file": "data/old_results/XXXXX",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "XXXX",
 8 |                 "old_output": "XXXX",
 9 |                 "new_output": "XXXX"
10 |             },
11 |             "label": "TODO",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/src/multilspy/multilspy_exceptions.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module contains the exceptions raised by the Multilspy framework.
 3 | """
 4 | 
 5 | class MultilspyException(Exception):
 6 |     """
 7 |     Exceptions raised by the Multilspy framework.
 8 |     """
 9 | 
10 |     def __init__(self, message: str):
11 |         """
12 |         Initializes the exception with the given message.
13 |         """
14 |         super().__init__(message)


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | data/llm_cache*
 3 | data/repos
 4 | data/pr_chunks
 5 | data/results
 6 | data/results_03_2025
 7 | data/classification_results
 8 | data/classification_results_03_2025
 9 | data/target_prs
10 | data/figures/*.pdf
11 | .github_token
12 | logs_*.json
13 | results_*.json
14 | .openai_token
15 | .db_token
16 | .worker_id
17 | candidate_projects*.csv
18 | .target_project
19 | .coverage
20 | coverage.json
21 | coverage_report
22 | .openrouter_token
23 | 


--------------------------------------------------------------------------------
/data/DATA.md:
--------------------------------------------------------------------------------
1 | This folder contains detailed logs of running Testora, as well as manually created ground truth data for evaluating the classifier.The logs are not stored in Git. Instead, you can download and extract them into this folder.
2 | 
3 | Run these commands from the project's main directory to download the raw logs:
4 | 
5 | 1) ```wget https://github.com/michaelpradel/Testora/releases/download/data_03_2025/data_03_2025.tar.gz```
6 | 
7 | 2) ```tar -xf data_03_2025.tar.gz```


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "Testora"
 7 | version = "0.1.0"
 8 | description = ""
 9 | authors = [
10 |     { name = "Michael Pradel", email = "michael@binaervarianz.de" }
11 | ]
12 | license = "MIT"
13 | dependencies = []
14 | 
15 | [tool.hatch.build.targets.wheel]
16 | packages = [
17 |     "src/testora",
18 | ]
19 | 
20 | [tool.hatch.build.targets.sdist]
21 | include = [
22 |     "/src",
23 | ]
24 | 


--------------------------------------------------------------------------------
/src/testora/evaluation/TargetPRs.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | 
 5 | base_dir = "data/target_prs/"
 6 | 
 7 | 
 8 | def project_to_target_prs():
 9 |     project_to_prs = {}
10 |     for project_file in os.listdir(base_dir):
11 |         if project_file.endswith(".json"):
12 |             project_name = project_file.replace(".json", "")
13 |             with open(os.path.join(base_dir, project_file), "r") as f:
14 |                 project_to_prs[project_name] = json.load(f)
15 | 
16 |     return project_to_prs
17 | 


--------------------------------------------------------------------------------
/src/testora/util/ClassificationResult.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from enum import Enum
 3 | 
 4 | 
 5 | class Classification(str, Enum):
 6 |     UNKNOWN = "unknown"
 7 |     INTENDED_CHANGE = "intended_change"
 8 |     COINCIDENTAL_FIX = "coincidental_fix"
 9 |     REGRESSION = "regression"
10 | 
11 | 
12 | @dataclass
13 | class ClassificationResult:
14 |     test_code: str
15 |     old_output: str
16 |     new_output: str
17 |     classification: Classification
18 |     classification_explanation: str
19 | 


--------------------------------------------------------------------------------
/.devcontainer/setup_numpy_to_run_in_container.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | apt install -y gcc g++ gfortran libopenblas-dev liblapack-dev pkg-config python3-pip python3-dev
 4 | 
 5 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
 6 | bash Miniforge3.sh -b -p "${HOME}/conda"
 7 | source "${HOME}/conda/etc/profile.d/conda.sh"
 8 | source "${HOME}/conda/etc/profile.d/mamba.sh"
 9 | 
10 | mamba env create -f environment.yml
11 | mamba activate numpy-dev
12 | 
13 | pip install -e . --no-build-isolation


--------------------------------------------------------------------------------
/.devcontainer/setup_scipy_to_run_in_container.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | apt update
 4 | apt install -y gcc g++ gfortran libopenblas-dev liblapack-dev pkg-config
 5 | 
 6 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
 7 | bash Miniforge3.sh -b -p "${HOME}/conda"
 8 | source "${HOME}/conda/etc/profile.d/conda.sh"
 9 | source "${HOME}/conda/etc/profile.d/mamba.sh"
10 | mamba shell init
11 | 
12 | mamba env create -f environment.yml -y
13 | mamba activate scipy-dev
14 | 
15 | pip install -e . --no-build-isolation
16 | 
17 | pip install coverage


--------------------------------------------------------------------------------
/.devcontainer/postCreateCommands.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | pip install --user -r requirements.txt
 4 | pip install -e .
 5 | 
 6 | echo "Setting up project-under-analysis"
 7 | # Select which project to analyze:
 8 | .devcontainer/setup_scipy.sh
 9 | # .devcontainer/setup_pandas.sh
10 | # .devcontainer/setup_keras.sh
11 | # .devcontainer/setup_marshmallow.sh
12 | 
13 | 
14 | ## Experimental and not really supported as of now:
15 | # .devcontainer/setup_scikit-learn.sh
16 | # .devcontainer/setup_numpy.sh
17 | # .devcontainer/setup_transformers.sh
18 | # .devcontainer/setup_pytorch_geometric.sh
19 | # .devcontainer/setup_scapy.sh


--------------------------------------------------------------------------------
/data/ground_truth/scipy/20089.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 20089,
 3 |     "log_file": "data/old_results/results_scipy_20138_20023.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 12: Using hyp2f1 with complex -inf input\nimport numpy as np\nimport scipy.special\n\nresult = scipy.special.hyp2f1(1.0, -np.inf, 3.0, 4.0+1.0j)\nprint(result)\n",
 8 |                 "old_output": "(nan+nanj)\n",
 9 |                 "new_output": "(1+0j)\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/scipy/scipy/issues/20988"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/pandas/58376.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 58376,
 3 |     "log_file": "data/old_results/results_pandas_58389_58294.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Corner Case Example 10: Using RangeIndex.searchsorted with a negative step value\nimport pandas as pd\n\nri = pd.RangeIndex(9, 0, -3)\nvalue = 5\nresult = ri.searchsorted(value)\nprint(result)",
 8 |                 "old_output": "0\n",
 9 |                 "new_output": "2\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": "https://github.com/pandas-dev/pandas/issues/58641"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/19776.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 19776,
 3 |     "log_file": "data/old_results/results_scipy_19816_19725.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Corner Case Example 6: Using inf and -inf values\nimport numpy as np\nfrom scipy.stats import rankdata\n\ndata = np.array([10, np.inf, -np.inf, 25, 30])\nresult = rankdata(data, method='average')\nprint(result)",
 8 |                 "old_output": "[nan nan nan nan nan]\n",
 9 |                 "new_output": "[2. 5. 1. 3. 4.]\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/1399.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 1399,
 3 |     "log_file": "data/old_results/results_marshmallow_1488_1348.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 2: Normal usage scenario\nfrom marshmallow.utils import get_func_args\n\nclass ExampleClass:\n    def __init__(self, attr1, attr2):\n        pass\n\nargs = get_func_args(ExampleClass)\nprint(args)\n# Output: ['attr1', 'attr2']\n\n",
 8 |                 "old_output": "['args', 'kwargs']\n",
 9 |                 "new_output": "['attr1', 'attr2']\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/pandas/57205.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 57205,
 3 |     "log_file": "data/old_results/results_pandas_57278_57203.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "import pandas as pd\n# Example 12: Corner case - Creating a DataFrame with all None values\ndata = {'a': None, 'b': None}\ncolumns = ['a', 'b']\ndf = pd.DataFrame(data, columns=columns, index=range(2))\nprint(\"DataFrame 12:\\n\", df)",
 8 |                 "old_output": "DataFrame 12:\n      a    b\n0  NaN  NaN\n1  NaN  NaN\n",
 9 |                 "new_output": "DataFrame 12:\n       a     b\n0  None  None\n1  None  None\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/pandas/57034.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 57034,
 3 |     "log_file": "data/old_results/results_pandas_57112_56981.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 8: Combine with both Series containing all NaN values\nimport pandas as pd\n\ns1 = pd.Series([None, None, None], index=['a', 'b', 'c'])\ns2 = pd.Series([None, None, None], index=['b', 'c', 'd'])\n\nresult = s1.combine_first(s2)\nprint(result)",
 8 |                 "old_output": "a    None\nb    None\nc    None\nd    None\ndtype: object\n",
 9 |                 "new_output": "a     NaN\nb    None\nc    None\nd    None\ndtype: object\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21076.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21076,
 3 |     "log_file": "data/old_results/results_scipy_21151_20231.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 10: Larger dataset\nimport numpy as np\nfrom scipy.stats import differential_entropy\n\nvalues = np.array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11])\nresult = differential_entropy(values)\nprint(f\"Entropy of a larger dataset: {result}\")",
 8 |                 "old_output": "Entropy of a larger dataset: 2.358820400183337\n",
 9 |                 "new_output": "Entropy of a larger dataset: 2.5285667498058793\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/scipy/scipy/issues/21192"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/2215.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 2215,
 3 |     "log_file": "data/old_results/results_marshmallow_2215_2130.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 10: Demonstrate warning for deprecated option in SchemaOpts\nfrom marshmallow.schema import SchemaOpts\n\nclass Meta:\n    json_module = \"custom_json\"\n\noptions = SchemaOpts(meta=Meta)",
 8 |                 "old_output": "",
 9 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:7: RemovedInMarshmallow4Warning: The json_module class Meta option is deprecated. Use render_module instead.\n  options = SchemaOpts(meta=Meta)\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "updating the dependencies leads to a valid deprecation warning"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/pandas/57399.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 57399,
 3 |     "log_file": "data/old_results/results_pandas_57450_57356.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 20: Creating an interval range with non-matching dtype for start and end\nimport pandas as pd\nimport numpy as np\n\nresult = pd.interval_range(start=np.float32(0), end=5, freq=1)\nprint(result)",
 8 |                 "old_output": "IntervalIndex([(0.0, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0]], dtype='interval[float64, right]')\n",
 9 |                 "new_output": "IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], dtype='interval[int64, right]')\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/pandas-dev/pandas/issues/58964"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/src/multilspy/type_helpers.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module provides type-helpers used across multilspy implementation
 3 | """
 4 | 
 5 | import inspect
 6 | 
 7 | from typing import Callable, TypeVar, Type
 8 | 
 9 | R = TypeVar("R", bound=object)
10 | 
11 | def ensure_all_methods_implemented(
12 |     source_cls: Type[object],
13 | ) -> Callable[[Type[R]], Type[R]]:
14 |     """
15 |     A decorator to ensure that all methods of source_cls class are implemented in the decorated class.
16 |     """
17 | 
18 |     def check_all_methods_implemented(target_cls: R) -> R:
19 |         for name, _ in inspect.getmembers(source_cls, inspect.isfunction):
20 |             if name not in target_cls.__dict__ or not callable(target_cls.__dict__[name]):
21 |                 raise NotImplementedError(f"{name} is not implemented in {target_cls}")
22 | 
23 |         return target_cls
24 | 
25 |     return check_all_methods_implemented


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21553.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21553,
 3 |     "log_file": "data/results/scipy/21553_2024-11-23 09:06:43.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 19:\nimport numpy as np\nfrom scipy.linalg import expm\n\nA = np.array([[1, 2], [3, 4]], dtype=np.float32)  # Single precision float matrix\nresult = expm(A)\nprint(\"Exponential of A with float32 precision:\\n\", result)",
 8 |                 "old_output": "Exponential of A with float32 precision:\n [[ 51.969006  74.73665 ]\n [112.10497  164.074   ]]\n",
 9 |                 "new_output": "Exponential of A with float32 precision:\n [[ 44.694115  65.096375]\n [ 97.64457  142.33871 ]]\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "unintended side-effect of a performance optimization"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/keras/19814.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 19814,
 3 |     "log_file": "data/old_results/results_keras_19840_19690.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 19: Multiple predictions with mixed types.\nimport numpy as np\nimport jax.numpy as jnp\nfrom keras.src.backend.jax.math import in_top_k\n\npredictions = jnp.array([[0.1, np.nan, 0.1], [0.9, np.nan, 0.0]])\ntargets = jnp.array([1, 2])\nk = 1\ntry:\n    result = in_top_k(targets, predictions, k)\n    print(result)\nexcept Exception as e:\n    print(f\"Error: {e}\")  # Invalid predictions/types - Error expected\n",
 8 |                 "old_output": "[False False]\n",
 9 |                 "new_output": "[ True False]\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/keras-team/keras/issues/19995"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/pandas/57046.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 57046,
 3 |     "log_file": "data/old_results/results_pandas_57112_56981.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 13: Corner Case - SeriesGroupBy.idxmin with NaN values only\nimport pandas as pd\nimport numpy as np\n\ns = pd.Series([np.nan, np.nan, np.nan, np.nan], index=['a', 'b', 'a', 'b'])\ngrouped = s.groupby(s.index)\nresult = grouped.idxmin(skipna=True)\nprint(\"SeriesGroupBy.idxmin with NaN values only result:\\n\", result)",
 8 |                 "old_output": "SeriesGroupBy.idxmin with NaN values only result:\n a    a\nb    a\ndtype: object\n",
 9 |                 "new_output": "SeriesGroupBy.idxmin with NaN values only result:\n a    NaN\nb    NaN\ndtype: object\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/src/multilspy/multilspy_settings.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Defines the settings for multilspy.
 3 | """
 4 | 
 5 | import os
 6 | import pathlib
 7 | 
 8 | class MultilspySettings:
 9 |     """
10 |     Provides the various settings for multilspy.
11 |     """
12 |     @staticmethod
13 |     def get_language_server_directory() -> str:
14 |         """Returns the directory for language servers"""
15 |         user_home = pathlib.Path.home()
16 |         multilspy_dir = str(pathlib.PurePath(user_home, ".multilspy"))
17 |         lsp_dir = str(pathlib.PurePath(multilspy_dir, "lsp"))
18 |         os.makedirs(lsp_dir, exist_ok=True)
19 |         return lsp_dir
20 | 
21 |     @staticmethod
22 |     def get_global_cache_directory() -> str:
23 |         """Returns the cache directory"""
24 |         global_cache_dir = os.path.join(str(pathlib.Path.home()), ".multilspy", "global_cache")
25 |         os.makedirs(global_cache_dir, exist_ok=True)
26 |         return global_cache_dir
27 | 


--------------------------------------------------------------------------------
/data/ground_truth/pandas/55108.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 55108,
 3 |     "log_file": "data/old_results/results_pandas_known_bugs.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 7: Using difference with one Index being a DateTimeIndex\nimport pandas as pd\n\nindex1 = pd.date_range('2022-01-01', periods=5)\nindex2 = pd.Index(['2022-01-03', '2022-01-04'])\nresult = index1.difference(index2)\nprint(result)",
 8 |                 "old_output": "DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-05'], dtype='datetime64[ns]', freq=None)\n",
 9 |                 "new_output": "DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',\n               '2022-01-05'],\n              dtype='datetime64[ns]', freq='D')\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/pandas-dev/pandas/issues/58971"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/src/multilspy/multilspy_config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Configuration parameters for Multilspy.
 3 | """
 4 | 
 5 | from enum import Enum
 6 | from dataclasses import dataclass
 7 | 
 8 | class Language(str, Enum):
 9 |     """
10 |     Possible languages with Multilspy.
11 |     """
12 | 
13 |     CSHARP = "csharp"
14 |     PYTHON = "python"
15 |     RUST = "rust"
16 |     JAVA = "java"
17 | 
18 |     def __str__(self) -> str:
19 |         return self.value
20 | 
21 | @dataclass
22 | class MultilspyConfig:
23 |     """
24 |     Configuration parameters
25 |     """
26 |     code_language: Language
27 |     trace_lsp_communication: bool = False
28 | 
29 |     @classmethod
30 |     def from_dict(cls, env: dict):
31 |         """
32 |         Create a MultilspyConfig instance from a dictionary
33 |         """
34 |         import inspect
35 |         return cls(**{
36 |             k: v for k, v in env.items() 
37 |             if k in inspect.signature(cls).parameters
38 |         })


--------------------------------------------------------------------------------
/src/testora/util/UndefinedRefsFinder.py:
--------------------------------------------------------------------------------
 1 | import libcst as cst
 2 | 
 3 | 
 4 | def get_undefined_references(src):
 5 |     undefined_variables = []  # using a list here to get a deterministic order
 6 | 
 7 |     ast = cst.parse_module(src)
 8 |     ast_wrapper = cst.metadata.MetadataWrapper(ast)
 9 |     scopes = ast_wrapper.resolve(cst.metadata.ScopeProvider).values()
10 |     for scope in scopes:
11 |         for access in scope.accesses:
12 |             if len(access.referents) == 0:
13 |                 node = access.node
14 |                 undefined_variables.append(node.value)
15 | 
16 |     # remove duplicates
17 |     undefined_variables = list(dict.fromkeys(undefined_variables))
18 | 
19 |     return undefined_variables
20 | 
21 | 
22 | if __name__ == "__main__":
23 |     code = """
24 | from xx import bar
25 |     
26 | def foo(l):
27 |     l()
28 | 
29 | foo(lambda n: print(n), bar)
30 | """
31 |     undefined_refs = get_undefined_references(code)
32 |     print("Undefined references:", undefined_refs)
33 | 


--------------------------------------------------------------------------------
/src/testora/llms/LLMCacheAnalyzer.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import matplotlib.pyplot as plt
 3 | 
 4 | def analyze_llm_cache(file_path):
 5 |     with open(file_path, "r") as f:
 6 |         cache = json.load(f)
 7 |     print(f"Total cache entries: {len(cache)}")
 8 |     query_sizes = []
 9 |     answer_sizes = []
10 |     for query, answer in cache.items():
11 |         query_sizes.append(len(query))
12 |         answer_sizes.append(len(answer))
13 |         if len(answer) == 0:
14 |             print(f"Warning: empty answer found!")
15 | 
16 |     # plot histogram of query sizes
17 |     plt.hist(query_sizes, bins=50)
18 |     plt.title("Query sizes")
19 |     plt.xlabel("Size")
20 |     plt.ylabel("Frequency")
21 |     plt.show()
22 | 
23 |     # plot histogram of answer sizes
24 |     plt.hist(answer_sizes, bins=50)
25 |     plt.title("Answer sizes")
26 |     plt.xlabel("Size")
27 |     plt.ylabel("Frequency")
28 |     plt.show()
29 | 
30 | 
31 | 
32 |     
33 | 
34 | if __name__ == "__main__":
35 |     analyze_llm_cache("data/llm_cache/gpt-4-0125-preview/cache.json")
36 | 


--------------------------------------------------------------------------------
/src/multilspy/language_servers/rust_analyzer/runtime_dependencies.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "_description": "Used to download the runtime dependencies for running RustAnalyzer. Obtained from https://github.com/rust-lang/rust-analyzer/releases",
 3 |     "runtimeDependencies": [
 4 |         {
 5 |             "id": "RustAnalyzer",
 6 |             "description": "RustAnalyzer for Linux (x64)",
 7 |             "url": "https://github.com/rust-lang/rust-analyzer/releases/download/2023-10-09/rust-analyzer-x86_64-unknown-linux-gnu.gz",
 8 |             "platformId": "linux-x64",
 9 |             "archiveType": "gz",
10 |             "binaryName": "rust_analyzer"
11 |         },
12 |         {
13 |             "id": "RustAnalyzer",
14 |             "description": "RustAnalyzer for Windows (x64)",
15 |             "url": "https://github.com/rust-lang/rust-analyzer/releases/download/2023-10-09/rust-analyzer-x86_64-pc-windows-msvc.zip",
16 |             "platformId": "win-x64",
17 |             "archiveType": "zip",
18 |             "binaryName": "rust-analyzer.exe"
19 |         }
20 |     ]
21 | }


--------------------------------------------------------------------------------
/src/testora/evaluation/ClassificationResultsInspector.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from testora.evaluation.ResultsManager import result_files_for_project
 3 | 
 4 | 
 5 | project = "scipy"
 6 | pr = 21553
 7 | test_case_to_skip = 0
 8 | 
 9 | file = list(result_files_for_project(project, is_classification=True))[0]
10 | fp = open(file, "r")
11 | result_json = json.load(fp)
12 | config = result_json[0]["message"]
13 | print("CONFIG:")
14 | print(config)
15 | for entry_idx, entry in enumerate(result_json):
16 |     if entry["pr_nb"] == pr and entry["message"] == "Pre-classification":
17 |         test_case_to_skip -= 1
18 |         if test_case_to_skip != -1:
19 |             continue
20 | 
21 |         print("\nTEST CODE:")
22 |         print(entry["test_code"])
23 |         print("\nOLD OUTPUT:")
24 |         print(entry["old_output"])
25 |         print("\nNEW OUTPUT:")
26 |         print(entry["new_output"])
27 | 
28 |         print("\nQUERY:")
29 |         print(result_json[entry_idx+1]["content"])
30 | 
31 |         print("\nANSWER:")
32 |         print(result_json[entry_idx+3]["content"])
33 | 


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/2102.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 2102,
 3 |     "log_file": "data/results/marshmallow/2102_2024-11-23 09:16:17.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 13:\nimport marshmallow.utils as utils\n\ntimestamp = float('inf')  # Value is positive infinity\ntry:\n    result = utils.from_timestamp(timestamp)\nexcept ValueError as e:\n    print(\"Timestamp:\", timestamp, \"-> Exception:\", e)",
 8 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 6, in <module>\n    result = utils.from_timestamp(timestamp)\n  File \"/home/marshmallow/src/marshmallow/utils.py\", line 200, in from_timestamp\n    return dt.datetime.fromtimestamp(value, tz=dt.timezone.utc).replace(tzinfo=None)\nOverflowError: timestamp out of range for platform time_t\n",
 9 |                 "new_output": "Timestamp: inf -> Exception: Timestamp is too large\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "PR adds better error handling"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/20751.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 20751,
 3 |     "log_file": "data/old_results/results_scipy_20759_20656.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 9: Using float32 data type arrays\nimport numpy as np\nfrom scipy.stats import bartlett\n\na = np.array([10.1, 10.2, 10.3, 10.4], dtype=np.float32)\nb = np.array([10.15, 10.25, 10.35, 10.45], dtype=np.float32)\nc = np.array([10.05, 10.15, 10.25, 10.35], dtype=np.float32)\n\nresult = bartlett(a, b, c)\nprint(\"Bartlett test result for float32 data type arrays:\", result)",
 8 |                 "old_output": "Bartlett test result for float32 data type arrays: BartlettResult(statistic=np.float64(2.7743484928754286e-11), pvalue=np.float64(0.9999999999861282))\n",
 9 |                 "new_output": "Bartlett test result for float32 data type arrays: BartlettResult(statistic=np.float64(-7.080736255702299e-07), pvalue=np.float64(1.0))\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/scipy/scipy/issues/21152"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/19428.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 19428,
 3 |     "log_file": "data/old_results/results_scipy_19533_19428.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 20: Perform a Levene test with one sample vector containing both positive and negative infinity values\nfrom scipy.stats import levene\nimport numpy as np\n\nsample1 = np.array([-np.inf, 1, 2, np.inf, 4])\n\nresult = levene(sample1)",
 8 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 7, in <module>\n    result = levene(sample1)\n             ^^^^^^^^^^^^^^^\n  File \"/home/scipy/scipy/stats/_morestats.py\", line 3213, in levene\n    raise ValueError(\"Must enter at least two input sample vectors.\")\nValueError: Must enter at least two input sample vectors.\n",
 9 |                 "new_output": ""
10 |             },
11 |             "label": "unintended",
12 |             "comment": "API expects at least two input sample vectors; newer version went back to old behavior (independently of us)"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024-2025 Michael Pradel
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21036.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21036,
 3 |     "log_file": "data/old_results/results_scipy_21067_20937.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 14:\nimport numpy as np\nfrom scipy import stats\n\na = np.array([1, 2])\nprint(\"Array:\", a)\ntry:\n    tsem_value = stats.tsem(a, ddof=3)\n    print(\"Standard Error of Mean with ddof > number of elements:\", tsem_value)\nexcept Exception as e:\n    print(\"Error:\", e)",
 8 |                 "old_output": "Array: [1 2]\nStandard Error of Mean with ddof > number of elements: inf\n/root/conda/envs/scipy-dev/lib/python3.12/site-packages/numpy/core/fromnumeric.py:3787: RuntimeWarning: Degrees of freedom <= 0 for slice\n  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,\n/root/conda/envs/scipy-dev/lib/python3.12/site-packages/numpy/core/_methods.py:198: RuntimeWarning: divide by zero encountered in scalar divide\n  ret = ret.dtype.type(ret / rcount)\n",
 9 |                 "new_output": "Array: [1 2]\nStandard Error of Mean with ddof > number of elements: nan\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/src/testora/util/DocstringRetrieval.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from testora.util.ClonedRepoManager import ClonedRepo
 3 | from testora.util.PythonCodeUtil import get_locations_of_calls
 4 | 
 5 | 
 6 | def retrieve_relevant_docstrings(cloned_repo: ClonedRepo, code: str) -> str:
 7 |     # copy code into project
 8 |     code_dir = f"{cloned_repo.repo.working_dir}/testora_code"
 9 |     os.makedirs(code_dir, exist_ok=True)
10 |     code_path = f"{code_dir}/test.py"
11 |     with open(code_path, "w") as f:
12 |         f.write(code)
13 | 
14 |     # find all calls in the code
15 |     call_locations = get_locations_of_calls(code)
16 | 
17 |     # query language server for hover text for each call
18 |     server = cloned_repo.language_server
19 |     docs = []
20 |     for call_location in call_locations:
21 |         line = call_location.start.line - 1  # LSP lines are 0-based
22 |         column = call_location.start.column
23 |         doc = server.get_hover_text(code_path, line, column)
24 |         if doc not in docs:
25 |             docs.append(doc)
26 | 
27 |     # enforce limits: max 2000 chars per docstring, max 6000 chars overall
28 |     result = ""
29 |     for doc in docs:
30 |         result += "-------"
31 |         result += doc[:2000]
32 | 
33 |     return result[:6000]
34 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/19263.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 19263,
 3 |     "log_file": "data/old_results/results_scipy_19310_19224.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 13: Passing a list instead of a numpy array to hfftn\nfrom scipy.fft import hfftn\n\nx = [[1, 2], [3, 4]]\nresult = hfftn(x)",
 8 |                 "old_output": "",
 9 |                 "new_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 5, in <module>\n    result = hfftn(x)\n             ^^^^^^^^\n  File \"/home/scipy/scipy/fft/_backend.py\", line 28, in __ua_function__\n    return fn(*args, **kwargs)\n           ^^^^^^^^^^^^^^^^^^^\n  File \"/home/scipy/scipy/fft/_basic_backend.py\", line 154, in hfftn\n    return _pocketfft.hfftn(x, s, axes, norm, overwrite_x, workers, plan=plan)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/home/scipy/scipy/fft/_pocketfft/basic.py\", line 208, in c2rn\n    shape[-1] = (x.shape[axes[-1]] - 1) * 2\n                 ^^^^^^^\nAttributeError: 'list' object has no attribute 'shape'\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/scipy/scipy/issues/21207"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/20974.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 20974,
 3 |     "log_file": "data/old_results/results_scipy_21067_20937.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 10: Using method 'stouffer' with weights and axis (1)\nimport numpy as np\nfrom scipy.stats import combine_pvalues\n\npvalues = np.array([[0.01, 0.02, 0.05], [0.03, 0.04, 0.07], [0.08, 0.01, 0.02]])\nweights = np.array([1, 2, 3])\nresult = combine_pvalues(pvalues, method='stouffer', weights=weights, axis=1)\nprint(\"Stouffer Method, weights, axis=1:\", result)",
 8 |                 "old_output": "Stouffer Method, weights, axis=1: SignificanceResult(statistic=array([3.03833446, 2.62170953, 3.26566887]), pvalue=array([0.00118945, 0.0043745 , 0.00054603]))\n",
 9 |                 "new_output": "Stouffer Method, weights, axis=1: SignificanceResult(statistic=array([[2.7536326 , 3.34989642, 2.87511156],\n       [2.7536326 , 3.34989642, 2.87511156],\n       [2.7536326 , 3.34989642, 2.87511156]]), pvalue=array([[0.00294689, 0.00040421, 0.00201942],\n       [0.00294689, 0.00040421, 0.00201942],\n       [0.00294689, 0.00040421, 0.00201942]]))\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/scipy/scipy/issues/21106"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21518.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21518,
 3 |     "log_file": "data/results/scipy/21518_2024-11-23 09:06:43.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 15:\nimport numpy as np\nfrom scipy import stats\n\ndata = np.array([-1, -2, -3])\nlmb = 1.0\n\ntry:\n    log_likelihood = stats.boxcox_llf(lmb, data)\n    print(f\"Box-Cox Log-likelihood with negative data: {log_likelihood}\")\nexcept Exception as e:\n    print(f\"Error with negative values: {e}\")",
 8 |                 "old_output": "/home/scipy/scipy/stats/_morestats.py:961: RuntimeWarning: invalid value encountered in log\n  logdata = xp.log(data)\n/home/scipy/scipy/special/_logsumexp.py:128: RuntimeWarning: invalid value encountered in exp\n  tmp = xp.exp(a - a_max)\nBox-Cox Log-likelihood with negative data: nan\n",
 9 |                 "new_output": "/home/scipy/scipy/stats/_morestats.py:967: RuntimeWarning: invalid value encountered in log\n  logdata = xp.log(data)\n/home/scipy/scipy/special/_logsumexp.py:128: RuntimeWarning: invalid value encountered in exp\n  tmp = xp.exp(a - a_max)\nBox-Cox Log-likelihood with negative data: nan\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "line numbers are different, everything else is the same"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21577.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21577,
 3 |     "log_file": "data/results/scipy/21577_2024-11-23 09:06:43.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 1:\nimport numpy as np\nfrom scipy import special\n\n# Edge case with zero order and zero input\nresult = special.hankel2(0, 0)\nprint(\"hankel2(0, 0) = \", result)",
 8 |                 "old_output": "hankel2(0, 0) =  (nan+nanj)\n",
 9 |                 "new_output": "hankel2(0, 0) =  (nan+infj)\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "not 100% sure about the math, but changing the output for this input is the PR's intention"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 6:\nimport numpy as np\nfrom scipy import special\n\n# Testing with a complex input of zero\nresult = special.hankel2(0, complex(0, 0))\nprint(\"hankel2(0, complex(0, 0)) = \", result)",
17 |                 "old_output": "hankel2(0, complex(0, 0)) =  (nan+nanj)\n",
18 |                 "new_output": "hankel2(0, complex(0, 0)) =  (nan+infj)\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "not 100% sure about the math, but changing the output for this input is the PR's intention"
22 |         }
23 |     ]
24 | }


--------------------------------------------------------------------------------
/.devcontainer/setup_scapy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing scapy-dev containers"
10 | docker rm -f scapy-dev1
11 | docker rm -f scapy-dev2
12 | docker rm -f scapy-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of scapy"
18 | git clone https://github.com/secdev/scapy.git
19 | cd scapy
20 | echo "Building dev container for scapy (first clone)"
21 | docker run -t -d --name scapy-dev1 -v ${PWD}:/home/scapy python:3.10
22 | docker exec -w /home/scapy scapy-dev1 pip install -e .
23 | echo "Done with first clone"
24 | 
25 | #####
26 | echo "Creating second clone of scapy"
27 | cd ../..
28 | cp -r clone1 clone2
29 | cd clone2/scapy
30 | echo "Building dev container for scapy (second clone)"
31 | docker run -t -d --name scapy-dev2 -v ${PWD}:/home/scapy python:3.10
32 | docker exec -w /home/scapy scapy-dev2 pip install -e .
33 | echo "Done with second clone"
34 | 
35 | echo "Creating third clone of scapy"
36 | cd ../..
37 | cp -r clone1 clone3
38 | cd clone3/scapy
39 | echo "Building dev container for scapy (third clone)"
40 | docker run -t -d --name scapy-dev3 -v ${PWD}:/home/scapy python:3.10
41 | docker exec -w /home/scapy scapy-dev3 pip install -e .
42 | echo "Done with third clone"
43 | 
44 | cd ../../../Testora


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21642.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21642,
 3 |     "log_file": "data/results/scipy/21642_2024-11-23 09:06:44.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 3:\nimport numpy as np\nfrom scipy.io import mmwrite, mmread\n\n# Write a matrix directly to a file without specifying extension\ndata = np.array([[5, 0, 0], [0, 0, 6]])\nmmwrite('matrix3', data)\n\n# Read back the created file\nloaded_matrix = mmread('matrix3.mtx')",
 8 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 10, in <module>\n    loaded_matrix = mmread('matrix3.mtx')\n  File \"/home/scipy/scipy/io/_fast_matrix_market/__init__.py\", line 354, in mmread\n    cursor, stream_to_close = _get_read_cursor(source)\n                              ~~~~~~~~~~~~~~~~^^^^^^^^\n  File \"/home/scipy/scipy/io/_fast_matrix_market/__init__.py\", line 197, in _get_read_cursor\n    return _fmm_core.open_read_file(path, parallelism), ret_stream_to_close\n           ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^\nValueError: Line 1: Not a Matrix Market file. Missing banner.\n",
 9 |                 "new_output": ""
10 |             },
11 |             "label": "intended",
12 |             "comment": "exception in old version is due to the bug fixed by the PR"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/19680.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 19680,
 3 |     "log_file": "data/old_results/results_scipy_19724_19637.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 5: Testing the shapiro function with extreme values\nfrom scipy import stats\n\ndata = [10**20, -10**20, 10**30, -10**30]  # Extreme values\nshapiro_result = stats.shapiro(data)",
 8 |                 "old_output": "",
 9 |                 "new_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 5, in <module>\n    shapiro_result = stats.shapiro(data)\n                     ^^^^^^^^^^^^^^^^^^^\n  File \"/home/scipy/scipy/stats/_axis_nan_policy.py\", line 505, in axis_nan_policy_wrapper\n    contains_nan = [_contains_nan(sample, nan_policy)[0]\n                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/home/scipy/scipy/_lib/_util.py\", line 716, in _contains_nan\n    if np.issubdtype(type(el), np.number) and np.isnan(el):\n                                              ^^^^^^^^^^^^\nTypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''\n"
10 |             },
11 |             "label": "unintended",
12 |             "comment": "https://github.com/scipy/scipy/issues/21205"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/scipy/19853.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 19853,
 3 |     "log_file": "data/old_results/results_scipy_19909_19818.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Corner Case Example 7: Calculating mean on an empty sparse matrix\nfrom scipy.sparse import coo_matrix\n\nA = coo_matrix((0, 0))\n\nA.mean()",
 8 |                 "old_output": "/home/scipy/scipy/sparse/_base.py:712: RuntimeWarning: divide by zero encountered in divide\n  return self.astype(np.float64)._mul_scalar(1./other)\n",
 9 |                 "new_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 6, in <module>\n    A.mean()\n  File \"/home/scipy/scipy/sparse/_base.py\", line 1209, in mean\n    return (inter_self / (self.shape[0] * self.shape[1]))\\\n            ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n  File \"/home/scipy/scipy/sparse/_base.py\", line 752, in __truediv__\n    return self._divide(other, true_divide=True)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File \"/home/scipy/scipy/sparse/_base.py\", line 716, in _divide\n    return self.astype(np.float64)._mul_scalar(1./other)\n                                               ~~^~~~~~\nZeroDivisionError: float division by zero\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/2246.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 2246,
 3 |     "log_file": "data/results/marshmallow/2246_2024-11-23 09:16:17.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 1: Field Declaration as a Class (Expecting TypeError)\nfrom marshmallow import Schema, fields\n\nclass BadUserSchema(Schema):\n    name = fields.String  # Incorrect: declaring as class, should be an instance\n\ntry:",
 8 |                 "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'try' statement on line 7 (BugGPT_test_code.py, line 7)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n",
 9 |                 "new_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'try' statement on line 7 (BugGPT_test_code.py, line 7)\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "different order of error messages"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the
 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python
 3 | {
 4 | 	"name": "Testora-dev",
 5 | 	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
 6 | 	"image": "mcr.microsoft.com/devcontainers/python:1-3.11-bookworm",
 7 | 	"features": {
 8 | 		"ghcr.io/devcontainers/features/docker-in-docker:2": {
 9 | 			"moby": true,
10 | 			"azureDnsAutoDetection": true,
11 | 			"installDockerBuildx": true,
12 | 			"installDockerComposeSwitch": true,
13 | 			"version": "latest",
14 | 			"dockerDashComposeVersion": "latest"
15 | 		}
16 | 	},
17 | 
18 | 	// Features to add to the dev container. More info: https://containers.dev/features.
19 | 	// "features": {},
20 | 
21 | 	// Use 'forwardPorts' to make a list of ports inside the container available locally.
22 | 	// "forwardPorts": [],
23 | 
24 | 	// Use 'postCreateCommand' to run commands after the container is created.
25 | 	"postCreateCommand": "./.devcontainer/postCreateCommands.sh",
26 | 
27 | 	// Configure tool-specific properties.
28 | 	// "customizations": {},
29 | 
30 | 	// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
31 | 	// "remoteUser": "root"
32 | 
33 | 	// keep containers running (useful for long-running experiments on servers)
34 | 	"shutdownAction": "none"
35 | }
36 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21633.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21633,
 3 |     "log_file": "data/results/scipy/21633_2024-11-23 09:06:44.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 1:\nimport numpy as np\nfrom scipy.linalg import kron\n\nA = np.array([[1, 2], [3, 4]])\nB = np.array([[0, 5], [6, 7]])\nresult = kron(A, B)",
 8 |                 "old_output": "",
 9 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:7: DeprecationWarning: `kron` has been deprecated in favour of `numpy.kron` in SciPy 1.15.0 and will be removed in SciPy 1.17.0.\n  result = kron(A, B)\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "new code prints deprecation warning, as intended by the PR"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 10:\nimport numpy as np\nfrom scipy.linalg import kron\n\nA = np.array([[1, 0], [0, 1]])\nB = np.array([[2, 2], [2, 2]])\nresult = kron(A, B)",
17 |                 "old_output": "",
18 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:7: DeprecationWarning: `kron` has been deprecated in favour of `numpy.kron` in SciPy 1.15.0 and will be removed in SciPy 1.17.0.\n  result = kron(A, B)\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "new code prints deprecation warning, as intended by the PR"
22 |         }
23 |     ]
24 | }


--------------------------------------------------------------------------------
/src/testora/evaluation/TestFailureInspector.py:
--------------------------------------------------------------------------------
 1 | # Helper script to inspect logs of test failures and identify the root cause of the failure
 2 | 
 3 | from collections import Counter
 4 | from testora.evaluation.ResultsManager import result_files
 5 | from testora.util.LogParser import parse_log_files
 6 | 
 7 | pr_results, _ = parse_log_files(result_files())
 8 | 
 9 | error_ctr = Counter()
10 | 
11 | for pr_result in pr_results:
12 |     if pr_result.nb_test_failures > 0:
13 |         for entry in pr_result.entries:
14 |             if entry["message"] == "Test execution" and "Traceback (most recent call last)" in entry["output"]:
15 |                 # print(entry["output"])
16 |                 last_line = entry["output"].split("\n")[-2:-1][0]
17 |                 # print(last_line)
18 |                 # print("--------------------------------------------\n")
19 |                 if "Error" in last_line:
20 |                     error_type = last_line.split(":")[0]
21 |                     error_ctr[error_type] += 1
22 | 
23 |                 if "NameError" in last_line:
24 |                     # print(last_line)
25 |                     print(entry["code"])
26 |                     print(">>>")
27 |                     print(entry["output"])
28 |                     print("--------------------------------------------\n")
29 | 
30 | 
31 | print("\n\n\n")
32 | 
33 | for error_type, count in error_ctr.most_common():
34 |     print(f"{error_type}: {count}")
35 | 
36 |                 
37 | 


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "configurations": [
 3 |         {
 4 |             "name": "RegressionFinder",
 5 |             "type": "debugpy",
 6 |             "request": "launch",
 7 |             "module": "testora.RegressionFinder",
 8 |         },
 9 |         {
10 |             "name": "RegressionFinder --db",
11 |             "type": "debugpy",
12 |             "request": "launch",
13 |             "module": "testora.RegressionFinder",
14 |             "args": ["--db"],
15 |         },
16 |         {
17 |             "name": "RegressionFinderOnePR",
18 |             "type": "debugpy",
19 |             "request": "launch",
20 |             "module": "testora.RegressionFinder",
21 |             "args": ["--project", "scipy", "--pr", "21768"],
22 |         },
23 |         {
24 |             "name": "EvalTaskManager",
25 |             "type": "debugpy",
26 |             "request": "launch",
27 |             "module": "testora.evaluation.EvalTaskManager",
28 |             "args": ["--fetch", "--classification"],
29 |         },
30 |         {
31 |             "name": "ClassificationEvaluator",
32 |             "type": "debugpy",
33 |             "request": "launch",
34 |             "module": "testora.evaluation.ClassificationEvaluator",
35 |             "args": ["--evaluate"],
36 |         },
37 |         {
38 |             "name": "ClassificationResultsSummarizer",
39 |             "type": "debugpy",
40 |             "request": "launch",
41 |             "module": "testora.evaluation.ClassificationResultsSummarizer",
42 |         },
43 |     ]
44 | }


--------------------------------------------------------------------------------
/src/testora/evaluation/sql/tasks.sql:
--------------------------------------------------------------------------------
 1 | -- phpMyAdmin SQL Dump
 2 | -- version 5.2.1
 3 | -- https://www.phpmyadmin.net/
 4 | --
 5 | -- Host: sql141.your-server.de
 6 | -- Generation Time: Mar 20, 2025 at 10:00 AM
 7 | -- Server version: 10.11.11-MariaDB-hetzner1
 8 | -- PHP Version: 8.0.30
 9 | 
10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
11 | START TRANSACTION;
12 | SET time_zone = "+00:00";
13 | 
14 | 
15 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
16 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
17 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
18 | /*!40101 SET NAMES utf8mb4 */;
19 | 
20 | --
21 | -- Database: `regression_finder_db`
22 | --
23 | 
24 | -- --------------------------------------------------------
25 | 
26 | --
27 | -- Table structure for table `tasks`
28 | --
29 | 
30 | CREATE TABLE `tasks` (
31 |   `project` varchar(31) NOT NULL,
32 |   `pr` int(11) NOT NULL,
33 |   `worker` text DEFAULT NULL,
34 |   `result` longtext DEFAULT NULL,
35 |   `timestamp` timestamp NOT NULL DEFAULT current_timestamp()
36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
37 | 
38 | --
39 | -- Indexes for dumped tables
40 | --
41 | 
42 | --
43 | -- Indexes for table `tasks`
44 | --
45 | ALTER TABLE `tasks`
46 |   ADD PRIMARY KEY (`project`,`pr`,`timestamp`);
47 | COMMIT;
48 | 
49 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
50 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
51 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
52 | 


--------------------------------------------------------------------------------
/.devcontainer/setup_transformers.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | echo "Creating directory for clones"
 3 | cd ..
 4 | sudo mkdir clones
 5 | sudo chown vscode:vscode clones/
 6 | cd clones
 7 | 
 8 | echo "Cleaning any existing transformer containers"
 9 | docker rm -f transformers-dev1
10 | docker rm -f transformers-dev2
11 | docker rm -f transformers-dev3
12 | 
13 | mkdir clone1
14 | cd clone1
15 | 
16 | echo "Creating first clone of transformers"
17 | git clone https://github.com/huggingface/transformers.git
18 | cd transformers
19 | echo "Building dev container for transformers (first clone)"
20 | docker run -t -d --name transformers-dev1 -v ${PWD}:/home/transformers python:3.10
21 | docker exec -w /home/transformers transformers-dev1 pip install -e ".[dev]"
22 | echo "Done with first clone"
23 | 
24 | echo "Creating second clone of transformers"
25 | cd ../..
26 | cp -r clone1 clone2
27 | cd clone2/transformers
28 | echo "Building dev container for transformers (second clone)"
29 | docker run -t -d --name transformers-dev2 -v ${PWD}:/home/transformers python:3.10
30 | docker exec -w /home/transformers transformers-dev2 pip install -e ".[dev]"
31 | echo "Done with second clone"
32 | 
33 | echo "Creating third clone of transformers"
34 | cd ../..
35 | cp -r clone1 clone3
36 | cd clone3/transformers
37 | echo "Building dev container for transformers (third clone)"
38 | docker run -t -d --name transformers-dev3 -v ${PWD}:/home/transformers python:3.10
39 | docker exec -w /home/transformers transformers-dev3 pip install -e ".[dev]"
40 | echo "Done with third clone"
41 | 
42 | cd ../../../Testora
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/src/testora/prompts/UndefinedRefsFixingPrompt.py:
--------------------------------------------------------------------------------
 1 | class UndefinedRefsFixingPrompt:
 2 |     def __init__(self, code, undefined_refs):
 3 |         self.code = code
 4 |         self.undefined_refs = undefined_refs
 5 |         self.use_json_output = False
 6 | 
 7 |     def create_prompt(self):
 8 |         instruction_single = """
 9 | This Python code has an undefined reference to: <REF>. Fix it.
10 | 
11 | ```python
12 | <CODE>
13 | ```
14 | 
15 | Respond only with Python code wrapped into ```python ... ```. Give no explanations.
16 | """
17 | 
18 |         instruction_multiple = """
19 | This Python code has undefined references to: <REF>. Fix it.
20 | 
21 | ```python
22 | <CODE>
23 | ```
24 | 
25 | Respond only with Python code wrapped into ```python ... ```. Give no explanations.
26 | """
27 |         if len(self.undefined_refs) == 1:
28 |             prompt = instruction_single.replace(
29 |                 "<REF>", self.undefined_refs[0])
30 |         else:
31 |             prompt = instruction_multiple.replace(
32 |                 "<REF>", ", ".join(self.undefined_refs))
33 | 
34 |         prompt = prompt.replace("<CODE>", self.code)
35 | 
36 |         return prompt
37 | 
38 |     def parse_answer(self, raw_answer):
39 |         code = ""
40 |         in_code = False
41 |         for line in raw_answer.split("\n"):
42 |             if line.strip() == "```":
43 |                 break
44 |             if in_code:
45 |                 code += line + "\n"
46 |             if line == "```python" or line.startswith("import"):
47 |                 in_code = True
48 |         return code
49 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/19861.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 19861,
 3 |     "log_file": "data/old_results/results_scipy_19909_19818.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 13: Corner case with empty string arrays\nimport numpy as np\nfrom scipy.io import savemat, loadmat\nimport tempfile\n\nwith tempfile.TemporaryDirectory() as tmpdirname:\n    empty_string_array = np.array([\"\", \"\"])\n    data = {\"empty_string_array\": empty_string_array}\n    savemat(f\"{tmpdirname}/empty_string.mat\", data, format=\"4\", oned_as='row')",
 8 |                 "old_output": "",
 9 |                 "new_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 9, in <module>\n    savemat(f\"{tmpdirname}/empty_string.mat\", data, format=\"4\", oned_as='row')\n  File \"/home/scipy/scipy/io/matlab/_mio.py\", line 301, in savemat\n    MW.put_variables(mdict)\n  File \"/home/scipy/scipy/io/matlab/_mio4.py\", line 624, in put_variables\n    self._matrix_writer.write(var, name)\n  File \"/home/scipy/scipy/io/matlab/_mio4.py\", line 522, in write\n    self.write_char(arr, name)\n  File \"/home/scipy/scipy/io/matlab/_mio4.py\", line 565, in write_char\n    arr = np.ndarray(shape=dims, dtype='S1', buffer=st)\n          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nTypeError: buffer is too small for requested array\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": "not 100% sure; could also be a regression; intended behavior is hard to guess"
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/src/testora/evaluation/sql/classification_tasks.sql:
--------------------------------------------------------------------------------
 1 | -- phpMyAdmin SQL Dump
 2 | -- version 5.2.1
 3 | -- https://www.phpmyadmin.net/
 4 | --
 5 | -- Host: sql141.your-server.de
 6 | -- Generation Time: Mar 20, 2025 at 10:00 AM
 7 | -- Server version: 10.11.11-MariaDB-hetzner1
 8 | -- PHP Version: 8.0.30
 9 | 
10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
11 | START TRANSACTION;
12 | SET time_zone = "+00:00";
13 | 
14 | 
15 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
16 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
17 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
18 | /*!40101 SET NAMES utf8mb4 */;
19 | 
20 | --
21 | -- Database: `regression_finder_db`
22 | --
23 | 
24 | -- --------------------------------------------------------
25 | 
26 | --
27 | -- Table structure for table `classification_tasks`
28 | --
29 | 
30 | CREATE TABLE `classification_tasks` (
31 |   `project` varchar(31) NOT NULL,
32 |   `pr` int(11) NOT NULL,
33 |   `worker` text DEFAULT NULL,
34 |   `result` longtext DEFAULT NULL,
35 |   `timestamp` timestamp NOT NULL DEFAULT current_timestamp()
36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
37 | 
38 | --
39 | -- Indexes for dumped tables
40 | --
41 | 
42 | --
43 | -- Indexes for table `classification_tasks`
44 | --
45 | ALTER TABLE `classification_tasks`
46 |   ADD PRIMARY KEY (`project`,`pr`,`timestamp`);
47 | COMMIT;
48 | 
49 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
50 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
51 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
52 | 


--------------------------------------------------------------------------------
/src/testora/Config.py:
--------------------------------------------------------------------------------
 1 | from testora.util.Logs import Event, append_event
 2 | 
 3 | # KEEP THIS AT THE TOP: needed to log the current configuration
 4 | initial_globals = set(globals().keys())
 5 | 
 6 | # analyze only PRs that have code changes in files in specific programming languages
 7 | code_change_pl = "all"  # "all" or "python"
 8 | 
 9 | # analyze only PRs with a single parent
10 | single_parent_PRs_only = False
11 | 
12 | # use program merger to merge programs
13 | use_program_merger = False
14 | 
15 | # filter PRs based on LLM-provided risk assessment
16 | llm_risk_assessment = False
17 | 
18 | # try to fix undefined references in generated tests
19 | fix_undefined_refs = True
20 | 
21 | # model_version = "gpt-3.5-turbo-0125"
22 | # model_version = "gpt-4-0125-preview"
23 | # model_version = "gpt-4o-mini-2024-07-18"
24 | model_version = "gpt-5-mini-2025-08-07"
25 | # model_version = "gpt-4o-2024-08-06"
26 | # model_version = "deepseek/deepseek-r1"
27 | 
28 | 
29 | # OpenAI's default: 1.0
30 | classification_temp = 1.0
31 | # DeepSeek's recommended default: 0.6
32 | # classification_temp = 0.6
33 | 
34 | # different prompts
35 | test_generation_prompt_version = 2
36 | undefined_refs_fixing_prompt_version = 2
37 | classification_prompt_version = 7
38 | 
39 | # KEEP THIS AT THE END: log the current configuration
40 | current_globals = set(globals().keys())
41 | config_parameters = current_globals - initial_globals
42 | config_parameters = config_parameters - \
43 |     {"initial_globals", "current_globals", "config_parameters"}
44 | config_dict = {p: v for p, v in globals().items() if p in config_parameters}
45 | append_event(
46 |     Event(pr_nb=0, message=f"Configuration: {config_dict}"))
47 | 


--------------------------------------------------------------------------------
/.devcontainer/setup_pytorch_geometric.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing pytorch_geometric-dev containers"
10 | docker rm -f pytorch_geometric-dev1
11 | docker rm -f pytorch_geometric-dev2
12 | docker rm -f pytorch_geometric-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of pytorch_geometric"
18 | git clone https://github.com/pyg-team/pytorch_geometric.git
19 | cd pytorch_geometric
20 | echo "Building dev container for pytorch_geometric (first clone)"
21 | docker run -t -d --name pytorch_geometric-dev1 -v ${PWD}:/home/pytorch_geometric python:3.10
22 | docker exec -w /home/pytorch_geometric pytorch_geometric-dev1 pip install -e '.[dev,full]'
23 | echo "Done with first clone"
24 | 
25 | #####
26 | echo "Creating second clone of pytorch_geometric"
27 | cd ../..
28 | cp -r clone1 clone2
29 | cd clone2/pytorch_geometric
30 | echo "Building dev container for pytorch_geometric (second clone)"
31 | docker run -t -d --name pytorch_geometric-dev2 -v ${PWD}:/home/pytorch_geometric python:3.10
32 | docker exec -w /home/pytorch_geometric pytorch_geometric-dev2 pip install -e '.[dev,full]'
33 | echo "Done with second clone"
34 | 
35 | echo "Creating third clone of pytorch_geometric"
36 | cd ../..
37 | cp -r clone1 clone3
38 | cd clone3/pytorch_geometric
39 | echo "Building dev container for pytorch_geometric (third clone)"
40 | docker run -t -d --name pytorch_geometric-dev3 -v ${PWD}:/home/pytorch_geometric python:3.10
41 | docker exec -w /home/pytorch_geometric pytorch_geometric-dev3 pip install -e '.[dev,full]'
42 | echo "Done with third clone"
43 | 
44 | cd ../../../Testora


--------------------------------------------------------------------------------
/.devcontainer/setup_keras.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing keras-dev containers"
10 | docker rm -f keras-dev1
11 | docker rm -f keras-dev2
12 | docker rm -f keras-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of keras"
18 | git clone https://github.com/keras-team/keras.git
19 | cd keras
20 | echo "Building dev container for keras (first clone)"
21 | docker run -t -d --name keras-dev1 -v ${PWD}:/home/keras python:3.10
22 | docker exec -w /home/keras keras-dev1 pip install -r requirements.txt
23 | docker exec -w /home/keras keras-dev1 pip install -e ./
24 | docker exec -w /home/keras keras-dev1 pip install coverage
25 | echo "Done with first clone"
26 | 
27 | echo "Creating second clone of keras"
28 | cd ../..
29 | cp -r clone1 clone2
30 | cd clone2/keras
31 | echo "Building dev container for keras (second clone)"
32 | docker run -t -d --name keras-dev2 -v ${PWD}:/home/keras python:3.10
33 | docker exec -w /home/keras keras-dev2 pip install -r requirements.txt
34 | docker exec -w /home/keras keras-dev2 pip install -e ./
35 | docker exec -w /home/keras keras-dev2 pip install coverage
36 | echo "Done with second clone"
37 | 
38 | echo "Creating third clone of keras"
39 | cd ../..
40 | cp -r clone1 clone3
41 | cd clone3/keras
42 | echo "Building dev container for keras (third clone)"
43 | docker run -t -d --name keras-dev3 -v ${PWD}:/home/keras python:3.10
44 | docker exec -w /home/keras keras-dev3 pip install -r requirements.txt
45 | docker exec -w /home/keras keras-dev3 pip install -e ./
46 | docker exec -w /home/keras keras-dev3 pip install coverage
47 | echo "Done with third clone"
48 | 
49 | cd ../../../Testora


--------------------------------------------------------------------------------
/.devcontainer/setup_marshmallow.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing marshmallow-dev containers"
10 | docker rm -f marshmallow-dev1
11 | docker rm -f marshmallow-dev2
12 | docker rm -f marshmallow-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of marshmallow"
18 | git clone https://github.com/marshmallow-code/marshmallow.git
19 | cd marshmallow
20 | echo "Building dev container for marshmallow (first clone)"
21 | docker run -t -d --name marshmallow-dev1 -v ${PWD}:/home/marshmallow python:3.10
22 | docker exec -w /home/marshmallow marshmallow-dev1 pip install -e '.[dev]'
23 | docker exec -w /home/marshmallow marshmallow-dev1 pip install coverage
24 | echo "Done with first clone"
25 | 
26 | #####
27 | echo "Creating second clone of marshmallow"
28 | cd ../..
29 | cp -r clone1 clone2
30 | cd clone2/marshmallow
31 | echo "Building dev container for marshmallow (second clone)"
32 | docker run -t -d --name marshmallow-dev2 -v ${PWD}:/home/marshmallow python:3.10
33 | docker exec -w /home/marshmallow marshmallow-dev2 pip install -e '.[dev]'
34 | docker exec -w /home/marshmallow marshmallow-dev2 pip install coverage
35 | echo "Done with second clone"
36 | 
37 | echo "Creating third clone of marshmallow"
38 | cd ../..
39 | cp -r clone1 clone3
40 | cd clone3/marshmallow
41 | echo "Building dev container for marshmallow (third clone)"
42 | docker run -t -d --name marshmallow-dev3 -v ${PWD}:/home/marshmallow python:3.10
43 | docker exec -w /home/marshmallow marshmallow-dev3 pip install -e '.[dev]'
44 | docker exec -w /home/marshmallow marshmallow-dev3 pip install coverage
45 | echo "Done with third clone"
46 | 
47 | cd ../../../Testora


--------------------------------------------------------------------------------
/src/multilspy/multilspy_logger.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Multilspy logger module.
 3 | """
 4 | import inspect
 5 | import logging
 6 | from datetime import datetime
 7 | from pydantic import BaseModel
 8 | 
 9 | class LogLine(BaseModel):
10 |     """
11 |     Represents a line in the Multilspy log
12 |     """
13 | 
14 |     time: str
15 |     level: str
16 |     caller_file: str
17 |     caller_name: str
18 |     caller_line: int
19 |     message: str
20 | 
21 | class MultilspyLogger:
22 |     """
23 |     Logger class
24 |     """
25 | 
26 |     def __init__(self) -> None:
27 |         self.logger = logging.getLogger("multilspy")
28 |         self.logger.setLevel(logging.INFO)
29 | 
30 |     def log(self, debug_message: str, level: int, sanitized_error_message: str = "") -> None:
31 |         """
32 |         Log the debug and santized messages using the logger
33 |         """
34 | 
35 |         debug_message = debug_message.replace("'", '"').replace("\n", " ")
36 |         sanitized_error_message = sanitized_error_message.replace("'", '"').replace("\n", " ")
37 | 
38 |         # Collect details about the callee
39 |         curframe = inspect.currentframe()
40 |         calframe = inspect.getouterframes(curframe, 2)
41 |         caller_file = calframe[1][1].split("/")[-1]
42 |         caller_line = calframe[1][2]
43 |         caller_name = calframe[1][3]
44 | 
45 |         # Construct the debug log line
46 |         debug_log_line = LogLine(
47 |             time=str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
48 |             level=logging.getLevelName(level),
49 |             caller_file=caller_file,
50 |             caller_name=caller_name,
51 |             caller_line=caller_line,
52 |             message=debug_message,
53 |         )
54 | 
55 |         self.logger.log(
56 |             level=level,
57 |             msg=debug_log_line.json(),
58 |         )
59 | 


--------------------------------------------------------------------------------
/.devcontainer/setup_scipy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing scipy-dev containers"
10 | docker rm -f scipy-dev1
11 | docker rm -f scipy-dev2
12 | docker rm -f scipy-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of scipy"
18 | git clone https://github.com/scipy/scipy.git
19 | cd scipy
20 | git submodule update --init
21 | echo "Building dev container for scipy (first clone)"
22 | docker run -t -d --name scipy-dev1 -v ${PWD}:/home/scipy python:3.10
23 | docker cp /workspaces/Testora/.devcontainer/setup_scipy_to_run_in_container.sh scipy-dev1:/root/setup.sh
24 | docker exec scipy-dev1 chmod +x /root/setup.sh
25 | docker exec -w /home/scipy scipy-dev1 /root/setup.sh
26 | echo "Done with first clone"
27 | 
28 | echo "Creating second clone of scipy"
29 | cd ../..
30 | cp -r clone1 clone2
31 | cd clone2/scipy
32 | echo "Building dev container for scipy (second clone)"
33 | docker run -t -d --name scipy-dev2 -v ${PWD}:/home/scipy python:3.10
34 | docker cp /workspaces/Testora/.devcontainer/setup_scipy_to_run_in_container.sh scipy-dev2:/root/setup.sh
35 | docker exec scipy-dev2 chmod +x /root/setup.sh
36 | docker exec -w /home/scipy scipy-dev2 /root/setup.sh
37 | echo "Done with second clone"
38 | 
39 | echo "Creating third clone of scipy"
40 | cd ../..
41 | cp -r clone1 clone3
42 | cd clone3/scipy
43 | echo "Building dev container for scipy (third clone)"
44 | docker run -t -d --name scipy-dev3 -v ${PWD}:/home/scipy python:3.10
45 | docker cp /workspaces/Testora/.devcontainer/setup_scipy_to_run_in_container.sh scipy-dev3:/root/setup.sh
46 | docker exec scipy-dev3 chmod +x /root/setup.sh
47 | docker exec -w /home/scipy scipy-dev3 /root/setup.sh
48 | echo "Done with third clone"
49 | 
50 | cd ../../../Testora
51 | 


--------------------------------------------------------------------------------
/.devcontainer/setup_numpy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing numpy-dev containers"
10 | docker rm -f numpy-dev1
11 | docker rm -f numpy-dev2
12 | docker rm -f numpy-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of numpy"
18 | git clone https://github.com/numpy/numpy.git
19 | cd numpy
20 | git submodule update --init
21 | echo "Building dev container for numpy (first clone)"
22 | docker run -t -d --name numpy-dev1 -v ${PWD}:/home/numpy python:3.10
23 | docker cp /workspaces/Testora/.devcontainer/setup_numpy_to_run_in_container.sh numpy-dev1:/root/setup.sh
24 | docker exec numpy-dev1 chmod +x /root/setup.sh
25 | docker exec -w /home/numpy numpy-dev1 /root/setup.sh
26 | echo "Done with first clone"
27 | 
28 | echo "Creating second clone of numpy"
29 | cd ../..
30 | cp -r clone1 clone2
31 | cd clone2/numpy
32 | echo "Building dev container for numpy (second clone)"
33 | docker run -t -d --name numpy-dev2 -v ${PWD}:/home/numpy python:3.10
34 | docker cp /workspaces/Testora/.devcontainer/setup_numpy_to_run_in_container.sh numpy-dev2:/root/setup.sh
35 | docker exec numpy-dev2 chmod +x /root/setup.sh
36 | docker exec -w /home/numpy numpy-dev2 /root/setup.sh
37 | echo "Done with second clone"
38 | 
39 | echo "Creating third clone of numpy"
40 | cd ../..
41 | cp -r clone1 clone3
42 | cd clone3/numpy
43 | echo "Building dev container for numpy (third clone)"
44 | docker run -t -d --name numpy-dev3 -v ${PWD}:/home/numpy python:3.10
45 | docker cp /workspaces/Testora/.devcontainer/setup_numpy_to_run_in_container.sh numpy-dev3:/root/setup.sh
46 | docker exec numpy-dev3 chmod +x /root/setup.sh
47 | docker exec -w /home/numpy numpy-dev3 /root/setup.sh
48 | echo "Done with third clone"
49 | 
50 | 
51 | cd ../../../Testora
52 | 


--------------------------------------------------------------------------------
/.devcontainer/setup_pandas.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing pandas-dev containers"
10 | docker rm -f pandas-dev1
11 | docker rm -f pandas-dev2
12 | docker rm -f pandas-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of pandas"
18 | git clone https://github.com/pandas-dev/pandas.git
19 | cd pandas
20 | git checkout e0398c4 # latest commit that still has a Dockerfile
21 | echo "Building dev container for pandas (first clone)"
22 | docker build -t pandas-dev .
23 | docker run -t -d --name pandas-dev1 -v ${PWD}:/home/pandas pandas-dev
24 | docker exec pandas-dev1 python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
25 | docker exec pandas-dev1 python -m pip install coverage
26 | echo "Done with first clone"
27 | 
28 | echo "Creating second clone of pandas"
29 | cd ../..
30 | cp -r clone1 clone2
31 | cd clone2/pandas
32 | git checkout e0398c4 # latest commit that still has a Dockerfile
33 | echo "Building dev container for pandas (second clone)"
34 | docker run -t -d --name pandas-dev2 -v ${PWD}:/home/pandas pandas-dev
35 | docker exec pandas-dev2 python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
36 | docker exec pandas-dev2 python -m pip install coverage
37 | echo "Done with second clone"
38 | 
39 | echo "Creating third clone of pandas"
40 | cd ../..
41 | cp -r clone1 clone3
42 | cd clone3/pandas
43 | git checkout e0398c4 # latest commit that still has a Dockerfile
44 | echo "Building dev container for pandas (third clone)"
45 | docker run -t -d --name pandas-dev3 -v ${PWD}:/home/pandas pandas-dev
46 | docker exec pandas-dev3 python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
47 | docker exec pandas-dev3 python -m pip install coverage
48 | echo "Done with third clone"
49 | 
50 | cd ../../../Testora
51 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21528.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21528,
 3 |     "log_file": "data/results/scipy/21528_2024-11-23 09:06:43.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 8:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0], dtype=np.float32)\nresult = logsumexp(a)\nprint(\"logsumexp(a) with dtype np.float32 =\", result)",
 8 |                 "old_output": "logsumexp(a) with dtype np.float32 = 3.4076059644443806\n",
 9 |                 "new_output": "logsumexp(a) with dtype np.float32 = 3.4076061\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "floating point (in)accuracy, which seems a legitimate side-effect of the type-related fix"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 13:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = None\ntry:\n    result = logsumexp(a)\nexcept Exception as e:",
17 |                 "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 8 (BugGPT_test_code.py, line 8)\n/root/conda/envs/scipy-dev/lib/python3.13/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n",
18 |                 "new_output": "/root/conda/envs/scipy-dev/lib/python3.13/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 8 (BugGPT_test_code.py, line 8)\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "irrelevant difference in logging order"
22 |         }
23 |     ]
24 | }


--------------------------------------------------------------------------------
/src/testora/evaluation/CheckedPRsInspector.py:
--------------------------------------------------------------------------------
 1 | # Helper script to find PRs that are in-scope for checking (i.e., that are not discarded because of changing only test files, having to many changes, etc.)
 2 | 
 3 | from testora.evaluation.ResultsManager import result_files_for_project
 4 | from testora.util.LogParser import parse_log_files
 5 | from testora.evaluation.TargetPRs import project_to_target_prs
 6 | 
 7 | for project in ["keras", "marshmallow", "scipy", "pandas"]:
 8 |     print(f"Project {project}:")
 9 |     pr_results, _ = parse_log_files(result_files_for_project(project))
10 |     in_scope_pr_nbs = []
11 |     for pr_result in pr_results:
12 |         if pr_result.status() != "ignored":
13 |             in_scope_pr_nbs.append(pr_result.number)
14 |     print(",\n".join([str(n) for n in sorted(in_scope_pr_nbs)]))
15 |     print(f"--> {len(in_scope_pr_nbs)} PRs in-scope\n")
16 |     print()
17 | 
18 | 
19 | print("\n\n===========================\n\n")
20 | 
21 | # print new results as csv
22 | minimum_timestamp = "2024-11-22 09:05:00"
23 | print("Project, PR, Generated tests, Executed tests, Diff-covered tests, Failures, Differences")
24 | for project, target_prs in project_to_target_prs().items():
25 |     pr_results, _ = parse_log_files(
26 |         result_files_for_project(project, minimum_timestamp))
27 |     for target_pr in target_prs:
28 |         pr_result = next(
29 |             (r for r in pr_results if r.number == target_pr), None)
30 |         if pr_result is None:
31 |             entries = [
32 |                 project,
33 |                 str(target_pr)
34 |             ]
35 |         else:
36 |             entries = [
37 |                 project,
38 |                 str(target_pr),
39 |                 str(pr_result.nb_generated_tests),
40 |                 str(pr_result.nb_test_executions),
41 |                 str(pr_result.nb_diff_covered_tests),
42 |                 str(pr_result.nb_test_failures),
43 |                 str(pr_result.nb_different_behavior)
44 |             ]
45 |         print(", ".join(entries))
46 | 


--------------------------------------------------------------------------------
/src/testora/prompts/SelectExpectedBehaviorPrompt.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from testora.util.Exceptions import TestoraException
 4 | 
 5 | answer_pattern = re.compile(r"<ANSWER>(.*?)</ANSWER>", re.DOTALL)
 6 | 
 7 | 
 8 | class SelectExpectedBehaviorPrompt:
 9 |     def __init__(self, project_name, test_code, output1, output2, docstrings):
10 |         self.project_name = project_name
11 |         self.test_code = test_code
12 |         self.output1 = output1
13 |         self.output2 = output2
14 |         self.docstrings = docstrings
15 |         self.use_json_output = False
16 | 
17 |     def create_prompt(self):
18 |         template = """
19 | # Usage Example
20 | The following is a usage example of the {project_name} project:
21 | ```python
22 | {test_code}
23 | ```
24 | 
25 | # Docstrings of Relevant APIs
26 | {docstrings}
27 | 
28 | # Possible Outputs
29 | Consider the following two outputs that the above example could produce.
30 | 
31 | Output 1:
32 | {output1}
33 | 
34 | Output 2:
35 | {output2}
36 | 
37 | # Question
38 | Which of the two outputs is the expected behavior of the example? Explain your reasoning, and then write in <ANSWER> and </ANSWER> tags either "Output 1" or "Output 2".
39 | """
40 |         return template.format(project_name=self.project_name,
41 |                                test_code=self.test_code,
42 |                                docstrings=self.docstrings,
43 |                                output1=self.output1,
44 |                                output2=self.output2)
45 | 
46 |     def parse_answer(self, raw_answer):
47 |         assert type(raw_answer) == list
48 |         assert len(raw_answer) == 1
49 | 
50 |         raw_answer = raw_answer[0]
51 |         
52 |         match = re.search(answer_pattern, raw_answer)
53 |         if match is None:
54 |             raise TestoraException("Could not find answer in the response.")
55 |         answer = match.group(1)
56 |         if answer.strip() == "Output 1":
57 |             return 1
58 |         elif answer.strip() == "Output 2":
59 |             return 2
60 |         else:
61 |             return 0
62 | 


--------------------------------------------------------------------------------
/.devcontainer/setup_scikit-learn.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "Creating directory for clones"
 4 | cd ..
 5 | sudo mkdir clones
 6 | sudo chown vscode:vscode clones/
 7 | cd clones
 8 | 
 9 | echo "Cleaning any existing scikit-learn containers"
10 | docker rm -f scikit-learn-dev1
11 | docker rm -f scikit-learn-dev2
12 | docker rm -f scikit-learn-dev3
13 | 
14 | mkdir clone1
15 | cd clone1
16 | 
17 | echo "Creating first clone of scikit-learn"
18 | git clone https://github.com/scikit-learn/scikit-learn.git
19 | cd scikit-learn
20 | echo "Building dev container for scikit-learn (first clone)"
21 | docker run -t -d --name scikit-learn-dev1 -v ${PWD}:/home/scikit-learn python:3.10
22 | docker exec -w /home/scikit-learn scikit-learn-dev1 pip install wheel numpy scipy cython meson-python ninja
23 | docker exec -w /home/scikit-learn scikit-learn-dev1 pip install --editable . --verbose --no-build-isolation --config-settings editable-verbose=true
24 | echo "Done with first clone"
25 | 
26 | echo "Creating second clone of scikit-learn"
27 | cd ../..
28 | cp -r clone1 clone2
29 | cd clone2/scikit-learn
30 | echo "Building dev container for scikit-learn (first clone)"
31 | docker run -t -d --name scikit-learn-dev2 -v ${PWD}:/home/scikit-learn python:3.10
32 | docker exec -w /home/scikit-learn scikit-learn-dev2 pip install wheel numpy scipy cython meson-python ninja
33 | docker exec -w /home/scikit-learn scikit-learn-dev2 pip install --editable . --verbose --no-build-isolation --config-settings editable-verbose=true
34 | echo "Done with second clone"
35 | 
36 | echo "Creating third clone of scikit-learn"
37 | cd ../..
38 | cp -r clone1 clone3
39 | cd clone3/scikit-learn
40 | echo "Building dev container for scikit-learn (first clone)"
41 | docker run -t -d --name scikit-learn-dev3 -v ${PWD}:/home/scikit-learn python:3.10
42 | docker exec -w /home/scikit-learn scikit-learn-dev3 pip install wheel numpy scipy cython meson-python ninja
43 | docker exec -w /home/scikit-learn scikit-learn-dev3 pip install --editable . --verbose --no-build-isolation --config-settings editable-verbose=true
44 | echo "Done with third clone"
45 | 
46 | cd ../../../Testora
47 | 


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/2271.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 2271,
 3 |     "log_file": "data/results/marshmallow/2271_2024-11-23 09:16:17.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 4:\nfrom marshmallow import Schema, fields\n\nclass ManySchema(Schema):\n    foo = fields.Str()\n\n    class Meta:\n        many = True\n\nschema = ManySchema()\nresult = schema.load([{\"foo\": \"bar\"}, {\"foo\": \"baz\"}])",
 8 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 11, in <module>\n    result = schema.load([{\"foo\": \"bar\"}, {\"foo\": \"baz\"}])\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 724, in load\n    return self._do_load(\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 911, in _do_load\n    raise exc\nmarshmallow.exceptions.ValidationError: {'_schema': ['Invalid input type.']}\n",
 9 |                 "new_output": ""
10 |             },
11 |             "label": "intended",
12 |             "comment": "PR addresses issue that is about supporting 'many=True'"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 5:\nfrom marshmallow import Schema, fields\n\nclass SampleSchema(Schema):\n    code = fields.Int()\n\n    class Meta:\n        many = True\n\nschema = SampleSchema()\nresult = schema.load([{\"code\": 100}, {\"code\": 200}])",
17 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 11, in <module>\n    result = schema.load([{\"code\": 100}, {\"code\": 200}])\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 724, in load\n    return self._do_load(\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 911, in _do_load\n    raise exc\nmarshmallow.exceptions.ValidationError: {'_schema': ['Invalid input type.']}\n",
18 |                 "new_output": ""
19 |             },
20 |             "label": "intended",
21 |             "comment": "PR addresses issue that is about supporting 'many=True'"
22 |         }
23 |     ]
24 | }


--------------------------------------------------------------------------------
/src/multilspy/lsp_protocol_handler/lsp_constants.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module contains constants used in the LSP protocol.
 3 | """
 4 | 
 5 | class LSPConstants:
 6 |     """
 7 |     This class contains constants used in the LSP protocol.
 8 |     """
 9 | 
10 |     # the key for uri used to represent paths
11 |     URI = "uri"
12 | 
13 |     # the key for range, which is a from and to position within a text document
14 |     RANGE = "range"
15 | 
16 |     # A key used in LocationLink type, used as the span of the origin link
17 |     ORIGIN_SELECTION_RANGE = "originSelectionRange"
18 | 
19 |     # A key used in LocationLink type, used as the target uri of the link
20 |     TARGET_URI = "targetUri"
21 | 
22 |     # A key used in LocationLink type, used as the target range of the link
23 |     TARGET_RANGE = "targetRange"
24 | 
25 |     # A key used in LocationLink type, used as the target selection range of the link
26 |     TARGET_SELECTION_RANGE = "targetSelectionRange"
27 | 
28 |     # key for the textDocument field in the request
29 |     TEXT_DOCUMENT = "textDocument"
30 | 
31 |     # key used to represent the language a document is in - "java", "csharp", etc.
32 |     LANGUAGE_ID = "languageId"
33 | 
34 |     # key used to represent the version of a document (a shared value betwen the client and server)
35 |     VERSION = "version"
36 | 
37 |     # key used to represent the text of a document being sent from the client to the server on open
38 |     TEXT = "text"
39 | 
40 |     # key used to represent a position (line and colnum) within a text document
41 |     POSITION = "position"
42 | 
43 |     # key used to represent the line number of a position
44 |     LINE = "line"
45 | 
46 |     # key used to represent the column number of a position
47 |     CHARACTER = "character"
48 | 
49 |     # key used to represent the changes made to a document
50 |     CONTENT_CHANGES = "contentChanges"
51 | 
52 |     # key used to represent name of symbols
53 |     NAME = "name"
54 | 
55 |     # key used to represent the kind of symbols
56 |     KIND = "kind"
57 | 
58 |     # key used to represent children in document symbols
59 |     CHILDREN = "children"
60 | 


--------------------------------------------------------------------------------
/src/testora/execution/CoverageAnalyzer.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | import io
 3 | import sqlite3
 4 | from coverage.data import CoverageData
 5 | 
 6 | from testora.util.Exceptions import TestoraException
 7 | 
 8 | 
 9 | @dataclass
10 | class DiffCoverage:
11 |     percentage_covered: float
12 |     total_modified_lines: int
13 |     total_covered_modified_lines: int
14 | 
15 |     def __str__(self):
16 |         return f"Coverage: {self.percentage_covered:.2%} ({self.total_covered_modified_lines}/{self.total_modified_lines})"
17 | 
18 | 
19 | def summarize_coverage(pr, test_execution, is_old_version):
20 |     # get coverage data
21 |     tmp_coverage_file = "coverage_report"
22 |     with open(tmp_coverage_file, "wb") as f:
23 |         f.write(test_execution.coverage_report)
24 |     coverage_data = CoverageData(tmp_coverage_file)
25 |     coverage_data.read()
26 | 
27 |     # adapt file paths from the container's file system
28 |     project_name = pr.cloned_repo_manager.repo_name
29 |     file_prefix = f"/home/{project_name}/"
30 | 
31 |     # check coverage for modified files
32 |     total_modified_lines = 0
33 |     total_covered_modified_lines = 0
34 |     target_files = pr.non_test_modified_python_files
35 |     for target_file in target_files:
36 |         # get modified lines
37 |         if is_old_version:
38 |             modified_lines = pr.old_file_path_to_modified_lines[target_file]
39 |         else:
40 |             modified_lines = pr.new_file_path_to_modified_lines[target_file]
41 | 
42 |         # get covered lines
43 |         covered_lines = coverage_data.lines(file_prefix + target_file)
44 |         if covered_lines is None:
45 |             # happens, e.g., when generated test doesn't invoke the tested project
46 |             covered_lines = set()
47 | 
48 |         total_modified_lines += len(modified_lines)
49 |         total_covered_modified_lines += len(set(modified_lines)
50 |                                             & set(covered_lines))
51 | 
52 |     percentage_covered = total_covered_modified_lines / \
53 |         total_modified_lines if total_modified_lines > 0 else 0
54 | 
55 |     return DiffCoverage(
56 |         percentage_covered,
57 |         total_modified_lines,
58 |         total_covered_modified_lines)
59 | 


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/2123.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 2123,
 3 |     "log_file": "data/results/marshmallow/2123_2024-11-23 09:16:17.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 7: Setting both relative and absolute, expecting failure\nfrom marshmallow import Schema, fields, ValidationError\n\ntry:\n    class MySchema(Schema):\n        url = fields.Url(relative=True, absolute=False)\n\n    schema = MySchema()\n    result = schema.load({\"url\": \"http://example.com\"})\nexcept ValidationError as e:",
 8 |                 "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n",
 9 |                 "new_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "different order of error messages"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "from marshmallow import Schema, fields\n\n# Example 3: URL that is not absolute or relative\ntry:\n    class MySchema(Schema):\n        url = fields.Url(relative=False, absolute=False)\n        \n    result = MySchema().load({\"url\": \"example.com\"})\nexcept Exception as e:\n    print(e)  # Should raise a validation error",
17 |                 "old_output": "{'url': ['Not a valid URL.']}\n",
18 |                 "new_output": "URL validation cannot set both relative and absolute to False.\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "PR mentions that cannot set both 'relative' and 'absolute' to False"
22 |         }
23 |     ]
24 | }


--------------------------------------------------------------------------------
/templates/pr_result.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | 
 4 | <head>
 5 |     <meta charset="UTF-8">
 6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 7 |     <title>Result for PR {{ pr_result.number }}</title>
 8 |     <style>
 9 |         body {
10 |             font-family: Arial, sans-serif;
11 |             margin: 20px;
12 |         }
13 | 
14 |         pre {
15 |             background-color: #f4f4f4;
16 |             border: 1px solid #ddd;
17 |             padding: 10px;
18 |             border-radius: 5px;
19 |             max-width: 72rem;
20 |             white-space: pre-wrap;
21 |             word-wrap: break-word;
22 |             overflow-wrap: break-word;
23 |         }
24 | 
25 |         table {
26 |             width: 100%;
27 |             border-collapse: collapse;
28 |         }
29 | 
30 |         th,
31 |         td {
32 |             border: 1px solid #dddddd;
33 |             text-align: left;
34 |             padding: 8px;
35 |         }
36 | 
37 |         tr:nth-child(even) {
38 |             background-color: #f2f2f2;
39 |         }
40 | 
41 |         .info {
42 |             margin-bottom: 20px;
43 |         }
44 | 
45 |         .compact-table {
46 |             width: auto;
47 |             border: 1px solid black;
48 |         }
49 | 
50 |         .entries {
51 |             font-size: 11pt;
52 |         }
53 | 
54 |         .width100ch {
55 |             max-width: 100ch;
56 |             border: 1px solid #ccc;
57 |             padding: 0.5em;
58 |         }
59 |     </style>
60 | </head>
61 | 
62 | <body>
63 |     <div class="info">
64 |         <h1><a href="{{ pr_result.url }}">PR{{ pr_result.number }}</a>: {{ pr_result.title }}</h1>
65 |     </div>
66 | 
67 |     <h2>Test Case</h2>
68 |     <pre><code>{{ classification_result.test_code }}</code></pre>
69 | 
70 |     <h2>Old Output</h2>
71 |     <pre>{{ classification_result.old_output }}</pre>
72 | 
73 |     <h2>New Output</h2>
74 |     <pre>{{ classification_result.new_output }}</pre>
75 | 
76 |     <h2>Classification: {{ classification_result.classification }}</h2>
77 |     <pre>{{ classification_result.classification_explanation }}</pre>
78 |     <br>
79 |     <div><a href="/pr{{ pr_result.number }}_log">Detailed log</a></div>
80 | 
81 | </body>
82 | 
83 | </html>


--------------------------------------------------------------------------------
/src/testora/util/PythonLanguageServer.py:
--------------------------------------------------------------------------------
 1 | from testora.util.PythonCodeUtil import get_locations_of_calls
 2 | from multilspy import SyncLanguageServer
 3 | from multilspy.multilspy_config import MultilspyConfig
 4 | from multilspy.multilspy_logger import MultilspyLogger
 5 | from pathlib import Path
 6 | 
 7 | 
 8 | class PythonLanguageServer:
 9 |     def __init__(self, repo_path):
10 |         config = MultilspyConfig.from_dict({"code_language": "python"})
11 |         logger = MultilspyLogger()
12 |         absolute_repo_path = str(Path(repo_path).resolve())
13 |         self.lsp = SyncLanguageServer.create(
14 |             config, logger, absolute_repo_path)
15 | 
16 |     def get_hover_text(self, file_path, line, column):
17 |         with self.lsp.start_server():
18 |             raw_result = self.lsp.request_hover(file_path, line, column)
19 |             if type(raw_result) == dict and "contents" in raw_result:
20 |                 return raw_result["contents"]["value"]
21 |             else:
22 |                 return ""
23 | 
24 | 
25 | # for testing
26 | if __name__ == "__main__":
27 |     code = """import pandas as pd
28 | 
29 | series_complex = pd.Series([complex(1,2), complex(3,4)])
30 | # This will result in an error as rounding is not applicable to complex numbers
31 | try:
32 |     rounded_complex = series_complex.round(2)
33 |     print(rounded_complex)
34 | except TypeError as e:
35 |     print(f"Error: {e}")
36 | """
37 |     call_locations = get_locations_of_calls(code)
38 | 
39 |     test_path = "/workspaces/clones/clone2/pandas/testora_code/test.py"
40 |     repo_path = "/workspaces/clones/clone2/pandas/"
41 |     # test_path = "/home/m/research/collabs/Testora/data/repos/pandas_pool/clone2/pandas/testora_code/test.py"
42 |     # repo_path = "/home/m/research/collabs/Testora/data/repos/pandas_pool/clone2/pandas/"
43 | 
44 |     with open(test_path, "w") as f:
45 |         f.write(code)
46 | 
47 |     server = PythonLanguageServer(repo_path)
48 |     for call_location in call_locations:
49 |         line = call_location.start.line - 1  # LSP lines are 0-based
50 |         column = call_location.start.column
51 |         r = server.get_hover_text(
52 |             test_path, line, column)
53 |         print("--------------------------------------------------")
54 |         print(r)
55 |         print()
56 | 


--------------------------------------------------------------------------------
/src/testora/prompts/TemperatureExperiment.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | from testora.prompts.RegressionClassificationPromptV2 import RegressionClassificationPromptV2
 3 | from testora.prompts.PromptCommon import system_message
 4 | 
 5 | with open(".openai_token", "r") as f:
 6 |     openai_key = f.read().strip()
 7 | 
 8 | openai = OpenAI(api_key=openai_key)
 9 | gpt4o_model = "gpt-4o-2024-05-13"
10 | 
11 | with open("data/example_prompts/intended1.txt", "r") as f:
12 |     intended_prompt1 = f.read()
13 | 
14 | with open("data/example_prompts/intended2.txt", "r") as f:
15 |     intended_prompt2 = f.read()
16 | 
17 | with open("data/example_prompts/intended3.txt", "r") as f:
18 |     intended_prompt3 = f.read()
19 | 
20 | with open("data/example_prompts/surprising1.txt", "r") as f:
21 |     surprising_prompt1 = f.read()
22 | 
23 | with open("data/example_prompts/surprising2.txt", "r") as f:
24 |     surprising_prompt2 = f.read()
25 | 
26 | 
27 | def call_model(prompt, temperature):
28 |     completion = openai.chat.completions.create(
29 |         model=gpt4o_model,
30 |         messages=[
31 |             {"role": "system", "content": system_message},
32 |             {"role": "user", "content": prompt}
33 |         ],
34 |         max_tokens=4096,  # 4096 is the maximum token limit for gpt-4-0125-preview
35 |         n=1,
36 |         temperature=temperature
37 |     )
38 |     return completion.choices[0].message.content
39 | 
40 | 
41 | if __name__ == "__main__":
42 |     intended = [intended_prompt1, intended_prompt2, intended_prompt3]
43 |     surprising = [surprising_prompt1, surprising_prompt2]
44 |     r = RegressionClassificationPromptV2("", "", "", "", "", "")
45 | 
46 |     for idx, prompt in enumerate(intended):
47 |         print(f"Intended prompt {idx + 1}:")
48 |         for temperature in [0, 0.2, 0.7, 1.0]:
49 |             answer = call_model(prompt, temperature)
50 |             is_relevant_change, is_deterministic, is_public, is_legal, is_surprising = r.parse_answer(
51 |                 [answer])
52 |             print(f"  temp={temperature} gives surprising={is_surprising}")
53 | 
54 |     print()
55 |     for idx, prompt in enumerate(surprising):
56 |         print(f"Surprising prompt {idx + 1}:")
57 |         for temperature in [0, 0.2, 0.7, 1.0]:
58 |             answer = call_model(prompt, temperature)
59 |             is_relevant_change, is_deterministic, is_public, is_legal, is_surprising = r.parse_answer(
60 |                 [answer])
61 |             print(f"  temp={temperature} gives surprising={is_surprising}")
62 | 


--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | 
 4 | <head>
 5 |     <meta charset="UTF-8">
 6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 7 |     <title>Analyzed Pull Requests</title>
 8 |     <style>
 9 |         table {
10 |             width: 100%;
11 |             border-collapse: collapse;
12 |         }
13 | 
14 |         th,
15 |         td {
16 |             border: 1px solid #dddddd;
17 |             text-align: left;
18 |             padding: 8px;
19 |         }
20 | 
21 |         tr:nth-child(even) {
22 |             background-color: #f2f2f2;
23 |         }
24 | 
25 |         .compact-table {
26 |             width: auto;
27 |             border: 1px solid black;
28 |         }
29 |     </style>
30 | </head>
31 | 
32 | <body>
33 |     <h1>Summary</h1>
34 |     <table class="compact-table">
35 |         <thead>
36 |             <tr>
37 |                 <th>Status</th>
38 |                 <th>Number</th>
39 |             </tr>
40 |         </thead>
41 |         <tbody>
42 |             {% for status, nb in summary.items() %}
43 |             <tr style="background-color: {{ color_mapping[status] | default('#FFFFFF') }}">
44 |                 <td>{{ status }}</td>
45 |                 <td>{{ nb }}</td>
46 |             </tr>
47 |             {% endfor %}
48 |         </tbody>
49 |     </table>
50 | 
51 |     <h1>Pull Requests</h1>
52 |     <table>
53 |         <thead>
54 |             <tr>
55 |                 <th>PR</th>
56 |                 <th>Title</th>
57 |                 <th>Summary</th>
58 |                 <th>Time</th>
59 |                 <th>Status</th>
60 |                 <th>Classifications</th>
61 |             </tr>
62 |         </thead>
63 |         <tbody>
64 |             {% for pr_result in pr_results %}
65 |             <tr style="background-color: {{ color_mapping[pr_result.status()] | default('#FFFFFF') }}">
66 |                 <td><a href="{{ pr_result.url }}">{{ pr_result.number }}</a></td>
67 |                 <td>{{ pr_result.title }}</td>
68 |                 <td>{{ pr_result.summary() }}</td>
69 |                 <td>{{ pr_result.time_taken }}</td>
70 |                 <td><a href="/pr{{ pr_result.number }}_log">{{ pr_result.status() }}</a></td>
71 |                 <td>
72 |                     {% for classification_result in pr_result.classification_results %}
73 |                     <a href="/pr{{ pr_result.number }}_result{{ loop.index }}">{{ classification_result.classification }}</a><br>
74 |                     {% endfor %}
75 |                 </td>
76 |             </tr>
77 |             {% endfor %}
78 |         </tbody>
79 |     </table>
80 | </body>
81 | 
82 | </html>


--------------------------------------------------------------------------------
/data/ground_truth/pandas/59810.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 59810,
 3 |     "log_file": "data/results/pandas/59810_2024-11-23 09:09:42.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 6:\nimport pandas as pd\n\ndf = pd.DataFrame({'x': [1, 2, None], 'y': [2, 2, 3]})\nresult = df.query('x == y or x == None')",
 8 |                 "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nTraceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 5, in <module>\n    result = df.query('x == y or x == None')\n  File \"/home/pandas/pandas/core/frame.py\", line 4616, in query\n    res = self.eval(expr, **kwargs)\n  File \"/home/pandas/pandas/core/frame.py\", line 4769, in eval\n    return _eval(expr, inplace=inplace, **kwargs)\n  File \"/home/pandas/pandas/core/computation/eval.py\", line 366, in eval\n    ret = eng_inst.evaluate()\n  File \"/home/pandas/pandas/core/computation/engines.py\", line 85, in evaluate\n    res = self._evaluate()\n  File \"/home/pandas/pandas/core/computation/engines.py\", line 129, in _evaluate\n    return ne.evaluate(s, local_dict=scope)\n  File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 977, in evaluate\n    raise e\n  File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 874, in validate\n    _names_cache[expr_key] = getExprNames(ex, context, sanitize=sanitize)\n  File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 723, in getExprNames\n    ex = stringToExpression(text, {}, context, sanitize)\n  File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 309, in stringToExpression\n    ex = eval(c, names)\n  File \"<expr>\", line 1, in <module>\n  File \"/usr/local/lib/python3.10/site-packages/numexpr/expressions.py\", line 80, in func\n    raise TypeError(\"unsupported object type: %s\" % type(x))\nTypeError: unsupported object type: <class 'NoneType'>\n",
 9 |                 "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\n"
10 |             },
11 |             "label": "coincidental fix",
12 |             "comment": "Positive side-effect of the fix. The new version uses the Python evaluation of queries, which doesn't raise an error but handles the None value correctly."
13 |         }
14 |     ]
15 | }


--------------------------------------------------------------------------------
/src/testora/util/Logs.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | from datetime import datetime, timedelta
  4 | import atexit
  5 | from typing import List, Optional
  6 | from pydantic import BaseModel
  7 | 
  8 | from testora.util.ClassificationResult import Classification
  9 | 
 10 | 
 11 | class Event(BaseModel):
 12 |     timestamp: str = ""
 13 |     pr_nb: int
 14 |     message: str
 15 | 
 16 | 
 17 | class PREvent(Event):
 18 |     title: str
 19 |     url: str
 20 | 
 21 | 
 22 | class TestExecutionEvent(Event):
 23 |     code: str
 24 |     output: str
 25 | 
 26 | 
 27 | class ComparisonEvent(Event):
 28 |     test_code: str
 29 |     old_output: str
 30 |     new_output: str
 31 | 
 32 | 
 33 | class PreClassificationEvent(Event):
 34 |     test_code: str
 35 |     old_output: str
 36 |     new_output: str
 37 | 
 38 | 
 39 | class ClassificationEvent(Event):
 40 |     test_code: str
 41 |     old_output: str
 42 |     new_output: str
 43 |     classification: Classification
 44 |     classification_explanation: str
 45 |     old_is_crash: bool
 46 |     new_is_crash: bool
 47 | 
 48 | 
 49 | class SelectBehaviorEvent(Event):
 50 |     expected_output: int
 51 | 
 52 | 
 53 | class LLMEvent(Event):
 54 |     content: str
 55 | 
 56 | 
 57 | class ErrorEvent(Event):
 58 |     details: str
 59 | 
 60 | 
 61 | class CoverageEvent(Event):
 62 |     details: str
 63 | 
 64 | 
 65 | class ClassifierEvalEvent(Event):
 66 |     label: str
 67 |     predictions: str
 68 | 
 69 | 
 70 | events: List[Event] = []
 71 | last_time_stored = datetime.now()
 72 | last_file_stored_to: Optional[str] = None
 73 | 
 74 | 
 75 | def append_event(evt):
 76 |     global last_time_stored
 77 | 
 78 |     evt.timestamp = datetime.now().isoformat()
 79 |     events.append(evt)
 80 |     print(json.dumps(evt.dict(), indent=2))
 81 | 
 82 |     if datetime.now() - last_time_stored > timedelta(minutes=5):
 83 |         store_logs()
 84 |         last_time_stored = datetime.now()
 85 | 
 86 | 
 87 | def get_logs_as_json():
 88 |     return json.dumps([evt.dict() for evt in events], indent=2)
 89 | 
 90 | 
 91 | def store_logs():
 92 |     global last_file_stored_to
 93 |     timestamp = datetime.now().isoformat()
 94 |     event_dicts = [evt.model_dump() for evt in events]
 95 |     out_file = f"logs_{timestamp}.json"
 96 |     json.dump(event_dicts, open(out_file, "w"), indent=2)
 97 | 
 98 |     # remove previous log from this run
 99 |     if last_file_stored_to is not None:
100 |         os.remove(last_file_stored_to)
101 |     last_file_stored_to = out_file
102 | 
103 | 
104 | def reset_logs():
105 |     global events
106 |     events = []
107 |     last_file_stored_to = None
108 | 
109 | 
110 | def start_logging():
111 |     atexit.register(store_logs)
112 | 


--------------------------------------------------------------------------------
/src/testora/prompts/PRRegressionBugRanking.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from testora.util.Logs import LLMEvent, append_event
 3 | 
 4 | 
 5 | class PRRegressionBugRanking:
 6 |     def __init__(self, github_prs, repo_name):
 7 |         self.github_prs = github_prs
 8 |         self.repo_name = repo_name
 9 |         self.use_json_output = True
10 | 
11 |     def create_prompt(self):
12 |         template = """
13 | The following is a list of titles of pull requests in the <repo_name> project. Rank them by their likelihood to accidentally introduce a regression bug.
14 | 
15 | <pr_titles>
16 | 
17 | Provide your answer using this JSON format:
18 | ```json
19 | {
20 |     "high risk": [
21 |         "PR title 1",
22 |         "PR title 2",
23 |         ...
24 |     ],
25 |     "medium risk": [
26 |         "PR title 3",
27 |         "PR title 4",
28 |         ...
29 |     ],
30 |     "low risk": [
31 |         "PR title 5",
32 |         "PR title 6",
33 |         ...
34 |     ]
35 | }
36 | ```
37 | Make sure to include ALL the given pull requests into the output.
38 | """
39 |         pr_titles = [github_pr.title for github_pr in self.github_prs]
40 |         return template.replace("<repo_name>", self.repo_name).replace("<pr_titles>", "\n".join(pr_titles))
41 | 
42 |     def parse_answer(self, raw_answer):
43 |         assert type(raw_answer) == list
44 |         assert len(raw_answer) == 1
45 | 
46 |         raw_answer = raw_answer[0]
47 | 
48 |         try:
49 |             risk_to_titles = json.loads(raw_answer)
50 |         except json.JSONDecodeError:
51 |             return None
52 | 
53 |         if not isinstance(risk_to_titles, dict):
54 |             return None
55 | 
56 |         if not all(isinstance(risk_to_titles.get(risk), list) for risk in ["high risk", "medium risk", "low risk"]):
57 |             return None
58 | 
59 |         high_risk_titles = set(risk_to_titles.get("high risk"))
60 |         medium_risk_titles = set(risk_to_titles.get("medium risk"))
61 |         low_risk_titles = set(risk_to_titles.get("low risk"))
62 | 
63 |         high_risk_prs = []
64 |         medium_risk_prs = []
65 |         low_risk_prs = []
66 |         for github_pr in self.github_prs:
67 |             if github_pr.title in high_risk_titles:
68 |                 high_risk_prs.append(github_pr)
69 |             elif github_pr.title in medium_risk_titles:
70 |                 medium_risk_prs.append(github_pr)
71 |             elif github_pr.title in low_risk_titles:
72 |                 low_risk_prs.append(github_pr)
73 |             else:
74 |                 append_event(LLMEvent(
75 |                     pr_nb=github_pr.number, message=f"PRRegressionBugRanking omitted a PR title; assuming it's medium-risk", content=github_pr.title))
76 |                 medium_risk_prs.append(github_pr)
77 | 
78 |         return high_risk_prs, medium_risk_prs, low_risk_prs
79 | 


--------------------------------------------------------------------------------
/templates/pr_log.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | 
  4 | <head>
  5 |     <meta charset="UTF-8">
  6 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
  7 |     <title>Log of PR {{ pr_result.number }}</title>
  8 |     <style>
  9 |         body {
 10 |             font-family: Arial, sans-serif;
 11 |             margin: 20px;
 12 |         }
 13 | 
 14 |         pre {
 15 |             background-color: #f4f4f4;
 16 |             border: 1px solid #ddd;
 17 |             padding: 10px;
 18 |             border-radius: 5px;
 19 |             overflow: auto;
 20 |         }
 21 | 
 22 |         table {
 23 |             width: 100%;
 24 |             border-collapse: collapse;
 25 |         }
 26 | 
 27 |         th,
 28 |         td {
 29 |             border: 1px solid #dddddd;
 30 |             text-align: left;
 31 |             padding: 8px;
 32 |         }
 33 | 
 34 |         tr:nth-child(even) {
 35 |             background-color: #f2f2f2;
 36 |         }
 37 | 
 38 |         .info {
 39 |             margin-bottom: 20px;
 40 |         }
 41 | 
 42 |         .compact-table {
 43 |             width: auto;
 44 |             border: 1px solid black;
 45 |         }
 46 | 
 47 |         .entries {
 48 |             font-size: 11pt;
 49 |         }
 50 |     </style>
 51 | </head>
 52 | 
 53 | <body>
 54 |     <div class="info">
 55 |         <h1><a href="{{ pr_result.url }}">PR{{ pr_result.number }}</a>: {{ pr_result.title }}</h1>
 56 |         <h2>Status: {{ pr_result.status() }}</h2>
 57 |     </div>
 58 | 
 59 |     <h2>Perf Stats</h2>
 60 |     <table class="compact-table">
 61 |         <thead>
 62 |             <tr>
 63 |                 <th>Event</th>
 64 |                 <th>Count</th>
 65 |                 <th>Time (total)</th>
 66 |                 <th>Time (avg)</th>
 67 |             </tr>
 68 |         </thead>
 69 |         <tbody>
 70 |             {% for event, count, total_time, avg_time in perf_stats %}
 71 |             <tr>
 72 |                 <td>{{ event }}</td>
 73 |                 <td>{{ count }}</td>
 74 |                 <td>{{ total_time }}</td>
 75 |                 <td>{{ avg_time }}</td>
 76 |             </tr>
 77 |             {% endfor %}
 78 |         </tbody>
 79 |     </table>
 80 | 
 81 |     <h2>Log</h2>
 82 |     {% for entry in pr_result.entries %}
 83 |     <div class="entries">
 84 |         <table>
 85 |             <thead>
 86 |                 <tr>
 87 |                     <th>Key</th>
 88 |                     <th>Value</th>
 89 |                 </tr>
 90 |             </thead>
 91 |             <tbody>
 92 |                 {% for key, value in entry.items() %}
 93 |                 <tr>
 94 |                     <td>{{ key }}</td>
 95 |                     <td>{{ value | escape_tags | nl2br | safe }}</td>
 96 |                 </tr>
 97 |                 {% endfor %}
 98 |             </tbody>
 99 |         </table>
100 |     </div>
101 |     {% endfor %}
102 | </body>
103 | 
104 | </html>


--------------------------------------------------------------------------------
/src/testora/llms/LLMCache.py:
--------------------------------------------------------------------------------
 1 | import fcntl
 2 | import json
 3 | from os import makedirs
 4 | from os.path import join, exists
 5 | import atexit
 6 | from testora.prompts.PromptCommon import system_message
 7 | from testora.util.Logs import append_event, LLMEvent
 8 | 
 9 | cache_base_dir = "./data/llm_cache/"
10 | if not exists(cache_base_dir):
11 |     makedirs(cache_base_dir)
12 | 
13 | 
14 | class LLMCache:
15 |     def __init__(self, llm_module):
16 |         self.llm_module = llm_module
17 | 
18 |         name = llm_module.model
19 |         cache_dir = join(cache_base_dir, name)
20 |         if not exists(cache_dir):
21 |             makedirs(cache_dir)
22 | 
23 |         self.cache_file = join(cache_dir, "cache.json")
24 |         if exists(self.cache_file):
25 |             with open(self.cache_file, "r") as f:
26 |                 self.cache = json.load(f)
27 |         else:
28 |             self.cache = {}
29 | 
30 |         self.nb_hits = 0
31 |         self.nb_misses = 0
32 | 
33 |         self.nb_unwritten_updates = 0
34 | 
35 |         atexit.register(lambda: self.write_cache())
36 | 
37 |     def write_cache(self):
38 |         with open(self.cache_file, "w") as f:
39 |             fcntl.flock(f, fcntl.LOCK_EX)
40 |             try:
41 |                 json.dump(self.cache, f)
42 |             finally:
43 |                 fcntl.flock(f, fcntl.LOCK_UN)
44 |         print(
45 |             f"LLMCache of {self.llm_module.model} with {len(self.cache)} entries saved. {self.nb_hits} hits, {self.nb_misses} misses.")
46 | 
47 |     def query(self, prompt, nb_samples=1, temperature: float=1, no_cache=False):
48 |         prompt_str = prompt.create_prompt()
49 | 
50 |         # check for cached answer
51 |         if not no_cache:
52 |             result = self.cache.get(prompt_str)
53 |             if result is not None:
54 |                 cached_answers = []
55 |                 if type(result) == str:
56 |                     cached_answers.append(result)
57 |                 elif type(result) == list:
58 |                     cached_answers = result
59 | 
60 |                 if nb_samples <= len(cached_answers):
61 |                     append_event(LLMEvent(pr_nb=-1,
62 |                                         message=f"Cached result for querying {self.llm_module.model}",
63 |                                         content=f"System message:\n{system_message}\nUser message:\n{prompt.create_prompt()}"))
64 |                     self.nb_hits += 1
65 |                     print(f"Prompt:\n{prompt_str}\nReturning cached result\n")
66 |                     return cached_answers[:nb_samples]
67 | 
68 |         # no cached answer (or don't want to use cache), query LLM
69 |         self.nb_misses += 1
70 |         result = self.llm_module.query(prompt, nb_samples=nb_samples, temperature=temperature)
71 | 
72 |         if no_cache:
73 |             return result
74 | 
75 |         # update cache (only if answer is non-empty)
76 |         if result:
77 |             self.cache[prompt_str] = result
78 |             self.nb_unwritten_updates += 1
79 | 
80 |         # write cache every 10 updates
81 |         if self.nb_unwritten_updates > 10:
82 |             self.write_cache()
83 |             self.nb_unwritten_updates = 0
84 | 
85 |         return result
86 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21597.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21597,
 3 |     "log_file": "data/results/scipy/21597_2024-11-23 09:06:44.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 1:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0])\nresult = logsumexp(a)\nprint(\"logsumexp of [1.0, 2.0, 3.0]:\", result)",
 8 |                 "old_output": "logsumexp of [1.0, 2.0, 3.0]: 3.4076059644443806\n",
 9 |                 "new_output": "logsumexp of [1.0, 2.0, 3.0]: 3.40760596444438\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "PR is about changing the precision of the output"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 5:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0])\nresult = logsumexp(a, return_sign=True)\nprint(\"logsumexp with sign of [1.0, 2.0, 3.0]:\", result)",
17 |                 "old_output": "logsumexp with sign of [1.0, 2.0, 3.0]: (np.float64(3.4076059644443806), np.float64(1.0))\n",
18 |                 "new_output": "logsumexp with sign of [1.0, 2.0, 3.0]: (np.float64(3.40760596444438), np.float64(1.0))\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "PR is about changing the precision of the output"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Example 1:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0])\nresult = logsumexp(a)\nprint(\"Example 1 - Normal case:\", result)",
26 |                 "old_output": "Example 1 - Normal case: 3.4076059644443806\n",
27 |                 "new_output": "Example 1 - Normal case: 3.40760596444438\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "PR is about changing the precision of the output"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "import numpy as np\nfrom scipy.special import logsumexp\n\n# Example 5:\n# Return sign\na = np.array([1.0, 2.0, 3.0])\nresult, sign = logsumexp(a, return_sign=True)\nprint(\"Example 5 - Return sign:\", result, sign)",
35 |                 "old_output": "Example 5 - Return sign: 3.4076059644443806 1.0\n",
36 |                 "new_output": "Example 5 - Return sign: 3.40760596444438 1.0\n"
37 |             },
38 |             "label": "intended",
39 |             "comment": "PR is about changing the precision of the output"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "import numpy as np\nfrom scipy.special import logsumexp\n\n# Example 20:\n# Testing different data types\na = np.array([1, 2, 3], dtype=np.int32)\nresult = logsumexp(a.astype(float))  # Convert to float for logsumexp\nprint(\"Example 20 - Different data types:\", result)",
44 |                 "old_output": "Example 20 - Different data types: 3.4076059644443806\n",
45 |                 "new_output": "Example 20 - Different data types: 3.40760596444438\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "PR is about changing the precision of the output"
49 |         }
50 |     ]
51 | }


--------------------------------------------------------------------------------
/src/testora/prompts/RegressionTestGeneratorPromptV1.py:
--------------------------------------------------------------------------------
 1 | # Prompt for generating regression tests based on a given diff
 2 | # V1: As used for ICSE'26 paper
 3 | 
 4 | class RegressionTestGeneratorPromptV1:
 5 |     def __init__(self, project_name, fut_qualified_names, diff):
 6 |         self.project_name = project_name
 7 |         self.fut_qualified_names = fut_qualified_names
 8 |         self.diff = diff
 9 |         self.use_json_output = False
10 | 
11 |     def create_prompt(self):
12 |         template = """
13 | Your task is to generate usage examples of the {project_name} project that expose behavioral differences introduced by the following diff:
14 | 
15 | {diff}
16 | 
17 | The diff affects the following functions: {fut_qualified_names}.
18 | 
19 | The usage examples you create may use only the public API of the {project_name} project. You can assume that the project is installed and ready to be imported. Do NOT use any randomly generated data or timestamps in your examples; instead use fixed or deterministically created inputs. Create usage examples that are diverse and cover a wide range of scenarios, e.g., by (not) passing optional parameters or using different APIs to achieve the same purpose.
20 | 
21 | Answer by giving ten usage examples that cover normal usage scenarios and ten usage examples that focus on corner cases (e.g., unusual values, such as None, NaN or empty lists).
22 | Each example must be an executable piece of Python code, including all necessary imports.
23 | Print all relevant values, including intermediate values, in a human-readable form.
24 | 
25 | Wrap each individual example into Python code blocks by using the following output format:
26 | ```python
27 | # Example 1:
28 | ...
29 | ```
30 | ```python
31 | # Example 2:
32 | ...
33 | ```
34 | ```python
35 | # Example 3:
36 | ...
37 | ```
38 | etc.
39 | """
40 | 
41 |         return template.format(project_name=self.project_name,
42 |                                fut_qualified_names=", ".join(
43 |                                    self.fut_qualified_names),
44 |                                diff=self.diff)
45 | 
46 |     def remove_unnecessary_indentation(self, code):
47 |         lines = code.split("\n")
48 |         if len(lines) > 0:
49 |             # find number of leading spaces in first line
50 |             num_spaces = len(lines[0]) - len(lines[0].lstrip())
51 |             if num_spaces > 0:
52 |                 return "\n".join([line[num_spaces:] for line in lines])
53 |         return code
54 | 
55 |     def parse_answer(self, raw_answer):
56 |         assert type(raw_answer) == list
57 | 
58 |         tests = []
59 | 
60 |         for answer in raw_answer:
61 |             in_code = False
62 |             next_test = ""
63 |             for line in answer.split("\n"):
64 |                 if line.strip() == "```":
65 |                     in_code = False
66 |                     if next_test:
67 |                         next_test = self.remove_unnecessary_indentation(
68 |                             next_test)
69 |                         tests.append(next_test)
70 |                         next_test = ""
71 |                 if in_code:
72 |                     next_test += line + "\n"
73 |                 if line.strip() == "```python":
74 |                     in_code = True
75 | 
76 |         return tests
77 | 


--------------------------------------------------------------------------------
/src/multilspy/language_servers/eclipse_jdtls/runtime_dependencies.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "_description": "This file lists the runtime dependencies for the Java Language Server",
 3 |     "gradle": {
 4 |         "platform-agnostic": {
 5 |             "url": "https://services.gradle.org/distributions/gradle-7.3.3-bin.zip",
 6 |             "archiveType": "zip",
 7 |             "relative_extraction_path": "."
 8 |         }
 9 |     },
10 |     "vscode-java": {
11 |         "darwin-arm64": {
12 |             "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@darwin-arm64-1.23.0.vsix",
13 |             "archiveType": "zip",
14 |             "relative_extraction_path": "vscode-java"
15 |         },
16 |         "darwin-x64": {
17 |             "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@darwin-x64-1.23.0.vsix",
18 |             "archiveType": "zip",
19 |             "relative_extraction_path": "vscode-java"
20 |         },
21 |         "linux-arm64": {
22 |             "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@linux-arm64-1.23.0.vsix",
23 |             "archiveType": "zip",
24 |             "relative_extraction_path": "vscode-java"
25 |         },
26 |         "linux-x64": {
27 |             "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@linux-x64-1.23.0.vsix",
28 |             "archiveType": "zip",
29 |             "relative_extraction_path": "vscode-java",
30 |             "jre_home_path": "extension/jre/17.0.8.1-linux-x86_64",
31 |             "jre_path": "extension/jre/17.0.8.1-linux-x86_64/bin/java",
32 |             "lombok_jar_path": "extension/lombok/lombok-1.18.30.jar",
33 |             "jdtls_launcher_jar_path": "extension/server/plugins/org.eclipse.equinox.launcher_1.6.500.v20230717-2134.jar",
34 |             "jdtls_readonly_config_path": "extension/server/config_linux"
35 |         },
36 |         "win-x64": {
37 |             "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@win32-x64-1.23.0.vsix",
38 |             "archiveType": "zip",
39 |             "relative_extraction_path": "vscode-java",
40 |             "jre_home_path": "extension/jre/17.0.8.1-win32-x86_64",
41 |             "jre_path": "extension/jre/17.0.8.1-win32-x86_64/bin/java.exe",
42 |             "lombok_jar_path": "extension/lombok/lombok-1.18.30.jar",
43 |             "jdtls_launcher_jar_path": "extension/server/plugins/org.eclipse.equinox.launcher_1.6.500.v20230717-2134.jar",
44 |             "jdtls_readonly_config_path": "extension/server/config_win"
45 |         }
46 |     },
47 |     "intellicode": {
48 |         "platform-agnostic": {
49 |             "url": "https://VisualStudioExptTeam.gallery.vsassets.io/_apis/public/gallery/publisher/VisualStudioExptTeam/extension/vscodeintellicode/1.2.30/assetbyname/Microsoft.VisualStudio.Services.VSIXPackage",
50 |             "alternate_url": "https://marketplace.visualstudio.com/_apis/public/gallery/publishers/VisualStudioExptTeam/vsextensions/vscodeintellicode/1.2.30/vspackage",
51 |             "archiveType": "zip",
52 |             "relative_extraction_path": "intellicode",
53 |             "intellicode_jar_path": "extension/dist/com.microsoft.jdtls.intellicode.core-0.7.0.jar",
54 |             "intellisense_members_path": "extension/dist/bundledModels/java_intellisense-members"
55 |         }
56 |     }
57 | }


--------------------------------------------------------------------------------
/data/ground_truth/pandas/59782.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 59782,
 3 |     "log_file": "data/results/pandas/59782_2024-11-23 09:09:42.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 7:\nimport pandas as pd\n\n# Using a larger dataset and grouping\ndf = pd.DataFrame({\"group\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n                   \"value\": [pd.Timedelta(1), pd.Timedelta(2), pd.NaT, pd.Timedelta(4), pd.NaT, pd.NaT]})\nresult = df.groupby(\"group\")[\"value\"].any()\nprint(\"Example 7 Result:\\n\", result)",
 8 |                 "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nExample 7 Result:\n group\nA    True\nB    True\nC    True\nName: value, dtype: bool\n",
 9 |                 "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nExample 7 Result:\n group\nA     True\nB     True\nC    False\nName: value, dtype: bool\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "values for C are all NaT, and hence, result should be False"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 4:\nimport pandas as pd\n\n# Grouping with a single group having only NaT\ndf = pd.DataFrame({\"group\": [\"A\", \"A\", \"B\"], \"value\": [pd.NaT, pd.NaT, pd.Timedelta(1)]})\nresult = df.groupby(\"group\")[\"value\"].any()\nprint(\"Corner Case Example 4 Result:\\n\", result)",
17 |                 "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 4 Result:\n group\nA    True\nB    True\nName: value, dtype: bool\n",
18 |                 "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 4 Result:\n group\nA    False\nB     True\nName: value, dtype: bool\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "values for A are all NaT, and hence, result should be False"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Example 7:\nimport pandas as pd\n\n# Grouping with a series of NaT values and expected output check\ndf = pd.DataFrame({\"group\": [\"B\", \"B\", \"C\", \"C\"], \n                   \"value\": [pd.NaT, pd.NaT, pd.NaT, pd.Timedelta(5)]})\nresult = df.groupby(\"group\")[\"value\"].any()\nprint(\"Corner Case Example 7 Result:\\n\", result)",
26 |                 "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 7 Result:\n group\nB    True\nC    True\nName: value, dtype: bool\n",
27 |                 "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n  self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 7 Result:\n group\nB    False\nC     True\nName: value, dtype: bool\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "values for B are all NaT, and hence, result should be False"
31 |         }
32 |     ]
33 | }


--------------------------------------------------------------------------------
/src/testora/execution/DockerExecutor.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import docker
  3 | import tarfile
  4 | import tempfile
  5 | from os.path import join
  6 | from os import chdir, getcwd
  7 | 
  8 | 
  9 | class DockerExecutor:
 10 |     def __init__(self, container_name, project_name, coverage_files):
 11 |         client = docker.from_env()
 12 |         self.container = client.containers.get(container_name)
 13 |         self.container.start()
 14 | 
 15 |         # adapt paths of coverage files to the container's file system
 16 |         self.coverage_files = [f"/home/{project_name}/{f}" for f in coverage_files]
 17 | 
 18 |     def copy_code_to_container(self, code, target_file_path):
 19 |         target_dir = target_file_path.rsplit("/", 1)[0]
 20 |         target_file_name = target_file_path.rsplit("/", 1)[1]
 21 | 
 22 |         with tempfile.TemporaryDirectory() as tmp_dir:
 23 |             code_file = join(tmp_dir, target_file_name)
 24 |             with open(code_file, "w") as f:
 25 |                 f.write(code)
 26 |             tar_file = join(tmp_dir, "archive.tar")
 27 |             with tarfile.open(tar_file, mode="w") as tar:
 28 |                 wd = getcwd()
 29 |                 try:
 30 |                     chdir(tmp_dir)
 31 |                     tar.add(target_file_name)
 32 |                 finally:
 33 |                     chdir(wd)
 34 | 
 35 |             data = open(tar_file, "rb").read()
 36 |             self.container.put_archive(target_dir, data)
 37 | 
 38 |     def copy_file_from_container(self, file_path_in_container, target_dir):
 39 |         data, _ = self.container.get_archive(file_path_in_container)
 40 |         temp_tar_file = "temp.tar"
 41 |         with open(temp_tar_file, "wb") as f:
 42 |             for d in data:
 43 |                 f.write(d)
 44 |         
 45 |         with tarfile.open(temp_tar_file, mode="r") as tar:
 46 |             tar.extractall(target_dir)
 47 | 
 48 |         os.remove(temp_tar_file)
 49 | 
 50 |     def execute_python_code(self, code):
 51 |         # create a fresh directory to get rid of any old state
 52 |         self.container.exec_run("rm -rf /tmp/Testora")
 53 |         self.container.exec_run("mkdir /tmp/Testora")
 54 | 
 55 |         self.copy_code_to_container(code, "/tmp/Testora/Testora_test_code.py")
 56 |         coverage_files = ",".join(f"\"{f}\"" for f in self.coverage_files)
 57 |         # -u to avoid non-deterministic buffering
 58 |         command = (
 59 |             f"timeout 300s python -u -m coverage run "
 60 |             f"--include={coverage_files} "
 61 |             f"--data-file /tmp/coverage_report /tmp/Testora/Testora_test_code.py"
 62 |         )
 63 | 
 64 |         # for scipy and numpy, make sure we run inside the their dev environment
 65 |         if self.container.name.startswith("scipy-dev"):
 66 |             command = (
 67 |                 f"bash -c 'source /root/conda/etc/profile.d/conda.sh"
 68 |                 f" && eval \"$(mamba shell hook --shell bash)\" && mamba activate scipy-dev"
 69 |                 f" && {command}'"
 70 |             )
 71 |         elif self.container.name.startswith("numpy-dev"):
 72 |             command = (
 73 |                 f"bash -c 'source /root/conda/etc/profile.d/conda.sh"
 74 |                 f" && source /root/conda/etc/profile.d/mamba.sh"
 75 |                 f"' && mamba activate numpy-dev && {command}'"
 76 |             )
 77 | 
 78 |         exec_result = self.container.exec_run(command)
 79 |         output = exec_result.output.decode("utf-8")
 80 | 
 81 |         self.copy_file_from_container(
 82 |             "/tmp/coverage_report", ".")
 83 |         with open("coverage_report", "rb") as f:
 84 |             coverage_report = f.read()
 85 | 
 86 |             return output, coverage_report
 87 | 
 88 | 
 89 | if __name__ == "__main__":
 90 |     code = """
 91 | x = 23
 92 | 
 93 | print(x)
 94 | x.foo()
 95 | print("never reach this")
 96 | """
 97 | 
 98 |     executor = DockerExecutor("pandas-dev", "pandas", coverage_files=[])
 99 |     output = executor.execute_python_code(code)
100 |     print(output)
101 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21572.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21572,
 3 |     "log_file": "data/results/scipy/21572_2024-11-23 09:06:43.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 7:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1, 1, 0], dtype=bool)\nv = np.array([0, 0, 1], dtype=bool)\n\n# Computing Kulczynski 1\nkulczynski_value = distance.kulczynski1(u, v)",
 8 |                 "old_output": "",
 9 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0.  Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n  kulczynski_value = distance.kulczynski1(u, v)\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "new version prints deprecation warning, which is what the PR is about"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 2:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([np.nan, 1, 0], dtype=bool)\nv = np.array([0, 1, 1], dtype=bool)\n\n# Computation with NaN\nkulczynski_result = distance.kulczynski1(u, v)",
17 |                 "old_output": "",
18 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0.  Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n  kulczynski_result = distance.kulczynski1(u, v)\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "new version prints deprecation warning, which is what the PR is about"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Example 3:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1, 1, 1], dtype=bool)\nv = np.array([0, 0, 0], dtype=bool)\n\n# All equal values\nkulczynski_result = distance.kulczynski1(u, v)",
26 |                 "old_output": "",
27 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0.  Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n  kulczynski_result = distance.kulczynski1(u, v)\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "new version prints deprecation warning, which is what the PR is about"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "# Example 4:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1], dtype=bool)\nv = np.array([0], dtype=bool)\n\n# Single element arrays\nkulczynski_result = distance.kulczynski1(u, v)",
35 |                 "old_output": "",
36 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0.  Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n  kulczynski_result = distance.kulczynski1(u, v)\n"
37 |             },
38 |             "label": "intended",
39 |             "comment": "new version prints deprecation warning, which is what the PR is about"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "# Example 6:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1, 1, 0, 0], dtype=bool)\nv = np.array([1, 0, 1, 1], dtype=bool)\n\n# Compute distance with mixed values\nkulczynski_result = distance.kulczynski1(u, v)",
44 |                 "old_output": "",
45 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0.  Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n  kulczynski_result = distance.kulczynski1(u, v)\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "new version prints deprecation warning, which is what the PR is about"
49 |         }
50 |     ]
51 | }


--------------------------------------------------------------------------------
/src/testora/prompts/RegressionTestGeneratorPromptV2.py:
--------------------------------------------------------------------------------
 1 | # Prompt for generating regression tests based on a given diff
 2 | # V2: Variant of V1 optimized by https://platform.openai.com/chat/edit?models=gpt-5&optimize=true for use with GPT-5
 3 | 
 4 | class RegressionTestGeneratorPromptV2:
 5 |     def __init__(self, project_name, fut_qualified_names, diff):
 6 |         self.project_name = project_name
 7 |         self.fut_qualified_names = fut_qualified_names
 8 |         self.diff = diff
 9 |         self.use_json_output = False
10 | 
11 |     def create_prompt(self):
12 |         template = """
13 | Developer: Begin with a concise checklist (3-7 bullets) of what you will do; keep items conceptual, not implementation-level.
14 | 
15 | Your task is to create usage examples for the {project_name} project, specifically designed to highlight behavioral differences introduced by the following diff:
16 | 
17 | {diff}
18 | 
19 | This diff modifies the following functions: {fut_qualified_names}.
20 | 
21 | Instructions:
22 | - Use only the public API from {project_name}. Assume the package is installed and importable.
23 | - Avoid using any randomly generated data or dynamic timestamps. All inputs must be fixed or deterministic.
24 | - Generate a total of 20 executable Python usage examples, each in a separate code block marked with triple backticks.
25 | - The first 10 examples should demonstrate standard/typical usage scenarios.
26 | - The next 10 (examples 11-20) should focus on corner cases and edge conditions, such as unusual values (e.g., None, NaN, empty lists, etc.).
27 | - Each Python code block must:
28 |     - Be self-contained, including all necessary imports.
29 |     - Begin with a comment: e.g., '# Example 1: <short description>'.
30 |     - Include clear print statements for input arguments, outputs, and any intermediate values that help show differences in behavior.
31 |     - If an exception is expected for an edge case, wrap the code in a try/except and print only the exception message (avoid printing stack traces).
32 | - Use the following output format for each example. Failing to wrap the code into backticks will make the result unusable:
33 | ```python
34 | # Example <N>: <short description>
35 | <example code>
36 | ```
37 | 
38 | Output Requirements:
39 | - Submit exactly 20 Python code blocks, numbered sequentially from 1 to 20.
40 | - Code blocks 1-10: Standard use cases.
41 | - Code blocks 11-20: Edge/corner cases.
42 | - Every block is executable and prints human-readable inputs and results.
43 | - Exceptions are handled and their messages printed only.
44 | 
45 | After generating all examples, validate that each code block is executable and correctly numbered. If any do not meet the requirements, revise as needed before final submission.
46 | """
47 | 
48 |         return template.format(project_name=self.project_name,
49 |                                fut_qualified_names=", ".join(
50 |                                    self.fut_qualified_names),
51 |                                diff=self.diff)
52 | 
53 |     def remove_unnecessary_indentation(self, code):
54 |         lines = code.split("\n")
55 |         if len(lines) > 0:
56 |             # find number of leading spaces in first line
57 |             num_spaces = len(lines[0]) - len(lines[0].lstrip())
58 |             if num_spaces > 0:
59 |                 return "\n".join([line[num_spaces:] for line in lines])
60 |         return code
61 | 
62 |     def parse_answer(self, raw_answer):
63 |         assert type(raw_answer) == list
64 | 
65 |         tests = []
66 | 
67 |         for answer in raw_answer:
68 |             in_code = False
69 |             next_test = ""
70 |             for line in answer.split("\n"):
71 |                 if line.strip() == "```":
72 |                     in_code = False
73 |                     if next_test:
74 |                         next_test = self.remove_unnecessary_indentation(
75 |                             next_test)
76 |                         tests.append(next_test)
77 |                         next_test = ""
78 |                 if in_code:
79 |                     next_test += line + "\n"
80 |                 if line.strip() == "```python":
81 |                     in_code = True
82 | 
83 |         return tests
84 | 


--------------------------------------------------------------------------------
/src/testora/evaluation/FindCandidateProjects.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import csv
  3 | 
  4 | with open(".github_token") as f:
  5 |     github_token = f.read().strip()
  6 | headers = {'Authorization': f'token {github_token}'}
  7 | 
  8 | 
  9 | def search_repositories(query, sort='stars', order='desc', per_page=100):
 10 |     url = 'https://api.github.com/search/repositories'
 11 |     params = {
 12 |         'q': query,
 13 |         'sort': sort,
 14 |         'order': order,
 15 |         'per_page': per_page
 16 |     }
 17 |     print(".")
 18 |     response = requests.get(url, headers=headers, params=params)
 19 |     response.raise_for_status()
 20 |     return response.json()
 21 | 
 22 | 
 23 | def fetch_top_python_repos(total_repos=1000, per_page=100):
 24 |     url = 'https://api.github.com/search/repositories'
 25 | 
 26 |     params = {
 27 |         'q': 'language:Python',
 28 |         'sort': 'stars',
 29 |         'order': 'desc',
 30 |         'per_page': per_page,
 31 |         'page': 1
 32 |     }
 33 | 
 34 |     repos = []
 35 | 
 36 |     while len(repos) < total_repos:
 37 |         print(".")
 38 |         response = requests.get(url, headers=headers, params=params)
 39 |         response.raise_for_status()  # Raise an exception for HTTP errors
 40 |         data = response.json()
 41 | 
 42 |         repos.extend(data['items'])
 43 | 
 44 |         if 'next' not in response.links:
 45 |             break
 46 | 
 47 |         params['page'] += 1
 48 | 
 49 |     return repos[:total_repos]
 50 | 
 51 | 
 52 | def get_pull_requests_count_graphql(owner, repo):
 53 |     url = 'https://api.github.com/graphql'
 54 | 
 55 |     query = """
 56 |     query($owner: String!, $repo: String!) {
 57 |       repository(owner: $owner, name: $repo) {
 58 |         pullRequests {
 59 |           totalCount
 60 |         }
 61 |       }
 62 |     }
 63 |     """
 64 | 
 65 |     variables = {
 66 |         'owner': owner,
 67 |         'repo': repo
 68 |     }
 69 | 
 70 |     print(".")
 71 |     response = requests.post(
 72 |         url, json={'query': query, 'variables': variables}, headers=headers)
 73 |     response.raise_for_status()  # Raise an exception for HTTP errors
 74 | 
 75 |     data = response.json()
 76 |     total_prs = data['data']['repository']['pullRequests']['totalCount']
 77 | 
 78 |     return total_prs
 79 | 
 80 | 
 81 | def get_pull_requests_count(owner, repo):
 82 |     url = f'https://api.github.com/repos/{owner}/{repo}/pulls'
 83 |     params = {
 84 |         'state': 'all',
 85 |         'per_page': 100,
 86 |         'page': 1
 87 |     }
 88 |     print(".")
 89 |     response = requests.get(url, headers=headers, params=params)
 90 |     response.raise_for_status()
 91 | 
 92 |     total_prs = 0
 93 |     total_prs += len(response.json())
 94 | 
 95 |     while 'next' in response.links:
 96 |         print(".")
 97 |         response = requests.get(response.links['next']['url'], headers=headers)
 98 |         response.raise_for_status()
 99 |         total_prs += len(response.json())
100 | 
101 |     return total_prs
102 | 
103 | 
104 | def main():
105 |     query = 'language:Python'
106 |     # repositories = search_repositories(query)['items']
107 |     repositories = fetch_top_python_repos()
108 | 
109 |     out_file = "candidate_projects2.csv"
110 |     with open(out_file, mode='w', newline='') as out_fp:
111 |         writer = csv.writer(out_fp)
112 |         writer.writerow(['Name', 'Stars', 'PRs', 'Description'])
113 | 
114 |     print(f"Found {len(repositories)} repositories")
115 |     for repo in repositories:
116 |         repo_full_name = repo['full_name']
117 |         repo_description = repo['description']
118 |         if not repo_description or "library" not in repo_description.lower():
119 |             print(
120 |                 f'Skipping {repo_full_name} because it seems to not be a library')
121 |             continue
122 |         print(f"Counting PRs for {repo_full_name}")
123 |         pr_count = get_pull_requests_count_graphql(*repo_full_name.split('/'))
124 |         stars = repo['stargazers_count']
125 |         print(f'{repo_full_name} -- {stars} -- {pr_count} -- {repo_description}')
126 |         with open(out_file, mode='a', newline='') as out_fp:
127 |             writer = csv.writer(out_fp)
128 |             writer.writerow([repo_full_name, stars, pr_count, repo_description])
129 |             out_fp.flush()
130 | 
131 | 
132 | if __name__ == "__main__":
133 |     main()
134 | 


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/1998.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 1998,
 3 |     "log_file": "data/results/marshmallow/1998_2024-11-23 09:16:16.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 2:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n    duration = fields.TimeDelta(precision='minutes', serialization_type=float)\n\ndata = {'duration': dt.timedelta(minutes=2)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, minutes):\", result)",
 8 |                 "old_output": "Serialized (float, minutes): {'duration': 2}\n",
 9 |                 "new_output": "Serialized (float, minutes): {'duration': 2.0}\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "serialized value now is a float"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 4:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n    duration = fields.TimeDelta(precision='hours', serialization_type=float)\n\ndata = {'duration': dt.timedelta(hours=1, seconds=1200)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, hours):\", result)",
17 |                 "old_output": "Serialized (float, hours): {'duration': 1}\n",
18 |                 "new_output": "Serialized (float, hours): {'duration': 1.3333333333333333}\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "serialized value now is a float"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Example 6:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n    duration = fields.TimeDelta(precision='weeks', serialization_type=float)\n\ndata = {'duration': dt.timedelta(weeks=1)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, weeks):\", result)",
26 |                 "old_output": "Serialized (float, weeks): {'duration': 1}\n",
27 |                 "new_output": "Serialized (float, weeks): {'duration': 1.0}\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "serialized value now is a float"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "# Example 8:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n    duration = fields.TimeDelta(precision='seconds', serialization_type=float)\n\ndata = {'duration': dt.timedelta(seconds=60.5)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, seconds):\", result)",
35 |                 "old_output": "Serialized (float, seconds): {'duration': 60}\n",
36 |                 "new_output": "Serialized (float, seconds): {'duration': 60.5}\n"
37 |             },
38 |             "label": "intended",
39 |             "comment": "serialized value now is a float"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "# Example 10:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n    duration = fields.TimeDelta(precision='milliseconds', serialization_type=float)\n\ndata = {'duration': dt.timedelta(milliseconds=1500)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, milliseconds):\", result)",
44 |                 "old_output": "Serialized (float, milliseconds): {'duration': 1500}\n",
45 |                 "new_output": "Serialized (float, milliseconds): {'duration': 1500.0}\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "serialized value now is a float"
49 |         },
50 |         {
51 |             "test": {
52 |                 "test_code": "# Example 14:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n    duration = fields.TimeDelta(precision='seconds', serialization_type=float)\n\ndata = {'duration': dt.timedelta(seconds=-1)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (negative duration):\", result)",
53 |                 "old_output": "Serialized (negative duration): {'duration': -1}\n",
54 |                 "new_output": "Serialized (negative duration): {'duration': -1.0}\n"
55 |             },
56 |             "label": "intended",
57 |             "comment": "serialized value now is a float"
58 |         }
59 |     ]
60 | }


--------------------------------------------------------------------------------
/src/testora/llms/OpenAIGPT.py:
--------------------------------------------------------------------------------
 1 | from json import JSONDecodeError
 2 | import time
 3 | from typing import List
 4 | from openai import OpenAI, RateLimitError
 5 | from testora.prompts.PromptCommon import system_message
 6 | from testora.util.Logs import append_event, LLMEvent
 7 | from testora.Config import model_version
 8 | 
 9 | if model_version.startswith("gpt"):
10 |     with open(".openai_token", "r") as f:
11 |         openai_key = f.read().strip()
12 |         openai = OpenAI(api_key=openai_key)
13 | elif model_version.startswith("deepseek"):
14 |     with open(".openrouter_token", "r") as f:
15 |         openrouter_key = f.read().strip()
16 |         openai = OpenAI(api_key=openrouter_key,
17 |                         base_url="https://openrouter.ai/api/v1")
18 | 
19 | 
20 | class OpenAIGPT:
21 |     def __init__(self):
22 |         self.model = model_version
23 | 
24 |     def query(self, prompt, nb_samples=1, temperature=1) -> List:
25 |         user_message = prompt.create_prompt()
26 |         if len(user_message) > 30000:
27 |             append_event(LLMEvent(pr_nb=-1,
28 |                                   message=f"Query too long",
29 |                                   content=f"System message:\n{system_message}\nUser message:\n{user_message}"))
30 |             return [""]
31 | 
32 |         append_event(LLMEvent(pr_nb=-1,
33 |                               message=f"Querying {self.model}",
34 |                               content=f"System message:\n{system_message}\nUser message:\n{user_message}"))
35 | 
36 |         while True:
37 |             try:
38 |                 if prompt.use_json_output:
39 |                     completion = openai.chat.completions.create(
40 |                         model=self.model,
41 |                         messages=[
42 |                             {"role": "system", "content": system_message},
43 |                             {"role": "user", "content": user_message}
44 |                         ],
45 |                         n=nb_samples,
46 |                         response_format={"type": "json_object"},
47 |                         temperature=temperature
48 |                     )
49 |                     break
50 |                 else:
51 |                     completion = openai.chat.completions.create(
52 |                         model=self.model,
53 |                         messages=[
54 |                             {"role": "system", "content": system_message},
55 |                             {"role": "user", "content": user_message}
56 |                         ],
57 |                         n=nb_samples,
58 |                         temperature=temperature
59 |                     )  # type: ignore[call-overload]
60 | 
61 |                     # handle errors that lead to no model being called
62 |                     if completion.model is None:
63 |                         append_event(LLMEvent(pr_nb=-1,
64 |                                               message=f"Failed to get completion",
65 |                                               content=f"Will try again in 1 second"))
66 |                         time.sleep(1)
67 |                         continue
68 | 
69 |                     append_event(LLMEvent(pr_nb=-1,
70 |                                           message=f"Token usage",
71 |                                           content=f"prompt={completion.usage.prompt_tokens}, completion={completion.usage.completion_tokens}"))
72 | 
73 |                     answers = []
74 |                     for choice in completion.choices:
75 |                         answers.append(choice.message.content)
76 | 
77 |                     # handle errors that lead to empty answers
78 |                     if "" in answers:
79 |                         append_event(LLMEvent(pr_nb=-1,
80 |                                               message=f"Empty answer",
81 |                                               content=f"Will try again in 1 second"))
82 |                         time.sleep(1)
83 |                         continue
84 | 
85 |                     return answers
86 | 
87 |             except RateLimitError as e:
88 |                 append_event(LLMEvent(pr_nb=-1,
89 |                                       message=f"Rate limit exceeded",
90 |                                       content=f"Will try again in 60 seconds"))
91 |                 time.sleep(60)
92 |             except JSONDecodeError as e:
93 |                 append_event(LLMEvent(pr_nb=-1,
94 |                                       message=f"JSON decode error",
95 |                                       content=f"Will try again in 1 second"))
96 |                 time.sleep(1)
97 | 
98 |         raise Exception("Should not reach this point")
99 | 


--------------------------------------------------------------------------------
/data/ground_truth/scipy/21629.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 21629,
 3 |     "log_file": "data/results/scipy/21629_2024-11-23 09:06:44.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Corner Case Example 2:\nimport numpy as np\nfrom scipy.special import spherical_yn\n\nn = 1\nz = None\ntry:\n    result = spherical_yn(n, z)\nexcept Exception as e:\n    print(f\"spherical_yn({n}, {z}) raised an exception: {e}\")",
 8 |                 "old_output": "spherical_yn(1, None) raised an exception: ufunc '_spherical_yn' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''\n",
 9 |                 "new_output": "spherical_yn(1, None) raised an exception: '>=' not supported between instances of 'NoneType' and 'int'\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "different errors messages that both are about the same invalid input"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Corner Case Example 4:\nimport numpy as np\nfrom scipy.special import spherical_kn\n\nn = 0\nz = -1.0\nresult = spherical_kn(n, z)\nprint(f\"spherical_kn({n}, {z}) = {result}\")",
17 |                 "old_output": "spherical_kn(0, -1.0) = nan\n",
18 |                 "new_output": "spherical_kn(0, -1.0) = -4.269867111336788\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Corner Case Example 7:\nimport numpy as np\nfrom scipy.special import spherical_in\n\nn = 1\nz = -0.5\nresult = spherical_in(n, z)\nprint(f\"spherical_in({n}, {z}) = {result}\")",
26 |                 "old_output": "spherical_in(1, -0.5) = nan\n",
27 |                 "new_output": "spherical_in(1, -0.5) = -0.17087070843777216\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "# Corner Case Example 8:\nimport numpy as np\nfrom scipy.special import spherical_kn\n\nn = 1\nz = -100.0\nresult = spherical_kn(n, z)\nprint(f\"spherical_kn({n}, {z}) = {result}\")",
35 |                 "old_output": "spherical_kn(1, -100.0) = nan\n",
36 |                 "new_output": "spherical_kn(1, -100.0) = -4.18025968703559e+41\n"
37 |             },
38 |             "label": "intended",
39 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "# Example 9:\nimport numpy as np\nfrom scipy.special import spherical_jn\n\nn = 3\nz = -1.0  # Negative input to see reflection behavior\nresult = spherical_jn(n, z)\nprint(f\"spherical_jn({n}, {z}) = {result}\")",
44 |                 "old_output": "spherical_jn(3, -1.0) = nan\n",
45 |                 "new_output": "spherical_jn(3, -1.0) = -0.009006581117112524\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
49 |         },
50 |         {
51 |             "test": {
52 |                 "test_code": "# Example 15:\nimport numpy as np\nfrom scipy.special import spherical_jn\n\nn = 1\nz = np.array([-1.0, np.nan, 1.0])  # Mixed values\nresult = spherical_jn(n, z)\nprint(f\"spherical_jn({n}, [-1.0, nan, 1.0]) = {result}\")",
53 |                 "old_output": "spherical_jn(1, [-1.0, nan, 1.0]) = [       nan        nan 0.30116868]\n",
54 |                 "new_output": "spherical_jn(1, [-1.0, nan, 1.0]) = [-0.30116868         nan  0.30116868]\n"
55 |             },
56 |             "label": "intended",
57 |             "comment": "new version reduces NaNs for negative z input, which is the intention of the PR"
58 |         },
59 |         {
60 |             "test": {
61 |                 "test_code": "# Example 17:\nimport numpy as np\nfrom scipy.special import spherical_in\n\nn = 0\nz = np.array([1e10, -1e10])  # Large values\nresult = spherical_in(n, z)\nprint(f\"spherical_in({n}, [1e10, -1e10]) = {result}\")",
62 |                 "old_output": "spherical_in(0, [1e10, -1e10]) = [inf nan]\n",
63 |                 "new_output": "spherical_in(0, [1e10, -1e10]) = [inf inf]\n"
64 |             },
65 |             "label": "intended",
66 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
67 |         }
68 |     ]
69 | }


--------------------------------------------------------------------------------
/src/testora/evaluation/ResultsManager.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from datetime import datetime
  3 | 
  4 | 
  5 | results_base_dir = "data/results/"
  6 | classification_results_base_dir = "data/classification_results/"
  7 | 
  8 | 
  9 | def result_files():
 10 |     for project_dir in os.listdir(results_base_dir):
 11 |         for pr_result_file in os.listdir(os.path.join(results_base_dir, project_dir)):
 12 |             if pr_result_file.endswith(".json"):
 13 |                 yield os.path.join(results_base_dir, project_dir, pr_result_file)
 14 | 
 15 | 
 16 | def result_files_for_project(project_name, minimum_timestamp=None, is_classification=False, file_name=None, base_dir_arg=None):
 17 |     if base_dir_arg:
 18 |         base_dir = base_dir_arg
 19 |     else:
 20 |         base_dir = classification_results_base_dir if is_classification else results_base_dir
 21 |     
 22 |     for pr_result_file in os.listdir(os.path.join(base_dir, project_name)):
 23 |         if pr_result_file.endswith(".json"):
 24 |             if minimum_timestamp:
 25 |                 pr_timestamp = pr_result_file.replace(
 26 |                     ".json", "").split("_")[1]
 27 |                 if datetime.strptime(pr_timestamp, "%Y-%m-%d %H:%M:%S") < datetime.strptime(minimum_timestamp, "%Y-%m-%d %H:%M:%S"):
 28 |                     continue
 29 | 
 30 |             if file_name and pr_result_file != file_name:
 31 |                 continue
 32 | 
 33 |             yield os.path.join(base_dir, project_name, pr_result_file)
 34 | 
 35 | 
 36 | def current_results(include_archive=True, is_classification=False):
 37 |     base_dir = classification_results_base_dir if is_classification else results_base_dir
 38 | 
 39 |     project_to_prs_and_timestamps = {}
 40 |     for project_dir in os.listdir(base_dir):
 41 |         project_to_prs_and_timestamps[project_dir] = []
 42 |         result_dirs = [os.path.join(base_dir, project_dir)]
 43 |         if include_archive:
 44 |             archive_dir = os.path.join(base_dir, project_dir, "archive")
 45 |             if not os.path.exists(archive_dir):
 46 |                 os.makedirs(archive_dir)
 47 |                 print(f"Created directory {archive_dir}")
 48 |             result_dirs.append(archive_dir)
 49 |         for result_dir in result_dirs:
 50 |             for pr_result_file in os.listdir(result_dir):
 51 |                 if pr_result_file.endswith(".json"):
 52 |                     pr_nb, timestamp = pr_result_file.replace(
 53 |                         ".json", "").split("_")
 54 |                     project_to_prs_and_timestamps[project_dir].append(
 55 |                         [pr_nb, timestamp])
 56 |     return project_to_prs_and_timestamps
 57 | 
 58 | 
 59 | def add_result(project_name, pr_nb, timestamp, result, is_classification):
 60 |     base_dir = classification_results_base_dir if is_classification else results_base_dir
 61 | 
 62 |     if not os.path.exists(base_dir):
 63 |         os.makedirs(base_dir)
 64 |         print(f"Created directory {base_dir}")
 65 | 
 66 |     all_old_results = current_results(is_classification=is_classification)
 67 |     non_archive_old_results = current_results(
 68 |         False, is_classification=is_classification)
 69 | 
 70 |     # check if result already exists
 71 |     for old_pr_nb, old_timestamp in all_old_results[project_name]:
 72 |         if old_pr_nb == pr_nb and old_timestamp == timestamp:
 73 |             return
 74 | 
 75 |     # Write new result to file
 76 |     if not os.path.exists(os.path.join(base_dir, project_name)):
 77 |         os.makedirs(os.path.join(base_dir, project_name))
 78 | 
 79 |     target_file = os.path.join(base_dir, project_name,
 80 |                                f"{pr_nb}_{timestamp}.json")
 81 |     with open(target_file, "w") as f:
 82 |         f.write(result)
 83 | 
 84 |     # Check if it replaces an old result (if yes, move old result to archive)
 85 |     for old_pr_nb, old_timestamp in non_archive_old_results[project_name]:
 86 |         if old_pr_nb == pr_nb:
 87 |             old_target_file = os.path.join(base_dir, project_name,
 88 |                                            f"{old_pr_nb}_{old_timestamp}.json")
 89 |             archive_dir = os.path.join(base_dir, project_name, "archive")
 90 |             if not os.path.exists(archive_dir):
 91 |                 os.makedirs(archive_dir)
 92 |                 print(f"Created directory {archive_dir}")
 93 |             renamed_target_file = os.path.join(
 94 |                 archive_dir, f"{old_pr_nb}_{old_timestamp}.json")
 95 |             os.rename(old_target_file, renamed_target_file)
 96 |             print(f"Moved old result to {renamed_target_file}")
 97 |             break
 98 | 
 99 |     print(f"New result in {target_file}")
100 | 


--------------------------------------------------------------------------------
/src/testora/webui/WebUI.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter
  2 | from typing import Dict, List
  3 | from flask import Flask, render_template
  4 | import argparse
  5 | from datetime import timedelta
  6 | from testora.util.LogParser import PRResult, parse_log_files, parse_time_stamp, pr_results_as_dict
  7 | 
  8 | app = Flask("Testora Web UI")
  9 | 
 10 | 
 11 | parser = argparse.ArgumentParser(description="Web UI for Testora")
 12 | parser.add_argument("--files", help="Log file(s) to process",
 13 |                     type=str, required=False, nargs="+")
 14 | 
 15 | pr_results: List[PRResult] = []
 16 | pr_number_to_result: Dict[int, PRResult] = {}
 17 | 
 18 | 
 19 | def summarize_status():
 20 |     summary = Counter()
 21 |     for pr_result in pr_results:
 22 |         summary["total"] += 1
 23 |         summary[pr_result.status()] += 1
 24 | 
 25 |     # add percentages
 26 |     for key in summary:
 27 |         if key != "total":
 28 |             percentage = (int(summary[key]) / summary['total']) * 100
 29 |             summary[key] = f"{summary[key]} ({percentage:.1f}%)"
 30 | 
 31 |     return summary
 32 | 
 33 | 
 34 | def compute_perf_stats(entries):
 35 |     total_time = parse_time_stamp(entries[-1]["timestamp"]) - \
 36 |         parse_time_stamp(entries[0]["timestamp"])
 37 | 
 38 |     message_prefix_to_timedelta = {}
 39 |     message_prefix_to_nb = Counter()
 40 |     previous_timestamp = None
 41 |     previous_message_prefix = None
 42 | 
 43 |     for entry in entries:
 44 |         if previous_timestamp is None:
 45 |             previous_timestamp = entry["timestamp"]
 46 |             previous_message_prefix = entry["message"].split(" ")[0]
 47 |         else:
 48 |             current_timestamp = entry["timestamp"]
 49 |             current_message_prefix = entry["message"].split(" ")[0]
 50 |             message_prefix_to_timedelta[previous_message_prefix] = message_prefix_to_timedelta.get(
 51 |                 previous_message_prefix, timedelta(0)) + (parse_time_stamp(current_timestamp) - parse_time_stamp(previous_timestamp))
 52 |             message_prefix_to_nb[previous_message_prefix] += 1
 53 |             previous_timestamp = current_timestamp
 54 |             previous_message_prefix = current_message_prefix
 55 | 
 56 |     # sort by time and keep only top-k
 57 |     message_prefix_to_timedelta = dict(
 58 |         sorted(message_prefix_to_timedelta.items(), key=lambda item: item[1], reverse=True)[:6])
 59 | 
 60 |     result = [["All", len(entries), total_time, total_time / len(entries)]]
 61 |     for message_prefix, time in message_prefix_to_timedelta.items():
 62 |         if message_prefix in ["Done", "Starting"]:
 63 |             continue
 64 |         result.append([message_prefix, message_prefix_to_nb[message_prefix], time,
 65 |                        time / message_prefix_to_nb[message_prefix]])
 66 | 
 67 |     return result
 68 | 
 69 | 
 70 | status_colors = {
 71 |     "unknown": "#FFFFE0",
 72 |     "checked": "#D3D3D3",
 73 |     "intended_change": "#CCFFCC",
 74 |     "coincidental_fix": "#CBC3E3",
 75 |     "regression": "#FFCCCC",
 76 | }
 77 | 
 78 | 
 79 | def nl2br(value):
 80 |     if type(value) == str:
 81 |         return value.replace("\n", "<br>")
 82 |     else:
 83 |         return value
 84 | 
 85 | 
 86 | app.jinja_env.filters["nl2br"] = nl2br
 87 | 
 88 | 
 89 | def escape_tags(value):
 90 |     if type(value) == str:
 91 |         return value.replace("<", "&lt;").replace(">", "&gt;")
 92 |     else:
 93 |         return value
 94 | 
 95 | 
 96 | app.jinja_env.filters["escape_tags"] = escape_tags
 97 | 
 98 | 
 99 | @app.route('/')
100 | def main_page():
101 |     global pr_results, pr_number_to_result
102 |     pr_results, _ = parse_log_files(args.files)
103 |     summary = summarize_status()
104 |     pr_number_to_result = pr_results_as_dict(pr_results)
105 |     return render_template("index.html", summary=summary, pr_results=pr_results, color_mapping=status_colors)
106 | 
107 | 
108 | @app.route('/pr<int:number>_log')
109 | def pr_log_page(number):
110 |     pr_result = pr_number_to_result[int(number)]
111 |     perf_stats = compute_perf_stats(pr_result.entries)
112 |     return render_template('pr_log.html', pr_result=pr_result, perf_stats=perf_stats)
113 | 
114 | @app.route('/pr<int:pr_number>_result<int:result_number>')
115 | def pr_result_page(pr_number, result_number):
116 |     pr_result = pr_number_to_result[int(pr_number)]
117 |     classification_result = pr_result.classification_results[int(result_number) - 1]
118 |     return render_template('pr_result.html', pr_result=pr_result, classification_result=classification_result)
119 | 
120 | 
121 | if __name__ == "__main__":
122 |     args = parser.parse_args()
123 |     app.run(debug=True, port=4000)
124 | 


--------------------------------------------------------------------------------
/src/multilspy/language_servers/omnisharp/workspace_did_change_configuration.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "RoslynExtensionsOptions": {
  3 |         "EnableDecompilationSupport": false,
  4 |         "EnableAnalyzersSupport": true,
  5 |         "EnableImportCompletion": true,
  6 |         "EnableAsyncCompletion": false,
  7 |         "DocumentAnalysisTimeoutMs": 30000,
  8 |         "DiagnosticWorkersThreadCount": 18,
  9 |         "AnalyzeOpenDocumentsOnly": true,
 10 |         "InlayHintsOptions": {
 11 |             "EnableForParameters": false,
 12 |             "ForLiteralParameters": false,
 13 |             "ForIndexerParameters": false,
 14 |             "ForObjectCreationParameters": false,
 15 |             "ForOtherParameters": false,
 16 |             "SuppressForParametersThatDifferOnlyBySuffix": false,
 17 |             "SuppressForParametersThatMatchMethodIntent": false,
 18 |             "SuppressForParametersThatMatchArgumentName": false,
 19 |             "EnableForTypes": false,
 20 |             "ForImplicitVariableTypes": false,
 21 |             "ForLambdaParameterTypes": false,
 22 |             "ForImplicitObjectCreation": false
 23 |         },
 24 |         "LocationPaths": null
 25 |     },
 26 |     "FormattingOptions": {
 27 |         "OrganizeImports": false,
 28 |         "EnableEditorConfigSupport": true,
 29 |         "NewLine": "\n",
 30 |         "UseTabs": false,
 31 |         "TabSize": 4,
 32 |         "IndentationSize": 4,
 33 |         "SpacingAfterMethodDeclarationName": false,
 34 |         "SeparateImportDirectiveGroups": false,
 35 |         "SpaceWithinMethodDeclarationParenthesis": false,
 36 |         "SpaceBetweenEmptyMethodDeclarationParentheses": false,
 37 |         "SpaceAfterMethodCallName": false,
 38 |         "SpaceWithinMethodCallParentheses": false,
 39 |         "SpaceBetweenEmptyMethodCallParentheses": false,
 40 |         "SpaceAfterControlFlowStatementKeyword": true,
 41 |         "SpaceWithinExpressionParentheses": false,
 42 |         "SpaceWithinCastParentheses": false,
 43 |         "SpaceWithinOtherParentheses": false,
 44 |         "SpaceAfterCast": false,
 45 |         "SpaceBeforeOpenSquareBracket": false,
 46 |         "SpaceBetweenEmptySquareBrackets": false,
 47 |         "SpaceWithinSquareBrackets": false,
 48 |         "SpaceAfterColonInBaseTypeDeclaration": true,
 49 |         "SpaceAfterComma": true,
 50 |         "SpaceAfterDot": false,
 51 |         "SpaceAfterSemicolonsInForStatement": true,
 52 |         "SpaceBeforeColonInBaseTypeDeclaration": true,
 53 |         "SpaceBeforeComma": false,
 54 |         "SpaceBeforeDot": false,
 55 |         "SpaceBeforeSemicolonsInForStatement": false,
 56 |         "SpacingAroundBinaryOperator": "single",
 57 |         "IndentBraces": false,
 58 |         "IndentBlock": true,
 59 |         "IndentSwitchSection": true,
 60 |         "IndentSwitchCaseSection": true,
 61 |         "IndentSwitchCaseSectionWhenBlock": true,
 62 |         "LabelPositioning": "oneLess",
 63 |         "WrappingPreserveSingleLine": true,
 64 |         "WrappingKeepStatementsOnSingleLine": true,
 65 |         "NewLinesForBracesInTypes": true,
 66 |         "NewLinesForBracesInMethods": true,
 67 |         "NewLinesForBracesInProperties": true,
 68 |         "NewLinesForBracesInAccessors": true,
 69 |         "NewLinesForBracesInAnonymousMethods": true,
 70 |         "NewLinesForBracesInControlBlocks": true,
 71 |         "NewLinesForBracesInAnonymousTypes": true,
 72 |         "NewLinesForBracesInObjectCollectionArrayInitializers": true,
 73 |         "NewLinesForBracesInLambdaExpressionBody": true,
 74 |         "NewLineForElse": true,
 75 |         "NewLineForCatch": true,
 76 |         "NewLineForFinally": true,
 77 |         "NewLineForMembersInObjectInit": true,
 78 |         "NewLineForMembersInAnonymousTypes": true,
 79 |         "NewLineForClausesInQuery": true
 80 |     },
 81 |     "FileOptions": {
 82 |         "SystemExcludeSearchPatterns": [
 83 |             "**/node_modules/**/*",
 84 |             "**/bin/**/*",
 85 |             "**/obj/**/*",
 86 |             "**/.git/**/*",
 87 |             "**/.git",
 88 |             "**/.svn",
 89 |             "**/.hg",
 90 |             "**/CVS",
 91 |             "**/.DS_Store",
 92 |             "**/Thumbs.db"
 93 |         ],
 94 |         "ExcludeSearchPatterns": []
 95 |     },
 96 |     "RenameOptions": {
 97 |         "RenameOverloads": false,
 98 |         "RenameInStrings": false,
 99 |         "RenameInComments": false
100 |     },
101 |     "ImplementTypeOptions": {
102 |         "InsertionBehavior": 0,
103 |         "PropertyGenerationBehavior": 0
104 |     },
105 |     "DotNetCliOptions": {
106 |         "LocationPaths": null
107 |     },
108 |     "Plugins": {
109 |         "LocationPaths": null
110 |     }
111 | }


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/2244.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 2244,
 3 |     "log_file": "data/results/marshmallow/2244_2024-11-23 09:16:17.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 5:\nfrom marshmallow import Schema, fields\n\nclass URLSchema(Schema):\n    url = fields.URL()\n\nvalid_data = {\"url\": \"http://@example.com\"}\nresult = URLSchema().load(valid_data)",
 8 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 8, in <module>\n    result = URLSchema().load(valid_data)\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 722, in load\n    return self._do_load(\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n    raise exc\nmarshmallow.exceptions.ValidationError: {'url': ['Not a valid URL.']}\n",
 9 |                 "new_output": ""
10 |             },
11 |             "label": "intended",
12 |             "comment": "it's a legal URL"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 3:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n    url = fields.URL()\n\ntry:\n    invalid_data = {\"url\": \"http://^@example.com\"}\n    URLSchema().load(invalid_data)\nexcept ValidationError as err:\n    print(\"Validation Error for 'http://^@example.com':\", err.messages)",
17 |                 "old_output": "",
18 |                 "new_output": "Validation Error for 'http://^@example.com': {'url': ['Not a valid URL.']}\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "it's an illegal URL (as specified in the newly added test)"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Example 4:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n    url = fields.URL()\n\ntry:\n    invalid_data = {\"url\": \"http://%0G@example.com\"}\n    URLSchema().load(invalid_data)\nexcept ValidationError as err:\n    print(\"Validation Error for 'http://%0G@example.com':\", err.messages)",
26 |                 "old_output": "",
27 |                 "new_output": "Validation Error for 'http://%0G@example.com': {'url': ['Not a valid URL.']}\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "it's an illegal URL"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "# Example 5:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n    url = fields.URL()\n\ntry:\n    invalid_data = {\"url\": \"http://%@example.com\"}",
35 |                 "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: SyntaxError: expected 'except' or 'finally' block (BugGPT_test_code.py, line 8)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n",
36 |                 "new_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: SyntaxError: expected 'except' or 'finally' block (BugGPT_test_code.py, line 8)\n"
37 |             },
38 |             "label": "intended",
39 |             "comment": "different order of error messages"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "# Example 6:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n    url = fields.URL()\n\ntry:\n    invalid_data = {\"url\": \"http://:pass@example.com\"}\n    URLSchema().load(invalid_data)\nexcept ValidationError as err:",
44 |                 "old_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n",
45 |                 "new_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n  self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "different order of error messages"
49 |         }
50 |     ]
51 | }


--------------------------------------------------------------------------------
/data/ground_truth/marshmallow/2022.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "pr_number": 2022,
 3 |     "log_file": "data/results/marshmallow/2022_2024-11-23 09:16:16.json",
 4 |     "differentiating_tests": [
 5 |         {
 6 |             "test": {
 7 |                 "test_code": "# Example 3: Serialize using timestamp format\nfrom marshmallow import Schema, fields\nimport datetime\n\nclass EventSchema(Schema):\n    timestamp = fields.DateTime(format='timestamp')\n\ndata = {'timestamp': datetime.datetime(2023, 10, 1, 10, 0, 0)}\nschema = EventSchema()\nresult = schema.dump(data)\nprint(\"Serialized Timestamp:\", result)",
 8 |                 "old_output": "Serialized Timestamp: {'timestamp': 'timestamp'}\n",
 9 |                 "new_output": "Serialized Timestamp: {'timestamp': 1696154400.0}\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "'timestamp' field now contains a valid timestamp value"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 5: Deserialization using timestamp\nfrom marshmallow import Schema, fields\n\nclass EventSchema(Schema):\n    timestamp = fields.DateTime(format='timestamp')\n\ndata = {'timestamp': 1696156800}  # POSIX timestamp for 2023-10-01 10:00:00 UTC\nschema = EventSchema()\nresult = schema.load(data)",
17 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 9, in <module>\n    result = schema.load(data)\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 722, in load\n    return self._do_load(\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n    raise exc\nmarshmallow.exceptions.ValidationError: {'timestamp': ['Not a valid datetime.']}\n",
18 |                 "new_output": ""
19 |             },
20 |             "label": "intended",
21 |             "comment": "old version rejects invalid timestamp value"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Example 6: Custom timestamp format with milliseconds\nfrom marshmallow import Schema, fields\n\nclass EventSchema(Schema):\n    timestamp = fields.DateTime(format='timestamp_ms')\n\ndata = {'timestamp': 1696156800000}  # POSIX timestamp in milliseconds\nschema = EventSchema()\nresult = schema.load(data)",
26 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 9, in <module>\n    result = schema.load(data)\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 722, in load\n    return self._do_load(\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n    raise exc\nmarshmallow.exceptions.ValidationError: {'timestamp': ['Not a valid datetime.']}\n",
27 |                 "new_output": ""
28 |             },
29 |             "label": "intended",
30 |             "comment": "old version rejects invalid timestamp value"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "# Example 9: Passing a valid timestamp and checking the response\nfrom marshmallow import Schema, fields\n\nclass EventSchema(Schema):\n    timestamp = fields.DateTime(format='timestamp')\n\ndata = {'timestamp': 1696156800}  # POSIX timestamp for 2023-10-01 10:00:00 UTC\nschema = EventSchema()\nresult = schema.load(data)",
35 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 9, in <module>\n    result = schema.load(data)\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 722, in load\n    return self._do_load(\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n    raise exc\nmarshmallow.exceptions.ValidationError: {'timestamp': ['Not a valid datetime.']}\n",
36 |                 "new_output": ""
37 |             },
38 |             "label": "intended",
39 |             "comment": "old version rejects invalid timestamp value"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "# Example 10: Deserializing a valid timestamp with milliseconds\nfrom marshmallow import Schema, fields\n\nclass EventSchema(Schema):\n    timestamp = fields.DateTime(format='timestamp_ms')\n\ndata = {'timestamp': 1696156800000}  # POSIX timestamp in milliseconds\nschema = EventSchema()\nresult = schema.load(data)",
44 |                 "old_output": "Traceback (most recent call last):\n  File \"/tmp/BugGPT/BugGPT_test_code.py\", line 9, in <module>\n    result = schema.load(data)\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 722, in load\n    return self._do_load(\n  File \"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n    raise exc\nmarshmallow.exceptions.ValidationError: {'timestamp': ['Not a valid datetime.']}\n",
45 |                 "new_output": ""
46 |             },
47 |             "label": "intended",
48 |             "comment": "old version rejects invalid timestamp value"
49 |         }
50 |     ]
51 | }


--------------------------------------------------------------------------------
/src/multilspy/language_servers/jedi_language_server/jedi_server.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Provides Python specific instantiation of the LanguageServer class. Contains various configurations and settings specific to Python.
  3 | """
  4 | 
  5 | import json
  6 | import logging
  7 | import os
  8 | import pathlib
  9 | from contextlib import asynccontextmanager
 10 | from typing import AsyncIterator
 11 | 
 12 | from multilspy.multilspy_logger import MultilspyLogger
 13 | from multilspy.language_server import LanguageServer
 14 | from multilspy.lsp_protocol_handler.server import ProcessLaunchInfo
 15 | from multilspy.lsp_protocol_handler.lsp_types import InitializeParams
 16 | from multilspy.multilspy_config import MultilspyConfig
 17 | 
 18 | 
 19 | class JediServer(LanguageServer):
 20 |     """
 21 |     Provides Python specific instantiation of the LanguageServer class. Contains various configurations and settings specific to Python.
 22 |     """
 23 | 
 24 |     def __init__(self, config: MultilspyConfig, logger: MultilspyLogger, repository_root_path: str):
 25 |         """
 26 |         Creates a JediServer instance. This class is not meant to be instantiated directly. Use LanguageServer.create() instead.
 27 |         """
 28 |         super().__init__(
 29 |             config,
 30 |             logger,
 31 |             repository_root_path,
 32 |             ProcessLaunchInfo(cmd="jedi-language-server", cwd=repository_root_path),
 33 |             "python",
 34 |         )
 35 | 
 36 |     def _get_initialize_params(self, repository_absolute_path: str) -> InitializeParams:
 37 |         """
 38 |         Returns the initialize params for the Jedi Language Server.
 39 |         """
 40 |         with open(os.path.join(os.path.dirname(__file__), "initialize_params.json"), "r") as f:
 41 |             d = json.load(f)
 42 | 
 43 |         del d["_description"]
 44 | 
 45 |         d["processId"] = os.getpid()
 46 |         assert d["rootPath"] == "$rootPath"
 47 |         d["rootPath"] = repository_absolute_path
 48 | 
 49 |         assert d["rootUri"] == "$rootUri"
 50 |         d["rootUri"] = pathlib.Path(repository_absolute_path).as_uri()
 51 | 
 52 |         assert d["workspaceFolders"][0]["uri"] == "$uri"
 53 |         d["workspaceFolders"][0]["uri"] = pathlib.Path(repository_absolute_path).as_uri()
 54 | 
 55 |         assert d["workspaceFolders"][0]["name"] == "$name"
 56 |         d["workspaceFolders"][0]["name"] = os.path.basename(repository_absolute_path)
 57 | 
 58 |         return d
 59 | 
 60 |     @asynccontextmanager
 61 |     async def start_server(self) -> AsyncIterator["JediServer"]:
 62 |         """
 63 |         Starts the JEDI Language Server, waits for the server to be ready and yields the LanguageServer instance.
 64 | 
 65 |         Usage:
 66 |         ```
 67 |         async with lsp.start_server():
 68 |             # LanguageServer has been initialized and ready to serve requests
 69 |             await lsp.request_definition(...)
 70 |             await lsp.request_references(...)
 71 |             # Shutdown the LanguageServer on exit from scope
 72 |         # LanguageServer has been shutdown
 73 |         ```
 74 |         """
 75 | 
 76 |         async def execute_client_command_handler(params):
 77 |             return []
 78 | 
 79 |         async def do_nothing(params):
 80 |             return
 81 | 
 82 |         async def check_experimental_status(params):
 83 |             if params["quiescent"] == True:
 84 |                 self.completions_available.set()
 85 | 
 86 |         async def window_log_message(msg):
 87 |             self.logger.log(f"LSP: window/logMessage: {msg}", logging.INFO)
 88 | 
 89 |         self.server.on_request("client/registerCapability", do_nothing)
 90 |         self.server.on_notification("language/status", do_nothing)
 91 |         self.server.on_notification("window/logMessage", window_log_message)
 92 |         self.server.on_request("workspace/executeClientCommand", execute_client_command_handler)
 93 |         self.server.on_notification("$/progress", do_nothing)
 94 |         self.server.on_notification("textDocument/publishDiagnostics", do_nothing)
 95 |         self.server.on_notification("language/actionableNotification", do_nothing)
 96 |         self.server.on_notification("experimental/serverStatus", check_experimental_status)
 97 | 
 98 |         async with super().start_server():
 99 |             self.logger.log("Starting jedi-language-server server process", logging.INFO)
100 |             await self.server.start()
101 |             initialize_params = self._get_initialize_params(self.repository_root_path)
102 | 
103 |             self.logger.log(
104 |                 "Sending initialize request from LSP client to LSP server and awaiting response",
105 |                 logging.INFO,
106 |             )
107 |             init_response = await self.server.send.initialize(initialize_params)
108 |             assert init_response["capabilities"]["textDocumentSync"]["change"] == 2
109 |             assert "completionProvider" in init_response["capabilities"]
110 |             assert init_response["capabilities"]["completionProvider"] == {
111 |                 "triggerCharacters": [".", "'", '"'],
112 |                 "resolveProvider": True,
113 |             }
114 | 
115 |             self.server.notify.initialized({})
116 | 
117 |             yield self
118 | 
119 |             await self.server.shutdown()
120 |             await self.server.stop()
121 | 


--------------------------------------------------------------------------------
/src/testora/evaluation/PreparePRChunks.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from github import Github, Auth
 3 | from testora.RegressionFinder import get_merged_prs
 4 | from testora.evaluation import EvalTaskManager
 5 | 
 6 | 
 7 | def write_specific_PR_tasks_into_database(project_name, project_id, pr_numbers: List[int]):
 8 |     pr_numbers_to_analyze = pr_numbers
 9 |     EvalTaskManager.write_tasks(project_name, pr_numbers_to_analyze, "tasks")
10 | 
11 | 
12 | def write_range_of_PR_tasks_into_database(project_name, project_id, start_pr_nb, total):
13 |     print(f"Searching PRs for {project_name}")
14 | 
15 |     token = open(".github_token", "r").read().strip()
16 |     github = Github(auth=Auth.Token(token))
17 |     github_repo = github.get_repo(project_id)
18 | 
19 |     merged_prs = get_merged_prs(github_repo, max_prs=1)
20 |     most_recent_pr_nb = merged_prs[0].number
21 | 
22 |     print(f"Most recent PR number: {most_recent_pr_nb}")
23 |     result_pr_nbs = []
24 |     next_candidate_pr_nb = start_pr_nb
25 |     while next_candidate_pr_nb <= most_recent_pr_nb and len(result_pr_nbs) < total:
26 |         # check if nb is a PR
27 |         try:
28 |             pr = github_repo.get_pull(next_candidate_pr_nb)
29 |         except Exception:
30 |             # not a valid PR number
31 |             print(f"Skipping number {next_candidate_pr_nb}(not a valid PR number)")
32 |             next_candidate_pr_nb += 1
33 |             continue
34 | 
35 |         # check if PR is closed
36 |         if not pr.is_merged():
37 |             print(f"Skipping number {next_candidate_pr_nb} (PR not merged)")
38 |             next_candidate_pr_nb += 1
39 |             continue
40 | 
41 |         # found a valid PR number -- add to list
42 |         print(f"Adding PR number {next_candidate_pr_nb} into the list")
43 |         result_pr_nbs.append(next_candidate_pr_nb)
44 |         next_candidate_pr_nb += 1
45 | 
46 |     EvalTaskManager.write_tasks(project_name, result_pr_nbs, "tasks")
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     EvalTaskManager.initialize()
51 | 
52 |     # write_range_of_PR_tasks_into_database(
53 |     #     "pandas", "pandas-dev/pandas", 60322, 300)
54 | 
55 |     # write_range_of_PR_tasks_into_database(
56 |     #     "scipy", "scipy/scipy", 22031, 300)
57 | 
58 |     # write_range_of_PR_tasks_into_database(
59 |     #     "keras", "keras-team/keras", 20711, 300)
60 | 
61 |     # write_range_of_PR_tasks_into_database(
62 |     #     "marshmallow", "marshmallow-code/marshmallow", 2804, 300)
63 | 
64 |     # write_specific_PR_tasks_into_database("scipy", "scipy/scipy",
65 |     #                                       [23609, 23607, 23606, 23574, 23521, 23520, 23511, 23502, 23501, 23498, 23497, 23494, 23483, 23475, 23471, 23454, 23442, 23426, 23415, 23388, 23350, 23348, 23341, 23322, 23311, 23298, 23294, 23293, 23280, 23276, 23266, 23235, 23194, 23138, 23121, 23103, 23091, 23071, 23059, 23055, 23048, 23047, 23044, 23019, 23005, 22989, 22982, 22971, 22944, 22941, 22913, 22910, 22899, 22869, 22864, 22855, 22801, 22772, 22763, 22760, 22725, 22718, 22689, 22660, 22651, 22632, 22624, 22611, 22610, 22600, 22585, 22582, 22532, 22494, 22482, 22481, 22475, 22462, 22455, 22447, 22433, 22421, 22398, 22372, 22353, 22344, 22313, 22284, 22283, 22278, 22273, 22251, 22242, 22226, 22221, 22220, 22219, 22215, 22213, 22199])
66 | 
67 |     # write_specific_PR_tasks_into_database("pandas", "pandas-dev/pandas",
68 |     #                                       [62349, 62325, 62320, 62300, 62298, 62289, 62281, 62280, 62276, 62248, 62246, 62166, 62116, 62101, 62085, 62076, 62073, 62038, 62032, 62025, 61990, 61972, 61969, 61966, 61947, 61946, 61924, 61894, 61891, 61884, 61874, 61855, 61827, 61800, 61786, 61773, 61771, 61743, 61699, 61697, 61658, 61646, 61633, 61625, 61623, 61597, 61541, 61517, 61514, 61508, 61484, 61472, 61467, 61451, 61422, 61399, 61376, 61352, 61340, 61332, 61320, 61293, 61286, 61234, 61229, 61225, 61207, 61198, 61193, 61183, 61162, 61131, 61114, 61105, 61103, 61054, 61046, 61041, 61017, 61008, 60987, 60985, 60983, 60975, 60974, 60963, 60952, 60949, 60936, 60924, 60916, 60906, 60894, 60882, 60867, 60860, 60828, 60826, 60795, 60793])
69 | 
70 |     # write_specific_PR_tasks_into_database("keras", "keras-team/keras",
71 |     #                                       [21682, 21680, 21650, 21646, 21611, 21603, 21595, 21590, 21588, 21569, 21535, 21534, 21532, 21512, 21496, 21495, 21480, 21473, 21456, 21449, 21440, 21434, 21432, 21428, 21423, 21414, 21412, 21407, 21406, 21399, 21393, 21392, 21373, 21361, 21349, 21336, 21335, 21331, 21317, 21304, 21302, 21291, 21290, 21277, 21256, 21239, 21211, 21192, 21184, 21170, 21163, 21148, 21138, 21129, 21117, 21101, 21095, 21081, 21077, 21066, 21053, 21030, 21014, 21010, 20993, 20989, 20974, 20973, 20956, 20954, 20928, 20926, 20916, 20913, 20909, 20905, 20892, 20879, 20854, 20853, 20829, 20824, 20815, 20791, 20784, 20782, 20777, 20768, 20765, 20758, 20755, 20736, 20689, 20643, 20637, 20630, 20626, 20613, 20612, 20602])
72 | 
73 |     # write_specific_PR_tasks_into_database("marshmallow", "marshmallow-code/marshmallow",
74 |     #                                       [2803, 2800, 2798, 2797, 2770, 2769, 2764, 2762, 2756, 2755, 2754, 2742, 2741, 2731, 2712, 2706, 2701, 2700, 2699, 2698, 2271, 2264, 2246, 2244, 2215, 2164, 2153, 2081, 2071, 1882, 1868, 1785, 1745, 1702, 1682, 1627, 1574, 1551, 1524, 1501, 1500, 1480, 1448, 1446, 1444, 1443, 1416, 1405, 1401, 1399, 1395, 1392, 1376, 1359, 1354, 1344, 1343, 1340, 1331, 1307, 1306, 1293, 1288, 1276, 1252, 1246, 1209, 1189, 1136, 1087, 1079, 1078, 1063, 1049, 1036, 1010, 1008, 983, 982, 963, 960, 959, 954, 950, 931, 911, 903, 865, 857, 856, 826, 822, 816, 808, 769, 750, 744, 725, 714, 707])
75 | 


--------------------------------------------------------------------------------
/src/testora/execution/ProgramMerger.py:
--------------------------------------------------------------------------------
  1 | import ast
  2 | from textwrap import dedent
  3 | import re
  4 | 
  5 | from testora.util.Exceptions import TestoraException
  6 | 
  7 | 
  8 | def merge_programs(programs):
  9 |     function_def_snippets = []
 10 |     for program_idx, program in enumerate(programs):
 11 |         # Parse the snippet into an AST node
 12 |         try:
 13 |             parsed_snippet = ast.parse(dedent(program))
 14 |         except Exception as _:
 15 |             function_def_snippets.append(
 16 |                 f"def program_{program_idx}():\n    pass # Couldn't parse generated test")
 17 |             continue
 18 | 
 19 |         # Create a function definition for the parsed snippet
 20 |         function_def = ast.FunctionDef(
 21 |             name=f"program_{program_idx}",
 22 |             args=ast.arguments(
 23 |                 posonlyargs=[],
 24 |                 args=[],
 25 |                 vararg=None,
 26 |                 kwonlyargs=[],
 27 |                 kw_defaults=[],
 28 |                 kwarg=None,
 29 |                 defaults=[]
 30 |             ),
 31 |             body=parsed_snippet.body,
 32 |             decorator_list=[],
 33 |             returns=None
 34 |         )
 35 | 
 36 |         # Print function definitions to string
 37 |         code = ast.unparse(ast.fix_missing_locations(function_def))
 38 |         function_def_snippets.append(code)
 39 | 
 40 |     result = "import sys\nimport traceback\nimport io\n\n"
 41 |     for function_def_snippet in function_def_snippets:
 42 |         result += function_def_snippet + "\n\n"
 43 | 
 44 |     for fct_idx in range(len(function_def_snippets)):
 45 |         result += f"""print('XXXXX Program {fct_idx} starting XXXXX')
 46 | try:
 47 |     my_stdout = io.StringIO()
 48 |     my_stderr = io.StringIO()
 49 |     sys.stdout = my_stdout
 50 |     sys.stderr = my_stderr
 51 |     program_{fct_idx}()
 52 | except BaseException as e:
 53 |     details = traceback.format_exc()
 54 |     print(details, file=my_stderr)
 55 | finally:
 56 |     sys.stdout.flush()
 57 |     sys.stderr.flush()
 58 |     sys.stdout = sys.__stdout__
 59 |     sys.stderr = sys.__stderr__
 60 |     print(my_stdout.getvalue(), end="")
 61 |     print(my_stderr.getvalue(), end="")
 62 | print('XXXXX Program {fct_idx} done XXXXX')
 63 | """
 64 | 
 65 |     return result
 66 | 
 67 | 
 68 | program_start_pattern = re.compile(r"XXXXX Program (\d+) starting XXXXX")
 69 | program_end_pattern = re.compile(r"XXXXX Program (\d+) done XXXXX")
 70 | 
 71 | 
 72 | def separate_outputs(output):
 73 |     in_program = None
 74 |     current_output = None
 75 |     result = []
 76 |     for line in output.split("\n"):
 77 |         program_start_match = program_start_pattern.match(line)
 78 |         if program_start_match:
 79 |             in_program = int(program_start_match.group(1))
 80 |             current_output = ""
 81 |             continue
 82 |         program_end_match = program_end_pattern.match(line)
 83 |         if program_end_match:
 84 |             program_nb = int(program_end_match.group(1))
 85 |             if program_nb != in_program:
 86 |                 raise TestoraException(f"Unexpected output of merged tests:\n{str(output)}")
 87 |             in_program = None
 88 |             result.append(current_output)
 89 |         elif in_program is not None:
 90 |             current_output += line + "\n"
 91 |     return result
 92 | 
 93 | 
 94 | # for testing
 95 | if __name__ == "__main__":
 96 |     program1 = """
 97 | import pandas as pd
 98 | 
 99 | df = pd.DataFrame({'A': [1.112, 3.456, 7.890], 'B': [9.876, 5.432, 1.234]})
100 | rounded_df = df.round(1)
101 | print(rounded_df)
102 | """
103 | 
104 |     program2 = """
105 | import pandas as pd
106 | 
107 | series_strings = pd.Series(['a', 'b', 'c'])
108 | # This will result in an error as rounding is not applicable to strings
109 | try:
110 |     rounded_strings = series_strings.round(2)
111 |     print(rounded_strings)
112 | except TypeError as e:
113 |     print(f"Error: {e}")
114 | """
115 | 
116 |     program3 = """
117 | import pandas as pd
118 | 
119 | # Normal usage scenario
120 | data = [1.234, 2.345, 3.456]
121 | ser = pd.Series(data)
122 | rounded_ser = ser.round(decimals=1)
123 | print(rounded_ser)
124 | 
125 | # Normal usage scenario
126 | ser = pd.Series([-1.234, -2.345, -3.456])
127 | rounded_ser = ser.round()
128 | print(rounded_ser)
129 | 
130 | # Normal usage scenario
131 | ser = pd.Series([5.678, 6.789, 7.890])
132 | rounded_ser = ser.round(decimals=2)
133 | print(rounded_ser)
134 | 
135 | # Normal usage scenario
136 | ser = pd.Series([1000, 2000, 3000])
137 | rounded_ser = ser.round(decimals=-2)
138 | print(rounded_ser)
139 | """
140 | 
141 |     program4 = """
142 | import pandas as pd
143 | import numpy as np
144 | 
145 | data = np.array([1.234, 2.345, 3.456])
146 | ser = pd.Series(data)
147 | print(ser)
148 | r = ser / zero
149 | print(r)
150 | """
151 | 
152 |     result = merge_programs([program1, program2, program3, program4])
153 |     print(result)
154 | 
155 |     output = """
156 | XXXXX Program 0 starting XXXXX
157 |      A    B
158 | 0  1.1  9.9
159 | 1  3.5  5.4
160 | 2  7.9  1.2
161 | XXXXX Program 0 done XXXXX
162 | XXXXX Program 1 starting XXXXX
163 | 0    a
164 | 1    b
165 | 2    c
166 | dtype: object
167 | XXXXX Program 1 done XXXXX
168 | XXXXX Program 2 starting XXXXX
169 | 0    1.2
170 | 1    2.3
171 | 2    3.5
172 | dtype: float64
173 | 0   -1.0
174 | 1   -2.0
175 | 2   -3.0
176 | dtype: float64
177 | 0    5.68
178 | 1    6.79
179 | 2    7.89
180 | dtype: float64
181 | 0    1000
182 | 1    2000
183 | 2    3000
184 | dtype: int64
185 | XXXXX Program 2 done XXXXX
186 | XXXXX Program 3 starting XXXXX
187 | 0    1.234
188 | 1    2.345
189 | 2    3.456
190 | dtype: float64
191 | Traceback (most recent call last):
192 |   File "/tmp/TestRemoveMe.py", line 74, in <module>
193 |     program_3()
194 |   File "/tmp/TestRemoveMe.py", line 41, in program_3
195 |     r = ser / zero
196 |               ^^^^
197 | NameError: name 'zero' is not defined
198 | 
199 | XXXXX Program 3 done XXXXX
200 | """
201 | 
202 |     split_outputs = separate_outputs(output)
203 |     for idx, split_output in enumerate(split_outputs):
204 |         print(f"Program {idx} output:")
205 |         print(split_output)
206 |         print()
207 | 


--------------------------------------------------------------------------------
/src/testora/util/ClonedRepoManager.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | import json
  3 | from os.path import exists
  4 | from pathlib import Path
  5 | import shutil
  6 | import subprocess
  7 | from typing import List
  8 | from git import Repo
  9 | 
 10 | from testora.util.PythonLanguageServer import PythonLanguageServer
 11 | 
 12 | 
 13 | @dataclass
 14 | class ClonedRepo:
 15 |     repo: Repo
 16 |     container_name: str
 17 |     language_server: PythonLanguageServer
 18 | 
 19 | 
 20 | class ClonedRepoManager:
 21 |     nb_clones = 3
 22 | 
 23 |     def __init__(self, pool_dir, repo_name, repo_id, container_base_name, module_name):
 24 |         self.pool_dir = pool_dir
 25 |         self.repo_name = repo_name
 26 |         self.repo_id = repo_id
 27 |         self.container_base_name = container_base_name
 28 |         self.module_name = module_name
 29 | 
 30 |         self.clone_state_file = f"{self.pool_dir}/clone_state.json"
 31 |         self._read_clone_state()
 32 | 
 33 |         self.usage_order: List[str] = [f"clone{i}" for i in range(
 34 |             1, self.nb_clones + 1)]  # last = last used
 35 | 
 36 |         self._reset_and_clean_all_clones()
 37 | 
 38 |         # start one language server for each clone
 39 |         self.clone_id_to_language_server = {}
 40 |         for i in range(1, self.nb_clones + 1):
 41 |             server = PythonLanguageServer(
 42 |                 f"{self.pool_dir}/clone{i}/{self.repo_name}")
 43 |             self.clone_id_to_language_server[f"clone{i}"] = server
 44 | 
 45 |     def _read_clone_state(self):
 46 |         if not exists(self.clone_state_file):
 47 |             self.clone_id_to_state = {
 48 |                 f"clone{i}": {"commit": "unknown", "container_name": f"{self.container_base_name}{i}"} for i in range(1, self.nb_clones + 1)}
 49 |             return
 50 | 
 51 |         with open(self.clone_state_file, "r") as f:
 52 |             self.clone_id_to_state = json.load(f)
 53 | 
 54 |         assert len(self.clone_id_to_state) == self.nb_clones
 55 | 
 56 |     def _write_clone_state(self):
 57 |         assert len(self.clone_id_to_state) == self.nb_clones
 58 |         with open(self.clone_state_file, "w") as f:
 59 |             json.dump(self.clone_id_to_state, f)
 60 | 
 61 |     def _reset_and_clean_all_clones(self):
 62 |         for clone_id, _ in self.clone_id_to_state.items():
 63 |             cloned_repo_dir = f"{self.pool_dir}/{clone_id}/{self.repo_name}"
 64 |             cloned_repo = Repo(cloned_repo_dir)
 65 |             cloned_repo.git.rm('--cached', '-rf', '.')
 66 |             cloned_repo.git.reset('--hard')
 67 |             cloned_repo.git.clean('-f', '-d')
 68 |             origin = cloned_repo.remotes.origin
 69 |             origin.fetch()
 70 | 
 71 |     def _get_least_recently_used_clone_id(self) -> str:
 72 |         return self.usage_order[0]
 73 | 
 74 |     def _have_used_clone_id(self, clone_id: str):
 75 |         self.usage_order.remove(clone_id)
 76 |         self.usage_order.append(clone_id)
 77 | 
 78 |     def _safe_checkout(self, cloned_repo: Repo, commit: str):
 79 |         try:
 80 |             cloned_repo.git.checkout(commit)
 81 |             cloned_repo.git.submodule('update', '--init', '--recursive')
 82 |         except Exception:
 83 |             if commit == "main":
 84 |                 self._safe_checkout(cloned_repo, "master")
 85 |             elif commit == "master":
 86 |                 self._safe_checkout(cloned_repo, "dev")
 87 |             else:
 88 |                 cloned_repo.git.rm('--cached', '-rf', '.')
 89 |                 cloned_repo.git.reset('--hard')
 90 |                 cloned_repo.git.clean('-f', '-d')
 91 |                 origin = cloned_repo.remotes.origin
 92 |                 origin.fetch()
 93 |                 try:
 94 |                     cloned_repo.git.checkout(commit)
 95 |                 except Exception:
 96 |                     # we get here when submodules are in a strange state
 97 |                     self._remove_and_reinit_submodules(cloned_repo, commit)
 98 | 
 99 |     def _remove_and_reinit_submodules(self, cloned_repo: Repo, commit: str):
100 |         # 1) de-initialize all submodules
101 |         cloned_repo.git.submodule('deinit', '-f', '--all')
102 | 
103 |         # 2) remove all submodule working trees
104 |         root = Path(cloned_repo.working_dir)
105 |         ls_output = subprocess.run(
106 |             ["git", "ls-files", "-s"], capture_output=True, text=True, check=True
107 |         ).stdout.splitlines()
108 |         for line in ls_output:
109 |             parts = line.split()
110 |             if len(parts) >= 4 and parts[0] == "160000":
111 |                 path = " ".join(parts[3:])
112 |                 shutil.rmtree(root / path, ignore_errors=True)
113 | 
114 |         # 3) remove all submodule git metadata under .git/modules
115 |         modules_dir = root / ".git" / "modules"
116 |         if modules_dir.exists():
117 |             for child in modules_dir.iterdir():
118 |                 shutil.rmtree(child, ignore_errors=True)
119 | 
120 |         # 4) checkout the desired commit
121 |         cloned_repo.git.checkout(commit)
122 | 
123 |         # 5) re-initialize submodules recursively
124 |         cloned_repo.git.submodule('update', '--init', '--recursive')
125 | 
126 |     def get_cloned_repo(self, commit) -> ClonedRepo:
127 |         # reuse existing clone if possible
128 |         for clone_id, state in self.clone_id_to_state.items():
129 |             if state["commit"] == commit:
130 |                 self._have_used_clone_id(clone_id)
131 |                 cloned_repo_dir = f"{self.pool_dir}/{clone_id}/{self.repo_name}"
132 | 
133 |                 return ClonedRepo(Repo(cloned_repo_dir),
134 |                                   state["container_name"],
135 |                                   self.clone_id_to_language_server[clone_id])
136 | 
137 |         # checkout desired commit
138 |         clone_id = self._get_least_recently_used_clone_id()
139 |         cloned_repo_dir = f"{self.pool_dir}/{clone_id}/{self.repo_name}"
140 |         cloned_repo = Repo(cloned_repo_dir)
141 |         self._safe_checkout(cloned_repo, commit)
142 | 
143 |         # update clone state
144 |         state = self.clone_id_to_state[clone_id]
145 |         state["commit"] = commit
146 |         self.clone_id_to_state[clone_id] = state
147 |         self._write_clone_state()
148 |         self._have_used_clone_id(clone_id)
149 | 
150 |         return ClonedRepo(cloned_repo,
151 |                           state["container_name"],
152 |                           self.clone_id_to_language_server)
153 | 


--------------------------------------------------------------------------------
/src/testora/evaluation/ClassificationResultsSummarizer.py:
--------------------------------------------------------------------------------
  1 | from collections import Counter
  2 | import json
  3 | 
  4 | # Select the combination of prompt and model to evaluate:
  5 | 
  6 | # single-question, GPT-4o
  7 | # result_files = ["data/classification_results_03_2025/keras/single-question_GPT-4o.json",
  8 | #                 "data/classification_results_03_2025/marshmallow/single-question_GPT-4o.json",
  9 | #                 "data/classification_results_03_2025/pandas/single-question_GPT-4o.json",
 10 | #                 "data/classification_results_03_2025/scipy/single-question_GPT-4o.json"]
 11 | 
 12 | # single-question, GPT-4o-mini
 13 | # result_files = ["data/classification_results_03_2025/keras/single-question_GPT-4o-mini.json",
 14 | #                 "data/classification_results_03_2025/marshmallow/single-question_GPT-4o-mini.json",
 15 | #                 "data/classification_results_03_2025/pandas/single-question_GPT-4o-mini.json",
 16 | #                 "data/classification_results_03_2025/scipy/single-question_GPT-4o-mini.json"]
 17 | 
 18 | # single-question, DeepSeek-R1
 19 | # result_files = ["data/classification_results_03_2025/keras/single-question_DeepSeek-R1.json",
 20 | #                 "data/classification_results_03_2025/marshmallow/single-question_DeepSeek-R1.json",
 21 | #                 "data/classification_results_03_2025/pandas/single-question_DeepSeek-R1.json",
 22 | #                 "data/classification_results_03_2025/scipy/single-question_DeepSeek-R1.json"]
 23 | 
 24 | # multi-question, GPT-4o
 25 | # result_files = ["data/classification_results_03_2025/keras/multi-question_GPT-4o.json",
 26 | #                 "data/classification_results_03_2025/marshmallow/multi-question_GPT-4o.json",
 27 | #                 "data/classification_results_03_2025/pandas/multi-question_GPT-4o.json",
 28 | #                 "data/classification_results_03_2025/scipy/multi-question_GPT-4o.json"]
 29 | 
 30 | # multi-question, GPT-4o-mini
 31 | result_files = ["data/classification_results_03_2025/keras/multi-question_GPT-4o-mini.json",
 32 |                 "data/classification_results_03_2025/marshmallow/multi-question_GPT-4o-mini.json",
 33 |                 "data/classification_results_03_2025/pandas/multi-question_GPT-4o-mini.json",
 34 |                 "data/classification_results_03_2025/scipy/multi-question_GPT-4o-mini.json"]
 35 | 
 36 | # multi-question, DeepSeek-R1
 37 | # result_files = ["data/classification_results_03_2025/keras/multi-question_DeepSeek-R1.json",
 38 | #                 "data/classification_results_03_2025/marshmallow/multi-question_DeepSeek-R1.json",
 39 | #                 "data/classification_results_03_2025/pandas/multi-question_DeepSeek-R1.json",
 40 | #                 "data/classification_results_03_2025/scipy/multi-question_DeepSeek-R1.json"]
 41 | 
 42 | # extract results
 43 | print("Project, PR, Prediction, Label, Result")
 44 | nb_fps = 0
 45 | nb_tps = 0
 46 | nb_fns = 0
 47 | nb_tns = 0
 48 | variance_ctr = Counter()
 49 | config_used = None
 50 | for result_file in result_files:
 51 |     if "keras" in result_file:
 52 |         project = "keras"
 53 |     elif "marshmallow" in result_file:
 54 |         project = "marshmallow"
 55 |     elif "pandas" in result_file:
 56 |         project = "pandas"
 57 |     elif "scipy" in result_file:
 58 |         project = "scipy"
 59 |     else:
 60 |         raise ValueError(f"Couldn't determine project from file name {result_file}")
 61 | 
 62 |     with open(result_file, "r") as f:
 63 |         result_json = json.load(f)
 64 |         config_used_here = result_json[0]["message"]
 65 |         if config_used is None:
 66 |             config_used = config_used_here
 67 |         else:
 68 |             assert config_used == config_used_here, f"Config mismatch:\nUsed before:\n{config_used}\nvs used now in {project}:\n {config_used_here}"
 69 |         for entry in result_json:
 70 |             if entry["message"] == "Classification result":
 71 |                 # compare label and predictions
 72 |                 results = []
 73 |                 if entry["label"] in ["unintended", "coincidental fix"]:
 74 |                     for prediction in entry["predictions"].split("#"):
 75 |                         if prediction == "unintended":
 76 |                             results.append("TP")
 77 |                             nb_tps += 1
 78 |                         elif prediction == "intended":
 79 |                             results.append("FN")
 80 |                             nb_fns += 1
 81 |                         else:
 82 |                             raise ValueError(
 83 |                                 f"Invalid prediction: {entry['prediction']}")
 84 |                 elif entry["label"] in ["intended"]:
 85 |                     for prediction in entry["predictions"].split("#"):
 86 |                         if prediction == "intended":
 87 |                             results.append("TN")
 88 |                             nb_tns += 1
 89 |                         elif prediction == "unintended":
 90 |                             results.append("FP")
 91 |                             nb_fps += 1
 92 |                         else:
 93 |                             raise ValueError(
 94 |                                 f"Invalid prediction: {entry['prediction']}")
 95 |                 else:
 96 |                     raise ValueError(
 97 |                         f"Invalid label: {entry['label']}, {entry['pr_nb']}")
 98 | 
 99 |                 # check variance of predictions
100 |                 results_counter = Counter(results)
101 |                 variance_str = str(
102 |                     sorted(list(results_counter.values()), reverse=True))
103 |                 variance_ctr[variance_str] += 1
104 | 
105 |                 # print into CSV
106 |                 print(f"{project}, "
107 |                       f"{entry['pr_nb']}, "
108 |                       f"{entry['predictions']}, "
109 |                       f"{entry['label']}, "
110 |                       f"{', '.join(results)}"
111 |                       )
112 | 
113 | print(config_used)
114 | print()
115 | print(f"TP: {nb_tps}, FP: {nb_fps}, FN: {nb_fns}, TN: {nb_tns}")
116 | precision = 0 if (nb_tps + nb_fps) == 0 else nb_tps / (nb_tps + nb_fps)
117 | print(f"Precision: {precision}")
118 | recall = 0 if (nb_tps + nb_fns) == 0 else nb_tps / (nb_tps + nb_fns)
119 | print(f"Recall: {recall}")
120 | f1 = 0 if (precision + recall) == 0 else 2 * \
121 |     precision * recall / (precision + recall)
122 | print(f"F1: {f1}")
123 | print()
124 | print(f"Variance of predictions: {variance_ctr}")
125 | print(f"Total data points: {nb_tps+nb_fps+nb_fns+nb_tns}")
126 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Testora: Regression Testing with a Natural Language Oracle
  2 | 
  3 | Testora is an automated approach to check behavioral changes introduced by a pull request against the title, description, etc. of the pull request.
  4 | 
  5 | Paper:
  6 | [https://arxiv.org/abs/2503.18597](https://arxiv.org/abs/2503.18597)
  7 | 
  8 | ## Installation
  9 | 
 10 | Testora uses two kinds of Docker containers:
 11 | 
 12 | * A Visual Studio Code Dev Container for running Testora itself. See [devcontainer.json](.devcontainer/devcontainer.json).
 13 | 
 14 | * Docker-in-docker containers for target projects to analyze with Testora. These containers are created when creating the dev container. See [postCreateCommands.sh](.devcontainer/postCreateCommands.sh).
 15 | 
 16 | To install and run Testora, follow these steps:
 17 | 
 18 | 1) Install [Visual Studio Code](https://code.visualstudio.com/download) and its ["Dev Containers" extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers).
 19 | 
 20 | 2) Open Testora in Visual Studio Code:
 21 |    
 22 |    ```code .```
 23 | 
 24 | 3) In Visual Studio Code, build the Dev Container and reopen the project in the container:
 25 | 
 26 |     ```Ctrl + Shift + P```
 27 | 
 28 |     ```Dev Containers: Rebuild and Reopen in Container```
 29 | 
 30 |     This will take a couple of minutes, because in addition to Testora, it will set up three instances of the project under analysis. We use three instances to efficiently switch between the commits just before and just after a PR, as well as the latest commit in the main branch.
 31 | 
 32 | 4) In the main directory, create a file `.openai_token` with an OpenAI API key. This is required for invoking an LLM, which is an essential part of Testora.
 33 | 
 34 | 5) In the main directory, create a file `.github_token` with a (free to create) GitHub API key. This is required because Testora interacts with the GitHub API to retrieve details about the PRs to analyze.
 35 | 
 36 | ## Running Testora on a Single Pull Request
 37 | 
 38 | [testora.RegressionFinder](src/testora/RegressionFinder.py) is the main entry point to run Testora.
 39 | To apply it to a specific PR of a project, run it like this:
 40 | 
 41 | ```python -m testora.RegressionFinder --project scipy --pr 21768```
 42 | 
 43 | The project must be one of the projects that were set up while building the Dev Container. The above command produces a `logs_<timestamp>.json` file.
 44 | 
 45 | ## Inspecting Results in the Web UI
 46 | 
 47 | We provide a Web UI to inspect detailed logs of Testora.
 48 | 
 49 | 1) Launch the web server:
 50 | 
 51 |     ```python -m testora.webui.WebUI --files logs_*.json```
 52 | 
 53 | 2) Visit [http://localhost:4000/](http://localhost:4000/) in your browser.
 54 | 
 55 | 3) Click on the value in the "Status" column to inspect the detailed logs of a PR.
 56 | 
 57 | ## Running Testora on Many Pull Requests
 58 | 
 59 | For large-scale experiments, we use an SQL database that stores PRs to analyze and, once a PR has been analyzed, stores the results of Testora on this PR.
 60 | The database itself is *not* part of this public release, but you may replicate the setup with your own database using [these two database schemas](src/testora/evaluation/sql/).
 61 | 
 62 | Assuming you have set up the database:
 63 | 
 64 | 1) Add PRs to check into the database:
 65 | 
 66 |     ```python -m testora.evaluation.PreparePRChunks```
 67 | 
 68 | 2) Run [testora.RegressionFinder](src/testora/RegressionFinder.py) in database mode, which fetches PRs to check from the database and applies the approach to each PR.
 69 | 
 70 |     ```python -m testora.RegressionFinder --db```
 71 | 
 72 |     You can launch multiple instances of this command in parallel in different Dev Containers. Each of the parallel instances will fetch one PR at a time and write the result back into the database, until all PRs have been analyzed.
 73 | 
 74 | 3) Check the status of PRs to analyze:
 75 | 
 76 |     ```python -m testora.evaluation.EvalTaskManager --status```
 77 | 
 78 | 4) Once some or all PRs have been analyzed, download the results (i.e., `logs_*.json` files) from the database for inspection:
 79 | 
 80 |     ```python -m testora.evaluation.EvalTaskManager --fetch```
 81 | 
 82 |     To inspect the logs, use the WebUI as described above.
 83 | 
 84 | ## Results Reported in the Paper
 85 | 
 86 | ### RQ1: Real-World Problems Found by Testora
 87 | 
 88 | See [this sheet](https://docs.google.com/spreadsheets/d/1We-EwrNv_0U1Wco_eAUbxwjyFkkPI9kM7tkaRgP0yyI/edit?usp=sharing) for details on the 30 real-world problems, the corresponding PRs, the issues we reported, and their status.
 89 | 
 90 | ### RQ2 (Effectiveness of Test Generation) and RQ4 (Costs)
 91 | 
 92 | Download the logs as described in [DATA.md](data/DATA.md).
 93 | This will create a folder [data/results_03_2025/](data/results_03_2025/), which contains the raw logs of running Testora in its default configuration.
 94 | 
 95 | To analyze the logs, run the following command:
 96 | 
 97 | ```python -m testora.evaluation.PRAnalysisStats```
 98 | 
 99 | It will do the following:
100 |  * Read the logs of all 1,274 PRs analyzed for RQ2 and RQ4
101 |  * Compute the test generation statistics reported in RQ2
102 |  * Compute the token cost statistics reported in RQ4
103 |  * Output the corresponding LaTeX tables
104 |  * Output LaTeX macros that define results used repeatedly in the paper (e.g., monetary cost per PR)
105 |  * Write the plots that show time costs and token costs into [data/figures](data/figures)
106 | 
107 | ### RQ3: Accuracy of Classifier
108 | 
109 | Our dataset of 164 manually labeled data points is in [data/ground_truth](data/ground_truth).
110 | 
111 | To run evaluate the classifier against the ground truth, we use [ClassificationEvaluator.py](src/testora/evaluation/ClassificationEvaluator.py). 
112 | If not done yet for RQ2, download the logs as described in [DATA.md](data/DATA.md).
113 | Afterward, the raw logs of running Testora with three LLMs (GPT-4o-mini, GPT-4o, DeepSeek-R1) and two different prompting techniques (multi-question classifier, single-question classifier) are available in [data/classification_results_03_2025/](data/classification_results_03_2025/).
114 | 
115 | To compute the precision, recall, and F1 score, run the following command:
116 | 
117 | ```python -m testora.evaluation.ClassificationResultsSummarizer```
118 | 
119 | It will output detailed results for each PR in the ground truth dataset, and the at end, the overall results.
120 | To switch between different LLMs and prompting techniques, edit [ClassificationResultsSummarizer.py](src/testora/evaluation/ClassificationResultsSummarizer.py) to modify the lines at the beginning that select a model-prompt combination.
121 | 


--------------------------------------------------------------------------------