├── src ├── testora │ ├── __init__.py │ ├── util │ │ ├── Exceptions.py │ │ ├── ClassificationResult.py │ │ ├── UndefinedRefsFinder.py │ │ ├── DocstringRetrieval.py │ │ ├── PythonLanguageServer.py │ │ ├── Logs.py │ │ └── ClonedRepoManager.py │ ├── execution │ │ ├── TestExecution.py │ │ ├── CoverageAnalyzer.py │ │ ├── DockerExecutor.py │ │ └── ProgramMerger.py │ ├── prompts │ │ ├── PromptCommon.py │ │ ├── UndefinedRefsFixingPrompt.py │ │ ├── SelectExpectedBehaviorPrompt.py │ │ ├── TemperatureExperiment.py │ │ ├── PRRegressionBugRanking.py │ │ ├── RegressionTestGeneratorPromptV1.py │ │ └── RegressionTestGeneratorPromptV2.py │ ├── evaluation │ │ ├── TargetPRs.py │ │ ├── ClassificationResultsInspector.py │ │ ├── TestFailureInspector.py │ │ ├── sql │ │ │ ├── tasks.sql │ │ │ └── classification_tasks.sql │ │ ├── CheckedPRsInspector.py │ │ ├── FindCandidateProjects.py │ │ ├── ResultsManager.py │ │ ├── PreparePRChunks.py │ │ └── ClassificationResultsSummarizer.py │ ├── llms │ │ ├── LLMCacheAnalyzer.py │ │ ├── LLMCache.py │ │ └── OpenAIGPT.py │ ├── Config.py │ └── webui │ │ └── WebUI.py └── multilspy │ ├── README.md │ ├── __init__.py │ ├── multilspy_exceptions.py │ ├── type_helpers.py │ ├── multilspy_settings.py │ ├── multilspy_config.py │ ├── language_servers │ ├── rust_analyzer │ │ └── runtime_dependencies.json │ ├── eclipse_jdtls │ │ └── runtime_dependencies.json │ ├── omnisharp │ │ └── workspace_did_change_configuration.json │ └── jedi_language_server │ │ └── jedi_server.py │ ├── multilspy_logger.py │ └── lsp_protocol_handler │ └── lsp_constants.py ├── .vscode ├── settings.json └── launch.json ├── requirements.txt ├── data ├── ground_truth │ ├── template.json │ ├── scipy │ │ ├── 20089.json │ │ ├── 19776.json │ │ ├── 21076.json │ │ ├── 21553.json │ │ ├── 20751.json │ │ ├── 19428.json │ │ ├── 21036.json │ │ ├── 19263.json │ │ ├── 20974.json │ │ ├── 21518.json │ │ ├── 21577.json │ │ ├── 21642.json │ │ ├── 19680.json │ │ ├── 19853.json │ │ ├── 21633.json │ │ ├── 19861.json │ │ ├── 21528.json │ │ ├── 21597.json │ │ ├── 21572.json │ │ └── 21629.json │ ├── pandas │ │ ├── 58376.json │ │ ├── 57205.json │ │ ├── 57034.json │ │ ├── 57399.json │ │ ├── 57046.json │ │ ├── 55108.json │ │ ├── 59810.json │ │ └── 59782.json │ ├── marshmallow │ │ ├── 1399.json │ │ ├── 2215.json │ │ ├── 2102.json │ │ ├── 2246.json │ │ ├── 2271.json │ │ ├── 2123.json │ │ ├── 1998.json │ │ ├── 2244.json │ │ └── 2022.json │ └── keras │ │ └── 19814.json └── DATA.md ├── .gitignore ├── pyproject.toml ├── .devcontainer ├── setup_numpy_to_run_in_container.sh ├── setup_scipy_to_run_in_container.sh ├── postCreateCommands.sh ├── setup_scapy.sh ├── devcontainer.json ├── setup_transformers.sh ├── setup_pytorch_geometric.sh ├── setup_keras.sh ├── setup_marshmallow.sh ├── setup_scipy.sh ├── setup_numpy.sh ├── setup_pandas.sh └── setup_scikit-learn.sh ├── LICENSE ├── templates ├── pr_result.html ├── index.html └── pr_log.html └── README.md /src/testora/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/testora/util/Exceptions.py: -------------------------------------------------------------------------------- 1 | class TestoraException(BaseException): 2 | pass 3 | -------------------------------------------------------------------------------- /src/multilspy/README.md: -------------------------------------------------------------------------------- 1 | All code in this folder is adapted from 
https://github.com/microsoft/monitors4codegen
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mypy.runUsingActiveInterpreter": true,
3 |     "mypy.targets": [
4 |         "src"
5 |     ],
6 |     "files.watcherExclude": {
7 |         "data/**": true,
8 |     }
9 | }
--------------------------------------------------------------------------------
/src/multilspy/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains the multilspy API
3 | """
4 | 
5 | from . import multilspy_types as Types
6 | from .language_server import LanguageServer, SyncLanguageServer
7 | 
8 | __all__ = ["LanguageServer", "Types", "SyncLanguageServer"]
9 | 
--------------------------------------------------------------------------------
/src/testora/execution/TestExecution.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Optional
3 | 
4 | 
5 | @dataclass
6 | class TestExecution:
7 |     code: str
8 |     output: Optional[str] = None
9 |     coverage_report: Optional[str] = None
10 | 
--------------------------------------------------------------------------------
/src/testora/prompts/PromptCommon.py:
--------------------------------------------------------------------------------
1 | from testora.Config import model_version
2 | 
3 | # NOTE: when changing the system message, must remove the old cache
4 | 
5 | if model_version.startswith("gpt"):
6 |     system_message = "You are an experienced Python developer."
7 | elif model_version.startswith("deepseek"):
8 |     system_message = ""
9 | else:
10 |     # fail fast instead of leaving system_message undefined (NameError on import)
11 |     raise ValueError(f"No system message defined for model version: {model_version}")
12 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | docker==7.0.0
2 | Flask==3.0.3
3 | GitPython==3.1.43
4 | jedi-language-server==0.41.4
5 | libcst==1.2.0
6 | matplotlib==3.8.4
7 | openai==1.55.3
8 | PyCG==0.0.8
9 | pydantic==2.7.1
10 | PyGithub==2.3.0
11 | Requests==2.31.0
12 | typing_extensions==4.11.0
13 | unidiff==0.7.5
14 | mypy
15 | mysql-connector-python
16 | coverage
--------------------------------------------------------------------------------
/data/ground_truth/template.json:
--------------------------------------------------------------------------------
1 | {
2 |     "pr_number": 9999999,
3 |     "log_file": "data/old_results/XXXXX",
4 |     "differentiating_tests": [
5 |         {
6 |             "test": {
7 |                 "test_code": "XXXX",
8 |                 "old_output": "XXXX",
9 |                 "new_output": "XXXX"
10 |             },
11 |             "label": "TODO",
12 |             "comment": ""
13 |         }
14 |     ]
15 | }
--------------------------------------------------------------------------------
/src/multilspy/multilspy_exceptions.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains the exceptions raised by the Multilspy framework.
3 | """
4 | 
5 | class MultilspyException(Exception):
6 |     """
7 |     Exceptions raised by the Multilspy framework.
8 |     """
9 | 
10 |     def __init__(self, message: str):
11 |         """
12 |         Initializes the exception with the given message.
13 |         """
14 |         super().__init__(message)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | data/llm_cache*
3 | data/repos
4 | data/pr_chunks
5 | data/results
6 | data/results_03_2025
7 | data/classification_results
8 | data/classification_results_03_2025
9 | data/target_prs
10 | data/figures/*.pdf
11 | .github_token
12 | logs_*.json
13 | results_*.json
14 | .openai_token
15 | .db_token
16 | .worker_id
17 | candidate_projects*.csv
18 | .target_project
19 | .coverage
20 | coverage.json
21 | coverage_report
22 | .openrouter_token
23 | 
--------------------------------------------------------------------------------
/data/DATA.md:
--------------------------------------------------------------------------------
1 | This folder contains detailed logs of running Testora, as well as manually created ground truth data for evaluating the classifier. The logs are not stored in Git. Instead, you can download and extract them into this folder.
2 | 
3 | Run these commands from the project's main directory to download the raw logs:
4 | 
5 | 1) ```wget https://github.com/michaelpradel/Testora/releases/download/data_03_2025/data_03_2025.tar.gz```
6 | 
7 | 2) ```tar -xf data_03_2025.tar.gz```
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 | 
5 | [project]
6 | name = "Testora"
7 | version = "0.1.0"
8 | description = ""
9 | authors = [
10 |     { name = "Michael Pradel", email = "michael@binaervarianz.de" }
11 | ]
12 | license = "MIT"
13 | dependencies = []
14 | 
15 | [tool.hatch.build.targets.wheel]
16 | packages = [
17 |     "src/testora",
18 | ]
19 | 
20 | [tool.hatch.build.targets.sdist]
21 | include = [
22 |     "/src",
23 | ]
24 | 
--------------------------------------------------------------------------------
/src/testora/evaluation/TargetPRs.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | 
4 | 
5 | base_dir = "data/target_prs/"
6 | 
7 | 
8 | def project_to_target_prs():
9 |     project_to_prs = {}
10 |     for project_file in os.listdir(base_dir):
11 |         if project_file.endswith(".json"):
12 |             project_name = project_file.replace(".json", "")
13 |             with open(os.path.join(base_dir, project_file), "r") as f:
14 |                 project_to_prs[project_name] = json.load(f)
15 | 
16 |     return project_to_prs
17 | 
--------------------------------------------------------------------------------
/src/testora/util/ClassificationResult.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from enum import Enum
3 | 
4 | 
5 | class Classification(str, Enum):
6 |     UNKNOWN = "unknown"
7 |     INTENDED_CHANGE = "intended_change"
8 |     COINCIDENTAL_FIX = "coincidental_fix"
9 |     REGRESSION = "regression"
10 | 
11 | 
12 | @dataclass
13 | class ClassificationResult:
14 |     test_code: str
15 |     old_output: str
16 |     new_output: str
17 |     classification: Classification
18 |     classification_explanation: str
19 | 
--------------------------------------------------------------------------------
/.devcontainer/setup_numpy_to_run_in_container.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | apt install -y gcc g++ gfortran libopenblas-dev 
liblapack-dev pkg-config python3-pip python3-dev 4 | 5 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 6 | bash Miniforge3.sh -b -p "${HOME}/conda" 7 | source "${HOME}/conda/etc/profile.d/conda.sh" 8 | source "${HOME}/conda/etc/profile.d/mamba.sh" 9 | 10 | mamba env create -f environment.yml 11 | mamba activate numpy-dev 12 | 13 | pip install -e . --no-build-isolation -------------------------------------------------------------------------------- /.devcontainer/setup_scipy_to_run_in_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | apt update 4 | apt install -y gcc g++ gfortran libopenblas-dev liblapack-dev pkg-config 5 | 6 | wget -O Miniforge3.sh "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" 7 | bash Miniforge3.sh -b -p "${HOME}/conda" 8 | source "${HOME}/conda/etc/profile.d/conda.sh" 9 | source "${HOME}/conda/etc/profile.d/mamba.sh" 10 | mamba shell init 11 | 12 | mamba env create -f environment.yml -y 13 | mamba activate scipy-dev 14 | 15 | pip install -e . --no-build-isolation 16 | 17 | pip install coverage -------------------------------------------------------------------------------- /.devcontainer/postCreateCommands.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install --user -r requirements.txt 4 | pip install -e . 5 | 6 | echo "Setting up project-under-analysis" 7 | # Select which project to analyze: 8 | .devcontainer/setup_scipy.sh 9 | # .devcontainer/setup_pandas.sh 10 | # .devcontainer/setup_keras.sh 11 | # .devcontainer/setup_marshmallow.sh 12 | 13 | 14 | ## Experimental and not really supported as of now: 15 | # .devcontainer/setup_scikit-learn.sh 16 | # .devcontainer/setup_numpy.sh 17 | # .devcontainer/setup_transformers.sh 18 | # .devcontainer/setup_pytorch_geometric.sh 19 | # .devcontainer/setup_scapy.sh -------------------------------------------------------------------------------- /data/ground_truth/scipy/20089.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 20089, 3 | "log_file": "data/old_results/results_scipy_20138_20023.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 12: Using hyp2f1 with complex -inf input\nimport numpy as np\nimport scipy.special\n\nresult = scipy.special.hyp2f1(1.0, -np.inf, 3.0, 4.0+1.0j)\nprint(result)\n", 8 | "old_output": "(nan+nanj)\n", 9 | "new_output": "(1+0j)\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/scipy/scipy/issues/20988" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/pandas/58376.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 58376, 3 | "log_file": "data/old_results/results_pandas_58389_58294.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Corner Case Example 10: Using RangeIndex.searchsorted with a negative step value\nimport pandas as pd\n\nri = pd.RangeIndex(9, 0, -3)\nvalue = 5\nresult = ri.searchsorted(value)\nprint(result)", 8 | "old_output": "0\n", 9 | "new_output": "2\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "https://github.com/pandas-dev/pandas/issues/58641" 13 | } 14 | ] 15 | } 
-------------------------------------------------------------------------------- /data/ground_truth/scipy/19776.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 19776, 3 | "log_file": "data/old_results/results_scipy_19816_19725.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Corner Case Example 6: Using inf and -inf values\nimport numpy as np\nfrom scipy.stats import rankdata\n\ndata = np.array([10, np.inf, -np.inf, 25, 30])\nresult = rankdata(data, method='average')\nprint(result)", 8 | "old_output": "[nan nan nan nan nan]\n", 9 | "new_output": "[2. 5. 1. 3. 4.]\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/marshmallow/1399.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 1399, 3 | "log_file": "data/old_results/results_marshmallow_1488_1348.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 2: Normal usage scenario\nfrom marshmallow.utils import get_func_args\n\nclass ExampleClass:\n def __init__(self, attr1, attr2):\n pass\n\nargs = get_func_args(ExampleClass)\nprint(args)\n# Output: ['attr1', 'attr2']\n\n", 8 | "old_output": "['args', 'kwargs']\n", 9 | "new_output": "['attr1', 'attr2']\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/pandas/57205.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 57205, 3 | "log_file": "data/old_results/results_pandas_57278_57203.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "import pandas as pd\n# Example 12: Corner case - Creating a DataFrame with all None values\ndata = {'a': None, 'b': None}\ncolumns = ['a', 'b']\ndf = pd.DataFrame(data, columns=columns, index=range(2))\nprint(\"DataFrame 12:\\n\", df)", 8 | "old_output": "DataFrame 12:\n a b\n0 NaN NaN\n1 NaN NaN\n", 9 | "new_output": "DataFrame 12:\n a b\n0 None None\n1 None None\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/pandas/57034.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 57034, 3 | "log_file": "data/old_results/results_pandas_57112_56981.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 8: Combine with both Series containing all NaN values\nimport pandas as pd\n\ns1 = pd.Series([None, None, None], index=['a', 'b', 'c'])\ns2 = pd.Series([None, None, None], index=['b', 'c', 'd'])\n\nresult = s1.combine_first(s2)\nprint(result)", 8 | "old_output": "a None\nb None\nc None\nd None\ndtype: object\n", 9 | "new_output": "a NaN\nb None\nc None\nd None\ndtype: object\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/21076.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21076, 3 | "log_file": "data/old_results/results_scipy_21151_20231.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 10: 
Larger dataset\nimport numpy as np\nfrom scipy.stats import differential_entropy\n\nvalues = np.array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 10, 11])\nresult = differential_entropy(values)\nprint(f\"Entropy of a larger dataset: {result}\")", 8 | "old_output": "Entropy of a larger dataset: 2.358820400183337\n", 9 | "new_output": "Entropy of a larger dataset: 2.5285667498058793\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/scipy/scipy/issues/21192" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/marshmallow/2215.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 2215, 3 | "log_file": "data/old_results/results_marshmallow_2215_2130.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 10: Demonstrate warning for deprecated option in SchemaOpts\nfrom marshmallow.schema import SchemaOpts\n\nclass Meta:\n json_module = \"custom_json\"\n\noptions = SchemaOpts(meta=Meta)", 8 | "old_output": "", 9 | "new_output": "/tmp/BugGPT/BugGPT_test_code.py:7: RemovedInMarshmallow4Warning: The json_module class Meta option is deprecated. Use render_module instead.\n options = SchemaOpts(meta=Meta)\n" 10 | }, 11 | "label": "intended", 12 | "comment": "updating the dependencies leads to a valid deprecation warning" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/pandas/57399.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 57399, 3 | "log_file": "data/old_results/results_pandas_57450_57356.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 20: Creating an interval range with non-matching dtype for start and end\nimport pandas as pd\nimport numpy as np\n\nresult = pd.interval_range(start=np.float32(0), end=5, freq=1)\nprint(result)", 8 | "old_output": "IntervalIndex([(0.0, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0]], dtype='interval[float64, right]')\n", 9 | "new_output": "IntervalIndex([(0, 1], (1, 2], (2, 3], (3, 4], (4, 5]], dtype='interval[int64, right]')\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/pandas-dev/pandas/issues/58964" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /src/multilspy/type_helpers.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides type-helpers used across multilspy implementation 3 | """ 4 | 5 | import inspect 6 | 7 | from typing import Callable, TypeVar, Type 8 | 9 | R = TypeVar("R", bound=object) 10 | 11 | def ensure_all_methods_implemented( 12 | source_cls: Type[object], 13 | ) -> Callable[[Type[R]], Type[R]]: 14 | """ 15 | A decorator to ensure that all methods of source_cls class are implemented in the decorated class. 
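    A hypothetical usage sketch (added for illustration; `SourceAPI` and `Impl` are invented names):

        @ensure_all_methods_implemented(SourceAPI)
        class Impl:
            def method_a(self): ...
            def method_b(self): ...
            # if any function defined on SourceAPI is missing here,
            # NotImplementedError is raised when the class definition is executed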
16 | """ 17 | 18 | def check_all_methods_implemented(target_cls: R) -> R: 19 | for name, _ in inspect.getmembers(source_cls, inspect.isfunction): 20 | if name not in target_cls.__dict__ or not callable(target_cls.__dict__[name]): 21 | raise NotImplementedError(f"{name} is not implemented in {target_cls}") 22 | 23 | return target_cls 24 | 25 | return check_all_methods_implemented -------------------------------------------------------------------------------- /data/ground_truth/scipy/21553.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21553, 3 | "log_file": "data/results/scipy/21553_2024-11-23 09:06:43.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 19:\nimport numpy as np\nfrom scipy.linalg import expm\n\nA = np.array([[1, 2], [3, 4]], dtype=np.float32) # Single precision float matrix\nresult = expm(A)\nprint(\"Exponential of A with float32 precision:\\n\", result)", 8 | "old_output": "Exponential of A with float32 precision:\n [[ 51.969006 74.73665 ]\n [112.10497 164.074 ]]\n", 9 | "new_output": "Exponential of A with float32 precision:\n [[ 44.694115 65.096375]\n [ 97.64457 142.33871 ]]\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "unintended side-effect of a performance optimization" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/keras/19814.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 19814, 3 | "log_file": "data/old_results/results_keras_19840_19690.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 19: Multiple predictions with mixed types.\nimport numpy as np\nimport jax.numpy as jnp\nfrom keras.src.backend.jax.math import in_top_k\n\npredictions = jnp.array([[0.1, np.nan, 0.1], [0.9, np.nan, 0.0]])\ntargets = jnp.array([1, 2])\nk = 1\ntry:\n result = in_top_k(targets, predictions, k)\n print(result)\nexcept Exception as e:\n print(f\"Error: {e}\") # Invalid predictions/types - Error expected\n", 8 | "old_output": "[False False]\n", 9 | "new_output": "[ True False]\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/keras-team/keras/issues/19995" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/pandas/57046.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 57046, 3 | "log_file": "data/old_results/results_pandas_57112_56981.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 13: Corner Case - SeriesGroupBy.idxmin with NaN values only\nimport pandas as pd\nimport numpy as np\n\ns = pd.Series([np.nan, np.nan, np.nan, np.nan], index=['a', 'b', 'a', 'b'])\ngrouped = s.groupby(s.index)\nresult = grouped.idxmin(skipna=True)\nprint(\"SeriesGroupBy.idxmin with NaN values only result:\\n\", result)", 8 | "old_output": "SeriesGroupBy.idxmin with NaN values only result:\n a a\nb a\ndtype: object\n", 9 | "new_output": "SeriesGroupBy.idxmin with NaN values only result:\n a NaN\nb NaN\ndtype: object\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /src/multilspy/multilspy_settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines the settings for multilspy. 
3 | """ 4 | 5 | import os 6 | import pathlib 7 | 8 | class MultilspySettings: 9 | """ 10 | Provides the various settings for multilspy. 11 | """ 12 | @staticmethod 13 | def get_language_server_directory() -> str: 14 | """Returns the directory for language servers""" 15 | user_home = pathlib.Path.home() 16 | multilspy_dir = str(pathlib.PurePath(user_home, ".multilspy")) 17 | lsp_dir = str(pathlib.PurePath(multilspy_dir, "lsp")) 18 | os.makedirs(lsp_dir, exist_ok=True) 19 | return lsp_dir 20 | 21 | @staticmethod 22 | def get_global_cache_directory() -> str: 23 | """Returns the cache directory""" 24 | global_cache_dir = os.path.join(str(pathlib.Path.home()), ".multilspy", "global_cache") 25 | os.makedirs(global_cache_dir, exist_ok=True) 26 | return global_cache_dir 27 | -------------------------------------------------------------------------------- /data/ground_truth/pandas/55108.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 55108, 3 | "log_file": "data/old_results/results_pandas_known_bugs.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 7: Using difference with one Index being a DateTimeIndex\nimport pandas as pd\n\nindex1 = pd.date_range('2022-01-01', periods=5)\nindex2 = pd.Index(['2022-01-03', '2022-01-04'])\nresult = index1.difference(index2)\nprint(result)", 8 | "old_output": "DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-05'], dtype='datetime64[ns]', freq=None)\n", 9 | "new_output": "DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',\n '2022-01-05'],\n dtype='datetime64[ns]', freq='D')\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/pandas-dev/pandas/issues/58971" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /src/multilspy/multilspy_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Configuration parameters for Multilspy. 3 | """ 4 | 5 | from enum import Enum 6 | from dataclasses import dataclass 7 | 8 | class Language(str, Enum): 9 | """ 10 | Possible languages with Multilspy. 
11 | """ 12 | 13 | CSHARP = "csharp" 14 | PYTHON = "python" 15 | RUST = "rust" 16 | JAVA = "java" 17 | 18 | def __str__(self) -> str: 19 | return self.value 20 | 21 | @dataclass 22 | class MultilspyConfig: 23 | """ 24 | Configuration parameters 25 | """ 26 | code_language: Language 27 | trace_lsp_communication: bool = False 28 | 29 | @classmethod 30 | def from_dict(cls, env: dict): 31 | """ 32 | Create a MultilspyConfig instance from a dictionary 33 | """ 34 | import inspect 35 | return cls(**{ 36 | k: v for k, v in env.items() 37 | if k in inspect.signature(cls).parameters 38 | }) -------------------------------------------------------------------------------- /src/testora/util/UndefinedRefsFinder.py: -------------------------------------------------------------------------------- 1 | import libcst as cst 2 | 3 | 4 | def get_undefined_references(src): 5 | undefined_variables = [] # using a list here to get a deterministic order 6 | 7 | ast = cst.parse_module(src) 8 | ast_wrapper = cst.metadata.MetadataWrapper(ast) 9 | scopes = ast_wrapper.resolve(cst.metadata.ScopeProvider).values() 10 | for scope in scopes: 11 | for access in scope.accesses: 12 | if len(access.referents) == 0: 13 | node = access.node 14 | undefined_variables.append(node.value) 15 | 16 | # remove duplicates 17 | undefined_variables = list(dict.fromkeys(undefined_variables)) 18 | 19 | return undefined_variables 20 | 21 | 22 | if __name__ == "__main__": 23 | code = """ 24 | from xx import bar 25 | 26 | def foo(l): 27 | l() 28 | 29 | foo(lambda n: print(n), bar) 30 | """ 31 | undefined_refs = get_undefined_references(code) 32 | print("Undefined references:", undefined_refs) 33 | -------------------------------------------------------------------------------- /src/testora/llms/LLMCacheAnalyzer.py: -------------------------------------------------------------------------------- 1 | import json 2 | import matplotlib.pyplot as plt 3 | 4 | def analyze_llm_cache(file_path): 5 | with open(file_path, "r") as f: 6 | cache = json.load(f) 7 | print(f"Total cache entries: {len(cache)}") 8 | query_sizes = [] 9 | answer_sizes = [] 10 | for query, answer in cache.items(): 11 | query_sizes.append(len(query)) 12 | answer_sizes.append(len(answer)) 13 | if len(answer) == 0: 14 | print(f"Warning: empty answer found!") 15 | 16 | # plot histogram of query sizes 17 | plt.hist(query_sizes, bins=50) 18 | plt.title("Query sizes") 19 | plt.xlabel("Size") 20 | plt.ylabel("Frequency") 21 | plt.show() 22 | 23 | # plot histogram of answer sizes 24 | plt.hist(answer_sizes, bins=50) 25 | plt.title("Answer sizes") 26 | plt.xlabel("Size") 27 | plt.ylabel("Frequency") 28 | plt.show() 29 | 30 | 31 | 32 | 33 | 34 | if __name__ == "__main__": 35 | analyze_llm_cache("data/llm_cache/gpt-4-0125-preview/cache.json") 36 | -------------------------------------------------------------------------------- /src/multilspy/language_servers/rust_analyzer/runtime_dependencies.json: -------------------------------------------------------------------------------- 1 | { 2 | "_description": "Used to download the runtime dependencies for running RustAnalyzer. 
Obtained from https://github.com/rust-lang/rust-analyzer/releases", 3 | "runtimeDependencies": [ 4 | { 5 | "id": "RustAnalyzer", 6 | "description": "RustAnalyzer for Linux (x64)", 7 | "url": "https://github.com/rust-lang/rust-analyzer/releases/download/2023-10-09/rust-analyzer-x86_64-unknown-linux-gnu.gz", 8 | "platformId": "linux-x64", 9 | "archiveType": "gz", 10 | "binaryName": "rust_analyzer" 11 | }, 12 | { 13 | "id": "RustAnalyzer", 14 | "description": "RustAnalyzer for Windows (x64)", 15 | "url": "https://github.com/rust-lang/rust-analyzer/releases/download/2023-10-09/rust-analyzer-x86_64-pc-windows-msvc.zip", 16 | "platformId": "win-x64", 17 | "archiveType": "zip", 18 | "binaryName": "rust-analyzer.exe" 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /src/testora/evaluation/ClassificationResultsInspector.py: -------------------------------------------------------------------------------- 1 | import json 2 | from testora.evaluation.ResultsManager import result_files_for_project 3 | 4 | 5 | project = "scipy" 6 | pr = 21553 7 | test_case_to_skip = 0 8 | 9 | file = list(result_files_for_project(project, is_classification=True))[0] 10 | fp = open(file, "r") 11 | result_json = json.load(fp) 12 | config = result_json[0]["message"] 13 | print("CONFIG:") 14 | print(config) 15 | for entry_idx, entry in enumerate(result_json): 16 | if entry["pr_nb"] == pr and entry["message"] == "Pre-classification": 17 | test_case_to_skip -= 1 18 | if test_case_to_skip != -1: 19 | continue 20 | 21 | print("\nTEST CODE:") 22 | print(entry["test_code"]) 23 | print("\nOLD OUTPUT:") 24 | print(entry["old_output"]) 25 | print("\nNEW OUTPUT:") 26 | print(entry["new_output"]) 27 | 28 | print("\nQUERY:") 29 | print(result_json[entry_idx+1]["content"]) 30 | 31 | print("\nANSWER:") 32 | print(result_json[entry_idx+3]["content"]) 33 | -------------------------------------------------------------------------------- /data/ground_truth/marshmallow/2102.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 2102, 3 | "log_file": "data/results/marshmallow/2102_2024-11-23 09:16:17.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 13:\nimport marshmallow.utils as utils\n\ntimestamp = float('inf') # Value is positive infinity\ntry:\n result = utils.from_timestamp(timestamp)\nexcept ValueError as e:\n print(\"Timestamp:\", timestamp, \"-> Exception:\", e)", 8 | "old_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 6, in \n result = utils.from_timestamp(timestamp)\n File \"/home/marshmallow/src/marshmallow/utils.py\", line 200, in from_timestamp\n return dt.datetime.fromtimestamp(value, tz=dt.timezone.utc).replace(tzinfo=None)\nOverflowError: timestamp out of range for platform time_t\n", 9 | "new_output": "Timestamp: inf -> Exception: Timestamp is too large\n" 10 | }, 11 | "label": "intended", 12 | "comment": "PR adds better error handling" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/20751.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 20751, 3 | "log_file": "data/old_results/results_scipy_20759_20656.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 9: Using float32 data type arrays\nimport numpy as np\nfrom scipy.stats import bartlett\n\na = np.array([10.1, 10.2, 
10.3, 10.4], dtype=np.float32)\nb = np.array([10.15, 10.25, 10.35, 10.45], dtype=np.float32)\nc = np.array([10.05, 10.15, 10.25, 10.35], dtype=np.float32)\n\nresult = bartlett(a, b, c)\nprint(\"Bartlett test result for float32 data type arrays:\", result)", 8 | "old_output": "Bartlett test result for float32 data type arrays: BartlettResult(statistic=np.float64(2.7743484928754286e-11), pvalue=np.float64(0.9999999999861282))\n", 9 | "new_output": "Bartlett test result for float32 data type arrays: BartlettResult(statistic=np.float64(-7.080736255702299e-07), pvalue=np.float64(1.0))\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/scipy/scipy/issues/21152" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/19428.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 19428, 3 | "log_file": "data/old_results/results_scipy_19533_19428.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 20: Perform a Levene test with one sample vector containing both positive and negative infinity values\nfrom scipy.stats import levene\nimport numpy as np\n\nsample1 = np.array([-np.inf, 1, 2, np.inf, 4])\n\nresult = levene(sample1)", 8 | "old_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 7, in \n result = levene(sample1)\n ^^^^^^^^^^^^^^^\n File \"/home/scipy/scipy/stats/_morestats.py\", line 3213, in levene\n raise ValueError(\"Must enter at least two input sample vectors.\")\nValueError: Must enter at least two input sample vectors.\n", 9 | "new_output": "" 10 | }, 11 | "label": "unintended", 12 | "comment": "API expects at least two input sample vectors; newer version went back to old behavior (independently of us)" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024-2025 Michael Pradel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /data/ground_truth/scipy/21036.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21036, 3 | "log_file": "data/old_results/results_scipy_21067_20937.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 14:\nimport numpy as np\nfrom scipy import stats\n\na = np.array([1, 2])\nprint(\"Array:\", a)\ntry:\n tsem_value = stats.tsem(a, ddof=3)\n print(\"Standard Error of Mean with ddof > number of elements:\", tsem_value)\nexcept Exception as e:\n print(\"Error:\", e)", 8 | "old_output": "Array: [1 2]\nStandard Error of Mean with ddof > number of elements: inf\n/root/conda/envs/scipy-dev/lib/python3.12/site-packages/numpy/core/fromnumeric.py:3787: RuntimeWarning: Degrees of freedom <= 0 for slice\n return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,\n/root/conda/envs/scipy-dev/lib/python3.12/site-packages/numpy/core/_methods.py:198: RuntimeWarning: divide by zero encountered in scalar divide\n ret = ret.dtype.type(ret / rcount)\n", 9 | "new_output": "Array: [1 2]\nStandard Error of Mean with ddof > number of elements: nan\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /src/testora/util/DocstringRetrieval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from testora.util.ClonedRepoManager import ClonedRepo 3 | from testora.util.PythonCodeUtil import get_locations_of_calls 4 | 5 | 6 | def retrieve_relevant_docstrings(cloned_repo: ClonedRepo, code: str) -> str: 7 | # copy code into project 8 | code_dir = f"{cloned_repo.repo.working_dir}/testora_code" 9 | os.makedirs(code_dir, exist_ok=True) 10 | code_path = f"{code_dir}/test.py" 11 | with open(code_path, "w") as f: 12 | f.write(code) 13 | 14 | # find all calls in the code 15 | call_locations = get_locations_of_calls(code) 16 | 17 | # query language server for hover text for each call 18 | server = cloned_repo.language_server 19 | docs = [] 20 | for call_location in call_locations: 21 | line = call_location.start.line - 1 # LSP lines are 0-based 22 | column = call_location.start.column 23 | doc = server.get_hover_text(code_path, line, column) 24 | if doc not in docs: 25 | docs.append(doc) 26 | 27 | # enforce limits: max 2000 chars per docstring, max 6000 chars overall 28 | result = "" 29 | for doc in docs: 30 | result += "-------" 31 | result += doc[:2000] 32 | 33 | return result[:6000] 34 | -------------------------------------------------------------------------------- /data/ground_truth/scipy/19263.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 19263, 3 | "log_file": "data/old_results/results_scipy_19310_19224.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 13: Passing a list instead of a numpy array to hfftn\nfrom scipy.fft import hfftn\n\nx = [[1, 2], [3, 4]]\nresult = hfftn(x)", 8 | "old_output": "", 9 | "new_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 5, in \n result = hfftn(x)\n ^^^^^^^^\n File \"/home/scipy/scipy/fft/_backend.py\", line 28, in __ua_function__\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/home/scipy/scipy/fft/_basic_backend.py\", line 154, in hfftn\n return _pocketfft.hfftn(x, s, axes, norm, 
overwrite_x, workers, plan=plan)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/home/scipy/scipy/fft/_pocketfft/basic.py\", line 208, in c2rn\n shape[-1] = (x.shape[axes[-1]] - 1) * 2\n ^^^^^^^\nAttributeError: 'list' object has no attribute 'shape'\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/scipy/scipy/issues/21207" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/20974.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 20974, 3 | "log_file": "data/old_results/results_scipy_21067_20937.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 10: Using method 'stouffer' with weights and axis (1)\nimport numpy as np\nfrom scipy.stats import combine_pvalues\n\npvalues = np.array([[0.01, 0.02, 0.05], [0.03, 0.04, 0.07], [0.08, 0.01, 0.02]])\nweights = np.array([1, 2, 3])\nresult = combine_pvalues(pvalues, method='stouffer', weights=weights, axis=1)\nprint(\"Stouffer Method, weights, axis=1:\", result)", 8 | "old_output": "Stouffer Method, weights, axis=1: SignificanceResult(statistic=array([3.03833446, 2.62170953, 3.26566887]), pvalue=array([0.00118945, 0.0043745 , 0.00054603]))\n", 9 | "new_output": "Stouffer Method, weights, axis=1: SignificanceResult(statistic=array([[2.7536326 , 3.34989642, 2.87511156],\n [2.7536326 , 3.34989642, 2.87511156],\n [2.7536326 , 3.34989642, 2.87511156]]), pvalue=array([[0.00294689, 0.00040421, 0.00201942],\n [0.00294689, 0.00040421, 0.00201942],\n [0.00294689, 0.00040421, 0.00201942]]))\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/scipy/scipy/issues/21106" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/21518.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21518, 3 | "log_file": "data/results/scipy/21518_2024-11-23 09:06:43.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 15:\nimport numpy as np\nfrom scipy import stats\n\ndata = np.array([-1, -2, -3])\nlmb = 1.0\n\ntry:\n log_likelihood = stats.boxcox_llf(lmb, data)\n print(f\"Box-Cox Log-likelihood with negative data: {log_likelihood}\")\nexcept Exception as e:\n print(f\"Error with negative values: {e}\")", 8 | "old_output": "/home/scipy/scipy/stats/_morestats.py:961: RuntimeWarning: invalid value encountered in log\n logdata = xp.log(data)\n/home/scipy/scipy/special/_logsumexp.py:128: RuntimeWarning: invalid value encountered in exp\n tmp = xp.exp(a - a_max)\nBox-Cox Log-likelihood with negative data: nan\n", 9 | "new_output": "/home/scipy/scipy/stats/_morestats.py:967: RuntimeWarning: invalid value encountered in log\n logdata = xp.log(data)\n/home/scipy/scipy/special/_logsumexp.py:128: RuntimeWarning: invalid value encountered in exp\n tmp = xp.exp(a - a_max)\nBox-Cox Log-likelihood with negative data: nan\n" 10 | }, 11 | "label": "intended", 12 | "comment": "line numbers are different, everything else is the same" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/21577.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21577, 3 | "log_file": "data/results/scipy/21577_2024-11-23 09:06:43.json", 4 | "differentiating_tests": [ 5 | { 6 | 
"test": { 7 | "test_code": "# Example 1:\nimport numpy as np\nfrom scipy import special\n\n# Edge case with zero order and zero input\nresult = special.hankel2(0, 0)\nprint(\"hankel2(0, 0) = \", result)", 8 | "old_output": "hankel2(0, 0) = (nan+nanj)\n", 9 | "new_output": "hankel2(0, 0) = (nan+infj)\n" 10 | }, 11 | "label": "intended", 12 | "comment": "not 100% sure about the math, but changing the output for this input is the PR's intention" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "# Example 6:\nimport numpy as np\nfrom scipy import special\n\n# Testing with a complex input of zero\nresult = special.hankel2(0, complex(0, 0))\nprint(\"hankel2(0, complex(0, 0)) = \", result)", 17 | "old_output": "hankel2(0, complex(0, 0)) = (nan+nanj)\n", 18 | "new_output": "hankel2(0, complex(0, 0)) = (nan+infj)\n" 19 | }, 20 | "label": "intended", 21 | "comment": "not 100% sure about the math, but changing the output for this input is the PR's intention" 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /.devcontainer/setup_scapy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing scapy-dev containers" 10 | docker rm -f scapy-dev1 11 | docker rm -f scapy-dev2 12 | docker rm -f scapy-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of scapy" 18 | git clone https://github.com/secdev/scapy.git 19 | cd scapy 20 | echo "Building dev container for scapy (first clone)" 21 | docker run -t -d --name scapy-dev1 -v ${PWD}:/home/scapy python:3.10 22 | docker exec -w /home/scapy scapy-dev1 pip install -e . 23 | echo "Done with first clone" 24 | 25 | ##### 26 | echo "Creating second clone of scapy" 27 | cd ../.. 28 | cp -r clone1 clone2 29 | cd clone2/scapy 30 | echo "Building dev container for scapy (second clone)" 31 | docker run -t -d --name scapy-dev2 -v ${PWD}:/home/scapy python:3.10 32 | docker exec -w /home/scapy scapy-dev2 pip install -e . 33 | echo "Done with second clone" 34 | 35 | echo "Creating third clone of scapy" 36 | cd ../.. 37 | cp -r clone1 clone3 38 | cd clone3/scapy 39 | echo "Building dev container for scapy (third clone)" 40 | docker run -t -d --name scapy-dev3 -v ${PWD}:/home/scapy python:3.10 41 | docker exec -w /home/scapy scapy-dev3 pip install -e . 
42 | echo "Done with third clone" 43 | 44 | cd ../../../Testora -------------------------------------------------------------------------------- /data/ground_truth/scipy/21642.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21642, 3 | "log_file": "data/results/scipy/21642_2024-11-23 09:06:44.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 3:\nimport numpy as np\nfrom scipy.io import mmwrite, mmread\n\n# Write a matrix directly to a file without specifying extension\ndata = np.array([[5, 0, 0], [0, 0, 6]])\nmmwrite('matrix3', data)\n\n# Read back the created file\nloaded_matrix = mmread('matrix3.mtx')", 8 | "old_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 10, in \n loaded_matrix = mmread('matrix3.mtx')\n File \"/home/scipy/scipy/io/_fast_matrix_market/__init__.py\", line 354, in mmread\n cursor, stream_to_close = _get_read_cursor(source)\n ~~~~~~~~~~~~~~~~^^^^^^^^\n File \"/home/scipy/scipy/io/_fast_matrix_market/__init__.py\", line 197, in _get_read_cursor\n return _fmm_core.open_read_file(path, parallelism), ret_stream_to_close\n ~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^\nValueError: Line 1: Not a Matrix Market file. Missing banner.\n", 9 | "new_output": "" 10 | }, 11 | "label": "intended", 12 | "comment": "exception in old version is due to the bug fixed by the PR" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/19680.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 19680, 3 | "log_file": "data/old_results/results_scipy_19724_19637.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 5: Testing the shapiro function with extreme values\nfrom scipy import stats\n\ndata = [10**20, -10**20, 10**30, -10**30] # Extreme values\nshapiro_result = stats.shapiro(data)", 8 | "old_output": "", 9 | "new_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 5, in \n shapiro_result = stats.shapiro(data)\n ^^^^^^^^^^^^^^^^^^^\n File \"/home/scipy/scipy/stats/_axis_nan_policy.py\", line 505, in axis_nan_policy_wrapper\n contains_nan = [_contains_nan(sample, nan_policy)[0]\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/home/scipy/scipy/_lib/_util.py\", line 716, in _contains_nan\n if np.issubdtype(type(el), np.number) and np.isnan(el):\n ^^^^^^^^^^^^\nTypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''\n" 10 | }, 11 | "label": "unintended", 12 | "comment": "https://github.com/scipy/scipy/issues/21205" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/scipy/19853.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 19853, 3 | "log_file": "data/old_results/results_scipy_19909_19818.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Corner Case Example 7: Calculating mean on an empty sparse matrix\nfrom scipy.sparse import coo_matrix\n\nA = coo_matrix((0, 0))\n\nA.mean()", 8 | "old_output": "/home/scipy/scipy/sparse/_base.py:712: RuntimeWarning: divide by zero encountered in divide\n return self.astype(np.float64)._mul_scalar(1./other)\n", 9 | "new_output": "Traceback (most recent call 
last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 6, in \n A.mean()\n File \"/home/scipy/scipy/sparse/_base.py\", line 1209, in mean\n return (inter_self / (self.shape[0] * self.shape[1]))\\\n ~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n File \"/home/scipy/scipy/sparse/_base.py\", line 752, in __truediv__\n return self._divide(other, true_divide=True)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/home/scipy/scipy/sparse/_base.py\", line 716, in _divide\n return self.astype(np.float64)._mul_scalar(1./other)\n ~~^~~~~~\nZeroDivisionError: float division by zero\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /data/ground_truth/marshmallow/2246.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 2246, 3 | "log_file": "data/results/marshmallow/2246_2024-11-23 09:16:17.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 1: Field Declaration as a Class (Expecting TypeError)\nfrom marshmallow import Schema, fields\n\nclass BadUserSchema(Schema):\n name = fields.String # Incorrect: declaring as class, should be an instance\n\ntry:", 8 | "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'try' statement on line 7 (BugGPT_test_code.py, line 7)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n", 9 | "new_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'try' statement on line 7 (BugGPT_test_code.py, line 7)\n" 10 | }, 11 | "label": "intended", 12 | "comment": "different order of error messages" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python 3 | { 4 | "name": "Testora-dev", 5 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bookworm", 7 | "features": { 8 | "ghcr.io/devcontainers/features/docker-in-docker:2": { 9 | "moby": true, 10 | "azureDnsAutoDetection": true, 11 | "installDockerBuildx": true, 12 | "installDockerComposeSwitch": true, 13 | "version": "latest", 14 | "dockerDashComposeVersion": "latest" 15 | } 16 | }, 17 | 18 | // Features to add to the dev container. More info: https://containers.dev/features. 19 | // "features": {}, 20 | 21 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 22 | // "forwardPorts": [], 23 | 24 | // Use 'postCreateCommand' to run commands after the container is created. 25 | "postCreateCommand": "./.devcontainer/postCreateCommands.sh", 26 | 27 | // Configure tool-specific properties. 28 | // "customizations": {}, 29 | 30 | // Uncomment to connect as root instead. 
More info: https://aka.ms/dev-containers-non-root. 31 | // "remoteUser": "root" 32 | 33 | // keep containers running (useful for long-running experiments on servers) 34 | "shutdownAction": "none" 35 | } 36 | -------------------------------------------------------------------------------- /data/ground_truth/scipy/21633.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21633, 3 | "log_file": "data/results/scipy/21633_2024-11-23 09:06:44.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 1:\nimport numpy as np\nfrom scipy.linalg import kron\n\nA = np.array([[1, 2], [3, 4]])\nB = np.array([[0, 5], [6, 7]])\nresult = kron(A, B)", 8 | "old_output": "", 9 | "new_output": "/tmp/BugGPT/BugGPT_test_code.py:7: DeprecationWarning: `kron` has been deprecated in favour of `numpy.kron` in SciPy 1.15.0 and will be removed in SciPy 1.17.0.\n result = kron(A, B)\n" 10 | }, 11 | "label": "intended", 12 | "comment": "new code prints deprecation warning, as intended by the PR" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "# Example 10:\nimport numpy as np\nfrom scipy.linalg import kron\n\nA = np.array([[1, 0], [0, 1]])\nB = np.array([[2, 2], [2, 2]])\nresult = kron(A, B)", 17 | "old_output": "", 18 | "new_output": "/tmp/BugGPT/BugGPT_test_code.py:7: DeprecationWarning: `kron` has been deprecated in favour of `numpy.kron` in SciPy 1.15.0 and will be removed in SciPy 1.17.0.\n result = kron(A, B)\n" 19 | }, 20 | "label": "intended", 21 | "comment": "new code prints deprecation warning, as intended by the PR" 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /src/testora/evaluation/TestFailureInspector.py: -------------------------------------------------------------------------------- 1 | # Helper script to inspect logs of test failures and identify the root cause of the failure 2 | 3 | from collections import Counter 4 | from testora.evaluation.ResultsManager import result_files 5 | from testora.util.LogParser import parse_log_files 6 | 7 | pr_results, _ = parse_log_files(result_files()) 8 | 9 | error_ctr = Counter() 10 | 11 | for pr_result in pr_results: 12 | if pr_result.nb_test_failures > 0: 13 | for entry in pr_result.entries: 14 | if entry["message"] == "Test execution" and "Traceback (most recent call last)" in entry["output"]: 15 | # print(entry["output"]) 16 | last_line = entry["output"].split("\n")[-2:-1][0] 17 | # print(last_line) 18 | # print("--------------------------------------------\n") 19 | if "Error" in last_line: 20 | error_type = last_line.split(":")[0] 21 | error_ctr[error_type] += 1 22 | 23 | if "NameError" in last_line: 24 | # print(last_line) 25 | print(entry["code"]) 26 | print(">>>") 27 | print(entry["output"]) 28 | print("--------------------------------------------\n") 29 | 30 | 31 | print("\n\n\n") 32 | 33 | for error_type, count in error_ctr.most_common(): 34 | print(f"{error_type}: {count}") 35 | 36 | 37 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "name": "RegressionFinder", 5 | "type": "debugpy", 6 | "request": "launch", 7 | "module": "testora.RegressionFinder", 8 | }, 9 | { 10 | "name": "RegressionFinder --db", 11 | "type": "debugpy", 12 | "request": "launch", 13 | "module": "testora.RegressionFinder", 14 | "args": ["--db"], 15 | }, 16 | { 17 | "name": 
"RegressionFinderOnePR", 18 | "type": "debugpy", 19 | "request": "launch", 20 | "module": "testora.RegressionFinder", 21 | "args": ["--project", "scipy", "--pr", "21768"], 22 | }, 23 | { 24 | "name": "EvalTaskManager", 25 | "type": "debugpy", 26 | "request": "launch", 27 | "module": "testora.evaluation.EvalTaskManager", 28 | "args": ["--fetch", "--classification"], 29 | }, 30 | { 31 | "name": "ClassificationEvaluator", 32 | "type": "debugpy", 33 | "request": "launch", 34 | "module": "testora.evaluation.ClassificationEvaluator", 35 | "args": ["--evaluate"], 36 | }, 37 | { 38 | "name": "ClassificationResultsSummarizer", 39 | "type": "debugpy", 40 | "request": "launch", 41 | "module": "testora.evaluation.ClassificationResultsSummarizer", 42 | }, 43 | ] 44 | } -------------------------------------------------------------------------------- /src/testora/evaluation/sql/tasks.sql: -------------------------------------------------------------------------------- 1 | -- phpMyAdmin SQL Dump 2 | -- version 5.2.1 3 | -- https://www.phpmyadmin.net/ 4 | -- 5 | -- Host: sql141.your-server.de 6 | -- Generation Time: Mar 20, 2025 at 10:00 AM 7 | -- Server version: 10.11.11-MariaDB-hetzner1 8 | -- PHP Version: 8.0.30 9 | 10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; 11 | START TRANSACTION; 12 | SET time_zone = "+00:00"; 13 | 14 | 15 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 16 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 17 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 18 | /*!40101 SET NAMES utf8mb4 */; 19 | 20 | -- 21 | -- Database: `regression_finder_db` 22 | -- 23 | 24 | -- -------------------------------------------------------- 25 | 26 | -- 27 | -- Table structure for table `tasks` 28 | -- 29 | 30 | CREATE TABLE `tasks` ( 31 | `project` varchar(31) NOT NULL, 32 | `pr` int(11) NOT NULL, 33 | `worker` text DEFAULT NULL, 34 | `result` longtext DEFAULT NULL, 35 | `timestamp` timestamp NOT NULL DEFAULT current_timestamp() 36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; 37 | 38 | -- 39 | -- Indexes for dumped tables 40 | -- 41 | 42 | -- 43 | -- Indexes for table `tasks` 44 | -- 45 | ALTER TABLE `tasks` 46 | ADD PRIMARY KEY (`project`,`pr`,`timestamp`); 47 | COMMIT; 48 | 49 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 50 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 51 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 52 | -------------------------------------------------------------------------------- /.devcontainer/setup_transformers.sh: -------------------------------------------------------------------------------- 1 | 2 | echo "Creating directory for clones" 3 | cd .. 4 | sudo mkdir clones 5 | sudo chown vscode:vscode clones/ 6 | cd clones 7 | 8 | echo "Cleaning any existing transformer containers" 9 | docker rm -f transformers-dev1 10 | docker rm -f transformers-dev2 11 | docker rm -f transformers-dev3 12 | 13 | mkdir clone1 14 | cd clone1 15 | 16 | echo "Creating first clone of transformers" 17 | git clone https://github.com/huggingface/transformers.git 18 | cd transformers 19 | echo "Building dev container for transformers (first clone)" 20 | docker run -t -d --name transformers-dev1 -v ${PWD}:/home/transformers python:3.10 21 | docker exec -w /home/transformers transformers-dev1 pip install -e ".[dev]" 22 | echo "Done with first clone" 23 | 24 | echo "Creating second clone of transformers" 25 | cd ../.. 
26 | cp -r clone1 clone2 27 | cd clone2/transformers 28 | echo "Building dev container for transformers (second clone)" 29 | docker run -t -d --name transformers-dev2 -v ${PWD}:/home/transformers python:3.10 30 | docker exec -w /home/transformers transformers-dev2 pip install -e ".[dev]" 31 | echo "Done with second clone" 32 | 33 | echo "Creating third clone of transformers" 34 | cd ../.. 35 | cp -r clone1 clone3 36 | cd clone3/transformers 37 | echo "Building dev container for transformers (third clone)" 38 | docker run -t -d --name transformers-dev3 -v ${PWD}:/home/transformers python:3.10 39 | docker exec -w /home/transformers transformers-dev3 pip install -e ".[dev]" 40 | echo "Done with third clone" 41 | 42 | cd ../../../Testora 43 | 44 | 45 | -------------------------------------------------------------------------------- /src/testora/prompts/UndefinedRefsFixingPrompt.py: -------------------------------------------------------------------------------- 1 | class UndefinedRefsFixingPrompt: 2 | def __init__(self, code, undefined_refs): 3 | self.code = code 4 | self.undefined_refs = undefined_refs 5 | self.use_json_output = False 6 | 7 | def create_prompt(self): 8 | instruction_single = """ 9 | This Python code has an undefined reference to: . Fix it. 10 | 11 | ```python 12 | 13 | ``` 14 | 15 | Respond only with Python code wrapped into ```python ... ```. Give no explanations. 16 | """ 17 | 18 | instruction_multiple = """ 19 | This Python code has undefined references to: . Fix it. 20 | 21 | ```python 22 | 23 | ``` 24 | 25 | Respond only with Python code wrapped into ```python ... ```. Give no explanations. 26 | """ 27 | if len(self.undefined_refs) == 1: 28 | prompt = instruction_single.replace( 29 | "", self.undefined_refs[0]) 30 | else: 31 | prompt = instruction_multiple.replace( 32 | "", ", ".join(self.undefined_refs)) 33 | 34 | prompt = prompt.replace("", self.code) 35 | 36 | return prompt 37 | 38 | def parse_answer(self, raw_answer): 39 | code = "" 40 | in_code = False 41 | for line in raw_answer.split("\n"): 42 | if line.strip() == "```": 43 | break 44 | if in_code: 45 | code += line + "\n" 46 | if line == "```python" or line.startswith("import"): 47 | in_code = True 48 | return code 49 | -------------------------------------------------------------------------------- /data/ground_truth/scipy/19861.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 19861, 3 | "log_file": "data/old_results/results_scipy_19909_19818.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 13: Corner case with empty string arrays\nimport numpy as np\nfrom scipy.io import savemat, loadmat\nimport tempfile\n\nwith tempfile.TemporaryDirectory() as tmpdirname:\n empty_string_array = np.array([\"\", \"\"])\n data = {\"empty_string_array\": empty_string_array}\n savemat(f\"{tmpdirname}/empty_string.mat\", data, format=\"4\", oned_as='row')", 8 | "old_output": "", 9 | "new_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 9, in \n savemat(f\"{tmpdirname}/empty_string.mat\", data, format=\"4\", oned_as='row')\n File \"/home/scipy/scipy/io/matlab/_mio.py\", line 301, in savemat\n MW.put_variables(mdict)\n File \"/home/scipy/scipy/io/matlab/_mio4.py\", line 624, in put_variables\n self._matrix_writer.write(var, name)\n File \"/home/scipy/scipy/io/matlab/_mio4.py\", line 522, in write\n self.write_char(arr, name)\n File \"/home/scipy/scipy/io/matlab/_mio4.py\", line 565, in 
write_char\n arr = np.ndarray(shape=dims, dtype='S1', buffer=st)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\nTypeError: buffer is too small for requested array\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "not 100% sure; could also be a regression; intended behavior is hard to guess" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /src/testora/evaluation/sql/classification_tasks.sql: -------------------------------------------------------------------------------- 1 | -- phpMyAdmin SQL Dump 2 | -- version 5.2.1 3 | -- https://www.phpmyadmin.net/ 4 | -- 5 | -- Host: sql141.your-server.de 6 | -- Generation Time: Mar 20, 2025 at 10:00 AM 7 | -- Server version: 10.11.11-MariaDB-hetzner1 8 | -- PHP Version: 8.0.30 9 | 10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; 11 | START TRANSACTION; 12 | SET time_zone = "+00:00"; 13 | 14 | 15 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 16 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 17 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 18 | /*!40101 SET NAMES utf8mb4 */; 19 | 20 | -- 21 | -- Database: `regression_finder_db` 22 | -- 23 | 24 | -- -------------------------------------------------------- 25 | 26 | -- 27 | -- Table structure for table `classification_tasks` 28 | -- 29 | 30 | CREATE TABLE `classification_tasks` ( 31 | `project` varchar(31) NOT NULL, 32 | `pr` int(11) NOT NULL, 33 | `worker` text DEFAULT NULL, 34 | `result` longtext DEFAULT NULL, 35 | `timestamp` timestamp NOT NULL DEFAULT current_timestamp() 36 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; 37 | 38 | -- 39 | -- Indexes for dumped tables 40 | -- 41 | 42 | -- 43 | -- Indexes for table `classification_tasks` 44 | -- 45 | ALTER TABLE `classification_tasks` 46 | ADD PRIMARY KEY (`project`,`pr`,`timestamp`); 47 | COMMIT; 48 | 49 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 50 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 51 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 52 | -------------------------------------------------------------------------------- /src/testora/Config.py: -------------------------------------------------------------------------------- 1 | from testora.util.Logs import Event, append_event 2 | 3 | # KEEP THIS AT THE TOP: needed to log the current configuration 4 | initial_globals = set(globals().keys()) 5 | 6 | # analyze only PRs that have code changes in files in specific programming languages 7 | code_change_pl = "all" # "all" or "python" 8 | 9 | # analyze only PRs with a single parent 10 | single_parent_PRs_only = False 11 | 12 | # use program merger to merge programs 13 | use_program_merger = False 14 | 15 | # filter PRs based on LLM-provided risk assessment 16 | llm_risk_assessment = False 17 | 18 | # try to fix undefined references in generated tests 19 | fix_undefined_refs = True 20 | 21 | # model_version = "gpt-3.5-turbo-0125" 22 | # model_version = "gpt-4-0125-preview" 23 | # model_version = "gpt-4o-mini-2024-07-18" 24 | model_version = "gpt-5-mini-2025-08-07" 25 | # model_version = "gpt-4o-2024-08-06" 26 | # model_version = "deepseek/deepseek-r1" 27 | 28 | 29 | # OpenAI's default: 1.0 30 | classification_temp = 1.0 31 | # DeepSeek's recommended default: 0.6 32 | # classification_temp = 0.6 33 | 34 | # different prompts 35 | test_generation_prompt_version = 2 36 | undefined_refs_fixing_prompt_version = 2 37 | 
classification_prompt_version = 7 38 | 39 | # KEEP THIS AT THE END: log the current configuration 40 | current_globals = set(globals().keys()) 41 | config_parameters = current_globals - initial_globals 42 | config_parameters = config_parameters - \ 43 | {"initial_globals", "current_globals", "config_parameters"} 44 | config_dict = {p: v for p, v in globals().items() if p in config_parameters} 45 | append_event( 46 | Event(pr_nb=0, message=f"Configuration: {config_dict}")) 47 | -------------------------------------------------------------------------------- /.devcontainer/setup_pytorch_geometric.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing pytorch_geometric-dev containers" 10 | docker rm -f pytorch_geometric-dev1 11 | docker rm -f pytorch_geometric-dev2 12 | docker rm -f pytorch_geometric-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of pytorch_geometric" 18 | git clone https://github.com/pyg-team/pytorch_geometric.git 19 | cd pytorch_geometric 20 | echo "Building dev container for pytorch_geometric (first clone)" 21 | docker run -t -d --name pytorch_geometric-dev1 -v ${PWD}:/home/pytorch_geometric python:3.10 22 | docker exec -w /home/pytorch_geometric pytorch_geometric-dev1 pip install -e '.[dev,full]' 23 | echo "Done with first clone" 24 | 25 | ##### 26 | echo "Creating second clone of pytorch_geometric" 27 | cd ../.. 28 | cp -r clone1 clone2 29 | cd clone2/pytorch_geometric 30 | echo "Building dev container for pytorch_geometric (second clone)" 31 | docker run -t -d --name pytorch_geometric-dev2 -v ${PWD}:/home/pytorch_geometric python:3.10 32 | docker exec -w /home/pytorch_geometric pytorch_geometric-dev2 pip install -e '.[dev,full]' 33 | echo "Done with second clone" 34 | 35 | echo "Creating third clone of pytorch_geometric" 36 | cd ../.. 37 | cp -r clone1 clone3 38 | cd clone3/pytorch_geometric 39 | echo "Building dev container for pytorch_geometric (third clone)" 40 | docker run -t -d --name pytorch_geometric-dev3 -v ${PWD}:/home/pytorch_geometric python:3.10 41 | docker exec -w /home/pytorch_geometric pytorch_geometric-dev3 pip install -e '.[dev,full]' 42 | echo "Done with third clone" 43 | 44 | cd ../../../Testora -------------------------------------------------------------------------------- /.devcontainer/setup_keras.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing keras-dev containers" 10 | docker rm -f keras-dev1 11 | docker rm -f keras-dev2 12 | docker rm -f keras-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of keras" 18 | git clone https://github.com/keras-team/keras.git 19 | cd keras 20 | echo "Building dev container for keras (first clone)" 21 | docker run -t -d --name keras-dev1 -v ${PWD}:/home/keras python:3.10 22 | docker exec -w /home/keras keras-dev1 pip install -r requirements.txt 23 | docker exec -w /home/keras keras-dev1 pip install -e ./ 24 | docker exec -w /home/keras keras-dev1 pip install coverage 25 | echo "Done with first clone" 26 | 27 | echo "Creating second clone of keras" 28 | cd ../.. 
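`Config.py` above logs its own settings by snapshotting `globals()` before and after the parameter block and diffing the two key sets. A minimal standalone sketch of the same pattern, with `print` standing in for `append_event`:

```python
# Minimal reproduction of the globals-snapshot logging pattern in Config.py.
initial_globals = set(globals().keys())

# --- configuration parameters (examples) ---
code_change_pl = "all"
fix_undefined_refs = True
model_version = "gpt-5-mini-2025-08-07"
# -------------------------------------------

current_globals = set(globals().keys())
config_parameters = current_globals - initial_globals
config_parameters -= {"initial_globals", "current_globals", "config_parameters"}
config_dict = {p: v for p, v in globals().items() if p in config_parameters}
print(f"Configuration: {config_dict}")
```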
29 | cp -r clone1 clone2 30 | cd clone2/keras 31 | echo "Building dev container for keras (second clone)" 32 | docker run -t -d --name keras-dev2 -v ${PWD}:/home/keras python:3.10 33 | docker exec -w /home/keras keras-dev2 pip install -r requirements.txt 34 | docker exec -w /home/keras keras-dev2 pip install -e ./ 35 | docker exec -w /home/keras keras-dev2 pip install coverage 36 | echo "Done with second clone" 37 | 38 | echo "Creating third clone of keras" 39 | cd ../.. 40 | cp -r clone1 clone3 41 | cd clone3/keras 42 | echo "Building dev container for keras (third clone)" 43 | docker run -t -d --name keras-dev3 -v ${PWD}:/home/keras python:3.10 44 | docker exec -w /home/keras keras-dev3 pip install -r requirements.txt 45 | docker exec -w /home/keras keras-dev3 pip install -e ./ 46 | docker exec -w /home/keras keras-dev3 pip install coverage 47 | echo "Done with third clone" 48 | 49 | cd ../../../Testora -------------------------------------------------------------------------------- /.devcontainer/setup_marshmallow.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing marshmallow-dev containers" 10 | docker rm -f marshmallow-dev1 11 | docker rm -f marshmallow-dev2 12 | docker rm -f marshmallow-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of marshmallow" 18 | git clone https://github.com/marshmallow-code/marshmallow.git 19 | cd marshmallow 20 | echo "Building dev container for marshmallow (first clone)" 21 | docker run -t -d --name marshmallow-dev1 -v ${PWD}:/home/marshmallow python:3.10 22 | docker exec -w /home/marshmallow marshmallow-dev1 pip install -e '.[dev]' 23 | docker exec -w /home/marshmallow marshmallow-dev1 pip install coverage 24 | echo "Done with first clone" 25 | 26 | ##### 27 | echo "Creating second clone of marshmallow" 28 | cd ../.. 29 | cp -r clone1 clone2 30 | cd clone2/marshmallow 31 | echo "Building dev container for marshmallow (second clone)" 32 | docker run -t -d --name marshmallow-dev2 -v ${PWD}:/home/marshmallow python:3.10 33 | docker exec -w /home/marshmallow marshmallow-dev2 pip install -e '.[dev]' 34 | docker exec -w /home/marshmallow marshmallow-dev2 pip install coverage 35 | echo "Done with second clone" 36 | 37 | echo "Creating third clone of marshmallow" 38 | cd ../.. 39 | cp -r clone1 clone3 40 | cd clone3/marshmallow 41 | echo "Building dev container for marshmallow (third clone)" 42 | docker run -t -d --name marshmallow-dev3 -v ${PWD}:/home/marshmallow python:3.10 43 | docker exec -w /home/marshmallow marshmallow-dev3 pip install -e '.[dev]' 44 | docker exec -w /home/marshmallow marshmallow-dev3 pip install coverage 45 | echo "Done with third clone" 46 | 47 | cd ../../../Testora -------------------------------------------------------------------------------- /src/multilspy/multilspy_logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Multilspy logger module. 
3 | """ 4 | import inspect 5 | import logging 6 | from datetime import datetime 7 | from pydantic import BaseModel 8 | 9 | class LogLine(BaseModel): 10 | """ 11 | Represents a line in the Multilspy log 12 | """ 13 | 14 | time: str 15 | level: str 16 | caller_file: str 17 | caller_name: str 18 | caller_line: int 19 | message: str 20 | 21 | class MultilspyLogger: 22 | """ 23 | Logger class 24 | """ 25 | 26 | def __init__(self) -> None: 27 | self.logger = logging.getLogger("multilspy") 28 | self.logger.setLevel(logging.INFO) 29 | 30 | def log(self, debug_message: str, level: int, sanitized_error_message: str = "") -> None: 31 | """ 32 | Log the debug and santized messages using the logger 33 | """ 34 | 35 | debug_message = debug_message.replace("'", '"').replace("\n", " ") 36 | sanitized_error_message = sanitized_error_message.replace("'", '"').replace("\n", " ") 37 | 38 | # Collect details about the callee 39 | curframe = inspect.currentframe() 40 | calframe = inspect.getouterframes(curframe, 2) 41 | caller_file = calframe[1][1].split("/")[-1] 42 | caller_line = calframe[1][2] 43 | caller_name = calframe[1][3] 44 | 45 | # Construct the debug log line 46 | debug_log_line = LogLine( 47 | time=str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")), 48 | level=logging.getLevelName(level), 49 | caller_file=caller_file, 50 | caller_name=caller_name, 51 | caller_line=caller_line, 52 | message=debug_message, 53 | ) 54 | 55 | self.logger.log( 56 | level=level, 57 | msg=debug_log_line.json(), 58 | ) 59 | -------------------------------------------------------------------------------- /.devcontainer/setup_scipy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing scipy-dev containers" 10 | docker rm -f scipy-dev1 11 | docker rm -f scipy-dev2 12 | docker rm -f scipy-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of scipy" 18 | git clone https://github.com/scipy/scipy.git 19 | cd scipy 20 | git submodule update --init 21 | echo "Building dev container for scipy (first clone)" 22 | docker run -t -d --name scipy-dev1 -v ${PWD}:/home/scipy python:3.10 23 | docker cp /workspaces/Testora/.devcontainer/setup_scipy_to_run_in_container.sh scipy-dev1:/root/setup.sh 24 | docker exec scipy-dev1 chmod +x /root/setup.sh 25 | docker exec -w /home/scipy scipy-dev1 /root/setup.sh 26 | echo "Done with first clone" 27 | 28 | echo "Creating second clone of scipy" 29 | cd ../.. 30 | cp -r clone1 clone2 31 | cd clone2/scipy 32 | echo "Building dev container for scipy (second clone)" 33 | docker run -t -d --name scipy-dev2 -v ${PWD}:/home/scipy python:3.10 34 | docker cp /workspaces/Testora/.devcontainer/setup_scipy_to_run_in_container.sh scipy-dev2:/root/setup.sh 35 | docker exec scipy-dev2 chmod +x /root/setup.sh 36 | docker exec -w /home/scipy scipy-dev2 /root/setup.sh 37 | echo "Done with second clone" 38 | 39 | echo "Creating third clone of scipy" 40 | cd ../.. 
41 | cp -r clone1 clone3 42 | cd clone3/scipy 43 | echo "Building dev container for scipy (third clone)" 44 | docker run -t -d --name scipy-dev3 -v ${PWD}:/home/scipy python:3.10 45 | docker cp /workspaces/Testora/.devcontainer/setup_scipy_to_run_in_container.sh scipy-dev3:/root/setup.sh 46 | docker exec scipy-dev3 chmod +x /root/setup.sh 47 | docker exec -w /home/scipy scipy-dev3 /root/setup.sh 48 | echo "Done with third clone" 49 | 50 | cd ../../../Testora 51 | -------------------------------------------------------------------------------- /.devcontainer/setup_numpy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing numpy-dev containers" 10 | docker rm -f numpy-dev1 11 | docker rm -f numpy-dev2 12 | docker rm -f numpy-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of numpy" 18 | git clone https://github.com/numpy/numpy.git 19 | cd numpy 20 | git submodule update --init 21 | echo "Building dev container for numpy (first clone)" 22 | docker run -t -d --name numpy-dev1 -v ${PWD}:/home/numpy python:3.10 23 | docker cp /workspaces/Testora/.devcontainer/setup_numpy_to_run_in_container.sh numpy-dev1:/root/setup.sh 24 | docker exec numpy-dev1 chmod +x /root/setup.sh 25 | docker exec -w /home/numpy numpy-dev1 /root/setup.sh 26 | echo "Done with first clone" 27 | 28 | echo "Creating second clone of numpy" 29 | cd ../.. 30 | cp -r clone1 clone2 31 | cd clone2/numpy 32 | echo "Building dev container for numpy (second clone)" 33 | docker run -t -d --name numpy-dev2 -v ${PWD}:/home/numpy python:3.10 34 | docker cp /workspaces/Testora/.devcontainer/setup_numpy_to_run_in_container.sh numpy-dev2:/root/setup.sh 35 | docker exec numpy-dev2 chmod +x /root/setup.sh 36 | docker exec -w /home/numpy numpy-dev2 /root/setup.sh 37 | echo "Done with second clone" 38 | 39 | echo "Creating third clone of numpy" 40 | cd ../.. 41 | cp -r clone1 clone3 42 | cd clone3/numpy 43 | echo "Building dev container for numpy (third clone)" 44 | docker run -t -d --name numpy-dev3 -v ${PWD}:/home/numpy python:3.10 45 | docker cp /workspaces/Testora/.devcontainer/setup_numpy_to_run_in_container.sh numpy-dev3:/root/setup.sh 46 | docker exec numpy-dev3 chmod +x /root/setup.sh 47 | docker exec -w /home/numpy numpy-dev3 /root/setup.sh 48 | echo "Done with third clone" 49 | 50 | 51 | cd ../../../Testora 52 | -------------------------------------------------------------------------------- /.devcontainer/setup_pandas.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing pandas-dev containers" 10 | docker rm -f pandas-dev1 11 | docker rm -f pandas-dev2 12 | docker rm -f pandas-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of pandas" 18 | git clone https://github.com/pandas-dev/pandas.git 19 | cd pandas 20 | git checkout e0398c4 # latest commit that still has a Dockerfile 21 | echo "Building dev container for pandas (first clone)" 22 | docker build -t pandas-dev . 23 | docker run -t -d --name pandas-dev1 -v ${PWD}:/home/pandas pandas-dev 24 | docker exec pandas-dev1 python -m pip install -ve . 
--no-build-isolation --config-settings editable-verbose=true 25 | docker exec pandas-dev1 python -m pip install coverage 26 | echo "Done with first clone" 27 | 28 | echo "Creating second clone of pandas" 29 | cd ../.. 30 | cp -r clone1 clone2 31 | cd clone2/pandas 32 | git checkout e0398c4 # latest commit that still has a Dockerfile 33 | echo "Building dev container for pandas (second clone)" 34 | docker run -t -d --name pandas-dev2 -v ${PWD}:/home/pandas pandas-dev 35 | docker exec pandas-dev2 python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true 36 | docker exec pandas-dev2 python -m pip install coverage 37 | echo "Done with second clone" 38 | 39 | echo "Creating third clone of pandas" 40 | cd ../.. 41 | cp -r clone1 clone3 42 | cd clone3/pandas 43 | git checkout e0398c4 # latest commit that still has a Dockerfile 44 | echo "Building dev container for pandas (third clone)" 45 | docker run -t -d --name pandas-dev3 -v ${PWD}:/home/pandas pandas-dev 46 | docker exec pandas-dev3 python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true 47 | docker exec pandas-dev3 python -m pip install coverage 48 | echo "Done with third clone" 49 | 50 | cd ../../../Testora 51 | -------------------------------------------------------------------------------- /data/ground_truth/scipy/21528.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21528, 3 | "log_file": "data/results/scipy/21528_2024-11-23 09:06:43.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 8:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0], dtype=np.float32)\nresult = logsumexp(a)\nprint(\"logsumexp(a) with dtype np.float32 =\", result)", 8 | "old_output": "logsumexp(a) with dtype np.float32 = 3.4076059644443806\n", 9 | "new_output": "logsumexp(a) with dtype np.float32 = 3.4076061\n" 10 | }, 11 | "label": "intended", 12 | "comment": "floating point (in)accuracy, which seems a legitimate side-effect of the type-related fix" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "# Example 13:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = None\ntry:\n result = logsumexp(a)\nexcept Exception as e:", 17 | "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 8 (BugGPT_test_code.py, line 8)\n/root/conda/envs/scipy-dev/lib/python3.13/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n", 18 | "new_output": "/root/conda/envs/scipy-dev/lib/python3.13/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. 
(no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 8 (BugGPT_test_code.py, line 8)\n" 19 | }, 20 | "label": "intended", 21 | "comment": "irrelevant difference in logging order" 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /src/testora/evaluation/CheckedPRsInspector.py: -------------------------------------------------------------------------------- 1 | # Helper script to find PRs that are in-scope for checking (i.e., that are not discarded because of changing only test files, having to many changes, etc.) 2 | 3 | from testora.evaluation.ResultsManager import result_files_for_project 4 | from testora.util.LogParser import parse_log_files 5 | from testora.evaluation.TargetPRs import project_to_target_prs 6 | 7 | for project in ["keras", "marshmallow", "scipy", "pandas"]: 8 | print(f"Project {project}:") 9 | pr_results, _ = parse_log_files(result_files_for_project(project)) 10 | in_scope_pr_nbs = [] 11 | for pr_result in pr_results: 12 | if pr_result.status() != "ignored": 13 | in_scope_pr_nbs.append(pr_result.number) 14 | print(",\n".join([str(n) for n in sorted(in_scope_pr_nbs)])) 15 | print(f"--> {len(in_scope_pr_nbs)} PRs in-scope\n") 16 | print() 17 | 18 | 19 | print("\n\n===========================\n\n") 20 | 21 | # print new results as csv 22 | minimum_timestamp = "2024-11-22 09:05:00" 23 | print("Project, PR, Generated tests, Executed tests, Diff-covered tests, Failures, Differences") 24 | for project, target_prs in project_to_target_prs().items(): 25 | pr_results, _ = parse_log_files( 26 | result_files_for_project(project, minimum_timestamp)) 27 | for target_pr in target_prs: 28 | pr_result = next( 29 | (r for r in pr_results if r.number == target_pr), None) 30 | if pr_result is None: 31 | entries = [ 32 | project, 33 | str(target_pr) 34 | ] 35 | else: 36 | entries = [ 37 | project, 38 | str(target_pr), 39 | str(pr_result.nb_generated_tests), 40 | str(pr_result.nb_test_executions), 41 | str(pr_result.nb_diff_covered_tests), 42 | str(pr_result.nb_test_failures), 43 | str(pr_result.nb_different_behavior) 44 | ] 45 | print(", ".join(entries)) 46 | -------------------------------------------------------------------------------- /src/testora/prompts/SelectExpectedBehaviorPrompt.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from testora.util.Exceptions import TestoraException 4 | 5 | answer_pattern = re.compile(r"(.*?)", re.DOTALL) 6 | 7 | 8 | class SelectExpectedBehaviorPrompt: 9 | def __init__(self, project_name, test_code, output1, output2, docstrings): 10 | self.project_name = project_name 11 | self.test_code = test_code 12 | self.output1 = output1 13 | self.output2 = output2 14 | self.docstrings = docstrings 15 | self.use_json_output = False 16 | 17 | def create_prompt(self): 18 | template = """ 19 | # Usage Example 20 | The following is a usage example of the {project_name} project: 21 | ```python 22 | {test_code} 23 | ``` 24 | 25 | # Docstrings of Relevant APIs 26 | {docstrings} 27 | 28 | # Possible Outputs 29 | Consider the following two outputs that the above example could produce. 30 | 31 | Output 1: 32 | {output1} 33 | 34 | Output 2: 35 | {output2} 36 | 37 | # Question 38 | Which of the two outputs is the expected behavior of the example? 
Explain your reasoning, and then write in and tags either "Output 1" or "Output 2". 39 | """ 40 | return template.format(project_name=self.project_name, 41 | test_code=self.test_code, 42 | docstrings=self.docstrings, 43 | output1=self.output1, 44 | output2=self.output2) 45 | 46 | def parse_answer(self, raw_answer): 47 | assert type(raw_answer) == list 48 | assert len(raw_answer) == 1 49 | 50 | raw_answer = raw_answer[0] 51 | 52 | match = re.search(answer_pattern, raw_answer) 53 | if match is None: 54 | raise TestoraException("Could not find answer in the response.") 55 | answer = match.group(1) 56 | if answer.strip() == "Output 1": 57 | return 1 58 | elif answer.strip() == "Output 2": 59 | return 2 60 | else: 61 | return 0 62 | -------------------------------------------------------------------------------- /.devcontainer/setup_scikit-learn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Creating directory for clones" 4 | cd .. 5 | sudo mkdir clones 6 | sudo chown vscode:vscode clones/ 7 | cd clones 8 | 9 | echo "Cleaning any existing scikit-learn containers" 10 | docker rm -f scikit-learn-dev1 11 | docker rm -f scikit-learn-dev2 12 | docker rm -f scikit-learn-dev3 13 | 14 | mkdir clone1 15 | cd clone1 16 | 17 | echo "Creating first clone of scikit-learn" 18 | git clone https://github.com/scikit-learn/scikit-learn.git 19 | cd scikit-learn 20 | echo "Building dev container for scikit-learn (first clone)" 21 | docker run -t -d --name scikit-learn-dev1 -v ${PWD}:/home/scikit-learn python:3.10 22 | docker exec -w /home/scikit-learn scikit-learn-dev1 pip install wheel numpy scipy cython meson-python ninja 23 | docker exec -w /home/scikit-learn scikit-learn-dev1 pip install --editable . --verbose --no-build-isolation --config-settings editable-verbose=true 24 | echo "Done with first clone" 25 | 26 | echo "Creating second clone of scikit-learn" 27 | cd ../.. 28 | cp -r clone1 clone2 29 | cd clone2/scikit-learn 30 | echo "Building dev container for scikit-learn (first clone)" 31 | docker run -t -d --name scikit-learn-dev2 -v ${PWD}:/home/scikit-learn python:3.10 32 | docker exec -w /home/scikit-learn scikit-learn-dev2 pip install wheel numpy scipy cython meson-python ninja 33 | docker exec -w /home/scikit-learn scikit-learn-dev2 pip install --editable . --verbose --no-build-isolation --config-settings editable-verbose=true 34 | echo "Done with second clone" 35 | 36 | echo "Creating third clone of scikit-learn" 37 | cd ../.. 38 | cp -r clone1 clone3 39 | cd clone3/scikit-learn 40 | echo "Building dev container for scikit-learn (first clone)" 41 | docker run -t -d --name scikit-learn-dev3 -v ${PWD}:/home/scikit-learn python:3.10 42 | docker exec -w /home/scikit-learn scikit-learn-dev3 pip install wheel numpy scipy cython meson-python ninja 43 | docker exec -w /home/scikit-learn scikit-learn-dev3 pip install --editable . 
--verbose --no-build-isolation --config-settings editable-verbose=true 44 | echo "Done with third clone" 45 | 46 | cd ../../../Testora 47 | -------------------------------------------------------------------------------- /data/ground_truth/marshmallow/2271.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 2271, 3 | "log_file": "data/results/marshmallow/2271_2024-11-23 09:16:17.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 4:\nfrom marshmallow import Schema, fields\n\nclass ManySchema(Schema):\n foo = fields.Str()\n\n class Meta:\n many = True\n\nschema = ManySchema()\nresult = schema.load([{\"foo\": \"bar\"}, {\"foo\": \"baz\"}])", 8 | "old_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 11, in \n result = schema.load([{\"foo\": \"bar\"}, {\"foo\": \"baz\"}])\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 724, in load\n return self._do_load(\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 911, in _do_load\n raise exc\nmarshmallow.exceptions.ValidationError: {'_schema': ['Invalid input type.']}\n", 9 | "new_output": "" 10 | }, 11 | "label": "intended", 12 | "comment": "PR addresses issue that is about supporting 'many=True'" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "# Example 5:\nfrom marshmallow import Schema, fields\n\nclass SampleSchema(Schema):\n code = fields.Int()\n\n class Meta:\n many = True\n\nschema = SampleSchema()\nresult = schema.load([{\"code\": 100}, {\"code\": 200}])", 17 | "old_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 11, in \n result = schema.load([{\"code\": 100}, {\"code\": 200}])\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 724, in load\n return self._do_load(\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 911, in _do_load\n raise exc\nmarshmallow.exceptions.ValidationError: {'_schema': ['Invalid input type.']}\n", 18 | "new_output": "" 19 | }, 20 | "label": "intended", 21 | "comment": "PR addresses issue that is about supporting 'many=True'" 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /src/multilspy/lsp_protocol_handler/lsp_constants.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains constants used in the LSP protocol. 3 | """ 4 | 5 | class LSPConstants: 6 | """ 7 | This class contains constants used in the LSP protocol. 8 | """ 9 | 10 | # the key for uri used to represent paths 11 | URI = "uri" 12 | 13 | # the key for range, which is a from and to position within a text document 14 | RANGE = "range" 15 | 16 | # A key used in LocationLink type, used as the span of the origin link 17 | ORIGIN_SELECTION_RANGE = "originSelectionRange" 18 | 19 | # A key used in LocationLink type, used as the target uri of the link 20 | TARGET_URI = "targetUri" 21 | 22 | # A key used in LocationLink type, used as the target range of the link 23 | TARGET_RANGE = "targetRange" 24 | 25 | # A key used in LocationLink type, used as the target selection range of the link 26 | TARGET_SELECTION_RANGE = "targetSelectionRange" 27 | 28 | # key for the textDocument field in the request 29 | TEXT_DOCUMENT = "textDocument" 30 | 31 | # key used to represent the language a document is in - "java", "csharp", etc. 
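The ground-truth entry for marshmallow PR 2271 above records that `class Meta: many = True` was only honored by `Schema.load` after that PR. A standalone version of the differentiating test; which branch it takes depends on the installed marshmallow version:

```python
# Differentiating test from data/ground_truth/marshmallow/2271.json, made standalone.
from marshmallow import Schema, fields, ValidationError

class ManySchema(Schema):
    foo = fields.Str()

    class Meta:
        many = True  # releases before PR 2271 ignore this on load()

schema = ManySchema()
try:
    result = schema.load([{"foo": "bar"}, {"foo": "baz"}])
    print("loaded:", result)                 # behavior with the PR applied
except ValidationError as e:
    print("validation error:", e.messages)   # {'_schema': ['Invalid input type.']} before the PR
```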
32 | LANGUAGE_ID = "languageId" 33 | 34 | # key used to represent the version of a document (a shared value betwen the client and server) 35 | VERSION = "version" 36 | 37 | # key used to represent the text of a document being sent from the client to the server on open 38 | TEXT = "text" 39 | 40 | # key used to represent a position (line and colnum) within a text document 41 | POSITION = "position" 42 | 43 | # key used to represent the line number of a position 44 | LINE = "line" 45 | 46 | # key used to represent the column number of a position 47 | CHARACTER = "character" 48 | 49 | # key used to represent the changes made to a document 50 | CONTENT_CHANGES = "contentChanges" 51 | 52 | # key used to represent name of symbols 53 | NAME = "name" 54 | 55 | # key used to represent the kind of symbols 56 | KIND = "kind" 57 | 58 | # key used to represent children in document symbols 59 | CHILDREN = "children" 60 | -------------------------------------------------------------------------------- /src/testora/execution/CoverageAnalyzer.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import io 3 | import sqlite3 4 | from coverage.data import CoverageData 5 | 6 | from testora.util.Exceptions import TestoraException 7 | 8 | 9 | @dataclass 10 | class DiffCoverage: 11 | percentage_covered: float 12 | total_modified_lines: int 13 | total_covered_modified_lines: int 14 | 15 | def __str__(self): 16 | return f"Coverage: {self.percentage_covered:.2%} ({self.total_covered_modified_lines}/{self.total_modified_lines})" 17 | 18 | 19 | def summarize_coverage(pr, test_execution, is_old_version): 20 | # get coverage data 21 | tmp_coverage_file = "coverage_report" 22 | with open(tmp_coverage_file, "wb") as f: 23 | f.write(test_execution.coverage_report) 24 | coverage_data = CoverageData(tmp_coverage_file) 25 | coverage_data.read() 26 | 27 | # adapt file paths from the container's file system 28 | project_name = pr.cloned_repo_manager.repo_name 29 | file_prefix = f"/home/{project_name}/" 30 | 31 | # check coverage for modified files 32 | total_modified_lines = 0 33 | total_covered_modified_lines = 0 34 | target_files = pr.non_test_modified_python_files 35 | for target_file in target_files: 36 | # get modified lines 37 | if is_old_version: 38 | modified_lines = pr.old_file_path_to_modified_lines[target_file] 39 | else: 40 | modified_lines = pr.new_file_path_to_modified_lines[target_file] 41 | 42 | # get covered lines 43 | covered_lines = coverage_data.lines(file_prefix + target_file) 44 | if covered_lines is None: 45 | # happens, e.g., when generated test doesn't invoke the tested project 46 | covered_lines = set() 47 | 48 | total_modified_lines += len(modified_lines) 49 | total_covered_modified_lines += len(set(modified_lines) 50 | & set(covered_lines)) 51 | 52 | percentage_covered = total_covered_modified_lines / \ 53 | total_modified_lines if total_modified_lines > 0 else 0 54 | 55 | return DiffCoverage( 56 | percentage_covered, 57 | total_modified_lines, 58 | total_covered_modified_lines) 59 | -------------------------------------------------------------------------------- /data/ground_truth/marshmallow/2123.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 2123, 3 | "log_file": "data/results/marshmallow/2123_2024-11-23 09:16:17.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 7: Setting both relative and absolute, expecting failure\nfrom 
marshmallow import Schema, fields, ValidationError\n\ntry:\n class MySchema(Schema):\n url = fields.Url(relative=True, absolute=False)\n\n schema = MySchema()\n result = schema.load({\"url\": \"http://example.com\"})\nexcept ValidationError as e:", 8 | "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n", 9 | "new_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n" 10 | }, 11 | "label": "intended", 12 | "comment": "different order of error messages" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "from marshmallow import Schema, fields\n\n# Example 3: URL that is not absolute or relative\ntry:\n class MySchema(Schema):\n url = fields.Url(relative=False, absolute=False)\n \n result = MySchema().load({\"url\": \"example.com\"})\nexcept Exception as e:\n print(e) # Should raise a validation error", 17 | "old_output": "{'url': ['Not a valid URL.']}\n", 18 | "new_output": "URL validation cannot set both relative and absolute to False.\n" 19 | }, 20 | "label": "intended", 21 | "comment": "PR mentions that cannot set both 'relative' and 'absolute' to False" 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /templates/pr_result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Result for PR {{ pr_result.number }} 8 | 60 | 61 | 62 | 63 |
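Likewise, the marshmallow PR 2123 entry just above captures a new guard for `fields.Url(relative=False, absolute=False)`: pre-PR releases merely report "Not a valid URL." on load, post-PR releases reject the flag combination itself. A standalone version (again version-dependent):

```python
# Differentiating test from data/ground_truth/marshmallow/2123.json, made standalone.
from marshmallow import Schema, fields

try:
    class MySchema(Schema):
        url = fields.Url(relative=False, absolute=False)

    result = MySchema().load({"url": "example.com"})
    print("loaded:", result)
except Exception as e:
    # before PR 2123: {'url': ['Not a valid URL.']}
    # after  PR 2123: URL validation cannot set both relative and absolute to False.
    print(e)
```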
 64 | … 78 |  [markup lost during extraction] The template body renders, in order: a header "PR{{ pr_result.number }}: {{ pr_result.title }}"; a "Test Case" block showing {{ classification_result.test_code }}; an "Old Output" block showing {{ classification_result.old_output }}; a "New Output" block showing {{ classification_result.new_output }}; and a "Classification: {{ classification_result.classification }}" block showing {{ classification_result.classification_explanation }}.
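`pr_result.html` expects `pr_result` and `classification_result` objects in its render context. The web UI module (`src/testora/webui/WebUI.py`) is not included in this excerpt, so the route below is only a guessed sketch of how the template might be served; the URL pattern and the stub loader are assumptions:

```python
# Guessed Flask route for templates/pr_result.html; WebUI.py is not in this dump.
from types import SimpleNamespace
from flask import Flask, render_template

app = Flask(__name__, template_folder="templates")

def load_pr_result(pr_number: int):
    # Stub standing in for the real log-parsing code (not shown here).
    cr = SimpleNamespace(
        test_code="import scipy",
        old_output="",
        new_output="",
        classification="intended",
        classification_explanation="(stub)",
    )
    return SimpleNamespace(number=pr_number, title="(stub)",
                           classification_results=[cr])

@app.route("/pr/<int:pr_number>/result/<int:idx>")  # URL pattern is an assumption
def show_pr_result(pr_number: int, idx: int):
    pr = load_pr_result(pr_number)
    return render_template("pr_result.html",
                           pr_result=pr,
                           classification_result=pr.classification_results[idx])

if __name__ == "__main__":
    app.run(debug=True)
```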
79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /src/testora/util/PythonLanguageServer.py: -------------------------------------------------------------------------------- 1 | from testora.util.PythonCodeUtil import get_locations_of_calls 2 | from multilspy import SyncLanguageServer 3 | from multilspy.multilspy_config import MultilspyConfig 4 | from multilspy.multilspy_logger import MultilspyLogger 5 | from pathlib import Path 6 | 7 | 8 | class PythonLanguageServer: 9 | def __init__(self, repo_path): 10 | config = MultilspyConfig.from_dict({"code_language": "python"}) 11 | logger = MultilspyLogger() 12 | absolute_repo_path = str(Path(repo_path).resolve()) 13 | self.lsp = SyncLanguageServer.create( 14 | config, logger, absolute_repo_path) 15 | 16 | def get_hover_text(self, file_path, line, column): 17 | with self.lsp.start_server(): 18 | raw_result = self.lsp.request_hover(file_path, line, column) 19 | if type(raw_result) == dict and "contents" in raw_result: 20 | return raw_result["contents"]["value"] 21 | else: 22 | return "" 23 | 24 | 25 | # for testing 26 | if __name__ == "__main__": 27 | code = """import pandas as pd 28 | 29 | series_complex = pd.Series([complex(1,2), complex(3,4)]) 30 | # This will result in an error as rounding is not applicable to complex numbers 31 | try: 32 | rounded_complex = series_complex.round(2) 33 | print(rounded_complex) 34 | except TypeError as e: 35 | print(f"Error: {e}") 36 | """ 37 | call_locations = get_locations_of_calls(code) 38 | 39 | test_path = "/workspaces/clones/clone2/pandas/testora_code/test.py" 40 | repo_path = "/workspaces/clones/clone2/pandas/" 41 | # test_path = "/home/m/research/collabs/Testora/data/repos/pandas_pool/clone2/pandas/testora_code/test.py" 42 | # repo_path = "/home/m/research/collabs/Testora/data/repos/pandas_pool/clone2/pandas/" 43 | 44 | with open(test_path, "w") as f: 45 | f.write(code) 46 | 47 | server = PythonLanguageServer(repo_path) 48 | for call_location in call_locations: 49 | line = call_location.start.line - 1 # LSP lines are 0-based 50 | column = call_location.start.column 51 | r = server.get_hover_text( 52 | test_path, line, column) 53 | print("--------------------------------------------------") 54 | print(r) 55 | print() 56 | -------------------------------------------------------------------------------- /src/testora/prompts/TemperatureExperiment.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | from testora.prompts.RegressionClassificationPromptV2 import RegressionClassificationPromptV2 3 | from testora.prompts.PromptCommon import system_message 4 | 5 | with open(".openai_token", "r") as f: 6 | openai_key = f.read().strip() 7 | 8 | openai = OpenAI(api_key=openai_key) 9 | gpt4o_model = "gpt-4o-2024-05-13" 10 | 11 | with open("data/example_prompts/intended1.txt", "r") as f: 12 | intended_prompt1 = f.read() 13 | 14 | with open("data/example_prompts/intended2.txt", "r") as f: 15 | intended_prompt2 = f.read() 16 | 17 | with open("data/example_prompts/intended3.txt", "r") as f: 18 | intended_prompt3 = f.read() 19 | 20 | with open("data/example_prompts/surprising1.txt", "r") as f: 21 | surprising_prompt1 = f.read() 22 | 23 | with open("data/example_prompts/surprising2.txt", "r") as f: 24 | surprising_prompt2 = f.read() 25 | 26 | 27 | def call_model(prompt, temperature): 28 | completion = openai.chat.completions.create( 29 | model=gpt4o_model, 30 | messages=[ 31 | {"role": "system", 
"content": system_message}, 32 | {"role": "user", "content": prompt} 33 | ], 34 | max_tokens=4096, # 4096 is the maximum token limit for gpt-4-0125-preview 35 | n=1, 36 | temperature=temperature 37 | ) 38 | return completion.choices[0].message.content 39 | 40 | 41 | if __name__ == "__main__": 42 | intended = [intended_prompt1, intended_prompt2, intended_prompt3] 43 | surprising = [surprising_prompt1, surprising_prompt2] 44 | r = RegressionClassificationPromptV2("", "", "", "", "", "") 45 | 46 | for idx, prompt in enumerate(intended): 47 | print(f"Intended prompt {idx + 1}:") 48 | for temperature in [0, 0.2, 0.7, 1.0]: 49 | answer = call_model(prompt, temperature) 50 | is_relevant_change, is_deterministic, is_public, is_legal, is_surprising = r.parse_answer( 51 | [answer]) 52 | print(f" temp={temperature} gives surprising={is_surprising}") 53 | 54 | print() 55 | for idx, prompt in enumerate(surprising): 56 | print(f"Surprising prompt {idx + 1}:") 57 | for temperature in [0, 0.2, 0.7, 1.0]: 58 | answer = call_model(prompt, temperature) 59 | is_relevant_change, is_deterministic, is_public, is_legal, is_surprising = r.parse_answer( 60 | [answer]) 61 | print(f" temp={temperature} gives surprising={is_surprising}") 62 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Analyzed Pull Requests 8 | 30 | 31 | 32 | 33 |

 34 | … 79 |  [markup lost during extraction] The template body renders a "Summary" table (columns: Status, Number) iterating {% for status, nb in summary.items() %} over {{ status }} / {{ nb }}, followed by a "Pull Requests" table (columns: PR, Title, Summary, Time, Status, Classifications) iterating {% for pr_result in pr_results %} over {{ pr_result.number }}, {{ pr_result.title }}, {{ pr_result.summary() }}, {{ pr_result.time_taken }}, {{ pr_result.status() }}, with a nested {% for classification_result in pr_result.classification_results %} loop printing {{ classification_result.classification }}.
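`index.html` above renders a `summary` mapping from status to count. How the real web UI builds it is not shown in this excerpt, but given the `status()` method the template itself calls, a `Counter` over the PR results is the obvious construction; a self-contained sketch with stub objects:

```python
# Self-contained sketch of the `summary` dict consumed by templates/index.html.
from collections import Counter
from types import SimpleNamespace

pr_results = [
    SimpleNamespace(status=lambda: "checked"),
    SimpleNamespace(status=lambda: "ignored"),
    SimpleNamespace(status=lambda: "checked"),
]

summary = dict(Counter(pr.status() for pr in pr_results))
print(summary)  # {'checked': 2, 'ignored': 1}
```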
80 | 81 | 82 | -------------------------------------------------------------------------------- /data/ground_truth/pandas/59810.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 59810, 3 | "log_file": "data/results/pandas/59810_2024-11-23 09:09:42.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 6:\nimport pandas as pd\n\ndf = pd.DataFrame({'x': [1, 2, None], 'y': [2, 2, 3]})\nresult = df.query('x == y or x == None')", 8 | "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nTraceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 5, in \n result = df.query('x == y or x == None')\n File \"/home/pandas/pandas/core/frame.py\", line 4616, in query\n res = self.eval(expr, **kwargs)\n File \"/home/pandas/pandas/core/frame.py\", line 4769, in eval\n return _eval(expr, inplace=inplace, **kwargs)\n File \"/home/pandas/pandas/core/computation/eval.py\", line 366, in eval\n ret = eng_inst.evaluate()\n File \"/home/pandas/pandas/core/computation/engines.py\", line 85, in evaluate\n res = self._evaluate()\n File \"/home/pandas/pandas/core/computation/engines.py\", line 129, in _evaluate\n return ne.evaluate(s, local_dict=scope)\n File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 977, in evaluate\n raise e\n File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 874, in validate\n _names_cache[expr_key] = getExprNames(ex, context, sanitize=sanitize)\n File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 723, in getExprNames\n ex = stringToExpression(text, {}, context, sanitize)\n File \"/usr/local/lib/python3.10/site-packages/numexpr/necompiler.py\", line 309, in stringToExpression\n ex = eval(c, names)\n File \"\", line 1, in \n File \"/usr/local/lib/python3.10/site-packages/numexpr/expressions.py\", line 80, in func\n raise TypeError(\"unsupported object type: %s\" % type(x))\nTypeError: unsupported object type: \n", 9 | "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\n" 10 | }, 11 | "label": "coincidental fix", 12 | "comment": "Positive side-effect of the fix. The new version uses the Python evaluation of queries, which doesn't raise an error but handles the None value correctly." 
13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /src/testora/util/Logs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from datetime import datetime, timedelta 4 | import atexit 5 | from typing import List, Optional 6 | from pydantic import BaseModel 7 | 8 | from testora.util.ClassificationResult import Classification 9 | 10 | 11 | class Event(BaseModel): 12 | timestamp: str = "" 13 | pr_nb: int 14 | message: str 15 | 16 | 17 | class PREvent(Event): 18 | title: str 19 | url: str 20 | 21 | 22 | class TestExecutionEvent(Event): 23 | code: str 24 | output: str 25 | 26 | 27 | class ComparisonEvent(Event): 28 | test_code: str 29 | old_output: str 30 | new_output: str 31 | 32 | 33 | class PreClassificationEvent(Event): 34 | test_code: str 35 | old_output: str 36 | new_output: str 37 | 38 | 39 | class ClassificationEvent(Event): 40 | test_code: str 41 | old_output: str 42 | new_output: str 43 | classification: Classification 44 | classification_explanation: str 45 | old_is_crash: bool 46 | new_is_crash: bool 47 | 48 | 49 | class SelectBehaviorEvent(Event): 50 | expected_output: int 51 | 52 | 53 | class LLMEvent(Event): 54 | content: str 55 | 56 | 57 | class ErrorEvent(Event): 58 | details: str 59 | 60 | 61 | class CoverageEvent(Event): 62 | details: str 63 | 64 | 65 | class ClassifierEvalEvent(Event): 66 | label: str 67 | predictions: str 68 | 69 | 70 | events: List[Event] = [] 71 | last_time_stored = datetime.now() 72 | last_file_stored_to: Optional[str] = None 73 | 74 | 75 | def append_event(evt): 76 | global last_time_stored 77 | 78 | evt.timestamp = datetime.now().isoformat() 79 | events.append(evt) 80 | print(json.dumps(evt.dict(), indent=2)) 81 | 82 | if datetime.now() - last_time_stored > timedelta(minutes=5): 83 | store_logs() 84 | last_time_stored = datetime.now() 85 | 86 | 87 | def get_logs_as_json(): 88 | return json.dumps([evt.dict() for evt in events], indent=2) 89 | 90 | 91 | def store_logs(): 92 | global last_file_stored_to 93 | timestamp = datetime.now().isoformat() 94 | event_dicts = [evt.model_dump() for evt in events] 95 | out_file = f"logs_{timestamp}.json" 96 | json.dump(event_dicts, open(out_file, "w"), indent=2) 97 | 98 | # remove previous log from this run 99 | if last_file_stored_to is not None: 100 | os.remove(last_file_stored_to) 101 | last_file_stored_to = out_file 102 | 103 | 104 | def reset_logs(): 105 | global events 106 | events = [] 107 | last_file_stored_to = None 108 | 109 | 110 | def start_logging(): 111 | atexit.register(store_logs) 112 | -------------------------------------------------------------------------------- /src/testora/prompts/PRRegressionBugRanking.py: -------------------------------------------------------------------------------- 1 | import json 2 | from testora.util.Logs import LLMEvent, append_event 3 | 4 | 5 | class PRRegressionBugRanking: 6 | def __init__(self, github_prs, repo_name): 7 | self.github_prs = github_prs 8 | self.repo_name = repo_name 9 | self.use_json_output = True 10 | 11 | def create_prompt(self): 12 | template = """ 13 | The following is a list of titles of pull requests in the project. Rank them by their likelihood to accidentally introduce a regression bug. 14 | 15 | 16 | 17 | Provide your answer using this JSON format: 18 | ```json 19 | { 20 | "high risk": [ 21 | "PR title 1", 22 | "PR title 2", 23 | ... 24 | ], 25 | "medium risk": [ 26 | "PR title 3", 27 | "PR title 4", 28 | ... 
29 | ], 30 | "low risk": [ 31 | "PR title 5", 32 | "PR title 6", 33 | ... 34 | ] 35 | } 36 | ``` 37 | Make sure to include ALL the given pull requests into the output. 38 | """ 39 | pr_titles = [github_pr.title for github_pr in self.github_prs] 40 | return template.replace("", self.repo_name).replace("", "\n".join(pr_titles)) 41 | 42 | def parse_answer(self, raw_answer): 43 | assert type(raw_answer) == list 44 | assert len(raw_answer) == 1 45 | 46 | raw_answer = raw_answer[0] 47 | 48 | try: 49 | risk_to_titles = json.loads(raw_answer) 50 | except json.JSONDecodeError: 51 | return None 52 | 53 | if not isinstance(risk_to_titles, dict): 54 | return None 55 | 56 | if not all(isinstance(risk_to_titles.get(risk), list) for risk in ["high risk", "medium risk", "low risk"]): 57 | return None 58 | 59 | high_risk_titles = set(risk_to_titles.get("high risk")) 60 | medium_risk_titles = set(risk_to_titles.get("medium risk")) 61 | low_risk_titles = set(risk_to_titles.get("low risk")) 62 | 63 | high_risk_prs = [] 64 | medium_risk_prs = [] 65 | low_risk_prs = [] 66 | for github_pr in self.github_prs: 67 | if github_pr.title in high_risk_titles: 68 | high_risk_prs.append(github_pr) 69 | elif github_pr.title in medium_risk_titles: 70 | medium_risk_prs.append(github_pr) 71 | elif github_pr.title in low_risk_titles: 72 | low_risk_prs.append(github_pr) 73 | else: 74 | append_event(LLMEvent( 75 | pr_nb=github_pr.number, message=f"PRRegressionBugRanking omitted a PR title; assuming it's medium-risk", content=github_pr.title)) 76 | medium_risk_prs.append(github_pr) 77 | 78 | return high_risk_prs, medium_risk_prs, low_risk_prs 79 | -------------------------------------------------------------------------------- /templates/pr_log.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Log of PR {{ pr_result.number }} 8 | 51 | 52 | 53 | 54 |
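`PRRegressionBugRanking.parse_answer` above validates the LLM's JSON and maps titles back to PR objects. A standalone exercise of that parsing path with stub PR objects and a hand-written answer (assumes the `testora` package is importable; no LLM call involved):

```python
# Standalone exercise of PRRegressionBugRanking.parse_answer with stub PRs.
from types import SimpleNamespace
from testora.prompts.PRRegressionBugRanking import PRRegressionBugRanking

prs = [SimpleNamespace(number=1, title="Fix off-by-one in parser"),
       SimpleNamespace(number=2, title="Update README")]

prompt = PRRegressionBugRanking(prs, "example/repo")
raw_answer = ['{"high risk": ["Fix off-by-one in parser"], '
              '"medium risk": [], "low risk": ["Update README"]}']
high, medium, low = prompt.parse_answer(raw_answer)
print([p.title for p in high])  # ['Fix off-by-one in parser']
print([p.title for p in low])   # ['Update README']
```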
 55 | … 99 |  [markup lost during extraction] The template body renders a header "PR{{ pr_result.number }}: {{ pr_result.title }}" with "Status: {{ pr_result.status() }}"; a "Perf Stats" table (columns: Event, Count, Time (total), Time (avg)) iterating {% for event, count, total_time, avg_time in perf_stats %}; and a "Log" section that, {% for entry in pr_result.entries %}, renders a Key/Value table over {% for key, value in entry.items() %} showing {{ key }} and {{ value | escape_tags | nl2br | safe }}.
101 | {% endfor %} 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/testora/llms/LLMCache.py: -------------------------------------------------------------------------------- 1 | import fcntl 2 | import json 3 | from os import makedirs 4 | from os.path import join, exists 5 | import atexit 6 | from testora.prompts.PromptCommon import system_message 7 | from testora.util.Logs import append_event, LLMEvent 8 | 9 | cache_base_dir = "./data/llm_cache/" 10 | if not exists(cache_base_dir): 11 | makedirs(cache_base_dir) 12 | 13 | 14 | class LLMCache: 15 | def __init__(self, llm_module): 16 | self.llm_module = llm_module 17 | 18 | name = llm_module.model 19 | cache_dir = join(cache_base_dir, name) 20 | if not exists(cache_dir): 21 | makedirs(cache_dir) 22 | 23 | self.cache_file = join(cache_dir, "cache.json") 24 | if exists(self.cache_file): 25 | with open(self.cache_file, "r") as f: 26 | self.cache = json.load(f) 27 | else: 28 | self.cache = {} 29 | 30 | self.nb_hits = 0 31 | self.nb_misses = 0 32 | 33 | self.nb_unwritten_updates = 0 34 | 35 | atexit.register(lambda: self.write_cache()) 36 | 37 | def write_cache(self): 38 | with open(self.cache_file, "w") as f: 39 | fcntl.flock(f, fcntl.LOCK_EX) 40 | try: 41 | json.dump(self.cache, f) 42 | finally: 43 | fcntl.flock(f, fcntl.LOCK_UN) 44 | print( 45 | f"LLMCache of {self.llm_module.model} with {len(self.cache)} entries saved. {self.nb_hits} hits, {self.nb_misses} misses.") 46 | 47 | def query(self, prompt, nb_samples=1, temperature: float=1, no_cache=False): 48 | prompt_str = prompt.create_prompt() 49 | 50 | # check for cached answer 51 | if not no_cache: 52 | result = self.cache.get(prompt_str) 53 | if result is not None: 54 | cached_answers = [] 55 | if type(result) == str: 56 | cached_answers.append(result) 57 | elif type(result) == list: 58 | cached_answers = result 59 | 60 | if nb_samples <= len(cached_answers): 61 | append_event(LLMEvent(pr_nb=-1, 62 | message=f"Cached result for querying {self.llm_module.model}", 63 | content=f"System message:\n{system_message}\nUser message:\n{prompt.create_prompt()}")) 64 | self.nb_hits += 1 65 | print(f"Prompt:\n{prompt_str}\nReturning cached result\n") 66 | return cached_answers[:nb_samples] 67 | 68 | # no cached answer (or don't want to use cache), query LLM 69 | self.nb_misses += 1 70 | result = self.llm_module.query(prompt, nb_samples=nb_samples, temperature=temperature) 71 | 72 | if no_cache: 73 | return result 74 | 75 | # update cache (only if answer is non-empty) 76 | if result: 77 | self.cache[prompt_str] = result 78 | self.nb_unwritten_updates += 1 79 | 80 | # write cache every 10 updates 81 | if self.nb_unwritten_updates > 10: 82 | self.write_cache() 83 | self.nb_unwritten_updates = 0 84 | 85 | return result 86 | -------------------------------------------------------------------------------- /data/ground_truth/scipy/21597.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 21597, 3 | "log_file": "data/results/scipy/21597_2024-11-23 09:06:44.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 1:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0])\nresult = logsumexp(a)\nprint(\"logsumexp of [1.0, 2.0, 3.0]:\", result)", 8 | "old_output": "logsumexp of [1.0, 2.0, 3.0]: 3.4076059644443806\n", 9 | "new_output": "logsumexp of [1.0, 2.0, 3.0]: 3.40760596444438\n" 10 | }, 11 | "label": "intended", 12 | "comment": "PR is 
about changing the precision of the output" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "# Example 5:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0])\nresult = logsumexp(a, return_sign=True)\nprint(\"logsumexp with sign of [1.0, 2.0, 3.0]:\", result)", 17 | "old_output": "logsumexp with sign of [1.0, 2.0, 3.0]: (np.float64(3.4076059644443806), np.float64(1.0))\n", 18 | "new_output": "logsumexp with sign of [1.0, 2.0, 3.0]: (np.float64(3.40760596444438), np.float64(1.0))\n" 19 | }, 20 | "label": "intended", 21 | "comment": "PR is about changing the precision of the output" 22 | }, 23 | { 24 | "test": { 25 | "test_code": "# Example 1:\nimport numpy as np\nfrom scipy.special import logsumexp\n\na = np.array([1.0, 2.0, 3.0])\nresult = logsumexp(a)\nprint(\"Example 1 - Normal case:\", result)", 26 | "old_output": "Example 1 - Normal case: 3.4076059644443806\n", 27 | "new_output": "Example 1 - Normal case: 3.40760596444438\n" 28 | }, 29 | "label": "intended", 30 | "comment": "PR is about changing the precision of the output" 31 | }, 32 | { 33 | "test": { 34 | "test_code": "import numpy as np\nfrom scipy.special import logsumexp\n\n# Example 5:\n# Return sign\na = np.array([1.0, 2.0, 3.0])\nresult, sign = logsumexp(a, return_sign=True)\nprint(\"Example 5 - Return sign:\", result, sign)", 35 | "old_output": "Example 5 - Return sign: 3.4076059644443806 1.0\n", 36 | "new_output": "Example 5 - Return sign: 3.40760596444438 1.0\n" 37 | }, 38 | "label": "intended", 39 | "comment": "PR is about changing the precision of the output" 40 | }, 41 | { 42 | "test": { 43 | "test_code": "import numpy as np\nfrom scipy.special import logsumexp\n\n# Example 20:\n# Testing different data types\na = np.array([1, 2, 3], dtype=np.int32)\nresult = logsumexp(a.astype(float)) # Convert to float for logsumexp\nprint(\"Example 20 - Different data types:\", result)", 44 | "old_output": "Example 20 - Different data types: 3.4076059644443806\n", 45 | "new_output": "Example 20 - Different data types: 3.40760596444438\n" 46 | }, 47 | "label": "intended", 48 | "comment": "PR is about changing the precision of the output" 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /src/testora/prompts/RegressionTestGeneratorPromptV1.py: -------------------------------------------------------------------------------- 1 | # Prompt for generating regression tests based on a given diff 2 | # V1: As used for ICSE'26 paper 3 | 4 | class RegressionTestGeneratorPromptV1: 5 | def __init__(self, project_name, fut_qualified_names, diff): 6 | self.project_name = project_name 7 | self.fut_qualified_names = fut_qualified_names 8 | self.diff = diff 9 | self.use_json_output = False 10 | 11 | def create_prompt(self): 12 | template = """ 13 | Your task is to generate usage examples of the {project_name} project that expose behavioral differences introduced by the following diff: 14 | 15 | {diff} 16 | 17 | The diff affects the following functions: {fut_qualified_names}. 18 | 19 | The usage examples you create may use only the public API of the {project_name} project. You can assume that the project is installed and ready to be imported. Do NOT use any randomly generated data or timestamps in your examples; instead use fixed or deterministically created inputs. Create usage examples that are diverse and cover a wide range of scenarios, e.g., by (not) passing optional parameters or using different APIs to achieve the same purpose. 
20 | 21 | Answer by giving ten usage examples that cover normal usage scenarios and ten usage examples that focus on corner cases (e.g., unusual values, such as None, NaN or empty lists). 22 | Each example must be an executable piece of Python code, including all necessary imports. 23 | Print all relevant values, including intermediate values, in a human-readable form. 24 | 25 | Wrap each individual example into Python code blocks by using the following output format: 26 | ```python 27 | # Example 1: 28 | ... 29 | ``` 30 | ```python 31 | # Example 2: 32 | ... 33 | ``` 34 | ```python 35 | # Example 3: 36 | ... 37 | ``` 38 | etc. 39 | """ 40 | 41 | return template.format(project_name=self.project_name, 42 | fut_qualified_names=", ".join( 43 | self.fut_qualified_names), 44 | diff=self.diff) 45 | 46 | def remove_unnecessary_indentation(self, code): 47 | lines = code.split("\n") 48 | if len(lines) > 0: 49 | # find number of leading spaces in first line 50 | num_spaces = len(lines[0]) - len(lines[0].lstrip()) 51 | if num_spaces > 0: 52 | return "\n".join([line[num_spaces:] for line in lines]) 53 | return code 54 | 55 | def parse_answer(self, raw_answer): 56 | assert type(raw_answer) == list 57 | 58 | tests = [] 59 | 60 | for answer in raw_answer: 61 | in_code = False 62 | next_test = "" 63 | for line in answer.split("\n"): 64 | if line.strip() == "```": 65 | in_code = False 66 | if next_test: 67 | next_test = self.remove_unnecessary_indentation( 68 | next_test) 69 | tests.append(next_test) 70 | next_test = "" 71 | if in_code: 72 | next_test += line + "\n" 73 | if line.strip() == "```python": 74 | in_code = True 75 | 76 | return tests 77 | -------------------------------------------------------------------------------- /src/multilspy/language_servers/eclipse_jdtls/runtime_dependencies.json: -------------------------------------------------------------------------------- 1 | { 2 | "_description": "This file lists the runtime dependencies for the Java Language Server", 3 | "gradle": { 4 | "platform-agnostic": { 5 | "url": "https://services.gradle.org/distributions/gradle-7.3.3-bin.zip", 6 | "archiveType": "zip", 7 | "relative_extraction_path": "." 
8 | } 9 | }, 10 | "vscode-java": { 11 | "darwin-arm64": { 12 | "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@darwin-arm64-1.23.0.vsix", 13 | "archiveType": "zip", 14 | "relative_extraction_path": "vscode-java" 15 | }, 16 | "darwin-x64": { 17 | "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@darwin-x64-1.23.0.vsix", 18 | "archiveType": "zip", 19 | "relative_extraction_path": "vscode-java" 20 | }, 21 | "linux-arm64": { 22 | "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@linux-arm64-1.23.0.vsix", 23 | "archiveType": "zip", 24 | "relative_extraction_path": "vscode-java" 25 | }, 26 | "linux-x64": { 27 | "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@linux-x64-1.23.0.vsix", 28 | "archiveType": "zip", 29 | "relative_extraction_path": "vscode-java", 30 | "jre_home_path": "extension/jre/17.0.8.1-linux-x86_64", 31 | "jre_path": "extension/jre/17.0.8.1-linux-x86_64/bin/java", 32 | "lombok_jar_path": "extension/lombok/lombok-1.18.30.jar", 33 | "jdtls_launcher_jar_path": "extension/server/plugins/org.eclipse.equinox.launcher_1.6.500.v20230717-2134.jar", 34 | "jdtls_readonly_config_path": "extension/server/config_linux" 35 | }, 36 | "win-x64": { 37 | "url": "https://github.com/redhat-developer/vscode-java/releases/download/v1.23.0/java@win32-x64-1.23.0.vsix", 38 | "archiveType": "zip", 39 | "relative_extraction_path": "vscode-java", 40 | "jre_home_path": "extension/jre/17.0.8.1-win32-x86_64", 41 | "jre_path": "extension/jre/17.0.8.1-win32-x86_64/bin/java.exe", 42 | "lombok_jar_path": "extension/lombok/lombok-1.18.30.jar", 43 | "jdtls_launcher_jar_path": "extension/server/plugins/org.eclipse.equinox.launcher_1.6.500.v20230717-2134.jar", 44 | "jdtls_readonly_config_path": "extension/server/config_win" 45 | } 46 | }, 47 | "intellicode": { 48 | "platform-agnostic": { 49 | "url": "https://VisualStudioExptTeam.gallery.vsassets.io/_apis/public/gallery/publisher/VisualStudioExptTeam/extension/vscodeintellicode/1.2.30/assetbyname/Microsoft.VisualStudio.Services.VSIXPackage", 50 | "alternate_url": "https://marketplace.visualstudio.com/_apis/public/gallery/publishers/VisualStudioExptTeam/vsextensions/vscodeintellicode/1.2.30/vspackage", 51 | "archiveType": "zip", 52 | "relative_extraction_path": "intellicode", 53 | "intellicode_jar_path": "extension/dist/com.microsoft.jdtls.intellicode.core-0.7.0.jar", 54 | "intellisense_members_path": "extension/dist/bundledModels/java_intellisense-members" 55 | } 56 | } 57 | } -------------------------------------------------------------------------------- /data/ground_truth/pandas/59782.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 59782, 3 | "log_file": "data/results/pandas/59782_2024-11-23 09:09:42.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 7:\nimport pandas as pd\n\n# Using a larger dataset and grouping\ndf = pd.DataFrame({\"group\": [\"A\", \"A\", \"B\", \"B\", \"C\", \"C\"],\n \"value\": [pd.Timedelta(1), pd.Timedelta(2), pd.NaT, pd.Timedelta(4), pd.NaT, pd.NaT]})\nresult = df.groupby(\"group\")[\"value\"].any()\nprint(\"Example 7 Result:\\n\", result)", 8 | "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nExample 
7 Result:\n group\nA True\nB True\nC True\nName: value, dtype: bool\n", 9 | "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nExample 7 Result:\n group\nA True\nB True\nC False\nName: value, dtype: bool\n" 10 | }, 11 | "label": "intended", 12 | "comment": "values for C are all NaT, and hence, result should be False" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "# Example 4:\nimport pandas as pd\n\n# Grouping with a single group having only NaT\ndf = pd.DataFrame({\"group\": [\"A\", \"A\", \"B\"], \"value\": [pd.NaT, pd.NaT, pd.Timedelta(1)]})\nresult = df.groupby(\"group\")[\"value\"].any()\nprint(\"Corner Case Example 4 Result:\\n\", result)", 17 | "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 4 Result:\n group\nA True\nB True\nName: value, dtype: bool\n", 18 | "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 4 Result:\n group\nA False\nB True\nName: value, dtype: bool\n" 19 | }, 20 | "label": "intended", 21 | "comment": "values for A are all NaT, and hence, result should be False" 22 | }, 23 | { 24 | "test": { 25 | "test_code": "# Example 7:\nimport pandas as pd\n\n# Grouping with a series of NaT values and expected output check\ndf = pd.DataFrame({\"group\": [\"B\", \"B\", \"C\", \"C\"], \n \"value\": [pd.NaT, pd.NaT, pd.NaT, pd.Timedelta(5)]})\nresult = df.groupby(\"group\")[\"value\"].any()\nprint(\"Corner Case Example 7 Result:\\n\", result)", 26 | "old_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 7 Result:\n group\nB True\nC True\nName: value, dtype: bool\n", 27 | "new_output": "/usr/local/lib/python3.10/site-packages/coverage/inorout.py:457: CoverageWarning: --include is ignored because --source is set (include-ignored)\n self.warn(\"--include is ignored because --source is set\", slug=\"include-ignored\")\nCorner Case Example 7 Result:\n group\nB False\nC True\nName: value, dtype: bool\n" 28 | }, 29 | "label": "intended", 30 | "comment": "values for B are all NaT, and hence, result should be False" 31 | } 32 | ] 33 | } -------------------------------------------------------------------------------- /src/testora/execution/DockerExecutor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import docker 3 | import tarfile 4 | import tempfile 5 | from os.path import join 6 | from os import chdir, getcwd 7 | 8 | 9 | class DockerExecutor: 10 | def __init__(self, container_name, project_name, coverage_files): 11 | client = docker.from_env() 12 | self.container = client.containers.get(container_name) 13 | self.container.start() 14 | 15 | # adapt paths of coverage files to the container's file system 16 | self.coverage_files = [f"/home/{project_name}/{f}" for f in coverage_files] 17 | 18 | 
    def copy_code_to_container(self, code, target_file_path):
19 |         target_dir = target_file_path.rsplit("/", 1)[0]
20 |         target_file_name = target_file_path.rsplit("/", 1)[1]
21 | 
22 |         with tempfile.TemporaryDirectory() as tmp_dir:
23 |             code_file = join(tmp_dir, target_file_name)
24 |             with open(code_file, "w") as f:
25 |                 f.write(code)
26 |             tar_file = join(tmp_dir, "archive.tar")
27 |             with tarfile.open(tar_file, mode="w") as tar:
28 |                 wd = getcwd()
29 |                 try:
30 |                     chdir(tmp_dir)
31 |                     tar.add(target_file_name)
32 |                 finally:
33 |                     chdir(wd)
34 | 
35 |             data = open(tar_file, "rb").read()
36 |             self.container.put_archive(target_dir, data)
37 | 
38 |     def copy_file_from_container(self, file_path_in_container, target_dir):
39 |         data, _ = self.container.get_archive(file_path_in_container)
40 |         temp_tar_file = "temp.tar"
41 |         with open(temp_tar_file, "wb") as f:
42 |             for d in data:
43 |                 f.write(d)
44 | 
45 |         with tarfile.open(temp_tar_file, mode="r") as tar:
46 |             tar.extractall(target_dir)
47 | 
48 |         os.remove(temp_tar_file)
49 | 
50 |     def execute_python_code(self, code):
51 |         # create a fresh directory to get rid of any old state
52 |         self.container.exec_run("rm -rf /tmp/Testora")
53 |         self.container.exec_run("mkdir /tmp/Testora")
54 | 
55 |         self.copy_code_to_container(code, "/tmp/Testora/Testora_test_code.py")
56 |         coverage_files = ",".join(f"\"{f}\"" for f in self.coverage_files)
57 |         # -u to avoid non-deterministic buffering
58 |         command = (
59 |             f"timeout 300s python -u -m coverage run "
60 |             f"--include={coverage_files} "
61 |             f"--data-file /tmp/coverage_report /tmp/Testora/Testora_test_code.py"
62 |         )
63 | 
64 |         # for scipy and numpy, make sure we run inside their dev environment
65 |         if self.container.name.startswith("scipy-dev"):
66 |             command = (
67 |                 f"bash -c 'source /root/conda/etc/profile.d/conda.sh"
68 |                 f" && eval \"$(mamba shell hook --shell bash)\" && mamba activate scipy-dev"
69 |                 f" && {command}'"
70 |             )
71 |         elif self.container.name.startswith("numpy-dev"):
72 |             command = (
73 |                 f"bash -c 'source /root/conda/etc/profile.d/conda.sh"
74 |                 f" && source /root/conda/etc/profile.d/mamba.sh"
75 |                 f" && mamba activate numpy-dev && {command}'"
76 |             )
77 | 
78 |         exec_result = self.container.exec_run(command)
79 |         output = exec_result.output.decode("utf-8")
80 | 
81 |         self.copy_file_from_container(
82 |             "/tmp/coverage_report", ".")
83 |         with open("coverage_report", "rb") as f:
84 |             coverage_report = f.read()
85 | 
86 |         return output, coverage_report
87 | 
88 | 
89 | if __name__ == "__main__":
90 |     code = """
91 | x = 23
92 | 
93 | print(x)
94 | x.foo()
95 | print("never reach this")
96 | """
97 | 
98 |     executor = DockerExecutor("pandas-dev", "pandas", coverage_files=[])
99 |     output = executor.execute_python_code(code)
100 |     print(output)
101 | 
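A minimal usage sketch of the DockerExecutor class above. This is a hypothetical illustration, not repository code: it assumes a container named "pandas-dev" already exists (as in the __main__ block), and the coverage path is an invented example relative to /home/pandas inside the container.

# Hypothetical usage sketch of DockerExecutor; container name and coverage
# path are assumptions for illustration.
from testora.execution.DockerExecutor import DockerExecutor

executor = DockerExecutor(
    container_name="pandas-dev",
    project_name="pandas",
    coverage_files=["pandas/core/generic.py"],  # hypothetical file to track
)
output, coverage_report = executor.execute_python_code(
    "import pandas as pd\nprint(pd.Series([1, 2, 3]).sum())"
)
print(output)                                  # stdout/stderr of the snippet
print(len(coverage_report), "bytes of raw coverage data")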
-------------------------------------------------------------------------------- /data/ground_truth/scipy/21572.json: --------------------------------------------------------------------------------
1 | {
2 |     "pr_number": 21572,
3 |     "log_file": "data/results/scipy/21572_2024-11-23 09:06:43.json",
4 |     "differentiating_tests": [
5 |         {
6 |             "test": {
7 |                 "test_code": "# Example 7:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1, 1, 0], dtype=bool)\nv = np.array([0, 0, 1], dtype=bool)\n\n# Computing Kulczynski 1\nkulczynski_value = distance.kulczynski1(u, v)",
8 |                 "old_output": "",
9 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0. Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n kulczynski_value = distance.kulczynski1(u, v)\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "new version prints deprecation warning, which is what the PR is about"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 2:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([np.nan, 1, 0], dtype=bool)\nv = np.array([0, 1, 1], dtype=bool)\n\n# Computation with NaN\nkulczynski_result = distance.kulczynski1(u, v)",
17 |                 "old_output": "",
18 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0. Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n kulczynski_result = distance.kulczynski1(u, v)\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "new version prints deprecation warning, which is what the PR is about"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Example 3:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1, 1, 1], dtype=bool)\nv = np.array([0, 0, 0], dtype=bool)\n\n# All equal values\nkulczynski_result = distance.kulczynski1(u, v)",
26 |                 "old_output": "",
27 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0. Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n kulczynski_result = distance.kulczynski1(u, v)\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "new version prints deprecation warning, which is what the PR is about"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "# Example 4:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1], dtype=bool)\nv = np.array([0], dtype=bool)\n\n# Single element arrays\nkulczynski_result = distance.kulczynski1(u, v)",
35 |                 "old_output": "",
36 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0. Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n kulczynski_result = distance.kulczynski1(u, v)\n"
37 |             },
38 |             "label": "intended",
39 |             "comment": "new version prints deprecation warning, which is what the PR is about"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "# Example 6:\nimport numpy as np\nfrom scipy.spatial import distance\n\nu = np.array([1, 1, 0, 0], dtype=bool)\nv = np.array([1, 0, 1, 1], dtype=bool)\n\n# Compute distance with mixed values\nkulczynski_result = distance.kulczynski1(u, v)",
44 |                 "old_output": "",
45 |                 "new_output": "/tmp/BugGPT/BugGPT_test_code.py:9: DeprecationWarning: The kulczynski1 metric is deprecated since SciPy 1.15.0 and will be removed in SciPy 1.17.0. Replace usage of 'kulczynski1(u, v)' with '1/jaccard(u, v) - 1'.\n kulczynski_result = distance.kulczynski1(u, v)\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "new version prints deprecation warning, which is what the PR is about"
49 |         }
50 |     ]
51 | }
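A quick sanity check of the identity suggested by the deprecation warning recorded above; this is an illustrative sketch, assuming SciPy >= 1.15 (where kulczynski1 emits the warning) rather than repository code.

# Hedged sanity check: kulczynski1(u, v) == 1/jaccard(u, v) - 1 for boolean
# vectors, as the deprecation message above suggests.
import numpy as np
from scipy.spatial import distance

u = np.array([1, 0, 1, 1], dtype=bool)
v = np.array([1, 1, 0, 1], dtype=bool)
print(distance.kulczynski1(u, v))      # 1.0 (emits a DeprecationWarning on SciPy >= 1.15)
print(1 / distance.jaccard(u, v) - 1)  # 1.0, the suggested replacement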
-------------------------------------------------------------------------------- /src/testora/prompts/RegressionTestGeneratorPromptV2.py: --------------------------------------------------------------------------------
1 | # Prompt for generating regression tests based on a given diff
2 | # V2: Variant of V1 optimized by https://platform.openai.com/chat/edit?models=gpt-5&optimize=true for use with GPT-5
3 | 
4 | class RegressionTestGeneratorPromptV2:
5 |     def __init__(self, project_name, fut_qualified_names, diff):
6 |         self.project_name = project_name
7 |         self.fut_qualified_names = fut_qualified_names
8 |         self.diff = diff
9 |         self.use_json_output = False
10 | 
11 |     def create_prompt(self):
12 |         template = """
13 | Developer: Begin with a concise checklist (3-7 bullets) of what you will do; keep items conceptual, not implementation-level.
14 | 
15 | Your task is to create usage examples for the {project_name} project, specifically designed to highlight behavioral differences introduced by the following diff:
16 | 
17 | {diff}
18 | 
19 | This diff modifies the following functions: {fut_qualified_names}.
20 | 
21 | Instructions:
22 | - Use only the public API from {project_name}. Assume the package is installed and importable.
23 | - Avoid using any randomly generated data or dynamic timestamps. All inputs must be fixed or deterministic.
24 | - Generate a total of 20 executable Python usage examples, each in a separate code block marked with triple backticks.
25 | - The first 10 examples should demonstrate standard/typical usage scenarios.
26 | - The next 10 (examples 11-20) should focus on corner cases and edge conditions, such as unusual values (e.g., None, NaN, empty lists, etc.).
27 | - Each Python code block must:
28 |   - Be self-contained, including all necessary imports.
29 |   - Begin with a comment: e.g., '# Example 1: <short description>'.
30 |   - Include clear print statements for input arguments, outputs, and any intermediate values that help show differences in behavior.
31 |   - If an exception is expected for an edge case, wrap the code in a try/except and print only the exception message (avoid printing stack traces).
32 | - Use the following output format for each example. Failing to wrap the code into backticks will make the result unusable:
33 | ```python
34 | # Example <number>: <short description>
35 | <code>
36 | ```
37 | 
38 | Output Requirements:
39 | - Submit exactly 20 Python code blocks, numbered sequentially from 1 to 20.
40 | - Code blocks 1-10: Standard use cases.
41 | - Code blocks 11-20: Edge/corner cases.
42 | - Every block is executable and prints human-readable inputs and results.
43 | - Exceptions are handled and their messages printed only.
44 | 
45 | After generating all examples, validate that each code block is executable and correctly numbered. If any do not meet the requirements, revise as needed before final submission.
46 | """
47 | 
48 |         return template.format(project_name=self.project_name,
49 |                                fut_qualified_names=", ".join(
50 |                                    self.fut_qualified_names),
51 |                                diff=self.diff)
52 | 
53 |     def remove_unnecessary_indentation(self, code):
54 |         lines = code.split("\n")
55 |         if len(lines) > 0:
56 |             # find number of leading spaces in first line
57 |             num_spaces = len(lines[0]) - len(lines[0].lstrip())
58 |             if num_spaces > 0:
59 |                 return "\n".join([line[num_spaces:] for line in lines])
60 |         return code
61 | 
62 |     def parse_answer(self, raw_answer):
63 |         assert type(raw_answer) == list
64 | 
65 |         tests = []
66 | 
67 |         for answer in raw_answer:
68 |             in_code = False
69 |             next_test = ""
70 |             for line in answer.split("\n"):
71 |                 if line.strip() == "```":
72 |                     in_code = False
73 |                     if next_test:
74 |                         next_test = self.remove_unnecessary_indentation(
75 |                             next_test)
76 |                         tests.append(next_test)
77 |                         next_test = ""
78 |                 if in_code:
79 |                     next_test += line + "\n"
80 |                 if line.strip() == "```python":
81 |                     in_code = True
82 | 
83 |         return tests
84 | 
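A small demo of the fenced-block extraction implemented by parse_answer above. The raw answer string is an invented stand-in for an LLM response; the class and method names are taken from the file itself.

# Hypothetical demo of parse_answer; the raw answer is invented for
# illustration and not real model output.
from testora.prompts.RegressionTestGeneratorPromptV2 import RegressionTestGeneratorPromptV2

prompt = RegressionTestGeneratorPromptV2(
    project_name="demo", fut_qualified_names=["demo.f"], diff="")
raw_answer = [
    "Some text.\n"
    "```python\n# Example 1: addition\nprint(1 + 1)\n```\n"
    "More text.\n"
    "```python\n# Example 2: concatenation\nprint('a' + 'b')\n```\n"
]
tests = prompt.parse_answer(raw_answer)
print(len(tests))  # -> 2
print(tests[0])    # -> '# Example 1: addition\nprint(1 + 1)\n'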
-------------------------------------------------------------------------------- /src/testora/evaluation/FindCandidateProjects.py: --------------------------------------------------------------------------------
1 | import requests
2 | import csv
3 | 
4 | with open(".github_token") as f:
5 |     github_token = f.read().strip()
6 | headers = {'Authorization': f'token {github_token}'}
7 | 
8 | 
9 | def search_repositories(query, sort='stars', order='desc', per_page=100):
10 |     url = 'https://api.github.com/search/repositories'
11 |     params = {
12 |         'q': query,
13 |         'sort': sort,
14 |         'order': order,
15 |         'per_page': per_page
16 |     }
17 |     print(".")
18 |     response = requests.get(url, headers=headers, params=params)
19 |     response.raise_for_status()
20 |     return response.json()
21 | 
22 | 
23 | def fetch_top_python_repos(total_repos=1000, per_page=100):
24 |     url = 'https://api.github.com/search/repositories'
25 | 
26 |     params = {
27 |         'q': 'language:Python',
28 |         'sort': 'stars',
29 |         'order': 'desc',
30 |         'per_page': per_page,
31 |         'page': 1
32 |     }
33 | 
34 |     repos = []
35 | 
36 |     while len(repos) < total_repos:
37 |         print(".")
38 |         response = requests.get(url, headers=headers, params=params)
39 |         response.raise_for_status()  # Raise an exception for HTTP errors
40 |         data = response.json()
41 | 
42 |         repos.extend(data['items'])
43 | 
44 |         if 'next' not in response.links:
45 |             break
46 | 
47 |         params['page'] += 1
48 | 
49 |     return repos[:total_repos]
50 | 
51 | 
52 | def get_pull_requests_count_graphql(owner, repo):
53 |     url = 'https://api.github.com/graphql'
54 | 
55 |     query = """
56 |     query($owner: String!, $repo: String!)
{ 57 | repository(owner: $owner, name: $repo) { 58 | pullRequests { 59 | totalCount 60 | } 61 | } 62 | } 63 | """ 64 | 65 | variables = { 66 | 'owner': owner, 67 | 'repo': repo 68 | } 69 | 70 | print(".") 71 | response = requests.post( 72 | url, json={'query': query, 'variables': variables}, headers=headers) 73 | response.raise_for_status() # Raise an exception for HTTP errors 74 | 75 | data = response.json() 76 | total_prs = data['data']['repository']['pullRequests']['totalCount'] 77 | 78 | return total_prs 79 | 80 | 81 | def get_pull_requests_count(owner, repo): 82 | url = f'https://api.github.com/repos/{owner}/{repo}/pulls' 83 | params = { 84 | 'state': 'all', 85 | 'per_page': 100, 86 | 'page': 1 87 | } 88 | print(".") 89 | response = requests.get(url, headers=headers, params=params) 90 | response.raise_for_status() 91 | 92 | total_prs = 0 93 | total_prs += len(response.json()) 94 | 95 | while 'next' in response.links: 96 | print(".") 97 | response = requests.get(response.links['next']['url'], headers=headers) 98 | response.raise_for_status() 99 | total_prs += len(response.json()) 100 | 101 | return total_prs 102 | 103 | 104 | def main(): 105 | query = 'language:Python' 106 | # repositories = search_repositories(query)['items'] 107 | repositories = fetch_top_python_repos() 108 | 109 | out_file = "candidate_projects2.csv" 110 | with open(out_file, mode='w', newline='') as out_fp: 111 | writer = csv.writer(out_fp) 112 | writer.writerow(['Name', 'Stars', 'PRs', 'Description']) 113 | 114 | print(f"Found {len(repositories)} repositories") 115 | for repo in repositories: 116 | repo_full_name = repo['full_name'] 117 | repo_description = repo['description'] 118 | if not repo_description or "library" not in repo_description.lower(): 119 | print( 120 | f'Skipping {repo_full_name} because it seems to not be a library') 121 | continue 122 | print(f"Counting PRs for {repo_full_name}") 123 | pr_count = get_pull_requests_count_graphql(*repo_full_name.split('/')) 124 | stars = repo['stargazers_count'] 125 | print(f'{repo_full_name} -- {stars} -- {pr_count} -- {repo_description}') 126 | with open(out_file, mode='a', newline='') as out_fp: 127 | writer = csv.writer(out_fp) 128 | writer.writerow([repo_full_name, stars, pr_count, repo_description]) 129 | out_fp.flush() 130 | 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /data/ground_truth/marshmallow/1998.json: -------------------------------------------------------------------------------- 1 | { 2 | "pr_number": 1998, 3 | "log_file": "data/results/marshmallow/1998_2024-11-23 09:16:16.json", 4 | "differentiating_tests": [ 5 | { 6 | "test": { 7 | "test_code": "# Example 2:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n duration = fields.TimeDelta(precision='minutes', serialization_type=float)\n\ndata = {'duration': dt.timedelta(minutes=2)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, minutes):\", result)", 8 | "old_output": "Serialized (float, minutes): {'duration': 2}\n", 9 | "new_output": "Serialized (float, minutes): {'duration': 2.0}\n" 10 | }, 11 | "label": "intended", 12 | "comment": "serialized value now is a float" 13 | }, 14 | { 15 | "test": { 16 | "test_code": "# Example 4:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n duration = fields.TimeDelta(precision='hours', serialization_type=float)\n\ndata = {'duration': dt.timedelta(hours=1, 
seconds=1200)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, hours):\", result)", 17 | "old_output": "Serialized (float, hours): {'duration': 1}\n", 18 | "new_output": "Serialized (float, hours): {'duration': 1.3333333333333333}\n" 19 | }, 20 | "label": "intended", 21 | "comment": "serialized value now is a float" 22 | }, 23 | { 24 | "test": { 25 | "test_code": "# Example 6:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n duration = fields.TimeDelta(precision='weeks', serialization_type=float)\n\ndata = {'duration': dt.timedelta(weeks=1)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, weeks):\", result)", 26 | "old_output": "Serialized (float, weeks): {'duration': 1}\n", 27 | "new_output": "Serialized (float, weeks): {'duration': 1.0}\n" 28 | }, 29 | "label": "intended", 30 | "comment": "serialized value now is a float" 31 | }, 32 | { 33 | "test": { 34 | "test_code": "# Example 8:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n duration = fields.TimeDelta(precision='seconds', serialization_type=float)\n\ndata = {'duration': dt.timedelta(seconds=60.5)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, seconds):\", result)", 35 | "old_output": "Serialized (float, seconds): {'duration': 60}\n", 36 | "new_output": "Serialized (float, seconds): {'duration': 60.5}\n" 37 | }, 38 | "label": "intended", 39 | "comment": "serialized value now is a float" 40 | }, 41 | { 42 | "test": { 43 | "test_code": "# Example 10:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n duration = fields.TimeDelta(precision='milliseconds', serialization_type=float)\n\ndata = {'duration': dt.timedelta(milliseconds=1500)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (float, milliseconds):\", result)", 44 | "old_output": "Serialized (float, milliseconds): {'duration': 1500}\n", 45 | "new_output": "Serialized (float, milliseconds): {'duration': 1500.0}\n" 46 | }, 47 | "label": "intended", 48 | "comment": "serialized value now is a float" 49 | }, 50 | { 51 | "test": { 52 | "test_code": "# Example 14:\nfrom marshmallow import Schema, fields\nimport datetime as dt\n\nclass MySchema(Schema):\n duration = fields.TimeDelta(precision='seconds', serialization_type=float)\n\ndata = {'duration': dt.timedelta(seconds=-1)}\nschema = MySchema()\nresult = schema.dump(data)\nprint(\"Serialized (negative duration):\", result)", 53 | "old_output": "Serialized (negative duration): {'duration': -1}\n", 54 | "new_output": "Serialized (negative duration): {'duration': -1.0}\n" 55 | }, 56 | "label": "intended", 57 | "comment": "serialized value now is a float" 58 | } 59 | ] 60 | } -------------------------------------------------------------------------------- /src/testora/llms/OpenAIGPT.py: -------------------------------------------------------------------------------- 1 | from json import JSONDecodeError 2 | import time 3 | from typing import List 4 | from openai import OpenAI, RateLimitError 5 | from testora.prompts.PromptCommon import system_message 6 | from testora.util.Logs import append_event, LLMEvent 7 | from testora.Config import model_version 8 | 9 | if model_version.startswith("gpt"): 10 | with open(".openai_token", "r") as f: 11 | openai_key = f.read().strip() 12 | openai = OpenAI(api_key=openai_key) 13 | elif model_version.startswith("deepseek"): 14 | with open(".openrouter_token", "r") as f: 15 
| openrouter_key = f.read().strip()
16 |     openai = OpenAI(api_key=openrouter_key,
17 |                     base_url="https://openrouter.ai/api/v1")
18 | 
19 | 
20 | class OpenAIGPT:
21 |     def __init__(self):
22 |         self.model = model_version
23 | 
24 |     def query(self, prompt, nb_samples=1, temperature=1) -> List:
25 |         user_message = prompt.create_prompt()
26 |         if len(user_message) > 30000:
27 |             append_event(LLMEvent(pr_nb=-1,
28 |                                   message="Query too long",
29 |                                   content=f"System message:\n{system_message}\nUser message:\n{user_message}"))
30 |             return [""]
31 | 
32 |         append_event(LLMEvent(pr_nb=-1,
33 |                               message=f"Querying {self.model}",
34 |                               content=f"System message:\n{system_message}\nUser message:\n{user_message}"))
35 | 
36 |         while True:
37 |             try:
38 |                 if prompt.use_json_output:
39 |                     completion = openai.chat.completions.create(
40 |                         model=self.model,
41 |                         messages=[
42 |                             {"role": "system", "content": system_message},
43 |                             {"role": "user", "content": user_message}
44 |                         ],
45 |                         n=nb_samples,
46 |                         response_format={"type": "json_object"},
47 |                         temperature=temperature
48 |                     )
49 | 
50 |                 else:
51 |                     completion = openai.chat.completions.create(
52 |                         model=self.model,
53 |                         messages=[
54 |                             {"role": "system", "content": system_message},
55 |                             {"role": "user", "content": user_message}
56 |                         ],
57 |                         n=nb_samples,
58 |                         temperature=temperature
59 |                     )  # type: ignore[call-overload]
60 | 
61 |                 # handle errors that lead to no model being called
62 |                 if completion.model is None:
63 |                     append_event(LLMEvent(pr_nb=-1,
64 |                                           message="Failed to get completion",
65 |                                           content="Will try again in 1 second"))
66 |                     time.sleep(1)
67 |                     continue
68 | 
69 |                 append_event(LLMEvent(pr_nb=-1,
70 |                                       message="Token usage",
71 |                                       content=f"prompt={completion.usage.prompt_tokens}, completion={completion.usage.completion_tokens}"))
72 | 
73 |                 answers = []
74 |                 for choice in completion.choices:
75 |                     answers.append(choice.message.content)
76 | 
77 |                 # handle errors that lead to empty answers
78 |                 if "" in answers:
79 |                     append_event(LLMEvent(pr_nb=-1,
80 |                                           message="Empty answer",
81 |                                           content="Will try again in 1 second"))
82 |                     time.sleep(1)
83 |                     continue
84 | 
85 |                 return answers
86 | 
87 |             except RateLimitError:
88 |                 append_event(LLMEvent(pr_nb=-1,
89 |                                       message="Rate limit exceeded",
90 |                                       content="Will try again in 60 seconds"))
91 |                 time.sleep(60)
92 |             except JSONDecodeError:
93 |                 append_event(LLMEvent(pr_nb=-1,
94 |                                       message="JSON decode error",
95 |                                       content="Will try again in 1 second"))
96 |                 time.sleep(1)
97 | 
98 |         raise Exception("Should not reach this point")
99 | 
-------------------------------------------------------------------------------- /data/ground_truth/scipy/21629.json: --------------------------------------------------------------------------------
1 | {
2 |     "pr_number": 21629,
3 |     "log_file": "data/results/scipy/21629_2024-11-23 09:06:44.json",
4 |     "differentiating_tests": [
5 |         {
6 |             "test": {
7 |                 "test_code": "# Corner Case Example 2:\nimport numpy as np\nfrom scipy.special import spherical_yn\n\nn = 1\nz = None\ntry:\n result = spherical_yn(n, z)\nexcept Exception as e:\n print(f\"spherical_yn({n}, {z}) raised an exception: {e}\")",
8 |                 "old_output": "spherical_yn(1, None) raised an exception: ufunc '_spherical_yn' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''\n",
9 |                 "new_output": "spherical_yn(1, None) raised an exception: '>=' not supported between instances of 'NoneType' and 'int'\n"
10 |             },
11 |             "label": "intended",
12 |             "comment": "different error messages that both are about the same invalid input"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Corner Case Example 4:\nimport numpy as np\nfrom scipy.special import spherical_kn\n\nn = 0\nz = -1.0\nresult = spherical_kn(n, z)\nprint(f\"spherical_kn({n}, {z}) = {result}\")",
17 |                 "old_output": "spherical_kn(0, -1.0) = nan\n",
18 |                 "new_output": "spherical_kn(0, -1.0) = -4.269867111336788\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
22 |         },
23 |         {
24 |             "test": {
25 |                 "test_code": "# Corner Case Example 7:\nimport numpy as np\nfrom scipy.special import spherical_in\n\nn = 1\nz = -0.5\nresult = spherical_in(n, z)\nprint(f\"spherical_in({n}, {z}) = {result}\")",
26 |                 "old_output": "spherical_in(1, -0.5) = nan\n",
27 |                 "new_output": "spherical_in(1, -0.5) = -0.17087070843777216\n"
28 |             },
29 |             "label": "intended",
30 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
31 |         },
32 |         {
33 |             "test": {
34 |                 "test_code": "# Corner Case Example 8:\nimport numpy as np\nfrom scipy.special import spherical_kn\n\nn = 1\nz = -100.0\nresult = spherical_kn(n, z)\nprint(f\"spherical_kn({n}, {z}) = {result}\")",
35 |                 "old_output": "spherical_kn(1, -100.0) = nan\n",
36 |                 "new_output": "spherical_kn(1, -100.0) = -4.18025968703559e+41\n"
37 |             },
38 |             "label": "intended",
39 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
40 |         },
41 |         {
42 |             "test": {
43 |                 "test_code": "# Example 9:\nimport numpy as np\nfrom scipy.special import spherical_jn\n\nn = 3\nz = -1.0 # Negative input to see reflection behavior\nresult = spherical_jn(n, z)\nprint(f\"spherical_jn({n}, {z}) = {result}\")",
44 |                 "old_output": "spherical_jn(3, -1.0) = nan\n",
45 |                 "new_output": "spherical_jn(3, -1.0) = -0.009006581117112524\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
49 |         },
50 |         {
51 |             "test": {
52 |                 "test_code": "# Example 15:\nimport numpy as np\nfrom scipy.special import spherical_jn\n\nn = 1\nz = np.array([-1.0, np.nan, 1.0]) # Mixed values\nresult = spherical_jn(n, z)\nprint(f\"spherical_jn({n}, [-1.0, nan, 1.0]) = {result}\")",
53 |                 "old_output": "spherical_jn(1, [-1.0, nan, 1.0]) = [ nan nan 0.30116868]\n",
54 |                 "new_output": "spherical_jn(1, [-1.0, nan, 1.0]) = [-0.30116868 nan 0.30116868]\n"
55 |             },
56 |             "label": "intended",
57 |             "comment": "new version reduces NaNs for negative z input, which is the intention of the PR"
58 |         },
59 |         {
60 |             "test": {
61 |                 "test_code": "# Example 17:\nimport numpy as np\nfrom scipy.special import spherical_in\n\nn = 0\nz = np.array([1e10, -1e10]) # Large values\nresult = spherical_in(n, z)\nprint(f\"spherical_in({n}, [1e10, -1e10]) = {result}\")",
62 |                 "old_output": "spherical_in(0, [1e10, -1e10]) = [inf nan]\n",
63 |                 "new_output": "spherical_in(0, [1e10, -1e10]) = [inf inf]\n"
64 |             },
65 |             "label": "intended",
66 |             "comment": "new version avoids NaN for negative z input, which is the intention of the PR"
67 |         }
68 |     ]
69 | }
-------------------------------------------------------------------------------- /src/testora/evaluation/ResultsManager.py: --------------------------------------------------------------------------------
1 | import os
2 | from datetime import datetime
3 | 
4 | 
5 | results_base_dir = "data/results/"
6 | classification_results_base_dir = "data/classification_results/"
7 | 
8 | 
9 | def
result_files(): 10 | for project_dir in os.listdir(results_base_dir): 11 | for pr_result_file in os.listdir(os.path.join(results_base_dir, project_dir)): 12 | if pr_result_file.endswith(".json"): 13 | yield os.path.join(results_base_dir, project_dir, pr_result_file) 14 | 15 | 16 | def result_files_for_project(project_name, minimum_timestamp=None, is_classification=False, file_name=None, base_dir_arg=None): 17 | if base_dir_arg: 18 | base_dir = base_dir_arg 19 | else: 20 | base_dir = classification_results_base_dir if is_classification else results_base_dir 21 | 22 | for pr_result_file in os.listdir(os.path.join(base_dir, project_name)): 23 | if pr_result_file.endswith(".json"): 24 | if minimum_timestamp: 25 | pr_timestamp = pr_result_file.replace( 26 | ".json", "").split("_")[1] 27 | if datetime.strptime(pr_timestamp, "%Y-%m-%d %H:%M:%S") < datetime.strptime(minimum_timestamp, "%Y-%m-%d %H:%M:%S"): 28 | continue 29 | 30 | if file_name and pr_result_file != file_name: 31 | continue 32 | 33 | yield os.path.join(base_dir, project_name, pr_result_file) 34 | 35 | 36 | def current_results(include_archive=True, is_classification=False): 37 | base_dir = classification_results_base_dir if is_classification else results_base_dir 38 | 39 | project_to_prs_and_timestamps = {} 40 | for project_dir in os.listdir(base_dir): 41 | project_to_prs_and_timestamps[project_dir] = [] 42 | result_dirs = [os.path.join(base_dir, project_dir)] 43 | if include_archive: 44 | archive_dir = os.path.join(base_dir, project_dir, "archive") 45 | if not os.path.exists(archive_dir): 46 | os.makedirs(archive_dir) 47 | print(f"Created directory {archive_dir}") 48 | result_dirs.append(archive_dir) 49 | for result_dir in result_dirs: 50 | for pr_result_file in os.listdir(result_dir): 51 | if pr_result_file.endswith(".json"): 52 | pr_nb, timestamp = pr_result_file.replace( 53 | ".json", "").split("_") 54 | project_to_prs_and_timestamps[project_dir].append( 55 | [pr_nb, timestamp]) 56 | return project_to_prs_and_timestamps 57 | 58 | 59 | def add_result(project_name, pr_nb, timestamp, result, is_classification): 60 | base_dir = classification_results_base_dir if is_classification else results_base_dir 61 | 62 | if not os.path.exists(base_dir): 63 | os.makedirs(base_dir) 64 | print(f"Created directory {base_dir}") 65 | 66 | all_old_results = current_results(is_classification=is_classification) 67 | non_archive_old_results = current_results( 68 | False, is_classification=is_classification) 69 | 70 | # check if result already exists 71 | for old_pr_nb, old_timestamp in all_old_results[project_name]: 72 | if old_pr_nb == pr_nb and old_timestamp == timestamp: 73 | return 74 | 75 | # Write new result to file 76 | if not os.path.exists(os.path.join(base_dir, project_name)): 77 | os.makedirs(os.path.join(base_dir, project_name)) 78 | 79 | target_file = os.path.join(base_dir, project_name, 80 | f"{pr_nb}_{timestamp}.json") 81 | with open(target_file, "w") as f: 82 | f.write(result) 83 | 84 | # Check if it replaces an old result (if yes, move old result to archive) 85 | for old_pr_nb, old_timestamp in non_archive_old_results[project_name]: 86 | if old_pr_nb == pr_nb: 87 | old_target_file = os.path.join(base_dir, project_name, 88 | f"{old_pr_nb}_{old_timestamp}.json") 89 | archive_dir = os.path.join(base_dir, project_name, "archive") 90 | if not os.path.exists(archive_dir): 91 | os.makedirs(archive_dir) 92 | print(f"Created directory {archive_dir}") 93 | renamed_target_file = os.path.join( 94 | archive_dir, f"{old_pr_nb}_{old_timestamp}.json") 
95 |             os.rename(old_target_file, renamed_target_file)
96 |             print(f"Moved old result to {renamed_target_file}")
97 |             break
98 | 
99 |     print(f"New result in {target_file}")
100 | 
-------------------------------------------------------------------------------- /src/testora/webui/WebUI.py: --------------------------------------------------------------------------------
1 | from collections import Counter
2 | from typing import Dict, List
3 | from flask import Flask, render_template
4 | import argparse
5 | from datetime import timedelta
6 | from testora.util.LogParser import PRResult, parse_log_files, parse_time_stamp, pr_results_as_dict
7 | 
8 | app = Flask("Testora Web UI")
9 | 
10 | 
11 | parser = argparse.ArgumentParser(description="Web UI for Testora")
12 | parser.add_argument("--files", help="Log file(s) to process",
13 |                     type=str, required=False, nargs="+")
14 | 
15 | pr_results: List[PRResult] = []
16 | pr_number_to_result: Dict[int, PRResult] = {}
17 | 
18 | 
19 | def summarize_status():
20 |     summary = Counter()
21 |     for pr_result in pr_results:
22 |         summary["total"] += 1
23 |         summary[pr_result.status()] += 1
24 | 
25 |     # add percentages
26 |     for key in summary:
27 |         if key != "total":
28 |             percentage = (int(summary[key]) / summary['total']) * 100
29 |             summary[key] = f"{summary[key]} ({percentage:.1f}%)"
30 | 
31 |     return summary
32 | 
33 | 
34 | def compute_perf_stats(entries):
35 |     total_time = parse_time_stamp(entries[-1]["timestamp"]) - \
36 |         parse_time_stamp(entries[0]["timestamp"])
37 | 
38 |     message_prefix_to_timedelta = {}
39 |     message_prefix_to_nb = Counter()
40 |     previous_timestamp = None
41 |     previous_message_prefix = None
42 | 
43 |     for entry in entries:
44 |         if previous_timestamp is None:
45 |             previous_timestamp = entry["timestamp"]
46 |             previous_message_prefix = entry["message"].split(" ")[0]
47 |         else:
48 |             current_timestamp = entry["timestamp"]
49 |             current_message_prefix = entry["message"].split(" ")[0]
50 |             message_prefix_to_timedelta[previous_message_prefix] = message_prefix_to_timedelta.get(
51 |                 previous_message_prefix, timedelta(0)) + (parse_time_stamp(current_timestamp) - parse_time_stamp(previous_timestamp))
52 |             message_prefix_to_nb[previous_message_prefix] += 1
53 |             previous_timestamp = current_timestamp
54 |             previous_message_prefix = current_message_prefix
55 | 
56 |     # sort by time and keep only top-k
57 |     message_prefix_to_timedelta = dict(
58 |         sorted(message_prefix_to_timedelta.items(), key=lambda item: item[1], reverse=True)[:6])
59 | 
60 |     result = [["All", len(entries), total_time, total_time / len(entries)]]
61 |     for message_prefix, time in message_prefix_to_timedelta.items():
62 |         if message_prefix in ["Done", "Starting"]:
63 |             continue
64 |         result.append([message_prefix, message_prefix_to_nb[message_prefix], time,
65 |                        time / message_prefix_to_nb[message_prefix]])
66 | 
67 |     return result
68 | 
69 | 
70 | status_colors = {
71 |     "unknown": "#FFFFE0",
72 |     "checked": "#D3D3D3",
73 |     "intended_change": "#CCFFCC",
74 |     "coincidental_fix": "#CBC3E3",
75 |     "regression": "#FFCCCC",
76 | }
77 | 
78 | 
79 | def nl2br(value):
80 |     if type(value) == str:
81 |         return value.replace("\n", "<br>")
82 |     else:
83 |         return value
84 | 
85 | 
86 | app.jinja_env.filters["nl2br"] = nl2br
87 | 
88 | 
89 | def escape_tags(value):
90 |     if type(value) == str:
91 |         return value.replace("<", "&lt;").replace(">", "&gt;")
92 |     else:
93 |         return value
94 | 
95 | 
96 | app.jinja_env.filters["escape_tags"] = escape_tags
97 | 
98 | 
99 | @app.route('/')
100 | def main_page():
101 |     global pr_results, pr_number_to_result
102 |     pr_results, _ = parse_log_files(args.files)
103 |     summary = summarize_status()
104 |     pr_number_to_result = pr_results_as_dict(pr_results)
105 |     return render_template("index.html", summary=summary, pr_results=pr_results, color_mapping=status_colors)
106 | 
107 | 
108 | @app.route('/pr_log/<number>')
109 | def pr_log_page(number):
110 |     pr_result = pr_number_to_result[int(number)]
111 |     perf_stats = compute_perf_stats(pr_result.entries)
112 |     return render_template('pr_log.html', pr_result=pr_result, perf_stats=perf_stats)
113 | 
114 | @app.route('/pr_result/<pr_number>/<result_number>')
115 | def pr_result_page(pr_number, result_number):
116 |     pr_result = pr_number_to_result[int(pr_number)]
117 |     classification_result = pr_result.classification_results[int(result_number) - 1]
118 |     return render_template('pr_result.html', pr_result=pr_result, classification_result=classification_result)
119 | 
120 | 
121 | if __name__ == "__main__":
122 |     args = parser.parse_args()
123 |     app.run(debug=True, port=4000)
124 | 
-------------------------------------------------------------------------------- /src/multilspy/language_servers/omnisharp/workspace_did_change_configuration.json: --------------------------------------------------------------------------------
1 | {
2 |     "RoslynExtensionsOptions": {
3 |         "EnableDecompilationSupport": false,
4 |         "EnableAnalyzersSupport": true,
5 |         "EnableImportCompletion": true,
6 |         "EnableAsyncCompletion": false,
7 |         "DocumentAnalysisTimeoutMs": 30000,
8 |         "DiagnosticWorkersThreadCount": 18,
9 |         "AnalyzeOpenDocumentsOnly": true,
10 |         "InlayHintsOptions": {
11 |             "EnableForParameters": false,
12 |             "ForLiteralParameters": false,
13 |             "ForIndexerParameters": false,
14 |             "ForObjectCreationParameters": false,
15 |             "ForOtherParameters": false,
16 |             "SuppressForParametersThatDifferOnlyBySuffix": false,
17 |             "SuppressForParametersThatMatchMethodIntent": false,
18 |             "SuppressForParametersThatMatchArgumentName": false,
19 |             "EnableForTypes": false,
20 |             "ForImplicitVariableTypes": false,
21 |             "ForLambdaParameterTypes": false,
22 |             "ForImplicitObjectCreation": false
23 |         },
24 |         "LocationPaths": null
25 |     },
26 |     "FormattingOptions": {
27 |         "OrganizeImports": false,
28 |         "EnableEditorConfigSupport": true,
29 |         "NewLine": "\n",
30 |         "UseTabs": false,
31 |         "TabSize": 4,
32 |         "IndentationSize": 4,
33 |         "SpacingAfterMethodDeclarationName": false,
34 |         "SeparateImportDirectiveGroups": false,
35 |         "SpaceWithinMethodDeclarationParenthesis": false,
36 |         "SpaceBetweenEmptyMethodDeclarationParentheses": false,
37 |         "SpaceAfterMethodCallName": false,
38 |         "SpaceWithinMethodCallParentheses": false,
39 |         "SpaceBetweenEmptyMethodCallParentheses": false,
40 |         "SpaceAfterControlFlowStatementKeyword": true,
41 |         "SpaceWithinExpressionParentheses": false,
42 |         "SpaceWithinCastParentheses": false,
43 |         "SpaceWithinOtherParentheses": false,
44 |         "SpaceAfterCast": false,
45 |         "SpaceBeforeOpenSquareBracket": false,
46 |         "SpaceBetweenEmptySquareBrackets": false,
47 |         "SpaceWithinSquareBrackets": false,
48 |         "SpaceAfterColonInBaseTypeDeclaration": true,
49 |         "SpaceAfterComma": true,
50 |         "SpaceAfterDot": false,
51 |         "SpaceAfterSemicolonsInForStatement": true,
52 |         "SpaceBeforeColonInBaseTypeDeclaration": true,
53 |         "SpaceBeforeComma": false,
54 |         "SpaceBeforeDot": false,
55 |         "SpaceBeforeSemicolonsInForStatement": false,
56 |         "SpacingAroundBinaryOperator": "single",
57 |         "IndentBraces": false,
58 |         "IndentBlock": true,
59 |         "IndentSwitchSection": true,
60 |         "IndentSwitchCaseSection": true,
61 |         "IndentSwitchCaseSectionWhenBlock": true,
62 |         "LabelPositioning": "oneLess",
63 |         "WrappingPreserveSingleLine": true,
64 |         "WrappingKeepStatementsOnSingleLine": true,
65 |         "NewLinesForBracesInTypes": true,
66 |         "NewLinesForBracesInMethods": true,
67 |         "NewLinesForBracesInProperties": true,
68 |         "NewLinesForBracesInAccessors": true,
69 |         "NewLinesForBracesInAnonymousMethods": true,
70 |         "NewLinesForBracesInControlBlocks": true,
71 |         "NewLinesForBracesInAnonymousTypes": true,
72 |         "NewLinesForBracesInObjectCollectionArrayInitializers": true,
73 |         "NewLinesForBracesInLambdaExpressionBody": true,
74 |         "NewLineForElse": true,
75 |         "NewLineForCatch": true,
76 |         "NewLineForFinally": true,
77 |         "NewLineForMembersInObjectInit": true,
78 |         "NewLineForMembersInAnonymousTypes": true,
79 |         "NewLineForClausesInQuery": true
80 |     },
81 |     "FileOptions": {
82 |         "SystemExcludeSearchPatterns": [
83 |             "**/node_modules/**/*",
84 |             "**/bin/**/*",
85 |             "**/obj/**/*",
86 |             "**/.git/**/*",
87 |             "**/.git",
88 |             "**/.svn",
89 |             "**/.hg",
90 |             "**/CVS",
91 |             "**/.DS_Store",
92 |             "**/Thumbs.db"
93 |         ],
94 |         "ExcludeSearchPatterns": []
95 |     },
96 |     "RenameOptions": {
97 |         "RenameOverloads": false,
98 |         "RenameInStrings": false,
99 |         "RenameInComments": false
100 |     },
101 |     "ImplementTypeOptions": {
102 |         "InsertionBehavior": 0,
103 |         "PropertyGenerationBehavior": 0
104 |     },
105 |     "DotNetCliOptions": {
106 |         "LocationPaths": null
107 |     },
108 |     "Plugins": {
109 |         "LocationPaths": null
110 |     }
111 | }
-------------------------------------------------------------------------------- /data/ground_truth/marshmallow/2244.json: --------------------------------------------------------------------------------
1 | {
2 |     "pr_number": 2244,
3 |     "log_file": "data/results/marshmallow/2244_2024-11-23 09:16:17.json",
4 |     "differentiating_tests": [
5 |         {
6 |             "test": {
7 |                 "test_code": "# Example 5:\nfrom marshmallow import Schema, fields\n\nclass URLSchema(Schema):\n url = fields.URL()\n\nvalid_data = {\"url\": \"http://@example.com\"}\nresult = URLSchema().load(valid_data)",
8 |                 "old_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 8, in <module>\n result = URLSchema().load(valid_data)\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 722, in load\n return self._do_load(\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n raise exc\nmarshmallow.exceptions.ValidationError: {'url': ['Not a valid URL.']}\n",
9 |                 "new_output": ""
10 |             },
11 |             "label": "intended",
12 |             "comment": "it's a legal URL"
13 |         },
14 |         {
15 |             "test": {
16 |                 "test_code": "# Example 3:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n url = fields.URL()\n\ntry:\n invalid_data = {\"url\": \"http://^@example.com\"}\n URLSchema().load(invalid_data)\nexcept ValidationError as err:\n print(\"Validation Error for 'http://^@example.com':\", err.messages)",
17 |                 "old_output": "",
18 |                 "new_output": "Validation Error for 'http://^@example.com': {'url': ['Not a valid URL.']}\n"
19 |             },
20 |             "label": "intended",
21 |             "comment": "it's an illegal URL (as specified in the newly added test)"
22 | }, 23 | { 24 | "test": { 25 | "test_code": "# Example 4:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n url = fields.URL()\n\ntry:\n invalid_data = {\"url\": \"http://%0G@example.com\"}\n URLSchema().load(invalid_data)\nexcept ValidationError as err:\n print(\"Validation Error for 'http://%0G@example.com':\", err.messages)", 26 | "old_output": "", 27 | "new_output": "Validation Error for 'http://%0G@example.com': {'url': ['Not a valid URL.']}\n" 28 | }, 29 | "label": "intended", 30 | "comment": "it's an illegal URL" 31 | }, 32 | { 33 | "test": { 34 | "test_code": "# Example 5:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n url = fields.URL()\n\ntry:\n invalid_data = {\"url\": \"http://%@example.com\"}", 35 | "old_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: SyntaxError: expected 'except' or 'finally' block (BugGPT_test_code.py, line 8)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n", 36 | "new_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: SyntaxError: expected 'except' or 'finally' block (BugGPT_test_code.py, line 8)\n" 37 | }, 38 | "label": "intended", 39 | "comment": "different order of error messages" 40 | }, 41 | { 42 | "test": { 43 | "test_code": "# Example 6:\nfrom marshmallow import Schema, fields, ValidationError\n\nclass URLSchema(Schema):\n url = fields.URL()\n\ntry:\n invalid_data = {\"url\": \"http://:pass@example.com\"}\n URLSchema().load(invalid_data)\nexcept ValidationError as err:", 44 | "old_output": "/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. (no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\nCouldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n", 45 | "new_output": "Couldn't run '/tmp/BugGPT/BugGPT_test_code.py' as Python code: IndentationError: expected an indented block after 'except' statement on line 10 (BugGPT_test_code.py, line 10)\n/usr/local/lib/python3.10/site-packages/coverage/control.py:892: CoverageWarning: No data was collected. 
(no-data-collected)\n self._warn(\"No data was collected.\", slug=\"no-data-collected\")\n"
46 |             },
47 |             "label": "intended",
48 |             "comment": "different order of error messages"
49 |         }
50 |     ]
51 | }
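A self-contained illustration of the URL-validation behavior recorded in the ground-truth entries above. This is a hedged sketch, not repository code: on a marshmallow release that includes PR 2244, the first URL is accepted and the second rejected, while older releases behave the other way around.

# Hedged illustration of the URL validation behavior labeled above; the
# printed output depends on the installed marshmallow version.
from marshmallow import Schema, fields, ValidationError

class URLSchema(Schema):
    url = fields.URL()

for candidate in ["http://@example.com", "http://^@example.com"]:
    try:
        URLSchema().load({"url": candidate})
        print(candidate, "-> accepted")
    except ValidationError as err:
        print(candidate, "->", err.messages)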
\"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n raise exc\nmarshmallow.exceptions.ValidationError: {'timestamp': ['Not a valid datetime.']}\n", 36 | "new_output": "" 37 | }, 38 | "label": "intended", 39 | "comment": "old version rejects invalid timestamp value" 40 | }, 41 | { 42 | "test": { 43 | "test_code": "# Example 10: Deserializing a valid timestamp with milliseconds\nfrom marshmallow import Schema, fields\n\nclass EventSchema(Schema):\n timestamp = fields.DateTime(format='timestamp_ms')\n\ndata = {'timestamp': 1696156800000} # POSIX timestamp in milliseconds\nschema = EventSchema()\nresult = schema.load(data)", 44 | "old_output": "Traceback (most recent call last):\n File \"/tmp/BugGPT/BugGPT_test_code.py\", line 9, in \n result = schema.load(data)\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 722, in load\n return self._do_load(\n File \"/home/marshmallow/src/marshmallow/schema.py\", line 909, in _do_load\n raise exc\nmarshmallow.exceptions.ValidationError: {'timestamp': ['Not a valid datetime.']}\n", 45 | "new_output": "" 46 | }, 47 | "label": "intended", 48 | "comment": "old version rejects invalid timestamp value" 49 | } 50 | ] 51 | } -------------------------------------------------------------------------------- /src/multilspy/language_servers/jedi_language_server/jedi_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provides Python specific instantiation of the LanguageServer class. Contains various configurations and settings specific to Python. 3 | """ 4 | 5 | import json 6 | import logging 7 | import os 8 | import pathlib 9 | from contextlib import asynccontextmanager 10 | from typing import AsyncIterator 11 | 12 | from multilspy.multilspy_logger import MultilspyLogger 13 | from multilspy.language_server import LanguageServer 14 | from multilspy.lsp_protocol_handler.server import ProcessLaunchInfo 15 | from multilspy.lsp_protocol_handler.lsp_types import InitializeParams 16 | from multilspy.multilspy_config import MultilspyConfig 17 | 18 | 19 | class JediServer(LanguageServer): 20 | """ 21 | Provides Python specific instantiation of the LanguageServer class. Contains various configurations and settings specific to Python. 22 | """ 23 | 24 | def __init__(self, config: MultilspyConfig, logger: MultilspyLogger, repository_root_path: str): 25 | """ 26 | Creates a JediServer instance. This class is not meant to be instantiated directly. Use LanguageServer.create() instead. 27 | """ 28 | super().__init__( 29 | config, 30 | logger, 31 | repository_root_path, 32 | ProcessLaunchInfo(cmd="jedi-language-server", cwd=repository_root_path), 33 | "python", 34 | ) 35 | 36 | def _get_initialize_params(self, repository_absolute_path: str) -> InitializeParams: 37 | """ 38 | Returns the initialize params for the Jedi Language Server. 
39 | """ 40 | with open(os.path.join(os.path.dirname(__file__), "initialize_params.json"), "r") as f: 41 | d = json.load(f) 42 | 43 | del d["_description"] 44 | 45 | d["processId"] = os.getpid() 46 | assert d["rootPath"] == "$rootPath" 47 | d["rootPath"] = repository_absolute_path 48 | 49 | assert d["rootUri"] == "$rootUri" 50 | d["rootUri"] = pathlib.Path(repository_absolute_path).as_uri() 51 | 52 | assert d["workspaceFolders"][0]["uri"] == "$uri" 53 | d["workspaceFolders"][0]["uri"] = pathlib.Path(repository_absolute_path).as_uri() 54 | 55 | assert d["workspaceFolders"][0]["name"] == "$name" 56 | d["workspaceFolders"][0]["name"] = os.path.basename(repository_absolute_path) 57 | 58 | return d 59 | 60 | @asynccontextmanager 61 | async def start_server(self) -> AsyncIterator["JediServer"]: 62 | """ 63 | Starts the JEDI Language Server, waits for the server to be ready and yields the LanguageServer instance. 64 | 65 | Usage: 66 | ``` 67 | async with lsp.start_server(): 68 | # LanguageServer has been initialized and ready to serve requests 69 | await lsp.request_definition(...) 70 | await lsp.request_references(...) 71 | # Shutdown the LanguageServer on exit from scope 72 | # LanguageServer has been shutdown 73 | ``` 74 | """ 75 | 76 | async def execute_client_command_handler(params): 77 | return [] 78 | 79 | async def do_nothing(params): 80 | return 81 | 82 | async def check_experimental_status(params): 83 | if params["quiescent"] == True: 84 | self.completions_available.set() 85 | 86 | async def window_log_message(msg): 87 | self.logger.log(f"LSP: window/logMessage: {msg}", logging.INFO) 88 | 89 | self.server.on_request("client/registerCapability", do_nothing) 90 | self.server.on_notification("language/status", do_nothing) 91 | self.server.on_notification("window/logMessage", window_log_message) 92 | self.server.on_request("workspace/executeClientCommand", execute_client_command_handler) 93 | self.server.on_notification("$/progress", do_nothing) 94 | self.server.on_notification("textDocument/publishDiagnostics", do_nothing) 95 | self.server.on_notification("language/actionableNotification", do_nothing) 96 | self.server.on_notification("experimental/serverStatus", check_experimental_status) 97 | 98 | async with super().start_server(): 99 | self.logger.log("Starting jedi-language-server server process", logging.INFO) 100 | await self.server.start() 101 | initialize_params = self._get_initialize_params(self.repository_root_path) 102 | 103 | self.logger.log( 104 | "Sending initialize request from LSP client to LSP server and awaiting response", 105 | logging.INFO, 106 | ) 107 | init_response = await self.server.send.initialize(initialize_params) 108 | assert init_response["capabilities"]["textDocumentSync"]["change"] == 2 109 | assert "completionProvider" in init_response["capabilities"] 110 | assert init_response["capabilities"]["completionProvider"] == { 111 | "triggerCharacters": [".", "'", '"'], 112 | "resolveProvider": True, 113 | } 114 | 115 | self.server.notify.initialized({}) 116 | 117 | yield self 118 | 119 | await self.server.shutdown() 120 | await self.server.stop() 121 | -------------------------------------------------------------------------------- /src/testora/evaluation/PreparePRChunks.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from github import Github, Auth 3 | from testora.RegressionFinder import get_merged_prs 4 | from testora.evaluation import EvalTaskManager 5 | 6 | 7 | def 
write_specific_PR_tasks_into_database(project_name, project_id, pr_numbers: List[int]):
8 |     pr_numbers_to_analyze = pr_numbers
9 |     EvalTaskManager.write_tasks(project_name, pr_numbers_to_analyze, "tasks")
10 | 
11 | 
12 | def write_range_of_PR_tasks_into_database(project_name, project_id, start_pr_nb, total):
13 |     print(f"Searching PRs for {project_name}")
14 | 
15 |     token = open(".github_token", "r").read().strip()
16 |     github = Github(auth=Auth.Token(token))
17 |     github_repo = github.get_repo(project_id)
18 | 
19 |     merged_prs = get_merged_prs(github_repo, max_prs=1)
20 |     most_recent_pr_nb = merged_prs[0].number
21 | 
22 |     print(f"Most recent PR number: {most_recent_pr_nb}")
23 |     result_pr_nbs = []
24 |     next_candidate_pr_nb = start_pr_nb
25 |     while next_candidate_pr_nb <= most_recent_pr_nb and len(result_pr_nbs) < total:
26 |         # check if nb is a PR
27 |         try:
28 |             pr = github_repo.get_pull(next_candidate_pr_nb)
29 |         except Exception:
30 |             # not a valid PR number
31 |             print(f"Skipping number {next_candidate_pr_nb} (not a valid PR number)")
32 |             next_candidate_pr_nb += 1
33 |             continue
34 | 
35 |         # check if PR is merged
36 |         if not pr.is_merged():
37 |             print(f"Skipping number {next_candidate_pr_nb} (PR not merged)")
38 |             next_candidate_pr_nb += 1
39 |             continue
40 | 
41 |         # found a valid PR number -- add to list
42 |         print(f"Adding PR number {next_candidate_pr_nb} into the list")
43 |         result_pr_nbs.append(next_candidate_pr_nb)
44 |         next_candidate_pr_nb += 1
45 | 
46 |     EvalTaskManager.write_tasks(project_name, result_pr_nbs, "tasks")
47 | 
48 | 
49 | if __name__ == "__main__":
50 |     EvalTaskManager.initialize()
51 | 
52 |     # write_range_of_PR_tasks_into_database(
53 |     #     "pandas", "pandas-dev/pandas", 60322, 300)
54 | 
55 |     # write_range_of_PR_tasks_into_database(
56 |     #     "scipy", "scipy/scipy", 22031, 300)
57 | 
58 |     # write_range_of_PR_tasks_into_database(
59 |     #     "keras", "keras-team/keras", 20711, 300)
60 | 
61 |     # write_range_of_PR_tasks_into_database(
62 |     #     "marshmallow", "marshmallow-code/marshmallow", 2804, 300)
63 | 
64 |     # write_specific_PR_tasks_into_database("scipy", "scipy/scipy",
65 |     #     [23609, 23607, 23606, 23574, 23521, 23520, 23511, 23502, 23501, 23498, 23497, 23494, 23483, 23475, 23471, 23454, 23442, 23426, 23415, 23388, 23350, 23348, 23341, 23322, 23311, 23298, 23294, 23293, 23280, 23276, 23266, 23235, 23194, 23138, 23121, 23103, 23091, 23071, 23059, 23055, 23048, 23047, 23044, 23019, 23005, 22989, 22982, 22971, 22944, 22941, 22913, 22910, 22899, 22869, 22864, 22855, 22801, 22772, 22763, 22760, 22725, 22718, 22689, 22660, 22651, 22632, 22624, 22611, 22610, 22600, 22585, 22582, 22532, 22494, 22482, 22481, 22475, 22462, 22455, 22447, 22433, 22421, 22398, 22372, 22353, 22344, 22313, 22284, 22283, 22278, 22273, 22251, 22242, 22226, 22221, 22220, 22219, 22215, 22213, 22199])
66 | 
67 |     # write_specific_PR_tasks_into_database("pandas", "pandas-dev/pandas",
68 |     #     [62349, 62325, 62320, 62300, 62298, 62289, 62281, 62280, 62276, 62248, 62246, 62166, 62116, 62101, 62085, 62076, 62073, 62038, 62032, 62025, 61990, 61972, 61969, 61966, 61947, 61946, 61924, 61894, 61891, 61884, 61874, 61855, 61827, 61800, 61786, 61773, 61771, 61743, 61699, 61697, 61658, 61646, 61633, 61625, 61623, 61597, 61541, 61517, 61514, 61508, 61484, 61472, 61467, 61451, 61422, 61399, 61376, 61352, 61340, 61332, 61320, 61293, 61286, 61234, 61229, 61225, 61207, 61198, 61193, 61183, 61162, 61131, 61114, 61105, 61103, 61054, 61046, 61041, 61017, 61008, 60987, 60985, 60983, 60975, 60974, 60963, 60952, 60949, 60936, 60924, 60916, 60906, 60894, 
60882, 60867, 60860, 60828, 60826, 60795, 60793]) 69 | 70 | # write_specific_PR_tasks_into_database("keras", "keras-team/keras", 71 | # [21682, 21680, 21650, 21646, 21611, 21603, 21595, 21590, 21588, 21569, 21535, 21534, 21532, 21512, 21496, 21495, 21480, 21473, 21456, 21449, 21440, 21434, 21432, 21428, 21423, 21414, 21412, 21407, 21406, 21399, 21393, 21392, 21373, 21361, 21349, 21336, 21335, 21331, 21317, 21304, 21302, 21291, 21290, 21277, 21256, 21239, 21211, 21192, 21184, 21170, 21163, 21148, 21138, 21129, 21117, 21101, 21095, 21081, 21077, 21066, 21053, 21030, 21014, 21010, 20993, 20989, 20974, 20973, 20956, 20954, 20928, 20926, 20916, 20913, 20909, 20905, 20892, 20879, 20854, 20853, 20829, 20824, 20815, 20791, 20784, 20782, 20777, 20768, 20765, 20758, 20755, 20736, 20689, 20643, 20637, 20630, 20626, 20613, 20612, 20602]) 72 | 73 | # write_specific_PR_tasks_into_database("marshmallow", "marshmallow-code/marshmallow", 74 | # [2803, 2800, 2798, 2797, 2770, 2769, 2764, 2762, 2756, 2755, 2754, 2742, 2741, 2731, 2712, 2706, 2701, 2700, 2699, 2698, 2271, 2264, 2246, 2244, 2215, 2164, 2153, 2081, 2071, 1882, 1868, 1785, 1745, 1702, 1682, 1627, 1574, 1551, 1524, 1501, 1500, 1480, 1448, 1446, 1444, 1443, 1416, 1405, 1401, 1399, 1395, 1392, 1376, 1359, 1354, 1344, 1343, 1340, 1331, 1307, 1306, 1293, 1288, 1276, 1252, 1246, 1209, 1189, 1136, 1087, 1079, 1078, 1063, 1049, 1036, 1010, 1008, 983, 982, 963, 960, 959, 954, 950, 931, 911, 903, 865, 857, 856, 826, 822, 816, 808, 769, 750, 744, 725, 714, 707]) 75 | -------------------------------------------------------------------------------- /src/testora/execution/ProgramMerger.py: -------------------------------------------------------------------------------- 1 | import ast 2 | from textwrap import dedent 3 | import re 4 | 5 | from testora.util.Exceptions import TestoraException 6 | 7 | 8 | def merge_programs(programs): 9 | function_def_snippets = [] 10 | for program_idx, program in enumerate(programs): 11 | # Parse the snippet into an AST node 12 | try: 13 | parsed_snippet = ast.parse(dedent(program)) 14 | except Exception as _: 15 | function_def_snippets.append( 16 | f"def program_{program_idx}():\n pass # Couldn't parse generated test") 17 | continue 18 | 19 | # Create a function definition for the parsed snippet 20 | function_def = ast.FunctionDef( 21 | name=f"program_{program_idx}", 22 | args=ast.arguments( 23 | posonlyargs=[], 24 | args=[], 25 | vararg=None, 26 | kwonlyargs=[], 27 | kw_defaults=[], 28 | kwarg=None, 29 | defaults=[] 30 | ), 31 | body=parsed_snippet.body, 32 | decorator_list=[], 33 | returns=None 34 | ) 35 | 36 | # Print function definitions to string 37 | code = ast.unparse(ast.fix_missing_locations(function_def)) 38 | function_def_snippets.append(code) 39 | 40 | result = "import sys\nimport traceback\nimport io\n\n" 41 | for function_def_snippet in function_def_snippets: 42 | result += function_def_snippet + "\n\n" 43 | 44 | for fct_idx in range(len(function_def_snippets)): 45 | result += f"""print('XXXXX Program {fct_idx} starting XXXXX') 46 | try: 47 | my_stdout = io.StringIO() 48 | my_stderr = io.StringIO() 49 | sys.stdout = my_stdout 50 | sys.stderr = my_stderr 51 | program_{fct_idx}() 52 | except BaseException as e: 53 | details = traceback.format_exc() 54 | print(details, file=my_stderr) 55 | finally: 56 | sys.stdout.flush() 57 | sys.stderr.flush() 58 | sys.stdout = sys.__stdout__ 59 | sys.stderr = sys.__stderr__ 60 | print(my_stdout.getvalue(), end="") 61 | print(my_stderr.getvalue(), end="") 62 | print('XXXXX Program 
{fct_idx} done XXXXX') 63 | """ 64 | 65 | return result 66 | 67 | 68 | program_start_pattern = re.compile(r"XXXXX Program (\d+) starting XXXXX") 69 | program_end_pattern = re.compile(r"XXXXX Program (\d+) done XXXXX") 70 | 71 | 72 | def separate_outputs(output): 73 | in_program = None 74 | current_output = None 75 | result = [] 76 | for line in output.split("\n"): 77 | program_start_match = program_start_pattern.match(line) 78 | if program_start_match: 79 | in_program = int(program_start_match.group(1)) 80 | current_output = "" 81 | continue 82 | program_end_match = program_end_pattern.match(line) 83 | if program_end_match: 84 | program_nb = int(program_end_match.group(1)) 85 | if program_nb != in_program: 86 | raise TestoraException(f"Unexpected output of merged tests:\n{str(output)}") 87 | in_program = None 88 | result.append(current_output) 89 | elif in_program is not None: 90 | current_output += line + "\n" 91 | return result 92 | 93 | 94 | # for testing 95 | if __name__ == "__main__": 96 | program1 = """ 97 | import pandas as pd 98 | 99 | df = pd.DataFrame({'A': [1.112, 3.456, 7.890], 'B': [9.876, 5.432, 1.234]}) 100 | rounded_df = df.round(1) 101 | print(rounded_df) 102 | """ 103 | 104 | program2 = """ 105 | import pandas as pd 106 | 107 | series_strings = pd.Series(['a', 'b', 'c']) 108 | # This will result in an error as rounding is not applicable to strings 109 | try: 110 | rounded_strings = series_strings.round(2) 111 | print(rounded_strings) 112 | except TypeError as e: 113 | print(f"Error: {e}") 114 | """ 115 | 116 | program3 = """ 117 | import pandas as pd 118 | 119 | # Normal usage scenario 120 | data = [1.234, 2.345, 3.456] 121 | ser = pd.Series(data) 122 | rounded_ser = ser.round(decimals=1) 123 | print(rounded_ser) 124 | 125 | # Normal usage scenario 126 | ser = pd.Series([-1.234, -2.345, -3.456]) 127 | rounded_ser = ser.round() 128 | print(rounded_ser) 129 | 130 | # Normal usage scenario 131 | ser = pd.Series([5.678, 6.789, 7.890]) 132 | rounded_ser = ser.round(decimals=2) 133 | print(rounded_ser) 134 | 135 | # Normal usage scenario 136 | ser = pd.Series([1000, 2000, 3000]) 137 | rounded_ser = ser.round(decimals=-2) 138 | print(rounded_ser) 139 | """ 140 | 141 | program4 = """ 142 | import pandas as pd 143 | import numpy as np 144 | 145 | data = np.array([1.234, 2.345, 3.456]) 146 | ser = pd.Series(data) 147 | print(ser) 148 | r = ser / zero 149 | print(r) 150 | """ 151 | 152 | result = merge_programs([program1, program2, program3, program4]) 153 | print(result) 154 | 155 | output = """ 156 | XXXXX Program 0 starting XXXXX 157 | A B 158 | 0 1.1 9.9 159 | 1 3.5 5.4 160 | 2 7.9 1.2 161 | XXXXX Program 0 done XXXXX 162 | XXXXX Program 1 starting XXXXX 163 | 0 a 164 | 1 b 165 | 2 c 166 | dtype: object 167 | XXXXX Program 1 done XXXXX 168 | XXXXX Program 2 starting XXXXX 169 | 0 1.2 170 | 1 2.3 171 | 2 3.5 172 | dtype: float64 173 | 0 -1.0 174 | 1 -2.0 175 | 2 -3.0 176 | dtype: float64 177 | 0 5.68 178 | 1 6.79 179 | 2 7.89 180 | dtype: float64 181 | 0 1000 182 | 1 2000 183 | 2 3000 184 | dtype: int64 185 | XXXXX Program 2 done XXXXX 186 | XXXXX Program 3 starting XXXXX 187 | 0 1.234 188 | 1 2.345 189 | 2 3.456 190 | dtype: float64 191 | Traceback (most recent call last): 192 | File "/tmp/TestRemoveMe.py", line 74, in 193 | program_3() 194 | File "/tmp/TestRemoveMe.py", line 41, in program_3 195 | r = ser / zero 196 | ^^^^ 197 | NameError: name 'zero' is not defined 198 | 199 | XXXXX Program 3 done XXXXX 200 | """ 201 | 202 | split_outputs = separate_outputs(output) 203 | for 
idx, split_output in enumerate(split_outputs): 204 | print(f"Program {idx} output:") 205 | print(split_output) 206 | print() 207 | -------------------------------------------------------------------------------- /src/testora/util/ClonedRepoManager.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import json 3 | from os.path import exists 4 | from pathlib import Path 5 | import shutil 6 | import subprocess 7 | from typing import List 8 | from git import Repo 9 | 10 | from testora.util.PythonLanguageServer import PythonLanguageServer 11 | 12 | 13 | @dataclass 14 | class ClonedRepo: 15 | repo: Repo 16 | container_name: str 17 | language_server: PythonLanguageServer 18 | 19 | 20 | class ClonedRepoManager: 21 | nb_clones = 3 22 | 23 | def __init__(self, pool_dir, repo_name, repo_id, container_base_name, module_name): 24 | self.pool_dir = pool_dir 25 | self.repo_name = repo_name 26 | self.repo_id = repo_id 27 | self.container_base_name = container_base_name 28 | self.module_name = module_name 29 | 30 | self.clone_state_file = f"{self.pool_dir}/clone_state.json" 31 | self._read_clone_state() 32 | 33 | self.usage_order: List[str] = [f"clone{i}" for i in range( 34 | 1, self.nb_clones + 1)] # last = last used 35 | 36 | self._reset_and_clean_all_clones() 37 | 38 | # start one language server for each clone 39 | self.clone_id_to_language_server = {} 40 | for i in range(1, self.nb_clones + 1): 41 | server = PythonLanguageServer( 42 | f"{self.pool_dir}/clone{i}/{self.repo_name}") 43 | self.clone_id_to_language_server[f"clone{i}"] = server 44 | 45 | def _read_clone_state(self): 46 | if not exists(self.clone_state_file): 47 | self.clone_id_to_state = { 48 | f"clone{i}": {"commit": "unknown", "container_name": f"{self.container_base_name}{i}"} for i in range(1, self.nb_clones + 1)} 49 | return 50 | 51 | with open(self.clone_state_file, "r") as f: 52 | self.clone_id_to_state = json.load(f) 53 | 54 | assert len(self.clone_id_to_state) == self.nb_clones 55 | 56 | def _write_clone_state(self): 57 | assert len(self.clone_id_to_state) == self.nb_clones 58 | with open(self.clone_state_file, "w") as f: 59 | json.dump(self.clone_id_to_state, f) 60 | 61 | def _reset_and_clean_all_clones(self): 62 | for clone_id, _ in self.clone_id_to_state.items(): 63 | cloned_repo_dir = f"{self.pool_dir}/{clone_id}/{self.repo_name}" 64 | cloned_repo = Repo(cloned_repo_dir) 65 | cloned_repo.git.rm('--cached', '-rf', '.') 66 | cloned_repo.git.reset('--hard') 67 | cloned_repo.git.clean('-f', '-d') 68 | origin = cloned_repo.remotes.origin 69 | origin.fetch() 70 | 71 | def _get_least_recently_used_clone_id(self) -> str: 72 | return self.usage_order[0] 73 | 74 | def _have_used_clone_id(self, clone_id: str): 75 | self.usage_order.remove(clone_id) 76 | self.usage_order.append(clone_id) 77 | 78 | def _safe_checkout(self, cloned_repo: Repo, commit: str): 79 | try: 80 | cloned_repo.git.checkout(commit) 81 | cloned_repo.git.submodule('update', '--init', '--recursive') 82 | except Exception: 83 | if commit == "main": 84 | self._safe_checkout(cloned_repo, "master") 85 | elif commit == "master": 86 | self._safe_checkout(cloned_repo, "dev") 87 | else: 88 | cloned_repo.git.rm('--cached', '-rf', '.') 89 | cloned_repo.git.reset('--hard') 90 | cloned_repo.git.clean('-f', '-d') 91 | origin = cloned_repo.remotes.origin 92 | origin.fetch() 93 | try: 94 | cloned_repo.git.checkout(commit) 95 | except Exception: 96 | # we get here when submodules are in a strange state 97 | 
self._remove_and_reinit_submodules(cloned_repo, commit)
98 | 
99 |     def _remove_and_reinit_submodules(self, cloned_repo: Repo, commit: str):
100 |         # 1) de-initialize all submodules
101 |         cloned_repo.git.submodule('deinit', '-f', '--all')
102 | 
103 |         # 2) remove all submodule working trees
104 |         root = Path(cloned_repo.working_dir)
105 |         ls_output = subprocess.run(
106 |             ["git", "ls-files", "-s"], capture_output=True, text=True, check=True, cwd=root
107 |         ).stdout.splitlines()
108 |         for line in ls_output:
109 |             parts = line.split()
110 |             if len(parts) >= 4 and parts[0] == "160000":
111 |                 path = " ".join(parts[3:])
112 |                 shutil.rmtree(root / path, ignore_errors=True)
113 | 
114 |         # 3) remove all submodule git metadata under .git/modules
115 |         modules_dir = root / ".git" / "modules"
116 |         if modules_dir.exists():
117 |             for child in modules_dir.iterdir():
118 |                 shutil.rmtree(child, ignore_errors=True)
119 | 
120 |         # 4) checkout the desired commit
121 |         cloned_repo.git.checkout(commit)
122 | 
123 |         # 5) re-initialize submodules recursively
124 |         cloned_repo.git.submodule('update', '--init', '--recursive')
125 | 
126 |     def get_cloned_repo(self, commit) -> ClonedRepo:
127 |         # reuse existing clone if possible
128 |         for clone_id, state in self.clone_id_to_state.items():
129 |             if state["commit"] == commit:
130 |                 self._have_used_clone_id(clone_id)
131 |                 cloned_repo_dir = f"{self.pool_dir}/{clone_id}/{self.repo_name}"
132 | 
133 |                 return ClonedRepo(Repo(cloned_repo_dir),
134 |                                   state["container_name"],
135 |                                   self.clone_id_to_language_server[clone_id])
136 | 
137 |         # checkout desired commit
138 |         clone_id = self._get_least_recently_used_clone_id()
139 |         cloned_repo_dir = f"{self.pool_dir}/{clone_id}/{self.repo_name}"
140 |         cloned_repo = Repo(cloned_repo_dir)
141 |         self._safe_checkout(cloned_repo, commit)
142 | 
143 |         # update clone state
144 |         state = self.clone_id_to_state[clone_id]
145 |         state["commit"] = commit
146 |         self.clone_id_to_state[clone_id] = state
147 |         self._write_clone_state()
148 |         self._have_used_clone_id(clone_id)
149 | 
150 |         return ClonedRepo(cloned_repo,
151 |                           state["container_name"],
152 |                           self.clone_id_to_language_server[clone_id])
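153 | 
154 | 
155 | # For illustration only: a minimal sketch of how ClonedRepoManager might be used.
156 | # The pool directory and names below are made-up placeholders, not paths that
157 | # necessarily exist; constructing the manager assumes the clone pool (and its
158 | # Docker containers) have already been set up.
159 | if __name__ == "__main__":
160 |     manager = ClonedRepoManager(
161 |         pool_dir="clones/pandas_pool",  # assumed layout: <pool_dir>/clone{1..3}/<repo_name>
162 |         repo_name="pandas",
163 |         repo_id="pandas-dev/pandas",
164 |         container_base_name="pandas_container",
165 |         module_name="pandas")
166 |     # Reuses a clone that is already checked out at this commit,
167 |     # or recycles the least recently used clone otherwise.
168 |     cloned_repo = manager.get_cloned_repo("main")
169 |     print(cloned_repo.container_name, cloned_repo.repo.head.commit.hexsha)
170 | 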
"data/classification_results_03_2025/pandas/single-question_DeepSeek-R1.json", 22 | # "data/classification_results_03_2025/scipy/single-question_DeepSeek-R1.json"] 23 | 24 | # multi-question, GPT-4o 25 | # result_files = ["data/classification_results_03_2025/keras/multi-question_GPT-4o.json", 26 | # "data/classification_results_03_2025/marshmallow/multi-question_GPT-4o.json", 27 | # "data/classification_results_03_2025/pandas/multi-question_GPT-4o.json", 28 | # "data/classification_results_03_2025/scipy/multi-question_GPT-4o.json"] 29 | 30 | # multi-question, GPT-4o-mini 31 | result_files = ["data/classification_results_03_2025/keras/multi-question_GPT-4o-mini.json", 32 | "data/classification_results_03_2025/marshmallow/multi-question_GPT-4o-mini.json", 33 | "data/classification_results_03_2025/pandas/multi-question_GPT-4o-mini.json", 34 | "data/classification_results_03_2025/scipy/multi-question_GPT-4o-mini.json"] 35 | 36 | # multi-question, DeepSeek-R1 37 | # result_files = ["data/classification_results_03_2025/keras/multi-question_DeepSeek-R1.json", 38 | # "data/classification_results_03_2025/marshmallow/multi-question_DeepSeek-R1.json", 39 | # "data/classification_results_03_2025/pandas/multi-question_DeepSeek-R1.json", 40 | # "data/classification_results_03_2025/scipy/multi-question_DeepSeek-R1.json"] 41 | 42 | # extract results 43 | print("Project, PR, Prediction, Label, Result") 44 | nb_fps = 0 45 | nb_tps = 0 46 | nb_fns = 0 47 | nb_tns = 0 48 | variance_ctr = Counter() 49 | config_used = None 50 | for result_file in result_files: 51 | if "keras" in result_file: 52 | project = "keras" 53 | elif "marshmallow" in result_file: 54 | project = "marshmallow" 55 | elif "pandas" in result_file: 56 | project = "pandas" 57 | elif "scipy" in result_file: 58 | project = "scipy" 59 | else: 60 | raise ValueError(f"Couldn't determine project from file name {result_file}") 61 | 62 | with open(result_file, "r") as f: 63 | result_json = json.load(f) 64 | config_used_here = result_json[0]["message"] 65 | if config_used is None: 66 | config_used = config_used_here 67 | else: 68 | assert config_used == config_used_here, f"Config mismatch:\nUsed before:\n{config_used}\nvs used now in {project}:\n {config_used_here}" 69 | for entry in result_json: 70 | if entry["message"] == "Classification result": 71 | # compare label and predictions 72 | results = [] 73 | if entry["label"] in ["unintended", "coincidental fix"]: 74 | for prediction in entry["predictions"].split("#"): 75 | if prediction == "unintended": 76 | results.append("TP") 77 | nb_tps += 1 78 | elif prediction == "intended": 79 | results.append("FN") 80 | nb_fns += 1 81 | else: 82 | raise ValueError( 83 | f"Invalid prediction: {entry['prediction']}") 84 | elif entry["label"] in ["intended"]: 85 | for prediction in entry["predictions"].split("#"): 86 | if prediction == "intended": 87 | results.append("TN") 88 | nb_tns += 1 89 | elif prediction == "unintended": 90 | results.append("FP") 91 | nb_fps += 1 92 | else: 93 | raise ValueError( 94 | f"Invalid prediction: {entry['prediction']}") 95 | else: 96 | raise ValueError( 97 | f"Invalid label: {entry['label']}, {entry['pr_nb']}") 98 | 99 | # check variance of predictions 100 | results_counter = Counter(results) 101 | variance_str = str( 102 | sorted(list(results_counter.values()), reverse=True)) 103 | variance_ctr[variance_str] += 1 104 | 105 | # print into CSV 106 | print(f"{project}, " 107 | f"{entry['pr_nb']}, " 108 | f"{entry['predictions']}, " 109 | f"{entry['label']}, " 110 | f"{', 
'.join(results)}" 111 | ) 112 | 113 | print(config_used) 114 | print() 115 | print(f"TP: {nb_tps}, FP: {nb_fps}, FN: {nb_fns}, TN: {nb_tns}") 116 | precision = 0 if (nb_tps + nb_fps) == 0 else nb_tps / (nb_tps + nb_fps) 117 | print(f"Precision: {precision}") 118 | recall = 0 if (nb_tps + nb_fns) == 0 else nb_tps / (nb_tps + nb_fns) 119 | print(f"Recall: {recall}") 120 | f1 = 0 if (precision + recall) == 0 else 2 * \ 121 | precision * recall / (precision + recall) 122 | print(f"F1: {f1}") 123 | print() 124 | print(f"Variance of predictions: {variance_ctr}") 125 | print(f"Total data points: {nb_tps+nb_fps+nb_fns+nb_tns}") 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Testora: Regression Testing with a Natural Language Oracle 2 | 3 | Testora is an automated approach to check behavioral changes introduced by a pull request against the title, description, etc. of the pull request. 4 | 5 | Paper: 6 | [https://arxiv.org/abs/2503.18597](https://arxiv.org/abs/2503.18597) 7 | 8 | ## Installation 9 | 10 | Testora uses two kinds of Docker containers: 11 | 12 | * A Visual Studio Code Dev Container for running Testora itself. See [devcontainer.json](.devcontainer/devcontainer.json). 13 | 14 | * Docker-in-docker containers for target projects to analyze with Testora. These containers are created when creating the dev container. See [postCreateCommands.sh](.devcontainer/postCreateCommands.sh). 15 | 16 | To install and run Testora, follow these steps: 17 | 18 | 1) Install [Visual Studio Code](https://code.visualstudio.com/download) and its ["Dev Containers" extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers). 19 | 20 | 2) Open Testora in Visual Studio Code: 21 | 22 | ```code .``` 23 | 24 | 3) In Visual Studio Code, build the Dev Container and reopen the project in the container: 25 | 26 | ```Ctrl + Shift + P``` 27 | 28 | ```Dev Containers: Rebuild and Reopen in Container``` 29 | 30 | This will take a couple of minutes, because in addition to Testora, it will set up three instances of the project under analysis. We use three instances to efficiently switch between the commits just before and just after a PR, as well as the latest commit in the main branch. 31 | 32 | 4) In the main directory, create a file `.openai_token` with an OpenAI API key. This is required for invoking an LLM, which is an essential part of Testora. 33 | 34 | 5) In the main directory, create a file `.github_token` with a (free to create) GitHub API key. This is required because Testora interacts with the GitHub API to retrieve details about the PRs to analyze. 35 | 36 | ## Running Testora on a Single Pull Request 37 | 38 | [testora.RegressionFinder](src/testora/RegressionFinder.py) is the main entry point to run Testora. 39 | To apply it to a specific PR of a project, run it like this: 40 | 41 | ```python -m testora.RegressionFinder --project scipy --pr 21768``` 42 | 43 | The project must be one of the projects that were set up while building the Dev Container. The above command produces a `logs_.json` file. 44 | 45 | ## Inspecting Results in the Web UI 46 | 47 | We provide a Web UI to inspect detailed logs of Testora. 48 | 49 | 1) Launch the web server: 50 | 51 | ```python -m testora.webui.WebUI --files logs_*.json``` 52 | 53 | 2) Visit [http://localhost:4000/](http://localhost:4000/) in your browser. 
54 | 
55 | 3) Click on the value in the "Status" column to inspect the detailed logs of a PR.
56 | 
57 | ## Running Testora on Many Pull Requests
58 | 
59 | For large-scale experiments, we use an SQL database that stores PRs to analyze and, once a PR has been analyzed, stores the results of Testora on this PR.
60 | The database itself is *not* part of this public release, but you may replicate the setup with your own database using [these two database schemas](src/testora/evaluation/sql/).
61 | 
62 | Assuming you have set up the database:
63 | 
64 | 1) Add PRs to check into the database:
65 | 
66 | ```python -m testora.evaluation.PreparePRChunks```
67 | 
68 | 2) Run [testora.RegressionFinder](src/testora/RegressionFinder.py) in database mode, which fetches PRs to check from the database and applies the approach to each PR:
69 | 
70 | ```python -m testora.RegressionFinder --db```
71 | 
72 | You can launch multiple instances of this command in parallel in different Dev Containers. Each of the parallel instances will fetch one PR at a time and write the result back into the database, until all PRs have been analyzed.
73 | 
74 | 3) Check the status of PRs to analyze:
75 | 
76 | ```python -m testora.evaluation.EvalTaskManager --status```
77 | 
78 | 4) Once some or all PRs have been analyzed, download the results (i.e., `logs_*.json` files) from the database for inspection:
79 | 
80 | ```python -m testora.evaluation.EvalTaskManager --fetch```
81 | 
82 | To inspect the logs, use the Web UI as described above.
83 | 
84 | ## Results Reported in the Paper
85 | 
86 | ### RQ1: Real-World Problems Found by Testora
87 | 
88 | See [this sheet](https://docs.google.com/spreadsheets/d/1We-EwrNv_0U1Wco_eAUbxwjyFkkPI9kM7tkaRgP0yyI/edit?usp=sharing) for details on the 30 real-world problems, the corresponding PRs, the issues we reported, and their status.
89 | 
90 | ### RQ2 (Effectiveness of Test Generation) and RQ4 (Costs)
91 | 
92 | Download the logs as described in [DATA.md](data/DATA.md).
93 | This will create a folder [data/results_03_2025/](data/results_03_2025/), which contains the raw logs of running Testora in its default configuration.
94 | 
95 | To analyze the logs, run the following command:
96 | 
97 | ```python -m testora.evaluation.PRAnalysisStats```
98 | 
99 | It will do the following:
100 | * Read the logs of all 1,274 PRs analyzed for RQ2 and RQ4
101 | * Compute the test generation statistics reported in RQ2
102 | * Compute the token cost statistics reported in RQ4
103 | * Output the corresponding LaTeX tables
104 | * Output LaTeX macros that define results used repeatedly in the paper (e.g., monetary cost per PR)
105 | * Write the plots that show time costs and token costs into [data/figures](data/figures)
106 | 
107 | ### RQ3: Accuracy of Classifier
108 | 
109 | Our dataset of 164 manually labeled data points is in [data/ground_truth](data/ground_truth).
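110 | 
111 | Each file in this dataset follows the same structure: it references a PR and its log file, and lists the differentiating tests together with a manual label (e.g., "intended" or "unintended") and an optional comment. For example, an entry in [data/ground_truth/marshmallow/2022.json](data/ground_truth/marshmallow/2022.json) has the following shape (test code and outputs shortened to "..." here):
112 | 
113 | ```json
114 | {
115 |   "pr_number": 2022,
116 |   "log_file": "data/results/marshmallow/2022_2024-11-23 09:16:16.json",
117 |   "differentiating_tests": [
118 |     {
119 |       "test": {
120 |         "test_code": "...",
121 |         "old_output": "...",
122 |         "new_output": "..."
123 |       },
124 |       "label": "intended",
125 |       "comment": "'timestamp' field now contains a valid timestamp value"
126 |     }
127 |   ]
128 | }
129 | ```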
130 | 
131 | To evaluate the classifier against the ground truth, we use [ClassificationEvaluator.py](src/testora/evaluation/ClassificationEvaluator.py).
132 | If not done yet for RQ2, download the logs as described in [DATA.md](data/DATA.md).
133 | Afterward, the raw logs of running Testora with three LLMs (GPT-4o-mini, GPT-4o, DeepSeek-R1) and two different prompting techniques (multi-question classifier, single-question classifier) are available in [data/classification_results_03_2025/](data/classification_results_03_2025/).
134 | 
135 | To compute the precision, recall, and F1 score, run the following command:
136 | 
137 | ```python -m testora.evaluation.ClassificationResultsSummarizer```
138 | 
139 | It will output detailed results for each PR in the ground truth dataset and, at the end, the overall results.
140 | To switch between different LLMs and prompting techniques, edit [ClassificationResultsSummarizer.py](src/testora/evaluation/ClassificationResultsSummarizer.py) to modify the lines at the beginning that select a model-prompt combination.
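141 | 
142 | For example, to evaluate the single-question classifier with GPT-4o, make the corresponding `result_files` assignment the only uncommented one (the alternative assignments are already present in the script):
143 | 
144 | ```python
145 | # single-question, GPT-4o
146 | result_files = ["data/classification_results_03_2025/keras/single-question_GPT-4o.json",
147 |                 "data/classification_results_03_2025/marshmallow/single-question_GPT-4o.json",
148 |                 "data/classification_results_03_2025/pandas/single-question_GPT-4o.json",
149 |                 "data/classification_results_03_2025/scipy/single-question_GPT-4o.json"]
150 | ```
151 | 
--------------------------------------------------------------------------------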