├── tests ├── __init__.py ├── test_grader.py └── test_narrator.py ├── poetry.toml ├── evaluation ├── local_cache │ └── compiler │ │ ├── all.d943856c9b1e8f80.jsonl │ │ └── all.e415303eb7359b9a.jsonl ├── results │ ├── fluency_results.png │ ├── heatmap_metrics.png │ ├── heatmap_metrics_tight.png │ ├── metrics_over_narratives.png │ ├── results_by_dataset.csv │ ├── results_by_dataset.tex │ ├── results_local2.csv │ ├── results_by_technique.csv │ ├── results_by_technique.tex │ ├── cleaned_results.csv │ └── results_old.csv ├── examples.py ├── experiment_runner.py ├── explingo.py ├── metrics.py └── eval_data │ ├── mushroom_2.json │ ├── mushroom_1.json │ ├── pdf_2.json │ ├── pdf_1.json │ ├── student_2.json │ ├── housing_3.json │ ├── student_1.json │ └── housing_2.json ├── parrot.jpg ├── .flake8 ├── explingo ├── __init__.py ├── testing.py ├── narrator.py ├── grader.py └── tutorial.ipynb ├── .idea └── .gitignore ├── .github └── workflows │ └── python-publish.yml ├── pyproject.toml ├── tasks.py ├── README.md └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /evaluation/local_cache/compiler/all.d943856c9b1e8f80.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /parrot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/parrot.jpg -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | 
[flake8] 2 | max-line-length = 99 3 | exclude = docs, .git, __pycache__, .ipynb_checkpoints 4 | extend-ignore = E203 -------------------------------------------------------------------------------- /evaluation/results/fluency_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/fluency_results.png -------------------------------------------------------------------------------- /evaluation/results/heatmap_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/heatmap_metrics.png -------------------------------------------------------------------------------- /evaluation/results/heatmap_metrics_tight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/heatmap_metrics_tight.png -------------------------------------------------------------------------------- /evaluation/results/metrics_over_narratives.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/metrics_over_narratives.png -------------------------------------------------------------------------------- /explingo/__init__.py: -------------------------------------------------------------------------------- 1 | from explingo.grader import Grader 2 | from explingo.narrator import Narrator 3 | 4 | __author__ = "MIT Data To AI Lab" 5 | __email__ = "dailabmit@gmail.com" 6 | __version__ = "0.1.1" 7 | 8 | __all__ = ["Narrator", "Grader"] 9 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | 
/workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | # GitHub Copilot persisted chat sessions 10 | /copilot/chatSessions 11 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | 7 | jobs: 8 | deploy: 9 | uses: sibyl-dev/.github/.github/workflows/python-publish.yml@main 10 | with: 11 | repository_url: https://upload.pypi.org/legacy/ 12 | secrets: 13 | PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /tests/test_grader.py: -------------------------------------------------------------------------------- 1 | import explingo 2 | 3 | 4 | def test_grader_run_metrics(): 5 | response = 4 6 | mock_grader_llm = explingo.testing.MockGraderLLM(response) 7 | grader = explingo.Grader(llm=mock_grader_llm, metrics="all") 8 | result = grader("explanation", "explanation_format", "narrative") 9 | for metric in ["accuracy", "fluency", "conciseness", "completeness"]: 10 | assert result[metric] == response 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "explingo" 3 | version = "0.1.1" 4 | description = "" 5 | authors = ["Ola Zytek "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | dspy-ai = "2.4.13" 11 | pytest = "^8.3.3" 12 | notebook = "^7.2.2" 13 | jupyter = "^1.1.1" 14 | invoke = "^2.2.0" 15 | isort = "^5.13.2" 16 | flake8 = "^7.1.1" 17 | 18 | [tool.poetry.group.dev.dependencies] 19 | jupyter = "^1.1.1" 20 | 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | 
build-backend = "poetry.core.masonry.api" 25 | 26 | [tool.black] 27 | line-length = 99 28 | preview = true 29 | 30 | [tool.isort] 31 | profile = "black" 32 | line_length = 99 33 | skip = ["__init__.py"] 34 | -------------------------------------------------------------------------------- /tests/test_narrator.py: -------------------------------------------------------------------------------- 1 | import explingo 2 | 3 | 4 | def test_narrate_basic_prompt(): 5 | response = "narrative" 6 | mock_llm = explingo.testing.MockNarratorLLM(response) 7 | narrator = explingo.Narrator(llm=mock_llm, explanation_format="test", context="test") 8 | explanation = "explanation" 9 | assert narrator.narrate(explanation) == response 10 | 11 | 12 | def test_narrative_few_shot(): 13 | response = "narrative" 14 | mock_llm = explingo.testing.MockNarratorLLM(response) 15 | narrator = explingo.Narrator( 16 | llm=mock_llm, 17 | explanation_format="test", 18 | context="test", 19 | sample_narratives=["sample 1", "sample 2"], 20 | ) 21 | explanation = "explanation" 22 | assert narrator.narrate(explanation, n_examples=2) == response 23 | 24 | 25 | def test_narrative_bootstrapped_few_shot(): 26 | response = "narrative" 27 | mock_llm = explingo.testing.MockNarratorLLM(response, include_tags=False) 28 | mock_grader = explingo.Grader( 29 | llm=explingo.testing.MockGraderLLM(4), 30 | metrics=["fluency, conciseness"], 31 | sample_narratives=["sample 1", "sample 2"], 32 | ) 33 | narrator = explingo.Narrator( 34 | llm=mock_llm, 35 | explanation_format="test", 36 | context="test", 37 | sample_narratives=["sample 1", "sample 2"], 38 | ) 39 | explanation = "explanation" 40 | assert ( 41 | narrator.narrate(explanation, n_examples=2, n_bootstrapped=2, grader=mock_grader) 42 | == response 43 | ) 44 | -------------------------------------------------------------------------------- /evaluation/results/results_by_dataset.csv: -------------------------------------------------------------------------------- 1 | 
Dataset,Accuracy,Completeness,Fluency,Conciseness,Total score 2 | House 1,\textcolor{blue}{3.733 $\pm$ 0.40},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.800 $\pm$ 0.30},\textcolor{blue}{3.719 $\pm$ 0.24},\textcolor{blue}{15.252 $\pm$ 0.58} 3 | House 2,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.911 $\pm$ 0.11},\textcolor{blue}{3.836 $\pm$ 0.16},\textcolor{blue}{15.748 $\pm$ 0.24} 4 | House 3,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.689 $\pm$ 0.27},\textcolor{blue}{3.933 $\pm$ 0.10},\textcolor{blue}{3.869 $\pm$ 0.14},\textcolor{blue}{15.491 $\pm$ 0.35} 5 | Mush 1,\textcolor{blue}{3.556 $\pm$ 0.42},\textcolor{blue}{3.600 $\pm$ 0.00},\textcolor{blue}{3.511 $\pm$ 0.15},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{14.667 $\pm$ 0.33} 6 | Mush 2,\textcolor{red}{1.760 $\pm$ 0.88},\textcolor{red}{2.640 $\pm$ 0.36},\textcolor{blue}{3.920 $\pm$ 0.18},\textcolor{blue}{3.989 $\pm$ 0.03},\textcolor{blue}{12.309 $\pm$ 0.67} 7 | PDF 1,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{red}{2.400 $\pm$ 0.00},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.977 $\pm$ 0.03},\textcolor{blue}{14.377 $\pm$ 0.03} 8 | PDF 2,\textcolor{red}{0.000 $\pm$ 0.00},3.040 $\pm$ 0.22,\textcolor{blue}{3.840 $\pm$ 0.22},\textcolor{blue}{3.949 $\pm$ 0.02},\textcolor{blue}{10.829 $\pm$ 0.46} 9 | Student 1,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.600 $\pm$ 0.28},\textcolor{blue}{3.960 $\pm$ 0.09},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{15.560 $\pm$ 0.36} 10 | Student 2,\textcolor{blue}{3.840 $\pm$ 0.36},\textcolor{blue}{3.920 $\pm$ 0.18},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.880 $\pm$ 0.15},\textcolor{blue}{15.640 $\pm$ 0.47} 11 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import webbrowser 3 | import shutil 4 | 5 | from pathlib import 
Path 6 | from invoke import task 7 | from sys import executable 8 | import os 9 | 10 | 11 | def print_red(s): 12 | print("\033[91m {}\033[00m".format(s), end="") 13 | 14 | 15 | def print_green(s): 16 | print("\033[92m {}\033[00m".format(s), end="") 17 | 18 | 19 | @task 20 | def clean_test(context): 21 | """ 22 | Cleans the test store 23 | """ 24 | 25 | shutil.rmtree(Path(".pytest_cache"), ignore_errors=True) 26 | 27 | 28 | @task 29 | def fix_lint(context): 30 | """ 31 | Fixes all linting and import sort errors. Skips init.py files for import sorts 32 | """ 33 | 34 | subprocess.run(["black", "explingo"]) 35 | subprocess.run(["black", "tests"]) 36 | subprocess.run(["isort", "--atomic", "explingo", "tests"]) 37 | 38 | 39 | @task 40 | def lint(context): 41 | """ 42 | Runs the linting and import sort process on all library files and tests and prints errors. 43 | Skips init.py files for import sorts 44 | """ 45 | subprocess.run(["flake8", "explingo", "tests"], check=True) 46 | subprocess.run(["isort", "explingo", "tests"], check=True) 47 | 48 | 49 | @task 50 | def test(context): 51 | """ 52 | Runs all test commands. 
53 | """ 54 | 55 | failures_in = [] 56 | 57 | try: 58 | test_unit(context) 59 | except subprocess.CalledProcessError: 60 | failures_in.append("Unit tests") 61 | 62 | if len(failures_in) == 0: 63 | print_green("\nAll tests successful :)") 64 | else: 65 | print_red("\n:( Failures in: ") 66 | for i in failures_in: 67 | print_red(i + ", ") 68 | 69 | 70 | @task 71 | def test_unit(context): 72 | """ 73 | Runs all unit tests and outputs results and coverage 74 | """ 75 | subprocess.run(["pytest"], check=True) 76 | -------------------------------------------------------------------------------- /evaluation/examples.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | import json 3 | import random 4 | 5 | 6 | def create_example(entry): 7 | example = dspy.Example( 8 | explanation=entry["explanation"], 9 | context=entry["context"], 10 | explanation_format=entry["explanation_format"], 11 | ) 12 | if "narrative" in entry: 13 | example.narrative = entry["narrative"] 14 | if "bad_narrative" in entry: 15 | example.bad_narrative = entry["bad_narrative"] 16 | return example.with_inputs("explanation", "context", "explanation_format") 17 | 18 | 19 | def load_examples(json_file): 20 | training_data = json.load(open(json_file, "r")) 21 | examples = [] 22 | for entry in training_data: 23 | examples.append(create_example(entry)) 24 | return examples 25 | 26 | 27 | def get_data(json_file, split=None): 28 | all_data = load_examples(json_file) 29 | labeled_data = [example for example in all_data if hasattr(example, "narrative")] 30 | unlabeled_data = [ 31 | example for example in all_data if not hasattr(example, "narrative") 32 | ] 33 | if split is not None: 34 | labeled_train = labeled_data[: int(split * len(labeled_data))] 35 | labeled_eval = labeled_data[int(split * len(labeled_data)) :] 36 | unlabeled_train = unlabeled_data[: int(split * len(unlabeled_data))] 37 | unlabeled_eval = unlabeled_data[int(split * len(unlabeled_data)) :] 38 | 
else: 39 | labeled_train = labeled_data[:5] 40 | labeled_eval = labeled_data[5:] 41 | unlabeled_train = unlabeled_data[:5] 42 | unlabeled_eval = unlabeled_data[5:] 43 | if len(unlabeled_train) < 5: 44 | additional_count = 5 - len(unlabeled_train) 45 | labeled_train += labeled_eval[:additional_count] 46 | labeled_eval = labeled_eval[additional_count:] 47 | 48 | return labeled_train, labeled_eval, unlabeled_train, unlabeled_eval 49 | -------------------------------------------------------------------------------- /evaluation/results/results_by_dataset.tex: -------------------------------------------------------------------------------- 1 | \begin{table} 2 | \caption{Overall results for each prompt and few-shot setting.} 3 | \begin{tabular}{llllll} 4 | \toprule 5 | Dataset & Accuracy & Completeness & Fluency & Conciseness & Total score \\ 6 | \midrule 7 | House 1 & \textcolor{blue}{3.733 $\pm$ 0.40} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.800 $\pm$ 0.30} & \textcolor{blue}{3.719 $\pm$ 0.24} & \textcolor{blue}{15.252 $\pm$ 0.58} \\ 8 | House 2 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.911 $\pm$ 0.11} & \textcolor{blue}{3.836 $\pm$ 0.16} & \textcolor{blue}{15.748 $\pm$ 0.24} \\ 9 | House 3 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.689 $\pm$ 0.27} & \textcolor{blue}{3.933 $\pm$ 0.10} & \textcolor{blue}{3.869 $\pm$ 0.14} & \textcolor{blue}{15.491 $\pm$ 0.35} \\ 10 | Mush 1 & \textcolor{blue}{3.556 $\pm$ 0.42} & \textcolor{blue}{3.600 $\pm$ 0.00} & \textcolor{blue}{3.511 $\pm$ 0.15} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{14.667 $\pm$ 0.33} \\ 11 | Mush 2 & \textcolor{red}{1.760 $\pm$ 0.88} & \textcolor{red}{2.640 $\pm$ 0.36} & \textcolor{blue}{3.920 $\pm$ 0.18} & \textcolor{blue}{3.989 $\pm$ 0.03} & \textcolor{blue}{12.309 $\pm$ 0.67} \\ 12 | PDF 1 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{red}{2.400 $\pm$ 0.00} & \textcolor{blue}{4.000 $\pm$ 0.00} & 
\textcolor{blue}{3.977 $\pm$ 0.03} & \textcolor{blue}{14.377 $\pm$ 0.03} \\ 13 | PDF 2 & \textcolor{red}{0.000 $\pm$ 0.00} & 3.040 $\pm$ 0.22 & \textcolor{blue}{3.840 $\pm$ 0.22} & \textcolor{blue}{3.949 $\pm$ 0.02} & \textcolor{blue}{10.829 $\pm$ 0.46} \\ 14 | Student 1 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.600 $\pm$ 0.28} & \textcolor{blue}{3.960 $\pm$ 0.09} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{15.560 $\pm$ 0.36} \\ 15 | Student 2 & \textcolor{blue}{3.840 $\pm$ 0.36} & \textcolor{blue}{3.920 $\pm$ 0.18} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.880 $\pm$ 0.15} & \textcolor{blue}{15.640 $\pm$ 0.47} \\ 16 | \bottomrule 17 | \end{tabular} 18 | \end{table} 19 | -------------------------------------------------------------------------------- /evaluation/results/results_local2.csv: -------------------------------------------------------------------------------- 1 | dataset,total score,accuracy,completeness,fluency,conciseness,n_few_shot,prompt 2 | housing_3.json,8.208187134502925,0.0,0.4,4.0,3.808187134502924,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 3 | pdf_1.json,11.907936507936508,2.4,2.0,3.8,3.707936507936508,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 4 | pdf_2.json,9.684848484848484,0.0,2.0,4.0,3.6848484848484846,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 5 | mushroom_2.json,9.711111111111112,0.0,2.0,3.8,3.9111111111111114,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 
6 | student_2.json,11.11111111111111,0.8,2.4,4.0,3.9111111111111114,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 7 | mushroom_1.json,14.022222222222222,3.2,3.2,3.8,3.822222222222223,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 8 | student_1.json,13.0,2.4,2.8,3.8,4.0,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 9 | housing_2.json,7.626666666666667,0.0,0.0,4.0,3.6266666666666665,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 10 | housing_1.json,7.850980392156863,0.0,0.0,4.0,3.8509803921568633,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 11 | -------------------------------------------------------------------------------- /explingo/testing.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | 3 | 4 | class MockNarratorLLM(dspy.LM): 5 | def __init__(self, response, include_tags=True, **kwargs): 6 | """ 7 | Create a mock LLM for testing purposes 8 | 9 | Args: 10 | response (String): Narrative response expected from the LLM 11 | include_tags (bool): Include tags (ie. "Narrative") in the response. Should be set to 12 | False to test functionality that directly uses DSPy (ie. 
bootstrapped few-shot), 13 | True otherwise 14 | """ 15 | self.response = response 16 | self.kwargs = kwargs 17 | self.history = [] 18 | self.include_tags = include_tags 19 | super().__init__(model=None) 20 | 21 | def basic_request(self, prompt, **kwargs): 22 | return self(prompt, **kwargs) 23 | 24 | def __call__(self, prompt=None, **kwargs): 25 | if self.include_tags: 26 | completions = "Narrative: " + self.response 27 | else: 28 | completions = self.response 29 | self.history.append({"prompt": prompt, "completions": completions}) 30 | return [completions] 31 | 32 | def copy(self, **kwargs): 33 | return self.__class__(self.response, **kwargs) 34 | 35 | def inspect_history(self, n=1, skip=0): 36 | print(self.history) 37 | 38 | 39 | class MockGraderLLM(dspy.LM): 40 | def __init__(self, response, **kwargs): 41 | """ 42 | Create a mock Grader for testing purposes 43 | 44 | Args: 45 | response (int): Grader response expected from the Grader 46 | """ 47 | self.response = response 48 | self.kwargs = kwargs 49 | self.history = [] 50 | super().__init__(model=None) 51 | 52 | def __call__(self, prompt=None, *args, **kwargs): 53 | completions = str(self.response) 54 | self.history.append({"prompt": prompt, "completions": completions}) 55 | return [completions] 56 | 57 | def basic_request(self, prompt, **kwargs): 58 | return self(prompt, **kwargs) 59 | 60 | def copy(self, **kwargs): 61 | return self.__class__(self.response, **kwargs) 62 | 63 | def inspect_history(self, n=1, skip=0): 64 | print(self.history) 65 | -------------------------------------------------------------------------------- /evaluation/results/results_by_technique.csv: -------------------------------------------------------------------------------- 1 | Prompt,$L$,$B$,Accuracy,Completeness,Fluency,Conciseness,Total score 2 | Prompt 1,0,0,\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textcolor{red}{2.467 $\pm$ 0.76},\textcolor{red}{0.850 $\pm$ 
0.94},\textcolor{blue}{11.317 $\pm$ 1.23} 3 | Prompt 1,1,0,3.289 $\pm$ 1.35,\textcolor{blue}{3.511 $\pm$ 0.56},\textcolor{blue}{3.622 $\pm$ 0.47},\textcolor{blue}{3.749 $\pm$ 0.14},\textcolor{blue}{14.171 $\pm$ 1.67} 4 | Prompt 1,1,1,\textcolor{blue}{3.800 $\pm$ 0.40},\textcolor{blue}{3.800 $\pm$ 0.23},\textcolor{blue}{3.650 $\pm$ 0.34},\textcolor{blue}{3.702 $\pm$ 0.32},\textcolor{blue}{14.952 $\pm$ 0.58} 5 | Prompt 1,1,3,\textcolor{blue}{3.800 $\pm$ 0.40},\textcolor{blue}{3.800 $\pm$ 0.23},\textcolor{blue}{3.700 $\pm$ 0.26},\textcolor{blue}{3.734 $\pm$ 0.28},\textbf{\textcolor{blue}{15.034 $\pm$ 0.51}} 6 | Prompt 1,3,0,3.111 $\pm$ 1.57,3.422 $\pm$ 0.53,\textcolor{blue}{3.689 $\pm$ 0.41},\textcolor{blue}{3.708 $\pm$ 0.19},\textcolor{blue}{13.930 $\pm$ 1.82} 7 | Prompt 1,3,1,3.323 $\pm$ 1.34,\textcolor{blue}{3.538 $\pm$ 0.49},\textcolor{blue}{3.846 $\pm$ 0.19},\textcolor{blue}{3.843 $\pm$ 0.16},\textcolor{blue}{14.551 $\pm$ 1.72} 8 | Prompt 1,3,3,3.262 $\pm$ 1.33,\textcolor{blue}{3.631 $\pm$ 0.58},\textcolor{blue}{3.846 $\pm$ 0.23},\textcolor{blue}{3.890 $\pm$ 0.11},\textcolor{blue}{14.629 $\pm$ 1.73} 9 | Prompt 1,5,0,3.111 $\pm$ 1.57,3.289 $\pm$ 0.56,\textcolor{blue}{3.644 $\pm$ 0.49},\textcolor{blue}{3.669 $\pm$ 0.35},\textcolor{blue}{13.713 $\pm$ 2.08} 10 | Prompt 1,5,1,3.289 $\pm$ 1.35,3.333 $\pm$ 0.63,\textbf{\textcolor{blue}{3.933 $\pm$ 0.14}},\textcolor{blue}{3.983 $\pm$ 0.03},\textcolor{blue}{14.538 $\pm$ 1.63} 11 | Prompt 1,5,3,3.289 $\pm$ 1.35,3.467 $\pm$ 0.66,\textcolor{blue}{3.889 $\pm$ 0.27},\textbf{\textcolor{blue}{3.988 $\pm$ 0.02}},\textcolor{blue}{14.632 $\pm$ 1.65} 12 | Prompt 1,5,5,3.378 $\pm$ 1.37,3.422 $\pm$ 0.64,\textcolor{blue}{3.911 $\pm$ 0.15},\textcolor{blue}{3.977 $\pm$ 0.03},\textcolor{blue}{14.688 $\pm$ 1.66} 13 | Prompt 2,0,0,\textcolor{blue}{3.911 $\pm$ 0.27},\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textcolor{red}{2.467 $\pm$ 0.66},\textcolor{red}{0.872 $\pm$ 0.89},\textcolor{blue}{11.250 $\pm$ 0.97} 14 | Prompt 
3,0,0,\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textcolor{red}{2.222 $\pm$ 0.82},\textcolor{red}{1.056 $\pm$ 1.07},\textcolor{blue}{11.278 $\pm$ 1.44} 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Explingo 2 | 3 | # Explingo 4 | Transform your ML explanations into human-friendly natural-language narratives. 5 | 6 | NOTE: Explingo is still under active development and currently only supports a few basic explanation types 7 | and GPT-API models. 8 | 9 | ## Installation 10 | Explingo can be installed through PIP 11 | ```bash 12 | pip install explingo 13 | ``` 14 | 15 | ## Usage 16 | To transform explanations into narratives, you can use the Narrator class. 17 | ```python 18 | from explingo import Narrator, Grader 19 | 20 | example_narratives = [ 21 | ("(Above ground living area square feet, 1256.00, -12527.46), (Overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29)", 22 | "The house's living area size of around 1,200 sq. ft., lower quality materials (5/10), and lack of a second floor are the main reasons for the low price."), 23 | ("(Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72)", 24 | "The house's large second floor of around 850 sq. ft and recent construction date of 2003 increases its value."), 25 | ("(Overall material and finish of the house, 8.00, 10743.76), (Above ground living area square feet, 2000.00, 12527.46), (Second floor square feet, 1000.00, 10142.29)", 26 | "The house's high quality materials (8/10), large living area size of around 2,000 sq. ft., and a second floor of around 1,000 sq. ft. 
are the main reasons for the high price."), 27 | ] 28 | 29 | explanation_format = "(feature name, feature value, SHAP feature contribution)" 30 | context = "The model predicts house prices" 31 | 32 | narrator = Narrator(openai_api_key=[OPENAI_API_KEY], 33 | explanation_format=explanation_format, 34 | context=context, 35 | labeled_train_data=example_narratives) 36 | 37 | explanation = "(number of bathrooms, 3, 7020), (number of bedrooms, 4, 12903)" 38 | 39 | narrative = narrator.narrate(explanation) 40 | ``` 41 | 42 | To evaluate the quality of the generated narratives, you can use the Grader class. 43 | ```python 44 | grader = Grader(openai_api_key=[OPENAI_API_KEY], 45 | metrics="all", 46 | sample_narratives=[narrative[1] for narrative in example_narratives]) 47 | 48 | metrics = grader(explanation=explanation, 49 | explanation_format=explanation_format, 50 | narrative=narrative) 51 | ``` 52 | -------------------------------------------------------------------------------- /evaluation/results/results_by_technique.tex: -------------------------------------------------------------------------------- 1 | \begin{table} 2 | \caption{Overall results for each prompt and few-shot setting.} 3 | \begin{tabular}{lrrlllll} 4 | \toprule 5 | Prompt & $L$ & $B$ & Accuracy & Completeness & Fluency & Conciseness & Total score \\ 6 | \midrule 7 | Prompt 1 & 0 & 0 & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textcolor{red}{2.467 $\pm$ 0.76} & \textcolor{red}{0.850 $\pm$ 0.94} & \textcolor{blue}{11.317 $\pm$ 1.23} \\ 8 | Prompt 1 & 1 & 0 & 3.289 $\pm$ 1.35 & \textcolor{blue}{3.511 $\pm$ 0.56} & \textcolor{blue}{3.622 $\pm$ 0.47} & \textcolor{blue}{3.749 $\pm$ 0.14} & \textcolor{blue}{14.171 $\pm$ 1.67} \\ 9 | Prompt 1 & 1 & 1 & \textcolor{blue}{3.800 $\pm$ 0.40} & \textcolor{blue}{3.800 $\pm$ 0.23} & \textcolor{blue}{3.650 $\pm$ 0.34} & \textcolor{blue}{3.702 $\pm$ 0.32} & \textcolor{blue}{14.952 $\pm$ 0.58} \\ 10 | Prompt 1 & 1 & 3 & 
\textcolor{blue}{3.800 $\pm$ 0.40} & \textcolor{blue}{3.800 $\pm$ 0.23} & \textcolor{blue}{3.700 $\pm$ 0.26} & \textcolor{blue}{3.734 $\pm$ 0.28} & \textbf{\textcolor{blue}{15.034 $\pm$ 0.51}} \\ 11 | Prompt 1 & 3 & 0 & 3.111 $\pm$ 1.57 & 3.422 $\pm$ 0.53 & \textcolor{blue}{3.689 $\pm$ 0.41} & \textcolor{blue}{3.708 $\pm$ 0.19} & \textcolor{blue}{13.930 $\pm$ 1.82} \\ 12 | Prompt 1 & 3 & 1 & 3.323 $\pm$ 1.34 & \textcolor{blue}{3.538 $\pm$ 0.49} & \textcolor{blue}{3.846 $\pm$ 0.19} & \textcolor{blue}{3.843 $\pm$ 0.16} & \textcolor{blue}{14.551 $\pm$ 1.72} \\ 13 | Prompt 1 & 3 & 3 & 3.262 $\pm$ 1.33 & \textcolor{blue}{3.631 $\pm$ 0.58} & \textcolor{blue}{3.846 $\pm$ 0.23} & \textcolor{blue}{3.890 $\pm$ 0.11} & \textcolor{blue}{14.629 $\pm$ 1.73} \\ 14 | Prompt 1 & 5 & 0 & 3.111 $\pm$ 1.57 & 3.289 $\pm$ 0.56 & \textcolor{blue}{3.644 $\pm$ 0.49} & \textcolor{blue}{3.669 $\pm$ 0.35} & \textcolor{blue}{13.713 $\pm$ 2.08} \\ 15 | Prompt 1 & 5 & 1 & 3.289 $\pm$ 1.35 & 3.333 $\pm$ 0.63 & \textbf{\textcolor{blue}{3.933 $\pm$ 0.14}} & \textcolor{blue}{3.983 $\pm$ 0.03} & \textcolor{blue}{14.538 $\pm$ 1.63} \\ 16 | Prompt 1 & 5 & 3 & 3.289 $\pm$ 1.35 & 3.467 $\pm$ 0.66 & \textcolor{blue}{3.889 $\pm$ 0.27} & \textbf{\textcolor{blue}{3.988 $\pm$ 0.02}} & \textcolor{blue}{14.632 $\pm$ 1.65} \\ 17 | Prompt 1 & 5 & 5 & 3.378 $\pm$ 1.37 & 3.422 $\pm$ 0.64 & \textcolor{blue}{3.911 $\pm$ 0.15} & \textcolor{blue}{3.977 $\pm$ 0.03} & \textcolor{blue}{14.688 $\pm$ 1.66} \\ 18 | Prompt 2 & 0 & 0 & \textcolor{blue}{3.911 $\pm$ 0.27} & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textcolor{red}{2.467 $\pm$ 0.66} & \textcolor{red}{0.872 $\pm$ 0.89} & \textcolor{blue}{11.250 $\pm$ 0.97} \\ 19 | Prompt 3 & 0 & 0 & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textcolor{red}{2.222 $\pm$ 0.82} & \textcolor{red}{1.056 $\pm$ 1.07} & \textcolor{blue}{11.278 $\pm$ 1.44} \\ 20 | \bottomrule 21 | \end{tabular} 22 | \end{table} 23 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | key.yaml 163 | keys.yaml 164 | 165 | data/real-estate-info/* 166 | -------------------------------------------------------------------------------- /evaluation/results/cleaned_results.csv: -------------------------------------------------------------------------------- 1 | ,Dataset,Prompt,$L$,$B$,Accuracy,Completeness,Fluency,Conciseness,Total score 2 | 0,House 1,Prompt 1,0,0,4.0,4.0,3.2,2.196,13.396 3 | 1,House 1,Prompt 2,0,0,4.0,4.0,3.0,2.275,13.275 4 | 2,House 1,Prompt 3,0,0,4.0,4.0,3.2,2.81,14.01 5 | 3,House 2,Prompt 1,0,0,4.0,4.0,2.6,0.48,11.08 6 | 4,House 2,Prompt 2,0,0,4.0,4.0,2.2,0.96,11.16 7 | 5,House 2,Prompt 3,0,0,4.0,4.0,2.2,0.942,11.142 8 | 6,House 3,Prompt 1,0,0,4.0,4.0,1.8,0.0,9.8 9 | 7,House 3,Prompt 2,0,0,4.0,4.0,2.2,0.0,10.2 10 | 8,House 3,Prompt 3,0,0,4.0,4.0,1.8,0.0,9.8 11 | 9,Mush 1,Prompt 1,0,0,4.0,4.0,1.2,1.697,10.897 12 | 10,Mush 1,Prompt 2,0,0,4.0,4.0,1.4,1.417,10.817 13 | 11,Mush 1,Prompt 3,0,0,4.0,4.0,0.8,1.778,10.578 14 | 12,Mush 2,Prompt 1,0,0,4.0,4.0,1.6,0.571,10.171 15 | 13,Mush 2,Prompt 2,0,0,4.0,4.0,1.6,0.527,10.127 16 | 14,Mush 2,Prompt 3,0,0,4.0,4.0,1.4,0.737,10.137 17 | 15,PDF 1,Prompt 1,0,0,4.0,4.0,3.0,0.065,11.065 18 | 16,PDF 1,Prompt 2,0,0,4.0,4.0,3.2,0.0,11.2 19 | 17,PDF 1,Prompt 3,0,0,4.0,4.0,2.8,0.267,11.067 20 | 18,PDF 2,Prompt 1,0,0,4.0,4.0,2.6,0.0,10.6 21 | 19,PDF 2,Prompt 2,0,0,4.0,4.0,2.8,0.0,10.8 22 | 20,PDF 2,Prompt 3,0,0,4.0,4.0,2.0,0.0,10.0 23 | 21,Student 1,Prompt 1,0,0,4.0,4.0,2.8,2.289,13.089 24 | 22,Student 1,Prompt 2,0,0,3.2,4.0,2.6,2.119,11.919 25 | 23,Student 1,Prompt 3,0,0,4.0,4.0,2.6,2.527,13.127 26 | 24,Student 2,Prompt 1,0,0,4.0,4.0,3.4,0.356,11.756 27 | 25,Student 2,Prompt 2,0,0,4.0,4.0,3.2,0.55,11.75 28 | 26,Student 2,Prompt 3,0,0,4.0,4.0,3.2,0.439,11.639 29 | 27,House 1,Prompt 1,1,0,4.0,4.0,4.0,3.775,15.775 30 | 28,House 1,Prompt 
1,3,0,4.0,4.0,4.0,3.671,15.671 31 | 29,House 1,Prompt 1,5,0,4.0,4.0,4.0,3.723,15.723 32 | 30,House 2,Prompt 1,1,0,4.0,4.0,3.8,3.627,15.427 33 | 31,House 2,Prompt 1,3,0,4.0,3.6,3.8,3.609,15.009 34 | 32,House 2,Prompt 1,5,0,4.0,4.0,3.8,3.68,15.48 35 | 33,House 3,Prompt 1,1,0,4.0,3.6,4.0,3.897,15.497 36 | 34,House 3,Prompt 1,3,0,4.0,3.6,4.0,3.574,15.174 37 | 35,House 3,Prompt 1,5,0,4.0,3.6,4.0,3.892,15.492 38 | 36,Mush 1,Prompt 1,1,0,3.2,3.6,3.0,3.93,13.73 39 | 37,Mush 1,Prompt 1,3,0,3.2,3.6,3.0,3.93,13.73 40 | 38,Mush 1,Prompt 1,5,0,3.2,2.8,2.6,2.807,11.407 41 | 39,Mush 2,Prompt 1,1,0,2.4,4.0,3.0,3.695,13.095 42 | 40,Mush 2,Prompt 1,3,0,0.8,3.2,3.6,3.911,11.511 43 | 41,Mush 2,Prompt 1,5,0,0.8,3.2,3.4,3.911,11.311 44 | 42,PDF 1,Prompt 1,1,0,4.0,2.4,3.8,3.581,13.781 45 | 43,PDF 1,Prompt 1,3,0,4.0,2.4,3.8,3.581,13.781 46 | 44,PDF 1,Prompt 1,5,0,4.0,2.4,3.8,3.581,13.781 47 | 45,PDF 2,Prompt 1,1,0,0.0,2.8,4.0,3.685,10.485 48 | 46,PDF 2,Prompt 1,3,0,0.0,2.8,4.0,3.644,10.444 49 | 47,PDF 2,Prompt 1,5,0,0.0,2.8,4.0,3.725,10.525 50 | 48,Student 1,Prompt 1,1,0,4.0,3.6,3.0,3.932,14.532 51 | 49,Student 1,Prompt 1,3,0,4.0,3.6,3.0,4.0,14.6 52 | 50,Student 1,Prompt 1,5,0,4.0,3.2,3.2,3.973,14.373 53 | 51,Student 2,Prompt 1,1,0,4.0,3.6,4.0,3.617,15.217 54 | 52,Student 2,Prompt 1,3,0,4.0,4.0,4.0,3.45,15.45 55 | 53,Student 2,Prompt 1,5,0,4.0,3.6,4.0,3.728,15.328 56 | 54,House 1,Prompt 1,1,1,4.0,4.0,3.2,3.318,14.518 57 | 55,House 1,Prompt 1,1,3,4.0,4.0,3.4,3.447,14.847 58 | 56,House 1,Prompt 1,3,1,4.0,4.0,4.0,3.671,15.671 59 | 57,House 1,Prompt 1,3,3,3.2,4.0,3.8,3.706,14.706 60 | 58,House 2,Prompt 1,1,1,4.0,4.0,3.8,3.552,15.352 61 | 59,House 2,Prompt 1,1,3,4.0,4.0,3.8,3.552,15.352 62 | 60,House 2,Prompt 1,3,1,4.0,4.0,4.0,3.936,15.936 63 | 61,House 2,Prompt 1,3,3,4.0,4.0,4.0,3.872,15.872 64 | 62,House 3,Prompt 1,1,1,4.0,3.6,4.0,3.937,15.537 65 | 63,House 3,Prompt 1,1,3,4.0,3.6,4.0,3.937,15.537 66 | 64,House 3,Prompt 1,3,1,4.0,3.6,3.8,3.642,15.042 67 | 65,House 3,Prompt 
1,3,3,4.0,4.0,4.0,3.832,15.832 68 | 66,Mush 1,Prompt 1,1,1,3.2,3.6,3.6,4.0,14.4 69 | 67,Mush 1,Prompt 1,1,3,3.2,3.6,3.6,4.0,14.4 70 | 68,Mush 1,Prompt 1,3,1,3.2,3.6,3.6,4.0,14.4 71 | 69,Mush 1,Prompt 1,3,3,4.0,3.6,3.4,4.0,15.0 72 | 70,House 1,Prompt 1,3,1,4.0,4.0,4.0,3.671,15.671 73 | 71,House 1,Prompt 1,3,3,3.2,4.0,3.8,3.706,14.706 74 | 72,House 1,Prompt 1,5,1,4.0,4.0,4.0,4.0,16.0 75 | 73,House 1,Prompt 1,5,3,3.2,4.0,4.0,3.988,15.188 76 | 74,House 1,Prompt 1,5,5,4.0,4.0,4.0,3.965,15.965 77 | 75,House 2,Prompt 1,3,1,4.0,4.0,4.0,3.936,15.936 78 | 76,House 2,Prompt 1,3,3,4.0,4.0,4.0,3.872,15.872 79 | 77,House 2,Prompt 1,5,1,4.0,4.0,4.0,3.936,15.936 80 | 78,House 2,Prompt 1,5,3,4.0,4.0,3.8,3.936,15.736 81 | 79,House 2,Prompt 1,5,5,4.0,4.0,3.8,3.936,15.736 82 | 80,House 3,Prompt 1,3,1,4.0,3.6,3.8,3.642,15.042 83 | 81,House 3,Prompt 1,3,3,4.0,4.0,4.0,3.832,15.832 84 | 82,House 3,Prompt 1,5,1,4.0,3.2,4.0,4.0,15.2 85 | 83,House 3,Prompt 1,5,3,4.0,4.0,4.0,4.0,16.0 86 | 84,House 3,Prompt 1,5,5,4.0,3.6,3.8,4.0,15.4 87 | 85,Mush 1,Prompt 1,3,1,3.2,3.6,3.6,4.0,14.4 88 | 86,Mush 1,Prompt 1,3,3,4.0,3.6,3.4,4.0,15.0 89 | 87,Mush 1,Prompt 1,5,1,3.2,3.6,3.6,4.0,14.4 90 | 88,Mush 1,Prompt 1,5,3,4.0,3.6,3.2,4.0,14.8 91 | 89,Mush 1,Prompt 1,5,5,4.0,3.6,3.6,4.0,15.2 92 | 90,Mush 2,Prompt 1,3,1,0.8,3.2,3.6,3.943,11.543 93 | 91,Mush 2,Prompt 1,3,3,0.8,2.8,4.0,4.0,11.6 94 | 92,Mush 2,Prompt 1,5,1,2.4,2.4,4.0,4.0,12.8 95 | 93,Mush 2,Prompt 1,5,3,2.4,2.4,4.0,4.0,12.8 96 | 94,Mush 2,Prompt 1,5,5,2.4,2.4,4.0,4.0,12.8 97 | 95,PDF 1,Prompt 1,3,1,4.0,2.4,4.0,3.943,14.343 98 | 96,PDF 1,Prompt 1,3,3,4.0,2.4,4.0,4.0,14.4 99 | 97,PDF 1,Prompt 1,5,1,4.0,2.4,4.0,3.943,14.343 100 | 98,PDF 1,Prompt 1,5,3,4.0,2.4,4.0,4.0,14.4 101 | 99,PDF 1,Prompt 1,5,5,4.0,2.4,4.0,4.0,14.4 102 | 100,PDF 2,Prompt 1,3,1,0.0,2.8,3.6,3.927,10.327 103 | 101,PDF 2,Prompt 1,3,3,0.0,2.8,3.6,3.927,10.327 104 | 102,PDF 2,Prompt 1,5,1,0.0,3.2,4.0,3.964,11.164 105 | 103,PDF 2,Prompt 1,5,3,0.0,3.2,4.0,3.964,11.164 106 | 104,PDF 
2,Prompt 1,5,5,0.0,3.2,4.0,3.964,11.164 107 | 105,Student 1,Prompt 1,3,1,4.0,3.6,4.0,4.0,15.6 108 | 106,Student 1,Prompt 1,3,3,4.0,4.0,4.0,4.0,16.0 109 | 107,Student 1,Prompt 1,5,1,4.0,3.2,3.8,4.0,15.0 110 | 108,Student 1,Prompt 1,5,3,4.0,3.6,4.0,4.0,15.6 111 | 109,Student 1,Prompt 1,5,5,4.0,3.6,4.0,4.0,15.6 112 | 110,Student 2,Prompt 1,3,1,4.0,3.6,4.0,3.65,15.25 113 | 111,Student 2,Prompt 1,3,3,3.2,4.0,4.0,3.825,15.025 114 | 112,Student 2,Prompt 1,5,1,4.0,4.0,4.0,4.0,16.0 115 | 113,Student 2,Prompt 1,5,3,4.0,4.0,4.0,4.0,16.0 116 | 114,Student 2,Prompt 1,5,5,4.0,4.0,4.0,3.925,15.925 117 | -------------------------------------------------------------------------------- /evaluation/experiment_runner.py: -------------------------------------------------------------------------------- 1 | import metrics 2 | import os 3 | import examples 4 | import random 5 | from explingo import Explingo 6 | 7 | 8 | class ExplingoExperimentRunner: 9 | def __init__( 10 | self, llm, dataset_filepath, openai_api_key, verbose=0, save_results=True 11 | ): 12 | ( 13 | self.labeled_train, 14 | self.labeled_eval, 15 | self.unlabeled_train, 16 | self.unlabeled_eval, 17 | ) = examples.get_data(dataset_filepath) 18 | self.train_data = self.labeled_train + self.unlabeled_train 19 | self.eval_data = self.labeled_eval + self.unlabeled_eval 20 | assert len(self.train_data) == 10 21 | print(dataset_filepath) 22 | print(f"Total number of examples: {len(self.train_data) + len(self.eval_data)}") 23 | print(f"Labeled training examples: {len(self.labeled_train)}") 24 | print(f"Labeled evaluation examples: {len(self.labeled_eval)}") 25 | print(f"Unlabeled training examples: {len(self.unlabeled_train)}") 26 | print(f"Unlabeled evaluation examples: {len(self.unlabeled_eval)}") 27 | 28 | max_optimal_length = max( 29 | [ 30 | len(d.narrative.split()) / d.explanation.count("(") 31 | for d in self.labeled_train 32 | ] 33 | ) 34 | print("Max optimal length:", max_optimal_length) 35 | print("---") 36 | 37 | 
example_good_narratives = random.sample( 38 | [d.narrative for d in self.labeled_train], 5 39 | ) 40 | 41 | self.metrics = metrics.Metrics( 42 | metric_funcs=[ 43 | metrics.accuracy, 44 | metrics.completeness, 45 | metrics.fluency, 46 | metrics.conciseness, 47 | ], 48 | openai_key=openai_api_key, 49 | verbose=verbose, 50 | metric_kwargs={ 51 | "conciseness": {"max_optimal_length_per_feature": max_optimal_length}, 52 | "fluency": {"good_narratives": example_good_narratives}, 53 | }, 54 | ) 55 | 56 | self.verbose = verbose 57 | self.save_results = save_results 58 | 59 | self.explingo = Explingo( 60 | llm, 61 | context=self.labeled_train[0]["context"], 62 | labeled_train_data=self.labeled_train, 63 | unlabeled_train_data=self.unlabeled_train, 64 | ) 65 | 66 | def run_experiment(self, func, prompt=None, max_iters=100, kwargs=None): 67 | if kwargs is None: 68 | kwargs = {} 69 | 70 | total_scores = None 71 | results = [] 72 | for i, example in enumerate(self.eval_data): 73 | if i >= max_iters: 74 | break 75 | result = func( 76 | prompt=prompt, 77 | explanation=example.explanation, 78 | explanation_format=example.explanation_format, 79 | **kwargs, 80 | ) 81 | if result is not None: 82 | score = self.metrics(example, result) 83 | if total_scores is None: 84 | total_scores = score[1] 85 | else: 86 | total_scores += score[1] 87 | if self.verbose >= 1: 88 | print("Explanation:", example.explanation) 89 | print("Narrative:", result.narrative) 90 | print("Total Score:", score[0]) 91 | print( 92 | "".join( 93 | f"{metric}: {score}, " for metric, score in score[1].items() 94 | ) 95 | ) 96 | print("--") 97 | if self.save_results: 98 | results.append( 99 | { 100 | "func": func.__name__, 101 | "prompt": kwargs.get("prompt", ""), 102 | "n_few_shot": kwargs.get("n_few_shot", 0), 103 | "n_labeled_few_shot": kwargs.get("n_labeled_few_shot", 0), 104 | "n_bootstrapped_few_shot": kwargs.get( 105 | "n_bootstrapped_few_shot", 0 106 | ), 107 | "explanation": example.explanation, 108 | 
"narrative": result.narrative, 109 | "scores": "".join( 110 | f"{metric}: {score}, " 111 | for metric, score in score[1].items() 112 | ), 113 | } 114 | ) 115 | 116 | total = min(max_iters, len(self.eval_data)) 117 | average_scores = total_scores / total 118 | total_average_score = total_scores.sum() / total 119 | 120 | if self.save_results: 121 | return total_average_score, average_scores, results 122 | return total_average_score, average_scores 123 | 124 | def run_basic_prompting_experiment(self, prompt=None, max_iters=100): 125 | """ 126 | Run a basic prompting experiment 127 | 128 | Args: 129 | prompt (string): Prompt 130 | max_iters (int): Maximum number of examples to run on 131 | 132 | Returns: 133 | total_average_score (float): Average total score over all explanations 134 | average_scores (pd.Series): Average scores for each metric 135 | """ 136 | return self.run_experiment( 137 | self.explingo.basic_prompt, 138 | prompt=prompt, 139 | max_iters=max_iters, 140 | ) 141 | 142 | def run_few_shot_experiment(self, prompt=None, max_iters=100, n_few_shot=3): 143 | """ 144 | Run a few-shot experiment 145 | 146 | Args: 147 | prompt (string): Prompt 148 | max_iters (int): Maximum number of examples to run on 149 | n_few_shot (int): Number of examples to use in few-shot learning 150 | 151 | Returns: 152 | total_average_score (float): Average total score over all explanations 153 | average_scores (pd.Series): Average scores for each metric 154 | """ 155 | return self.run_experiment( 156 | self.explingo.few_shot, 157 | prompt=prompt, 158 | max_iters=max_iters, 159 | kwargs={"n_few_shot": n_few_shot}, 160 | ) 161 | 162 | def run_bootstrap_few_shot_experiment( 163 | self, 164 | prompt=None, 165 | max_iters=100, 166 | n_labeled_few_shot=3, 167 | n_bootstrapped_few_shot=3, 168 | ): 169 | """ 170 | Run a bootstrap few-shot experiment 171 | Args: 172 | prompt (string): Prompt 173 | max_iters (int): Maximum number of examples to run on 174 | n_labeled_few_shot (int): Number of 
examples to use in few-shot learning 175 | n_bootstrapped_few_shot (int): Number of bootstrapped examples to use in few-shot learning 176 | 177 | Returns: 178 | total_average_score (float): Average total score over all explanations 179 | average_scores (pd.Series): Average scores for each metric 180 | """ 181 | return self.run_experiment( 182 | self.explingo.bootstrap_few_shot, 183 | prompt=prompt, 184 | max_iters=max_iters, 185 | kwargs={ 186 | "metric": self.metrics, 187 | "n_labeled_few_shot": n_labeled_few_shot, 188 | "n_bootstrapped_few_shot": n_bootstrapped_few_shot, 189 | }, 190 | ) 191 | -------------------------------------------------------------------------------- /evaluation/explingo.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | from dspy.teleprompt import LabeledFewShot, BootstrapFewShot 3 | import random 4 | 5 | 6 | def _manually_parse_output(output): 7 | try: 8 | narrative = output.split("Narrative: ")[1].split("\n")[0] 9 | except IndexError: 10 | print(f"Unable to parse output: {output}") 11 | return None 12 | # rationalization = output.split("Rationalization: ")[1].split("\n")[0] 13 | return dspy.Prediction( 14 | narrative=narrative, 15 | # rationalization=rationalization, 16 | ) 17 | 18 | 19 | class ExplingoSig(dspy.Signature): 20 | """You are helping users understand an ML model's prediction. 
class Explingo:
    """Converts ML explanations into natural-language narratives using an LLM.

    Offers three prompting strategies: zero-shot (`basic_prompt`), few-shot
    with labeled examples (`few_shot`), and DSPy-optimized bootstrapped
    few-shot (`bootstrap_few_shot`).
    """

    def __init__(self, llm, context, labeled_train_data, unlabeled_train_data=None):
        """
        Args:
            llm: DSPy-compatible language model
            context (string): Description of what the ML model predicts
            labeled_train_data (list): Training examples with gold narratives
            unlabeled_train_data (list): Training examples without narratives
        """
        dspy.settings.configure(lm=llm, experimental=True)
        self.llm = llm
        self.context = context
        self.labeled_train_data = labeled_train_data
        self.unlabeled_train_data = (
            [] if unlabeled_train_data is None else unlabeled_train_data
        )
        # Compiled DSPy prompters, built lazily on first use
        self.few_shot_prompter = None
        self.bootstrapped_few_shot_prompter = None
        self.default_prompt = (
            "You are helping users understand an ML model's prediction. "
            "Given an explanation and information about the model, "
            "convert the explanation into a human-readable narrative."
        )

    def assemble_prompt(
        self, prompt, explanation, explanation_format, examples=None, k=3
    ):
        """Assemble the full text prompt sent to the LLM.

        Args:
            prompt (string): Instruction header
            explanation (string): Explanation to narrate
            explanation_format (string): Format the explanation is given in
            examples (list): Optional few-shot examples (objects with .context,
                .explanation, .explanation_format, .narrative attributes)
            k (int): Number of examples to sample, capped at len(examples)

        Returns:
            string: Prompt sections joined by "---" separator lines
        """
        header_string = f"{prompt}\n"
        format_string = (
            f"Follow the following format\n"
            f"Context: what the model predicts\n"
            f"Explanation: explanation of the model's prediction\n"
            f"Explanation Format: format the explanation is given in\n"
            f"Narrative: human-readable narrative version of the explanation\n"
        )
        input_string = (
            f"Context: {self.context}\n"
            f"Explanation: {explanation}\n"
            f"Explanation Format: {explanation_format}\n"
            "Please provide the output field Narrative. "
            "Do so immediately, without additional content before or after, "
            "and precisely as the format above shows."
        )

        examples_string = ""
        if examples is not None:
            # BUG FIX: random.sample raises ValueError when k > len(examples);
            # cap k so small training sets still work.
            n_samples = min(k, len(examples))
            for i, example in enumerate(random.sample(examples, n_samples)):
                examples_string += (
                    f"Example {i+1}\n"
                    f"Context: {example.context}\n"
                    f"Explanation: {example.explanation}\n"
                    f"Explanation Format: {example.explanation_format}\n"
                    f"Narrative: {example.narrative}\n"
                )

        if len(examples_string) == 0:
            return "---\n".join([header_string, format_string, input_string])
        return "---\n".join(
            [header_string, format_string, examples_string, input_string]
        )

    def basic_prompt(self, explanation, explanation_format, prompt=None, few_shot_n=0):
        """
        Basic (zero-shot) prompting

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt (defaults to self.default_prompt)
            few_shot_n (int): Unused; kept for interface compatibility

        Returns:
            DSPy Prediction object, or None if the output could not be parsed
        """
        if prompt is None:
            prompt = self.default_prompt
        full_prompt = self.assemble_prompt(
            prompt, explanation, explanation_format, examples=None
        )
        output = self.llm(full_prompt)[0]
        return _manually_parse_output(output)

    def few_shot(
        self, explanation, explanation_format, prompt=None, n_few_shot=3, use_dspy=False
    ):
        """
        Few-shot prompting

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt (defaults to self.default_prompt)
            n_few_shot (int): Number of examples to use in few-shot learning
            use_dspy (bool): Should be set to False, saving legacy version
                using DSPy in case needed later

        Returns:
            DSPy Prediction object, or None if the output could not be parsed
        """
        if prompt is None:
            prompt = self.default_prompt
        if not use_dspy:
            # BUG FIX: assemble_prompt's keyword is `k`; this was previously
            # called with `n=n_few_shot`, raising a TypeError on every call.
            full_prompt = self.assemble_prompt(
                prompt,
                explanation,
                explanation_format,
                examples=self.labeled_train_data,
                k=n_few_shot,
            )
            output = self.llm(full_prompt)[0]
            return _manually_parse_output(output)
        # Legacy DSPy path: compile the prompter once and reuse it
        if self.few_shot_prompter is None:
            optimizer = LabeledFewShot(k=n_few_shot)
            self.few_shot_prompter = optimizer.compile(
                dspy.Predict(ExplingoSig), trainset=self.labeled_train_data
            )
        return self.few_shot_prompter(
            explanation=explanation,
            explanation_format=explanation_format,
            context=self.context,
        )

    def bootstrap_few_shot(
        self,
        explanation,
        explanation_format,
        metric,
        prompt=None,
        n_labeled_few_shot=3,
        n_bootstrapped_few_shot=3,
    ):
        """
        Use DSPy to bootstrap few-shot prompts to optimize metrics

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            metric (callable): Metric to optimize (passed to BootstrapFewShot)
            prompt (string): Not supported, included for consistency. To modify
                the prompt, manually edit the docstring of ExplingoSig
            n_labeled_few_shot (int): Number of examples to use in few-shot learning
            n_bootstrapped_few_shot (int): Number of bootstrapped examples to use
                in few-shot learning

        Returns:
            DSPy Prediction object
        """
        # NOTE: the prompter is deliberately recompiled on every call, since
        # the demo counts may change between calls.
        optimizer = BootstrapFewShot(
            metric=metric,
            max_bootstrapped_demos=n_bootstrapped_few_shot,
            max_labeled_demos=n_labeled_few_shot,
            max_rounds=3,
        )
        self.bootstrapped_few_shot_prompter = optimizer.compile(
            dspy.Predict(ExplingoSig),
            trainset=self.labeled_train_data + self.unlabeled_train_data,
        )
        return self.bootstrapped_few_shot_prompter(
            explanation=explanation,
            explanation_format=explanation_format,
            context=self.context,
        )
import dspy
import pandas as pd
import random

# Maximum score a single metric can award
MAX_SCORE = 4


class RubricAssess(dspy.Signature):
    """Assess a narrative based on a rubric."""

    question = dspy.InputField(format=str)
    narrative = dspy.InputField()
    rubric = dspy.InputField()

    assessment = dspy.OutputField(
        desc="A single number from the options in the rubric. Provide only a single number with no other text."
    )


class BooleanAssess(dspy.Signature):
    """Assess a narrative with a yes/no question."""

    question = dspy.InputField(format=str)
    narrative = dspy.InputField()

    assessment = dspy.OutputField(desc="yes or no. Include only the word yes or no.")


class Metrics:
    """Callable bundle of metric functions graded by an OpenAI LLM.

    Called with trace=None (evaluation mode), returns the total score and a
    per-metric pd.Series breakdown. Called with a trace (inside DSPy
    compilation), returns a boolean pass/fail signal used to decide whether
    a bootstrapped demo is kept.
    """

    def __init__(self, metric_funcs, openai_key, verbose=0, metric_kwargs=None):
        """
        Args:
            metric_funcs (list of callables): Metric functions, invoked as
                metric(input_, output_, grader=..., trace=..., **kwargs)
            openai_key (string): OpenAI API key for the grading model
            verbose (int): Verbosity level
            metric_kwargs (dict): Extra kwargs keyed by metric function name
        """
        self.metric_funcs = metric_funcs
        self.verbose = verbose
        self.metric_kwargs = metric_kwargs if metric_kwargs is not None else {}
        self.grader = dspy.OpenAI(
            model="gpt-4o",
            max_tokens=500,
            model_type="chat",
            api_key=openai_key,
            temperature=0.3,
        )

    def __call__(self, input_, output_, trace=None):
        metrics = {}
        for metric in self.metric_funcs:
            metric_name = metric.__name__
            kwargs = self.metric_kwargs.get(metric_name, {})
            metrics[metric_name] = metric(
                input_, output_, grader=self.grader, trace=trace, **kwargs
            )

        total_score = sum(metrics.values())

        if trace is None:
            return total_score, pd.Series(metrics)
        # Compilation mode: require near-perfect scores for a demo to be kept
        return (
            (metrics["accuracy"] == MAX_SCORE)
            and (metrics["fluency"] == MAX_SCORE)
            and (metrics["completeness"] == MAX_SCORE)
            and (metrics["conciseness"] >= 3.5)
        )


def compute_score_from_boolean(metric, question, narrative, grader, iters=3):
    """Ask the grader a yes/no question `iters` times; return the fraction of
    "yes" answers scaled to MAX_SCORE. Unparseable answers are logged and
    counted as "no"."""
    total_score = 0.0

    with dspy.context(lm=grader):
        for i in range(iters):
            score = dspy.Predict(BooleanAssess)(
                question=question, narrative=narrative
            ).assessment.lower()
            if score == "yes":
                total_score += 1
            elif score == "no":
                pass
            else:
                print("Invalid score for metric %s: %s" % (metric, score))
    score = total_score / iters

    # A near 50/50 split means the grader is unsure; flag it for review
    if 0.3 < score < 0.7:
        print("Inconsistent score for metric %s: %s" % (metric, score))

    return score * MAX_SCORE


def compute_score_from_rubric(
    metric, question, rubric, narrative, grader, iters=3, rational_type=None
):
    """Ask the grader to score the narrative against a rubric `iters` times
    and return the mean of the successfully parsed integer scores.

    Args:
        metric (string): Metric name (for log messages)
        question (string): Question posed to the grader
        rubric (string): Rubric describing the score options
        narrative (string): Narrative being assessed
        grader: DSPy LM used for grading
        iters (int): Number of grading rounds
        rational_type: Optional dspy.OutputField; when given, grading uses
            ChainOfThought with this rationale prefix. (Parameter name keeps
            its historical spelling for caller compatibility.)
    """
    scores = []
    with dspy.context(lm=grader):
        for i in range(iters):
            if rational_type is None:
                score = dspy.Predict(RubricAssess)(
                    question=question, rubric=rubric, narrative=narrative
                ).assessment
            else:
                score = dspy.ChainOfThought(RubricAssess, rationale_type=rational_type)(
                    question=question,
                    rubric=rubric,
                    narrative=narrative,
                ).assessment
            try:
                scores.append(int(score))
            except ValueError:
                print("Invalid score for metric %s: %s" % (metric, score))

    # Opposite extremes across rounds mean the grader is unsure; flag it
    if 0 in scores and MAX_SCORE in scores:
        print("Inconsistent score for metric %s: %s" % (metric, scores))

    # BUG FIX: previously divided by `iters`, silently counting every
    # unparseable grader reply as a 0; average only over parsed scores.
    if not scores:
        return 0.0
    return sum(scores) / len(scores)


def accuracy(input_, output_, grader, trace=None):
    """Score (0 or 4) whether everything stated in the narrative is correct
    with respect to the explanation; missing information is not penalized."""
    question = (
        f"How accurate is the information in the narrative, based on the explanation given? "
        f"A narrative can score 4 even if it is missing information as long as everything in the narrative is correct. "
        f"Make sure the contribution direction is correct - positive contributions increase the output, negative contributions decrease the output."
        f"\n\nExplanation format: {input_.explanation_format}.\nExplanation: {input_.explanation}"
    )
    rubric = f"0 - Contains one or more errors in value or contribution direction. 4 - Contains no errors, but may be missing information."

    rational_type = dspy.OutputField(
        prefix="Start by listing out all the features in the narrative, and then for each one compare it to the explanation to ensure its value and contribution are approximately correct.",
    )

    return compute_score_from_rubric(
        "accuracy",
        question,
        rubric=rubric,
        narrative=output_.narrative,
        grader=grader,
        rational_type=rational_type,
    )


def fluency(
    input_, output_, grader, trace=None, good_narratives=None, bad_narratives=None
):
    """Score (0-4) how natural the narrative reads; when example narratives
    are provided, score stylistic similarity to them instead."""
    if good_narratives is None:
        question = f"How natural and human is the narrative?"
    else:
        question = f"How well does the style of the narrative match the style of the example narratives? Consider only the linguistic style, not the topic. Example narratives:"
        for narrative in good_narratives:
            question += f"\n{narrative}"
    if good_narratives is not None:
        rubric = f"0: Very dissimilar. 1: Dissimilar. 2: Neutral. 3: Similar. 4: Very similar"
    else:
        rubric = (
            f"0: Very unnatural. 1: Unnatural. 2: Neutral. 3: Natural. 4: Very natural"
        )
    return compute_score_from_rubric(
        "fluency", question, rubric, output_.narrative, grader
    )


def completeness(input_, output_, grader, trace=None):
    """Score (0/2/4) how completely the narrative covers the features, values,
    and contribution directions present in the explanation."""
    question = f"How completely does the narrative below describe the explanation given in <<>>?\nExplanation format: {input_.explanation_format}.\nExplanation: <<{input_.explanation}>>"
    rubric = "0 - One or more feature names from the explanation are not mentioned at all in the narrative. 2 - All features are mentioned, but not all feature values and/or contribution directions. 4 - All features are mentioned, and for each feature, includes at least an approximation of the feature's value and contribution direction."
    rational_type = dspy.OutputField(
        prefix="Start by listing out all the features in the explanations, and then determine every feature is present in the narrative, along with its value and contribution direction.",
    )

    return compute_score_from_rubric(
        "completeness",
        question,
        rubric,
        output_.narrative,
        grader,
        rational_type=rational_type,
    )


def conciseness(
    input_, output_, grader=None, trace=None, max_optimal_length_per_feature=20
):
    """Score (0-MAX_SCORE) narrative length relative to explanation size.

    Features are counted as "(" occurrences in the explanation (min 1).
    Narratives up to max_optimal_length words get full marks, then the score
    decays linearly to 0 at twice that length.
    """
    num_features = input_.explanation.count("(")
    if num_features == 0:
        num_features = 1
    length = len(output_.narrative.split())
    max_optimal_length = max_optimal_length_per_feature * num_features
    # Full marks up to max_optimal_length words, linear decay to 0 at 2x that
    # (the raw value is clamped into [0, MAX_SCORE])
    return max(
        0.0,
        min(
            MAX_SCORE,
            MAX_SCORE * (2 - length / max_optimal_length),
        ),
    )


def context_awareness(input_, output_, grader, trace=None):
    """Score (0/2/4) how well the rationalization explains the narrative's
    logic. Requires output_ to carry a `rationalization` field."""
    question = (
        f"How well does the rationalization help explain the logic in the narrative?"
    )
    rubric = f"0: Not at all. 2: Somewhat. 4: Very well."
    narrative_input = (
        f"Narrative: {output_.narrative}. Rationalization: {output_.rationalization}"
    )
    return compute_score_from_rubric(
        "context_awareness", question, rubric, narrative_input, grader
    )
Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Above ground living area square feet, 2090.00, 16382.07), (Second floor square feet, 983.00, 16216.99), (Physical locations within Ames city limits, NWAmes, -9769.73), (Type 1 finished square feet, 859.00, 6193.63), (Masonry veneer type, Stone, 5446.26)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 4 | {"prompt":"Narrative: The construction date is a negative factor.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Original construction date, 1915.00, -17966.77), (Physical locations within Ames city limits, Crawfor, 17703.26), (Second floor square feet, 756.00, 10129.96), (Total square feet of basement area, 756.00, -8362.22), (Condition of sale, Abnorml, -6786.66)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 5 | {"prompt":"Narrative: This houses exterior covering is brick, which increased the predicted price by about $17,000. The house is older than average, with a construction year of 1931, which reduced the predicted price by about $13,000. The house's two kitchens reduced the price by about $13,000. The house's second floor size of over 700 sq ft increased the price by about $10,000.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. 
Explanation: (Exterior covering on house, BrkFace, 16798.14), (Original construction date, 1931.00, -13042.68), (Kitchens above grade, 2.00, -12983.78), (Second floor square feet, 752.00, 10022.69)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"4"} 6 | {"prompt":"Narrative: The original construction date plays a significant role.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72), (Total square feet of basement area, 856.00, -6157.86)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 7 | {"prompt":"Narrative: The garden level walls have a positive impact.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Refers to walkout or garden level walls, Gd, 17607.43), (Rates the overall condition of the house, 8.00, 13038.14), (Above ground living area square feet, 1262.00, -12319.48), (Second floor square feet, 0.00, -10142.29), (Proximity to various conditions, Feedr, -8251.83)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 8 | {"prompt":"Narrative: The wood foundation decreases the predicted price.\n\nQuestion: Does the narrative contain all information from the explanation? 
Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Type of foundation, Wood, -18650.67), (Physical locations within Ames city limits, Mitchel, -13510.92), (Rates the overall material and finish of the house, 5.00, -10743.76), (Three season porch area in square feet, 320.00, 9959.33), (Bedrooms above ground, 1.00, 8905.73)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 9 | {"prompt":"Narrative: The total square feet of the basement area has a significant impact.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Type 1 finished square feet, 1369.00, 14641.53), (Evaluates the height of the basement, Ex, 13233.24), (Total square feet of basement area, 1686.00, 12138.28), (Second floor square feet, 0.00, -10142.29), (Rates the overall material and finish of the house, 8.00, 9655.79)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 10 | {"prompt":"Narrative: The total square feet of the basement area influences the prediction.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Second floor square feet, 866.00, 13079.62), (Original construction date, 2001.00, 8500.21), (Above ground living area square feet, 1786.00, 5844.30), (Physical locations within Ames city limits, CollgCr, -4761.42), (Total square feet of basement area, 920.00, -4747.08)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 
4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 11 | {"prompt":"Narrative: The large finished square footage (of around 1300) increases the price by over $14,000, while the lack of a second floor decreases it by around $10,000.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Type 1 finished square feet, 1369.00, 14641.53), (Second floor square feet, 0.00, -10142.29)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"4"} 12 | {"prompt":"Narrative: The second floor square footage of 752 increased the predicted price by about $10,000.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Second floor square feet, 752.00, 10022.69)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"4"} 13 | {"prompt":"Narrative: The location in NoRidge affects the price.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Physical locations within Ames city limits, NoRidge, 23069.89), (Above ground living area square feet, 2198.00, 20125.75), (Second floor square feet, 1053.00, 18094.05), (Rates the overall material and finish of the house, 8.00, 9655.79), (Original construction date, 2000.00, 8192.46)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 
import random

import dspy
from dspy.teleprompt import BootstrapFewShot


def _manually_parse_output(output):
    """Parse a raw LLM completion into a dspy.Prediction.

    The prompt instructs the LLM to answer with a "Narrative: ..." line;
    this extracts that line's text.

    Args:
        output (string): Raw LLM completion text

    Returns:
        dspy.Prediction with a ``narrative`` field, or None if the output
        does not contain a "Narrative: " marker
    """
    try:
        narrative = output.split("Narrative: ")[1].split("\n")[0]
    except IndexError:
        # Malformed completion: surface it for debugging rather than raising
        print(f"Unable to parse output: {output}")
        return None
    # rationalization = output.split("Rationalization: ")[1].split("\n")[0]
    return dspy.Prediction(
        narrative=narrative,
        # rationalization=rationalization,
    )


class NarratorSig(dspy.Signature):
    """You are helping users understand an ML model's prediction. Given an explanation
    and information about the model, convert the explanation into a human-readable narrative."""

    context = dspy.InputField(desc="what the ML model predicts")
    explanation = dspy.InputField(desc="explanation of an ML model's prediction")
    explanation_format = dspy.InputField(desc="format the explanation is given in")

    narrative = dspy.OutputField(desc="human-readable narrative version of the explanation")
    # rationalization = dspy.OutputField(
    #     desc="explains why given features may be relevant"
    # )


class Narrator:
    """Converts ML explanations (e.g. SHAP feature contributions) into
    human-readable narratives using an LLM, optionally with few-shot
    examples or DSPy-bootstrapped few-shot prompting."""

    def __init__(
        self,
        explanation_format,
        context,
        llm=None,
        openai_api_key=None,
        sample_narratives=None,
        gpt_model_name="gpt-4o",
    ):
        """
        Args:
            explanation_format (string): Format explanations will take
            context (string): Brief description of what the model predicts
                (ie. "the model predicts house prices")
            llm (LLM object): DSPy LLM object to use.
                See https://dspy-docs.vercel.app/docs/building-blocks/language_models for examples
                One of llm or openai_api_key must be provided
            openai_api_key (string): OpenAI API key to use
            sample_narratives (list of tuples of strings):
                List of (explanation, narrative) examples
            gpt_model_name (string): if openai_api_key is provided,
                specifies the GPT version to use

        Raises:
            ValueError: if neither llm nor openai_api_key is provided
        """
        self.llm = llm
        if self.llm is None and openai_api_key is not None:
            self.llm = dspy.OpenAI(model=gpt_model_name, api_key=openai_api_key, max_tokens=1000)
        if self.llm is None:
            # Fail fast instead of deferring a crash to the first narrate() call
            raise ValueError("One of llm or openai_api_key must be provided")
        self.context = context
        self.explanation_format = explanation_format
        # Stored as dspy.Example objects so they work for both manual prompt
        # assembly and DSPy bootstrapping
        self.sample_narratives = []
        if sample_narratives is not None:
            for example in sample_narratives:
                self.sample_narratives.append(
                    dspy.Example(
                        explanation=example[0],
                        narrative=example[1],
                        context=self.context,
                        explanation_format=explanation_format,
                    ).with_inputs("explanation", "context", "explanation_format")
                )

        self.few_shot_prompter = None
        self.bootstrapped_few_shot_prompter = None
        self.default_prompt = (
            "You are helping users understand an ML model's prediction. "
            "Given an explanation and information about the model, "
            "convert the explanation into a human-readable narrative."
        )

    def _assemble_prompt(self, prompt, explanation, explanation_format, examples=None, n=3):
        """Build the full text prompt sent to the LLM.

        Sections (header, format spec, optional examples, input) are joined
        with "---" separators.

        Args:
            prompt (string): Instruction header
            explanation (string): Explanation to narrate
            explanation_format (string): Format the explanation is given in
            examples (list of dspy.Example): Few-shot examples to sample from
            n (int): Maximum number of examples to include; capped at
                len(examples) so small sample sets do not raise

        Returns:
            string: Assembled prompt
        """
        header_string = f"{prompt}\n"
        format_string = (
            "Follow the following format\n"
            "Context: what the model predicts\n"
            "Explanation: explanation of the model's prediction\n"
            "Explanation Format: format the explanation is given in\n"
            "Narrative: human-readable narrative version of the explanation\n"
        )
        input_string = (
            f"Context: {self.context}\n"
            f"Explanation: {explanation}\n"
            f"Explanation Format: {explanation_format}\n"
            "Please provide the output field Narrative. "
            "Do so immediately, without additional content before or after, "
            "and precisely as the format above shows."
        )

        examples_string = ""
        if examples is not None:
            # Cap the sample size so random.sample does not raise ValueError
            # when fewer than n examples are available
            for i, example in enumerate(random.sample(examples, min(n, len(examples)))):
                examples_string += (
                    f"Example {i+1}\n"
                    f"Context: {example.context}\n"
                    f"Explanation: {example.explanation}\n"
                    f"Explanation Format: {example.explanation_format}\n"
                    f"Narrative: {example.narrative}\n"
                )

        if len(examples_string) == 0:
            return "---\n".join([header_string, format_string, input_string])
        else:
            return "---\n".join([header_string, format_string, examples_string, input_string])

    def narrate(self, explanation, n_examples=3, n_bootstrapped=0, grader=None):
        """
        Transform an explanation into a human-readable narrative

        Args:
            explanation (string): Explanation, in the format specified by self.explanation_format
            n_examples (int): Number of examples to pass
            n_bootstrapped (int): Number of bootstrapped examples to pass. Increasing this number
                will incur additional calls to the LLM, but may improve the quality of the output
                n_bootstrapped should be less than or equal to n_examples
            grader (Grader): Grader object to use for bootstrapping. Must be provided if
                n_bootstrapped > 0

        Returns:
            string: Human-readable narrative

        Raises:
            ValueError: if n_bootstrapped > 0 and no grader is provided
        """
        if n_bootstrapped > 0:
            if grader is None:
                # Without this check the call fails later with an opaque
                # AttributeError on grader.run_metrics
                raise ValueError("grader must be provided when n_bootstrapped > 0")
            return self.bootstrap_few_shot(
                explanation,
                self.explanation_format,
                metric=grader.run_metrics,
                n_labeled_few_shot=n_examples,
                n_bootstrapped_few_shot=n_bootstrapped,
            ).narrative
        if self.sample_narratives:
            return self.few_shot(
                explanation, self.explanation_format, n_few_shot=n_examples
            ).narrative
        else:
            return self.basic_prompt(explanation, self.explanation_format).narrative

    def basic_prompt(self, explanation, explanation_format, prompt=None, few_shot_n=0):
        """
        Basic (zero-shot) prompting

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt; defaults to self.default_prompt
            few_shot_n (int): Unused; kept for backwards compatibility

        Returns:
            DSPy Prediction object, or None if the LLM output could not be parsed
        """
        if prompt is None:
            prompt = self.default_prompt
        full_prompt = self._assemble_prompt(prompt, explanation, explanation_format, examples=None)
        output = self.llm(full_prompt)[0]
        return _manually_parse_output(output)

    def few_shot(self, explanation, explanation_format, prompt=None, n_few_shot=3, use_dspy=False):
        """
        Few-shot prompting using self.sample_narratives as examples

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt; defaults to self.default_prompt
            n_few_shot (int): Number of examples to use in few-shot learning
            use_dspy (bool): Should be set to False, saving legacy version using DSPy
                in case needed later

        Returns:
            DSPy Prediction object, or None if the LLM output could not be
            parsed. NOTE: the legacy DSPy path is not implemented, so
            use_dspy=True currently returns None.
        """
        if prompt is None:
            prompt = self.default_prompt
        if not use_dspy:
            full_prompt = self._assemble_prompt(
                prompt,
                explanation,
                explanation_format,
                examples=self.sample_narratives,
                n=n_few_shot,
            )
            output = self.llm(full_prompt)[0]
            return _manually_parse_output(output)

    def bootstrap_few_shot(
        self,
        explanation,
        explanation_format,
        metric,
        n_labeled_few_shot=3,
        n_bootstrapped_few_shot=3,
    ):
        """
        Use DSPy to bootstrap few-shot prompts to optimize metrics

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            metric (func): Metric to use for optimization
            n_labeled_few_shot (int): Number of examples to use in few-shot learning
            n_bootstrapped_few_shot (int): Number of bootstrapped examples to use in
                few-shot learning

        Returns:
            DSPy Prediction object
        """
        with dspy.context(lm=self.llm):
            optimizer = BootstrapFewShot(
                metric=metric,
                max_bootstrapped_demos=n_bootstrapped_few_shot,
                max_labeled_demos=n_labeled_few_shot,
                max_rounds=3,
            )
            # The compiled prompter is cached on the instance so later calls
            # could reuse it if desired
            self.bootstrapped_few_shot_prompter = optimizer.compile(
                dspy.Predict(NarratorSig),
                trainset=self.sample_narratives,
            )
            return self.bootstrapped_few_shot_prompter(
                explanation=explanation,
                explanation_format=explanation_format,
                context=self.context,
            )
7 | }, 8 | { 9 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.07)", 10 | "context": "The model predicts whether a mushroom is poisonous", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The foul odor, silky stalk surface, and chocolate spore print color suggest the mushroom more likely to be poisonous." 13 | }, 14 | { 15 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.05), (spore-print-color, brown, 0.05)", 16 | "context": "The model predicts whether a mushroom is poisonous", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "The absence of odor and broad gill size suggest the mushroom is less likely to be poisonous, but the brown spore print indicates a higher risk of toxicity." 19 | }, 20 | { 21 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.07)", 22 | "context": "The model predicts whether a mushroom is poisonous", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 24 | }, 25 | { 26 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.06), (spore-print-color, black, -0.05)", 27 | "context": "The model predicts whether a mushroom is poisonous", 28 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 29 | "narrative": "The lack of odor, broad gill size, and black spore print suggest the mushroom is less likely to be poisonous" 30 | }, 31 | { 32 | "explanation": "(odor, none, -0.14), (gill-size, broad, -0.07), (spore-print-color, black, -0.05)", 33 | "context": "The model predicts whether a mushroom is poisonous", 34 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 35 | "narrative": "The lack 
of odor, a broad gill size, and a black spore print color suggest the mushroom is less likely to be poisonous." 36 | }, 37 | { 38 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.09), (spore-print-color, chocolate, 0.07)", 39 | "context": "The model predicts whether a mushroom is poisonous", 40 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 41 | "narrative": "The foul odor, silky stalk surface, and chocolate spore print color suggest the mushroom is more likely to be poisonous." 42 | }, 43 | { 44 | "explanation": "(odor, none, -0.15), (gill-spacing, crowded, 0.07), (gill-size, broad, 0.05)", 45 | "context": "The model predicts whether a mushroom is poisonous", 46 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 47 | "narrative": "The lack of an odor suggest this mushroom to be less likely to be poisonous, but the broad gill size and crowded gill spacing indicate a higher risk of toxicity." 48 | }, 49 | { 50 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.08), (ring-type, pendant, 0.05)", 51 | "context": "The model predicts whether a mushroom is poisonous", 52 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 53 | "narrative": "The lack of odor, a broad gill size, and a pendant ring type suggest the mushroom is more likely to be poisonous." 54 | }, 55 | { 56 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.06), (spore-print-color, black, 0.05)", 57 | "context": "The model predicts whether a mushroom is poisonous", 58 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 59 | "narrative": "The lack of odor, a broad gill size, and a black spore print color suggest the mushroom is more likely to be poisonous." 
60 | }, 61 | { 62 | "explanation": "(odor, none, -0.14), (gill-spacing, crowded, 0.09), (gill-size, broad, 0.06)", 63 | "context": "The model predicts whether a mushroom is poisonous", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 65 | "narrative": "The lack of odor suggest this mushroom is less likely to be poisonous. However, the crowded gill spacing and broad gill size indicate a higher risk of toxicity." 66 | }, 67 | { 68 | "explanation": "(odor, foul, 0.19), (gill-color, buff, 0.07), (gill-size, narrow, 0.07)", 69 | "context": "The model predicts whether a mushroom is poisonous", 70 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 71 | }, 72 | { 73 | "explanation": "(gill-size, narrow, -0.13), (odor, fishy, -0.10), (gill-color, buff, -0.08)", 74 | "context": "The model predicts whether a mushroom is poisonous", 75 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 76 | "narrative": "The narrow gill size, fishy odor, and buff gill color suggest the mushroom is less likely to be poisonous." 
77 | }, 78 | { 79 | "explanation": "(odor, foul, 0.19), (gill-size, narrow, 0.09), (gill-color, buff, 0.07)", 80 | "context": "The model predicts whether a mushroom is poisonous", 81 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 82 | }, 83 | { 84 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 85 | "context": "The model predicts whether a mushroom is poisonous", 86 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 87 | }, 88 | { 89 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 90 | "context": "The model predicts whether a mushroom is poisonous", 91 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 92 | }, 93 | { 94 | "explanation": "(odor, foul, 0.25), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.06)", 95 | "context": "The model predicts whether a mushroom is poisonous", 96 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 97 | }, 98 | { 99 | "explanation": "(odor, none, 0.13), (gill-size, broad, 0.08), (stalk-surface-above-ring, smooth, 0.04)", 100 | "context": "The model predicts whether a mushroom is poisonous", 101 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 102 | }, 103 | { 104 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 105 | "context": "The model predicts whether a mushroom is poisonous", 106 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 107 | }, 108 | { 109 | "explanation": "(gill-size, narrow, 0.10), (odor, spicy, 0.08), (gill-color, buff, 0.07)", 110 | "context": "The model predicts whether a mushroom is poisonous", 111 | 
"explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 112 | }, 113 | { 114 | "explanation": "(spore-print-color, green, 0.26), (ring-number, two, 0.10), (odor, none, -0.06)", 115 | "context": "The model predicts whether a mushroom is poisonous", 116 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 117 | }, 118 | { 119 | "explanation": "(odor, pungent, 0.18), (gill-size, narrow, 0.18), (stalk-shape, enlarging, 0.05)", 120 | "context": "The model predicts whether a mushroom is poisonous", 121 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 122 | }, 123 | { 124 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.08), (spore-print-color, black, 0.05)", 125 | "context": "The model predicts whether a mushroom is poisonous", 126 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 127 | }, 128 | { 129 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 130 | "context": "The model predicts whether a mushroom is poisonous", 131 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 132 | }, 133 | { 134 | "explanation": "(gill-size, narrow, 0.12), (odor, spicy, 0.10), (gill-color, buff, 0.08)", 135 | "context": "The model predicts whether a mushroom is poisonous", 136 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 137 | }, 138 | { 139 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 140 | "context": "The model predicts whether a mushroom is poisonous", 141 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 142 | }, 143 | { 144 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), 
(gill-color, buff, 0.08)", 145 | "context": "The model predicts whether a mushroom is poisonous", 146 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 147 | }, 148 | { 149 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.10), (gill-color, buff, 0.08)", 150 | "context": "The model predicts whether a mushroom is poisonous", 151 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 152 | }, 153 | { 154 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 155 | "context": "The model predicts whether a mushroom is poisonous", 156 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 157 | }, 158 | { 159 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.06)", 160 | "context": "The model predicts whether a mushroom is poisonous", 161 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 162 | } 163 | ] -------------------------------------------------------------------------------- /evaluation/eval_data/mushroom_1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.06)", 4 | "context": "The model predicts whether a mushroom is poisonous", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "This mushroom is more likely to be poisonous because its foul odor, silky stalk surface, and chocolate spore print color. Be careful!" 
7 | }, 8 | { 9 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.07)", 10 | "context": "The model predicts whether a mushroom is poisonous", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "This mushroom is more likely to be poisonous because its foul odor, silky stalk surface, and chocolate spore print color. Be careful!" 13 | }, 14 | { 15 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.05), (spore-print-color, brown, 0.05)", 16 | "context": "The model predicts whether a mushroom is poisonous", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "This mushroom is less likely to be poisonous because it has no odor and a broad gill size. However, its brown spore print color increases the likelihood of it being poisonous. Be cautious!" 19 | }, 20 | { 21 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.07)", 22 | "context": "The model predicts whether a mushroom is poisonous", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 24 | }, 25 | { 26 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.06), (spore-print-color, black, -0.05)", 27 | "context": "The model predicts whether a mushroom is poisonous", 28 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 29 | "narrative": "This mushroom is less likely to be poisonous because it has no odor, a broad gill size, and a black spore print color. You should still confirm with external sources." 
30 | }, 31 | { 32 | "explanation": "(odor, none, -0.14), (gill-size, broad, -0.07), (spore-print-color, black, -0.05)", 33 | "context": "The model predicts whether a mushroom is poisonous", 34 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 35 | "narrative": "This mushroom is less likely to be poisonous because it has no odor, a broad gill size, and a black spore print color. You should still confirm with external sources." 36 | }, 37 | { 38 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.09), (spore-print-color, chocolate, 0.07)", 39 | "context": "The model predicts whether a mushroom is poisonous", 40 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 41 | "narrative": "This mushroom is more likely to be poisonous because its foul odor, silky stalk surface, and chocolate spore print color. Be careful!" 42 | }, 43 | { 44 | "explanation": "(odor, none, -0.15), (gill-spacing, crowded, 0.07), (gill-size, broad, 0.05)", 45 | "context": "The model predicts whether a mushroom is poisonous", 46 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 47 | "narrative": "While the lack of an odor make this mushroom less likely to be poisonous, its broad gill size and crowded gill spacing increases the likelihood of it being poisonous. Be cautious!" 48 | }, 49 | { 50 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.08), (ring-type, pendant, 0.05)", 51 | "context": "The model predicts whether a mushroom is poisonous", 52 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 53 | "narrative": "This mushroom is more likely to be poisonous because it has no odor, a broad gill size, and a pendant ring type. Be careful!" 
54 | }, 55 | { 56 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.06), (spore-print-color, black, 0.05)", 57 | "context": "The model predicts whether a mushroom is poisonous", 58 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 59 | "narrative": "This mushroom is more likely to be poisonous because it has no odor, a broad gill size, and a black spore print color. Be careful!" 60 | }, 61 | { 62 | "explanation": "(odor, none, -0.14), (gill-spacing, crowded, 0.09), (gill-size, broad, 0.06)", 63 | "context": "The model predicts whether a mushroom is poisonous", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 65 | "narrative": "This mushroom is less likely to be poisonous because it has no odor. However, its crowded gill spacing and broad gill size increases the likelihood of it being poisonous. Be cautious!" 66 | }, 67 | { 68 | "explanation": "(odor, foul, 0.19), (gill-color, buff, 0.07), (gill-size, narrow, 0.07)", 69 | "context": "The model predicts whether a mushroom is poisonous", 70 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 71 | }, 72 | { 73 | "explanation": "(gill-size, narrow, -0.13), (odor, fishy, -0.10), (gill-color, buff, -0.08)", 74 | "context": "The model predicts whether a mushroom is poisonous", 75 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 76 | "narrative": "This mushroom is less likely to be poisonous because it has a narrow gill size, fishy odor, and buff gill color. We still recommend confirming with external sources." 
77 | }, 78 | { 79 | "explanation": "(odor, foul, 0.19), (gill-size, narrow, 0.09), (gill-color, buff, 0.07)", 80 | "context": "The model predicts whether a mushroom is poisonous", 81 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 82 | }, 83 | { 84 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 85 | "context": "The model predicts whether a mushroom is poisonous", 86 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 87 | }, 88 | { 89 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 90 | "context": "The model predicts whether a mushroom is poisonous", 91 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 92 | }, 93 | { 94 | "explanation": "(odor, foul, 0.25), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.06)", 95 | "context": "The model predicts whether a mushroom is poisonous", 96 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 97 | }, 98 | { 99 | "explanation": "(odor, none, 0.13), (gill-size, broad, 0.08), (stalk-surface-above-ring, smooth, 0.04)", 100 | "context": "The model predicts whether a mushroom is poisonous", 101 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 102 | }, 103 | { 104 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 105 | "context": "The model predicts whether a mushroom is poisonous", 106 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 107 | }, 108 | { 109 | "explanation": "(gill-size, narrow, 0.10), (odor, spicy, 0.08), (gill-color, buff, 0.07)", 110 | "context": "The model predicts whether a mushroom is poisonous", 111 | 
"explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 112 | }, 113 | { 114 | "explanation": "(spore-print-color, green, 0.26), (ring-number, two, 0.10), (odor, none, -0.06)", 115 | "context": "The model predicts whether a mushroom is poisonous", 116 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 117 | }, 118 | { 119 | "explanation": "(odor, pungent, 0.18), (gill-size, narrow, 0.18), (stalk-shape, enlarging, 0.05)", 120 | "context": "The model predicts whether a mushroom is poisonous", 121 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 122 | }, 123 | { 124 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.08), (spore-print-color, black, 0.05)", 125 | "context": "The model predicts whether a mushroom is poisonous", 126 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 127 | }, 128 | { 129 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 130 | "context": "The model predicts whether a mushroom is poisonous", 131 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 132 | }, 133 | { 134 | "explanation": "(gill-size, narrow, 0.12), (odor, spicy, 0.10), (gill-color, buff, 0.08)", 135 | "context": "The model predicts whether a mushroom is poisonous", 136 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 137 | }, 138 | { 139 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 140 | "context": "The model predicts whether a mushroom is poisonous", 141 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 142 | }, 143 | { 144 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), 
(gill-color, buff, 0.08)", 145 | "context": "The model predicts whether a mushroom is poisonous", 146 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 147 | }, 148 | { 149 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.10), (gill-color, buff, 0.08)", 150 | "context": "The model predicts whether a mushroom is poisonous", 151 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 152 | }, 153 | { 154 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 155 | "context": "The model predicts whether a mushroom is poisonous", 156 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 157 | }, 158 | { 159 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.06)", 160 | "context": "The model predicts whether a mushroom is poisonous", 161 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 162 | } 163 | ] -------------------------------------------------------------------------------- /evaluation/eval_data/pdf_2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 74.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 4 | "context": "The model predicts whether a PDF file contains malware", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "The larger metadata size (262 KB), a larger total size (74 KB), and no Javascript keywords suggest that the PDF contains malware." 
7 | }, 8 | { 9 | "explanation": "(Number of objects, -1.0, 0.10), (Number of keywords that denote end of streams, -1.0, 0.10), (Number of streams (sequences of binary data), -1.0, 0.10)", 10 | "context": "The model predicts whether a PDF file contains malware", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The presence of fewer objects, fewer keywords that denote the end of streams, and fewer streams suggest that the PDF contains malware." 13 | }, 14 | { 15 | "explanation": "(Size of metadata in KB, 272.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 90.0, 0.08)", 16 | "context": "The model predicts whether a PDF file contains malware", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "A larger metadata size (272 KB), no Javascript keywords, and a larger total size (90 KB) suggest that the PDF contains malware." 19 | }, 20 | { 21 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Total size in KB, 7.0, 0.06), (Number of objects, -1.0, 0.04)", 22 | "context": "The model predicts whether a PDF file contains malware", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "The large metadata size (180 KB), the large total size (7 KB), and fewer objects suggest the PDF contains malware." 25 | }, 26 | { 27 | "explanation": "(Size of metadata in KB, 262.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 91.0, 0.08)", 28 | "context": "The model predicts whether a PDF file contains malware", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "The large metadata size (262 KB), no Javascript keywords, and the large total size (91 KB) suggest the PDF contains malware." 
31 | }, 32 | { 33 | "explanation": "(Size of metadata in KB, 180.0, 0.16), (Total size in KB, 3.0, 0.06), (Number of streams (sequences of binary data), 1.0, 0.04)", 34 | "context": "The model predicts whether a PDF file contains malware", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "The large metadata size (180 KB), the large total size (3 KB), and more streams suggest the PDF contains malware." 37 | }, 38 | { 39 | "explanation": "(Size of metadata in KB, 358.0, 0.10), (Number of Javascript keywords, 0.0, 0.07), (Total size in KB, 63.0, 0.06)", 40 | "context": "The model predicts whether a PDF file contains malware", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "The large metadata size (358 KB), no Javascript keywords, and the large total size (63 KB) suggest the PDF contains malware." 43 | }, 44 | { 45 | "explanation": "(Number of Javascript keywords, 3.0, 0.08), (Size of metadata in KB, 224.0, 0.06), (Number of keywords with startxref, 0.0, 0.04)", 46 | "context": "The model predicts whether a PDF file contains malware", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "The number of Javascript keywords (3), the large metadata size (224 KB), and no keywords with startxref suggest the PDF contains malware." 49 | }, 50 | { 51 | "explanation": "(Size of metadata in KB, 403.0, 0.09), (Total size in KB, 145.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 52 | "context": "The model predicts whether a PDF file contains malware", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "The large metadata size (403 KB), the large total size (145 KB), and no Javascript keywords suggest the PDF contains malware." 
55 | }, 56 | { 57 | "explanation": "(Number of Javascript keywords, 1.0, 0.10), (Number of images, -1.0, 0.06), (Number of JS keywords, 1.0, 0.05)", 58 | "context": "The model predicts whether a PDF file contains malware", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 60 | }, 61 | { 62 | "explanation": "(Size of metadata in KB, 336.0, 0.11), (Total size in KB, 58.0, 0.07), (Number of objects, 121.0, 0.07)", 63 | "context": "The model predicts whether a PDF file contains malware", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 65 | }, 66 | { 67 | "explanation": "(Number of Javascript keywords, 3.0, 0.15), (Number of JS keywords, 2.0, 0.07), (Number of keywords that denote end of streams, 2.0, 0.06)", 68 | "context": "The model predicts whether a PDF file contains malware", 69 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 70 | }, 71 | { 72 | "explanation": "(Size of metadata in KB, 289.0, 0.09), (Number of Javascript keywords, 0.0, 0.08), (Total size in KB, 27.0, 0.06)", 73 | "context": "The model predicts whether a PDF file contains malware", 74 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 75 | }, 76 | { 77 | "explanation": "(Number of Javascript keywords, 2.0, 0.10), (Total size in KB, 4.0, 0.08), (Number of entries in Xref tables, 10.0, 0.05)", 78 | "context": "The model predicts whether a PDF file contains malware", 79 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 80 | }, 81 | { 82 | "explanation": "(Size of metadata in KB, 298.0, 0.11), (Number of Javascript keywords, 0.0, 0.10), (Total size in KB, 32.0, 0.07)", 83 | "context": "The model predicts whether a PDF file contains malware", 84 | "explanation_format": "SHAP feature contribution in (feature_name, 
feature_value, contribution) format" 85 | }, 86 | { 87 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Number of Javascript keywords, 1.0, 0.08), (Total size in KB, 1.0, 0.06)", 88 | "context": "The model predicts whether a PDF file contains malware", 89 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 90 | }, 91 | { 92 | "explanation": "(Number of objects, 289.0, 0.12), (Size of metadata in KB, 388.0, 0.10), (Number of entries in Xref tables, 354.0, 0.09)", 93 | "context": "The model predicts whether a PDF file contains malware", 94 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 95 | }, 96 | { 97 | "explanation": "(Number of XFA keywords, 1.0, 0.11), (Total size in KB, 9.0, 0.08), (Size of metadata in KB, 252.0, 0.07)", 98 | "context": "The model predicts whether a PDF file contains malware", 99 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 100 | }, 101 | { 102 | "explanation": "(Number of Javascript keywords, 0.0, 0.12), (Size of metadata in KB, 299.0, 0.08), (Number of JS keywords, 0.0, 0.06)", 103 | "context": "The model predicts whether a PDF file contains malware", 104 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 105 | }, 106 | { 107 | "explanation": "(Number of Javascript keywords, 2.0, 0.12), (Total size in KB, 9.0, 0.09), (Number of JS keywords, 1.0, 0.04)", 108 | "context": "The model predicts whether a PDF file contains malware", 109 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 110 | }, 111 | { 112 | "explanation": "(Size of metadata in KB, 288.0, 0.09), (Total size in KB, 32.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 113 | "context": "The model predicts whether a PDF file contains malware", 114 | "explanation_format": "SHAP feature 
contribution in (feature_name, feature_value, contribution) format" 115 | }, 116 | { 117 | "explanation": "(Total size in KB, 7.0, 0.09), (Size of metadata in KB, 239.0, 0.07), (Number of entries in Xref tables, 10.0, 0.05)", 118 | "context": "The model predicts whether a PDF file contains malware", 119 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 120 | }, 121 | { 122 | "explanation": "(Number of Javascript keywords, 0.0, 0.09), (Size of metadata in KB, 278.0, 0.08), (Number of keywords with startxref, 2.0, 0.08)", 123 | "context": "The model predicts whether a PDF file contains malware", 124 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 125 | }, 126 | { 127 | "explanation": "(Number of Javascript keywords, 1.0, 0.11), (Number of images, -1.0, 0.07), (Number of keywords with startxref, 1.0, 0.05)", 128 | "context": "The model predicts whether a PDF file contains malware", 129 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 130 | }, 131 | { 132 | "explanation": "(Size of metadata in KB, 327.0, 0.10), (Total size in KB, 75.0, 0.06), (Number of entries in Xref tables, 368.0, 0.05)", 133 | "context": "The model predicts whether a PDF file contains malware", 134 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 135 | }, 136 | { 137 | "explanation": "(Size of metadata in KB, -1.0, 0.09), (Number of Javascript keywords, 1.0, 0.06), (Total size in KB, -1.0, 0.04)", 138 | "context": "The model predicts whether a PDF file contains malware", 139 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 140 | }, 141 | { 142 | "explanation": "(Number of Javascript keywords, 0.0, 0.10), (Size of metadata in KB, 283.0, 0.09), (Total size in KB, 78.0, 0.07)", 143 | "context": "The model predicts whether 
a PDF file contains malware", 144 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 145 | }, 146 | { 147 | "explanation": "(Size of metadata in KB, 180.0, 0.21), (Total size in KB, 9.0, 0.13), (Contains text, 0.0, 0.04)", 148 | "context": "The model predicts whether a PDF file contains malware", 149 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 150 | }, 151 | { 152 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 80.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 153 | "context": "The model predicts whether a PDF file contains malware", 154 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 155 | }, 156 | { 157 | "explanation": "(Number of Javascript keywords, 4.0, 0.12), (Total size in KB, 3.0, 0.08), (Number of JS keywords, 3.0, 0.06)", 158 | "context": "The model predicts whether a PDF file contains malware", 159 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 160 | } 161 | ] -------------------------------------------------------------------------------- /explingo/grader.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | import pandas as pd 3 | 4 | MAX_SCORE = 4 5 | 6 | 7 | class RubricAssess(dspy.Signature): 8 | """Assess a narrative based on a rubric.""" 9 | 10 | question = dspy.InputField(format=str) 11 | narrative = dspy.InputField() 12 | rubric = dspy.InputField() 13 | 14 | assessment = dspy.OutputField( 15 | desc="A single number from the options in the rubric. " 16 | "Provide only a single number with no other text." 
17 | ) 18 | 19 | 20 | class BooleanAssess(dspy.Signature): 21 | """Assess a narrative with a yes/no question.""" 22 | 23 | question = dspy.InputField(format=str) 24 | narrative = dspy.InputField() 25 | 26 | assessment = dspy.OutputField(desc="yes or no. Include only the word yes or no.") 27 | 28 | 29 | class Grader: 30 | def __init__( 31 | self, 32 | llm=None, 33 | openai_api_key=None, 34 | metrics="all", 35 | sample_narratives=None, 36 | max_optimal_length=None, 37 | ): 38 | """ 39 | Grades narratives 40 | 41 | Args: 42 | llm (LLM): LLM to use to grade accuracy, completeness, and fluency. 43 | One of llm or openai_api_key must be provided 44 | openai_api_key (string): OpenAI API key to use to grade accuracy, completeness, 45 | and fluency 46 | metrics (list of strings or "all"): One or more of 47 | "accuracy", "completeness", "fluency", "conciseness" 48 | sample_narratives (list of strings, or (string, string) tuples): 49 | Sample narratives to use to grade fluency. Can pass in either just the narratives 50 | or (explanation, narrative) tuples 51 | max_optimal_length (int): Hyperparameter for conciseness metric, defaults to number of 52 | words in longest sample narrative or 100 if not given 53 | """ 54 | self.metrics = metrics 55 | 56 | if metrics == "all": 57 | self.metrics = ["accuracy", "completeness", "fluency", "conciseness"] 58 | 59 | self.metric_funcs = [] 60 | # TODO: CLEAN THIS UP TO DIRECTLY TAKE FUNCTION FROM NAME 61 | if "accuracy" in metrics: 62 | self.metric_funcs.append(accuracy) 63 | if "completeness" in metrics: 64 | self.metric_funcs.append(completeness) 65 | if "fluency" in metrics: 66 | self.metric_funcs.append("fluency") 67 | if "conciseness" in metrics: 68 | self.metric_funcs.append("conciseness") 69 | 70 | self.sample_narratives = sample_narratives 71 | 72 | if sample_narratives is not None and ( 73 | isinstance(self.sample_narratives[0], list) 74 | or isinstance(self.sample_narratives[0], tuple) 75 | ): 76 | self.sample_narratives = 
[narrative[1] for narrative in self.sample_narratives] 77 | 78 | self.max_optimal_length = max_optimal_length 79 | if max_optimal_length is None and self.sample_narratives is not None: 80 | self.max_optimal_length = max( 81 | [len(narrative.split()) for narrative in self.sample_narratives] 82 | ) 83 | if self.max_optimal_length is None: 84 | self.max_optimal_length = 100 85 | 86 | self.grader_llm = llm 87 | self.openai_api_key = openai_api_key 88 | if self.grader_llm is None and self.openai_api_key is not None: 89 | self.grader_llm = dspy.OpenAI( 90 | model="gpt-4o", 91 | api_key=self.openai_api_key, 92 | max_tokens=1000, 93 | temperature=0.0, 94 | ) 95 | 96 | def run_metrics(self, input_, output_, trace): 97 | results = {} 98 | if "accuracy" in self.metrics: 99 | results["accuracy"] = accuracy(input_, output_, grader=self.grader_llm, trace=trace) 100 | if "completeness" in self.metrics: 101 | results["completeness"] = completeness( 102 | input_, output_, grader=self.grader_llm, trace=trace 103 | ) 104 | if "fluency" in self.metrics: 105 | results["fluency"] = fluency( 106 | input_, 107 | output_, 108 | grader=self.grader_llm, 109 | trace=trace, 110 | good_narratives=self.sample_narratives, 111 | ) 112 | if "conciseness" in self.metrics: 113 | results["conciseness"] = conciseness( 114 | input_, output_, max_optimal_length_per_feature=self.max_optimal_length 115 | ) 116 | 117 | if trace is None: 118 | return pd.Series(results) 119 | else: 120 | return ( 121 | (results.get("accuracy", MAX_SCORE) == MAX_SCORE) 122 | and (results.get("fluency", MAX_SCORE) == MAX_SCORE) 123 | and (results.get("completeness", MAX_SCORE) == MAX_SCORE) 124 | and (results.get("conciseness", MAX_SCORE) >= 3.5) 125 | ) 126 | 127 | def __call__(self, explanation, explanation_format, narrative, trace=None): 128 | input_ = dspy.Example(explanation=explanation, explanation_format=explanation_format) 129 | output_ = dspy.Prediction(narrative=narrative) 130 | return self.run_metrics(input_, 
output_, trace) 131 | 132 | 133 | def compute_score_from_boolean(metric, question, narrative, grader, iters=3): 134 | total_score = 0.0 135 | 136 | with dspy.context(lm=grader): 137 | for i in range(iters): 138 | score = dspy.Predict(BooleanAssess)( 139 | question=question, narrative=narrative 140 | ).assessment.lower() 141 | if score == "yes": 142 | total_score += 1 143 | elif score == "no": 144 | pass 145 | else: 146 | print("Invalid score for metric %s: %s" % (metric, score)) 147 | score = total_score / iters 148 | 149 | if 0.3 < score < 0.7: 150 | print("Inconsistent score for metric %s: %s" % (metric, score)) 151 | 152 | return score * MAX_SCORE 153 | 154 | 155 | def compute_score_from_rubric( 156 | metric, question, rubric, narrative, grader, iters=3, rational_type=None 157 | ): 158 | scores = [] 159 | with dspy.context(lm=grader): 160 | for i in range(iters): 161 | if rational_type is None: 162 | score = dspy.Predict(RubricAssess)( 163 | question=question, rubric=rubric, narrative=narrative 164 | ).assessment 165 | else: 166 | score = dspy.ChainOfThought(RubricAssess, rationale_type=rational_type)( 167 | question=question, 168 | rubric=rubric, 169 | narrative=narrative, 170 | ).assessment 171 | try: 172 | scores.append(int(score)) 173 | except ValueError: 174 | print("Invalid score for metric %s: %s" % (metric, score)) 175 | 176 | if 0 in scores and MAX_SCORE in scores: 177 | print("Inconsistent score for metric %s: %s" % (metric, scores)) 178 | 179 | return sum(scores) / iters 180 | 181 | 182 | def accuracy(input_, output_, grader, trace=None): 183 | question = ( 184 | f"How accurate is the information in the narrative, based on the explanation given? " 185 | f"A narrative can score 4 even if it is missing information as long as everything " 186 | f"in the narrative is correct. Make sure the contribution direction is correct - " 187 | f"positive contributions increase the output, negative contributions decrease the output." 
188 | f"\n\nExplanation format: {input_.explanation_format}.\nExplanation: {input_.explanation}" 189 | ) 190 | rubric = ( 191 | "0 - Contains one or more errors in value or contribution direction. " 192 | "4 - Contains no errors, but may be missing information." 193 | ) 194 | 195 | rational_type = dspy.OutputField( 196 | prefix="Start by listing out all the features in the narrative, and then for each one " 197 | "compare it to the explanation to ensure its value and contribution " 198 | "are approximately correct.", 199 | ) 200 | 201 | return compute_score_from_rubric( 202 | "accuracy", 203 | question, 204 | rubric=rubric, 205 | narrative=output_.narrative, 206 | grader=grader, 207 | rational_type=rational_type, 208 | ) 209 | 210 | 211 | def fluency(input_, output_, grader, trace=None, good_narratives=None): 212 | if good_narratives is None: 213 | question = "How natural and human is the narrative?" 214 | else: 215 | question = ( 216 | "How well does the style of the narrative match the style of the example " 217 | "narratives? Consider only the linguistic style, not the topic. " 218 | "Example narratives:" 219 | ) 220 | for narrative in good_narratives: 221 | question += f"\n{narrative}" 222 | if good_narratives is not None: 223 | rubric = "0: Very dissimilar. 1: Dissimilar. 2: Neutral. 3: Similar. 4: Very similar" 224 | else: 225 | rubric = "0: Very unnatural. 1: Unnatural. 2: Neutral. 3: Natural. 4: Very natural" 226 | return compute_score_from_rubric("fluency", question, rubric, output_.narrative, grader) 227 | 228 | 229 | def completeness(input_, output_, grader, trace=None): 230 | question = ( 231 | f"How completely does the narrative below describe the explanation given?" 232 | f"\nExplanation format: {input_.explanation_format}." 233 | f"\nExplanation: {input_.explanation}" 234 | ) 235 | rubric = ( 236 | "0 - One or more feature names from the explanation are not mentioned at all in the " 237 | "narrative. 
2 - All features are mentioned, but not all feature values and/or " 238 | "contribution directions. 4 - All features are mentioned, and for each feature, " 239 | "includes at least an approximation of the feature's value and contribution " 240 | "direction." 241 | ) 242 | rational_type = dspy.OutputField( 243 | prefix="Start by listing out all the features in the explanations, and then determine " 244 | "every feature is present in the narrative, along with its value and " 245 | "contribution direction.", 246 | ) 247 | 248 | return compute_score_from_rubric( 249 | "completeness", 250 | question, 251 | rubric, 252 | output_.narrative, 253 | grader, 254 | rational_type=rational_type, 255 | ) 256 | 257 | 258 | def conciseness(input_, output_, grader=None, trace=None, max_optimal_length_per_feature=20): 259 | num_features = input_.explanation.count("(") 260 | if num_features == 0: 261 | num_features = 1 262 | length = len(output_.narrative.split()) 263 | max_optimal_length = max_optimal_length_per_feature * num_features 264 | # scale length between 0 and 2 265 | return max( 266 | 0.0, 267 | min( 268 | MAX_SCORE, 269 | MAX_SCORE * (2 - length / max_optimal_length), 270 | ), 271 | ) 272 | 273 | 274 | def context_awareness(input_, output_, grader, trace=None): 275 | question = "How well does the rationalization help explain the logic in the narrative?" 276 | rubric = "0: Not at all. 2: Somewhat. 4: Very well." 277 | narrative_input = f"Narrative: {output_.narrative}. 
Rationalization: {output_.rationalization}" 278 | return compute_score_from_rubric( 279 | "context_awareness", question, rubric, narrative_input, grader 280 | ) 281 | -------------------------------------------------------------------------------- /evaluation/eval_data/pdf_1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 74.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 4 | "context": "The model predicts whether a PDF file contains malware", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (262 KB), a larger total size (74 KB), and no Javascript keywords." 7 | }, 8 | { 9 | "explanation": "(Number of objects, -1.0, 0.10), (Number of keywords that denote end of streams, -1.0, 0.10), (Number of streams (sequences of binary data), -1.0, 0.10)", 10 | "context": "The model predicts whether a PDF file contains malware", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The PDF file is more likely to contain malware because it has fewer objects, fewer keywords that denote the end of streams, and fewer streams." 13 | }, 14 | { 15 | "explanation": "(Size of metadata in KB, 272.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 90.0, 0.08)", 16 | "context": "The model predicts whether a PDF file contains malware", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (272 KB), no Javascript keywords, and a larger total size (90 KB)." 
19 | }, 20 | { 21 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Total size in KB, 7.0, 0.06), (Number of objects, -1.0, 0.04)", 22 | "context": "The model predicts whether a PDF file contains malware", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (180 KB), a larger total size (7 KB), and fewer objects." 25 | }, 26 | { 27 | "explanation": "(Size of metadata in KB, 262.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 91.0, 0.08)", 28 | "context": "The model predicts whether a PDF file contains malware", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (262 KB), no Javascript keywords, and a larger total size (91 KB)." 31 | }, 32 | { 33 | "explanation": "(Size of metadata in KB, 180.0, 0.16), (Total size in KB, 3.0, 0.06), (Number of streams (sequences of binary data), 1.0, 0.04)", 34 | "context": "The model predicts whether a PDF file contains malware", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (180 KB), a larger total size (3 KB), and more streams." 
37 | }, 38 | { 39 | "explanation": "(Size of metadata in KB, 358.0, 0.10), (Number of Javascript keywords, 0.0, 0.07), (Total size in KB, 63.0, 0.06)", 40 | "context": "The model predicts whether a PDF file contains malware", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (358 KB), no Javascript keywords, and a larger total size (63 KB)." 43 | }, 44 | { 45 | "explanation": "(Number of Javascript keywords, 3.0, 0.08), (Size of metadata in KB, 224.0, 0.06), (Number of keywords with startxref, 0.0, 0.04)", 46 | "context": "The model predicts whether a PDF file contains malware", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "The PDF file is more likely to contain malware because it has more Javascript keywords (3), a larger metadata size (224 KB), and no keywords with startxref." 49 | }, 50 | { 51 | "explanation": "(Size of metadata in KB, 403.0, 0.09), (Total size in KB, 145.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 52 | "context": "The model predicts whether a PDF file contains malware", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (403 KB), a larger total size (145 KB), and no Javascript keywords." 
55 | }, 56 | { 57 | "explanation": "(Number of Javascript keywords, 1.0, 0.10), (Number of images, -1.0, 0.06), (Number of JS keywords, 1.0, 0.05)", 58 | "context": "The model predicts whether a PDF file contains malware", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 60 | }, 61 | { 62 | "explanation": "(Size of metadata in KB, 336.0, 0.11), (Total size in KB, 58.0, 0.07), (Number of objects, 121.0, 0.07)", 63 | "context": "The model predicts whether a PDF file contains malware", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 65 | }, 66 | { 67 | "explanation": "(Number of Javascript keywords, 3.0, 0.15), (Number of JS keywords, 2.0, 0.07), (Number of keywords that denote end of streams, 2.0, 0.06)", 68 | "context": "The model predicts whether a PDF file contains malware", 69 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 70 | }, 71 | { 72 | "explanation": "(Size of metadata in KB, 289.0, 0.09), (Number of Javascript keywords, 0.0, 0.08), (Total size in KB, 27.0, 0.06)", 73 | "context": "The model predicts whether a PDF file contains malware", 74 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 75 | }, 76 | { 77 | "explanation": "(Number of Javascript keywords, 2.0, 0.10), (Total size in KB, 4.0, 0.08), (Number of entries in Xref tables, 10.0, 0.05)", 78 | "context": "The model predicts whether a PDF file contains malware", 79 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 80 | }, 81 | { 82 | "explanation": "(Size of metadata in KB, 298.0, 0.11), (Number of Javascript keywords, 0.0, 0.10), (Total size in KB, 32.0, 0.07)", 83 | "context": "The model predicts whether a PDF file contains malware", 84 | "explanation_format": "SHAP feature contribution in (feature_name, 
feature_value, contribution) format" 85 | }, 86 | { 87 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Number of Javascript keywords, 1.0, 0.08), (Total size in KB, 1.0, 0.06)", 88 | "context": "The model predicts whether a PDF file contains malware", 89 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 90 | }, 91 | { 92 | "explanation": "(Number of objects, 289.0, 0.12), (Size of metadata in KB, 388.0, 0.10), (Number of entries in Xref tables, 354.0, 0.09)", 93 | "context": "The model predicts whether a PDF file contains malware", 94 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 95 | }, 96 | { 97 | "explanation": "(Number of XFA keywords, 1.0, 0.11), (Total size in KB, 9.0, 0.08), (Size of metadata in KB, 252.0, 0.07)", 98 | "context": "The model predicts whether a PDF file contains malware", 99 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 100 | }, 101 | { 102 | "explanation": "(Number of Javascript keywords, 0.0, 0.12), (Size of metadata in KB, 299.0, 0.08), (Number of JS keywords, 0.0, 0.06)", 103 | "context": "The model predicts whether a PDF file contains malware", 104 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 105 | }, 106 | { 107 | "explanation": "(Number of Javascript keywords, 2.0, 0.12), (Total size in KB, 9.0, 0.09), (Number of JS keywords, 1.0, 0.04)", 108 | "context": "The model predicts whether a PDF file contains malware", 109 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 110 | }, 111 | { 112 | "explanation": "(Size of metadata in KB, 288.0, 0.09), (Total size in KB, 32.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 113 | "context": "The model predicts whether a PDF file contains malware", 114 | "explanation_format": "SHAP feature 
contribution in (feature_name, feature_value, contribution) format" 115 | }, 116 | { 117 | "explanation": "(Total size in KB, 7.0, 0.09), (Size of metadata in KB, 239.0, 0.07), (Number of entries in Xref tables, 10.0, 0.05)", 118 | "context": "The model predicts whether a PDF file contains malware", 119 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 120 | }, 121 | { 122 | "explanation": "(Number of Javascript keywords, 0.0, 0.09), (Size of metadata in KB, 278.0, 0.08), (Number of keywords with startxref, 2.0, 0.08)", 123 | "context": "The model predicts whether a PDF file contains malware", 124 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 125 | }, 126 | { 127 | "explanation": "(Number of Javascript keywords, 1.0, 0.11), (Number of images, -1.0, 0.07), (Number of keywords with startxref, 1.0, 0.05)", 128 | "context": "The model predicts whether a PDF file contains malware", 129 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 130 | }, 131 | { 132 | "explanation": "(Size of metadata in KB, 327.0, 0.10), (Total size in KB, 75.0, 0.06), (Number of entries in Xref tables, 368.0, 0.05)", 133 | "context": "The model predicts whether a PDF file contains malware", 134 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 135 | }, 136 | { 137 | "explanation": "(Size of metadata in KB, -1.0, 0.09), (Number of Javascript keywords, 1.0, 0.06), (Total size in KB, -1.0, 0.04)", 138 | "context": "The model predicts whether a PDF file contains malware", 139 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 140 | }, 141 | { 142 | "explanation": "(Number of Javascript keywords, 0.0, 0.10), (Size of metadata in KB, 283.0, 0.09), (Total size in KB, 78.0, 0.07)", 143 | "context": "The model predicts whether 
a PDF file contains malware", 144 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 145 | }, 146 | { 147 | "explanation": "(Size of metadata in KB, 180.0, 0.21), (Total size in KB, 9.0, 0.13), (Contains text, 0.0, 0.04)", 148 | "context": "The model predicts whether a PDF file contains malware", 149 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 150 | }, 151 | { 152 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 80.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 153 | "context": "The model predicts whether a PDF file contains malware", 154 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 155 | }, 156 | { 157 | "explanation": "(Number of Javascript keywords, 4.0, 0.12), (Total size in KB, 3.0, 0.08), (Number of JS keywords, 3.0, 0.06)", 158 | "context": "The model predicts whether a PDF file contains malware", 159 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 160 | } 161 | ] -------------------------------------------------------------------------------- /evaluation/eval_data/student_2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Family eductional support, no, -2.26), (In a romantic relationship, no, 1.11), (Sex, M, -0.60)", 4 | "context": "The model predicts whether a student will pass their class", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "The lack of family support, and the sex (male) suggest the student is less likely to pass the class. But, the lack of a romantic relationship indicates a higher probability of passing." 
7 | }, 8 | { 9 | "explanation": "(Family eductional support, yes, 1.21), (In a romantic relationship, no, 1.12), (Age, 17, -0.45)", 10 | "context": "The model predicts whether a student will pass their class", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The lack of a romantic relationship and having family support suggest the student is more likely to pass. However, being 17 indicates a lower probability of passing." 13 | }, 14 | { 15 | "explanation": "(In a romantic relationship, yes, -2.00), (Family eductional support, no, -1.99), (Sex, M, -0.49)", 16 | "context": "The model predicts whether a student will pass their class", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "The student's involvement in a romantic relationship, lack of family support, and being male suggest they are less likely to pass the class." 19 | }, 20 | { 21 | "explanation": "(Family eductional support, no, -1.37), (School, MS, -0.59)", 22 | "context": "The model predicts whether a student will pass their class", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "The lack of family support and attending the MS school indicate a lower probability of passing." 25 | }, 26 | { 27 | "explanation": "(Student's guardian, father, 1.74), (In a romantic relationship, no, 1.53), (Family eductional support, yes, 1.09)", 28 | "context": "The model predicts whether a student will pass their class", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "The presence of a father as a guardian, the lack of a romantic relationship, and family support suggest the student is more likely to pass." 
31 | }, 32 | { 33 | "explanation": "(In a romantic relationship, yes, -1.74), (Family eductional support, no, -1.51), (Student's guardian, mother, -0.31)", 34 | "context": "The model predicts whether a student will pass their class", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "Being in a romantic relationship, lacking family support, and having their mother as a guardian suggest the student may face challenges in passing." 37 | }, 38 | { 39 | "explanation": "(In a romantic relationship, yes, -1.86), (Student's guardian, father, 1.12), (Family eductional support, no, -1.02)", 40 | "context": "The model predicts whether a student will pass their class", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "While having a father as a guardian is a positive factor, the romantic relationship and lack of family support might hinder the student's chances of passing." 43 | }, 44 | { 45 | "explanation": "(Family eductional support, no, -2.01), (Student's guardian, father, 1.49), (In a romantic relationship, no, 1.07)", 46 | "context": "The model predicts whether a student will pass their class", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "The lack of family support is concerning, but having a father as a guardian and not being in a romantic relationship might increase the student's chances of passing." 
49 | }, 50 | { 51 | "explanation": "(Family eductional support, no, -2.36), (In a romantic relationship, no, 1.05), (Sex, M, -0.44)", 52 | "context": "The model predicts whether a student will pass their class", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "The student's sex (male) and lack of family support are concerning, but not being in a romantic relationship might slightly improve their chances of passing" 55 | }, 56 | { 57 | "explanation": "(Family eductional support, no, -1.60), (In a romantic relationship, no, 0.81), (Attended nursery school, no, -0.61)", 58 | "context": "The model predicts whether a student will pass their class", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 60 | "narrative": "The lack of family support and not attending nursery school decrease the student's likelihood of passing. However, not being in a romantic relationship could be an advantage" 61 | }, 62 | { 63 | "explanation": "(Family eductional support, no, -2.01), (In a romantic relationship, no, 1.02), (Student's guardian, mother, -0.40)", 64 | "context": "The model predicts whether a student will pass their class", 65 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 66 | "narrative": "The lack of family support and having the mother as a guardian might negatively impact the student's chances of passing. However, not being in a romantic relationship may offer some improvement." 
67 | }, 68 | { 69 | "explanation": "(Family eductional support, yes, 1.25), (In a romantic relationship, no, 1.06), (Frequency of going out with friends (1-5), 5, -0.60)", 70 | "context": "The model predicts whether a student will pass their class", 71 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 72 | "narrative": "Strong family support and not being in a romantic relationship suggest the student is more likely to pass. However, frequently going out with friends could negatively impact their chances." 73 | }, 74 | { 75 | "explanation": "(In a romantic relationship, no, 0.92), (Family eductional support, yes, 0.84), (Quality of family relationships (1-5), 2, -0.82)", 76 | "context": "The model predicts whether a student will pass their class", 77 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 78 | }, 79 | { 80 | "explanation": "(In a romantic relationship, no, 1.24), (Family eductional support, yes, 1.12), (Quality of family relationships (1-5), 4, 0.44)", 81 | "context": "The model predicts whether a student will pass their class", 82 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 83 | }, 84 | { 85 | "explanation": "(In a romantic relationship, yes, -2.23), (Family eductional support, yes, 0.78), (School, MS, -0.77)", 86 | "context": "The model predicts whether a student will pass their class", 87 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 88 | "narrative": "The student's involvement in a romantic relationship and attendance at the MS school present challenges to passing, though the presence of family support offers some positive influence." 
89 | }, 90 | { 91 | "explanation": "(Family eductional support, no, -1.70), (Student's guardian, father, 1.00), (In a romantic relationship, no, 0.95)", 92 | "context": "The model predicts whether a student will pass their class", 93 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 94 | }, 95 | { 96 | "explanation": "(In a romantic relationship, yes, -1.71), (Family eductional support, yes, 0.78), (School, MS, -0.69)", 97 | "context": "The model predicts whether a student will pass their class", 98 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 99 | }, 100 | { 101 | "explanation": "(In a romantic relationship, no, 1.13), (Family eductional support, yes, 1.10), (Reason for choosing this school, home, 0.62)", 102 | "context": "The model predicts whether a student will pass their class", 103 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 104 | "narrative": "The lack of a romantic relationship, presence of family support, and choosing the school based on home environment suggest the student is more likely to pass" 105 | }, 106 | { 107 | "explanation": "(Family eductional support, no, -1.42), (Student's guardian, father, 1.27), (In a romantic relationship, no, 1.00)", 108 | "context": "The model predicts whether a student will pass their class", 109 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 110 | }, 111 | { 112 | "explanation": "(In a romantic relationship, no, 1.25), (Family eductional support, yes, 1.16), (Sex, M, -0.54)", 113 | "context": "The model predicts whether a student will pass their class", 114 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 115 | }, 116 | { 117 | "explanation": "(In a romantic relationship, yes, -1.84), (Family eductional support, yes, 0.82), (Home 
to school travel time, 2, 0.39)", 118 | "context": "The model predicts whether a student will pass their class", 119 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 120 | }, 121 | { 122 | "explanation": "(Family eductional support, no, -2.14), (In a romantic relationship, no, 0.96), (Student's guardian, mother, -0.37)", 123 | "context": "The model predicts whether a student will pass their class", 124 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 125 | }, 126 | { 127 | "explanation": "(Family eductional support, yes, 1.27), (In a romantic relationship, no, 1.05), (Age, 18, -0.40)", 128 | "context": "The model predicts whether a student will pass their class", 129 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 130 | }, 131 | { 132 | "explanation": "(In a romantic relationship, yes, -2.67), (Family eductional support, yes, 0.83), (Sex, F, 0.32)", 133 | "context": "The model predicts whether a student will pass their class", 134 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 135 | }, 136 | { 137 | "explanation": "(In a romantic relationship, yes, -2.41), (Family eductional support, yes, 0.63), (Amount of free time after school (1-5), 4, -0.47)", 138 | "context": "The model predicts whether a student will pass their class", 139 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 140 | }, 141 | { 142 | "explanation": "(Family eductional support, no, -2.22), (In a romantic relationship, no, 0.99), (Age, 16, 0.39)", 143 | "context": "The model predicts whether a student will pass their class", 144 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 145 | }, 146 | { 147 | "explanation": "(In a romantic relationship, no, 1.13), (Family 
eductional support, yes, 1.06), (Attended nursery school, no, -0.81)", 148 | "context": "The model predicts whether a student will pass their class", 149 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 150 | }, 151 | { 152 | "explanation": "(In a romantic relationship, yes, -2.31), (Student's guardian, father, 0.79), (Family eductional support, yes, 0.61)", 153 | "context": "The model predicts whether a student will pass their class", 154 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 155 | }, 156 | { 157 | "explanation": "(In a romantic relationship, yes, -1.91), (Family eductional support, no, -1.67), (Reason for choosing this school, home, 0.62)", 158 | "context": "The model predicts whether a student will pass their class", 159 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 160 | }, 161 | { 162 | "explanation": "(In a romantic relationship, yes, -1.77), (Student's guardian, father, 1.36), (Family eductional support, no, -0.99)", 163 | "context": "The model predicts whether a student will pass their class", 164 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 165 | } 166 | ] 167 | -------------------------------------------------------------------------------- /evaluation/eval_data/housing_3.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Above ground living area square feet, 1256.00, -12527.46), (Rates the overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29), (Physical locations within Ames city limits, Edwards, -9913.81), (Wood deck area in square feet, 736.00, 9846.38)", 4 | "context": "The ML model predicts house prices", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, 
contribution) format", 6 | "narrative": "This house is cheaper because it has less above ground living space (size=1256), lower material quality (rating=5), no second floor (size=0), and it's in Edwards. However, the deck is larger (size=736), which makes it a bit more expensive." 7 | }, 8 | { 9 | "explanation": "(Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72), (Total square feet of basement area, 856.00, -6157.86), (First Floor square feet, 856.00, -5466.64), (Physical locations within Ames city limits, CollgCr, -4761.42)", 10 | "context": "The ML model predicts house prices", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "This house costs more because of its larger second floor (size=854) and newer construction year (2003). It’s cheaper due to a smaller basement (size=856) and first floor (size=856), and its location in CollgCr." 13 | }, 14 | { 15 | "explanation": "(Refers to walkout or garden level walls, Gd, 17607.43), (Rates the overall condition of the house, 8.00, 13038.14), (Above ground living area square feet, 1262.00, -12319.48), (Second floor square feet, 0.00, -10142.29), (Proximity to various conditions, Feedr, -8251.83)", 16 | "context": "The ML model predicts house prices", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "This house is more expensive because of its good garden level walls and overall condition (rating=8). It’s cheaper due to less above ground space (size=1262), no second floor (size=0), and being on a feeder street." 
19 | }, 20 | { 21 | "explanation": "(Second floor square feet, 866.00, 13079.62), (Original construction date, 2001.00, 8500.21), (Above ground living area square feet, 1786.00, 5844.30), (Physical locations within Ames city limits, CollgCr, -4761.42), (Total square feet of basement area, 920.00, -4747.08)", 22 | "context": "The ML model predicts house prices", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "This house costs more because of its larger second floor (size=866), newer construction year (2001), and larger above ground space (size=1786). It’s cheaper due to its location in CollgCr and smaller basement (size=920)." 25 | }, 26 | { 27 | "explanation": "(Original construction date, 1915.00, -17966.77), (Physical locations within Ames city limits, Crawfor, 17703.26), (Second floor square feet, 756.00, 10129.96), (Total square feet of basement area, 756.00, -8362.22), (Condition of sale, Abnorml, -6786.66)", 28 | "context": "The ML model predicts house prices", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "This house is cheaper because it's older (year=1915), has a smaller basement (size=756), and the sale condition is abnormal. It’s more expensive due to its location in Crawfor and a larger second floor (size=756)." 
31 | }, 32 | { 33 | "explanation": "(Physical locations within Ames city limits, NoRidge, 23069.89), (Above ground living area square feet, 2198.00, 20125.75), (Second floor square feet, 1053.00, 18094.05), (Rates the overall material and finish of the house, 8.00, 9655.79), (Original construction date, 2000.00, 8192.46)", 34 | "context": "The ML model predicts house prices", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "This house is more expensive because it’s in No Ridge, has more above ground space (size=2198), a larger second floor (size=1053), better material quality (rating=8), and a newer construction year (2000)." 37 | }, 38 | { 39 | "explanation": "(Type of foundation, Wood, -18650.67), (Physical locations within Ames city limits, Mitchel, -13510.92), (Rates the overall material and finish of the house, 5.00, -10743.76), (Three season porch area in square feet, 320.00, 9959.33), (Bedrooms above ground, 1.00, 8905.73)", 40 | "context": "The ML model predicts house prices", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "This house is cheaper because it has a wood foundation, is located in Mitchel, and has lower material quality (rating=5). It’s more expensive due to a larger porch (size=320) and fewer bedrooms (count=1)." 
43 | }, 44 | { 45 | "explanation": "(Type 1 finished square feet, 1369.00, 14641.53), (Evaluates the height of the basement, Ex, 13233.24), (Total square feet of basement area, 1686.00, 12138.28), (Second floor square feet, 0.00, -10142.29), (Rates the overall material and finish of the house, 8.00, 9655.79)", 46 | "context": "The ML model predicts house prices", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "This house is more expensive because it has more finished space (size=1369), a taller basement (height=Ex), and better material quality (rating=8). It’s cheaper because it lacks a second floor (size=0)." 49 | }, 50 | { 51 | "explanation": "(Above ground living area square feet, 2090.00, 16382.07), (Second floor square feet, 983.00, 16216.99), (Physical locations within Ames city limits, NWAmes, -9769.73), (Type 1 finished square feet, 859.00, 6193.63), (Masonry veneer type, Stone, 5446.26)", 52 | "context": "The ML model predicts house prices", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "This house is more expensive due to its larger above ground space (size=2090), a bigger second floor (size=983), and a stone veneer. It’s cheaper because of its location in NWAmes." 55 | }, 56 | { 57 | "explanation": "(Lot area in square feet, 10000.00, 14876.45), (Garage size in square feet, 600.00, 12445.76), (Above ground living area square feet, 1500.00, 11122.58), (Physical locations within Ames city limits, SawyerW, -8734.25), (Year of remodel, 1995.00, -5231.12)", 58 | "context": "The ML model predicts house prices", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 60 | "narrative": "This house is more expensive due to its larger lot area (size=10000), a larger garage (size=600), and more above ground living space (size=1500). 
It’s cheaper because of its location in SawyerW and an older remodel year (1995)." 61 | }, 62 | { 63 | "explanation": "(Central air conditioning, Yes, 13876.23), (Lot area in square feet, 9500.00, 12975.67), (Rates the overall condition of the house, 7.00, 10123.44), (Physical locations within Ames city limits, Blmngtn, -9243.76), (Total square feet of basement area, 850.00, -5123.89)", 64 | "context": "The ML model predicts house prices", 65 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 66 | "narrative": "This house is more expensive because it has central air conditioning, a large lot (size=9500), and a better overall condition (rating=7). It’s cheaper due to a smaller basement (size=850) and its location in Blmngtn." 67 | }, 68 | { 69 | "explanation": "(Garage size in square feet, 450.00, 12674.34), (Above ground living area square feet, 1800.00, 11754.29), (Rates the overall material and finish of the house, 7.00, 10342.76), (Physical locations within Ames city limits, OldTown, -8329.12), (Condition of sale, Partial, -5343.82)", 70 | "context": "The ML model predicts house prices", 71 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 72 | "narrative": "This house is more expensive due to a larger garage (size=450), more above ground living space (size=1800), and better material quality (rating=7). It’s cheaper because it’s located in OldTown and has a partial sale condition." 
73 | }, 74 | { 75 | "explanation": "(Lot area in square feet, 8500.00, 12346.22), (Three season porch area in square feet, 250.00, 9762.53), (Above ground living area square feet, 1400.00, 8793.24), (Physical locations within Ames city limits, ClearCr, -7865.43), (Year of remodel, 1985.00, -6531.44)", 76 | "context": "The ML model predicts house prices", 77 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 78 | "narrative": "This house is more expensive because it has a larger lot (size=8500), a three-season porch (size=250), and more above ground living space (size=1400). It’s cheaper due to its location in ClearCr and an older remodel year (1985)." 79 | }, 80 | { 81 | "explanation": "(Second floor square feet, 900.00, 12500.45), (Original construction date, 2002.00, 8600.78), (Above ground living area square feet, 1800.00, 5900.00), (Physical locations within Ames city limits, CollgCr, -4700.00), (Total square feet of basement area, 950.00, -4800.00)", 82 | "context": "The ML model predicts house prices", 83 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 84 | }, 85 | { 86 | "explanation": "(Original construction date, 1918.00, -17500.50), (Physical locations within Ames city limits, Crawfor, 17500.00), (Second floor square feet, 800.00, 10500.25), (Total square feet of basement area, 770.00, -8200.00), (Condition of sale, Abnorml, -6700.00)", 87 | "context": "The ML model predicts house prices", 88 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 89 | }, 90 | { 91 | "explanation": "(Physical locations within Ames city limits, NoRidge, 22000.45), (Above ground living area square feet, 2200.00, 20000.00), (Second floor square feet, 1100.00, 18500.00), (Rates the overall material and finish of the house, 8.00, 9700.00), (Original construction date, 1999.00, 8200.00)", 92 | "context": "The ML 
model predicts house prices", 93 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 94 | }, 95 | { 96 | "explanation": "(Type of foundation, Concrete, -18000.00), (Physical locations within Ames city limits, Mitchel, -13000.00), (Rates the overall material and finish of the house, 6.00, -10500.00), (Three season porch area in square feet, 350.00, 10000.00), (Bedrooms above ground, 2.00, 9000.00)", 97 | "context": "The ML model predicts house prices", 98 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 99 | }, 100 | { 101 | "explanation": "(Type 1 finished square feet, 1400.00, 15000.00), (Evaluates the height of the basement, Good, 13000.00), (Total square feet of basement area, 1700.00, 12000.00), (Second floor square feet, 0.00, -10000.00), (Rates the overall material and finish of the house, 7.00, 9500.00)", 102 | "context": "The ML model predicts house prices", 103 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 104 | }, 105 | { 106 | "explanation": "(Garage size in square feet, 600.00, 12000.00), (Central air conditioning, Yes, 9800.00), (First Floor square feet, 1100.00, -8500.00), (Physical locations within Ames city limits, Sawyer, -6400.00), (Type of foundation, Slab, -5300.00)", 107 | "context": "The ML model predicts house prices", 108 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 109 | }, 110 | { 111 | "explanation": "(Lot area in square feet, 9500.00, 15000.00), (Garage size in square feet, 720.00, 13000.00), (Rates the overall condition of the house, 7.00, 8600.00), (Physical locations within Ames city limits, NWAmes, -7500.00), (Second floor square feet, 0.00, -6900.00)", 112 | "context": "The ML model predicts house prices", 113 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, 
contribution) format" 114 | }, 115 | { 116 | "explanation": "(Rates the overall condition of the house, 8.00, 11500.00), (Fireplace quality, Excellent, 11000.00), (Garage size in square feet, 450.00, 8600.00), (Physical locations within Ames city limits, OldTown, -7300.00), (Second floor square feet, 0.00, -6400.00)", 117 | "context": "The ML model predicts house prices", 118 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 119 | }, 120 | { 121 | "explanation": "(Rates the overall material and finish of the house, 8.50, 18000.00), (Lot area in square feet, 10500.00, 14000.00), (Garage size in square feet, 750.00, 13000.00), (First Floor square feet, 2100.00, -10500.00), (Physical locations within Ames city limits, Meadow, -8400.00)", 122 | "context": "The ML model predicts house prices", 123 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 124 | } 125 | ] 126 | -------------------------------------------------------------------------------- /explingo/tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "id": "initial_id", 6 | "metadata": { 7 | "is_executing": true, 8 | "ExecuteTime": { 9 | "end_time": "2024-10-16T19:13:13.237509Z", 10 | "start_time": "2024-10-16T19:13:13.186974Z" 11 | } 12 | }, 13 | "source": [ 14 | "import yaml \n", 15 | "import os\n", 16 | "\n", 17 | "with open(os.path.join(\"..\", \"keys.yaml\"), \"r\") as file:\n", 18 | " config = yaml.safe_load(file)\n", 19 | " openai_api_key = config[\"openai_api_key\"]" 20 | ], 21 | "outputs": [], 22 | "execution_count": 1 23 | }, 24 | { 25 | "cell_type": "code", 26 | "id": "94b1d6514180c940", 27 | "metadata": { 28 | "is_executing": true, 29 | "ExecuteTime": { 30 | "end_time": "2024-10-16T19:13:15.418493Z", 31 | "start_time": "2024-10-16T19:13:13.240494Z" 32 | } 33 | }, 34 | "source": [ 35 | "from 
explingo import Narrator, Grader \n", 36 | "\n", 37 | "example_narratives = [\n", 38 | " (\"(Above ground living area square feet, 1256.00, -12527.46), (Overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29)\", \n", 39 | " \"The house's living area size of around 1,200 sq. ft., lower quality materials (5/10), and lack of a second floor all reduce the house's value.\"),\n", 40 | " (\"(Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72)\",\n", 41 | " \"The house's large second floor of around 850 sq. ft and recent construction date of 2003 increase its value.\"),\n", 42 | " (\"(Overall material and finish of the house, 8.00, 10743.76), (Above ground living area square feet, 2000.00, 12527.46), (Second floor square feet, 1000.00, 10142.29)\",\n", 43 | " \"The house's high quality materials (8/10), large living area size of around 2,000 sq. ft., and a second floor of around 1,000 sq. ft. all increase the house's value.\"),\n", 44 | "]\n", 45 | "\n", 46 | "explanation_format = \"(feature name, feature value, SHAP feature contribution)\"\n", 47 | "context = \"The model predicts house prices\"\n", 48 | "\n", 49 | "narrator = Narrator(openai_api_key=openai_api_key,\n", 50 | " explanation_format=explanation_format,\n", 51 | " context=context,\n", 52 | " sample_narratives=example_narratives)\n" 53 | ], 54 | "outputs": [], 55 | "execution_count": 2 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "225625fd117fec33", 60 | "metadata": {}, 61 | "source": [] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "id": "435b1b4daa990205", 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2024-10-16T19:13:15.496257Z", 69 | "start_time": "2024-10-16T19:13:15.420480Z" 70 | } 71 | }, 72 | "source": [ 73 | "explanation = \"(number of rooms, 11, 7020), (fireplace, yes, 12903)\"\n", 74 | "\n", 75 | "narrative = narrator.narrate(explanation)\n", 76 | "narrative" 77 | ], 78 | "outputs": [ 79 | { 
80 | "data": { 81 | "text/plain": [ 82 | "\"The house's large number of rooms (11) and the presence of a fireplace both increase its value.\"" 83 | ] 84 | }, 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "execution_count": 3 91 | }, 92 | { 93 | "cell_type": "code", 94 | "id": "1b3c383915180de", 95 | "metadata": { 96 | "ExecuteTime": { 97 | "end_time": "2024-10-16T19:13:15.622039Z", 98 | "start_time": "2024-10-16T19:13:15.504007Z" 99 | } 100 | }, 101 | "source": [ 102 | "grader = Grader(openai_api_key=openai_api_key, \n", 103 | " metrics=\"all\", \n", 104 | " sample_narratives=example_narratives)\n", 105 | "\n", 106 | "grader(explanation=explanation, explanation_format=explanation_format, narrative=narrative)" 107 | ], 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "accuracy 4.0\n", 113 | "completeness 4.0\n", 114 | "fluency 4.0\n", 115 | "conciseness 4.0\n", 116 | "dtype: float64" 117 | ] 118 | }, 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "execution_count": 4 125 | }, 126 | { 127 | "cell_type": "code", 128 | "id": "bbba0e6516ec56b2", 129 | "metadata": { 130 | "ExecuteTime": { 131 | "end_time": "2024-10-16T19:13:27.155656Z", 132 | "start_time": "2024-10-16T19:13:15.625039Z" 133 | } 134 | }, 135 | "source": [ 136 | "narrative_with_bootstrap = narrator.narrate(explanation, n_bootstrapped=1, grader=grader)\n", 137 | "narrative_with_bootstrap" 138 | ], 139 | "outputs": [ 140 | { 141 | "name": "stderr", 142 | "output_type": "stream", 143 | "text": [ 144 | " 0%| | 0/3 [00:00 narrative\n", 152 | " instructions=\"You are helping users understand an ML model's prediction. 
Given an explanation and information about the model,\\nconvert the explanation into a human-readable narrative.\"\n", 153 | " context = Field(annotation=str required=True json_schema_extra={'desc': 'what the ML model predicts', '__dspy_field_type': 'input', 'prefix': 'Context:'})\n", 154 | " explanation = Field(annotation=str required=True json_schema_extra={'desc': \"explanation of an ML model's prediction\", '__dspy_field_type': 'input', 'prefix': 'Explanation:'})\n", 155 | " explanation_format = Field(annotation=str required=True json_schema_extra={'desc': 'format the explanation is given in', '__dspy_field_type': 'input', 'prefix': 'Explanation Format:'})\n", 156 | " narrative = Field(annotation=str required=True json_schema_extra={'desc': 'human-readable narrative version of the explanation', '__dspy_field_type': 'output', 'prefix': 'Narrative:'})\n", 157 | ")), {'explanation': '(Above ground living area square feet, 1256.00, -12527.46), (Overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29)', 'context': 'The model predicts house prices', 'explanation_format': '(feature name, feature value, SHAP feature contribution)'}, Prediction(\n", 158 | " narrative=\"Narrative: The house's relatively small above ground living area of around 1,256 sq. ft., average quality materials (5/10), and lack of a second floor all decrease the house's value.\"\n", 159 | "))]\n" 160 | ] 161 | }, 162 | { 163 | "name": "stderr", 164 | "output_type": "stream", 165 | "text": [ 166 | " 33%|███▎ | 1/3 [00:10<00:20, 10.27s/it]\n", 167 | " 0%| | 0/3 [00:00