├── tests ├── __init__.py ├── test_grader.py └── test_narrator.py ├── poetry.toml ├── evaluation ├── local_cache │ └── compiler │ │ ├── all.d943856c9b1e8f80.jsonl │ │ └── all.e415303eb7359b9a.jsonl ├── results │ ├── fluency_results.png │ ├── heatmap_metrics.png │ ├── heatmap_metrics_tight.png │ ├── metrics_over_narratives.png │ ├── results_by_dataset.csv │ ├── results_by_dataset.tex │ ├── results_local2.csv │ ├── results_by_technique.csv │ ├── results_by_technique.tex │ ├── cleaned_results.csv │ └── results_old.csv ├── examples.py ├── experiment_runner.py ├── explingo.py ├── metrics.py └── eval_data │ ├── mushroom_2.json │ ├── mushroom_1.json │ ├── pdf_2.json │ ├── pdf_1.json │ ├── student_2.json │ ├── housing_3.json │ ├── student_1.json │ └── housing_2.json ├── parrot.jpg ├── .flake8 ├── explingo ├── __init__.py ├── testing.py ├── narrator.py ├── grader.py └── tutorial.ipynb ├── .idea └── .gitignore ├── .github └── workflows │ └── python-publish.yml ├── pyproject.toml ├── tasks.py ├── README.md └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /poetry.toml: -------------------------------------------------------------------------------- 1 | [virtualenvs] 2 | in-project = true 3 | -------------------------------------------------------------------------------- /evaluation/local_cache/compiler/all.d943856c9b1e8f80.jsonl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /parrot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/parrot.jpg -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | 
[flake8] 2 | max-line-length = 99 3 | exclude = docs, .git, __pycache__, .ipynb_checkpoints 4 | extend-ignore = E203 -------------------------------------------------------------------------------- /evaluation/results/fluency_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/fluency_results.png -------------------------------------------------------------------------------- /evaluation/results/heatmap_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/heatmap_metrics.png -------------------------------------------------------------------------------- /evaluation/results/heatmap_metrics_tight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/heatmap_metrics_tight.png -------------------------------------------------------------------------------- /evaluation/results/metrics_over_narratives.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sibyl-dev/Explingo/HEAD/evaluation/results/metrics_over_narratives.png -------------------------------------------------------------------------------- /explingo/__init__.py: -------------------------------------------------------------------------------- 1 | from explingo.grader import Grader 2 | from explingo.narrator import Narrator 3 | 4 | __author__ = "MIT Data To AI Lab" 5 | __email__ = "dailabmit@gmail.com" 6 | __version__ = "0.1.1" 7 | 8 | __all__ = ["Narrator", "Grader"] 9 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | 
/workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | # GitHub Copilot persisted chat sessions 10 | /copilot/chatSessions 11 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | 7 | jobs: 8 | deploy: 9 | uses: sibyl-dev/.github/.github/workflows/python-publish.yml@main 10 | with: 11 | repository_url: https://upload.pypi.org/legacy/ 12 | secrets: 13 | PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /tests/test_grader.py: -------------------------------------------------------------------------------- 1 | import explingo 2 | 3 | 4 | def test_grader_run_metrics(): 5 | response = 4 6 | mock_grader_llm = explingo.testing.MockGraderLLM(response) 7 | grader = explingo.Grader(llm=mock_grader_llm, metrics="all") 8 | result = grader("explanation", "explanation_format", "narrative") 9 | for metric in ["accuracy", "fluency", "conciseness", "completeness"]: 10 | assert result[metric] == response 11 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "explingo" 3 | version = "0.1.1" 4 | description = "" 5 | authors = ["Ola Zytek "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | dspy-ai = "2.4.13" 11 | pytest = "^8.3.3" 12 | notebook = "^7.2.2" 13 | jupyter = "^1.1.1" 14 | invoke = "^2.2.0" 15 | isort = "^5.13.2" 16 | flake8 = "^7.1.1" 17 | 18 | [tool.poetry.group.dev.dependencies] 19 | jupyter = "^1.1.1" 20 | 21 | 22 | [build-system] 23 | requires = ["poetry-core"] 24 | 
build-backend = "poetry.core.masonry.api" 25 | 26 | [tool.black] 27 | line-length = 99 28 | preview = true 29 | 30 | [tool.isort] 31 | profile = "black" 32 | line_length = 99 33 | skip = ["__init__.py"] 34 | -------------------------------------------------------------------------------- /tests/test_narrator.py: -------------------------------------------------------------------------------- 1 | import explingo 2 | 3 | 4 | def test_narrate_basic_prompt(): 5 | response = "narrative" 6 | mock_llm = explingo.testing.MockNarratorLLM(response) 7 | narrator = explingo.Narrator(llm=mock_llm, explanation_format="test", context="test") 8 | explanation = "explanation" 9 | assert narrator.narrate(explanation) == response 10 | 11 | 12 | def test_narrative_few_shot(): 13 | response = "narrative" 14 | mock_llm = explingo.testing.MockNarratorLLM(response) 15 | narrator = explingo.Narrator( 16 | llm=mock_llm, 17 | explanation_format="test", 18 | context="test", 19 | sample_narratives=["sample 1", "sample 2"], 20 | ) 21 | explanation = "explanation" 22 | assert narrator.narrate(explanation, n_examples=2) == response 23 | 24 | 25 | def test_narrative_bootstrapped_few_shot(): 26 | response = "narrative" 27 | mock_llm = explingo.testing.MockNarratorLLM(response, include_tags=False) 28 | mock_grader = explingo.Grader( 29 | llm=explingo.testing.MockGraderLLM(4), 30 | metrics=["fluency, conciseness"], 31 | sample_narratives=["sample 1", "sample 2"], 32 | ) 33 | narrator = explingo.Narrator( 34 | llm=mock_llm, 35 | explanation_format="test", 36 | context="test", 37 | sample_narratives=["sample 1", "sample 2"], 38 | ) 39 | explanation = "explanation" 40 | assert ( 41 | narrator.narrate(explanation, n_examples=2, n_bootstrapped=2, grader=mock_grader) 42 | == response 43 | ) 44 | -------------------------------------------------------------------------------- /evaluation/results/results_by_dataset.csv: -------------------------------------------------------------------------------- 1 | 
Dataset,Accuracy,Completeness,Fluency,Conciseness,Total score 2 | House 1,\textcolor{blue}{3.733 $\pm$ 0.40},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.800 $\pm$ 0.30},\textcolor{blue}{3.719 $\pm$ 0.24},\textcolor{blue}{15.252 $\pm$ 0.58} 3 | House 2,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.911 $\pm$ 0.11},\textcolor{blue}{3.836 $\pm$ 0.16},\textcolor{blue}{15.748 $\pm$ 0.24} 4 | House 3,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.689 $\pm$ 0.27},\textcolor{blue}{3.933 $\pm$ 0.10},\textcolor{blue}{3.869 $\pm$ 0.14},\textcolor{blue}{15.491 $\pm$ 0.35} 5 | Mush 1,\textcolor{blue}{3.556 $\pm$ 0.42},\textcolor{blue}{3.600 $\pm$ 0.00},\textcolor{blue}{3.511 $\pm$ 0.15},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{14.667 $\pm$ 0.33} 6 | Mush 2,\textcolor{red}{1.760 $\pm$ 0.88},\textcolor{red}{2.640 $\pm$ 0.36},\textcolor{blue}{3.920 $\pm$ 0.18},\textcolor{blue}{3.989 $\pm$ 0.03},\textcolor{blue}{12.309 $\pm$ 0.67} 7 | PDF 1,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{red}{2.400 $\pm$ 0.00},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.977 $\pm$ 0.03},\textcolor{blue}{14.377 $\pm$ 0.03} 8 | PDF 2,\textcolor{red}{0.000 $\pm$ 0.00},3.040 $\pm$ 0.22,\textcolor{blue}{3.840 $\pm$ 0.22},\textcolor{blue}{3.949 $\pm$ 0.02},\textcolor{blue}{10.829 $\pm$ 0.46} 9 | Student 1,\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.600 $\pm$ 0.28},\textcolor{blue}{3.960 $\pm$ 0.09},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{15.560 $\pm$ 0.36} 10 | Student 2,\textcolor{blue}{3.840 $\pm$ 0.36},\textcolor{blue}{3.920 $\pm$ 0.18},\textcolor{blue}{4.000 $\pm$ 0.00},\textcolor{blue}{3.880 $\pm$ 0.15},\textcolor{blue}{15.640 $\pm$ 0.47} 11 | -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import webbrowser 3 | import shutil 4 | 5 | from pathlib import 
Path 6 | from invoke import task 7 | from sys import executable 8 | import os 9 | 10 | 11 | def print_red(s): 12 | print("\033[91m {}\033[00m".format(s), end="") 13 | 14 | 15 | def print_green(s): 16 | print("\033[92m {}\033[00m".format(s), end="") 17 | 18 | 19 | @task 20 | def clean_test(context): 21 | """ 22 | Cleans the test store 23 | """ 24 | 25 | shutil.rmtree(Path(".pytest_cache"), ignore_errors=True) 26 | 27 | 28 | @task 29 | def fix_lint(context): 30 | """ 31 | Fixes all linting and import sort errors. Skips init.py files for import sorts 32 | """ 33 | 34 | subprocess.run(["black", "explingo"]) 35 | subprocess.run(["black", "tests"]) 36 | subprocess.run(["isort", "--atomic", "explingo", "tests"]) 37 | 38 | 39 | @task 40 | def lint(context): 41 | """ 42 | Runs the linting and import sort process on all library files and tests and prints errors. 43 | Skips init.py files for import sorts 44 | """ 45 | subprocess.run(["flake8", "explingo", "tests"], check=True) 46 | subprocess.run(["isort", "explingo", "tests"], check=True) 47 | 48 | 49 | @task 50 | def test(context): 51 | """ 52 | Runs all test commands. 
53 | """ 54 | 55 | failures_in = [] 56 | 57 | try: 58 | test_unit(context) 59 | except subprocess.CalledProcessError: 60 | failures_in.append("Unit tests") 61 | 62 | if len(failures_in) == 0: 63 | print_green("\nAll tests successful :)") 64 | else: 65 | print_red("\n:( Failures in: ") 66 | for i in failures_in: 67 | print_red(i + ", ") 68 | 69 | 70 | @task 71 | def test_unit(context): 72 | """ 73 | Runs all unit tests and outputs results and coverage 74 | """ 75 | subprocess.run(["pytest"], check=True) 76 | -------------------------------------------------------------------------------- /evaluation/examples.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | import json 3 | import random 4 | 5 | 6 | def create_example(entry): 7 | example = dspy.Example( 8 | explanation=entry["explanation"], 9 | context=entry["context"], 10 | explanation_format=entry["explanation_format"], 11 | ) 12 | if "narrative" in entry: 13 | example.narrative = entry["narrative"] 14 | if "bad_narrative" in entry: 15 | example.bad_narrative = entry["bad_narrative"] 16 | return example.with_inputs("explanation", "context", "explanation_format") 17 | 18 | 19 | def load_examples(json_file): 20 | training_data = json.load(open(json_file, "r")) 21 | examples = [] 22 | for entry in training_data: 23 | examples.append(create_example(entry)) 24 | return examples 25 | 26 | 27 | def get_data(json_file, split=None): 28 | all_data = load_examples(json_file) 29 | labeled_data = [example for example in all_data if hasattr(example, "narrative")] 30 | unlabeled_data = [ 31 | example for example in all_data if not hasattr(example, "narrative") 32 | ] 33 | if split is not None: 34 | labeled_train = labeled_data[: int(split * len(labeled_data))] 35 | labeled_eval = labeled_data[int(split * len(labeled_data)) :] 36 | unlabeled_train = unlabeled_data[: int(split * len(unlabeled_data))] 37 | unlabeled_eval = unlabeled_data[int(split * len(unlabeled_data)) :] 38 | 
else: 39 | labeled_train = labeled_data[:5] 40 | labeled_eval = labeled_data[5:] 41 | unlabeled_train = unlabeled_data[:5] 42 | unlabeled_eval = unlabeled_data[5:] 43 | if len(unlabeled_train) < 5: 44 | additional_count = 5 - len(unlabeled_train) 45 | labeled_train += labeled_eval[:additional_count] 46 | labeled_eval = labeled_eval[additional_count:] 47 | 48 | return labeled_train, labeled_eval, unlabeled_train, unlabeled_eval 49 | -------------------------------------------------------------------------------- /evaluation/results/results_by_dataset.tex: -------------------------------------------------------------------------------- 1 | \begin{table} 2 | \caption{Overall results for each prompt and few-shot setting.} 3 | \begin{tabular}{llllll} 4 | \toprule 5 | Dataset & Accuracy & Completeness & Fluency & Conciseness & Total score \\ 6 | \midrule 7 | House 1 & \textcolor{blue}{3.733 $\pm$ 0.40} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.800 $\pm$ 0.30} & \textcolor{blue}{3.719 $\pm$ 0.24} & \textcolor{blue}{15.252 $\pm$ 0.58} \\ 8 | House 2 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.911 $\pm$ 0.11} & \textcolor{blue}{3.836 $\pm$ 0.16} & \textcolor{blue}{15.748 $\pm$ 0.24} \\ 9 | House 3 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.689 $\pm$ 0.27} & \textcolor{blue}{3.933 $\pm$ 0.10} & \textcolor{blue}{3.869 $\pm$ 0.14} & \textcolor{blue}{15.491 $\pm$ 0.35} \\ 10 | Mush 1 & \textcolor{blue}{3.556 $\pm$ 0.42} & \textcolor{blue}{3.600 $\pm$ 0.00} & \textcolor{blue}{3.511 $\pm$ 0.15} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{14.667 $\pm$ 0.33} \\ 11 | Mush 2 & \textcolor{red}{1.760 $\pm$ 0.88} & \textcolor{red}{2.640 $\pm$ 0.36} & \textcolor{blue}{3.920 $\pm$ 0.18} & \textcolor{blue}{3.989 $\pm$ 0.03} & \textcolor{blue}{12.309 $\pm$ 0.67} \\ 12 | PDF 1 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{red}{2.400 $\pm$ 0.00} & \textcolor{blue}{4.000 $\pm$ 0.00} & 
\textcolor{blue}{3.977 $\pm$ 0.03} & \textcolor{blue}{14.377 $\pm$ 0.03} \\ 13 | PDF 2 & \textcolor{red}{0.000 $\pm$ 0.00} & 3.040 $\pm$ 0.22 & \textcolor{blue}{3.840 $\pm$ 0.22} & \textcolor{blue}{3.949 $\pm$ 0.02} & \textcolor{blue}{10.829 $\pm$ 0.46} \\ 14 | Student 1 & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.600 $\pm$ 0.28} & \textcolor{blue}{3.960 $\pm$ 0.09} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{15.560 $\pm$ 0.36} \\ 15 | Student 2 & \textcolor{blue}{3.840 $\pm$ 0.36} & \textcolor{blue}{3.920 $\pm$ 0.18} & \textcolor{blue}{4.000 $\pm$ 0.00} & \textcolor{blue}{3.880 $\pm$ 0.15} & \textcolor{blue}{15.640 $\pm$ 0.47} \\ 16 | \bottomrule 17 | \end{tabular} 18 | \end{table} 19 | -------------------------------------------------------------------------------- /evaluation/results/results_local2.csv: -------------------------------------------------------------------------------- 1 | dataset,total score,accuracy,completeness,fluency,conciseness,n_few_shot,prompt 2 | housing_3.json,8.208187134502925,0.0,0.4,4.0,3.808187134502924,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 3 | pdf_1.json,11.907936507936508,2.4,2.0,3.8,3.707936507936508,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 4 | pdf_2.json,9.684848484848484,0.0,2.0,4.0,3.6848484848484846,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 5 | mushroom_2.json,9.711111111111112,0.0,2.0,3.8,3.9111111111111114,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 
6 | student_2.json,11.11111111111111,0.8,2.4,4.0,3.9111111111111114,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 7 | mushroom_1.json,14.022222222222222,3.2,3.2,3.8,3.822222222222223,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 8 | student_1.json,13.0,2.4,2.8,3.8,4.0,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 9 | housing_2.json,7.626666666666667,0.0,0.0,4.0,3.6266666666666665,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 10 | housing_1.json,7.850980392156863,0.0,0.0,4.0,3.8509803921568633,3,"You are helping users understand an ML model's prediction. Given an explanation and information about the model, convert the explanation into a human-readable narrative." 11 | -------------------------------------------------------------------------------- /explingo/testing.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | 3 | 4 | class MockNarratorLLM(dspy.LM): 5 | def __init__(self, response, include_tags=True, **kwargs): 6 | """ 7 | Create a mock LLM for testing purposes 8 | 9 | Args: 10 | response (String): Narrative response expected from the LLM 11 | include_tags (bool): Include tags (ie. "Narrative") in the response. Should be set to 12 | False to test functionality that directly uses DSPy (ie. 
bootstrapped few-shot), 13 | True otherwise 14 | """ 15 | self.response = response 16 | self.kwargs = kwargs 17 | self.history = [] 18 | self.include_tags = include_tags 19 | super().__init__(model=None) 20 | 21 | def basic_request(self, prompt, **kwargs): 22 | return self(prompt, **kwargs) 23 | 24 | def __call__(self, prompt=None, **kwargs): 25 | if self.include_tags: 26 | completions = "Narrative: " + self.response 27 | else: 28 | completions = self.response 29 | self.history.append({"prompt": prompt, "completions": completions}) 30 | return [completions] 31 | 32 | def copy(self, **kwargs): 33 | return self.__class__(self.response, **kwargs) 34 | 35 | def inspect_history(self, n=1, skip=0): 36 | print(self.history) 37 | 38 | 39 | class MockGraderLLM(dspy.LM): 40 | def __init__(self, response, **kwargs): 41 | """ 42 | Create a mock Grader for testing purposes 43 | 44 | Args: 45 | response (int): Grader response expected from the Grader 46 | """ 47 | self.response = response 48 | self.kwargs = kwargs 49 | self.history = [] 50 | super().__init__(model=None) 51 | 52 | def __call__(self, prompt=None, *args, **kwargs): 53 | completions = str(self.response) 54 | self.history.append({"prompt": prompt, "completions": completions}) 55 | return [completions] 56 | 57 | def basic_request(self, prompt, **kwargs): 58 | return self(prompt, **kwargs) 59 | 60 | def copy(self, **kwargs): 61 | return self.__class__(self.response, **kwargs) 62 | 63 | def inspect_history(self, n=1, skip=0): 64 | print(self.history) 65 | -------------------------------------------------------------------------------- /evaluation/results/results_by_technique.csv: -------------------------------------------------------------------------------- 1 | Prompt,$L$,$B$,Accuracy,Completeness,Fluency,Conciseness,Total score 2 | Prompt 1,0,0,\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textcolor{red}{2.467 $\pm$ 0.76},\textcolor{red}{0.850 $\pm$ 
0.94},\textcolor{blue}{11.317 $\pm$ 1.23} 3 | Prompt 1,1,0,3.289 $\pm$ 1.35,\textcolor{blue}{3.511 $\pm$ 0.56},\textcolor{blue}{3.622 $\pm$ 0.47},\textcolor{blue}{3.749 $\pm$ 0.14},\textcolor{blue}{14.171 $\pm$ 1.67} 4 | Prompt 1,1,1,\textcolor{blue}{3.800 $\pm$ 0.40},\textcolor{blue}{3.800 $\pm$ 0.23},\textcolor{blue}{3.650 $\pm$ 0.34},\textcolor{blue}{3.702 $\pm$ 0.32},\textcolor{blue}{14.952 $\pm$ 0.58} 5 | Prompt 1,1,3,\textcolor{blue}{3.800 $\pm$ 0.40},\textcolor{blue}{3.800 $\pm$ 0.23},\textcolor{blue}{3.700 $\pm$ 0.26},\textcolor{blue}{3.734 $\pm$ 0.28},\textbf{\textcolor{blue}{15.034 $\pm$ 0.51}} 6 | Prompt 1,3,0,3.111 $\pm$ 1.57,3.422 $\pm$ 0.53,\textcolor{blue}{3.689 $\pm$ 0.41},\textcolor{blue}{3.708 $\pm$ 0.19},\textcolor{blue}{13.930 $\pm$ 1.82} 7 | Prompt 1,3,1,3.323 $\pm$ 1.34,\textcolor{blue}{3.538 $\pm$ 0.49},\textcolor{blue}{3.846 $\pm$ 0.19},\textcolor{blue}{3.843 $\pm$ 0.16},\textcolor{blue}{14.551 $\pm$ 1.72} 8 | Prompt 1,3,3,3.262 $\pm$ 1.33,\textcolor{blue}{3.631 $\pm$ 0.58},\textcolor{blue}{3.846 $\pm$ 0.23},\textcolor{blue}{3.890 $\pm$ 0.11},\textcolor{blue}{14.629 $\pm$ 1.73} 9 | Prompt 1,5,0,3.111 $\pm$ 1.57,3.289 $\pm$ 0.56,\textcolor{blue}{3.644 $\pm$ 0.49},\textcolor{blue}{3.669 $\pm$ 0.35},\textcolor{blue}{13.713 $\pm$ 2.08} 10 | Prompt 1,5,1,3.289 $\pm$ 1.35,3.333 $\pm$ 0.63,\textbf{\textcolor{blue}{3.933 $\pm$ 0.14}},\textcolor{blue}{3.983 $\pm$ 0.03},\textcolor{blue}{14.538 $\pm$ 1.63} 11 | Prompt 1,5,3,3.289 $\pm$ 1.35,3.467 $\pm$ 0.66,\textcolor{blue}{3.889 $\pm$ 0.27},\textbf{\textcolor{blue}{3.988 $\pm$ 0.02}},\textcolor{blue}{14.632 $\pm$ 1.65} 12 | Prompt 1,5,5,3.378 $\pm$ 1.37,3.422 $\pm$ 0.64,\textcolor{blue}{3.911 $\pm$ 0.15},\textcolor{blue}{3.977 $\pm$ 0.03},\textcolor{blue}{14.688 $\pm$ 1.66} 13 | Prompt 2,0,0,\textcolor{blue}{3.911 $\pm$ 0.27},\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textcolor{red}{2.467 $\pm$ 0.66},\textcolor{red}{0.872 $\pm$ 0.89},\textcolor{blue}{11.250 $\pm$ 0.97} 14 | Prompt 
3,0,0,\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textbf{\textcolor{blue}{4.000 $\pm$ 0.00}},\textcolor{red}{2.222 $\pm$ 0.82},\textcolor{red}{1.056 $\pm$ 1.07},\textcolor{blue}{11.278 $\pm$ 1.44} 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Explingo 2 | 3 | # Explingo 4 | Transform your ML explanations into human-friendly natural-language narratives. 5 | 6 | NOTE: Explingo is still under active development and currently only supports a few basic explanation types 7 | and GPT-API models. 8 | 9 | ## Installation 10 | Explingo can be installed through PIP 11 | ```bash 12 | pip install explingo 13 | ``` 14 | 15 | ## Usage 16 | To transform explanations into narratives, you can use the Narrator class. 17 | ```python 18 | from explingo import Narrator, Grader 19 | 20 | example_narratives = [ 21 | ("(Above ground living area square feet, 1256.00, -12527.46), (Overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29)", 22 | "The house's living area size of around 1,200 sq. ft., lower quality materials (5/10), and lack of a second floor are the main reasons for the low price."), 23 | ("(Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72)", 24 | "The house's large second floor of around 850 sq. ft and recent construction date of 2003 increases its value."), 25 | ("(Overall material and finish of the house, 8.00, 10743.76), (Above ground living area square feet, 2000.00, 12527.46), (Second floor square feet, 1000.00, 10142.29)", 26 | "The house's high quality materials (8/10), large living area size of around 2,000 sq. ft., and a second floor of around 1,000 sq. ft. 
are the main reasons for the high price."), 27 | ] 28 | 29 | explanation_format = "(feature name, feature value, SHAP feature contribution)" 30 | context = "The model predicts house prices" 31 | 32 | narrator = Narrator(openai_api_key=[OPENAI_API_KEY], 33 | explanation_format=explanation_format, 34 | context=context, 35 | labeled_train_data=example_narratives) 36 | 37 | explanation = "(number of bathrooms, 3, 7020), (number of bedrooms, 4, 12903)" 38 | 39 | narrative = narrator.narrate(explanation) 40 | ``` 41 | 42 | To evaluate the quality of the generated narratives, you can use the Grader class. 43 | ```python 44 | grader = Grader(openai_api_key=[OPENAI_API_KEY], 45 | metrics="all", 46 | sample_narratives=[narrative[1] for narrative in example_narratives]) 47 | 48 | metrics = grader(explanation=explanation, 49 | explanation_format=explanation_format, 50 | narrative=narrative) 51 | ``` 52 | -------------------------------------------------------------------------------- /evaluation/results/results_by_technique.tex: -------------------------------------------------------------------------------- 1 | \begin{table} 2 | \caption{Overall results for each prompt and few-shot setting.} 3 | \begin{tabular}{lrrlllll} 4 | \toprule 5 | Prompt & $L$ & $B$ & Accuracy & Completeness & Fluency & Conciseness & Total score \\ 6 | \midrule 7 | Prompt 1 & 0 & 0 & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textcolor{red}{2.467 $\pm$ 0.76} & \textcolor{red}{0.850 $\pm$ 0.94} & \textcolor{blue}{11.317 $\pm$ 1.23} \\ 8 | Prompt 1 & 1 & 0 & 3.289 $\pm$ 1.35 & \textcolor{blue}{3.511 $\pm$ 0.56} & \textcolor{blue}{3.622 $\pm$ 0.47} & \textcolor{blue}{3.749 $\pm$ 0.14} & \textcolor{blue}{14.171 $\pm$ 1.67} \\ 9 | Prompt 1 & 1 & 1 & \textcolor{blue}{3.800 $\pm$ 0.40} & \textcolor{blue}{3.800 $\pm$ 0.23} & \textcolor{blue}{3.650 $\pm$ 0.34} & \textcolor{blue}{3.702 $\pm$ 0.32} & \textcolor{blue}{14.952 $\pm$ 0.58} \\ 10 | Prompt 1 & 1 & 3 & 
\textcolor{blue}{3.800 $\pm$ 0.40} & \textcolor{blue}{3.800 $\pm$ 0.23} & \textcolor{blue}{3.700 $\pm$ 0.26} & \textcolor{blue}{3.734 $\pm$ 0.28} & \textbf{\textcolor{blue}{15.034 $\pm$ 0.51}} \\ 11 | Prompt 1 & 3 & 0 & 3.111 $\pm$ 1.57 & 3.422 $\pm$ 0.53 & \textcolor{blue}{3.689 $\pm$ 0.41} & \textcolor{blue}{3.708 $\pm$ 0.19} & \textcolor{blue}{13.930 $\pm$ 1.82} \\ 12 | Prompt 1 & 3 & 1 & 3.323 $\pm$ 1.34 & \textcolor{blue}{3.538 $\pm$ 0.49} & \textcolor{blue}{3.846 $\pm$ 0.19} & \textcolor{blue}{3.843 $\pm$ 0.16} & \textcolor{blue}{14.551 $\pm$ 1.72} \\ 13 | Prompt 1 & 3 & 3 & 3.262 $\pm$ 1.33 & \textcolor{blue}{3.631 $\pm$ 0.58} & \textcolor{blue}{3.846 $\pm$ 0.23} & \textcolor{blue}{3.890 $\pm$ 0.11} & \textcolor{blue}{14.629 $\pm$ 1.73} \\ 14 | Prompt 1 & 5 & 0 & 3.111 $\pm$ 1.57 & 3.289 $\pm$ 0.56 & \textcolor{blue}{3.644 $\pm$ 0.49} & \textcolor{blue}{3.669 $\pm$ 0.35} & \textcolor{blue}{13.713 $\pm$ 2.08} \\ 15 | Prompt 1 & 5 & 1 & 3.289 $\pm$ 1.35 & 3.333 $\pm$ 0.63 & \textbf{\textcolor{blue}{3.933 $\pm$ 0.14}} & \textcolor{blue}{3.983 $\pm$ 0.03} & \textcolor{blue}{14.538 $\pm$ 1.63} \\ 16 | Prompt 1 & 5 & 3 & 3.289 $\pm$ 1.35 & 3.467 $\pm$ 0.66 & \textcolor{blue}{3.889 $\pm$ 0.27} & \textbf{\textcolor{blue}{3.988 $\pm$ 0.02}} & \textcolor{blue}{14.632 $\pm$ 1.65} \\ 17 | Prompt 1 & 5 & 5 & 3.378 $\pm$ 1.37 & 3.422 $\pm$ 0.64 & \textcolor{blue}{3.911 $\pm$ 0.15} & \textcolor{blue}{3.977 $\pm$ 0.03} & \textcolor{blue}{14.688 $\pm$ 1.66} \\ 18 | Prompt 2 & 0 & 0 & \textcolor{blue}{3.911 $\pm$ 0.27} & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textcolor{red}{2.467 $\pm$ 0.66} & \textcolor{red}{0.872 $\pm$ 0.89} & \textcolor{blue}{11.250 $\pm$ 0.97} \\ 19 | Prompt 3 & 0 & 0 & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textbf{\textcolor{blue}{4.000 $\pm$ 0.00}} & \textcolor{red}{2.222 $\pm$ 0.82} & \textcolor{red}{1.056 $\pm$ 1.07} & \textcolor{blue}{11.278 $\pm$ 1.44} \\ 20 | \bottomrule 21 | \end{tabular} 22 | \end{table} 23 | 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into 
this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | key.yaml 163 | keys.yaml 164 | 165 | data/real-estate-info/* 166 | -------------------------------------------------------------------------------- /evaluation/results/cleaned_results.csv: -------------------------------------------------------------------------------- 1 | ,Dataset,Prompt,$L$,$B$,Accuracy,Completeness,Fluency,Conciseness,Total score 2 | 0,House 1,Prompt 1,0,0,4.0,4.0,3.2,2.196,13.396 3 | 1,House 1,Prompt 2,0,0,4.0,4.0,3.0,2.275,13.275 4 | 2,House 1,Prompt 3,0,0,4.0,4.0,3.2,2.81,14.01 5 | 3,House 2,Prompt 1,0,0,4.0,4.0,2.6,0.48,11.08 6 | 4,House 2,Prompt 2,0,0,4.0,4.0,2.2,0.96,11.16 7 | 5,House 2,Prompt 3,0,0,4.0,4.0,2.2,0.942,11.142 8 | 6,House 3,Prompt 1,0,0,4.0,4.0,1.8,0.0,9.8 9 | 7,House 3,Prompt 2,0,0,4.0,4.0,2.2,0.0,10.2 10 | 8,House 3,Prompt 3,0,0,4.0,4.0,1.8,0.0,9.8 11 | 9,Mush 1,Prompt 1,0,0,4.0,4.0,1.2,1.697,10.897 12 | 10,Mush 1,Prompt 2,0,0,4.0,4.0,1.4,1.417,10.817 13 | 11,Mush 1,Prompt 3,0,0,4.0,4.0,0.8,1.778,10.578 14 | 12,Mush 2,Prompt 1,0,0,4.0,4.0,1.6,0.571,10.171 15 | 13,Mush 2,Prompt 2,0,0,4.0,4.0,1.6,0.527,10.127 16 | 14,Mush 2,Prompt 3,0,0,4.0,4.0,1.4,0.737,10.137 17 | 15,PDF 1,Prompt 1,0,0,4.0,4.0,3.0,0.065,11.065 18 | 16,PDF 1,Prompt 2,0,0,4.0,4.0,3.2,0.0,11.2 19 | 17,PDF 1,Prompt 3,0,0,4.0,4.0,2.8,0.267,11.067 20 | 18,PDF 2,Prompt 1,0,0,4.0,4.0,2.6,0.0,10.6 21 | 19,PDF 2,Prompt 2,0,0,4.0,4.0,2.8,0.0,10.8 22 | 20,PDF 2,Prompt 3,0,0,4.0,4.0,2.0,0.0,10.0 23 | 21,Student 1,Prompt 1,0,0,4.0,4.0,2.8,2.289,13.089 24 | 22,Student 1,Prompt 2,0,0,3.2,4.0,2.6,2.119,11.919 25 | 23,Student 1,Prompt 3,0,0,4.0,4.0,2.6,2.527,13.127 26 | 24,Student 2,Prompt 1,0,0,4.0,4.0,3.4,0.356,11.756 27 | 25,Student 2,Prompt 2,0,0,4.0,4.0,3.2,0.55,11.75 28 | 26,Student 2,Prompt 3,0,0,4.0,4.0,3.2,0.439,11.639 29 | 27,House 1,Prompt 1,1,0,4.0,4.0,4.0,3.775,15.775 30 | 28,House 1,Prompt 
1,3,0,4.0,4.0,4.0,3.671,15.671 31 | 29,House 1,Prompt 1,5,0,4.0,4.0,4.0,3.723,15.723 32 | 30,House 2,Prompt 1,1,0,4.0,4.0,3.8,3.627,15.427 33 | 31,House 2,Prompt 1,3,0,4.0,3.6,3.8,3.609,15.009 34 | 32,House 2,Prompt 1,5,0,4.0,4.0,3.8,3.68,15.48 35 | 33,House 3,Prompt 1,1,0,4.0,3.6,4.0,3.897,15.497 36 | 34,House 3,Prompt 1,3,0,4.0,3.6,4.0,3.574,15.174 37 | 35,House 3,Prompt 1,5,0,4.0,3.6,4.0,3.892,15.492 38 | 36,Mush 1,Prompt 1,1,0,3.2,3.6,3.0,3.93,13.73 39 | 37,Mush 1,Prompt 1,3,0,3.2,3.6,3.0,3.93,13.73 40 | 38,Mush 1,Prompt 1,5,0,3.2,2.8,2.6,2.807,11.407 41 | 39,Mush 2,Prompt 1,1,0,2.4,4.0,3.0,3.695,13.095 42 | 40,Mush 2,Prompt 1,3,0,0.8,3.2,3.6,3.911,11.511 43 | 41,Mush 2,Prompt 1,5,0,0.8,3.2,3.4,3.911,11.311 44 | 42,PDF 1,Prompt 1,1,0,4.0,2.4,3.8,3.581,13.781 45 | 43,PDF 1,Prompt 1,3,0,4.0,2.4,3.8,3.581,13.781 46 | 44,PDF 1,Prompt 1,5,0,4.0,2.4,3.8,3.581,13.781 47 | 45,PDF 2,Prompt 1,1,0,0.0,2.8,4.0,3.685,10.485 48 | 46,PDF 2,Prompt 1,3,0,0.0,2.8,4.0,3.644,10.444 49 | 47,PDF 2,Prompt 1,5,0,0.0,2.8,4.0,3.725,10.525 50 | 48,Student 1,Prompt 1,1,0,4.0,3.6,3.0,3.932,14.532 51 | 49,Student 1,Prompt 1,3,0,4.0,3.6,3.0,4.0,14.6 52 | 50,Student 1,Prompt 1,5,0,4.0,3.2,3.2,3.973,14.373 53 | 51,Student 2,Prompt 1,1,0,4.0,3.6,4.0,3.617,15.217 54 | 52,Student 2,Prompt 1,3,0,4.0,4.0,4.0,3.45,15.45 55 | 53,Student 2,Prompt 1,5,0,4.0,3.6,4.0,3.728,15.328 56 | 54,House 1,Prompt 1,1,1,4.0,4.0,3.2,3.318,14.518 57 | 55,House 1,Prompt 1,1,3,4.0,4.0,3.4,3.447,14.847 58 | 56,House 1,Prompt 1,3,1,4.0,4.0,4.0,3.671,15.671 59 | 57,House 1,Prompt 1,3,3,3.2,4.0,3.8,3.706,14.706 60 | 58,House 2,Prompt 1,1,1,4.0,4.0,3.8,3.552,15.352 61 | 59,House 2,Prompt 1,1,3,4.0,4.0,3.8,3.552,15.352 62 | 60,House 2,Prompt 1,3,1,4.0,4.0,4.0,3.936,15.936 63 | 61,House 2,Prompt 1,3,3,4.0,4.0,4.0,3.872,15.872 64 | 62,House 3,Prompt 1,1,1,4.0,3.6,4.0,3.937,15.537 65 | 63,House 3,Prompt 1,1,3,4.0,3.6,4.0,3.937,15.537 66 | 64,House 3,Prompt 1,3,1,4.0,3.6,3.8,3.642,15.042 67 | 65,House 3,Prompt 
1,3,3,4.0,4.0,4.0,3.832,15.832 68 | 66,Mush 1,Prompt 1,1,1,3.2,3.6,3.6,4.0,14.4 69 | 67,Mush 1,Prompt 1,1,3,3.2,3.6,3.6,4.0,14.4 70 | 68,Mush 1,Prompt 1,3,1,3.2,3.6,3.6,4.0,14.4 71 | 69,Mush 1,Prompt 1,3,3,4.0,3.6,3.4,4.0,15.0 72 | 70,House 1,Prompt 1,3,1,4.0,4.0,4.0,3.671,15.671 73 | 71,House 1,Prompt 1,3,3,3.2,4.0,3.8,3.706,14.706 74 | 72,House 1,Prompt 1,5,1,4.0,4.0,4.0,4.0,16.0 75 | 73,House 1,Prompt 1,5,3,3.2,4.0,4.0,3.988,15.188 76 | 74,House 1,Prompt 1,5,5,4.0,4.0,4.0,3.965,15.965 77 | 75,House 2,Prompt 1,3,1,4.0,4.0,4.0,3.936,15.936 78 | 76,House 2,Prompt 1,3,3,4.0,4.0,4.0,3.872,15.872 79 | 77,House 2,Prompt 1,5,1,4.0,4.0,4.0,3.936,15.936 80 | 78,House 2,Prompt 1,5,3,4.0,4.0,3.8,3.936,15.736 81 | 79,House 2,Prompt 1,5,5,4.0,4.0,3.8,3.936,15.736 82 | 80,House 3,Prompt 1,3,1,4.0,3.6,3.8,3.642,15.042 83 | 81,House 3,Prompt 1,3,3,4.0,4.0,4.0,3.832,15.832 84 | 82,House 3,Prompt 1,5,1,4.0,3.2,4.0,4.0,15.2 85 | 83,House 3,Prompt 1,5,3,4.0,4.0,4.0,4.0,16.0 86 | 84,House 3,Prompt 1,5,5,4.0,3.6,3.8,4.0,15.4 87 | 85,Mush 1,Prompt 1,3,1,3.2,3.6,3.6,4.0,14.4 88 | 86,Mush 1,Prompt 1,3,3,4.0,3.6,3.4,4.0,15.0 89 | 87,Mush 1,Prompt 1,5,1,3.2,3.6,3.6,4.0,14.4 90 | 88,Mush 1,Prompt 1,5,3,4.0,3.6,3.2,4.0,14.8 91 | 89,Mush 1,Prompt 1,5,5,4.0,3.6,3.6,4.0,15.2 92 | 90,Mush 2,Prompt 1,3,1,0.8,3.2,3.6,3.943,11.543 93 | 91,Mush 2,Prompt 1,3,3,0.8,2.8,4.0,4.0,11.6 94 | 92,Mush 2,Prompt 1,5,1,2.4,2.4,4.0,4.0,12.8 95 | 93,Mush 2,Prompt 1,5,3,2.4,2.4,4.0,4.0,12.8 96 | 94,Mush 2,Prompt 1,5,5,2.4,2.4,4.0,4.0,12.8 97 | 95,PDF 1,Prompt 1,3,1,4.0,2.4,4.0,3.943,14.343 98 | 96,PDF 1,Prompt 1,3,3,4.0,2.4,4.0,4.0,14.4 99 | 97,PDF 1,Prompt 1,5,1,4.0,2.4,4.0,3.943,14.343 100 | 98,PDF 1,Prompt 1,5,3,4.0,2.4,4.0,4.0,14.4 101 | 99,PDF 1,Prompt 1,5,5,4.0,2.4,4.0,4.0,14.4 102 | 100,PDF 2,Prompt 1,3,1,0.0,2.8,3.6,3.927,10.327 103 | 101,PDF 2,Prompt 1,3,3,0.0,2.8,3.6,3.927,10.327 104 | 102,PDF 2,Prompt 1,5,1,0.0,3.2,4.0,3.964,11.164 105 | 103,PDF 2,Prompt 1,5,3,0.0,3.2,4.0,3.964,11.164 106 | 104,PDF 
2,Prompt 1,5,5,0.0,3.2,4.0,3.964,11.164 107 | 105,Student 1,Prompt 1,3,1,4.0,3.6,4.0,4.0,15.6 108 | 106,Student 1,Prompt 1,3,3,4.0,4.0,4.0,4.0,16.0 109 | 107,Student 1,Prompt 1,5,1,4.0,3.2,3.8,4.0,15.0 110 | 108,Student 1,Prompt 1,5,3,4.0,3.6,4.0,4.0,15.6 111 | 109,Student 1,Prompt 1,5,5,4.0,3.6,4.0,4.0,15.6 112 | 110,Student 2,Prompt 1,3,1,4.0,3.6,4.0,3.65,15.25 113 | 111,Student 2,Prompt 1,3,3,3.2,4.0,4.0,3.825,15.025 114 | 112,Student 2,Prompt 1,5,1,4.0,4.0,4.0,4.0,16.0 115 | 113,Student 2,Prompt 1,5,3,4.0,4.0,4.0,4.0,16.0 116 | 114,Student 2,Prompt 1,5,5,4.0,4.0,4.0,3.925,15.925 117 | -------------------------------------------------------------------------------- /evaluation/experiment_runner.py: -------------------------------------------------------------------------------- 1 | import metrics 2 | import os 3 | import examples 4 | import random 5 | from explingo import Explingo 6 | 7 | 8 | class ExplingoExperimentRunner: 9 | def __init__( 10 | self, llm, dataset_filepath, openai_api_key, verbose=0, save_results=True 11 | ): 12 | ( 13 | self.labeled_train, 14 | self.labeled_eval, 15 | self.unlabeled_train, 16 | self.unlabeled_eval, 17 | ) = examples.get_data(dataset_filepath) 18 | self.train_data = self.labeled_train + self.unlabeled_train 19 | self.eval_data = self.labeled_eval + self.unlabeled_eval 20 | assert len(self.train_data) == 10 21 | print(dataset_filepath) 22 | print(f"Total number of examples: {len(self.train_data) + len(self.eval_data)}") 23 | print(f"Labeled training examples: {len(self.labeled_train)}") 24 | print(f"Labeled evaluation examples: {len(self.labeled_eval)}") 25 | print(f"Unlabeled training examples: {len(self.unlabeled_train)}") 26 | print(f"Unlabeled evaluation examples: {len(self.unlabeled_eval)}") 27 | 28 | max_optimal_length = max( 29 | [ 30 | len(d.narrative.split()) / d.explanation.count("(") 31 | for d in self.labeled_train 32 | ] 33 | ) 34 | print("Max optimal length:", max_optimal_length) 35 | print("---") 36 | 37 | 
example_good_narratives = random.sample( 38 | [d.narrative for d in self.labeled_train], 5 39 | ) 40 | 41 | self.metrics = metrics.Metrics( 42 | metric_funcs=[ 43 | metrics.accuracy, 44 | metrics.completeness, 45 | metrics.fluency, 46 | metrics.conciseness, 47 | ], 48 | openai_key=openai_api_key, 49 | verbose=verbose, 50 | metric_kwargs={ 51 | "conciseness": {"max_optimal_length_per_feature": max_optimal_length}, 52 | "fluency": {"good_narratives": example_good_narratives}, 53 | }, 54 | ) 55 | 56 | self.verbose = verbose 57 | self.save_results = save_results 58 | 59 | self.explingo = Explingo( 60 | llm, 61 | context=self.labeled_train[0]["context"], 62 | labeled_train_data=self.labeled_train, 63 | unlabeled_train_data=self.unlabeled_train, 64 | ) 65 | 66 | def run_experiment(self, func, prompt=None, max_iters=100, kwargs=None): 67 | if kwargs is None: 68 | kwargs = {} 69 | 70 | total_scores = None 71 | results = [] 72 | for i, example in enumerate(self.eval_data): 73 | if i >= max_iters: 74 | break 75 | result = func( 76 | prompt=prompt, 77 | explanation=example.explanation, 78 | explanation_format=example.explanation_format, 79 | **kwargs, 80 | ) 81 | if result is not None: 82 | score = self.metrics(example, result) 83 | if total_scores is None: 84 | total_scores = score[1] 85 | else: 86 | total_scores += score[1] 87 | if self.verbose >= 1: 88 | print("Explanation:", example.explanation) 89 | print("Narrative:", result.narrative) 90 | print("Total Score:", score[0]) 91 | print( 92 | "".join( 93 | f"{metric}: {score}, " for metric, score in score[1].items() 94 | ) 95 | ) 96 | print("--") 97 | if self.save_results: 98 | results.append( 99 | { 100 | "func": func.__name__, 101 | "prompt": kwargs.get("prompt", ""), 102 | "n_few_shot": kwargs.get("n_few_shot", 0), 103 | "n_labeled_few_shot": kwargs.get("n_labeled_few_shot", 0), 104 | "n_bootstrapped_few_shot": kwargs.get( 105 | "n_bootstrapped_few_shot", 0 106 | ), 107 | "explanation": example.explanation, 108 | 
"narrative": result.narrative, 109 | "scores": "".join( 110 | f"{metric}: {score}, " 111 | for metric, score in score[1].items() 112 | ), 113 | } 114 | ) 115 | 116 | total = min(max_iters, len(self.eval_data)) 117 | average_scores = total_scores / total 118 | total_average_score = total_scores.sum() / total 119 | 120 | if self.save_results: 121 | return total_average_score, average_scores, results 122 | return total_average_score, average_scores 123 | 124 | def run_basic_prompting_experiment(self, prompt=None, max_iters=100): 125 | """ 126 | Run a basic prompting experiment 127 | 128 | Args: 129 | prompt (string): Prompt 130 | max_iters (int): Maximum number of examples to run on 131 | 132 | Returns: 133 | total_average_score (float): Average total score over all explanations 134 | average_scores (pd.Series): Average scores for each metric 135 | """ 136 | return self.run_experiment( 137 | self.explingo.basic_prompt, 138 | prompt=prompt, 139 | max_iters=max_iters, 140 | ) 141 | 142 | def run_few_shot_experiment(self, prompt=None, max_iters=100, n_few_shot=3): 143 | """ 144 | Run a few-shot experiment 145 | 146 | Args: 147 | prompt (string): Prompt 148 | max_iters (int): Maximum number of examples to run on 149 | n_few_shot (int): Number of examples to use in few-shot learning 150 | 151 | Returns: 152 | total_average_score (float): Average total score over all explanations 153 | average_scores (pd.Series): Average scores for each metric 154 | """ 155 | return self.run_experiment( 156 | self.explingo.few_shot, 157 | prompt=prompt, 158 | max_iters=max_iters, 159 | kwargs={"n_few_shot": n_few_shot}, 160 | ) 161 | 162 | def run_bootstrap_few_shot_experiment( 163 | self, 164 | prompt=None, 165 | max_iters=100, 166 | n_labeled_few_shot=3, 167 | n_bootstrapped_few_shot=3, 168 | ): 169 | """ 170 | Run a bootstrap few-shot experiment 171 | Args: 172 | prompt (string): Prompt 173 | max_iters (int): Maximum number of examples to run on 174 | n_labeled_few_shot (int): Number of 
examples to use in few-shot learning 175 | n_bootstrapped_few_shot (int): Number of bootstrapped examples to use in few-shot learning 176 | 177 | Returns: 178 | total_average_score (float): Average total score over all explanations 179 | average_scores (pd.Series): Average scores for each metric 180 | """ 181 | return self.run_experiment( 182 | self.explingo.bootstrap_few_shot, 183 | prompt=prompt, 184 | max_iters=max_iters, 185 | kwargs={ 186 | "metric": self.metrics, 187 | "n_labeled_few_shot": n_labeled_few_shot, 188 | "n_bootstrapped_few_shot": n_bootstrapped_few_shot, 189 | }, 190 | ) 191 | -------------------------------------------------------------------------------- /evaluation/explingo.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | from dspy.teleprompt import LabeledFewShot, BootstrapFewShot 3 | import random 4 | 5 | 6 | def _manually_parse_output(output): 7 | try: 8 | narrative = output.split("Narrative: ")[1].split("\n")[0] 9 | except IndexError: 10 | print(f"Unable to parse output: {output}") 11 | return None 12 | # rationalization = output.split("Rationalization: ")[1].split("\n")[0] 13 | return dspy.Prediction( 14 | narrative=narrative, 15 | # rationalization=rationalization, 16 | ) 17 | 18 | 19 | class ExplingoSig(dspy.Signature): 20 | """You are helping users understand an ML model's prediction. 
class Explingo:
    """Converts ML explanations into natural-language narratives using an LLM.

    Offers three prompting strategies: zero-shot (`basic_prompt`), few-shot
    with labeled examples (`few_shot`), and DSPy-optimized bootstrapped
    few-shot (`bootstrap_few_shot`).
    """

    def __init__(self, llm, context, labeled_train_data, unlabeled_train_data=None):
        """
        Args:
            llm: DSPy-compatible language model
            context (string): Description of what the ML model predicts
            labeled_train_data (list): Training examples with gold narratives
            unlabeled_train_data (list): Training examples without narratives
        """
        dspy.settings.configure(lm=llm, experimental=True)
        self.llm = llm
        self.context = context
        self.labeled_train_data = labeled_train_data
        self.unlabeled_train_data = (
            [] if unlabeled_train_data is None else unlabeled_train_data
        )
        # Compiled DSPy prompters, built lazily on first use
        self.few_shot_prompter = None
        self.bootstrapped_few_shot_prompter = None
        self.default_prompt = (
            "You are helping users understand an ML model's prediction. "
            "Given an explanation and information about the model, "
            "convert the explanation into a human-readable narrative."
        )

    def assemble_prompt(
        self, prompt, explanation, explanation_format, examples=None, k=3
    ):
        """Assemble the full text prompt sent to the LLM.

        Args:
            prompt (string): Instruction header
            explanation (string): Explanation to narrate
            explanation_format (string): Format the explanation is given in
            examples (list): Optional few-shot examples (objects with .context,
                .explanation, .explanation_format, .narrative attributes)
            k (int): Number of examples to sample, capped at len(examples)

        Returns:
            string: Prompt sections joined by "---" separator lines
        """
        header_string = f"{prompt}\n"
        format_string = (
            f"Follow the following format\n"
            f"Context: what the model predicts\n"
            f"Explanation: explanation of the model's prediction\n"
            f"Explanation Format: format the explanation is given in\n"
            f"Narrative: human-readable narrative version of the explanation\n"
        )
        input_string = (
            f"Context: {self.context}\n"
            f"Explanation: {explanation}\n"
            f"Explanation Format: {explanation_format}\n"
            "Please provide the output field Narrative. "
            "Do so immediately, without additional content before or after, "
            "and precisely as the format above shows."
        )

        examples_string = ""
        if examples is not None:
            # BUG FIX: random.sample raises ValueError when k > len(examples);
            # cap k so small training sets still work.
            n_samples = min(k, len(examples))
            for i, example in enumerate(random.sample(examples, n_samples)):
                examples_string += (
                    f"Example {i+1}\n"
                    f"Context: {example.context}\n"
                    f"Explanation: {example.explanation}\n"
                    f"Explanation Format: {example.explanation_format}\n"
                    f"Narrative: {example.narrative}\n"
                )

        if len(examples_string) == 0:
            return "---\n".join([header_string, format_string, input_string])
        return "---\n".join(
            [header_string, format_string, examples_string, input_string]
        )

    def basic_prompt(self, explanation, explanation_format, prompt=None, few_shot_n=0):
        """
        Basic (zero-shot) prompting

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt (defaults to self.default_prompt)
            few_shot_n (int): Unused; kept for interface compatibility

        Returns:
            DSPy Prediction object, or None if the output could not be parsed
        """
        if prompt is None:
            prompt = self.default_prompt
        full_prompt = self.assemble_prompt(
            prompt, explanation, explanation_format, examples=None
        )
        output = self.llm(full_prompt)[0]
        return _manually_parse_output(output)

    def few_shot(
        self, explanation, explanation_format, prompt=None, n_few_shot=3, use_dspy=False
    ):
        """
        Few-shot prompting

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt (defaults to self.default_prompt)
            n_few_shot (int): Number of examples to use in few-shot learning
            use_dspy (bool): Should be set to False, saving legacy version
                using DSPy in case needed later

        Returns:
            DSPy Prediction object, or None if the output could not be parsed
        """
        if prompt is None:
            prompt = self.default_prompt
        if not use_dspy:
            # BUG FIX: assemble_prompt's keyword is `k`; this was previously
            # called with `n=n_few_shot`, raising a TypeError on every call.
            full_prompt = self.assemble_prompt(
                prompt,
                explanation,
                explanation_format,
                examples=self.labeled_train_data,
                k=n_few_shot,
            )
            output = self.llm(full_prompt)[0]
            return _manually_parse_output(output)
        # Legacy DSPy path: compile the prompter once and reuse it
        if self.few_shot_prompter is None:
            optimizer = LabeledFewShot(k=n_few_shot)
            self.few_shot_prompter = optimizer.compile(
                dspy.Predict(ExplingoSig), trainset=self.labeled_train_data
            )
        return self.few_shot_prompter(
            explanation=explanation,
            explanation_format=explanation_format,
            context=self.context,
        )

    def bootstrap_few_shot(
        self,
        explanation,
        explanation_format,
        metric,
        prompt=None,
        n_labeled_few_shot=3,
        n_bootstrapped_few_shot=3,
    ):
        """
        Use DSPy to bootstrap few-shot prompts to optimize metrics

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            metric (callable): Metric to optimize (passed to BootstrapFewShot)
            prompt (string): Not supported, included for consistency. To modify
                the prompt, manually edit the docstring of ExplingoSig
            n_labeled_few_shot (int): Number of examples to use in few-shot learning
            n_bootstrapped_few_shot (int): Number of bootstrapped examples to use
                in few-shot learning

        Returns:
            DSPy Prediction object
        """
        # NOTE: the prompter is deliberately recompiled on every call, since
        # the demo counts may change between calls.
        optimizer = BootstrapFewShot(
            metric=metric,
            max_bootstrapped_demos=n_bootstrapped_few_shot,
            max_labeled_demos=n_labeled_few_shot,
            max_rounds=3,
        )
        self.bootstrapped_few_shot_prompter = optimizer.compile(
            dspy.Predict(ExplingoSig),
            trainset=self.labeled_train_data + self.unlabeled_train_data,
        )
        return self.bootstrapped_few_shot_prompter(
            explanation=explanation,
            explanation_format=explanation_format,
            context=self.context,
        )
import dspy
import pandas as pd
import random

# Maximum score a single metric can award
MAX_SCORE = 4


class RubricAssess(dspy.Signature):
    """Assess a narrative based on a rubric."""

    question = dspy.InputField(format=str)
    narrative = dspy.InputField()
    rubric = dspy.InputField()

    assessment = dspy.OutputField(
        desc="A single number from the options in the rubric. Provide only a single number with no other text."
    )


class BooleanAssess(dspy.Signature):
    """Assess a narrative with a yes/no question."""

    question = dspy.InputField(format=str)
    narrative = dspy.InputField()

    assessment = dspy.OutputField(desc="yes or no. Include only the word yes or no.")


class Metrics:
    """Callable bundle of metric functions graded by an OpenAI LLM.

    Called with trace=None (evaluation mode), returns the total score and a
    per-metric pd.Series breakdown. Called with a trace (inside DSPy
    compilation), returns a boolean pass/fail signal used to decide whether
    a bootstrapped demo is kept.
    """

    def __init__(self, metric_funcs, openai_key, verbose=0, metric_kwargs=None):
        """
        Args:
            metric_funcs (list of callables): Metric functions, invoked as
                metric(input_, output_, grader=..., trace=..., **kwargs)
            openai_key (string): OpenAI API key for the grading model
            verbose (int): Verbosity level
            metric_kwargs (dict): Extra kwargs keyed by metric function name
        """
        self.metric_funcs = metric_funcs
        self.verbose = verbose
        self.metric_kwargs = metric_kwargs if metric_kwargs is not None else {}
        self.grader = dspy.OpenAI(
            model="gpt-4o",
            max_tokens=500,
            model_type="chat",
            api_key=openai_key,
            temperature=0.3,
        )

    def __call__(self, input_, output_, trace=None):
        metrics = {}
        for metric in self.metric_funcs:
            metric_name = metric.__name__
            kwargs = self.metric_kwargs.get(metric_name, {})
            metrics[metric_name] = metric(
                input_, output_, grader=self.grader, trace=trace, **kwargs
            )

        total_score = sum(metrics.values())

        if trace is None:
            return total_score, pd.Series(metrics)
        # Compilation mode: require near-perfect scores for a demo to be kept
        return (
            (metrics["accuracy"] == MAX_SCORE)
            and (metrics["fluency"] == MAX_SCORE)
            and (metrics["completeness"] == MAX_SCORE)
            and (metrics["conciseness"] >= 3.5)
        )


def compute_score_from_boolean(metric, question, narrative, grader, iters=3):
    """Ask the grader a yes/no question `iters` times; return the fraction of
    "yes" answers scaled to MAX_SCORE. Unparseable answers are logged and
    counted as "no"."""
    total_score = 0.0

    with dspy.context(lm=grader):
        for i in range(iters):
            score = dspy.Predict(BooleanAssess)(
                question=question, narrative=narrative
            ).assessment.lower()
            if score == "yes":
                total_score += 1
            elif score == "no":
                pass
            else:
                print("Invalid score for metric %s: %s" % (metric, score))
    score = total_score / iters

    # A near 50/50 split means the grader is unsure; flag it for review
    if 0.3 < score < 0.7:
        print("Inconsistent score for metric %s: %s" % (metric, score))

    return score * MAX_SCORE


def compute_score_from_rubric(
    metric, question, rubric, narrative, grader, iters=3, rational_type=None
):
    """Ask the grader to score the narrative against a rubric `iters` times
    and return the mean of the successfully parsed integer scores.

    Args:
        metric (string): Metric name (for log messages)
        question (string): Question posed to the grader
        rubric (string): Rubric describing the score options
        narrative (string): Narrative being assessed
        grader: DSPy LM used for grading
        iters (int): Number of grading rounds
        rational_type: Optional dspy.OutputField; when given, grading uses
            ChainOfThought with this rationale prefix. (Parameter name keeps
            its historical spelling for caller compatibility.)
    """
    scores = []
    with dspy.context(lm=grader):
        for i in range(iters):
            if rational_type is None:
                score = dspy.Predict(RubricAssess)(
                    question=question, rubric=rubric, narrative=narrative
                ).assessment
            else:
                score = dspy.ChainOfThought(RubricAssess, rationale_type=rational_type)(
                    question=question,
                    rubric=rubric,
                    narrative=narrative,
                ).assessment
            try:
                scores.append(int(score))
            except ValueError:
                print("Invalid score for metric %s: %s" % (metric, score))

    # Opposite extremes across rounds mean the grader is unsure; flag it
    if 0 in scores and MAX_SCORE in scores:
        print("Inconsistent score for metric %s: %s" % (metric, scores))

    # BUG FIX: previously divided by `iters`, silently counting every
    # unparseable grader reply as a 0; average only over parsed scores.
    if not scores:
        return 0.0
    return sum(scores) / len(scores)


def accuracy(input_, output_, grader, trace=None):
    """Score (0 or 4) whether everything stated in the narrative is correct
    with respect to the explanation; missing information is not penalized."""
    question = (
        f"How accurate is the information in the narrative, based on the explanation given? "
        f"A narrative can score 4 even if it is missing information as long as everything in the narrative is correct. "
        f"Make sure the contribution direction is correct - positive contributions increase the output, negative contributions decrease the output."
        f"\n\nExplanation format: {input_.explanation_format}.\nExplanation: {input_.explanation}"
    )
    rubric = f"0 - Contains one or more errors in value or contribution direction. 4 - Contains no errors, but may be missing information."

    rational_type = dspy.OutputField(
        prefix="Start by listing out all the features in the narrative, and then for each one compare it to the explanation to ensure its value and contribution are approximately correct.",
    )

    return compute_score_from_rubric(
        "accuracy",
        question,
        rubric=rubric,
        narrative=output_.narrative,
        grader=grader,
        rational_type=rational_type,
    )


def fluency(
    input_, output_, grader, trace=None, good_narratives=None, bad_narratives=None
):
    """Score (0-4) how natural the narrative reads; when example narratives
    are provided, score stylistic similarity to them instead."""
    if good_narratives is None:
        question = f"How natural and human is the narrative?"
    else:
        question = f"How well does the style of the narrative match the style of the example narratives? Consider only the linguistic style, not the topic. Example narratives:"
        for narrative in good_narratives:
            question += f"\n{narrative}"
    if good_narratives is not None:
        rubric = f"0: Very dissimilar. 1: Dissimilar. 2: Neutral. 3: Similar. 4: Very similar"
    else:
        rubric = (
            f"0: Very unnatural. 1: Unnatural. 2: Neutral. 3: Natural. 4: Very natural"
        )
    return compute_score_from_rubric(
        "fluency", question, rubric, output_.narrative, grader
    )


def completeness(input_, output_, grader, trace=None):
    """Score (0/2/4) how completely the narrative covers the features, values,
    and contribution directions present in the explanation."""
    question = f"How completely does the narrative below describe the explanation given in <<>>?\nExplanation format: {input_.explanation_format}.\nExplanation: <<{input_.explanation}>>"
    rubric = "0 - One or more feature names from the explanation are not mentioned at all in the narrative. 2 - All features are mentioned, but not all feature values and/or contribution directions. 4 - All features are mentioned, and for each feature, includes at least an approximation of the feature's value and contribution direction."
    rational_type = dspy.OutputField(
        prefix="Start by listing out all the features in the explanations, and then determine every feature is present in the narrative, along with its value and contribution direction.",
    )

    return compute_score_from_rubric(
        "completeness",
        question,
        rubric,
        output_.narrative,
        grader,
        rational_type=rational_type,
    )


def conciseness(
    input_, output_, grader=None, trace=None, max_optimal_length_per_feature=20
):
    """Score (0-MAX_SCORE) narrative length relative to explanation size.

    Features are counted as "(" occurrences in the explanation (min 1).
    Narratives up to max_optimal_length words get full marks, then the score
    decays linearly to 0 at twice that length.
    """
    num_features = input_.explanation.count("(")
    if num_features == 0:
        num_features = 1
    length = len(output_.narrative.split())
    max_optimal_length = max_optimal_length_per_feature * num_features
    # Full marks up to max_optimal_length words, linear decay to 0 at 2x that
    # (the raw value is clamped into [0, MAX_SCORE])
    return max(
        0.0,
        min(
            MAX_SCORE,
            MAX_SCORE * (2 - length / max_optimal_length),
        ),
    )


def context_awareness(input_, output_, grader, trace=None):
    """Score (0/2/4) how well the rationalization explains the narrative's
    logic. Requires output_ to carry a `rationalization` field."""
    question = (
        f"How well does the rationalization help explain the logic in the narrative?"
    )
    rubric = f"0: Not at all. 2: Somewhat. 4: Very well."
    narrative_input = (
        f"Narrative: {output_.narrative}. Rationalization: {output_.rationalization}"
    )
    return compute_score_from_rubric(
        "context_awareness", question, rubric, narrative_input, grader
    )
Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Above ground living area square feet, 2090.00, 16382.07), (Second floor square feet, 983.00, 16216.99), (Physical locations within Ames city limits, NWAmes, -9769.73), (Type 1 finished square feet, 859.00, 6193.63), (Masonry veneer type, Stone, 5446.26)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 4 | {"prompt":"Narrative: The construction date is a negative factor.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Original construction date, 1915.00, -17966.77), (Physical locations within Ames city limits, Crawfor, 17703.26), (Second floor square feet, 756.00, 10129.96), (Total square feet of basement area, 756.00, -8362.22), (Condition of sale, Abnorml, -6786.66)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 5 | {"prompt":"Narrative: This houses exterior covering is brick, which increased the predicted price by about $17,000. The house is older than average, with a construction year of 1931, which reduced the predicted price by about $13,000. The house's two kitchens reduced the price by about $13,000. The house's second floor size of over 700 sq ft increased the price by about $10,000.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. 
Explanation: (Exterior covering on house, BrkFace, 16798.14), (Original construction date, 1931.00, -13042.68), (Kitchens above grade, 2.00, -12983.78), (Second floor square feet, 752.00, 10022.69)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"4"} 6 | {"prompt":"Narrative: The original construction date plays a significant role.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72), (Total square feet of basement area, 856.00, -6157.86)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 7 | {"prompt":"Narrative: The garden level walls have a positive impact.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Refers to walkout or garden level walls, Gd, 17607.43), (Rates the overall condition of the house, 8.00, 13038.14), (Above ground living area square feet, 1262.00, -12319.48), (Second floor square feet, 0.00, -10142.29), (Proximity to various conditions, Feedr, -8251.83)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 8 | {"prompt":"Narrative: The wood foundation decreases the predicted price.\n\nQuestion: Does the narrative contain all information from the explanation? 
Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Type of foundation, Wood, -18650.67), (Physical locations within Ames city limits, Mitchel, -13510.92), (Rates the overall material and finish of the house, 5.00, -10743.76), (Three season porch area in square feet, 320.00, 9959.33), (Bedrooms above ground, 1.00, 8905.73)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 9 | {"prompt":"Narrative: The total square feet of the basement area has a significant impact.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Type 1 finished square feet, 1369.00, 14641.53), (Evaluates the height of the basement, Ex, 13233.24), (Total square feet of basement area, 1686.00, 12138.28), (Second floor square feet, 0.00, -10142.29), (Rates the overall material and finish of the house, 8.00, 9655.79)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 10 | {"prompt":"Narrative: The total square feet of the basement area influences the prediction.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Second floor square feet, 866.00, 13079.62), (Original construction date, 2001.00, 8500.21), (Above ground living area square feet, 1786.00, 5844.30), (Physical locations within Ames city limits, CollgCr, -4761.42), (Total square feet of basement area, 920.00, -4747.08)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 
4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"0"} 11 | {"prompt":"Narrative: The large finished square footage (of around 1300) increases the price by over $14,000, while the lack of a second floor decreases it by around $10,000.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Type 1 finished square feet, 1369.00, 14641.53), (Second floor square feet, 0.00, -10142.29)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"4"} 12 | {"prompt":"Narrative: The second floor square footage of 752 increased the predicted price by about $10,000.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Second floor square feet, 752.00, 10022.69)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 4: Mentions all features, and gives information about their values and impact on the model.\n\nAssessment:","completion":"4"} 13 | {"prompt":"Narrative: The location in NoRidge affects the price.\n\nQuestion: Does the narrative contain all information from the explanation? Explanation format: SHAP feature contribution in (feature_name, feature_value, contribution) format. Explanation: (Physical locations within Ames city limits, NoRidge, 23069.89), (Above ground living area square feet, 2198.00, 20125.75), (Second floor square feet, 1053.00, 18094.05), (Rates the overall material and finish of the house, 8.00, 9655.79), (Original construction date, 2000.00, 8192.46)\n\nRubric: 0: Does not mention all features. 2: Mentions all features. 
import random

import dspy
from dspy.teleprompt import BootstrapFewShot


def _manually_parse_output(output):
    """Parse a raw LLM completion into a dspy.Prediction.

    The prompt instructs the LLM to answer with a "Narrative: ..." line;
    this extracts that line's text.

    Args:
        output (string): Raw LLM completion text

    Returns:
        dspy.Prediction with a ``narrative`` field, or None if the output
        does not contain a "Narrative: " marker
    """
    try:
        narrative = output.split("Narrative: ")[1].split("\n")[0]
    except IndexError:
        # Malformed completion: surface it for debugging rather than raising
        print(f"Unable to parse output: {output}")
        return None
    # rationalization = output.split("Rationalization: ")[1].split("\n")[0]
    return dspy.Prediction(
        narrative=narrative,
        # rationalization=rationalization,
    )


class NarratorSig(dspy.Signature):
    """You are helping users understand an ML model's prediction. Given an explanation
    and information about the model, convert the explanation into a human-readable narrative."""

    context = dspy.InputField(desc="what the ML model predicts")
    explanation = dspy.InputField(desc="explanation of an ML model's prediction")
    explanation_format = dspy.InputField(desc="format the explanation is given in")

    narrative = dspy.OutputField(desc="human-readable narrative version of the explanation")
    # rationalization = dspy.OutputField(
    #     desc="explains why given features may be relevant"
    # )


class Narrator:
    """Converts ML explanations (e.g. SHAP feature contributions) into
    human-readable narratives using an LLM, optionally with few-shot
    examples or DSPy-bootstrapped few-shot prompting."""

    def __init__(
        self,
        explanation_format,
        context,
        llm=None,
        openai_api_key=None,
        sample_narratives=None,
        gpt_model_name="gpt-4o",
    ):
        """
        Args:
            explanation_format (string): Format explanations will take
            context (string): Brief description of what the model predicts
                (ie. "the model predicts house prices")
            llm (LLM object): DSPy LLM object to use.
                See https://dspy-docs.vercel.app/docs/building-blocks/language_models for examples
                One of llm or openai_api_key must be provided
            openai_api_key (string): OpenAI API key to use
            sample_narratives (list of tuples of strings):
                List of (explanation, narrative) examples
            gpt_model_name (string): if openai_api_key is provided,
                specifies the GPT version to use

        Raises:
            ValueError: if neither llm nor openai_api_key is provided
        """
        self.llm = llm
        if self.llm is None and openai_api_key is not None:
            self.llm = dspy.OpenAI(model=gpt_model_name, api_key=openai_api_key, max_tokens=1000)
        if self.llm is None:
            # Fail fast instead of deferring a crash to the first narrate() call
            raise ValueError("One of llm or openai_api_key must be provided")
        self.context = context
        self.explanation_format = explanation_format
        # Stored as dspy.Example objects so they work for both manual prompt
        # assembly and DSPy bootstrapping
        self.sample_narratives = []
        if sample_narratives is not None:
            for example in sample_narratives:
                self.sample_narratives.append(
                    dspy.Example(
                        explanation=example[0],
                        narrative=example[1],
                        context=self.context,
                        explanation_format=explanation_format,
                    ).with_inputs("explanation", "context", "explanation_format")
                )

        self.few_shot_prompter = None
        self.bootstrapped_few_shot_prompter = None
        self.default_prompt = (
            "You are helping users understand an ML model's prediction. "
            "Given an explanation and information about the model, "
            "convert the explanation into a human-readable narrative."
        )

    def _assemble_prompt(self, prompt, explanation, explanation_format, examples=None, n=3):
        """Build the full text prompt sent to the LLM.

        Sections (header, format spec, optional examples, input) are joined
        with "---" separators.

        Args:
            prompt (string): Instruction header
            explanation (string): Explanation to narrate
            explanation_format (string): Format the explanation is given in
            examples (list of dspy.Example): Few-shot examples to sample from
            n (int): Maximum number of examples to include; capped at
                len(examples) so small sample sets do not raise

        Returns:
            string: Assembled prompt
        """
        header_string = f"{prompt}\n"
        format_string = (
            "Follow the following format\n"
            "Context: what the model predicts\n"
            "Explanation: explanation of the model's prediction\n"
            "Explanation Format: format the explanation is given in\n"
            "Narrative: human-readable narrative version of the explanation\n"
        )
        input_string = (
            f"Context: {self.context}\n"
            f"Explanation: {explanation}\n"
            f"Explanation Format: {explanation_format}\n"
            "Please provide the output field Narrative. "
            "Do so immediately, without additional content before or after, "
            "and precisely as the format above shows."
        )

        examples_string = ""
        if examples is not None:
            # Cap the sample size so random.sample does not raise ValueError
            # when fewer than n examples are available
            for i, example in enumerate(random.sample(examples, min(n, len(examples)))):
                examples_string += (
                    f"Example {i+1}\n"
                    f"Context: {example.context}\n"
                    f"Explanation: {example.explanation}\n"
                    f"Explanation Format: {example.explanation_format}\n"
                    f"Narrative: {example.narrative}\n"
                )

        if len(examples_string) == 0:
            return "---\n".join([header_string, format_string, input_string])
        else:
            return "---\n".join([header_string, format_string, examples_string, input_string])

    def narrate(self, explanation, n_examples=3, n_bootstrapped=0, grader=None):
        """
        Transform an explanation into a human-readable narrative

        Args:
            explanation (string): Explanation, in the format specified by self.explanation_format
            n_examples (int): Number of examples to pass
            n_bootstrapped (int): Number of bootstrapped examples to pass. Increasing this number
                will incur additional calls to the LLM, but may improve the quality of the output
                n_bootstrapped should be less than or equal to n_examples
            grader (Grader): Grader object to use for bootstrapping. Must be provided if
                n_bootstrapped > 0

        Returns:
            string: Human-readable narrative

        Raises:
            ValueError: if n_bootstrapped > 0 and no grader is provided
        """
        if n_bootstrapped > 0:
            if grader is None:
                # Without this check the call fails later with an opaque
                # AttributeError on grader.run_metrics
                raise ValueError("grader must be provided when n_bootstrapped > 0")
            return self.bootstrap_few_shot(
                explanation,
                self.explanation_format,
                metric=grader.run_metrics,
                n_labeled_few_shot=n_examples,
                n_bootstrapped_few_shot=n_bootstrapped,
            ).narrative
        if self.sample_narratives:
            return self.few_shot(
                explanation, self.explanation_format, n_few_shot=n_examples
            ).narrative
        else:
            return self.basic_prompt(explanation, self.explanation_format).narrative

    def basic_prompt(self, explanation, explanation_format, prompt=None, few_shot_n=0):
        """
        Basic (zero-shot) prompting

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt; defaults to self.default_prompt
            few_shot_n (int): Unused; kept for backwards compatibility

        Returns:
            DSPy Prediction object, or None if the LLM output could not be parsed
        """
        if prompt is None:
            prompt = self.default_prompt
        full_prompt = self._assemble_prompt(prompt, explanation, explanation_format, examples=None)
        output = self.llm(full_prompt)[0]
        return _manually_parse_output(output)

    def few_shot(self, explanation, explanation_format, prompt=None, n_few_shot=3, use_dspy=False):
        """
        Few-shot prompting using self.sample_narratives as examples

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            prompt (string): Prompt; defaults to self.default_prompt
            n_few_shot (int): Number of examples to use in few-shot learning
            use_dspy (bool): Should be set to False, saving legacy version using DSPy
                in case needed later

        Returns:
            DSPy Prediction object, or None if the LLM output could not be
            parsed. NOTE: the legacy DSPy path is not implemented, so
            use_dspy=True currently returns None.
        """
        if prompt is None:
            prompt = self.default_prompt
        if not use_dspy:
            full_prompt = self._assemble_prompt(
                prompt,
                explanation,
                explanation_format,
                examples=self.sample_narratives,
                n=n_few_shot,
            )
            output = self.llm(full_prompt)[0]
            return _manually_parse_output(output)

    def bootstrap_few_shot(
        self,
        explanation,
        explanation_format,
        metric,
        n_labeled_few_shot=3,
        n_bootstrapped_few_shot=3,
    ):
        """
        Use DSPy to bootstrap few-shot prompts to optimize metrics

        Args:
            explanation (string): Explanation
            explanation_format (string): Explanation format
            metric (func): Metric to use for optimization
            n_labeled_few_shot (int): Number of examples to use in few-shot learning
            n_bootstrapped_few_shot (int): Number of bootstrapped examples to use in
                few-shot learning

        Returns:
            DSPy Prediction object
        """
        with dspy.context(lm=self.llm):
            optimizer = BootstrapFewShot(
                metric=metric,
                max_bootstrapped_demos=n_bootstrapped_few_shot,
                max_labeled_demos=n_labeled_few_shot,
                max_rounds=3,
            )
            # The compiled prompter is cached on the instance so later calls
            # could reuse it if desired
            self.bootstrapped_few_shot_prompter = optimizer.compile(
                dspy.Predict(NarratorSig),
                trainset=self.sample_narratives,
            )
            return self.bootstrapped_few_shot_prompter(
                explanation=explanation,
                explanation_format=explanation_format,
                context=self.context,
            )
7 | }, 8 | { 9 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.07)", 10 | "context": "The model predicts whether a mushroom is poisonous", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The foul odor, silky stalk surface, and chocolate spore print color suggest the mushroom more likely to be poisonous." 13 | }, 14 | { 15 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.05), (spore-print-color, brown, 0.05)", 16 | "context": "The model predicts whether a mushroom is poisonous", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "The absence of odor and broad gill size suggest the mushroom is less likely to be poisonous, but the brown spore print indicates a higher risk of toxicity." 19 | }, 20 | { 21 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.07)", 22 | "context": "The model predicts whether a mushroom is poisonous", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 24 | }, 25 | { 26 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.06), (spore-print-color, black, -0.05)", 27 | "context": "The model predicts whether a mushroom is poisonous", 28 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 29 | "narrative": "The lack of odor, broad gill size, and black spore print suggest the mushroom is less likely to be poisonous" 30 | }, 31 | { 32 | "explanation": "(odor, none, -0.14), (gill-size, broad, -0.07), (spore-print-color, black, -0.05)", 33 | "context": "The model predicts whether a mushroom is poisonous", 34 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 35 | "narrative": "The lack 
of odor, a broad gill size, and a black spore print color suggest the mushroom is less likely to be poisonous." 36 | }, 37 | { 38 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.09), (spore-print-color, chocolate, 0.07)", 39 | "context": "The model predicts whether a mushroom is poisonous", 40 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 41 | "narrative": "The foul odor, silky stalk surface, and chocolate spore print color suggest the mushroom is more likely to be poisonous." 42 | }, 43 | { 44 | "explanation": "(odor, none, -0.15), (gill-spacing, crowded, 0.07), (gill-size, broad, 0.05)", 45 | "context": "The model predicts whether a mushroom is poisonous", 46 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 47 | "narrative": "The lack of an odor suggest this mushroom to be less likely to be poisonous, but the broad gill size and crowded gill spacing indicate a higher risk of toxicity." 48 | }, 49 | { 50 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.08), (ring-type, pendant, 0.05)", 51 | "context": "The model predicts whether a mushroom is poisonous", 52 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 53 | "narrative": "The lack of odor, a broad gill size, and a pendant ring type suggest the mushroom is more likely to be poisonous." 54 | }, 55 | { 56 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.06), (spore-print-color, black, 0.05)", 57 | "context": "The model predicts whether a mushroom is poisonous", 58 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 59 | "narrative": "The lack of odor, a broad gill size, and a black spore print color suggest the mushroom is more likely to be poisonous." 
60 | }, 61 | { 62 | "explanation": "(odor, none, -0.14), (gill-spacing, crowded, 0.09), (gill-size, broad, 0.06)", 63 | "context": "The model predicts whether a mushroom is poisonous", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 65 | "narrative": "The lack of odor suggest this mushroom is less likely to be poisonous. However, the crowded gill spacing and broad gill size indicate a higher risk of toxicity." 66 | }, 67 | { 68 | "explanation": "(odor, foul, 0.19), (gill-color, buff, 0.07), (gill-size, narrow, 0.07)", 69 | "context": "The model predicts whether a mushroom is poisonous", 70 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 71 | }, 72 | { 73 | "explanation": "(gill-size, narrow, -0.13), (odor, fishy, -0.10), (gill-color, buff, -0.08)", 74 | "context": "The model predicts whether a mushroom is poisonous", 75 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 76 | "narrative": "The narrow gill size, fishy odor, and buff gill color suggest the mushroom is less likely to be poisonous." 
77 | }, 78 | { 79 | "explanation": "(odor, foul, 0.19), (gill-size, narrow, 0.09), (gill-color, buff, 0.07)", 80 | "context": "The model predicts whether a mushroom is poisonous", 81 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 82 | }, 83 | { 84 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 85 | "context": "The model predicts whether a mushroom is poisonous", 86 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 87 | }, 88 | { 89 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 90 | "context": "The model predicts whether a mushroom is poisonous", 91 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 92 | }, 93 | { 94 | "explanation": "(odor, foul, 0.25), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.06)", 95 | "context": "The model predicts whether a mushroom is poisonous", 96 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 97 | }, 98 | { 99 | "explanation": "(odor, none, 0.13), (gill-size, broad, 0.08), (stalk-surface-above-ring, smooth, 0.04)", 100 | "context": "The model predicts whether a mushroom is poisonous", 101 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 102 | }, 103 | { 104 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 105 | "context": "The model predicts whether a mushroom is poisonous", 106 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 107 | }, 108 | { 109 | "explanation": "(gill-size, narrow, 0.10), (odor, spicy, 0.08), (gill-color, buff, 0.07)", 110 | "context": "The model predicts whether a mushroom is poisonous", 111 | 
"explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 112 | }, 113 | { 114 | "explanation": "(spore-print-color, green, 0.26), (ring-number, two, 0.10), (odor, none, -0.06)", 115 | "context": "The model predicts whether a mushroom is poisonous", 116 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 117 | }, 118 | { 119 | "explanation": "(odor, pungent, 0.18), (gill-size, narrow, 0.18), (stalk-shape, enlarging, 0.05)", 120 | "context": "The model predicts whether a mushroom is poisonous", 121 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 122 | }, 123 | { 124 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.08), (spore-print-color, black, 0.05)", 125 | "context": "The model predicts whether a mushroom is poisonous", 126 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 127 | }, 128 | { 129 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 130 | "context": "The model predicts whether a mushroom is poisonous", 131 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 132 | }, 133 | { 134 | "explanation": "(gill-size, narrow, 0.12), (odor, spicy, 0.10), (gill-color, buff, 0.08)", 135 | "context": "The model predicts whether a mushroom is poisonous", 136 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 137 | }, 138 | { 139 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 140 | "context": "The model predicts whether a mushroom is poisonous", 141 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 142 | }, 143 | { 144 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), 
(gill-color, buff, 0.08)", 145 | "context": "The model predicts whether a mushroom is poisonous", 146 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 147 | }, 148 | { 149 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.10), (gill-color, buff, 0.08)", 150 | "context": "The model predicts whether a mushroom is poisonous", 151 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 152 | }, 153 | { 154 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 155 | "context": "The model predicts whether a mushroom is poisonous", 156 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 157 | }, 158 | { 159 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.06)", 160 | "context": "The model predicts whether a mushroom is poisonous", 161 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 162 | } 163 | ] -------------------------------------------------------------------------------- /evaluation/eval_data/mushroom_1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.06)", 4 | "context": "The model predicts whether a mushroom is poisonous", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "This mushroom is more likely to be poisonous because its foul odor, silky stalk surface, and chocolate spore print color. Be careful!" 
7 | }, 8 | { 9 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.07)", 10 | "context": "The model predicts whether a mushroom is poisonous", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "This mushroom is more likely to be poisonous because its foul odor, silky stalk surface, and chocolate spore print color. Be careful!" 13 | }, 14 | { 15 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.05), (spore-print-color, brown, 0.05)", 16 | "context": "The model predicts whether a mushroom is poisonous", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "This mushroom is less likely to be poisonous because it has no odor and a broad gill size. However, its brown spore print color increases the likelihood of it being poisonous. Be cautious!" 19 | }, 20 | { 21 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.07)", 22 | "context": "The model predicts whether a mushroom is poisonous", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 24 | }, 25 | { 26 | "explanation": "(odor, none, -0.15), (gill-size, broad, -0.06), (spore-print-color, black, -0.05)", 27 | "context": "The model predicts whether a mushroom is poisonous", 28 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 29 | "narrative": "This mushroom is less likely to be poisonous because it has no odor, a broad gill size, and a black spore print color. You should still confirm with external sources." 
30 | }, 31 | { 32 | "explanation": "(odor, none, -0.14), (gill-size, broad, -0.07), (spore-print-color, black, -0.05)", 33 | "context": "The model predicts whether a mushroom is poisonous", 34 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 35 | "narrative": "This mushroom is less likely to be poisonous because it has no odor, a broad gill size, and a black spore print color. You should still confirm with external sources." 36 | }, 37 | { 38 | "explanation": "(odor, foul, 0.24), (stalk-surface-above-ring, silky, 0.09), (spore-print-color, chocolate, 0.07)", 39 | "context": "The model predicts whether a mushroom is poisonous", 40 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 41 | "narrative": "This mushroom is more likely to be poisonous because its foul odor, silky stalk surface, and chocolate spore print color. Be careful!" 42 | }, 43 | { 44 | "explanation": "(odor, none, -0.15), (gill-spacing, crowded, 0.07), (gill-size, broad, 0.05)", 45 | "context": "The model predicts whether a mushroom is poisonous", 46 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 47 | "narrative": "While the lack of an odor make this mushroom less likely to be poisonous, its broad gill size and crowded gill spacing increases the likelihood of it being poisonous. Be cautious!" 48 | }, 49 | { 50 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.08), (ring-type, pendant, 0.05)", 51 | "context": "The model predicts whether a mushroom is poisonous", 52 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 53 | "narrative": "This mushroom is more likely to be poisonous because it has no odor, a broad gill size, and a pendant ring type. Be careful!" 
54 | }, 55 | { 56 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.06), (spore-print-color, black, 0.05)", 57 | "context": "The model predicts whether a mushroom is poisonous", 58 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 59 | "narrative": "This mushroom is more likely to be poisonous because it has no odor, a broad gill size, and a black spore print color. Be careful!" 60 | }, 61 | { 62 | "explanation": "(odor, none, -0.14), (gill-spacing, crowded, 0.09), (gill-size, broad, 0.06)", 63 | "context": "The model predicts whether a mushroom is poisonous", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 65 | "narrative": "This mushroom is less likely to be poisonous because it has no odor. However, its crowded gill spacing and broad gill size increases the likelihood of it being poisonous. Be cautious!" 66 | }, 67 | { 68 | "explanation": "(odor, foul, 0.19), (gill-color, buff, 0.07), (gill-size, narrow, 0.07)", 69 | "context": "The model predicts whether a mushroom is poisonous", 70 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 71 | }, 72 | { 73 | "explanation": "(gill-size, narrow, -0.13), (odor, fishy, -0.10), (gill-color, buff, -0.08)", 74 | "context": "The model predicts whether a mushroom is poisonous", 75 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 76 | "narrative": "This mushroom is less likely to be poisonous because it has a narrow gill size, fishy odor, and buff gill color. We still recommend confirming with external sources." 
77 | }, 78 | { 79 | "explanation": "(odor, foul, 0.19), (gill-size, narrow, 0.09), (gill-color, buff, 0.07)", 80 | "context": "The model predicts whether a mushroom is poisonous", 81 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 82 | }, 83 | { 84 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 85 | "context": "The model predicts whether a mushroom is poisonous", 86 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 87 | }, 88 | { 89 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 90 | "context": "The model predicts whether a mushroom is poisonous", 91 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 92 | }, 93 | { 94 | "explanation": "(odor, foul, 0.25), (stalk-surface-above-ring, silky, 0.11), (spore-print-color, chocolate, 0.06)", 95 | "context": "The model predicts whether a mushroom is poisonous", 96 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 97 | }, 98 | { 99 | "explanation": "(odor, none, 0.13), (gill-size, broad, 0.08), (stalk-surface-above-ring, smooth, 0.04)", 100 | "context": "The model predicts whether a mushroom is poisonous", 101 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 102 | }, 103 | { 104 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), (gill-color, buff, 0.08)", 105 | "context": "The model predicts whether a mushroom is poisonous", 106 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 107 | }, 108 | { 109 | "explanation": "(gill-size, narrow, 0.10), (odor, spicy, 0.08), (gill-color, buff, 0.07)", 110 | "context": "The model predicts whether a mushroom is poisonous", 111 | 
"explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 112 | }, 113 | { 114 | "explanation": "(spore-print-color, green, 0.26), (ring-number, two, 0.10), (odor, none, -0.06)", 115 | "context": "The model predicts whether a mushroom is poisonous", 116 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 117 | }, 118 | { 119 | "explanation": "(odor, pungent, 0.18), (gill-size, narrow, 0.18), (stalk-shape, enlarging, 0.05)", 120 | "context": "The model predicts whether a mushroom is poisonous", 121 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 122 | }, 123 | { 124 | "explanation": "(odor, none, 0.15), (gill-size, broad, 0.08), (spore-print-color, black, 0.05)", 125 | "context": "The model predicts whether a mushroom is poisonous", 126 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 127 | }, 128 | { 129 | "explanation": "(odor, none, 0.14), (gill-spacing, crowded, 0.08), (gill-size, broad, 0.06)", 130 | "context": "The model predicts whether a mushroom is poisonous", 131 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 132 | }, 133 | { 134 | "explanation": "(gill-size, narrow, 0.12), (odor, spicy, 0.10), (gill-color, buff, 0.08)", 135 | "context": "The model predicts whether a mushroom is poisonous", 136 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 137 | }, 138 | { 139 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 140 | "context": "The model predicts whether a mushroom is poisonous", 141 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 142 | }, 143 | { 144 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.11), 
(gill-color, buff, 0.08)", 145 | "context": "The model predicts whether a mushroom is poisonous", 146 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 147 | }, 148 | { 149 | "explanation": "(gill-size, narrow, 0.12), (odor, fishy, 0.10), (gill-color, buff, 0.08)", 150 | "context": "The model predicts whether a mushroom is poisonous", 151 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 152 | }, 153 | { 154 | "explanation": "(odor, none, 0.14), (gill-size, broad, 0.07), (spore-print-color, brown, 0.05)", 155 | "context": "The model predicts whether a mushroom is poisonous", 156 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 157 | }, 158 | { 159 | "explanation": "(odor, foul, 0.22), (stalk-surface-above-ring, silky, 0.10), (spore-print-color, chocolate, 0.06)", 160 | "context": "The model predicts whether a mushroom is poisonous", 161 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 162 | } 163 | ] -------------------------------------------------------------------------------- /evaluation/eval_data/pdf_2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 74.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 4 | "context": "The model predicts whether a PDF file contains malware", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "The larger metadata size (262 KB), a larger total size (74 KB), and no Javascript keywords suggest that the PDF contains malware." 
7 | }, 8 | { 9 | "explanation": "(Number of objects, -1.0, 0.10), (Number of keywords that denote end of streams, -1.0, 0.10), (Number of streams (sequences of binary data), -1.0, 0.10)", 10 | "context": "The model predicts whether a PDF file contains malware", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The presence of fewer objects, fewer keywords that denote the end of streams, and fewer streams suggest that the PDF contains malware." 13 | }, 14 | { 15 | "explanation": "(Size of metadata in KB, 272.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 90.0, 0.08)", 16 | "context": "The model predicts whether a PDF file contains malware", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "A larger metadata size (272 KB), no Javascript keywords, and a larger total size (90 KB) suggest that the PDF contains malware." 19 | }, 20 | { 21 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Total size in KB, 7.0, 0.06), (Number of objects, -1.0, 0.04)", 22 | "context": "The model predicts whether a PDF file contains malware", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "The large metadata size (180 KB), the large total size (7 KB), and fewer objects suggest the PDF contains malware." 25 | }, 26 | { 27 | "explanation": "(Size of metadata in KB, 262.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 91.0, 0.08)", 28 | "context": "The model predicts whether a PDF file contains malware", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "The large metadata size (262 KB), no Javascript keywords, and the large total size (91 KB) suggest the PDF contains malware." 
31 | }, 32 | { 33 | "explanation": "(Size of metadata in KB, 180.0, 0.16), (Total size in KB, 3.0, 0.06), (Number of streams (sequences of binary data), 1.0, 0.04)", 34 | "context": "The model predicts whether a PDF file contains malware", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "The large metadata size (180 KB), the large total size (3 KB), and more streams suggest the PDF contains malware." 37 | }, 38 | { 39 | "explanation": "(Size of metadata in KB, 358.0, 0.10), (Number of Javascript keywords, 0.0, 0.07), (Total size in KB, 63.0, 0.06)", 40 | "context": "The model predicts whether a PDF file contains malware", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "The large metadata size (358 KB), no Javascript keywords, and the large total size (63 KB) suggest the PDF contains malware." 43 | }, 44 | { 45 | "explanation": "(Number of Javascript keywords, 3.0, 0.08), (Size of metadata in KB, 224.0, 0.06), (Number of keywords with startxref, 0.0, 0.04)", 46 | "context": "The model predicts whether a PDF file contains malware", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "The number of Javascript keywords (3), the large metadata size (224 KB), and no keywords with startxref suggest the PDF contains malware." 49 | }, 50 | { 51 | "explanation": "(Size of metadata in KB, 403.0, 0.09), (Total size in KB, 145.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 52 | "context": "The model predicts whether a PDF file contains malware", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "The large metadata size (403 KB), the large total size (145 KB), and no Javascript keywords suggest the PDF contains malware." 
55 | }, 56 | { 57 | "explanation": "(Number of Javascript keywords, 1.0, 0.10), (Number of images, -1.0, 0.06), (Number of JS keywords, 1.0, 0.05)", 58 | "context": "The model predicts whether a PDF file contains malware", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 60 | }, 61 | { 62 | "explanation": "(Size of metadata in KB, 336.0, 0.11), (Total size in KB, 58.0, 0.07), (Number of objects, 121.0, 0.07)", 63 | "context": "The model predicts whether a PDF file contains malware", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 65 | }, 66 | { 67 | "explanation": "(Number of Javascript keywords, 3.0, 0.15), (Number of JS keywords, 2.0, 0.07), (Number of keywords that denote end of streams, 2.0, 0.06)", 68 | "context": "The model predicts whether a PDF file contains malware", 69 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 70 | }, 71 | { 72 | "explanation": "(Size of metadata in KB, 289.0, 0.09), (Number of Javascript keywords, 0.0, 0.08), (Total size in KB, 27.0, 0.06)", 73 | "context": "The model predicts whether a PDF file contains malware", 74 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 75 | }, 76 | { 77 | "explanation": "(Number of Javascript keywords, 2.0, 0.10), (Total size in KB, 4.0, 0.08), (Number of entries in Xref tables, 10.0, 0.05)", 78 | "context": "The model predicts whether a PDF file contains malware", 79 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 80 | }, 81 | { 82 | "explanation": "(Size of metadata in KB, 298.0, 0.11), (Number of Javascript keywords, 0.0, 0.10), (Total size in KB, 32.0, 0.07)", 83 | "context": "The model predicts whether a PDF file contains malware", 84 | "explanation_format": "SHAP feature contribution in (feature_name, 
feature_value, contribution) format" 85 | }, 86 | { 87 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Number of Javascript keywords, 1.0, 0.08), (Total size in KB, 1.0, 0.06)", 88 | "context": "The model predicts whether a PDF file contains malware", 89 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 90 | }, 91 | { 92 | "explanation": "(Number of objects, 289.0, 0.12), (Size of metadata in KB, 388.0, 0.10), (Number of entries in Xref tables, 354.0, 0.09)", 93 | "context": "The model predicts whether a PDF file contains malware", 94 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 95 | }, 96 | { 97 | "explanation": "(Number of XFA keywords, 1.0, 0.11), (Total size in KB, 9.0, 0.08), (Size of metadata in KB, 252.0, 0.07)", 98 | "context": "The model predicts whether a PDF file contains malware", 99 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 100 | }, 101 | { 102 | "explanation": "(Number of Javascript keywords, 0.0, 0.12), (Size of metadata in KB, 299.0, 0.08), (Number of JS keywords, 0.0, 0.06)", 103 | "context": "The model predicts whether a PDF file contains malware", 104 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 105 | }, 106 | { 107 | "explanation": "(Number of Javascript keywords, 2.0, 0.12), (Total size in KB, 9.0, 0.09), (Number of JS keywords, 1.0, 0.04)", 108 | "context": "The model predicts whether a PDF file contains malware", 109 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 110 | }, 111 | { 112 | "explanation": "(Size of metadata in KB, 288.0, 0.09), (Total size in KB, 32.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 113 | "context": "The model predicts whether a PDF file contains malware", 114 | "explanation_format": "SHAP feature 
contribution in (feature_name, feature_value, contribution) format" 115 | }, 116 | { 117 | "explanation": "(Total size in KB, 7.0, 0.09), (Size of metadata in KB, 239.0, 0.07), (Number of entries in Xref tables, 10.0, 0.05)", 118 | "context": "The model predicts whether a PDF file contains malware", 119 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 120 | }, 121 | { 122 | "explanation": "(Number of Javascript keywords, 0.0, 0.09), (Size of metadata in KB, 278.0, 0.08), (Number of keywords with startxref, 2.0, 0.08)", 123 | "context": "The model predicts whether a PDF file contains malware", 124 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 125 | }, 126 | { 127 | "explanation": "(Number of Javascript keywords, 1.0, 0.11), (Number of images, -1.0, 0.07), (Number of keywords with startxref, 1.0, 0.05)", 128 | "context": "The model predicts whether a PDF file contains malware", 129 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 130 | }, 131 | { 132 | "explanation": "(Size of metadata in KB, 327.0, 0.10), (Total size in KB, 75.0, 0.06), (Number of entries in Xref tables, 368.0, 0.05)", 133 | "context": "The model predicts whether a PDF file contains malware", 134 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 135 | }, 136 | { 137 | "explanation": "(Size of metadata in KB, -1.0, 0.09), (Number of Javascript keywords, 1.0, 0.06), (Total size in KB, -1.0, 0.04)", 138 | "context": "The model predicts whether a PDF file contains malware", 139 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 140 | }, 141 | { 142 | "explanation": "(Number of Javascript keywords, 0.0, 0.10), (Size of metadata in KB, 283.0, 0.09), (Total size in KB, 78.0, 0.07)", 143 | "context": "The model predicts whether 
a PDF file contains malware", 144 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 145 | }, 146 | { 147 | "explanation": "(Size of metadata in KB, 180.0, 0.21), (Total size in KB, 9.0, 0.13), (Contains text, 0.0, 0.04)", 148 | "context": "The model predicts whether a PDF file contains malware", 149 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 150 | }, 151 | { 152 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 80.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 153 | "context": "The model predicts whether a PDF file contains malware", 154 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 155 | }, 156 | { 157 | "explanation": "(Number of Javascript keywords, 4.0, 0.12), (Total size in KB, 3.0, 0.08), (Number of JS keywords, 3.0, 0.06)", 158 | "context": "The model predicts whether a PDF file contains malware", 159 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 160 | } 161 | ] -------------------------------------------------------------------------------- /explingo/grader.py: -------------------------------------------------------------------------------- 1 | import dspy 2 | import pandas as pd 3 | 4 | MAX_SCORE = 4 5 | 6 | 7 | class RubricAssess(dspy.Signature): 8 | """Assess a narrative based on a rubric.""" 9 | 10 | question = dspy.InputField(format=str) 11 | narrative = dspy.InputField() 12 | rubric = dspy.InputField() 13 | 14 | assessment = dspy.OutputField( 15 | desc="A single number from the options in the rubric. " 16 | "Provide only a single number with no other text." 
17 | ) 18 | 19 | 20 | class BooleanAssess(dspy.Signature): 21 | """Assess a narrative with a yes/no question.""" 22 | 23 | question = dspy.InputField(format=str) 24 | narrative = dspy.InputField() 25 | 26 | assessment = dspy.OutputField(desc="yes or no. Include only the word yes or no.") 27 | 28 | 29 | class Grader: 30 | def __init__( 31 | self, 32 | llm=None, 33 | openai_api_key=None, 34 | metrics="all", 35 | sample_narratives=None, 36 | max_optimal_length=None, 37 | ): 38 | """ 39 | Grades narratives 40 | 41 | Args: 42 | llm (LLM): LLM to use to grade accuracy, completeness, and fluency. 43 | One of llm or openai_api_key must be provided 44 | openai_api_key (string): OpenAI API key to use to grade accuracy, completeness, 45 | and fluency 46 | metrics (list of strings or "all"): One or more of 47 | "accuracy", "completeness", "fluency", "conciseness" 48 | sample_narratives (list of strings, or (string, string) tuples): 49 | Sample narratives to use to grade fluency. Can pass in either just the narratives 50 | or (explanation, narrative) tuples 51 | max_optimal_length (int): Hyperparameter for conciseness metric, defaults to number of 52 | words in longest sample narrative or 100 if not given 53 | """ 54 | self.metrics = metrics 55 | 56 | if metrics == "all": 57 | self.metrics = ["accuracy", "completeness", "fluency", "conciseness"] 58 | 59 | self.metric_funcs = [] 60 | # TODO: CLEAN THIS UP TO DIRECTLY TAKE FUNCTION FROM NAME 61 | if "accuracy" in metrics: 62 | self.metric_funcs.append(accuracy) 63 | if "completeness" in metrics: 64 | self.metric_funcs.append(completeness) 65 | if "fluency" in metrics: 66 | self.metric_funcs.append("fluency") 67 | if "conciseness" in metrics: 68 | self.metric_funcs.append("conciseness") 69 | 70 | self.sample_narratives = sample_narratives 71 | 72 | if sample_narratives is not None and ( 73 | isinstance(self.sample_narratives[0], list) 74 | or isinstance(self.sample_narratives[0], tuple) 75 | ): 76 | self.sample_narratives = 
[narrative[1] for narrative in self.sample_narratives] 77 | 78 | self.max_optimal_length = max_optimal_length 79 | if max_optimal_length is None and self.sample_narratives is not None: 80 | self.max_optimal_length = max( 81 | [len(narrative.split()) for narrative in self.sample_narratives] 82 | ) 83 | if self.max_optimal_length is None: 84 | self.max_optimal_length = 100 85 | 86 | self.grader_llm = llm 87 | self.openai_api_key = openai_api_key 88 | if self.grader_llm is None and self.openai_api_key is not None: 89 | self.grader_llm = dspy.OpenAI( 90 | model="gpt-4o", 91 | api_key=self.openai_api_key, 92 | max_tokens=1000, 93 | temperature=0.0, 94 | ) 95 | 96 | def run_metrics(self, input_, output_, trace): 97 | results = {} 98 | if "accuracy" in self.metrics: 99 | results["accuracy"] = accuracy(input_, output_, grader=self.grader_llm, trace=trace) 100 | if "completeness" in self.metrics: 101 | results["completeness"] = completeness( 102 | input_, output_, grader=self.grader_llm, trace=trace 103 | ) 104 | if "fluency" in self.metrics: 105 | results["fluency"] = fluency( 106 | input_, 107 | output_, 108 | grader=self.grader_llm, 109 | trace=trace, 110 | good_narratives=self.sample_narratives, 111 | ) 112 | if "conciseness" in self.metrics: 113 | results["conciseness"] = conciseness( 114 | input_, output_, max_optimal_length_per_feature=self.max_optimal_length 115 | ) 116 | 117 | if trace is None: 118 | return pd.Series(results) 119 | else: 120 | return ( 121 | (results.get("accuracy", MAX_SCORE) == MAX_SCORE) 122 | and (results.get("fluency", MAX_SCORE) == MAX_SCORE) 123 | and (results.get("completeness", MAX_SCORE) == MAX_SCORE) 124 | and (results.get("conciseness", MAX_SCORE) >= 3.5) 125 | ) 126 | 127 | def __call__(self, explanation, explanation_format, narrative, trace=None): 128 | input_ = dspy.Example(explanation=explanation, explanation_format=explanation_format) 129 | output_ = dspy.Prediction(narrative=narrative) 130 | return self.run_metrics(input_, 
output_, trace) 131 | 132 | 133 | def compute_score_from_boolean(metric, question, narrative, grader, iters=3): 134 | total_score = 0.0 135 | 136 | with dspy.context(lm=grader): 137 | for i in range(iters): 138 | score = dspy.Predict(BooleanAssess)( 139 | question=question, narrative=narrative 140 | ).assessment.lower() 141 | if score == "yes": 142 | total_score += 1 143 | elif score == "no": 144 | pass 145 | else: 146 | print("Invalid score for metric %s: %s" % (metric, score)) 147 | score = total_score / iters 148 | 149 | if 0.3 < score < 0.7: 150 | print("Inconsistent score for metric %s: %s" % (metric, score)) 151 | 152 | return score * MAX_SCORE 153 | 154 | 155 | def compute_score_from_rubric( 156 | metric, question, rubric, narrative, grader, iters=3, rational_type=None 157 | ): 158 | scores = [] 159 | with dspy.context(lm=grader): 160 | for i in range(iters): 161 | if rational_type is None: 162 | score = dspy.Predict(RubricAssess)( 163 | question=question, rubric=rubric, narrative=narrative 164 | ).assessment 165 | else: 166 | score = dspy.ChainOfThought(RubricAssess, rationale_type=rational_type)( 167 | question=question, 168 | rubric=rubric, 169 | narrative=narrative, 170 | ).assessment 171 | try: 172 | scores.append(int(score)) 173 | except ValueError: 174 | print("Invalid score for metric %s: %s" % (metric, score)) 175 | 176 | if 0 in scores and MAX_SCORE in scores: 177 | print("Inconsistent score for metric %s: %s" % (metric, scores)) 178 | 179 | return sum(scores) / iters 180 | 181 | 182 | def accuracy(input_, output_, grader, trace=None): 183 | question = ( 184 | f"How accurate is the information in the narrative, based on the explanation given? " 185 | f"A narrative can score 4 even if it is missing information as long as everything " 186 | f"in the narrative is correct. Make sure the contribution direction is correct - " 187 | f"positive contributions increase the output, negative contributions decrease the output." 
188 | f"\n\nExplanation format: {input_.explanation_format}.\nExplanation: {input_.explanation}" 189 | ) 190 | rubric = ( 191 | "0 - Contains one or more errors in value or contribution direction. " 192 | "4 - Contains no errors, but may be missing information." 193 | ) 194 | 195 | rational_type = dspy.OutputField( 196 | prefix="Start by listing out all the features in the narrative, and then for each one " 197 | "compare it to the explanation to ensure its value and contribution " 198 | "are approximately correct.", 199 | ) 200 | 201 | return compute_score_from_rubric( 202 | "accuracy", 203 | question, 204 | rubric=rubric, 205 | narrative=output_.narrative, 206 | grader=grader, 207 | rational_type=rational_type, 208 | ) 209 | 210 | 211 | def fluency(input_, output_, grader, trace=None, good_narratives=None): 212 | if good_narratives is None: 213 | question = "How natural and human is the narrative?" 214 | else: 215 | question = ( 216 | "How well does the style of the narrative match the style of the example " 217 | "narratives? Consider only the linguistic style, not the topic. " 218 | "Example narratives:" 219 | ) 220 | for narrative in good_narratives: 221 | question += f"\n{narrative}" 222 | if good_narratives is not None: 223 | rubric = "0: Very dissimilar. 1: Dissimilar. 2: Neutral. 3: Similar. 4: Very similar" 224 | else: 225 | rubric = "0: Very unnatural. 1: Unnatural. 2: Neutral. 3: Natural. 4: Very natural" 226 | return compute_score_from_rubric("fluency", question, rubric, output_.narrative, grader) 227 | 228 | 229 | def completeness(input_, output_, grader, trace=None): 230 | question = ( 231 | f"How completely does the narrative below describe the explanation given?" 232 | f"\nExplanation format: {input_.explanation_format}." 233 | f"\nExplanation: {input_.explanation}" 234 | ) 235 | rubric = ( 236 | "0 - One or more feature names from the explanation are not mentioned at all in the " 237 | "narrative. 
2 - All features are mentioned, but not all feature values and/or " 238 | "contribution directions. 4 - All features are mentioned, and for each feature, " 239 | "includes at least an approximation of the feature's value and contribution " 240 | "direction." 241 | ) 242 | rational_type = dspy.OutputField( 243 | prefix="Start by listing out all the features in the explanations, and then determine " 244 | "every feature is present in the narrative, along with its value and " 245 | "contribution direction.", 246 | ) 247 | 248 | return compute_score_from_rubric( 249 | "completeness", 250 | question, 251 | rubric, 252 | output_.narrative, 253 | grader, 254 | rational_type=rational_type, 255 | ) 256 | 257 | 258 | def conciseness(input_, output_, grader=None, trace=None, max_optimal_length_per_feature=20): 259 | num_features = input_.explanation.count("(") 260 | if num_features == 0: 261 | num_features = 1 262 | length = len(output_.narrative.split()) 263 | max_optimal_length = max_optimal_length_per_feature * num_features 264 | # scale length between 0 and 2 265 | return max( 266 | 0.0, 267 | min( 268 | MAX_SCORE, 269 | MAX_SCORE * (2 - length / max_optimal_length), 270 | ), 271 | ) 272 | 273 | 274 | def context_awareness(input_, output_, grader, trace=None): 275 | question = "How well does the rationalization help explain the logic in the narrative?" 276 | rubric = "0: Not at all. 2: Somewhat. 4: Very well." 277 | narrative_input = f"Narrative: {output_.narrative}. 
Rationalization: {output_.rationalization}" 278 | return compute_score_from_rubric( 279 | "context_awareness", question, rubric, narrative_input, grader 280 | ) 281 | -------------------------------------------------------------------------------- /evaluation/eval_data/pdf_1.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 74.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 4 | "context": "The model predicts whether a PDF file contains malware", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (262 KB), a larger total size (74 KB), and no Javascript keywords." 7 | }, 8 | { 9 | "explanation": "(Number of objects, -1.0, 0.10), (Number of keywords that denote end of streams, -1.0, 0.10), (Number of streams (sequences of binary data), -1.0, 0.10)", 10 | "context": "The model predicts whether a PDF file contains malware", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The PDF file is more likely to contain malware because it has fewer objects, fewer keywords that denote the end of streams, and fewer streams." 13 | }, 14 | { 15 | "explanation": "(Size of metadata in KB, 272.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 90.0, 0.08)", 16 | "context": "The model predicts whether a PDF file contains malware", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (272 KB), no Javascript keywords, and a larger total size (90 KB)." 
19 | }, 20 | { 21 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Total size in KB, 7.0, 0.06), (Number of objects, -1.0, 0.04)", 22 | "context": "The model predicts whether a PDF file contains malware", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (180 KB), a larger total size (7 KB), and fewer objects." 25 | }, 26 | { 27 | "explanation": "(Size of metadata in KB, 262.0, 0.10), (Number of Javascript keywords, 0.0, 0.09), (Total size in KB, 91.0, 0.08)", 28 | "context": "The model predicts whether a PDF file contains malware", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (262 KB), no Javascript keywords, and a larger total size (91 KB)." 31 | }, 32 | { 33 | "explanation": "(Size of metadata in KB, 180.0, 0.16), (Total size in KB, 3.0, 0.06), (Number of streams (sequences of binary data), 1.0, 0.04)", 34 | "context": "The model predicts whether a PDF file contains malware", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (180 KB), a larger total size (3 KB), and more streams." 
37 | }, 38 | { 39 | "explanation": "(Size of metadata in KB, 358.0, 0.10), (Number of Javascript keywords, 0.0, 0.07), (Total size in KB, 63.0, 0.06)", 40 | "context": "The model predicts whether a PDF file contains malware", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (358 KB), no Javascript keywords, and a larger total size (63 KB)." 43 | }, 44 | { 45 | "explanation": "(Number of Javascript keywords, 3.0, 0.08), (Size of metadata in KB, 224.0, 0.06), (Number of keywords with startxref, 0.0, 0.04)", 46 | "context": "The model predicts whether a PDF file contains malware", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "The PDF file is more likely to contain malware because it has more Javascript keywords (3), a larger metadata size (224 KB), and no keywords with startxref." 49 | }, 50 | { 51 | "explanation": "(Size of metadata in KB, 403.0, 0.09), (Total size in KB, 145.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 52 | "context": "The model predicts whether a PDF file contains malware", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "The PDF file is more likely to contain malware because it has a larger metadata size (403 KB), a larger total size (145 KB), and no Javascript keywords." 
55 | }, 56 | { 57 | "explanation": "(Number of Javascript keywords, 1.0, 0.10), (Number of images, -1.0, 0.06), (Number of JS keywords, 1.0, 0.05)", 58 | "context": "The model predicts whether a PDF file contains malware", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 60 | }, 61 | { 62 | "explanation": "(Size of metadata in KB, 336.0, 0.11), (Total size in KB, 58.0, 0.07), (Number of objects, 121.0, 0.07)", 63 | "context": "The model predicts whether a PDF file contains malware", 64 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 65 | }, 66 | { 67 | "explanation": "(Number of Javascript keywords, 3.0, 0.15), (Number of JS keywords, 2.0, 0.07), (Number of keywords that denote end of streams, 2.0, 0.06)", 68 | "context": "The model predicts whether a PDF file contains malware", 69 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 70 | }, 71 | { 72 | "explanation": "(Size of metadata in KB, 289.0, 0.09), (Number of Javascript keywords, 0.0, 0.08), (Total size in KB, 27.0, 0.06)", 73 | "context": "The model predicts whether a PDF file contains malware", 74 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 75 | }, 76 | { 77 | "explanation": "(Number of Javascript keywords, 2.0, 0.10), (Total size in KB, 4.0, 0.08), (Number of entries in Xref tables, 10.0, 0.05)", 78 | "context": "The model predicts whether a PDF file contains malware", 79 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 80 | }, 81 | { 82 | "explanation": "(Size of metadata in KB, 298.0, 0.11), (Number of Javascript keywords, 0.0, 0.10), (Total size in KB, 32.0, 0.07)", 83 | "context": "The model predicts whether a PDF file contains malware", 84 | "explanation_format": "SHAP feature contribution in (feature_name, 
feature_value, contribution) format" 85 | }, 86 | { 87 | "explanation": "(Size of metadata in KB, 180.0, 0.11), (Number of Javascript keywords, 1.0, 0.08), (Total size in KB, 1.0, 0.06)", 88 | "context": "The model predicts whether a PDF file contains malware", 89 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 90 | }, 91 | { 92 | "explanation": "(Number of objects, 289.0, 0.12), (Size of metadata in KB, 388.0, 0.10), (Number of entries in Xref tables, 354.0, 0.09)", 93 | "context": "The model predicts whether a PDF file contains malware", 94 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 95 | }, 96 | { 97 | "explanation": "(Number of XFA keywords, 1.0, 0.11), (Total size in KB, 9.0, 0.08), (Size of metadata in KB, 252.0, 0.07)", 98 | "context": "The model predicts whether a PDF file contains malware", 99 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 100 | }, 101 | { 102 | "explanation": "(Number of Javascript keywords, 0.0, 0.12), (Size of metadata in KB, 299.0, 0.08), (Number of JS keywords, 0.0, 0.06)", 103 | "context": "The model predicts whether a PDF file contains malware", 104 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 105 | }, 106 | { 107 | "explanation": "(Number of Javascript keywords, 2.0, 0.12), (Total size in KB, 9.0, 0.09), (Number of JS keywords, 1.0, 0.04)", 108 | "context": "The model predicts whether a PDF file contains malware", 109 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 110 | }, 111 | { 112 | "explanation": "(Size of metadata in KB, 288.0, 0.09), (Total size in KB, 32.0, 0.07), (Number of Javascript keywords, 0.0, 0.06)", 113 | "context": "The model predicts whether a PDF file contains malware", 114 | "explanation_format": "SHAP feature 
contribution in (feature_name, feature_value, contribution) format" 115 | }, 116 | { 117 | "explanation": "(Total size in KB, 7.0, 0.09), (Size of metadata in KB, 239.0, 0.07), (Number of entries in Xref tables, 10.0, 0.05)", 118 | "context": "The model predicts whether a PDF file contains malware", 119 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 120 | }, 121 | { 122 | "explanation": "(Number of Javascript keywords, 0.0, 0.09), (Size of metadata in KB, 278.0, 0.08), (Number of keywords with startxref, 2.0, 0.08)", 123 | "context": "The model predicts whether a PDF file contains malware", 124 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 125 | }, 126 | { 127 | "explanation": "(Number of Javascript keywords, 1.0, 0.11), (Number of images, -1.0, 0.07), (Number of keywords with startxref, 1.0, 0.05)", 128 | "context": "The model predicts whether a PDF file contains malware", 129 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 130 | }, 131 | { 132 | "explanation": "(Size of metadata in KB, 327.0, 0.10), (Total size in KB, 75.0, 0.06), (Number of entries in Xref tables, 368.0, 0.05)", 133 | "context": "The model predicts whether a PDF file contains malware", 134 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 135 | }, 136 | { 137 | "explanation": "(Size of metadata in KB, -1.0, 0.09), (Number of Javascript keywords, 1.0, 0.06), (Total size in KB, -1.0, 0.04)", 138 | "context": "The model predicts whether a PDF file contains malware", 139 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 140 | }, 141 | { 142 | "explanation": "(Number of Javascript keywords, 0.0, 0.10), (Size of metadata in KB, 283.0, 0.09), (Total size in KB, 78.0, 0.07)", 143 | "context": "The model predicts whether 
a PDF file contains malware", 144 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 145 | }, 146 | { 147 | "explanation": "(Size of metadata in KB, 180.0, 0.21), (Total size in KB, 9.0, 0.13), (Contains text, 0.0, 0.04)", 148 | "context": "The model predicts whether a PDF file contains malware", 149 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 150 | }, 151 | { 152 | "explanation": "(Size of metadata in KB, 262.0, 0.11), (Total size in KB, 80.0, 0.09), (Number of Javascript keywords, 0.0, 0.09)", 153 | "context": "The model predicts whether a PDF file contains malware", 154 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 155 | }, 156 | { 157 | "explanation": "(Number of Javascript keywords, 4.0, 0.12), (Total size in KB, 3.0, 0.08), (Number of JS keywords, 3.0, 0.06)", 158 | "context": "The model predicts whether a PDF file contains malware", 159 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 160 | } 161 | ] -------------------------------------------------------------------------------- /evaluation/eval_data/student_2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Family eductional support, no, -2.26), (In a romantic relationship, no, 1.11), (Sex, M, -0.60)", 4 | "context": "The model predicts whether a student will pass their class", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 6 | "narrative": "The lack of family support, and the sex (male) suggest the student is less likely to pass the class. But, the lack of a romantic relationship indicates a higher probability of passing." 
7 | }, 8 | { 9 | "explanation": "(Family eductional support, yes, 1.21), (In a romantic relationship, no, 1.12), (Age, 17, -0.45)", 10 | "context": "The model predicts whether a student will pass their class", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "The lack of a romantic relationship and having family support suggest the student is more likely to pass. However, being 17 indicates a lower probability of passing." 13 | }, 14 | { 15 | "explanation": "(In a romantic relationship, yes, -2.00), (Family eductional support, no, -1.99), (Sex, M, -0.49)", 16 | "context": "The model predicts whether a student will pass their class", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "The student's involvement in a romantic relationship, lack of family support, and being male suggest they are less likely to pass the class." 19 | }, 20 | { 21 | "explanation": "(Family eductional support, no, -1.37), (School, MS, -0.59)", 22 | "context": "The model predicts whether a student will pass their class", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "The lack of family support and attending the MS school indicate a lower probability of passing." 25 | }, 26 | { 27 | "explanation": "(Student's guardian, father, 1.74), (In a romantic relationship, no, 1.53), (Family eductional support, yes, 1.09)", 28 | "context": "The model predicts whether a student will pass their class", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "The presence of a father as a guardian, the lack of a romantic relationship, and family support suggest the student is more likely to pass." 
31 | }, 32 | { 33 | "explanation": "(In a romantic relationship, yes, -1.74), (Family eductional support, no, -1.51), (Student's guardian, mother, -0.31)", 34 | "context": "The model predicts whether a student will pass their class", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "Being in a romantic relationship, lacking family support, and having their mother as a guardian suggest the student may face challenges in passing." 37 | }, 38 | { 39 | "explanation": "(In a romantic relationship, yes, -1.86), (Student's guardian, father, 1.12), (Family eductional support, no, -1.02)", 40 | "context": "The model predicts whether a student will pass their class", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "While having a father as a guardian is a positive factor, the romantic relationship and lack of family support might hinder the student's chances of passing." 43 | }, 44 | { 45 | "explanation": "(Family eductional support, no, -2.01), (Student's guardian, father, 1.49), (In a romantic relationship, no, 1.07)", 46 | "context": "The model predicts whether a student will pass their class", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "The lack of family support is concerning, but having a father as a guardian and not being in a romantic relationship might increase the student's chances of passing." 
49 | }, 50 | { 51 | "explanation": "(Family eductional support, no, -2.36), (In a romantic relationship, no, 1.05), (Sex, M, -0.44)", 52 | "context": "The model predicts whether a student will pass their class", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "The student's sex (male) and lack of family support are concerning, but not being in a romantic relationship might slightly improve their chances of passing" 55 | }, 56 | { 57 | "explanation": "(Family eductional support, no, -1.60), (In a romantic relationship, no, 0.81), (Attended nursery school, no, -0.61)", 58 | "context": "The model predicts whether a student will pass their class", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 60 | "narrative": "The lack of family support and not attending nursery school decrease the student's likelihood of passing. However, not being in a romantic relationship could be an advantage" 61 | }, 62 | { 63 | "explanation": "(Family eductional support, no, -2.01), (In a romantic relationship, no, 1.02), (Student's guardian, mother, -0.40)", 64 | "context": "The model predicts whether a student will pass their class", 65 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 66 | "narrative": "The lack of family support and having the mother as a guardian might negatively impact the student's chances of passing. However, not being in a romantic relationship may offer some improvement." 
67 | }, 68 | { 69 | "explanation": "(Family eductional support, yes, 1.25), (In a romantic relationship, no, 1.06), (Frequency of going out with friends (1-5), 5, -0.60)", 70 | "context": "The model predicts whether a student will pass their class", 71 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 72 | "narrative": "Strong family support and not being in a romantic relationship suggest the student is more likely to pass. However, frequently going out with friends could negatively impact their chances." 73 | }, 74 | { 75 | "explanation": "(In a romantic relationship, no, 0.92), (Family eductional support, yes, 0.84), (Quality of family relationships (1-5), 2, -0.82)", 76 | "context": "The model predicts whether a student will pass their class", 77 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 78 | }, 79 | { 80 | "explanation": "(In a romantic relationship, no, 1.24), (Family eductional support, yes, 1.12), (Quality of family relationships (1-5), 4, 0.44)", 81 | "context": "The model predicts whether a student will pass their class", 82 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 83 | }, 84 | { 85 | "explanation": "(In a romantic relationship, yes, -2.23), (Family eductional support, yes, 0.78), (School, MS, -0.77)", 86 | "context": "The model predicts whether a student will pass their class", 87 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 88 | "narrative": "The student's involvement in a romantic relationship and attendance at the MS school present challenges to passing, though the presence of family support offers some positive influence." 
89 | }, 90 | { 91 | "explanation": "(Family eductional support, no, -1.70), (Student's guardian, father, 1.00), (In a romantic relationship, no, 0.95)", 92 | "context": "The model predicts whether a student will pass their class", 93 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 94 | }, 95 | { 96 | "explanation": "(In a romantic relationship, yes, -1.71), (Family eductional support, yes, 0.78), (School, MS, -0.69)", 97 | "context": "The model predicts whether a student will pass their class", 98 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 99 | }, 100 | { 101 | "explanation": "(In a romantic relationship, no, 1.13), (Family eductional support, yes, 1.10), (Reason for choosing this school, home, 0.62)", 102 | "context": "The model predicts whether a student will pass their class", 103 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 104 | "narrative": "The lack of a romantic relationship, presence of family support, and choosing the school based on home environment suggest the student is more likely to pass" 105 | }, 106 | { 107 | "explanation": "(Family eductional support, no, -1.42), (Student's guardian, father, 1.27), (In a romantic relationship, no, 1.00)", 108 | "context": "The model predicts whether a student will pass their class", 109 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 110 | }, 111 | { 112 | "explanation": "(In a romantic relationship, no, 1.25), (Family eductional support, yes, 1.16), (Sex, M, -0.54)", 113 | "context": "The model predicts whether a student will pass their class", 114 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 115 | }, 116 | { 117 | "explanation": "(In a romantic relationship, yes, -1.84), (Family eductional support, yes, 0.82), (Home 
to school travel time, 2, 0.39)", 118 | "context": "The model predicts whether a student will pass their class", 119 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 120 | }, 121 | { 122 | "explanation": "(Family eductional support, no, -2.14), (In a romantic relationship, no, 0.96), (Student's guardian, mother, -0.37)", 123 | "context": "The model predicts whether a student will pass their class", 124 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 125 | }, 126 | { 127 | "explanation": "(Family eductional support, yes, 1.27), (In a romantic relationship, no, 1.05), (Age, 18, -0.40)", 128 | "context": "The model predicts whether a student will pass their class", 129 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 130 | }, 131 | { 132 | "explanation": "(In a romantic relationship, yes, -2.67), (Family eductional support, yes, 0.83), (Sex, F, 0.32)", 133 | "context": "The model predicts whether a student will pass their class", 134 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 135 | }, 136 | { 137 | "explanation": "(In a romantic relationship, yes, -2.41), (Family eductional support, yes, 0.63), (Amount of free time after school (1-5), 4, -0.47)", 138 | "context": "The model predicts whether a student will pass their class", 139 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 140 | }, 141 | { 142 | "explanation": "(Family eductional support, no, -2.22), (In a romantic relationship, no, 0.99), (Age, 16, 0.39)", 143 | "context": "The model predicts whether a student will pass their class", 144 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 145 | }, 146 | { 147 | "explanation": "(In a romantic relationship, no, 1.13), (Family 
eductional support, yes, 1.06), (Attended nursery school, no, -0.81)", 148 | "context": "The model predicts whether a student will pass their class", 149 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 150 | }, 151 | { 152 | "explanation": "(In a romantic relationship, yes, -2.31), (Student's guardian, father, 0.79), (Family eductional support, yes, 0.61)", 153 | "context": "The model predicts whether a student will pass their class", 154 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 155 | }, 156 | { 157 | "explanation": "(In a romantic relationship, yes, -1.91), (Family eductional support, no, -1.67), (Reason for choosing this school, home, 0.62)", 158 | "context": "The model predicts whether a student will pass their class", 159 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 160 | }, 161 | { 162 | "explanation": "(In a romantic relationship, yes, -1.77), (Student's guardian, father, 1.36), (Family eductional support, no, -0.99)", 163 | "context": "The model predicts whether a student will pass their class", 164 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 165 | } 166 | ] 167 | -------------------------------------------------------------------------------- /evaluation/eval_data/housing_3.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "explanation": "(Above ground living area square feet, 1256.00, -12527.46), (Rates the overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29), (Physical locations within Ames city limits, Edwards, -9913.81), (Wood deck area in square feet, 736.00, 9846.38)", 4 | "context": "The ML model predicts house prices", 5 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, 
contribution) format", 6 | "narrative": "This house is cheaper because it has less above ground living space (size=1256), lower material quality (rating=5), no second floor (size=0), and it's in Edwards. However, the deck is larger (size=736), which makes it a bit more expensive." 7 | }, 8 | { 9 | "explanation": "(Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72), (Total square feet of basement area, 856.00, -6157.86), (First Floor square feet, 856.00, -5466.64), (Physical locations within Ames city limits, CollgCr, -4761.42)", 10 | "context": "The ML model predicts house prices", 11 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 12 | "narrative": "This house costs more because of its larger second floor (size=854) and newer construction year (2003). It’s cheaper due to a smaller basement (size=856) and first floor (size=856), and its location in CollgCr." 13 | }, 14 | { 15 | "explanation": "(Refers to walkout or garden level walls, Gd, 17607.43), (Rates the overall condition of the house, 8.00, 13038.14), (Above ground living area square feet, 1262.00, -12319.48), (Second floor square feet, 0.00, -10142.29), (Proximity to various conditions, Feedr, -8251.83)", 16 | "context": "The ML model predicts house prices", 17 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 18 | "narrative": "This house is more expensive because of its good garden level walls and overall condition (rating=8). It’s cheaper due to less above ground space (size=1262), no second floor (size=0), and being on a feeder street." 
19 | }, 20 | { 21 | "explanation": "(Second floor square feet, 866.00, 13079.62), (Original construction date, 2001.00, 8500.21), (Above ground living area square feet, 1786.00, 5844.30), (Physical locations within Ames city limits, CollgCr, -4761.42), (Total square feet of basement area, 920.00, -4747.08)", 22 | "context": "The ML model predicts house prices", 23 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 24 | "narrative": "This house costs more because of its larger second floor (size=866), newer construction year (2001), and larger above ground space (size=1786). It’s cheaper due to its location in CollgCr and smaller basement (size=920)." 25 | }, 26 | { 27 | "explanation": "(Original construction date, 1915.00, -17966.77), (Physical locations within Ames city limits, Crawfor, 17703.26), (Second floor square feet, 756.00, 10129.96), (Total square feet of basement area, 756.00, -8362.22), (Condition of sale, Abnorml, -6786.66)", 28 | "context": "The ML model predicts house prices", 29 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 30 | "narrative": "This house is cheaper because it's older (year=1915), has a smaller basement (size=756), and the sale condition is abnormal. It’s more expensive due to its location in Crawfor and a larger second floor (size=756)." 
31 | }, 32 | { 33 | "explanation": "(Physical locations within Ames city limits, NoRidge, 23069.89), (Above ground living area square feet, 2198.00, 20125.75), (Second floor square feet, 1053.00, 18094.05), (Rates the overall material and finish of the house, 8.00, 9655.79), (Original construction date, 2000.00, 8192.46)", 34 | "context": "The ML model predicts house prices", 35 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 36 | "narrative": "This house is more expensive because it’s in No Ridge, has more above ground space (size=2198), a larger second floor (size=1053), better material quality (rating=8), and a newer construction year (2000)." 37 | }, 38 | { 39 | "explanation": "(Type of foundation, Wood, -18650.67), (Physical locations within Ames city limits, Mitchel, -13510.92), (Rates the overall material and finish of the house, 5.00, -10743.76), (Three season porch area in square feet, 320.00, 9959.33), (Bedrooms above ground, 1.00, 8905.73)", 40 | "context": "The ML model predicts house prices", 41 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 42 | "narrative": "This house is cheaper because it has a wood foundation, is located in Mitchel, and has lower material quality (rating=5). It’s more expensive due to a larger porch (size=320) and fewer bedrooms (count=1)." 
43 | }, 44 | { 45 | "explanation": "(Type 1 finished square feet, 1369.00, 14641.53), (Evaluates the height of the basement, Ex, 13233.24), (Total square feet of basement area, 1686.00, 12138.28), (Second floor square feet, 0.00, -10142.29), (Rates the overall material and finish of the house, 8.00, 9655.79)", 46 | "context": "The ML model predicts house prices", 47 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 48 | "narrative": "This house is more expensive because it has more finished space (size=1369), a taller basement (height=Ex), and better material quality (rating=8). It’s cheaper because it lacks a second floor (size=0)." 49 | }, 50 | { 51 | "explanation": "(Above ground living area square feet, 2090.00, 16382.07), (Second floor square feet, 983.00, 16216.99), (Physical locations within Ames city limits, NWAmes, -9769.73), (Type 1 finished square feet, 859.00, 6193.63), (Masonry veneer type, Stone, 5446.26)", 52 | "context": "The ML model predicts house prices", 53 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 54 | "narrative": "This house is more expensive due to its larger above ground space (size=2090), a bigger second floor (size=983), and a stone veneer. It’s cheaper because of its location in NWAmes." 55 | }, 56 | { 57 | "explanation": "(Lot area in square feet, 10000.00, 14876.45), (Garage size in square feet, 600.00, 12445.76), (Above ground living area square feet, 1500.00, 11122.58), (Physical locations within Ames city limits, SawyerW, -8734.25), (Year of remodel, 1995.00, -5231.12)", 58 | "context": "The ML model predicts house prices", 59 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 60 | "narrative": "This house is more expensive due to its larger lot area (size=10000), a larger garage (size=600), and more above ground living space (size=1500). 
It’s cheaper because of its location in SawyerW and an older remodel year (1995)." 61 | }, 62 | { 63 | "explanation": "(Central air conditioning, Yes, 13876.23), (Lot area in square feet, 9500.00, 12975.67), (Rates the overall condition of the house, 7.00, 10123.44), (Physical locations within Ames city limits, Blmngtn, -9243.76), (Total square feet of basement area, 850.00, -5123.89)", 64 | "context": "The ML model predicts house prices", 65 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 66 | "narrative": "This house is more expensive because it has central air conditioning, a large lot (size=9500), and a better overall condition (rating=7). It’s cheaper due to a smaller basement (size=850) and its location in Blmngtn." 67 | }, 68 | { 69 | "explanation": "(Garage size in square feet, 450.00, 12674.34), (Above ground living area square feet, 1800.00, 11754.29), (Rates the overall material and finish of the house, 7.00, 10342.76), (Physical locations within Ames city limits, OldTown, -8329.12), (Condition of sale, Partial, -5343.82)", 70 | "context": "The ML model predicts house prices", 71 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 72 | "narrative": "This house is more expensive due to a larger garage (size=450), more above ground living space (size=1800), and better material quality (rating=7). It’s cheaper because it’s located in OldTown and has a partial sale condition." 
73 | }, 74 | { 75 | "explanation": "(Lot area in square feet, 8500.00, 12346.22), (Three season porch area in square feet, 250.00, 9762.53), (Above ground living area square feet, 1400.00, 8793.24), (Physical locations within Ames city limits, ClearCr, -7865.43), (Year of remodel, 1985.00, -6531.44)", 76 | "context": "The ML model predicts house prices", 77 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format", 78 | "narrative": "This house is more expensive because it has a larger lot (size=8500), a three-season porch (size=250), and more above ground living space (size=1400). It’s cheaper due to its location in ClearCr and an older remodel year (1985)." 79 | }, 80 | { 81 | "explanation": "(Second floor square feet, 900.00, 12500.45), (Original construction date, 2002.00, 8600.78), (Above ground living area square feet, 1800.00, 5900.00), (Physical locations within Ames city limits, CollgCr, -4700.00), (Total square feet of basement area, 950.00, -4800.00)", 82 | "context": "The ML model predicts house prices", 83 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 84 | }, 85 | { 86 | "explanation": "(Original construction date, 1918.00, -17500.50), (Physical locations within Ames city limits, Crawfor, 17500.00), (Second floor square feet, 800.00, 10500.25), (Total square feet of basement area, 770.00, -8200.00), (Condition of sale, Abnorml, -6700.00)", 87 | "context": "The ML model predicts house prices", 88 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 89 | }, 90 | { 91 | "explanation": "(Physical locations within Ames city limits, NoRidge, 22000.45), (Above ground living area square feet, 2200.00, 20000.00), (Second floor square feet, 1100.00, 18500.00), (Rates the overall material and finish of the house, 8.00, 9700.00), (Original construction date, 1999.00, 8200.00)", 92 | "context": "The ML 
model predicts house prices", 93 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 94 | }, 95 | { 96 | "explanation": "(Type of foundation, Concrete, -18000.00), (Physical locations within Ames city limits, Mitchel, -13000.00), (Rates the overall material and finish of the house, 6.00, -10500.00), (Three season porch area in square feet, 350.00, 10000.00), (Bedrooms above ground, 2.00, 9000.00)", 97 | "context": "The ML model predicts house prices", 98 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 99 | }, 100 | { 101 | "explanation": "(Type 1 finished square feet, 1400.00, 15000.00), (Evaluates the height of the basement, Good, 13000.00), (Total square feet of basement area, 1700.00, 12000.00), (Second floor square feet, 0.00, -10000.00), (Rates the overall material and finish of the house, 7.00, 9500.00)", 102 | "context": "The ML model predicts house prices", 103 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 104 | }, 105 | { 106 | "explanation": "(Garage size in square feet, 600.00, 12000.00), (Central air conditioning, Yes, 9800.00), (First Floor square feet, 1100.00, -8500.00), (Physical locations within Ames city limits, Sawyer, -6400.00), (Type of foundation, Slab, -5300.00)", 107 | "context": "The ML model predicts house prices", 108 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 109 | }, 110 | { 111 | "explanation": "(Lot area in square feet, 9500.00, 15000.00), (Garage size in square feet, 720.00, 13000.00), (Rates the overall condition of the house, 7.00, 8600.00), (Physical locations within Ames city limits, NWAmes, -7500.00), (Second floor square feet, 0.00, -6900.00)", 112 | "context": "The ML model predicts house prices", 113 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, 
contribution) format" 114 | }, 115 | { 116 | "explanation": "(Rates the overall condition of the house, 8.00, 11500.00), (Fireplace quality, Excellent, 11000.00), (Garage size in square feet, 450.00, 8600.00), (Physical locations within Ames city limits, OldTown, -7300.00), (Second floor square feet, 0.00, -6400.00)", 117 | "context": "The ML model predicts house prices", 118 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 119 | }, 120 | { 121 | "explanation": "(Rates the overall material and finish of the house, 8.50, 18000.00), (Lot area in square feet, 10500.00, 14000.00), (Garage size in square feet, 750.00, 13000.00), (First Floor square feet, 2100.00, -10500.00), (Physical locations within Ames city limits, Meadow, -8400.00)", 122 | "context": "The ML model predicts house prices", 123 | "explanation_format": "SHAP feature contribution in (feature_name, feature_value, contribution) format" 124 | } 125 | ] 126 | -------------------------------------------------------------------------------- /explingo/tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "id": "initial_id", 6 | "metadata": { 7 | "is_executing": true, 8 | "ExecuteTime": { 9 | "end_time": "2024-10-16T19:13:13.237509Z", 10 | "start_time": "2024-10-16T19:13:13.186974Z" 11 | } 12 | }, 13 | "source": [ 14 | "import yaml \n", 15 | "import os\n", 16 | "\n", 17 | "with open(os.path.join(\"..\", \"keys.yaml\"), \"r\") as file:\n", 18 | " config = yaml.safe_load(file)\n", 19 | " openai_api_key = config[\"openai_api_key\"]" 20 | ], 21 | "outputs": [], 22 | "execution_count": 1 23 | }, 24 | { 25 | "cell_type": "code", 26 | "id": "94b1d6514180c940", 27 | "metadata": { 28 | "is_executing": true, 29 | "ExecuteTime": { 30 | "end_time": "2024-10-16T19:13:15.418493Z", 31 | "start_time": "2024-10-16T19:13:13.240494Z" 32 | } 33 | }, 34 | "source": [ 35 | "from 
explingo import Narrator, Grader \n", 36 | "\n", 37 | "example_narratives = [\n", 38 | " (\"(Above ground living area square feet, 1256.00, -12527.46), (Overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29)\", \n", 39 | " \"The house's living area size of around 1,200 sq. ft., lower quality materials (5/10), and lack of a second floor all reduce the house's value.\"),\n", 40 | " (\"(Second floor square feet, 854.00, 12757.84), (Original construction date, 2003.00, 9115.72)\",\n", 41 | " \"The house's large second floor of around 850 sq. ft and recent construction date of 2003 increase its value.\"),\n", 42 | " (\"(Overall material and finish of the house, 8.00, 10743.76), (Above ground living area square feet, 2000.00, 12527.46), (Second floor square feet, 1000.00, 10142.29)\",\n", 43 | " \"The house's high quality materials (8/10), large living area size of around 2,000 sq. ft., and a second floor of around 1,000 sq. ft. all increase the house's value.\"),\n", 44 | "]\n", 45 | "\n", 46 | "explanation_format = \"(feature name, feature value, SHAP feature contribution)\"\n", 47 | "context = \"The model predicts house prices\"\n", 48 | "\n", 49 | "narrator = Narrator(openai_api_key=openai_api_key,\n", 50 | " explanation_format=explanation_format,\n", 51 | " context=context,\n", 52 | " sample_narratives=example_narratives)\n" 53 | ], 54 | "outputs": [], 55 | "execution_count": 2 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "id": "225625fd117fec33", 60 | "metadata": {}, 61 | "source": [] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "id": "435b1b4daa990205", 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2024-10-16T19:13:15.496257Z", 69 | "start_time": "2024-10-16T19:13:15.420480Z" 70 | } 71 | }, 72 | "source": [ 73 | "explanation = \"(number of rooms, 11, 7020), (fireplace, yes, 12903)\"\n", 74 | "\n", 75 | "narrative = narrator.narrate(explanation)\n", 76 | "narrative" 77 | ], 78 | "outputs": [ 79 | { 
80 | "data": { 81 | "text/plain": [ 82 | "\"The house's large number of rooms (11) and the presence of a fireplace both increase its value.\"" 83 | ] 84 | }, 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "execution_count": 3 91 | }, 92 | { 93 | "cell_type": "code", 94 | "id": "1b3c383915180de", 95 | "metadata": { 96 | "ExecuteTime": { 97 | "end_time": "2024-10-16T19:13:15.622039Z", 98 | "start_time": "2024-10-16T19:13:15.504007Z" 99 | } 100 | }, 101 | "source": [ 102 | "grader = Grader(openai_api_key=openai_api_key, \n", 103 | " metrics=\"all\", \n", 104 | " sample_narratives=example_narratives)\n", 105 | "\n", 106 | "grader(explanation=explanation, explanation_format=explanation_format, narrative=narrative)" 107 | ], 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/plain": [ 112 | "accuracy 4.0\n", 113 | "completeness 4.0\n", 114 | "fluency 4.0\n", 115 | "conciseness 4.0\n", 116 | "dtype: float64" 117 | ] 118 | }, 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "execution_count": 4 125 | }, 126 | { 127 | "cell_type": "code", 128 | "id": "bbba0e6516ec56b2", 129 | "metadata": { 130 | "ExecuteTime": { 131 | "end_time": "2024-10-16T19:13:27.155656Z", 132 | "start_time": "2024-10-16T19:13:15.625039Z" 133 | } 134 | }, 135 | "source": [ 136 | "narrative_with_bootstrap = narrator.narrate(explanation, n_bootstrapped=1, grader=grader)\n", 137 | "narrative_with_bootstrap" 138 | ], 139 | "outputs": [ 140 | { 141 | "name": "stderr", 142 | "output_type": "stream", 143 | "text": [ 144 | " 0%| | 0/3 [00:00 narrative\n", 152 | " instructions=\"You are helping users understand an ML model's prediction. 
Given an explanation and information about the model,\\nconvert the explanation into a human-readable narrative.\"\n", 153 | " context = Field(annotation=str required=True json_schema_extra={'desc': 'what the ML model predicts', '__dspy_field_type': 'input', 'prefix': 'Context:'})\n", 154 | " explanation = Field(annotation=str required=True json_schema_extra={'desc': \"explanation of an ML model's prediction\", '__dspy_field_type': 'input', 'prefix': 'Explanation:'})\n", 155 | " explanation_format = Field(annotation=str required=True json_schema_extra={'desc': 'format the explanation is given in', '__dspy_field_type': 'input', 'prefix': 'Explanation Format:'})\n", 156 | " narrative = Field(annotation=str required=True json_schema_extra={'desc': 'human-readable narrative version of the explanation', '__dspy_field_type': 'output', 'prefix': 'Narrative:'})\n", 157 | ")), {'explanation': '(Above ground living area square feet, 1256.00, -12527.46), (Overall material and finish of the house, 5.00, -10743.76), (Second floor square feet, 0.00, -10142.29)', 'context': 'The model predicts house prices', 'explanation_format': '(feature name, feature value, SHAP feature contribution)'}, Prediction(\n", 158 | " narrative=\"Narrative: The house's relatively small above ground living area of around 1,256 sq. ft., average quality materials (5/10), and lack of a second floor all decrease the house's value.\"\n", 159 | "))]\n" 160 | ] 161 | }, 162 | { 163 | "name": "stderr", 164 | "output_type": "stream", 165 | "text": [ 166 | " 33%|███▎ | 1/3 [00:10<00:20, 10.27s/it]\n", 167 | " 0%| | 0/3 [00:00