├── probsem ├── __init__.py ├── __main__.py ├── abstract.py ├── utils.py ├── benchmarks.py ├── probsem.py └── models.py ├── Dockerfile ├── requirements.txt ├── test └── test_integration.py ├── inputs ├── test_A.json ├── test_B.json └── test.txt ├── .github └── workflows │ └── testing.yml ├── setup.py ├── Makefile ├── BENCHMARKS.md └── README.md /probsem/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from probsem.probsem import ProbSem 5 | 6 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | SHELL ["/bin/bash", "-c"] 4 | 5 | WORKDIR /app 6 | 7 | COPY ./ . 8 | 9 | RUN apt-get update 10 | RUN apt-get -y install make 11 | RUN make env 12 | 13 | ENTRYPOINT ["/bin/bash", "-c"] 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.4 2 | openai==0.25.0 3 | pandas==1.5.1 4 | torch==1.12.1 5 | transformers==4.23.1 6 | accelerate==0.13.2 7 | diskcache==5.6.1 8 | mypy==0.982 9 | lxml==4.9.1 10 | tqdm-stubs==0.2.1 11 | pylint==2.15.5 12 | pylint-json2html==0.4.0 13 | pylint-exit==1.2.0 14 | pytest==7.2.0 15 | pytest-html==3.2.0 16 | coverage==6.5.0 -------------------------------------------------------------------------------- /test/test_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from probsem.probsem import ProbSem 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "prompt, benchmark, model", 8 | [pytest.param("test", "A", "Salesforce/codegen-350M-mono")], 9 | ) 10 | def test_run(prompt, benchmark, model): 11 | ProbSem(prompt=prompt, test=benchmark, model=model).run() 12 | -------------------------------------------------------------------------------- /inputs/test_A.json: -------------------------------------------------------------------------------- 1 | { 2 | "pretext": "The following questions are based on the above text, and should be answered True or False.", 3 | "context": [ 4 | { 5 | "text": "The author is discussing the role of LLMs in Wikipedia editing.", 6 | "expected": 0 7 | }, { 8 | "text": "LLM use for wikipedia editing is accepted without restrictions.", 9 | "expected": 1 10 | } 11 | ], 12 | "posttext": "The correct answer is:", 13 | "queries": [ 14 | "True", 15 | "False" 16 | ] 17 | } -------------------------------------------------------------------------------- /inputs/test_B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pretext": "The following questions are based on the above text, and should be answered via Multiple Choice response.", 3 | "context": [ 4 | { 5 | "text": "Based on the above passage, the author believes that LLM use for writing is as a whole:\nA) Only positive.\nB) Mostly positive.\nC) Mostly negative.\nD) Only negative.", 6 | "expected": -1 7 | } 8 | ], 9 | "posttext": "The correct answer is:", 10 | "queries": [ 11 | "A", 12 | "B", 13 | "C", 14 | "D" 15 | ] 16 | } -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | 
push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | run_tests: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: checkout repo 16 | uses: actions/checkout@v3.0.0 17 | with: 18 | fetch-depth: 1 19 | 20 | - name: build docker image 21 | run: docker build . --file Dockerfile --tag probsem:latest 22 | 23 | - name: run testing pipeline 24 | run: docker run probsem:latest make test 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("README.md") as readme_file: 4 | readme = readme_file.read() 5 | 6 | requirements = [ 7 | "numpy", 8 | "openai", 9 | "pandas", 10 | "torch", 11 | "transformers", 12 | "accelerate", 13 | ] 14 | 15 | test_requirements = [ 16 | "mypy", 17 | "lxml", 18 | "tqdm-stubs", 19 | "pylint", 20 | "pylint-json2html", 21 | "pylint-exit", 22 | "pytest", 23 | "pytest-html", 24 | "coverage", 25 | ] 26 | 27 | setup( 28 | name="probsem", 29 | version="0.1.0", 30 | description="probabilistic semantic parsing via program synthesis", 31 | long_description=readme, 32 | author="Benjamin Lipkin", 33 | author_email="lipkinb@mit.edu", 34 | license="MIT", 35 | packages=find_packages(where="probsem"), 36 | install_requires=requirements, 37 | extras_require={"test": test_requirements}, 38 | python_requires=">=3.10", 39 | ) 40 | -------------------------------------------------------------------------------- /inputs/test.txt: -------------------------------------------------------------------------------- 1 | This next section will involve a reading comprehension test. 2 | 3 | Please read the following paragraph, and answer any questions that follow. 4 | 5 | The following was sourced from the top of https://en.wikipedia.org/wiki/Wikipedia:Large_language_models on 01/30/2023 6 | 7 | """ 8 | Large language models (LLMs) such as GPT-3 are increasingly being used to generate text. 9 | These tools should be used with care, since they can generate content that is biased, non-verifiable, constitutes original research, or fails to follow our other policies and guidelines. 10 | Editors retain full responsibility for LLM-assisted edits, which should still comply with all relevant Wikipedia policies. 11 | While the use of LLMs is not prohibited, their use should be reserved to experienced editors, who should carefully scrutinize their LLM-assisted edits before hitting Publish. 12 | The use of such programs to create whole articles or generate passages from scratch is forbidden. 13 | Furthermore, LLM use must be declared in the edit summary. 14 | """ 15 | 16 | Based on the above paragraph, please answer the following questions. 
17 | -------------------------------------------------------------------------------- /probsem/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | from probsem.probsem import ProbSem 5 | from probsem.abstract import Object 6 | 7 | 8 | class CLI(Object): 9 | def __init__(self) -> None: 10 | super().__init__() 11 | self._parser = argparse.ArgumentParser() 12 | self._parser.add_argument("--prompt", required=True) 13 | self._parser.add_argument("--test", required=True) 14 | self._parser.add_argument("--model", default="code-davinci-002") 15 | self._parser.add_argument("--norm", default=False, action="store_true") 16 | self._parser.add_argument("--temp", default=1.0, type=float) 17 | self._parser.add_argument("--input_dir", default="") 18 | self._parser.add_argument("--output_dir", default="") 19 | self._parser.add_argument("--cache_dir", default="") 20 | 21 | def run_main(self) -> None: 22 | start = datetime.datetime.now() 23 | ProbSem(**vars(self._parser.parse_args())).run() 24 | elapsed = datetime.datetime.now() - start 25 | self.info(f"Completed successfully in {elapsed}.") 26 | 27 | 28 | if __name__ == "__main__": 29 | CLI().run_main() 30 | -------------------------------------------------------------------------------- /probsem/abstract.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | import pathlib 4 | import typing 5 | 6 | import numpy as np 7 | 8 | 9 | class Object(abc.ABC): 10 | def __init__(self) -> None: 11 | self._base = pathlib.Path(__file__).parents[1] 12 | self._name = self.__class__.__name__ 13 | self._logger = logging.getLogger(self._name) 14 | 15 | def _log(self, message: str, level: str, offset: int) -> None: 16 | assert hasattr(self._logger, level) 17 | if "\n" in message: 18 | lines = message.split("\n") 19 | else: 20 | lines = [message] 21 | for line in lines: 22 | formatted = f"{' ' * (offset - len(self._name))}{line}" 23 | getattr(self._logger, level)(formatted) 24 | 25 | def info(self, message: str) -> None: 26 | self._log(message, "info", 20) 27 | 28 | def warn(self, message: str) -> None: 29 | self._log(message, "warning", 17) 30 | 31 | def __setattr__(self, name: str, value: typing.Any) -> None: 32 | super().__setattr__(name, value) 33 | 34 | def __getattribute__(self, name: str) -> typing.Any: 35 | return super().__getattribute__(name) 36 | 37 | 38 | @typing.runtime_checkable 39 | class IModel(typing.Protocol): 40 | def __init__(self, model_id: str) -> None: 41 | raise NotImplementedError() # pragma: no cover 42 | 43 | def score(self, full_text: str, eval_text: str) -> typing.Tuple[np.float64, int]: 44 | raise NotImplementedError() # pragma: no cover 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /usr/bin/env bash 2 | EXEC = python=3.10 3 | PACKAGE = probsem 4 | INSTALL = python -m pip install 5 | ACTIVATE = source activate $(PACKAGE) 6 | .DEFAULT_GOAL := help 7 | 8 | ## help : print available build commands. 9 | .PHONY : help 10 | help : Makefile 11 | @sed -n 's/^##//p' $< 12 | 13 | ## update : update repo with latest version from GitHub. 14 | .PHONY : update 15 | update : 16 | @git pull origin main 17 | 18 | ## env : setup environment and install dependencies. 
19 | .PHONY : env 20 | env : $(PACKAGE).egg-info/ 21 | $(PACKAGE).egg-info/ : setup.py requirements.txt 22 | @conda create -yn $(PACKAGE) $(EXEC) 23 | @$(ACTIVATE) ; $(INSTALL) -r requirements.txt ; $(INSTALL) -e ".[test]" 24 | 25 | ## test : run testing pipeline. 26 | .PHONY : test 27 | test : mypy pylint pytest 28 | mypy : env html/mypy/index.html 29 | pylint : env html/pylint/index.html 30 | pytest : env html/coverage/index.html 31 | html/mypy/index.html : $(PACKAGE)/*.py 32 | @$(ACTIVATE) ; mypy \ 33 | -p $(PACKAGE) \ 34 | --ignore-missing-imports \ 35 | --html-report $(@D) 36 | html/pylint/index.html : html/pylint/index.json 37 | @$(ACTIVATE) ; pylint-json2html -o $@ -e utf-8 $< 38 | html/pylint/index.json : $(PACKAGE)/*.py 39 | @mkdir -p $(@D) 40 | @$(ACTIVATE) ; pylint $(PACKAGE) \ 41 | --disable C0114,C0115,C0116 \ 42 | --generated-members torch.* \ 43 | --output-format=colorized,json:$@ \ 44 | || pylint-exit $$? 45 | html/coverage/index.html : html/pytest/report.html 46 | @$(ACTIVATE) ; coverage html -d $(@D) 47 | html/pytest/report.html : $(PACKAGE)/*.py test/*.py 48 | @$(ACTIVATE) ; coverage run --branch -m pytest \ 49 | --html=$@ --self-contained-html 50 | -------------------------------------------------------------------------------- /probsem/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import typing 3 | 4 | import numpy as np 5 | import numpy.typing as npt 6 | 7 | 8 | def sanitize_filename(text: str) -> str: 9 | return re.sub(r"^[ .]|[/<>:\"\\|?*]+|[ .]$", "-", text) 10 | 11 | 12 | def normalize(weights: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: 13 | return np.exp(weights) / np.sum(np.exp(weights)) 14 | 15 | 16 | def print_sample(sample: typing.Dict[str, typing.Any]) -> str: 17 | ostream = [] 18 | ostream.append("\nText:") 19 | ostream.append('"""') 20 | ostream.append(f"{sample['text'][0]}") 21 | ostream.append('"""') 22 | ostream.append("\nScores:") 23 | ostream.append('"""') 24 | for _, (query, score) in enumerate(zip(sample["queries"], sample["scores"])): 25 | ostream.append(f"{score:.3f}\t{query}") 26 | ostream.append('"""') 27 | if sample["correct"] == -1: 28 | ostream.append("") 29 | elif np.argmax(sample["scores"]) == sample["correct"]: 30 | ostream.append("\n" + "TEST SAMPLE PASSED." + "\n") 31 | else: 32 | ostream.append("\n" + "TEST SAMPLE FAILED." 
+ "\n") 33 | return "\n".join([30 * "_"] + ostream + [30 * "_"]) 34 | 35 | 36 | def print_summary(samples: typing.List[typing.Dict[str, typing.Any]]) -> str: 37 | scores = np.array([s["scores"] for s in samples]) 38 | indices = np.array([s["correct"] for s in samples]) 39 | if -1 in indices: 40 | accuracy = np.float64(np.nan) 41 | else: 42 | correct = scores[np.arange(indices.size), indices] == scores.max(axis=1) 43 | accuracy = correct.mean() 44 | ostream = [] 45 | ostream.append(f"TEST SUITE ACCURACY:\t{accuracy:.3f}") 46 | return "\n".join([30 * "_"] + ostream + [30 * "_"]) 47 | -------------------------------------------------------------------------------- /probsem/benchmarks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | import typing 4 | 5 | from probsem.abstract import Object 6 | 7 | 8 | class Prompt(Object): 9 | def __init__(self, prompt: str, input_dir: pathlib.Path) -> None: 10 | super().__init__() 11 | self._text = self._load(prompt, input_dir) 12 | 13 | @property 14 | def text(self) -> str: 15 | return self._text 16 | 17 | def _load(self, prompt: str, input_dir: pathlib.Path) -> str: 18 | prompt_file = input_dir / f"{prompt}.txt" 19 | with open(prompt_file, "r", encoding="utf-8") as fstream: 20 | return fstream.read() 21 | 22 | 23 | class TestSuite(Object): 24 | def __init__(self, prompt: str, suite: str, input_dir: pathlib.Path) -> None: 25 | super().__init__() 26 | self._suite = self._load(prompt, suite, input_dir) 27 | 28 | @property 29 | def samples(self) -> typing.Iterator[typing.Tuple[int, typing.List[str]]]: 30 | def _sample(i: int) -> str: 31 | parts = [self._pretext, example["text"], self._posttext, self._queries[i]] 32 | parts = [p for p in parts if p != ""] 33 | return "\n".join(parts) 34 | 35 | for example in self._context: 36 | index = example["expected"] 37 | samples = [_sample(i) for i in range(len(self._queries))] 38 | yield index, samples 39 | 40 | @property 41 | def n_examples(self) -> int: 42 | return len(self._context) 43 | 44 | @property 45 | def n_queries(self) -> int: 46 | return len(self._queries) 47 | 48 | @property 49 | def _pretext(self) -> str: 50 | assert "pretext" in self._suite 51 | assert isinstance(self._suite["pretext"], str) 52 | return self._suite["pretext"] 53 | 54 | @property 55 | def _context(self) -> typing.List[typing.Dict[str, typing.Any]]: 56 | assert "context" in self._suite 57 | assert isinstance(self._suite["context"], list) 58 | assert len(self._suite["context"]) > 0 59 | assert all(isinstance(c, dict) for c in self._suite["context"]) 60 | assert all(("text" in c) and ("expected" in c) for c in self._suite["context"]) 61 | assert all(isinstance(c["text"], str) for c in self._suite["context"]) 62 | assert all(isinstance(c["expected"], int) for c in self._suite["context"]) 63 | return self._suite["context"] 64 | 65 | @property 66 | def _posttext(self) -> str: 67 | assert "posttext" in self._suite 68 | assert isinstance(self._suite["posttext"], str) 69 | return self._suite["posttext"] 70 | 71 | @property 72 | def _queries(self) -> list[str]: 73 | assert "queries" in self._suite 74 | assert isinstance(self._suite["queries"], list) 75 | assert all(isinstance(p, str) for p in self._suite["queries"]) 76 | return self._suite["queries"] 77 | 78 | def _load(self, prompt: str, suite: str, input_dir: pathlib.Path) -> dict: 79 | suite_file = input_dir / f"{prompt}_{suite}.json" 80 | with open(suite_file, "r", encoding="utf-8") as fstream: 81 | return 
json.load(fstream) 82 | -------------------------------------------------------------------------------- /probsem/probsem.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import pathlib 3 | import typing 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | from probsem.abstract import Object 10 | from probsem.benchmarks import Prompt, TestSuite 11 | from probsem.models import Model 12 | from probsem.utils import normalize, print_sample, print_summary, sanitize_filename 13 | 14 | 15 | class ProbSem(Object): 16 | def __init__( 17 | self, 18 | prompt: str, 19 | test: str, 20 | model: str, 21 | norm: bool = False, 22 | temp: float = 1.0, 23 | input_dir: str | pathlib.Path = "", 24 | output_dir: str | pathlib.Path = "", 25 | cache_dir: str | pathlib.Path = "", 26 | ) -> None: 27 | super().__init__() 28 | if input_dir == "": 29 | input_dir = self._base / "inputs" 30 | if output_dir == "": 31 | output_dir = self._base / "outputs" 32 | if cache_dir == "": 33 | cache_dir = self._base / ".cache" 34 | self._input_dir = pathlib.Path(input_dir) 35 | self._output_dir = pathlib.Path(output_dir) 36 | self._cache_dir = pathlib.Path(cache_dir) 37 | self._run_id = sanitize_filename(f"{prompt}_{test}_{model}") 38 | self._prompt = Prompt(prompt, self._input_dir) 39 | self._suite = TestSuite(prompt, test, self._input_dir) 40 | self._model = Model(model, norm, temp, self._cache_dir) 41 | 42 | @property 43 | def _samples(self) -> typing.Iterable[typing.Dict[str, typing.Any]]: 44 | for index, samples in self._suite.samples: 45 | iterator = range(len(samples)) 46 | yield { 47 | "prompt": [self._prompt.text for _ in iterator], 48 | "text": ["\n".join(samples[i].split("\n")[:-1]) for i in iterator], 49 | "queries": [samples[i].split("\n")[-1] for i in iterator], 50 | "correct": index, 51 | } 52 | 53 | def _score(self, prompt: str, text: str, query: str) -> np.float64: 54 | full_text = "\n".join([prompt, text, query]) 55 | return self._model.score(full_text, query) 56 | 57 | def _export_results_table( 58 | self, samples: typing.List[typing.Dict[str, typing.Any]] 59 | ) -> None: 60 | fname = self._output_dir / f"{self._run_id}_results.csv" 61 | fname.parent.mkdir(parents=True, exist_ok=True) 62 | table = collections.defaultdict(list) 63 | for sample in samples: 64 | table["text"].extend(sample["text"]) 65 | table["query"].extend(sample["queries"]) 66 | table["logp"].extend(sample["logp"]) 67 | table["score"].extend(sample["scores"]) 68 | pd.DataFrame(table).to_csv(fname, index=False) 69 | 70 | def run(self) -> typing.List[typing.Dict[str, typing.Any]]: 71 | samples = [] 72 | for sample in tqdm(self._samples, total=self._suite.n_examples): 73 | assert len(set(sample["prompt"])) == 1 74 | assert len(set(sample["text"])) == 1 75 | assert len(set(sample["queries"])) == self._suite.n_queries 76 | sample["logp"] = [] 77 | for prompt, text, query in zip( 78 | sample["prompt"], sample["text"], sample["queries"] 79 | ): 80 | sample["logp"].append(self._score(prompt, text, query)) 81 | sample["logp"] = np.array(sample["logp"]) 82 | sample["scores"] = normalize(sample["logp"]) 83 | samples.append(sample) 84 | self.info(print_sample(sample)) 85 | self.info(print_summary(samples)) 86 | self._export_results_table(samples) 87 | return samples 88 | -------------------------------------------------------------------------------- /BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | # Benchmark 
Walkthrough 2 | 3 | As explained in the [README.md](https://github.com/benlipkin/probsem/blob/main/README.md), evaluations of new benchmarks require materials to be prepared in a specific format. The critical components are a `Prompt` and at least one `TestSuite`. 4 | 5 | ## Prompts 6 | 7 | Your prompt should define any general context you'd like to be prepended to all your evaluations. This often includes a description of the domain of interest, general instructions, sometimes few-shot examples, and generally any text that should be available to your model across your evaluations. 8 | 9 | For example, a reading comprehension task might include an introduction, the passage itself, and instructions to answer the questions that follow. See `inputs/test.txt` for an example. 10 | 11 | ``` 12 | This next section will involve a reading comprehension test. 13 | 14 | Please read the following paragraph, and answer any questions that follow. 15 | 16 | The following was sourced from the top of https://en.wikipedia.org/wiki/Wikipedia:Large_language_models on 01/30/2023 17 | 18 | """ 19 | Large language models (LLMs) such as GPT-3 are increasingly being used to generate text. 20 | These tools should be used with care, since they can generate content that is biased, non-verifiable, constitutes original research, or fails to follow our other policies and guidelines. 21 | Editors retain full responsibility for LLM-assisted edits, which should still comply with all relevant Wikipedia policies. 22 | While the use of LLMs is not prohibited, their use should be reserved to experienced editors, who should carefully scrutinize their LLM-assisted edits before hitting Publish. 23 | The use of such programs to create whole articles or generate passages from scratch is forbidden. 24 | Furthermore, LLM use must be declared in the edit summary. 25 | """ 26 | 27 | Based on the above paragraph, please answer the following questions. 28 | 29 | ``` 30 | 31 | ## TestSuites 32 | 33 | Next, you can develop a series of test suites to evaluate individual collections of test cases for a prompt. For example, for our reading comprehension task, each unique class of question about our passage would typically be placed in a separate test suite. 34 | 35 | Within each test suite JSON file, there are several components: the `pretext`, `context`, `posttext`, and `queries`. The `pretext` is concatenated immediately after the prompt in each LLM evaluation. It provides any test-suite-specific instructions or specifications. For example, two test suites might require two different question formats, one T/F and one multiple-choice. The `posttext`, which wraps the `context` on the other side, is also static across a test suite. Both `pretext` and `posttext` are optional, and can be left as an empty string `""` if not needed. 36 | 37 | The core manipulation of any test suite is in the `context` and the `queries`. For a given test suite, the full cross-product of these is evaluated. For example, a user might specify several unique questions, but want to evaluate the labels `True` vs. `False` for each of them. Pasted below is an example from `inputs/test_A.json` that does just this.
38 | 39 | ```json 40 | { 41 | "pretext": "The following questions are based on the above text, and should be answered True or False.", 42 | "context": [ 43 | { 44 | "text": "The author is discussing the role of LLMs in Wikipedia editing.", 45 | "expected": 0 46 | }, { 47 | "text": "LLM use for wikipedia editing is accepted without restrictions.", 48 | "expected": 1 49 | } 50 | ], 51 | "posttext": "The correct answer is:", 52 | "queries": [ 53 | "True", 54 | "False" 55 | ] 56 | } 57 | ``` 58 | 59 | As you can see, for a single pretext and posttext, a variety of questions of similar form can be iterated over in context, and for each, logit scores or a normalized probability distribution over the queries can be returned. 60 | 61 | You may also note that each context example includes an integer under the `expected` key. This integer maps to the index of the query expected to have the maximum score. This allows for automatic accuracy evaluation. If, however, there is no ground-truth answer, this value can be substituted with `-1`, which disables automated scoring. 62 | 63 | Let's check out `inputs/test_B.json` next, which poses a different question format and has no a-priori correct answer. 64 | 65 | ```json 66 | { 67 | "pretext": "The following questions are based on the above text, and should be answered via Multiple Choice response.", 68 | "context": [ 69 | { 70 | "text": "Based on the above passage, the author believes that LLM use for writing is as a whole:\nA) Only positive.\nB) Mostly positive.\nC) Mostly negative.\nD) Only negative.", 71 | "expected": -1 72 | } 73 | ], 74 | "posttext": "The correct answer is:", 75 | "queries": [ 76 | "A", 77 | "B", 78 | "C", 79 | "D" 80 | ] 81 | } 82 | ``` 83 | 84 | As shown above, the text of the `pretext`, `context`, `posttext`, or any of the `queries` may also be a multiline entry. However, for JSON parsing purposes, these must be written on a single line, and the character `\n` used explicitly to represent any line breaks. 85 | -------------------------------------------------------------------------------- /probsem/models.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import hashlib 3 | import pathlib 4 | import time 5 | import typing 6 | import warnings 7 | 8 | import diskcache 9 | import numpy as np 10 | import openai 11 | import torch 12 | 13 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM 14 | 15 | from probsem.abstract import Object, IModel 16 | 17 | openai.api_key_path = str(pathlib.Path.home() / ".openai_api_key") 18 | 19 | 20 | class Model(Object): 21 | def __init__( 22 | self, model_id: str, norm: bool, temp: float, cache_dir: pathlib.Path 23 | ) -> None: 24 | super().__init__() 25 | self._id = model_id 26 | self._norm = norm 27 | if temp <= 0: 28 | raise ValueError("Temperature must be positive.") 29 | self._temp = temp 30 | self._model: IModel 31 | openai_engines = [engine["id"] for engine in openai.Engine.list()["data"]] 32 | if self._id in openai_engines: 33 | self.info("Model ID found in OpenAI engines.") 34 | setattr(self, "_model", OpenAIModel(self._id)) 35 | else: 36 | self.info("Model ID not found in OpenAI engines. 
Checking HuggingFace.") 37 | setattr(self, "_model", HuggingFaceModel(self._id)) 38 | self._cache = diskcache.Cache(cache_dir) 39 | 40 | def score( 41 | self, 42 | full_text: str, 43 | eval_text: str, 44 | ) -> np.float64: 45 | key_id = "_".join([self._id, full_text, eval_text]) 46 | key = hashlib.sha256(key_id.encode("utf-8")).hexdigest() 47 | if key in self._cache: 48 | logp, num_eval = self._cache[key] 49 | else: 50 | logp, num_eval = self._model.score(full_text, eval_text) 51 | self._cache[key] = logp, num_eval 52 | if self._norm: 53 | logp /= num_eval 54 | return logp / self._temp 55 | 56 | 57 | class OpenAIModel(Object, IModel): 58 | def __init__(self, model_id: str) -> None: 59 | super().__init__() 60 | self._id = model_id 61 | self.info(f"Selected OpenAI {self._id} model.") 62 | 63 | def _get_response( 64 | self, text: str, retry_after=10 65 | ) -> openai.openai_object.OpenAIObject: 66 | try: 67 | return openai.Completion.create( 68 | engine=self._id, 69 | prompt=text, 70 | max_tokens=0, 71 | logprobs=0, 72 | echo=True, 73 | ) 74 | except (openai.error.RateLimitError, openai.error.APIError) as e: 75 | self.warn(f"Rate limit exceeded. Retrying after {retry_after} seconds.") 76 | time.sleep(retry_after) 77 | return self._get_response(text, retry_after * 2) 78 | 79 | def score(self, full_text: str, eval_text: str) -> typing.Tuple[np.float64, int]: 80 | full_resp = self._get_response(full_text) 81 | try: 82 | eval_resp = self._get_response(eval_text) 83 | num_eval = eval_resp["usage"]["total_tokens"] 84 | except openai.error.InvalidRequestError: 85 | num_eval = 1 if eval_text else 0 86 | logp = np.sum(full_resp["choices"][0]["logprobs"]["token_logprobs"][-num_eval:]) 87 | return logp, num_eval 88 | 89 | 90 | class HuggingFaceModel(Object, IModel): 91 | def __init__(self, model_id: str) -> None: 92 | super().__init__() 93 | self._id = model_id 94 | self.info(f"Attempting to load HuggingFace {self._id} model...") 95 | try: 96 | self._config = AutoConfig.from_pretrained(self._id) 97 | self._tokenizer = AutoTokenizer.from_pretrained( 98 | self._id, add_prefix_space=True 99 | ) 100 | self._model = AutoModelForCausalLM.from_pretrained( 101 | self._id, torch_dtype=torch.float32, low_cpu_mem_usage=True 102 | ) 103 | self._model.eval() 104 | except Exception as invalid_id: 105 | raise ValueError( 106 | "model must be valid HuggingFace CausalLM." 
107 | ) from invalid_id 108 | self._set_torch_device() 109 | self.info(f"Successfully loaded pretrained {self._id} model on {self._device}.") 110 | 111 | def _set_torch_device(self) -> None: 112 | if torch.cuda.is_available(): 113 | self._device = torch.device("cuda") 114 | torch.set_default_tensor_type(torch.cuda.FloatTensor) # type: ignore 115 | try: 116 | self._model = self._model.to(self._device) 117 | return 118 | except RuntimeError: 119 | self._device = torch.device("cpu") 120 | torch.set_default_tensor_type(torch.FloatTensor) 121 | self._model = self._model.to(self._device) 122 | else: 123 | self._device = torch.device("cpu") 124 | torch.set_default_tensor_type(torch.FloatTensor) 125 | self._model = self._model.to(self._device) 126 | 127 | @functools.lru_cache(maxsize=128) 128 | def _encode_text(self, text: str) -> typing.Dict[str, torch.Tensor]: 129 | return self._tokenizer(text, return_tensors="pt").to(self._device) 130 | 131 | def _decode_text(self, tokens: torch.Tensor) -> str: 132 | return self._tokenizer.decode(tokens, skip_special_tokens=True) 133 | 134 | def score(self, full_text: str, eval_text: str) -> typing.Tuple[np.float64, int]: 135 | with torch.no_grad(): 136 | inputs = self._encode_text(full_text) 137 | num_eval = self._encode_text(eval_text)["input_ids"].shape[1] 138 | tokens = inputs["input_ids"] 139 | mask = inputs["attention_mask"] 140 | with warnings.catch_warnings(): 141 | warnings.simplefilter("ignore") 142 | outputs = self._model(input_ids=tokens, attention_mask=mask) 143 | loss = torch.nn.CrossEntropyLoss(reduction="none")( 144 | outputs.logits[..., :-1, :] 145 | .contiguous() 146 | .view(-1, outputs.logits.size(-1)), 147 | tokens[..., 1:].contiguous().view(-1), 148 | ).view(tokens.size(0), tokens.size(-1) - 1) 149 | loss = loss * mask[..., 1:].contiguous() 150 | loss = loss[:, -num_eval:].sum(dim=1) 151 | logp = -loss.cpu().detach().item() 152 | return logp, num_eval 153 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/benlipkin/probsem/actions/workflows/testing.yml/badge.svg)](https://github.com/benlipkin/probsem/actions/workflows/testing.yml) [![DOI](https://zenodo.org/badge/558137294.svg)](https://zenodo.org/badge/latestdoi/558137294) 2 | 3 | # ProbSem 4 | 5 | ## Deprecation Notice 6 | 7 | ⚠️ This project is functional, but is no longer being actively maintained. I recommend using [minicons](https://github.com/kanishkamisra/minicons) for most LLM scoring needs. If you'd like to replicate any paper results using probsem, the paper branches are still supported. 8 | 9 | ## Summary 10 | 11 | This repository provides a framework to leverage large language models (LLMs) to assign context-conditional probability distributions over queried strings, with default support for all OpenAI engines and HuggingFace CausalLM models. 12 | 13 | It is intended to be flexible across a wide range of research applications spanning linguistics, cognitive science, program synthesis, and NLP. 14 | 15 | Here are a few examples: 16 | 17 | - Cloze Completion Task 18 | ```bash 19 | .. prompt, task instructions .. 20 | context: The color of the Boston sky during January is 21 | query1: blue # P=0.4 22 | query2: gray # P=0.6 23 | ``` 24 | 25 | - Multiple Choice QA 26 | ```bash 27 | .. prompt, task instructions .. 28 | context: The girl pushed the boy. 29 | posttext: Which of the following logically entails? 
30 | A: The girl was pushed by the boy. 31 | B: The boy was pushed by the boy. 32 | C: The boy was pushed by the girl. 33 | D: The girl was pushed by the girl. 34 | The correct response is: 35 | query1: A # P=0.03 36 | query2: B # P=0.01 37 | query3: C # P=0.95 38 | query4: D # P=0.01 39 | ``` 40 | 41 | - Semantic Parsing 42 | ```scheme 43 | .. prompt, task instructions .. 44 | pretext: ;; Player strengths were distributed ~N(50,20) 45 | context: ;; X has nearly average strength. 46 | query1: (λ (x) (= (abs (- (strength x) 50)) 0)) ;; P=0.1 47 | query2: (λ (x) (< (abs (- (strength x) 50)) 10)) ;; P=0.9 48 | ``` 49 | 50 | - Code completion 51 | ```python 52 | .. prompt, task instructions .. 53 | context: def reverse(lst:list): 54 | query1: return lst[::-1] # P=0.40 55 | query2: return reversed(lst) # P=0.30 56 | query3: lst.reverse() # P=0.20 57 | query4: list.reverse(lst) # P=0.10 58 | ``` 59 | 60 | In each of these examples, a user may define a flexible frame of reference using the concatenation of a `prompt`, `context`, and optional `pretext` and `posttext`, which wrap the `context`, to derive a probability distribution over possible completions defined as `queries`. The precise formulation of such evaluations can be explored further by viewing the examples in the `inputs` folder or checking out the [BENCHMARKS.md](https://github.com/benlipkin/probsem/blob/main/BENCHMARKS.md) walkthrough. 61 | 62 | ### Version Note 63 | 64 | _The name of this repository `ProbSem` is a legacy reference to the original use case for which it was developed: Evaluations of **Prob**abilistic **Sem**antics and Pragmatics. It was generalized into its current form after collaborators and colleagues expressed interest._ 65 | 66 | As such, the `main` branch is under development and evolving. To replicate specific papers, `git checkout` the corresponding paper branch and follow the instructions in the associated `README.md`. 67 | 68 | ## Getting Started 69 | 70 | ### Download the repo: 71 | ```bash 72 | git clone --branch main --depth 1 git@github.com:benlipkin/probsem.git 73 | ``` 74 | ### Build environment: 75 | 76 | _Note: Multiple installation strategies are provided._ 77 | 78 | - [Anaconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html), [Make](https://www.gnu.org/software/make/manual/make.html): automatically build and populate virtual environment (recommended). 79 | ```bash 80 | make env 81 | ``` 82 | You can test the installation via: 83 | ```bash 84 | make test 85 | ``` 86 | 87 | 88 | - pip[strict]: install the exact dependencies used during development into the current environment. 89 | ```bash 90 | python -m pip install -r requirements.txt 91 | ``` 92 | 93 | - pip[flexible]: install general dependencies with fewer version specifications, at the discretion of the user. 94 | ```bash 95 | python -m pip install -e . 96 | ``` 97 | 98 | ### Setup API Key: 99 | To use OpenAI models, an API key must be placed at `~/.openai_api_key`. 100 | 101 | ## Run 102 | 103 | The first step is to generate your benchmark. This includes, at minimum, a `Prompt` file and one `TestSuite`. See [BENCHMARKS.md](https://github.com/benlipkin/probsem/blob/main/BENCHMARKS.md) for more info on the structure of these files. 104 | 105 | ```bash 106 | nano inputs/prompt.txt 107 | nano inputs/prompt_testsuite.json 108 | ``` 109 | 110 | Once a prompt and test suite are defined, they can be evaluated at the command line. For a given prompt `prompt` and test suite `testsuite`, as shown above, the following syntax can be used for evaluation.
111 | 112 | ### CLI 113 | 114 | ```bash 115 | python -m probsem --prompt prompt --test testsuite 116 | ``` 117 | 118 | The prompt `*.txt` file and test suite `*.json` file must share the same prefix (`prompt` above) to be linked, and are assumed by default to exist in the `inputs` folder. This default, and others, can be overridden. See below. 119 | 120 | Optional arguments (and other relevant internal details): 121 | 122 | - `--input_dir [STR] {default: "inputs"}` Update path to the directory containing the benchmark files to be read in. 123 | - `--output_dir [STR] {default: "outputs"}` Update path to the directory where output files should be saved. On each run, a CSV is saved with the resulting scores. 124 | - `--model [STR] {default: "code-davinci-002"}` Customize the model used for scoring. All OpenAI API engines and HuggingFace CausalLM models are currently supported. HF models run on GPU when available and fall back to CPU otherwise. 125 | - `--norm [BOOL True] {default: False}` This flag can be used to turn on normalization. By default, the returned scores reflect the sum of the context-conditional log-probabilities of the query tokens. When this flag is passed, these values are normalized by the number of tokens, uniquely for each tokenizer. 126 | - `--temp [FLOAT >0] {default: 1.0}` Following the derivation of individual query-level scores, a probability distribution over the batch of queries is calculated by passing the array of logit scores to a softmax function with temperature parameter $\alpha$. Specifying $\alpha<1.0$ decreases the entropy of the returned multinomial distribution and $\alpha>1.0$ increases the entropy. Entropy can be thought of qualitatively as inverse to the _peakiness_ of the distribution, being maximized at the uniform distribution and returning $0$ when all probability mass is on a single value. 127 | 128 | ### API 129 | 130 | An API is also supported for integration with existing applications. To run the same default example from above, the following code will suffice. All optional parameters are available as well. 131 | 132 | ```python 133 | from probsem.probsem import ProbSem 134 | 135 | probsem = ProbSem( 136 | prompt="prompt", 137 | test="testsuite", 138 | ) 139 | results = probsem.run() 140 | ``` 141 | 142 | ## Issues/Contributing 143 | 144 | If you find any particular aspects of this repository unclear, or if you encounter any errors, please open an issue. Comments on documentation, examples, and clarity are also appreciated. If you find an issue and have ideas on how to address it, feel free to open a pull request. Community contributions are greatly appreciated. 145 | 146 | ## Citation 147 | 148 | ```bibtex 149 | @software{LipkinProbSem2023, 150 | author = {Lipkin, Benjamin}, 151 | title = {ProbSem}, 152 | year = {2023}, 153 | url = {https://github.com/benlipkin/probsem}, 154 | doi = {10.5281/zenodo.7603078} 155 | } 156 | ``` 157 | 158 | ## License 159 | 160 | [![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT) 161 | --------------------------------------------------------------------------------