├── probsem ├── __init__.py ├── __main__.py ├── abstract.py ├── utils.py ├── benchmarks.py ├── probsem.py └── models.py ├── Dockerfile ├── requirements.txt ├── test └── test_integration.py ├── inputs ├── test_A.json ├── test_B.json └── test.txt ├── .github └── workflows │ └── testing.yml ├── setup.py ├── Makefile ├── BENCHMARKS.md └── README.md /probsem/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | from probsem.probsem import ProbSem 5 | 6 | logging.basicConfig(stream=sys.stdout, level=logging.INFO) 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | SHELL ["/bin/bash", "-c"] 4 | 5 | WORKDIR /app 6 | 7 | COPY ./ . 8 | 9 | RUN apt-get update 10 | RUN apt-get -y install make 11 | RUN make env 12 | 13 | ENTRYPOINT ["/bin/bash", "-c"] 14 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.4 2 | openai==0.25.0 3 | pandas==1.5.1 4 | torch==1.12.1 5 | transformers==4.23.1 6 | accelerate==0.13.2 7 | diskcache==5.6.1 8 | mypy==0.982 9 | lxml==4.9.1 10 | tqdm-stubs==0.2.1 11 | pylint==2.15.5 12 | pylint-json2html==0.4.0 13 | pylint-exit==1.2.0 14 | pytest==7.2.0 15 | pytest-html==3.2.0 16 | coverage==6.5.0 -------------------------------------------------------------------------------- /test/test_integration.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from probsem.probsem import ProbSem 4 | 5 | 6 | @pytest.mark.parametrize( 7 | "prompt, benchmark, model", 8 | [pytest.param("test", "A", "Salesforce/codegen-350M-mono")], 9 | ) 10 | def test_run(prompt, benchmark, model): 11 | ProbSem(prompt=prompt, test=benchmark, model=model).run() 12 | -------------------------------------------------------------------------------- /inputs/test_A.json: -------------------------------------------------------------------------------- 1 | { 2 | "pretext": "The following questions are based on the above text, and should be answered True or False.", 3 | "context": [ 4 | { 5 | "text": "The author is discussing the role of LLMs in Wikipedia editing.", 6 | "expected": 0 7 | }, { 8 | "text": "LLM use for wikipedia editing is accepted without restrictions.", 9 | "expected": 1 10 | } 11 | ], 12 | "posttext": "The correct answer is:", 13 | "queries": [ 14 | "True", 15 | "False" 16 | ] 17 | } -------------------------------------------------------------------------------- /inputs/test_B.json: -------------------------------------------------------------------------------- 1 | { 2 | "pretext": "The following questions are based on the above text, and should be answered via Multiple Choice response.", 3 | "context": [ 4 | { 5 | "text": "Based on the above passage, the author believes that LLM use for writing is as a whole:\nA) Only positive.\nB) Mostly positive.\nC) Mostly negative.\nD) Only negative.", 6 | "expected": -1 7 | } 8 | ], 9 | "posttext": "The correct answer is:", 10 | "queries": [ 11 | "A", 12 | "B", 13 | "C", 14 | "D" 15 | ] 16 | } -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | 
push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | run_tests: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - name: checkout repo 16 | uses: actions/checkout@v3.0.0 17 | with: 18 | fetch-depth: 1 19 | 20 | - name: build docker image 21 | run: docker build . --file Dockerfile --tag probsem:latest 22 | 23 | - name: run testing pipeline 24 | run: docker run probsem:latest make test 25 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("README.md") as readme_file: 4 | readme = readme_file.read() 5 | 6 | requirements = [ 7 | "numpy", 8 | "openai", 9 | "pandas", 10 | "torch", 11 | "transformers", 12 | "accelerate", 13 | ] 14 | 15 | test_requirements = [ 16 | "mypy", 17 | "lxml", 18 | "tqdm-stubs", 19 | "pylint", 20 | "pylint-json2html", 21 | "pylint-exit", 22 | "pytest", 23 | "pytest-html", 24 | "coverage", 25 | ] 26 | 27 | setup( 28 | name="probsem", 29 | version="0.1.0", 30 | description="probabilistic semantic parsing via program synthesis", 31 | long_description=readme, 32 | author="Benjamin Lipkin", 33 | author_email="lipkinb@mit.edu", 34 | license="MIT", 35 | packages=find_packages(where="probsem"), 36 | install_requires=requirements, 37 | extras_require={"test": test_requirements}, 38 | python_requires=">=3.10", 39 | ) 40 | -------------------------------------------------------------------------------- /inputs/test.txt: -------------------------------------------------------------------------------- 1 | This next section will involve a reading comprehension test. 2 | 3 | Please read the following paragraph, and answer any questions that follow. 4 | 5 | The following was sourced from the top of https://en.wikipedia.org/wiki/Wikipedia:Large_language_models on 01/30/2023 6 | 7 | """ 8 | Large language models (LLMs) such as GPT-3 are increasingly being used to generate text. 9 | These tools should be used with care, since they can generate content that is biased, non-verifiable, constitutes original research, or fails to follow our other policies and guidelines. 10 | Editors retain full responsibility for LLM-assisted edits, which should still comply with all relevant Wikipedia policies. 11 | While the use of LLMs is not prohibited, their use should be reserved to experienced editors, who should carefully scrutinize their LLM-assisted edits before hitting Publish. 12 | The use of such programs to create whole articles or generate passages from scratch is forbidden. 13 | Furthermore, LLM use must be declared in the edit summary. 14 | """ 15 | 16 | Based on the above paragraph, please answer the following questions. 
17 | -------------------------------------------------------------------------------- /probsem/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | 4 | from probsem.probsem import ProbSem 5 | from probsem.abstract import Object 6 | 7 | 8 | class CLI(Object): 9 | def __init__(self) -> None: 10 | super().__init__() 11 | self._parser = argparse.ArgumentParser() 12 | self._parser.add_argument("--prompt", required=True) 13 | self._parser.add_argument("--test", required=True) 14 | self._parser.add_argument("--model", default="code-davinci-002") 15 | self._parser.add_argument("--norm", default=False, action="store_true") 16 | self._parser.add_argument("--temp", default=1.0, type=float) 17 | self._parser.add_argument("--input_dir", default="") 18 | self._parser.add_argument("--output_dir", default="") 19 | self._parser.add_argument("--cache_dir", default="") 20 | 21 | def run_main(self) -> None: 22 | start = datetime.datetime.now() 23 | ProbSem(**vars(self._parser.parse_args())).run() 24 | elapsed = datetime.datetime.now() - start 25 | self.info(f"Completed successfully in {elapsed}.") 26 | 27 | 28 | if __name__ == "__main__": 29 | CLI().run_main() 30 | -------------------------------------------------------------------------------- /probsem/abstract.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import logging 3 | import pathlib 4 | import typing 5 | 6 | import numpy as np 7 | 8 | 9 | class Object(abc.ABC): 10 | def __init__(self) -> None: 11 | self._base = pathlib.Path(__file__).parents[1] 12 | self._name = self.__class__.__name__ 13 | self._logger = logging.getLogger(self._name) 14 | 15 | def _log(self, message: str, level: str, offset: int) -> None: 16 | assert hasattr(self._logger, level) 17 | if "\n" in message: 18 | lines = message.split("\n") 19 | else: 20 | lines = [message] 21 | for line in lines: 22 | formatted = f"{' ' * (offset - len(self._name))}{line}" 23 | getattr(self._logger, level)(formatted) 24 | 25 | def info(self, message: str) -> None: 26 | self._log(message, "info", 20) 27 | 28 | def warn(self, message: str) -> None: 29 | self._log(message, "warning", 17) 30 | 31 | def __setattr__(self, name: str, value: typing.Any) -> None: 32 | super().__setattr__(name, value) 33 | 34 | def __getattribute__(self, name: str) -> typing.Any: 35 | return super().__getattribute__(name) 36 | 37 | 38 | @typing.runtime_checkable 39 | class IModel(typing.Protocol): 40 | def __init__(self, model_id: str) -> None: 41 | raise NotImplementedError() # pragma: no cover 42 | 43 | def score(self, full_text: str, eval_text: str) -> typing.Tuple[np.float64, int]: 44 | raise NotImplementedError() # pragma: no cover 45 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | SHELL := /usr/bin/env bash 2 | EXEC = python=3.10 3 | PACKAGE = probsem 4 | INSTALL = python -m pip install 5 | ACTIVATE = source activate $(PACKAGE) 6 | .DEFAULT_GOAL := help 7 | 8 | ## help : print available build commands. 9 | .PHONY : help 10 | help : Makefile 11 | @sed -n 's/^##//p' $< 12 | 13 | ## update : update repo with latest version from GitHub. 14 | .PHONY : update 15 | update : 16 | @git pull origin main 17 | 18 | ## env : setup environment and install dependencies. 
19 | .PHONY : env 20 | env : $(PACKAGE).egg-info/ 21 | $(PACKAGE).egg-info/ : setup.py requirements.txt 22 | @conda create -yn $(PACKAGE) $(EXEC) 23 | @$(ACTIVATE) ; $(INSTALL) -r requirements.txt ; $(INSTALL) -e ".[test]" 24 | 25 | ## test : run testing pipeline. 26 | .PHONY : test 27 | test : mypy pylint pytest 28 | mypy : env html/mypy/index.html 29 | pylint : env html/pylint/index.html 30 | pytest : env html/coverage/index.html 31 | html/mypy/index.html : $(PACKAGE)/*.py 32 | @$(ACTIVATE) ; mypy \ 33 | -p $(PACKAGE) \ 34 | --ignore-missing-imports \ 35 | --html-report $(@D) 36 | html/pylint/index.html : html/pylint/index.json 37 | @$(ACTIVATE) ; pylint-json2html -o $@ -e utf-8 $< 38 | html/pylint/index.json : $(PACKAGE)/*.py 39 | @mkdir -p $(@D) 40 | @$(ACTIVATE) ; pylint $(PACKAGE) \ 41 | --disable C0114,C0115,C0116 \ 42 | --generated-members torch.* \ 43 | --output-format=colorized,json:$@ \ 44 | || pylint-exit $$? 45 | html/coverage/index.html : html/pytest/report.html 46 | @$(ACTIVATE) ; coverage html -d $(@D) 47 | html/pytest/report.html : $(PACKAGE)/*.py test/*.py 48 | @$(ACTIVATE) ; coverage run --branch -m pytest \ 49 | --html=$@ --self-contained-html 50 | -------------------------------------------------------------------------------- /probsem/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import typing 3 | 4 | import numpy as np 5 | import numpy.typing as npt 6 | 7 | 8 | def sanitize_filename(text: str) -> str: 9 | return re.sub(r"^[ .]|[/<>:\"\\|?*]+|[ .]$", "-", text) 10 | 11 | 12 | def normalize(weights: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: 13 | return np.exp(weights) / np.sum(np.exp(weights)) 14 | 15 | 16 | def print_sample(sample: typing.Dict[str, typing.Any]) -> str: 17 | ostream = [] 18 | ostream.append("\nText:") 19 | ostream.append('"""') 20 | ostream.append(f"{sample['text'][0]}") 21 | ostream.append('"""') 22 | ostream.append("\nScores:") 23 | ostream.append('"""') 24 | for _, (query, score) in enumerate(zip(sample["queries"], sample["scores"])): 25 | ostream.append(f"{score:.3f}\t{query}") 26 | ostream.append('"""') 27 | if sample["correct"] == -1: 28 | ostream.append("") 29 | elif np.argmax(sample["scores"]) == sample["correct"]: 30 | ostream.append("\n" + "TEST SAMPLE PASSED." + "\n") 31 | else: 32 | ostream.append("\n" + "TEST SAMPLE FAILED." 
+ "\n") 33 | return "\n".join([30 * "_"] + ostream + [30 * "_"]) 34 | 35 | 36 | def print_summary(samples: typing.List[typing.Dict[str, typing.Any]]) -> str: 37 | scores = np.array([s["scores"] for s in samples]) 38 | indices = np.array([s["correct"] for s in samples]) 39 | if -1 in indices: 40 | accuracy = np.float64(np.nan) 41 | else: 42 | correct = scores[np.arange(indices.size), indices] == scores.max(axis=1) 43 | accuracy = correct.mean() 44 | ostream = [] 45 | ostream.append(f"TEST SUITE ACCURACY:\t{accuracy:.3f}") 46 | return "\n".join([30 * "_"] + ostream + [30 * "_"]) 47 | -------------------------------------------------------------------------------- /probsem/benchmarks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pathlib 3 | import typing 4 | 5 | from probsem.abstract import Object 6 | 7 | 8 | class Prompt(Object): 9 | def __init__(self, prompt: str, input_dir: pathlib.Path) -> None: 10 | super().__init__() 11 | self._text = self._load(prompt, input_dir) 12 | 13 | @property 14 | def text(self) -> str: 15 | return self._text 16 | 17 | def _load(self, prompt: str, input_dir: pathlib.Path) -> str: 18 | prompt_file = input_dir / f"{prompt}.txt" 19 | with open(prompt_file, "r", encoding="utf-8") as fstream: 20 | return fstream.read() 21 | 22 | 23 | class TestSuite(Object): 24 | def __init__(self, prompt: str, suite: str, input_dir: pathlib.Path) -> None: 25 | super().__init__() 26 | self._suite = self._load(prompt, suite, input_dir) 27 | 28 | @property 29 | def samples(self) -> typing.Iterator[typing.Tuple[int, typing.List[str]]]: 30 | def _sample(i: int) -> str: 31 | parts = [self._pretext, example["text"], self._posttext, self._queries[i]] 32 | parts = [p for p in parts if p != ""] 33 | return "\n".join(parts) 34 | 35 | for example in self._context: 36 | index = example["expected"] 37 | samples = [_sample(i) for i in range(len(self._queries))] 38 | yield index, samples 39 | 40 | @property 41 | def n_examples(self) -> int: 42 | return len(self._context) 43 | 44 | @property 45 | def n_queries(self) -> int: 46 | return len(self._queries) 47 | 48 | @property 49 | def _pretext(self) -> str: 50 | assert "pretext" in self._suite 51 | assert isinstance(self._suite["pretext"], str) 52 | return self._suite["pretext"] 53 | 54 | @property 55 | def _context(self) -> typing.List[typing.Dict[str, typing.Any]]: 56 | assert "context" in self._suite 57 | assert isinstance(self._suite["context"], list) 58 | assert len(self._suite["context"]) > 0 59 | assert all(isinstance(c, dict) for c in self._suite["context"]) 60 | assert all(("text" in c) and ("expected" in c) for c in self._suite["context"]) 61 | assert all(isinstance(c["text"], str) for c in self._suite["context"]) 62 | assert all(isinstance(c["expected"], int) for c in self._suite["context"]) 63 | return self._suite["context"] 64 | 65 | @property 66 | def _posttext(self) -> str: 67 | assert "posttext" in self._suite 68 | assert isinstance(self._suite["posttext"], str) 69 | return self._suite["posttext"] 70 | 71 | @property 72 | def _queries(self) -> list[str]: 73 | assert "queries" in self._suite 74 | assert isinstance(self._suite["queries"], list) 75 | assert all(isinstance(p, str) for p in self._suite["queries"]) 76 | return self._suite["queries"] 77 | 78 | def _load(self, prompt: str, suite: str, input_dir: pathlib.Path) -> dict: 79 | suite_file = input_dir / f"{prompt}_{suite}.json" 80 | with open(suite_file, "r", encoding="utf-8") as fstream: 81 | return 
json.load(fstream) 82 | -------------------------------------------------------------------------------- /probsem/probsem.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import pathlib 3 | import typing 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | 9 | from probsem.abstract import Object 10 | from probsem.benchmarks import Prompt, TestSuite 11 | from probsem.models import Model 12 | from probsem.utils import normalize, print_sample, print_summary, sanitize_filename 13 | 14 | 15 | class ProbSem(Object): 16 | def __init__( 17 | self, 18 | prompt: str, 19 | test: str, 20 | model: str, 21 | norm: bool = False, 22 | temp: float = 1.0, 23 | input_dir: str | pathlib.Path = "", 24 | output_dir: str | pathlib.Path = "", 25 | cache_dir: str | pathlib.Path = "", 26 | ) -> None: 27 | super().__init__() 28 | if input_dir == "": 29 | input_dir = self._base / "inputs" 30 | if output_dir == "": 31 | output_dir = self._base / "outputs" 32 | if cache_dir == "": 33 | cache_dir = self._base / ".cache" 34 | self._input_dir = pathlib.Path(input_dir) 35 | self._output_dir = pathlib.Path(output_dir) 36 | self._cache_dir = pathlib.Path(cache_dir) 37 | self._run_id = sanitize_filename(f"{prompt}_{test}_{model}") 38 | self._prompt = Prompt(prompt, self._input_dir) 39 | self._suite = TestSuite(prompt, test, self._input_dir) 40 | self._model = Model(model, norm, temp, self._cache_dir) 41 | 42 | @property 43 | def _samples(self) -> typing.Iterable[typing.Dict[str, typing.Any]]: 44 | for index, samples in self._suite.samples: 45 | iterator = range(len(samples)) 46 | yield { 47 | "prompt": [self._prompt.text for _ in iterator], 48 | "text": ["\n".join(samples[i].split("\n")[:-1]) for i in iterator], 49 | "queries": [samples[i].split("\n")[-1] for i in iterator], 50 | "correct": index, 51 | } 52 | 53 | def _score(self, prompt: str, text: str, query: str) -> np.float64: 54 | full_text = "\n".join([prompt, text, query]) 55 | return self._model.score(full_text, query) 56 | 57 | def _export_results_table( 58 | self, samples: typing.List[typing.Dict[str, typing.Any]] 59 | ) -> None: 60 | fname = self._output_dir / f"{self._run_id}_results.csv" 61 | fname.parent.mkdir(parents=True, exist_ok=True) 62 | table = collections.defaultdict(list) 63 | for sample in samples: 64 | table["text"].extend(sample["text"]) 65 | table["query"].extend(sample["queries"]) 66 | table["logp"].extend(sample["logp"]) 67 | table["score"].extend(sample["scores"]) 68 | pd.DataFrame(table).to_csv(fname, index=False) 69 | 70 | def run(self) -> typing.List[typing.Dict[str, typing.Any]]: 71 | samples = [] 72 | for sample in tqdm(self._samples, total=self._suite.n_examples): 73 | assert len(set(sample["prompt"])) == 1 74 | assert len(set(sample["text"])) == 1 75 | assert len(set(sample["queries"])) == self._suite.n_queries 76 | sample["logp"] = [] 77 | for prompt, text, query in zip( 78 | sample["prompt"], sample["text"], sample["queries"] 79 | ): 80 | sample["logp"].append(self._score(prompt, text, query)) 81 | sample["logp"] = np.array(sample["logp"]) 82 | sample["scores"] = normalize(sample["logp"]) 83 | samples.append(sample) 84 | self.info(print_sample(sample)) 85 | self.info(print_summary(samples)) 86 | self._export_results_table(samples) 87 | return samples 88 | -------------------------------------------------------------------------------- /BENCHMARKS.md: -------------------------------------------------------------------------------- 1 | # Benchmark 
Walkthrough 2 | 3 | As explained in the [README.md](https://github.com/benlipkin/probsem/blob/main/README.md), evaluations of new benchmarks require materials to be prepared in a specific format. The critical components are a `Prompt` and at least one `TestSuite`. 4 | 5 | ## Prompts 6 | 7 | Your prompt should define any general context you'd like to be prepended to all your evaluations. This often includes a description of the domain of interest, general instructions, sometimes few-shot examples, and generally any text that should be available to your model across your evaluations. 8 | 9 | For example, a reading comprehension task might include an introduction, the passage itself, and instructions to answer the questions that follow. See `inputs/test.txt` for an example. 10 | 11 | ``` 12 | This next section will involve a reading comprehension test. 13 | 14 | Please read the following paragraph, and answer any questions that follow. 15 | 16 | The following was sourced from the top of https://en.wikipedia.org/wiki/Wikipedia:Large_language_models on 01/30/2023 17 | 18 | """ 19 | Large language models (LLMs) such as GPT-3 are increasingly being used to generate text. 20 | These tools should be used with care, since they can generate content that is biased, non-verifiable, constitutes original research, or fails to follow our other policies and guidelines. 21 | Editors retain full responsibility for LLM-assisted edits, which should still comply with all relevant Wikipedia policies. 22 | While the use of LLMs is not prohibited, their use should be reserved to experienced editors, who should carefully scrutinize their LLM-assisted edits before hitting Publish. 23 | The use of such programs to create whole articles or generate passages from scratch is forbidden. 24 | Furthermore, LLM use must be declared in the edit summary. 25 | """ 26 | 27 | Based on the above paragraph, please answer the following questions. 28 | 29 | ``` 30 | 31 | ## TestSuites 32 | 33 | Next, you can develop a series of test suites to evaluate individual collections of test cases for a prompt. For example, for our reading comprehension task, each unique class of question about our passage would typically be placed in a separate test suite. 34 | 35 | Within each test suite JSON file, there are several components: the `pretext`, `context`, `posttext`, and `queries`. The `pretext` is concatenated immediately after the prompt in each LLM evaluation. It provides any test-suite-specific instructions or specifications. For example, two test suites might require two different question formats, one T/F and one multiple-choice. The `posttext`, which wraps the `context` on the other side, is also static across a test suite. Both `pretext` and `posttext` are optional, and can be left as an empty string `""` if not needed. 36 | 37 | The core manipulation of any test suite is in the `context` and the `queries`. For a given test suite, the full cross-product of these is evaluated. For example, a user might specify several unique questions, but want to evaluate the labels `True` vs. `False` for each of them. Pasted below is an example from `inputs/test_A.json` that does just this.
38 | 39 | ```json 40 | { 41 | "pretext": "The following questions are based on the above text, and should be answered True or False.", 42 | "context": [ 43 | { 44 | "text": "The author is discussing the role of LLMs in Wikipedia editing.", 45 | "expected": 0 46 | }, { 47 | "text": "LLM use for wikipedia editing is accepted without restrictions.", 48 | "expected": 1 49 | } 50 | ], 51 | "posttext": "The correct answer is:", 52 | "queries": [ 53 | "True", 54 | "False" 55 | ] 56 | } 57 | ``` 58 | 59 | As you can see, for a single pretext and posttext, a variety of questions of similar form can be iterated over in context, and for each, logit scores or a normalized probability distribution over the queries can be returned. 60 | 61 | You may also note that each context example includes an integer under the `expected` key. This integer maps to the index of the query expected to have the maximum score. This allows for automatic accuracy evaluation. If, however, there is no ground-truth answer, this value can be substituted with `-1`, which disables automated scoring. 62 | 63 | Let's check out `inputs/test_B.json` next, which poses a different question format and has no a-priori correct answer. 64 | 65 | ```json 66 | { 67 | "pretext": "The following questions are based on the above text, and should be answered via Multiple Choice response.", 68 | "context": [ 69 | { 70 | "text": "Based on the above passage, the author believes that LLM use for writing is as a whole:\nA) Only positive.\nB) Mostly positive.\nC) Mostly negative.\nD) Only negative.", 71 | "expected": -1 72 | } 73 | ], 74 | "posttext": "The correct answer is:", 75 | "queries": [ 76 | "A", 77 | "B", 78 | "C", 79 | "D" 80 | ] 81 | } 82 | ``` 83 | 84 | As shown above, the text of the `pretext`, `context`, `posttext`, or any of the `queries` may also be a multiline entry. However, for JSON parsing purposes, these must be written on a single line, and the character `\n` used explicitly to represent any line breaks. 85 | -------------------------------------------------------------------------------- /probsem/models.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import hashlib 3 | import pathlib 4 | import time 5 | import typing 6 | import warnings 7 | 8 | import diskcache 9 | import numpy as np 10 | import openai 11 | import torch 12 | 13 | from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM 14 | 15 | from probsem.abstract import Object, IModel 16 | 17 | openai.api_key_path = str(pathlib.Path.home() / ".openai_api_key") 18 | 19 | 20 | class Model(Object): 21 | def __init__( 22 | self, model_id: str, norm: bool, temp: float, cache_dir: pathlib.Path 23 | ) -> None: 24 | super().__init__() 25 | self._id = model_id 26 | self._norm = norm 27 | if temp <= 0: 28 | raise ValueError("Temperature must be positive.") 29 | self._temp = temp 30 | self._model: IModel 31 | openai_engines = [engine["id"] for engine in openai.Engine.list()["data"]] 32 | if self._id in openai_engines: 33 | self.info("Model ID found in OpenAI engines.") 34 | setattr(self, "_model", OpenAIModel(self._id)) 35 | else: 36 | self.info("Model ID not found in OpenAI engines. 
Checking HuggingFace.") 37 | setattr(self, "_model", HuggingFaceModel(self._id)) 38 | self._cache = diskcache.Cache(cache_dir) 39 | 40 | def score( 41 | self, 42 | full_text: str, 43 | eval_text: str, 44 | ) -> np.float64: 45 | key_id = "_".join([self._id, full_text, eval_text]) 46 | key = hashlib.sha256(key_id.encode("utf-8")).hexdigest() 47 | if key in self._cache: 48 | logp, num_eval = self._cache[key] 49 | else: 50 | logp, num_eval = self._model.score(full_text, eval_text) 51 | self._cache[key] = logp, num_eval 52 | if self._norm: 53 | logp /= num_eval 54 | return logp / self._temp 55 | 56 | 57 | class OpenAIModel(Object, IModel): 58 | def __init__(self, model_id: str) -> None: 59 | super().__init__() 60 | self._id = model_id 61 | self.info(f"Selected OpenAI {self._id} model.") 62 | 63 | def _get_response( 64 | self, text: str, retry_after=10 65 | ) -> openai.openai_object.OpenAIObject: 66 | try: 67 | return openai.Completion.create( 68 | engine=self._id, 69 | prompt=text, 70 | max_tokens=0, 71 | logprobs=0, 72 | echo=True, 73 | ) 74 | except (openai.error.RateLimitError, openai.error.APIError) as e: 75 | self.warn(f"Rate limit exceeded. Retrying after {retry_after} seconds.") 76 | time.sleep(retry_after) 77 | return self._get_response(text, retry_after * 2) 78 | 79 | def score(self, full_text: str, eval_text: str) -> typing.Tuple[np.float64, int]: 80 | full_resp = self._get_response(full_text) 81 | try: 82 | eval_resp = self._get_response(eval_text) 83 | num_eval = eval_resp["usage"]["total_tokens"] 84 | except openai.error.InvalidRequestError: 85 | num_eval = 1 if eval_text else 0 86 | logp = np.sum(full_resp["choices"][0]["logprobs"]["token_logprobs"][-num_eval:]) 87 | return logp, num_eval 88 | 89 | 90 | class HuggingFaceModel(Object, IModel): 91 | def __init__(self, model_id: str) -> None: 92 | super().__init__() 93 | self._id = model_id 94 | self.info(f"Attempting to load HuggingFace {self._id} model...") 95 | try: 96 | self._config = AutoConfig.from_pretrained(self._id) 97 | self._tokenizer = AutoTokenizer.from_pretrained( 98 | self._id, add_prefix_space=True 99 | ) 100 | self._model = AutoModelForCausalLM.from_pretrained( 101 | self._id, torch_dtype=torch.float32, low_cpu_mem_usage=True 102 | ) 103 | self._model.eval() 104 | except Exception as invalid_id: 105 | raise ValueError( 106 | "model must be valid HuggingFace CausalLM." 
107 | ) from invalid_id 108 | self._set_torch_device() 109 | self.info(f"Successfully loaded pretrained {self._id} model on {self._device}.") 110 | 111 | def _set_torch_device(self) -> None: 112 | if torch.cuda.is_available(): 113 | self._device = torch.device("cuda") 114 | torch.set_default_tensor_type(torch.cuda.FloatTensor) # type: ignore 115 | try: 116 | self._model = self._model.to(self._device) 117 | return 118 | except RuntimeError: 119 | self._device = torch.device("cpu") 120 | torch.set_default_tensor_type(torch.FloatTensor) 121 | self._model = self._model.to(self._device) 122 | else: 123 | self._device = torch.device("cpu") 124 | torch.set_default_tensor_type(torch.FloatTensor) 125 | self._model = self._model.to(self._device) 126 | 127 | @functools.lru_cache(maxsize=128) 128 | def _encode_text(self, text: str) -> typing.Dict[str, torch.Tensor]: 129 | return self._tokenizer(text, return_tensors="pt").to(self._device) 130 | 131 | def _decode_text(self, tokens: torch.Tensor) -> str: 132 | return self._tokenizer.decode(tokens, skip_special_tokens=True) 133 | 134 | def score(self, full_text: str, eval_text: str) -> typing.Tuple[np.float64, int]: 135 | with torch.no_grad(): 136 | inputs = self._encode_text(full_text) 137 | num_eval = self._encode_text(eval_text)["input_ids"].shape[1] 138 | tokens = inputs["input_ids"] 139 | mask = inputs["attention_mask"] 140 | with warnings.catch_warnings(): 141 | warnings.simplefilter("ignore") 142 | outputs = self._model(input_ids=tokens, attention_mask=mask) 143 | loss = torch.nn.CrossEntropyLoss(reduction="none")( 144 | outputs.logits[..., :-1, :] 145 | .contiguous() 146 | .view(-1, outputs.logits.size(-1)), 147 | tokens[..., 1:].contiguous().view(-1), 148 | ).view(tokens.size(0), tokens.size(-1) - 1) 149 | loss = loss * mask[..., 1:].contiguous() 150 | loss = loss[:, -num_eval:].sum(dim=1) 151 | logp = -loss.cpu().detach().item() 152 | return logp, num_eval 153 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Tests](https://github.com/benlipkin/probsem/actions/workflows/testing.yml/badge.svg)](https://github.com/benlipkin/probsem/actions/workflows/testing.yml) [![DOI](https://zenodo.org/badge/558137294.svg)](https://zenodo.org/badge/latestdoi/558137294) 2 | 3 | # ProbSem 4 | 5 | ## Deprecation Notice 6 | 7 | ⚠️ This project is functional, but is no longer being actively maintained. I recommend using [minicons](https://github.com/kanishkamisra/minicons) for most LLM scoring needs. If you'd like to replicate any paper results using probsem, the paper branches are still supported. 8 | 9 | ## Summary 10 | 11 | This repository provides a framework to leverage large language models (LLMs) to assign context-conditional probability distributions over queried strings, with default support for all OpenAI engines and HuggingFace CausalLM models. 12 | 13 | It is intended to be flexible across a wide range of research applications spanning linguistics, cognitive science, program synthesis, and NLP. 14 | 15 | Here are a few examples: 16 | 17 | - Cloze Completion Task 18 | ```bash 19 | .. prompt, task instructions .. 20 | context: The color of the Boston sky during January is 21 | query1: blue # P=0.4 22 | query2: gray # P=0.6 23 | ``` 24 | 25 | - Multiple Choice QA 26 | ```bash 27 | .. prompt, task instructions .. 28 | context: The girl pushed the boy. 29 | posttext: Which of the following logically entails? 
30 | A: The girl was pushed by the boy. 31 | B: The boy was pushed by the boy. 32 | C: The boy was pushed by the girl. 33 | D: The girl was pushed by the girl. 34 | The correct response is: 35 | query1: A # P=0.03 36 | query2: B # P=0.01 37 | query3: C # P=0.95 38 | query4: D # P=0.01 39 | ``` 40 | 41 | - Semantic Parsing 42 | ```scheme 43 | .. prompt, task instructions .. 44 | pretext: ;; Player strengths were distributed ~N(50,20) 45 | context: ;; X has nearly average strength. 46 | query1: (λ (x) (= (abs (- (strength x) 50)) 0)) ;; P=0.1 47 | query2: (λ (x) (< (abs (- (strength x) 50)) 10)) ;; P=0.9 48 | ``` 49 | 50 | - Code completion 51 | ```python 52 | .. prompt, task instructions .. 53 | context: def reverse(lst:list): 54 | query1: return lst[::-1] # P=0.40 55 | query2: return reversed(lst) # P=0.30 56 | query3: lst.reverse() # P=0.20 57 | query4: list.reverse(lst) # P=0.10 58 | ``` 59 | 60 | In each of these examples, a user may define a flexible frame of reference using the concatenation of a `prompt`, `context`, and optional `pretext` and `posttext`, which wrap the `context`, to derive a probability distribution over possible completions defined as `queries`. The precise formulation of such evaluations can be explored further by viewing the examples in the `inputs` folder or checking out the [BENCHMARKS.md](https://github.com/benlipkin/probsem/blob/main/BENCHMARKS.md) walkthrough. 61 | 62 | ### Version Note 63 | 64 | _The name of this repository `ProbSem` is a legacy reference to the original use case for which it was developed: Evaluations of **Prob**abilistic **Sem**antics and Pragmatics. It was generalized into its current form after collaborators and colleagues expressed interest._ 65 | 66 | As such, the `main` branch is under development and evolving. To replicate specific papers, `git checkout` the corresponding paper branch and follow the instructions in the associated `README.md`. 67 | 68 | ## Getting Started 69 | 70 | ### Download the repo: 71 | ```bash 72 | git clone --branch main --depth 1 git@github.com:benlipkin/probsem.git 73 | ``` 74 | ### Build environment: 75 | 76 | _Note: Multiple installation strategies are provided._ 77 | 78 | - [Anaconda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html), [Make](https://www.gnu.org/software/make/manual/make.html): automatically build and populate virtual environment (recommended). 79 | ```bash 80 | make env 81 | ``` 82 | You can test the installation via: 83 | ```bash 84 | make test 85 | ``` 86 | 87 | 88 | - pip[strict]: install the exact dependencies used during development into the current environment. 89 | ```bash 90 | python -m pip install -r requirements.txt 91 | ``` 92 | 93 | - pip[flexible]: install general dependencies with fewer version specifications, at the discretion of the user. 94 | ```bash 95 | python -m pip install -e . 96 | ``` 97 | 98 | ### Setup API Key: 99 | To use OpenAI models, an API key must be placed at `~/.openai_api_key`. 100 | 101 | ## Run 102 | 103 | The first step is to generate your benchmark. This includes, at minimum, a `Prompt` file and one `TestSuite`. See [BENCHMARKS.md](https://github.com/benlipkin/probsem/blob/main/BENCHMARKS.md) for more info on the structure of these files. 104 | 105 | ```bash 106 | nano inputs/prompt.txt 107 | nano inputs/prompt_testsuite.json 108 | ``` 109 | 110 | Once a prompt and test suite are defined, they can be evaluated at the command line. For a given prompt `prompt` and test suite `testsuite`, as shown above, the following syntax can be used for evaluation.
111 | 112 | ### CLI 113 | 114 | ```bash 115 | python -m probsem --prompt prompt --test testsuite 116 | ``` 117 | 118 | The prompt `*.txt` file and test suite `*.json` file must share the same prefix (`prompt` above) to be linked, and are assumed by default to exist in the `inputs` folder. This default, and others, can be overridden. See below. 119 | 120 | Optional arguments (and other relevant internal details): 121 | 122 | - `--input_dir [STR] {default: "inputs"}` Update path to the directory containing the benchmark files to be read in. 123 | - `--output_dir [STR] {default: "outputs"}` Update path to the directory where output files should be saved. On each run, a CSV is saved with the resulting scores. 124 | - `--model [STR] {default: "code-davinci-002"}` Customize the model used for scoring. All OpenAI API engines and HuggingFace CausalLM models are currently supported. HF models run on GPU when available and fall back to CPU otherwise. 125 | - `--norm [BOOL True] {default: False}` This flag can be used to turn on normalization. By default, the returned scores reflect the sum of the context-conditional log-probabilities of the query tokens. When this flag is passed, these values are normalized by the number of tokens, uniquely for each tokenizer. 126 | - `--temp [FLOAT >0] {default: 1.0}` Following the derivation of individual query-level scores, a probability distribution over the batch of queries is calculated by passing the array of logit scores to a softmax function with temperature parameter $\alpha$. Specifying $\alpha<1.0$ decreases the entropy of the returned multinomial distribution and $\alpha>1.0$ increases the entropy. Entropy can be thought of qualitatively as inverse to the _peakiness_ of the distribution, being maximized at the uniform distribution and returning $0$ when all probability mass is on a single value. 127 | 128 | ### API 129 | 130 | An API is also supported for integration with existing applications. To run the same default example from above, the following code will suffice. All optional parameters are available as well. 131 | 132 | ```python 133 | from probsem.probsem import ProbSem 134 | 135 | probsem = ProbSem( 136 | prompt="prompt", 137 | test="testsuite", 138 | ) 139 | results = probsem.run() 140 | ``` 141 | 142 | ## Issues/Contributing 143 | 144 | If you find any particular aspects of this repository unclear, or if you encounter any errors, please open an issue. Comments on documentation, examples, and clarity are also appreciated. If you find an issue and have ideas on how to address it, feel free to open a pull request. Community contributions are greatly appreciated. 145 | 146 | ## Citation 147 | 148 | ```bibtex 149 | @software{LipkinProbSem2023, 150 | author = {Lipkin, Benjamin}, 151 | title = {ProbSem}, 152 | year = {2023}, 153 | url = {https://github.com/benlipkin/probsem}, 154 | doi = {10.5281/zenodo.7603078} 155 | } 156 | ``` 157 | 158 | ## License 159 | 160 | [![License: MIT](https://img.shields.io/badge/License-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT) 161 | --------------------------------------------------------------------------------