├── .gitignore ├── CheckEmbed ├── __init__.py ├── config_template.json ├── embedder │ ├── __init__.py │ └── embedder.py ├── embedding_models │ ├── README.md │ ├── __init__.py │ ├── abstract_embedding_model.py │ ├── clip_vit_large.py │ ├── e5_mistral_7b_instruct.py │ ├── embeddinggpt.py │ ├── gte_qwen1_5_7b_instruct.py │ ├── sfr_embedding_mistral.py │ └── stella.py ├── language_models │ ├── README.md │ ├── __init__.py │ ├── abstract_language_model.py │ ├── chatgpt.py │ └── chatollama.py ├── operations │ ├── README.md │ ├── __init__.py │ ├── bertscore_operation.py │ ├── checkembed_operation.py │ ├── llm_as_a_judge_operation.py │ ├── operations.py │ └── selfcheckgpt_operation.py ├── parser │ ├── __init__.py │ └── parser.py ├── plotters │ ├── README.md │ ├── __init__.py │ ├── bert_plot.py │ ├── checkembed_plot.py │ ├── plot_operations.py │ ├── raw_embedding_heatmap.py │ └── selfcheckgpt_plot.py ├── scheduler │ ├── __init__.py │ └── scheduler.py ├── utility │ ├── __init__.py │ ├── stderr_filter.py │ ├── stderr_filter.txt │ └── utility.py └── vision_models │ ├── README.md │ ├── __init__.py │ ├── abstract_vision_model.py │ └── stable_diffusion3_5.py ├── LICENSE ├── README.md ├── examples ├── RAGTruth │ ├── README.md │ ├── dataset │ │ ├── response.json │ │ ├── sampler.py │ │ ├── samples.json │ │ ├── source_info.json │ │ └── training_data.json │ ├── hallu_detect.py │ └── main.py ├── README.md ├── description │ ├── README.md │ ├── different │ │ ├── main.py │ │ └── prompt_scheme.txt │ └── similar │ │ ├── main.py │ │ └── prompt_scheme.txt ├── incremental_forced_hallucination │ ├── legal_summaries │ │ ├── README.md │ │ ├── dataset │ │ │ └── legal_definitions.json │ │ ├── main.py │ │ ├── prompt_scheme.txt │ │ └── prompt_scheme_ground_truth.txt │ ├── operation_variants │ │ ├── __init__.py │ │ ├── bertscore_operation_variant.py │ │ ├── checkembed_operation_variant.py │ │ └── selfcheckgpt_operation_variant.py │ └── scientific_descriptions │ │ ├── README.md │ │ ├── main.py │ │ ├── prompt_scheme.txt │ │ └── prompt_scheme_ground_truth.txt ├── legal_definitions │ ├── README.md │ ├── dataset │ │ └── legal_definitions.json │ ├── main.py │ └── prompt_scheme.txt ├── performance_test │ ├── README.md │ ├── data_extractor.py │ ├── main.py │ └── plot.py ├── vision │ ├── README.md │ └── main.py └── wiki_bio │ ├── README.md │ ├── data │ ├── dataset.json │ ├── download.py │ ├── passage_scores.json │ └── passage_scores.py │ └── main.py ├── paper ├── README.md ├── pics │ └── checkembed_overview.svg ├── plots.py └── results.tar.bz2 └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | dist/ 8 | build/ 9 | 10 | # Installer logs 11 | pip-log.txt 12 | 13 | # IDEs 14 | .idea/ 15 | *.vscode/ 16 | *.pycproj 17 | *.user 18 | *.pyproj.user 19 | 20 | # Data 21 | *.out 22 | *.err 23 | *.log 24 | 25 | # Environments 26 | env/ 27 | venv/ 28 | 29 | # Config File 30 | **/config.json 31 | 32 | # Mac folder attributes 33 | .DS_Store 34 | 35 | # Exclude working dir 36 | results/ 37 | error_*/ 38 | ground_truth/ 39 | BertScore/ 40 | SelfCheckGPT/ 41 | embeddings/ 42 | plots/ 43 | *_samples/ 44 | examples/**/CheckEmbed/ 45 | paper/**/CheckEmbed/ -------------------------------------------------------------------------------- /CheckEmbed/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/spcl/CheckEmbed/008357ed0b6572575ec4c16daf52b549a9c38e25/CheckEmbed/__init__.py -------------------------------------------------------------------------------- /CheckEmbed/config_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "chatgpt": { 3 | "model_id": "gpt-3.5-turbo-0125", 4 | "name": "gpt-3.5-turbo", 5 | "prompt_token_cost": 0.0005, 6 | "response_token_cost": 0.0015, 7 | "temperature": 1.0, 8 | "max_tokens": 4096, 9 | "stop": null, 10 | "organization": "", 11 | "api_key": "" 12 | }, 13 | "chatgpt4": { 14 | "model_id": "gpt-4", 15 | "name": "gpt-4", 16 | "prompt_token_cost": 0.03, 17 | "response_token_cost": 0.06, 18 | "temperature": 1.0, 19 | "max_tokens": 4096, 20 | "stop": null, 21 | "organization": "", 22 | "api_key": "" 23 | }, 24 | "chatgpt4-turbo": { 25 | "model_id": "gpt-4-turbo", 26 | "name": "gpt-4-turbo", 27 | "prompt_token_cost": 0.01, 28 | "response_token_cost": 0.03, 29 | "temperature": 1.0, 30 | "max_tokens": 4096, 31 | "stop": null, 32 | "organization": "", 33 | "api_key": "" 34 | }, 35 | "chatgpt4-o": { 36 | "model_id": "gpt-4o", 37 | "name": "gpt-4o", 38 | "prompt_token_cost": 0.0025, 39 | "response_token_cost": 0.01, 40 | "temperature": 1.0, 41 | "max_tokens": 4096, 42 | "stop": null, 43 | "organization": "", 44 | "api_key": "" 45 | }, 46 | "chatgpt4-o-mini": { 47 | "model_id": "gpt-4o-mini", 48 | "name": "4o-mini", 49 | "prompt_token_cost": 0.00015, 50 | "response_token_cost": 0.0006, 51 | "temperature": 1.0, 52 | "max_tokens": 4096, 53 | "stop": null, 54 | "organization": "", 55 | "api_key": "" 56 | }, 57 | "llama70": { 58 | "model_id": "llama3.3:70b-instruct-q8_0", 59 | "name": "llama70b", 60 | "temperature": 1.0, 61 | "num_ctx": 131072, 62 | "num_predict": 16384, 63 | "num_batch": 1024, 64 | "keep_alive": -1 65 | }, 66 | "llama8b": { 67 | "model_id": "llama3.1:8b-instruct-fp16", 68 | "name": "llama8b", 69 | "temperature": 1.0, 70 | "num_ctx": 131072, 71 | "num_predict": 16384, 72 | "num_batch": 4096, 73 | "keep_alive": -1 74 | }, 75 | "gpt-embedding-large": { 76 | "model_id": "text-embedding-3-large", 77 | "name": "gpt-embedding-large", 78 | "token_cost": 0.00013, 79 | "encoding": "float", 80 | "dimension": 3072, 81 | "organization": "", 82 | "api_key": "" 83 | }, 84 | "gpt-embedding-small": { 85 | "model_id": "text-embedding-3-small", 86 | "name": "gpt-embedding-small", 87 | "token_cost": 0.00002, 88 | "encoding": "float", 89 | "dimension": 1536, 90 | "organization": "", 91 | "api_key": "" 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /CheckEmbed/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedder import Embedder -------------------------------------------------------------------------------- /CheckEmbed/embedder/embedder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC 10 | from typing import List 11 | 12 | from tqdm import tqdm 13 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 14 | import numpy as np 15 | 16 | class Embedder(ABC): 17 | """ 18 | Abstract base class that defines the interface for all embedders. 
19 |     Embedders are used to embed text into a vector space.
20 |     """
21 | 
22 |     def embed(self, lm: AbstractEmbeddingModel, texts: List[str]) -> List[List[float]]:
23 |         """
24 |         Embed the given texts into vectors.
25 | 
26 |         :param lm: The embedding model that will be used to generate the text embeddings.
27 |         :type lm: AbstractEmbeddingModel
28 |         :param texts: The texts to embed.
29 |         :type texts: List[str]
30 |         :return: The embeddings of the texts.
31 |         :rtype: List[List[float]]
32 |         """
33 |         embedding_query = []
34 |         void_indexes = []
35 |         for index, text in enumerate(texts):
36 |             if text == "":
37 |                 void_indexes.append(index)
38 |             else:
39 |                 embedding_query.append(text)
40 | 
41 |         full_responses = np.zeros((len(texts))).tolist()
42 |         responses = lm.generate_embedding(embedding_query)
43 | 
44 |         for index in void_indexes:
45 |             full_responses[index] = []
46 | 
47 |         # fill remaining places in full_responses with responses in order
48 |         for index, response in enumerate(responses):
49 |             temp_index = index
50 |             while full_responses[temp_index] != 0.0:
51 |                 temp_index += 1
52 |             full_responses[temp_index] = response
53 | 
54 |         return full_responses
55 | 
--------------------------------------------------------------------------------
/CheckEmbed/embedding_models/README.md:
--------------------------------------------------------------------------------
1 | # Embedding Models
2 | 
3 | The Embedding Models module is responsible for managing the embedding models.
4 | 
5 | Currently, the framework supports the following embedding models:
6 | 
7 | - text-embedding-large / small (remote - OpenAI API)
8 | - Salesforce/SFR-Embedding-Mistral (local - GPU with 32GB VRAM recommended, model size is roughly 26GB)
9 | - intfloat/e5-mistral-7b-instruct (local - GPU with 32GB VRAM recommended, model size is roughly 26GB)
10 | - Alibaba-NLP/gte-Qwen1.5-7B-instruct (local - GPU with 32GB VRAM recommended, model size is roughly 26GB)
11 | - NovaSearch/stella_en_1.5B_v5 (local - GPU with 12GB VRAM recommended, model size is roughly 6GB)
12 | - NovaSearch/stella_en_400M_v5 (local - GPU with 4GB VRAM recommended, model size is roughly 2GB)
13 | - openai/clip-vit-large-patch14 (local - GPU with 4GB VRAM recommended, model size is roughly 2GB)
14 | 
15 | The following sections describe how to instantiate individual models and how to add new models to the framework.
16 | 
17 | ## Embedding Model Instantiation
18 | 
19 | - Create a copy of `config_template.json` named `config.json` in the CheckEmbed folder. (Not necessary for local models)
20 | - Fill in the configuration details based on the used model (below).
21 | 
22 | ### Embedding-Text-Large / Embedding-Text-Small
23 | 
24 | - Adjust the predefined `gpt-embedding-large` or `gpt-embedding-small` configurations or create a new configuration with a unique key.
25 | 
26 | | Key          | Value |
27 | |--------------|-------|
28 | | model_id     | Model name based on [OpenAI model overview](https://platform.openai.com/docs/models/overview). |
29 | | name         | Name used for CheckEmbed output files. We suggest using the default names for local models. |
30 | | token_cost   | Price per 1000 tokens based on [OpenAI pricing](https://openai.com/pricing), used for calculating the cumulative price per LLM instance. |
31 | | encoding     | String indicating the format to return the embeddings in. Can be either float or base64. More information can be found in the [OpenAI API reference](https://platform.openai.com/docs/api-reference/embeddings/create#embeddings-create-encoding_format). |
32 | | dimension    | Number indicating the output dimension of the embedding model. More information can be found in the [OpenAI model overview](https://platform.openai.com/docs/models/overview). |
33 | | organization | Organization to use for the API requests (may be empty). |
34 | | api_key      | Personal API key that will be used to access the OpenAI API. |
35 | 
36 | - Instantiate the embedding model based on the selected configuration key (predefined / custom).
37 | - `max_concurrent_requests` is 10 by default. Adjust the value based on your tier [rate limits](https://platform.openai.com/docs/guides/rate-limits).
38 | 
39 | ```python
40 | embedding_lm = embedding_models.EmbeddingGPT(
41 |     config_path,
42 |     model_name = <configuration key>,
43 |     cache = <True / False>,
44 |     max_concurrent_requests = <maximum number of concurrent requests>
45 | )
46 | ```
47 | 
48 | ### Local Models
49 | 
50 | The framework currently supports the following local models: `Salesforce/SFR-Embedding-Mistral`, `intfloat/e5-mistral-7b-instruct`, `Alibaba-NLP/gte-Qwen1.5-7B-instruct`, `NovaSearch/stella_en_1.5B_v5`, `NovaSearch/stella_en_400M_v5` and `openai/clip-vit-large-patch14`.
51 | 
52 | - Instantiate the embedding model based on the available device.
53 | - The device can be specified in the `Scheduler`; more details [here](/CheckEmbed/scheduler/scheduler.py).
54 | 
55 | ```python
56 | sfrEmbeddingMistral = embedding_models.SFREmbeddingMistral(
57 |     model_name = "Salesforce/SFR-Embedding-Mistral",
58 |     cache = False,
59 |     batch_size = 64,
60 | )
61 | 
62 | e5mistral7b = embedding_models.E5Mistral7b(
63 |     model_name = "intfloat/e5-mistral-7b-instruct",
64 |     cache = False,
65 |     batch_size = 64,
66 | )
67 | 
68 | gteQwen157bInstruct = embedding_models.GteQwenInstruct(
69 |     model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct",
70 |     cache = False,
71 |     access_token = "", # Add your access token here (Hugging Face)
72 |     batch_size = 1, # Unless you have more than 32GB of GPU VRAM at your disposal, use 1.
73 | )
74 | 
75 | stella_en_15B_v5 = embedding_models.Stella(
76 |     model_name = "NovaSearch/stella_en_1.5B_v5",
77 |     cache = False,
78 |     batch_size = 64,
79 | )
80 | 
81 | stella_en_400M_v5 = embedding_models.Stella(
82 |     model_name = "NovaSearch/stella_en_400M_v5",
83 |     cache = False,
84 |     batch_size = 64,
85 | )
86 | 
87 | clip_vit_large = embedding_models.ClipVitLarge(
88 |     model_name = "openai/clip-vit-large-patch14",
89 |     cache = False,
90 | )
91 | ```
92 | 
93 | ## Adding Embedding Models
94 | 
95 | More embedding models can be added by following these steps:
96 | 
97 | - Create a new class as a subclass of `AbstractEmbeddingModel`.
98 | - Use the constructor for loading the configuration and instantiating the embedding model (if needed).
99 | 100 | ```python 101 | class CustomLanguageModel(AbstractEmbeddingModel): 102 | def __init__( 103 | self, 104 | config_path: str = "", 105 | model_name: str = "text-embedding-large", 106 | name: str = "CustomLanguageModel", 107 | cache: bool = False 108 | ) -> None: 109 | super().__init__(config_path, model_name, name, cache) 110 | self.config: Dict = self.config[model_name] 111 | 112 | # Load data from configuration into variables if needed 113 | 114 | # Instantiate model if needed 115 | ``` 116 | 117 | - Implement the `load_model`, `unload_model` and `generate_embedding` abstract methods that are used to load/unload the model from the GPU (if necessary) and get a list of embeddings from the model (remote API call or local model inference) respectively. 118 | 119 | ```python 120 | def load_model(self, device: str = None) -> None: 121 | """ 122 | Load the model and tokenizer based on the given model name. 123 | 124 | :param device: The device to load the model on. Defaults to None. 125 | :type device: str 126 | """ 127 | 128 | def unload_model(self) -> None: 129 | """ 130 | Unload the model and tokenizer. 131 | """ 132 | 133 | def generate_embedding( 134 | self, 135 | input: Union[List[Any], Any] 136 | ) -> List[float]: 137 | # Call model and retrieve an embedding 138 | # Return model response 139 | ``` 140 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_embedding_model import AbstractEmbeddingModel 2 | from .clip_vit_large import ClipVitLarge 3 | from .e5_mistral_7b_instruct import E5Mistral7b 4 | from .embeddinggpt import EmbeddingGPT 5 | from .gte_qwen1_5_7b_instruct import GteQwenInstruct 6 | from .sfr_embedding_mistral import SFREmbeddingMistral 7 | from .stella import Stella 8 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/abstract_embedding_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Any, Dict, List, Union 11 | import json 12 | import logging 13 | 14 | 15 | class AbstractEmbeddingModel(ABC): 16 | """ 17 | Abstract base class that defines the interface for all embedding models. 18 | """ 19 | 20 | def __init__( 21 | self, config_path: str = None, model_name: str = "", name: str = "INVALID_NAME", cache: bool = False 22 | ) -> None: 23 | """ 24 | Initialize the AbstractEmbeddingModel instance with configuration, model details, and caching options. 25 | 26 | :param config_path: Path to the config file. If provided, the config is loaded from the file. Defaults to "". 27 | :type config_path: str 28 | :param model_name: Name of the language model. Defaults to "". 29 | :type model_name: str 30 | :param name: Name of the embedding model. Defaults to "INVALID_NAME". 31 | :type name: str 32 | :param cache: Flag to determine whether to cache responses. Defaults to False. 
33 | :type cache: bool 34 | """ 35 | self.logger = logging.getLogger(self.__class__.__name__) 36 | self.config: Dict = None 37 | self.model_name: str = model_name 38 | self.cache = cache 39 | if self.cache: 40 | self.response_cache: Dict[str, List[Any]] = {} 41 | if config_path is not None: 42 | self.load_config(config_path) 43 | self.name: str = name 44 | try: 45 | if self.config is not None: 46 | if self.config[model_name] is not None: 47 | self.name = self.config[model_name]["name"] 48 | except Exception: 49 | pass 50 | self.prompt_tokens: int = 0 51 | self.cost: float = 0.0 52 | 53 | def load_config(self, path: str) -> None: 54 | """ 55 | Load configuration from a specified path. 56 | 57 | :param path: Path to the config file. 58 | :type path: str 59 | """ 60 | with open(path, "r") as f: 61 | self.config = json.load(f) 62 | 63 | self.logger.debug(f"Loaded config from {path} for {self.model_name}") 64 | 65 | def clear_cache(self) -> None: 66 | """ 67 | Clear the response cache. 68 | """ 69 | self.response_cache.clear() 70 | 71 | @abstractmethod 72 | def load_model(self, device: str = None) -> None: 73 | """ 74 | Abstract method to load the embedding model. 75 | 76 | :param device: The device to load the model on. Defaults to None. 77 | :type device: str 78 | """ 79 | pass 80 | 81 | @abstractmethod 82 | def unload_model(self) -> None: 83 | """ 84 | Abstract method to unload the embedding model. 85 | """ 86 | pass 87 | 88 | @abstractmethod 89 | def generate_embedding(self, input: Union[List[Any], Any]) -> List[List[float]]: 90 | """ 91 | Abstract method to generate embedding for the given input text. 92 | 93 | :param input: The input text to embed. 94 | :type input: Union[List[Any], Any] 95 | :return: The embeddings of the text. 96 | :rtype: List[List[float]] 97 | """ 98 | pass 99 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/clip_vit_large.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main authors: Lorenzo Paleari 8 | # Eric Schreiber 9 | 10 | import gc 11 | from typing import List, Union 12 | 13 | import torch 14 | from PIL.Image import Image 15 | from transformers import CLIPModel, CLIPProcessor 16 | 17 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 18 | 19 | 20 | class ClipVitLarge(AbstractEmbeddingModel): 21 | """ 22 | The ClipVitLarge class handles interactions with the CLIP ViT Large model using the provided configuration. 23 | 24 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 25 | """ 26 | 27 | def __init__( 28 | self, model_name: str = "", name: str = "clip-vit-large-patch-14", cache: bool = False 29 | ) -> None: 30 | """ 31 | Initialize the ClipVitLarge instance with configuration, model details, and caching options. 32 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 33 | :type model_name: str 34 | :param name: Name used for output files. Defaults to "clip-vit-large-patch-14". 35 | :type name: str 36 | :param cache: Flag to determine whether to cache responses. Defaults to False. 
37 | :type cache: bool 38 | """ 39 | super().__init__(model_name=model_name, name=name, cache=cache) 40 | self.processor_name = model_name 41 | 42 | def load_model(self, device: str = None) -> None: 43 | """ 44 | Load the model and tokenizer based on the given model name. 45 | 46 | :param device: The device to load the model on. Defaults to None. 47 | :type device: str 48 | """ 49 | self.model = CLIPModel.from_pretrained(self.model_name).eval() 50 | self.processor = CLIPProcessor.from_pretrained(self.processor_name) 51 | self.model = self.model.to(device) 52 | 53 | def unload_model(self) -> None: 54 | """ 55 | Unload the model and tokenizer. 56 | """ 57 | del self.processor 58 | del self.model 59 | 60 | gc.collect() 61 | torch.cuda.empty_cache() 62 | 63 | self.processor = None 64 | self.model = None 65 | 66 | def generate_embedding(self, input: Union[List[Image], Image]) -> List[List[float]]: 67 | """ 68 | Abstract method to generate embedding for the given input text. 69 | 70 | :param input: The input image to embed. 71 | :type input: Union[List[Image], Image] 72 | :return: The embeddings of the image. 73 | :rtype: List[List[float]] 74 | """ 75 | if not isinstance(input, List): 76 | input = [input] 77 | 78 | total_embeddings = [] 79 | for image in input: 80 | inputs = self.processor(images=image, return_tensors="pt") 81 | inputs = {k: v.to(self.model.device) for k, v in inputs.items()} 82 | with torch.no_grad(): 83 | latents = self.model.get_image_features(**inputs).squeeze().cpu().numpy().tolist() 84 | total_embeddings.append(latents) 85 | return total_embeddings 86 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/e5_mistral_7b_instruct.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import torch 10 | import gc 11 | import torch.nn.functional as F 12 | from torch import Tensor 13 | from tqdm import tqdm 14 | from transformers import AutoTokenizer, AutoModel 15 | 16 | from typing import List, Union 17 | 18 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 19 | 20 | 21 | class E5Mistral7b(AbstractEmbeddingModel): 22 | """ 23 | The E5Mistral7b class handles interactions with the E5Mistral7b embedding model using the provided configuration. 24 | 25 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, model_name: str = "", name: str = "e5-mistral-7B-instruct", cache: bool = False, max_length: int = 4096, batch_size: int = 64 30 | ) -> None: 31 | """ 32 | Initialize the E5Mistral7b instance with configuration, model details, and caching options. 33 | 34 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 35 | :type model_name: str 36 | :param name: Name used for output files. Defaults to "e5-mistral-7B-instruct". 37 | :type name: str 38 | :param cache: Flag to determine whether to cache responses. Defaults to False. 39 | :type cache: bool 40 | :param max_length: The maximum length of the input text. Defaults to 4096. 41 | :type max_length: int 42 | :param batch_size: The batch size to be used for the model. Defaults to 64. 
43 | :type batch_size: int 44 | """ 45 | super().__init__(model_name=model_name, name=name, cache=cache) 46 | self.tokenizer_name = model_name 47 | self.max_length = max_length 48 | self.batch_size = batch_size 49 | 50 | def load_model(self, device: str = None) -> None: 51 | """ 52 | Load the model and tokenizer based on the given model name. 53 | 54 | :param device: The device to load the model on. 55 | :type device: str 56 | """ 57 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name) 58 | self.model = AutoModel.from_pretrained(self.model_name, device_map=device) 59 | 60 | def unload_model(self) -> None: 61 | """ 62 | Unload the model and tokenizer. 63 | """ 64 | del self.tokenizer 65 | del self.model 66 | 67 | gc.collect() 68 | torch.cuda.empty_cache() 69 | 70 | self.tokenizer = None 71 | self.model = None 72 | 73 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 74 | """ 75 | Abstract method to generate embedding for the given input text. 76 | 77 | :param input: The input text to embed. 78 | :type input: Union[List[str], str] 79 | :return: The embeddings of the text. 80 | :rtype: List[List[float]] 81 | """ 82 | if isinstance(input, str): 83 | input = [input] 84 | 85 | total_embeddings = [] 86 | flag = True 87 | 88 | while flag: 89 | try: 90 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 91 | 92 | embeddings = None 93 | outputs = None 94 | batch_dict = None 95 | 96 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 97 | batch_dict = self.tokenizer(batch, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt') 98 | batch_dict.to(self.model.device) 99 | 100 | with torch.no_grad(): 101 | outputs = self.model(**batch_dict) 102 | embeddings = self.last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 103 | 104 | # normalize embeddings 105 | embeddings = F.normalize(embeddings, p=2, dim=1) 106 | total_embeddings.extend(embeddings.cpu().detach().numpy().tolist()) 107 | 108 | del embeddings, outputs, batch_dict 109 | gc.collect() 110 | torch.cuda.empty_cache() 111 | 112 | flag = False 113 | 114 | except Exception as e: 115 | embeddings = None 116 | outputs = None 117 | batch_dict = None 118 | total_embeddings = [] 119 | gc.collect() 120 | torch.cuda.empty_cache() 121 | 122 | print("Error occurred, reducing batch size and retrying") 123 | if self.batch_size == 1: 124 | raise e 125 | self.batch_size = self.batch_size // 2 # reduce batch size by half 126 | 127 | return total_embeddings 128 | 129 | def last_token_pool(self, last_hidden_states: Tensor, 130 | attention_mask: Tensor) -> Tensor: 131 | """ 132 | Pools the last non-padding token's hidden state from the model's output. 133 | 134 | This method extracts the hidden state of the last token that is not a padding token. 135 | If the last token is a padding token, it retrieves the hidden state of the 136 | second to last token that is not a padding token. 137 | 138 | :param last_hidden_states: A tensor containing the hidden states from the last layer of the model. 139 | :type last_hidden_states: Tensor 140 | :param attention_mask: A tensor indicating the positions of non-padding tokens (1 for non-padding, 0 for padding). 141 | :type attention_mask: Tensor 142 | :return: A tensor containing the hidden states of the last non-padding token for each sequence in the batch. 
143 | :rtype: Tensor 144 | """ 145 | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) 146 | if left_padding: 147 | return last_hidden_states[:, -1] 148 | else: 149 | sequence_lengths = attention_mask.sum(dim=1) - 1 150 | batch_size = last_hidden_states.shape[0] 151 | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] 152 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/embeddinggpt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import backoff 10 | import os 11 | from typing import Dict, List, Union 12 | from openai import OpenAI, OpenAIError 13 | from openai.types import CreateEmbeddingResponse 14 | from tqdm import tqdm 15 | from concurrent.futures import ThreadPoolExecutor, as_completed 16 | 17 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 18 | 19 | 20 | class EmbeddingGPT(AbstractEmbeddingModel): 21 | """ 22 | The EmbeddingGPT class handles interactions with the OpenAI embedding models using the provided configuration. 23 | 24 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 25 | """ 26 | 27 | def __init__( 28 | self, config_path: str = "", model_name: str = "gpt-embedding-large", cache: bool = False, max_concurrent_requests: int = 10 29 | ) -> None: 30 | """ 31 | Initialize the EmbeddingGPT instance with configuration, model details, and caching options. 32 | 33 | :param config_path: Path to the configuration file. Defaults to "". 34 | :type config_path: str 35 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to 'gpt-embedding-large'. 36 | :type model_name: str 37 | :param cache: Flag to determine whether to cache responses. Defaults to False. 38 | :type cache: bool 39 | :param max_concurrent_requests: The maximum number of concurrent requests. Defaults to 10. 40 | :type max_concurrent_requests: int 41 | """ 42 | super().__init__(config_path, model_name, cache=cache) 43 | self.config: Dict = self.config[model_name] 44 | # The model_id is the id of the model that is used for chatgpt, i.e. gpt-4, gpt-3.5-turbo, etc. 45 | self.model_id: str = self.config["model_id"] 46 | # The prompt_token_cost and response_token_cost are the costs for 1000 prompt tokens and 1000 response tokens respectively. 47 | self.prompt_token_cost: float = self.config["token_cost"] 48 | self.encoding: str = self.config["encoding"] 49 | self.dimension: int = self.config["dimension"] 50 | # The account organization is the organization that is used for chatgpt. 51 | self.organization: str = self.config["organization"] 52 | if self.config["organization"] == "": 53 | self.logger.warning("OPENAI_ORGANIZATION is not set") 54 | self.api_key: str = os.getenv("OPENAI_API_KEY", self.config["api_key"]) 55 | if os.getenv("OPENAI_API_KEY", self.config["api_key"]) == "": 56 | self.logger.warning("OPENAI_API_KEY is not set") 57 | # Initialize the OpenAI Client 58 | self.client = OpenAI(api_key=self.api_key, organization=self.organization) 59 | 60 | self.max_concurrent_requests = max_concurrent_requests 61 | 62 | def load_model(self, device: str = None) -> None: 63 | """ 64 | Load the embedding model locally. 
65 | 66 | :param device: The device to load the model on. 67 | :type device: str 68 | """ 69 | pass 70 | 71 | def unload_model(self) -> None: 72 | """ 73 | Unload the embedding model locally. 74 | """ 75 | pass 76 | 77 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 78 | """ 79 | Generate embeddings for the given input text. 80 | 81 | :param input: The input texts to embed. 82 | :type input: Union[List[str], str] 83 | :return: The embeddings of the text. 84 | :rtype: List[List[float]] 85 | """ 86 | if isinstance(input, str): 87 | input = [input] 88 | 89 | with ThreadPoolExecutor(max_workers=self.max_concurrent_requests) as executor: 90 | futures = [executor.submit(self.embed_query, i) for i in input] 91 | results = [] 92 | for future in tqdm(as_completed(futures), total=len(futures), desc="Embeddings", leave=False): 93 | try: 94 | response = future.result() 95 | results.append(response.data[0].embedding) 96 | except OpenAIError as e: 97 | self.logger.error(f"OpenAIError: {e}") 98 | except Exception as e: 99 | self.logger.error(f"Unexpected error: {e}") 100 | return results 101 | 102 | @backoff.on_exception(backoff.expo, OpenAIError, max_time=10, max_tries=6) 103 | def embed_query(self, input: str) -> CreateEmbeddingResponse: 104 | """ 105 | Embed the given text into a vector. 106 | 107 | :param input: The text to embed. 108 | :type input: str 109 | :return: The embedding of the text. 110 | :rtype: CreateEmbeddingResponse 111 | """ 112 | response = self.client.embeddings.create( 113 | model=self.model_id, 114 | input=input, 115 | dimensions=self.dimension, 116 | encoding_format=self.encoding, 117 | ) 118 | 119 | self.prompt_tokens += response.usage.prompt_tokens 120 | prompt_tokens_k = float(self.prompt_tokens) / 1000.0 121 | self.cost = ( 122 | self.prompt_token_cost * prompt_tokens_k 123 | ) 124 | self.logger.info( 125 | #f"This is the response from chatgpt: {response}" 126 | f"\nRESPONDED - This is the cost of the response: {self.prompt_token_cost * float(response.usage.prompt_tokens) / 1000.0}" 127 | ) 128 | return response 129 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/gte_qwen1_5_7b_instruct.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import torch 10 | import gc 11 | import torch.nn.functional as F 12 | from torch import Tensor 13 | from tqdm import tqdm 14 | from transformers import AutoTokenizer, AutoModel 15 | 16 | from typing import List, Union 17 | 18 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 19 | 20 | 21 | class GteQwenInstruct(AbstractEmbeddingModel): 22 | """ 23 | The GteQwenInstruct class handles interactions with the gte-Qwen1.5-7B-instruct embedding model using the provided configuration. 24 | 25 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, access_token: str = "", model_name: str = "", name: str = "gte-qwen1.5-7B-instruct", cache: bool = False, max_length: int = 8192, batch_size: int = 64 30 | ) -> None: 31 | """ 32 | Initialize the GteQwenInstruct instance with configuration, model details, and caching options. 33 | 34 | :param access_token: The Hugging Face access token to use for the model. 
Defaults to "". 35 | :type access_token: str 36 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 37 | :type model_name: str 38 | :param name: Name used for output files. Defaults to "gte-qwen1.5-7B-instruct". 39 | :type name: str 40 | :param cache: Flag to determine whether to cache responses. Defaults to False. 41 | :type cache: bool 42 | :param max_length: The maximum length of the input text. Defaults to 8192. 43 | :type max_length: int 44 | :param batch_size: The batch size to be used for the model. Defaults to 64. 45 | :type batch_size: int 46 | """ 47 | super().__init__(model_name=model_name, name=name, cache=cache) 48 | self.tokenizer_name = model_name 49 | self.max_length = max_length 50 | self.access_token = access_token 51 | self.batch_size = batch_size 52 | 53 | def load_model(self, device: str = None) -> None: 54 | """ 55 | Load the model and tokenizer based on the given model name. 56 | 57 | :param device: The device to load the model on. 58 | :type device: str 59 | """ 60 | 61 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, trust_remote_code=True, token=self.access_token) 62 | self.model = AutoModel.from_pretrained(self.model_name, device_map=device, trust_remote_code=True, token=self.access_token) 63 | 64 | def unload_model(self) -> None: 65 | """ 66 | Unload the model and tokenizer. 67 | """ 68 | del self.tokenizer 69 | del self.model 70 | 71 | gc.collect() 72 | torch.cuda.empty_cache() 73 | 74 | self.tokenizer = None 75 | self.model = None 76 | 77 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 78 | """ 79 | Abstract method to generate embedding for the given input text. 80 | 81 | :param input: The input text to embed. 82 | :type input: Union[List[str], str] 83 | :return: The embeddings of the text. 84 | :rtype: List[List[float]] 85 | """ 86 | if isinstance(input, str): 87 | input = [input] 88 | 89 | total_embeddings = [] 90 | flag = True 91 | 92 | while flag: 93 | try: 94 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 95 | 96 | embeddings = None 97 | outputs = None 98 | batch_dict = None 99 | 100 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 101 | batch_dict = self.tokenizer(batch, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt') 102 | batch_dict.to(self.model.device) 103 | 104 | with torch.no_grad(): 105 | outputs = self.model(**batch_dict) 106 | embeddings = self.last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 107 | 108 | # normalize embeddings 109 | embeddings = F.normalize(embeddings, p=2, dim=1) 110 | total_embeddings.extend(embeddings.cpu().detach().numpy().tolist()) 111 | 112 | del embeddings, outputs, batch_dict 113 | gc.collect() 114 | torch.cuda.empty_cache() 115 | 116 | flag = False 117 | 118 | except Exception as e: 119 | embeddings = None 120 | outputs = None 121 | batch_dict = None 122 | total_embeddings = [] 123 | gc.collect() 124 | torch.cuda.empty_cache() 125 | 126 | print("Error occurred, reducing batch size and retrying") 127 | if self.batch_size == 1: 128 | raise e 129 | self.batch_size = self.batch_size // 2 # reduce batch size by half 130 | 131 | return total_embeddings 132 | 133 | def last_token_pool(self, last_hidden_states: Tensor, 134 | attention_mask: Tensor) -> Tensor: 135 | """ 136 | Pools the last non-padding token's hidden state from the model's output. 
137 | 138 | This method extracts the hidden state of the last token that is not a padding token. 139 | If the last token is a padding token, it retrieves the hidden state of the 140 | second to last token that is not a padding token. 141 | 142 | :param last_hidden_states: A tensor containing the hidden states from the last layer of the model. 143 | :type last_hidden_states: Tensor 144 | :param attention_mask: A tensor indicating the positions of non-padding tokens (1 for non-padding, 0 for padding). 145 | :type attention_mask: Tensor 146 | :return: A tensor containing the hidden states of the last non-padding token for each sequence in the batch. 147 | :rtype: Tensor 148 | """ 149 | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) 150 | if left_padding: 151 | return last_hidden_states[:, -1] 152 | else: 153 | sequence_lengths = attention_mask.sum(dim=1) - 1 154 | batch_size = last_hidden_states.shape[0] 155 | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] 156 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/sfr_embedding_mistral.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import torch 10 | import gc 11 | import torch.nn.functional as F 12 | from torch import Tensor 13 | from tqdm import tqdm 14 | from transformers import AutoTokenizer, AutoModel 15 | 16 | from typing import List, Union 17 | 18 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 19 | 20 | 21 | class SFREmbeddingMistral(AbstractEmbeddingModel): 22 | """ 23 | The SFREmbeddingMistral class handles interactions with the SFR Embedding Mistral model using the provided configuration. 24 | 25 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, model_name: str = "", name: str = "sfr-embedding-mistral", cache: bool = False, max_length: int = 4096, batch_size: int = 64 30 | ) -> None: 31 | """ 32 | Initialize the SFR Embedding Mistral instance with configuration, model details, and caching options. 33 | 34 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 35 | :type model_name: str 36 | :param name: Name used for output files. Defaults to "sfr-embedding-mistral". 37 | :type name: str 38 | :param cache: Flag to determine whether to cache responses. Defaults to False. 39 | :type cache: bool 40 | :param max_length: The maximum length of the input text. Defaults to 4096. 41 | :type max_length: int 42 | :param batch_size: The batch size to be used for the model. Defaults to 64. 43 | :type batch_size: int 44 | """ 45 | super().__init__(model_name=model_name, name=name, cache=cache) 46 | self.tokenizer_name = model_name 47 | self.max_length = max_length 48 | self.batch_size = batch_size 49 | 50 | def load_model(self, device: str = None) -> None: 51 | """ 52 | Load the model and tokenizer based on the given model name. 53 | 54 | :param device: The device to load the model on. 
55 | :type device: str 56 | """ 57 | 58 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name) 59 | self.model = AutoModel.from_pretrained(self.model_name, device_map=device) 60 | 61 | def unload_model(self) -> None: 62 | """ 63 | Unload the model and tokenizer. 64 | """ 65 | del self.tokenizer 66 | del self.model 67 | 68 | gc.collect() 69 | torch.cuda.empty_cache() 70 | 71 | self.tokenizer = None 72 | self.model = None 73 | 74 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 75 | """ 76 | Abstract method to generate embedding for the given input text. 77 | 78 | :param input: The input text to embed. 79 | :type input: Union[List[str], str] 80 | :return: The embeddings of the text. 81 | :rtype: List[List[float]] 82 | """ 83 | if isinstance(input, str): 84 | input = [input] 85 | 86 | total_embeddings = [] 87 | flag = True 88 | 89 | while flag: 90 | try: 91 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 92 | 93 | embeddings = None 94 | outputs = None 95 | batch_dict = None 96 | 97 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 98 | batch_dict = self.tokenizer(batch, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt') 99 | batch_dict.to(self.model.device) 100 | 101 | with torch.no_grad(): 102 | outputs = self.model(**batch_dict) 103 | embeddings = self.last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 104 | 105 | # normalize embeddings 106 | embeddings = F.normalize(embeddings, p=2, dim=1) 107 | total_embeddings.extend(embeddings.cpu().detach().numpy().tolist()) 108 | 109 | del embeddings, outputs, batch_dict 110 | gc.collect() 111 | torch.cuda.empty_cache() 112 | 113 | flag = False 114 | 115 | except Exception as e: 116 | embeddings = None 117 | outputs = None 118 | batch_dict = None 119 | total_embeddings = [] 120 | gc.collect() 121 | torch.cuda.empty_cache() 122 | 123 | print("Error occurred, reducing batch size and retrying") 124 | if self.batch_size == 1: 125 | raise e 126 | self.batch_size = self.batch_size // 2 # reduce batch size by half 127 | 128 | return total_embeddings 129 | 130 | def last_token_pool(self, last_hidden_states: Tensor, 131 | attention_mask: Tensor) -> Tensor: 132 | """ 133 | Pools the last non-padding token's hidden state from the model's output. 134 | 135 | This method extracts the hidden state of the last token that is not a padding token. 136 | If the last token is a padding token, it retrieves the hidden state of the 137 | second to last token that is not a padding token. 138 | 139 | :param last_hidden_states: A tensor containing the hidden states from the last layer of the model. 140 | :type last_hidden_states: Tensor 141 | :param attention_mask: A tensor indicating the positions of non-padding tokens (1 for non-padding, 0 for padding). 142 | :type attention_mask: Tensor 143 | :return: A tensor containing the hidden states of the last non-padding token for each sequence in the batch. 
144 | :rtype: Tensor 145 | """ 146 | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) 147 | if left_padding: 148 | return last_hidden_states[:, -1] 149 | else: 150 | sequence_lengths = attention_mask.sum(dim=1) - 1 151 | batch_size = last_hidden_states.shape[0] 152 | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] 153 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/stella.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import gc 11 | import torch 12 | from tqdm import tqdm 13 | from transformers import AutoModel, AutoTokenizer 14 | from sklearn.preprocessing import normalize 15 | from huggingface_hub import snapshot_download 16 | 17 | from typing import List, Literal, Union 18 | 19 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 20 | 21 | 22 | class Stella(AbstractEmbeddingModel): 23 | """ 24 | The Stella class handles interactions with the Stella embedding model family using the provided configuration. 25 | 26 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 27 | """ 28 | 29 | def __init__( 30 | self, model_name: str = "", variant: Literal["400M-v5", "1.5B-v5", ""] = "400M-v5", name: str = "stella-en-", cache: bool = False, max_length: int = 4096, batch_size: int = 64 31 | ) -> None: 32 | """ 33 | Initialize the Stella instance with configuration, model details, and caching options. 34 | 35 | :param model_name: Name of the model, default is "". Used to select the correct configuration. 36 | :type model_name: str 37 | :param variant: The variant of the Stella model to use. Defaults to "400M_v5". 38 | :type variant: Literal["400M-v5", "1.5B-v5", ""] 39 | :param name: Name used for output files. Defaults to "stella-en-". 40 | :type name: str 41 | :param cache: Flag to determine whether to cache responses. Defaults to False. 42 | :type cache: bool 43 | :param max_length: The maximum length of the input text. Defaults to 4096. 44 | :type max_length: int 45 | :param batch_size: The batch size to be used for the model. Defaults to 64. 46 | :type batch_size: int 47 | """ 48 | super().__init__(model_name=model_name, name=name + variant, cache=cache) 49 | self.max_length = max_length 50 | self.batch_size = batch_size 51 | 52 | def load_model(self, device: str = None) -> None: 53 | """ 54 | Load the model and tokenizer based on the given model name. 55 | 56 | :param device: The device to load the model on. 
57 | :type device: str 58 | """ 59 | try: 60 | model_dir = snapshot_download(repo_id=self.model_name) 61 | except Exception as e: 62 | raise ValueError(f"Model {self.model_name} not found in the Hugging Face Hub") from e 63 | vector_linear_directory = f"2_Dense_{self.max_length}" 64 | self.model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).to(device).eval() 65 | self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) 66 | self.vector_linear = torch.nn.Linear(in_features=self.model.config.hidden_size, out_features=self.max_length) 67 | vector_linear_dict = { 68 | k.replace("linear.", ""): v for k, v in 69 | torch.load(os.path.join(model_dir, f"{vector_linear_directory}/pytorch_model.bin")).items() 70 | } 71 | self.vector_linear.load_state_dict(vector_linear_dict) 72 | self.vector_linear.to(device) 73 | 74 | def unload_model(self) -> None: 75 | """ 76 | Unload the model and tokenizer from memory. 77 | """ 78 | del self.model 79 | 80 | gc.collect() 81 | torch.cuda.empty_cache() 82 | 83 | self.model = None 84 | 85 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 86 | """ 87 | Generate the embeddings for the given input text. 88 | 89 | :param input: The input text to embed. 90 | :type input: Union[List[str], str] 91 | :return: The embeddings of the text. 92 | :rtype: List[List[float]] 93 | """ 94 | if isinstance(input, str): 95 | input = [input] 96 | 97 | total_embeddings = [] 98 | flag = True 99 | 100 | while flag: 101 | try: 102 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 103 | 104 | batch_dict = None 105 | attention_mask = None 106 | last_hidden_state = None 107 | last_hidden = None 108 | docs_vectors = None 109 | 110 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 111 | with torch.no_grad(): 112 | batch_dict = self.tokenizer(batch, padding="longest", truncation=True, max_length=512, return_tensors="pt") 113 | batch_dict = {k: v.to(self.model.device) for k, v in batch_dict.items()} 114 | attention_mask = batch_dict["attention_mask"] 115 | last_hidden_state = self.model(**batch_dict)[0] 116 | last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0) 117 | docs_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] 118 | docs_vectors = normalize(self.vector_linear(docs_vectors).cpu().detach().numpy()) 119 | 120 | total_embeddings.extend(docs_vectors.tolist()) 121 | 122 | del batch_dict, attention_mask, last_hidden_state, last_hidden, docs_vectors 123 | gc.collect() 124 | torch.cuda.empty_cache() 125 | 126 | flag = False 127 | 128 | except Exception as e: 129 | batch_dict = None 130 | attention_mask = None 131 | last_hidden_state = None 132 | last_hidden = None 133 | docs_vectors = None 134 | total_embeddings = [] 135 | gc.collect() 136 | torch.cuda.empty_cache() 137 | 138 | print("Error occurred, reducing batch size and retrying") 139 | if self.batch_size == 1: 140 | raise e 141 | self.batch_size = self.batch_size // 2 142 | 143 | return total_embeddings 144 | -------------------------------------------------------------------------------- /CheckEmbed/language_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_language_model import AbstractLanguageModel 2 | from .chatgpt import ChatGPT 3 | from .chatollama import LLMChatOllama 
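
A minimal usage sketch tying the embedding classes above together: it instantiates one of the local embedding models exported from `CheckEmbed.embedding_models`, embeds a few texts, and frees the GPU memory afterwards. The device string, the example sentences, and calling the model directly (rather than through the `Scheduler`) are illustrative assumptions, not the framework's prescribed workflow.

```python
from CheckEmbed.embedding_models import Stella

# Instantiate a local embedding model (see stella.py above for the constructor arguments).
stella = Stella(
    model_name="NovaSearch/stella_en_400M_v5",
    cache=False,
    batch_size=64,
)

# load_model() fetches the weights from the Hugging Face Hub on first use.
# "cuda:0" is an assumed device name; any torch device string works.
stella.load_model(device="cuda:0")

# generate_embedding() returns one embedding (a list of floats) per input text.
embeddings = stella.generate_embedding([
    "The quick brown fox jumps over the lazy dog.",
    "A second sample text to embed.",
])
print(len(embeddings), len(embeddings[0]))  # number of texts, embedding dimension

# unload_model() releases the model and clears the CUDA cache.
stella.unload_model()
```
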
-------------------------------------------------------------------------------- /CheckEmbed/language_models/abstract_language_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # original file from Graph of Thoughts framework: 8 | # https://github.com/spcl/graph-of-thoughts 9 | # 10 | # main author: Nils Blach 11 | # 12 | # modifications: Lorenzo Paleari 13 | 14 | from abc import ABC, abstractmethod 15 | from typing import Any, Dict, List, Union 16 | import json 17 | import logging 18 | 19 | 20 | class AbstractLanguageModel(ABC): 21 | """ 22 | Abstract base class that defines the interface for all language models. 23 | """ 24 | 25 | # modified by Lorenzo Paleari 26 | def __init__( 27 | self, config_path: str = None, model_name: str = "", cache: bool = False 28 | ) -> None: 29 | """ 30 | Initialize the AbstractLanguageModel instance with configuration, model details, and caching options. 31 | 32 | :param config_path: Path to the config file. Defaults to None. If provided, the config is loaded from the file. 33 | :type config_path: str 34 | :param model_name: Name of the language model. Defaults to "". 35 | :type model_name: str 36 | :param cache: Flag to determine whether to cache responses. Defaults to False. 37 | :type cache: bool 38 | """ 39 | self.logger = logging.getLogger(self.__class__.__name__) 40 | self.config: Dict = None 41 | self.model_name: str = model_name 42 | self.cache = cache 43 | if self.cache: 44 | self.response_cache: Dict[str, List[Any]] = {} 45 | if config_path is not None: 46 | self.load_config(config_path) 47 | self.name: str = self.config[model_name]["name"] 48 | self.prompt_tokens: int = 0 49 | self.completion_tokens: int = 0 50 | self.cost: float = 0.0 51 | 52 | # modified by Lorenzo Paleari 53 | def load_config(self, path: str) -> None: 54 | """ 55 | Load configuration from a specified path. 56 | 57 | :param path: Path to the config file. 58 | :type path: str 59 | """ 60 | with open(path, "r") as f: 61 | self.config = json.load(f) 62 | 63 | self.logger.debug(f"Loaded config from {path} for {self.model_name}") 64 | 65 | def clear_cache(self) -> None: 66 | """ 67 | Clear the response cache. 68 | """ 69 | self.response_cache.clear() 70 | 71 | # written by Lorenzo Paleari 72 | @abstractmethod 73 | def load_model(self, device: str = None) -> None: 74 | """ 75 | Abstract method to load the language model. 76 | 77 | :param device: The device to load the model on. 78 | :type device: str 79 | """ 80 | pass 81 | 82 | # written by Lorenzo Paleari 83 | @abstractmethod 84 | def unload_model(self) -> None: 85 | """ 86 | Abstract method to unload the language model. 87 | """ 88 | pass 89 | 90 | # modified by Lorenzo Paleari 91 | @abstractmethod 92 | def query(self, query: str, num_query: int = 1) -> Any: 93 | """ 94 | Abstract method to query the language model. 95 | 96 | :param query: The prompt that is going to be used as query to the language model. 97 | :type query: str 98 | :param num_query: The number of queries to be posed to the language model for each prompt. Defaults to 1. 99 | :type num_query: int 100 | :return: The language model's response(s). 
101 | :rtype: Any 102 | """ 103 | pass 104 | 105 | # modified by Lorenzo Paleari 106 | @abstractmethod 107 | def get_response_texts(self, query_response: Union[List[Any], Any]) -> List[str]: 108 | """ 109 | Abstract method to extract response texts from the language model's response(s). 110 | 111 | :param query_response: The responses returned from the language model. 112 | :type query_response: Union[List[Any], Any] 113 | :return: List of textual responses. 114 | :rtype: List[str] 115 | """ 116 | pass 117 | -------------------------------------------------------------------------------- /CheckEmbed/language_models/chatgpt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # original file from Graph of Thoughts framework: 8 | # https://github.com/spcl/graph-of-thoughts 9 | # 10 | # main author: Nils Blach 11 | # 12 | # modifications: Lorenzo Paleari 13 | 14 | 15 | from concurrent.futures import ThreadPoolExecutor, as_completed 16 | from typing import Dict, List, Union 17 | 18 | import backoff 19 | from openai import OpenAI, OpenAIError 20 | from openai.types.chat.chat_completion import ChatCompletion 21 | from tqdm import tqdm 22 | 23 | from CheckEmbed.language_models import AbstractLanguageModel 24 | 25 | 26 | class ChatGPT(AbstractLanguageModel): 27 | """ 28 | The ChatGPT class handles interactions with the OpenAI models using the provided configuration. 29 | 30 | Inherits from the AbstractLanguageModel class and implements its abstract methods. 31 | """ 32 | 33 | # modified by Lorenzo Paleari 34 | def __init__( 35 | self, config_path: str = "", model_name: str = "chatgpt4", cache: bool = False, max_concurrent_requests: int = 10, temperature: float = None 36 | ) -> None: 37 | """ 38 | Initialize the ChatGPT instance with configuration, model details, and caching options. 39 | 40 | :param config_path: Path to the configuration file. Defaults to "". 41 | :type config_path: str 42 | :param model_name: Name of the model, default is 'chatgpt4'. Used to select the correct configuration. 43 | :type model_name: str 44 | :param cache: Flag to determine whether to cache responses. Defaults to False. 45 | :type cache: bool 46 | :param max_concurrent_requests: The maximum number of concurrent requests. Defaults to 10. 47 | :type max_concurrent_requests: int 48 | :param temperature: The temperature for the model. If not provided, it will be taken from the config. 49 | :type temperature: float 50 | """ 51 | super().__init__(config_path, model_name, cache) 52 | self.config: Dict = self.config[model_name] 53 | # The model_id is the id of the model that is used for chatgpt, i.e. gpt-4, gpt-3.5-turbo, etc. 54 | self.model_id: str = self.config["model_id"] 55 | self.name = self.config["name"] 56 | # The prompt_token_cost and response_token_cost are the costs for 1000 prompt tokens and 1000 response tokens respectively. 57 | self.prompt_token_cost: float = self.config["prompt_token_cost"] 58 | self.response_token_cost: float = self.config["response_token_cost"] 59 | # The temperature of a model is defined as the randomness of the model's output. 60 | self.temperature: float = temperature if temperature is not None else self.config["temperature"] 61 | # The maximum number of tokens to generate in the chat completion. 
62 | self.max_tokens: int = self.config["max_tokens"] 63 | # The stop sequence is a sequence of tokens that the model will stop generating at (it will not generate the stop sequence). 64 | self.stop: Union[str, List[str]] = self.config["stop"] 65 | # The account organization is the organization that is used for chatgpt. 66 | self.organization: str = self.config["organization"] 67 | if self.config["organization"] == "": 68 | self.logger.warning("OPENAI_ORGANIZATION is not set") 69 | self.api_key: str = self.config["api_key"] 70 | if self.config["api_key"] == "": 71 | self.logger.warning("OPENAI_API_KEY is not set") 72 | # Initialize the OpenAI Client 73 | self.client = OpenAI(api_key=self.api_key, organization=self.organization) 74 | 75 | self.max_concurrent_requests = max_concurrent_requests 76 | 77 | # written by Lorenzo Paleari 78 | def load_model(self, device: str = None) -> None: 79 | """ 80 | Load the language model locally. 81 | 82 | :param device: The device to load the model on. 83 | :type device: str 84 | """ 85 | pass 86 | 87 | # written by Lorenzo Paleari 88 | def unload_model(self) -> None: 89 | """ 90 | Unload the language model locally. 91 | """ 92 | pass 93 | 94 | # modified by Lorenzo Paleari 95 | def query( 96 | self, query: str, num_query: int = 1 97 | ) -> List[ChatCompletion]: 98 | """ 99 | Query the OpenAI model for responses. 100 | 101 | :param query: The prompt that is going to be used as query to the language model. 102 | :type query: str 103 | :param num_query: The number of queries to be posed to the language model for each prompt. Defaults to 1. 104 | :type num_query: int 105 | :return: Response(s) from the OpenAI model. 106 | :rtype: List[ChatCompletion] 107 | """ 108 | if self.cache and query in self.response_cache: 109 | self.logger.debug(f"Used cache for query: {query}") 110 | return self.response_cache[query] 111 | 112 | with ThreadPoolExecutor(max_workers=self.max_concurrent_requests) as executor: 113 | futures = [executor.submit(self.chat, [{"role": "user", "content": query}], 1) for _ in range(num_query)] 114 | results = [] 115 | for future in tqdm(as_completed(futures), total=num_query, desc="Samples", leave=False): 116 | try: 117 | response = future.result() 118 | results.append(response) 119 | except OpenAIError as e: 120 | self.logger.error(f"OpenAIError: {e}") 121 | except Exception as e: 122 | self.logger.error(f"Unexpected error: {e}") 123 | 124 | if self.cache: 125 | self.response_cache[query] = results 126 | return results 127 | 128 | 129 | @backoff.on_exception(backoff.expo, OpenAIError, max_time=10, max_tries=6) 130 | def chat(self, messages: List[Dict], num_responses: int = 1) -> ChatCompletion: 131 | """ 132 | Send chat messages to the OpenAI model and retrieves the model's response. 133 | Implements backoff on OpenAI error. 134 | 135 | :param messages: A list of message dictionaries for the chat. 136 | :type messages: List[Dict] 137 | :param num_responses: Number of desired responses, default is 1. 138 | :type num_responses: int 139 | :return: The OpenAI model's response. 
140 | :rtype: ChatCompletion 141 | """ 142 | response = self.client.chat.completions.create( 143 | model=self.model_id, 144 | messages=messages, 145 | temperature=self.temperature, 146 | max_tokens=self.max_tokens, 147 | n=num_responses, 148 | stop=self.stop, 149 | ) 150 | self.prompt_tokens += response.usage.prompt_tokens 151 | self.completion_tokens += response.usage.completion_tokens 152 | prompt_tokens_k = float(self.prompt_tokens) / 1000.0 153 | completion_tokens_k = float(self.completion_tokens) / 1000.0 154 | self.cost = ( 155 | self.prompt_token_cost * prompt_tokens_k 156 | + self.response_token_cost * completion_tokens_k 157 | ) 158 | self.logger.info( 159 | #f"This is the response from chatgpt: {response}" 160 | f"This is the cost of the response: {self.prompt_token_cost * float(response.usage.prompt_tokens) / 1000.0 + self.response_token_cost * float(response.usage.completion_tokens) / 1000.0}" 161 | ) 162 | return response 163 | 164 | def get_response_texts( 165 | self, query_response: Union[List[ChatCompletion], ChatCompletion] 166 | ) -> List[str]: 167 | """ 168 | Extract the response texts from the query response. 169 | 170 | :param query_response: The response dictionary (or list of dictionaries) from the OpenAI model. 171 | :type query_response: Union[List[ChatCompletion], ChatCompletion] 172 | :return: List of response strings. 173 | :rtype: List[str] 174 | """ 175 | if not isinstance(query_response, List): 176 | query_response = [query_response] 177 | return [ 178 | choice.message.content 179 | for response in query_response 180 | for choice in response.choices 181 | ] 182 | -------------------------------------------------------------------------------- /CheckEmbed/language_models/chatollama.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | 10 | from typing import Dict, List, Union 11 | 12 | from langchain_ollama import ChatOllama 13 | from pydantic import BaseModel 14 | 15 | from CheckEmbed.language_models import AbstractLanguageModel 16 | 17 | 18 | class LLMChatOllama(AbstractLanguageModel): 19 | """ 20 | The LLMChatOllama class handles interactions with Ollama models using the provided configuration. 21 | 22 | Inherits from the AbstractLanguageModel class and implements its abstract methods. 23 | """ 24 | 25 | def __init__( 26 | self, config_path: str = "", model_name: str = "llama8b", cache: bool = False, temperature: float = None 27 | ) -> None: 28 | """ 29 | Initialize the LLMChatOllama instance with configuration, model details, and caching options. 30 | 31 | :param config_path: Path to the configuration file. Defaults to "". 32 | :type config_path: str 33 | :param model_name: Name of the model, default is 'llama8b'. Used to select the correct configuration. 34 | :type model_name: str 35 | :param cache: Flag to determine whether to cache responses. Defaults to False. 36 | :type cache: bool 37 | :param temperature: The temperature for the model. If not provided, it will be taken from the config. 
38 | :type temperature: float 39 | """ 40 | super().__init__(config_path, model_name, cache) 41 | self.config: Dict = self.config[model_name] 42 | self.model_id: str = self.config["model_id"] 43 | self.name = self.config["name"] 44 | self.num_ctx = self.config["num_ctx"] 45 | self.num_predict = self.config["num_predict"] 46 | self.num_batch = self.config["num_batch"] 47 | self.keep_alive = self.config["keep_alive"] 48 | self.temperature: float = temperature if temperature is not None else self.config["temperature"] 49 | # Initialize the Ollama Client 50 | self.client = ChatOllama( 51 | model=self.model_id, 52 | temperature=self.temperature, 53 | base_url="localhost:11434", 54 | num_ctx=self.num_ctx, 55 | num_predict=self.num_predict, 56 | num_batch=self.num_batch, 57 | keep_alive=self.keep_alive, 58 | ) 59 | 60 | def load_model(self, device: str = None) -> None: 61 | """ 62 | Load the language model locally. 63 | 64 | :param device: The device to load the model on. 65 | :type device: str 66 | """ 67 | pass 68 | 69 | def unload_model(self) -> None: 70 | """ 71 | Unload the language model locally. 72 | """ 73 | pass 74 | 75 | def add_structured_output(self, response: BaseModel) -> None: 76 | """ 77 | Add structured output to the response. 78 | 79 | :param response: The response from the language model. 80 | :type response: BaseModel 81 | """ 82 | self.client = self.client.with_structured_output( 83 | response, method="json_schema" 84 | ) 85 | 86 | def query( 87 | self, query: str, num_query: int = 1 88 | ) -> str: 89 | """ 90 | Query the Ollama model for responses. 91 | 92 | :param query: The prompt that is going to be used as query to the language model. 93 | :type query: str 94 | :param num_query: The number of queries to be posed to the language model for each prompt. Defaults to 1. 95 | :type num_query: int 96 | :return: Response(s) from the Ollama model. 97 | :rtype: str 98 | """ 99 | if self.cache and query in self.response_cache: 100 | self.logger.debug(f"Used cache for query: {query}") 101 | return self.response_cache[query] 102 | 103 | result = self.client.invoke( 104 | query 105 | ) 106 | 107 | if self.cache: 108 | self.response_cache[query] = result 109 | return result 110 | 111 | 112 | def get_response_texts( 113 | self, query_response: Union[List[str], str] 114 | ) -> List[str]: 115 | """ 116 | Extract the response texts from the query response. 117 | 118 | :param query_response: The response dictionary (or list of dictionaries) from the Ollama model. 119 | :type query_response: Union[List[ChatCompletion], ChatCompletion] 120 | :return: List of response strings. 121 | :rtype: List[str] 122 | """ 123 | pass 124 | -------------------------------------------------------------------------------- /CheckEmbed/operations/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | This section aims to provide insights into the results generated by the default operations and offers useful tips on how to effectively use these results for a correct evaluation. 4 | 5 | To aid in the visualization and the correct interpretation of the results, it is recommended to take a look at the plots that will be generated by executing any of the experiments in the examples folder. 6 | 7 | ## SelfCheckGPT 8 | 9 | The SelfCheckGPT baseline is usually evaluated based on three metrics. The results are stored in a JSON file inside the SelfCheckGPT folder, that is created during the execution of the pipeline. 
10 | - `result`: A list of all sentence-level scores for the prompt indicated by `index`. Each score is computed as $`1 - \text{SelfCheckGPTscore}`$, since the original score indicates the hallucination level of a sentence, whereas the CheckEmbed pipeline measures the level of similarity. 11 | - `passage_score`: Aggregated score for the complete passage, computed with the formula 12 | $$S_{\text{passage}} = \frac{1}{|R|} \sum_{i} S(i)$$ 13 | where $`|R|`$ is the number of sentences and $`S(i)`$ is the score associated with sentence $`i`$. 14 | - `std_dev`: The standard deviation of the sentence-level scores. 15 | 16 | An interpretation of the results and more information on the significance of the respective scores can be found in the [SelfCheckGPT paper](https://arxiv.org/pdf/2303.08896). 17 | 18 | ## BertScore - CheckEmbed 19 | 20 | Both BertScore and CheckEmbed use similar metrics: 21 | - `result \ cosine_sim`: A cosine similarity matrix representing the cosine similarity score for each pair of embeddings compared. 22 | - `frobenius_norm \ frob_norm_cosine_sim`: The Frobenius norm obtained from the cosine similarity matrix. 23 | - `std_dev \ std_dev_cosine_sim`: The standard deviation of the cosine similarity scores within the matrix. 24 | 25 | Additionally, CheckEmbed results can be evaluated using the following metrics: 26 | - `pearson_corr`: A Pearson correlation matrix representing the Pearson correlation score for each pair of embeddings compared. 27 | - `frob_norm_pearson_corr`: The Frobenius norm obtained from the Pearson correlation matrix. 28 | - `std_dev_pearson_corr`: The standard deviation of the Pearson correlation scores within the matrix. 29 | 30 | Additionally, the ground truth can be incorporated into the evaluation, if available. 31 | 32 | ## Frobenius Norm 33 | 34 | The following sections reference only the Frobenius norm, but the same concepts also apply to the Pearson correlation. 35 | 36 | ### With Ground-Truth 37 | 38 | Evaluation with ground-truth data is straightforward. 39 | It is sufficient to compare the cosine similarity results to determine whether the generated sample is closer to or further from the correct answer. 40 | The ground-truth results are represented as the last row of the cosine similarity matrix. 41 | Remember that the threshold for considering a result correct can vary depending on the embedding model used. 42 | 43 | ### Without Ground-Truth 44 | 45 | To evaluate results in the absence of ground truth, the full cosine similarity matrix should be considered to understand how close the different samples are to each other. 46 | Keeping in mind that the thresholds can differ depending on the embedding model used, the general stability of the answer can be assessed by examining the matrix. 47 | 48 | The `standard deviation` can additionally help to assess the correctness of an answer: a higher standard deviation indicates higher variance and uncertainty in the LLM answers, while a lower one indicates stability. 49 | 50 | The `Frobenius norm` is useful as a single score for the whole similarity matrix, avoiding the need to compare multiple matrices with each other. 51 | Please keep in mind that the Frobenius norm always outputs a positive number, treating negative values as positive (by squaring them). 52 | It is therefore recommended to also look at the `standard deviation` results in order to identify situations where two opposite results (-0.7 and 0.7) end up with a similarly high Frobenius score, as illustrated by the sketch below.
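The following minimal sketch illustrates this effect on two toy 3×3 cosine similarity matrices, one stable and one containing opposite results. It uses a plain normalized Frobenius norm over the off-diagonal entries and omits the additional rebasing performed by the actual CheckEmbed operations, so the exact numbers differ from the pipeline output; only the qualitative behavior is shown.

```python
import numpy as np

def frob_no_diag(matrix: np.ndarray) -> float:
    # Plain normalized Frobenius norm over the off-diagonal entries only.
    off_diag = matrix[~np.eye(matrix.shape[0], dtype=bool)]
    return float(np.sqrt(np.mean(off_diag ** 2)))

def std_no_diag(matrix: np.ndarray) -> float:
    # Standard deviation over the off-diagonal entries only.
    off_diag = matrix[~np.eye(matrix.shape[0], dtype=bool)]
    return float(np.std(off_diag))

stable = np.array([[1.0, 0.7, 0.7],
                   [0.7, 1.0, 0.7],
                   [0.7, 0.7, 1.0]])

unstable = np.array([[ 1.0,  0.7, -0.7],
                     [ 0.7,  1.0, -0.7],
                     [-0.7, -0.7,  1.0]])

print(frob_no_diag(stable), std_no_diag(stable))      # ~0.70, 0.00
print(frob_no_diag(unstable), std_no_diag(unstable))  # ~0.70, ~0.66
```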
53 | The resulting high standard deviation should suggest the need to examine the cosine similarity matrix more closely to better understand the results. 54 | -------------------------------------------------------------------------------- /CheckEmbed/operations/__init__.py: -------------------------------------------------------------------------------- 1 | from .operations import Operation 2 | from .bertscore_operation import BertScoreOperation 3 | from .checkembed_operation import CheckEmbedOperation 4 | from .selfcheckgpt_operation import SelfCheckGPT_Operation, SelfCheckGPT_BERT_Operation, SelfCheckGPT_NLI_Operation 5 | from .llm_as_a_judge_operation import LLMAsAJudgeOperation 6 | -------------------------------------------------------------------------------- /CheckEmbed/operations/bertscore_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import logging 10 | import os 11 | import json 12 | 13 | import bert_score 14 | import numpy as np 15 | from tqdm import tqdm 16 | from typing import Any 17 | from timeit import default_timer as timer 18 | 19 | from CheckEmbed.operations import Operation 20 | from CheckEmbed.utility import capture_specific_stderr, frobenius_norm_no_diag, matrix_std_dev_no_diag 21 | 22 | class BertScoreOperation(Operation): 23 | """ 24 | Operation that computes the BertScore between the reference and the sample embeddings. 25 | 26 | Inherits from the Operation class and implements its abstract methods. 27 | """ 28 | 29 | def __init__(self, result_dir_path: str, sample_dir_path: str) -> None: 30 | """ 31 | Initialize the operation. 32 | 33 | :param result_dir_path: The path to the directory where the results will be stored. 34 | :type result_dir_path: str 35 | :param sample_dir_path: The path to the directory where the samples are stored. 36 | :type sample_dir_path: str 37 | """ 38 | super().__init__(result_dir_path) 39 | self.sample_dir_path = sample_dir_path 40 | 41 | def execute(self, custom_inputs: Any) -> Any: 42 | """ 43 | Execute the operation on the embeddings/samples. 44 | 45 | :param custom_inputs: The custom inputs for the operation. 
46 | :type custom_inputs: any 47 | """ 48 | 49 | print("\n\nRunning BertScore operation.") 50 | time_performance = custom_inputs["time_performance"] 51 | 52 | # Initialize logging 53 | logging.basicConfig( 54 | filename=os.path.join(self.result_dir_path, "log.log"), 55 | filemode="w", 56 | format="%(name)s - %(levelname)s - %(message)s", 57 | level=custom_inputs["logging_level"], 58 | ) 59 | 60 | if time_performance: 61 | with open(os.path.join(self.sample_dir_path, "runtimes", "performance_log.log"), "a") as f: 62 | f.write("\n\nBERTScore operation\n") 63 | 64 | # Run BertScore for every pair of language model and samples 65 | performance_times = [] 66 | for lm_name in (pbar := tqdm(custom_inputs["lm_names"], desc="Language Models", leave=True)): 67 | pbar.set_postfix_str(f"{lm_name}") 68 | logging.info(f"Loading responses from {lm_name}.") 69 | samples = [] 70 | 71 | start = timer() if time_performance else None 72 | 73 | # Load samples from the language model 74 | with open(os.path.join(self.sample_dir_path, f"{lm_name}_samples.json")) as f: 75 | responses = json.load(f) 76 | 77 | for index, response in enumerate(responses["data"]): 78 | samples.append(response["samples"]) 79 | logging.debug(f"Sample {index}: {samples[index]}") 80 | 81 | logging.info("Loaded samples.") 82 | 83 | if custom_inputs["ground_truth"]: 84 | # Load definitions 85 | with open(os.path.join(self.sample_dir_path, "ground_truth.json")) as f: 86 | definitions = json.load(f) 87 | 88 | # Add definitions to the samples 89 | for index, sample in enumerate(samples): 90 | sample.append(definitions["ground_truth"][index]) 91 | samples[index] = sample 92 | 93 | # For every prompt compare every sample with every other sample 94 | logging.info(f"Running BertScore for {lm_name}.") 95 | 96 | same_samples = [] 97 | for sample in samples: 98 | same_s = [] 99 | for i in range(len(sample)): 100 | temp = [] 101 | for j in range(len(sample)): 102 | temp.append(sample[i]) 103 | same_s.append(temp) 104 | same_samples.append(same_s) 105 | 106 | results = [] 107 | for sample, same_sample in tqdm(zip(samples, same_samples), total=len(samples), desc="Prompts", leave=False): 108 | result = [] 109 | for s in tqdm(same_sample, desc="Samples", leave=False): 110 | with capture_specific_stderr(): 111 | result.append(bert_score.score( 112 | sample, s, model_type=custom_inputs["model_type"], 113 | batch_size=custom_inputs["batch_size"], device=custom_inputs["device"], 114 | lang="en", verbose=False, rescale_with_baseline=True, 115 | )[2].tolist()) 116 | results.append(result) 117 | logging.debug(f"Results: {result}") 118 | 119 | logging.info(f"Finished running BertScore for {lm_name}.") 120 | 121 | # Fix the results that are less than -1 122 | for index, result in enumerate(results): 123 | temp_res = np.zeros((len(result), len(result[0]))) 124 | for i in range(temp_res.shape[0]): 125 | for j in range(temp_res.shape[1]): 126 | if temp_res[i][j] < -1: 127 | temp_res[i][j] = -1 128 | else: 129 | temp_res[i][j] = result[i][j] 130 | results[index] = temp_res 131 | 132 | frobenius_norms = [frobenius_norm_no_diag(result[:-1,:-1], True) if custom_inputs["ground_truth"] 133 | else frobenius_norm_no_diag(result, True) for result in results] 134 | std_devs = [matrix_std_dev_no_diag(result[:-1,:-1]) if custom_inputs["ground_truth"] 135 | else matrix_std_dev_no_diag(result) for result in results] 136 | 137 | end = timer() if time_performance else None 138 | if time_performance: 139 | performance_times.append(end - start) 140 | with 
open(os.path.join(self.sample_dir_path, "runtimes", "performance_log.log"), "a") as f: 141 | f.write(f"\t - Time for {lm_name}: {end - start}\n") 142 | 143 | # Store results 144 | with open(os.path.join(self.result_dir_path, f"{lm_name}_bert.json"), "w") as f: 145 | results_json = [{ 146 | "index": i, 147 | "result": result.tolist(), 148 | "frobenius_norm": frob_norm, 149 | "std_dev": std_dev 150 | } for i, result, frob_norm, std_dev in zip(range(len(results)), results, frobenius_norms, std_devs)] 151 | json.dump({"data": results_json}, f, indent=4) 152 | 153 | logging.info(f"Saved results for {lm_name}.") 154 | 155 | if time_performance: 156 | with open(os.path.join(self.sample_dir_path, "runtimes", "performance_log.log"), "a") as f: 157 | f.write(f"\n\tTotal time: {sum(performance_times)}\n") 158 | -------------------------------------------------------------------------------- /CheckEmbed/operations/checkembed_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | import numpy as np 12 | 13 | from typing import Any 14 | from timeit import default_timer as timer 15 | 16 | from CheckEmbed.operations import Operation 17 | from CheckEmbed.utility import cosine_similarity, frobenius_norm_no_diag, matrix_std_dev_no_diag 18 | 19 | class CheckEmbedOperation(Operation): 20 | """ 21 | Operation that computes the cosine similarity, the Pearson correlation, the Frobenius norm and standard deviation between the embeddings. 22 | 23 | Inherits from the Operation class and implements its abstract methods. 24 | """ 25 | 26 | def __init__(self, result_dir_path: str, embeddings_dir_path: str) -> None: 27 | """ 28 | Initialize the operation. 29 | 30 | :param result_dir_path: The path to the directory where the results will be stored. 31 | :type result_dir_path: str 32 | :param embeddings_dir_path: The path to the directory where the embeddings are stored. 33 | :type embeddings_dir_path: str 34 | """ 35 | super().__init__(result_dir_path) 36 | self.embeddings_dir_path = embeddings_dir_path 37 | 38 | def execute(self, custom_inputs: Any) -> Any: 39 | """ 40 | Execute the operation on the embeddings/samples. 41 | 42 | :param custom_inputs: The custom inputs for the operation. 
43 | :type custom_inputs: Any 44 | """ 45 | time_performance = custom_inputs["time_performance"] 46 | 47 | performance_times = [] 48 | # For every language model / embedding model 49 | for file in os.listdir(self.embeddings_dir_path): 50 | if ".json" in file and not file.startswith("ground_truth_"): 51 | 52 | start = timer() if time_performance else None 53 | folder_name = file.replace("_" + file.split("_")[2], "") 54 | file_name_completion_for_ground_truth = file.replace(file.split("_")[0] + "_", "") 55 | 56 | # Load the samples embeddings 57 | with open(os.path.join(self.embeddings_dir_path, file), "r") as f: 58 | data = json.load(f) 59 | data_array = data["data"] 60 | embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 61 | 62 | # Load the definitions embeddings 63 | dimensions = len(embeddings[0]) 64 | if custom_inputs["ground_truth"]: 65 | with open(os.path.join(self.embeddings_dir_path, "ground_truth_" + file_name_completion_for_ground_truth), "r") as f: 66 | definitions = json.load(f) 67 | definitions = definitions["data"] 68 | definitions_embedded = [d["embeddings"] for d in definitions] 69 | 70 | for index, embedding in enumerate(embeddings): 71 | new_embedding = embedding 72 | if len(definitions_embedded[index]) > 0: 73 | new_embedding.append(definitions_embedded[index]) 74 | embeddings[index] = new_embedding 75 | 76 | dimensions += 1 77 | 78 | # Compute the cosine similarity matrix 79 | cosine_similarity_matrix_array = [] 80 | for index, embedding in enumerate(embeddings): 81 | # -1 array to initialize the cosine similarity matrix 82 | cosine_similarity_matrix = np.full((dimensions, dimensions), -1.0) 83 | for i in range(len(embedding)): 84 | for j in range(len(embedding)): 85 | cosine_similarity_matrix[i, j] = cosine_similarity(embedding[i], embedding[j], custom_inputs["rebase_results"], file.split("_")[1]) 86 | 87 | cosine_similarity_matrix_array.append(cosine_similarity_matrix) 88 | 89 | # Compute the frobenius norm of each cosine similarity matrix 90 | frobenius_norms_cosine_sim = [frobenius_norm_no_diag(cosine_similarity_matrix[:-1,:-1]) if custom_inputs["ground_truth"] 91 | else frobenius_norm_no_diag(cosine_similarity_matrix) 92 | for cosine_similarity_matrix in cosine_similarity_matrix_array] 93 | 94 | # Compute the standard deviation of each cosine similarity matrix 95 | std_dev_cosine_sim_array = [matrix_std_dev_no_diag(cosine_similarity_matrix[:-1,:-1]) if custom_inputs["ground_truth"] 96 | else matrix_std_dev_no_diag(cosine_similarity_matrix) 97 | for cosine_similarity_matrix in cosine_similarity_matrix_array] 98 | 99 | # Compute the Pearson correlation matrix 100 | pearson_corr_array = [] 101 | for index, embedding in enumerate(embeddings): 102 | pearson_corr = np.full((dimensions, dimensions), -1.0) 103 | for i in range(len(embedding)): 104 | for j in range(len(embedding)): 105 | if len(embedding[i]) == 0 and len(embedding[j]) == 0: 106 | pearson_corr[i, j] = 1.0 107 | continue 108 | if len(embedding[i]) == 0 or len(embedding[j]) == 0: 109 | pearson_corr[i, j] = -1.0 110 | continue 111 | pearson_corr[i, j] = np.corrcoef(embedding[i], embedding[j])[0, 1] 112 | 113 | pearson_corr_array.append(pearson_corr) 114 | 115 | # Compute the Frobenius norm of each Pearson correlation matrix 116 | frobenius_norms_pearson_corr = [frobenius_norm_no_diag(pearson_corr[:-1,:-1]) if custom_inputs["ground_truth"] 117 | else frobenius_norm_no_diag(pearson_corr) 118 | for pearson_corr in pearson_corr_array] 119 | 120 | std_dev_pearson_corr_array = 
[matrix_std_dev_no_diag(pearson_corr[:-1,:-1]) if custom_inputs["ground_truth"] 121 | else matrix_std_dev_no_diag(pearson_corr) 122 | for pearson_corr in pearson_corr_array] 123 | 124 | end = timer() if time_performance else None 125 | if time_performance: 126 | performance_times.append({folder_name: end - start}) 127 | 128 | # Store the results 129 | with open(os.path.join(self.result_dir_path, folder_name + "_results.json"), "w") as f: 130 | results_json = [{ 131 | "index": index, 132 | "cosine_sim": cosine_sim.tolist(), 133 | "frob_norm_cosine_sim": frob_norm_cosine_sim, 134 | "std_dev_cosine_sim": std_dev_cosine_sim, 135 | "pearson_corr": pearson_corr.tolist(), 136 | "frob_norm_pearson_corr": frob_norm_pearson_corr, 137 | "std_dev_pearson_corr": std_dev_pearson_corr 138 | } for index, cosine_sim, frob_norm_cosine_sim, std_dev_cosine_sim, pearson_corr, frob_norm_pearson_corr, std_dev_pearson_corr 139 | in zip(range(len(cosine_similarity_matrix_array)), cosine_similarity_matrix_array, frobenius_norms_cosine_sim, std_dev_cosine_sim_array, pearson_corr_array, frobenius_norms_pearson_corr, std_dev_pearson_corr_array)] 140 | json.dump({"data": results_json}, f, indent=4) 141 | 142 | if time_performance: 143 | # Reorder the performance times first on embedding and then on language model names 144 | performance_times.sort(key=lambda x: (list(x.keys())[0].split("_")[1], list(x.keys())[0].split("_")[0])) 145 | with open(os.path.join(self.result_dir_path, "../runtimes", "performance_log.log"), "a") as f: 146 | f.write("\n\nCheckEmbed operation:\n") 147 | for time in performance_times: 148 | time_key = list(time.keys())[0] 149 | time_value = list(time.values())[0] 150 | formatted_string = f"\t - Time for {time_key.split('_')[0]:<10} {time_key.split('_')[1]:>15}: {time_value}\n" 151 | f.write(formatted_string) 152 | 153 | 154 | -------------------------------------------------------------------------------- /CheckEmbed/operations/llm_as_a_judge_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | import os 11 | from typing import Any 12 | 13 | from langchain.prompts import PromptTemplate 14 | from pydantic import BaseModel, Field 15 | 16 | from CheckEmbed.language_models import ChatGPT 17 | from CheckEmbed.operations import Operation 18 | 19 | 20 | class Score(BaseModel): 21 | score: int = Field(description="The score from 0 to 100") 22 | 23 | class LLMAsAJudgeOperation(Operation): 24 | """ 25 | Operation that computes the hallucination score of an answer using a language model as a judge. 26 | 27 | Inherits from the Operation class and implements its abstract methods. 28 | """ 29 | 30 | def __init__(self, result_dir_path: str, answer_dir_path: str, prompt_template: PromptTemplate, original: str = None, original_position: int = 0, reference_txt: str = None) -> None: 31 | """ 32 | Initialize the operation. 33 | 34 | :param result_dir_path: The path to the directory where the results will be stored. 35 | :type result_dir_path: str 36 | :param answer_dir_path: The path to the directory where the answers are stored. 37 | :type answer_dir_path: str 38 | :param prompt_template: The prompt template to be used for the language model. 39 | :type prompt_template: PromptTemplate 40 | :param original: The original data. 
Defaults to None. 41 | :type original: str 42 | :param original_position: The position of the original data in the prompt template. Defaults to 0. 43 | :type original_position: int 44 | :param reference_txt: Reference text for the comparison. Defaults to None. 45 | :type reference_txt: str 46 | """ 47 | super().__init__(result_dir_path) 48 | self.answer_dir_path = answer_dir_path 49 | self.prompt_template = prompt_template 50 | self.original = original 51 | self.original_position = original_position 52 | self.reference_txt = reference_txt 53 | 54 | def execute(self, custom_inputs: Any) -> None: 55 | """ 56 | Execute the operation on the embeddings/samples. 57 | 58 | :param custom_inputs: The custom inputs for the operation. 59 | :type custom_inputs: Any 60 | """ 61 | model = custom_inputs["model"] 62 | if not isinstance(model, ChatGPT): 63 | model.add_structured_output(Score) 64 | 65 | original_data = None 66 | if self.original is not None: 67 | if self.original.endswith(".json"): 68 | with open(self.result_dir_path + self.original, "r") as f: 69 | original_data = json.load(f)["data"] 70 | 71 | # For every language model 72 | for file in os.listdir(self.answer_dir_path): 73 | if "samples.json" in file and not file.startswith("ground_truth_"): 74 | 75 | if self.original is not None and original_data is None: 76 | with open(self.result_dir_path + "/" + self.original + f"/{file.split('_')[0]}_original.json", "r") as f: 77 | original_data = json.load(f)["data"] 78 | 79 | name = model.name + "_" + file.split("_")[0] 80 | if name.startswith("gpt4-o"): 81 | name = name[6:] 82 | name = "4o" + name 83 | 84 | # Load the samples 85 | with open(os.path.join(self.answer_dir_path, file), "r") as f: 86 | data = json.load(f) 87 | data_array = data["data"] 88 | samples = [d["samples"] for d in data_array] 89 | 90 | inputs = self.prompt_template.input_variables 91 | 92 | results = [] 93 | if self.original is not None: 94 | for i, sample in enumerate(samples): 95 | prep = {} 96 | for j, input in enumerate(inputs): 97 | if j == self.original_position: 98 | prep[input] = original_data[i] 99 | else: 100 | prep[input] = sample[j] 101 | 102 | final_prompt = self.prompt_template.invoke(prep) 103 | if isinstance(model, ChatGPT): 104 | final_prompt = final_prompt.text 105 | result = model.query(final_prompt) 106 | if not isinstance(result, Score): 107 | result = model.get_response_texts(result)[0] 108 | else: 109 | result = result.score 110 | 111 | results.append(result) 112 | else: 113 | for sample in samples: 114 | prep = {} 115 | for i, input in enumerate(inputs): 116 | prep[input] = sample[i] 117 | 118 | final_prompt = self.prompt_template.invoke(prep) 119 | if isinstance(model, ChatGPT): 120 | final_prompt = final_prompt.text 121 | result = model.query(final_prompt) 122 | if not isinstance(result, Score): 123 | result = model.get_response_texts(result)[0] 124 | else: 125 | result = result.score 126 | 127 | results.append(result) 128 | 129 | # Store the results 130 | if self.reference_txt is not None: 131 | with open(os.path.join(self.result_dir_path, name + "_judge_ref.json"), "w") as f: 132 | json.dump({"data": results}, f, indent=4) 133 | return 134 | with open(os.path.join(self.result_dir_path, name + "_judge.json"), "w") as f: 135 | json.dump({"data": results}, f, indent=4) 136 | -------------------------------------------------------------------------------- /CheckEmbed/operations/operations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH 
Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Any 11 | 12 | class Operation(ABC): 13 | """ 14 | Abstract base class that defines the interface for all operations to be performed on the embeddings/samples. 15 | """ 16 | 17 | def __init__(self, result_dir_path: str) -> None: 18 | """ 19 | Initialize the operation. 20 | 21 | :param result_dir_path: The path to the directory where the results will be stored. 22 | :type result_dir_path: str 23 | """ 24 | self.result_dir_path = result_dir_path 25 | 26 | @abstractmethod 27 | def execute(self, custom_inputs: Any = None) -> Any: 28 | """ 29 | Execute the operation on the embeddings/samples. 30 | 31 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 32 | :type custom_inputs: Any 33 | """ 34 | pass 35 | -------------------------------------------------------------------------------- /CheckEmbed/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .parser import Parser -------------------------------------------------------------------------------- /CheckEmbed/parser/parser.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Any, List, Union 11 | 12 | from PIL.Image import Image 13 | 14 | 15 | class Parser(ABC): 16 | """ 17 | Abstract base class that defines the interface for parsing. 18 | 19 | The class supports the following functionality: 20 | - take the raw data from a dataset and create the necessary prompts for the model 21 | - extract the ground truth 22 | - custom parsing of the model responses 23 | """ 24 | 25 | def __init__(self, dataset_path: str) -> None: 26 | """ 27 | Initialize the parser. 28 | 29 | :param dataset_path: The path to the dataset. 30 | :type dataset_path: str 31 | """ 32 | self.dataset_path = dataset_path 33 | 34 | @abstractmethod 35 | def prompt_generation(self, custom_inputs: Any = None) -> List[str]: 36 | """ 37 | Parse the dataset and generate the prompts for the model. 38 | 39 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 40 | :type custom_inputs: Any 41 | :return: List of prompts. 42 | :rtype: List[str] 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def ground_truth_extraction(self, custom_inputs: Any = None) -> List[str]: 48 | """ 49 | Parse the dataset and extract the ground truth. 50 | 51 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 52 | :type custom_inputs: Any 53 | :return: List of ground truths. 54 | :rtype: List[str] 55 | """ 56 | pass 57 | 58 | def answer_parser(self, responses: List[List[Union[str, Image]]], custom_inputs: Any = None) -> List[List[Union[str, Image]]]: 59 | """ 60 | Parse the responses from the model. 61 | 62 | The default behavior is to return the responses as they are. 63 | Overwrite this method if you want to parse the responses in a different way. You can use the CustomParser 64 | classes in the examples folder as reference. 
65 | 66 | Remember that the responses returned from this method will be stored in a file and used for the evaluation, 67 | so please follow the following format, when returning the responses: 68 | [ 69 | [response1_prompt1, response2_prompt1, ...], 70 | [response1_prompt2, response2_prompt2, ...], 71 | ... 72 | ] 73 | 74 | :param responses: The responses from the model. 75 | :type responses: List[List[Union[str, Image]]] 76 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 77 | :type custom_inputs: Any 78 | :return: The parsed responses. 79 | :rtype: List[List[Union[str, Image]]] 80 | """ 81 | return responses 82 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/README.md: -------------------------------------------------------------------------------- 1 | ## Plotters 2 | 3 | Old plotters, they need to be updated to the new version of the library. -------------------------------------------------------------------------------- /CheckEmbed/plotters/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot_operations import PlotOperation 2 | from .bert_plot import BertPlot 3 | from .checkembed_plot import CheckEmbedPlot 4 | from .raw_embedding_heatmap import RawEmbeddingHeatPlot 5 | from .selfcheckgpt_plot import SelfCheckGPTPlot 6 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/bert_plot.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from typing import Any 15 | 16 | from CheckEmbed.plotters import PlotOperation 17 | 18 | class BertPlot(PlotOperation): 19 | """ 20 | The BertPlot class handles the plotting of BERTScore data. 21 | 22 | Inherits from the PlotOperation class and implements its abstract methods. 23 | """ 24 | 25 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 26 | """ 27 | Initialize the operation. 28 | 29 | :param result_dir_path: The path to the directory where the results will be stored. 30 | :type result_dir_path: str 31 | :param data_dir_path: The path to the directory where the data is stored. 32 | :type data_dir_path: str 33 | """ 34 | super().__init__(result_dir_path, data_dir_path) 35 | 36 | def execute(self, custom_inputs: Any = None) -> Any: 37 | """ 38 | Plot the data. 39 | 40 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 
41 | :type custom_inputs: Any 42 | """ 43 | print("Running BertPlot operation.") 44 | 45 | for file in os.listdir(self.data_dir_path): 46 | if ".json" in file: 47 | 48 | if not os.path.exists(os.path.join(self.result_dir_path, file.split("_")[0])): 49 | os.mkdir(os.path.join(self.result_dir_path, file.split("_")[0])) 50 | 51 | with open(os.path.join(self.data_dir_path, file), "r") as f: 52 | data = json.load(f) 53 | 54 | data_array = data["data"] 55 | results = [np.array(d["result"]) for d in data_array] 56 | frobenius_norms = [np.array(d["frobenius_norm"]) for d in data_array] 57 | 58 | # Plot a separate heatmap for every example 59 | for index, result in enumerate(results): 60 | fig, ax = plt.subplots(figsize=(12, 10)) # Adjust the figure size as needed 61 | 62 | im = ax.imshow(result, cmap='YlGnBu', interpolation='nearest', aspect="auto", vmin=-1, vmax=1) 63 | plt.colorbar(im, ax=ax) # Use ax argument to specify the axis for the colorbar 64 | 65 | plt.title(f"Heatmap of BertScore of Example {index}", weight='bold', fontsize=26) # Add a title with index starting from 1 66 | plt.xlabel("LLM Reply ID or Ground-Truth (GT)", fontsize=18) 67 | plt.ylabel("LLM Reply ID or Ground-Truth (GT)", fontsize=18) 68 | 69 | # Set ticks and labels 70 | tick_labels = list(range(1, result.shape[0])) + ['GT'] if custom_inputs["ground_truth"] else list(range(1, result.shape[0] + 1)) 71 | ax.set_xticks(np.arange(result.shape[0])) 72 | ax.set_yticks(np.arange(result.shape[0])) 73 | ax.set_xticklabels(tick_labels, fontsize=18) 74 | ax.set_yticklabels(tick_labels, fontsize=18) 75 | 76 | # Add numbers to the heatmap 77 | for i in range(result.shape[0]): 78 | for j in range(result.shape[0]): 79 | text = ax.text(j, i, round(result[i, j], 2), ha="center", va="center", color="red", fontsize=18) 80 | 81 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], f"example_{index}.pdf"), bbox_inches='tight') 82 | plt.close() 83 | 84 | # Plot the Frobenius norm of the cosine similarity matrices 85 | fig, ax = plt.subplots() 86 | ax.bar(range(len(frobenius_norms)), frobenius_norms) 87 | ax.set_xlabel("Prompt") 88 | ax.set_ylabel("Frobenius Norm") 89 | ax.set_title("Frobenius Norm of BertScore Matrices") 90 | 91 | tick_labels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] 92 | ax.set_yticks(tick_labels) 93 | ax.set_yticklabels(tick_labels) 94 | tick_labels = list(range(1, len(frobenius_norms) + 1)) 95 | ax.set_xticks(np.arange(len(frobenius_norms))) 96 | ax.set_xticklabels(tick_labels) 97 | 98 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], "frobenius_norm.pdf"), bbox_inches='tight') 99 | plt.close() 100 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/plot_operations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import abstractmethod 10 | from typing import Any 11 | 12 | from CheckEmbed.operations import Operation 13 | 14 | class PlotOperation(Operation): 15 | """ 16 | Abstract base class that defines the interface for all operations that plot data. 17 | """ 18 | 19 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 20 | """ 21 | Initialize the operation. 
22 | 23 | :param result_dir_path: The path to the directory where the results will be stored. 24 | :type result_dir_path: str 25 | :param data_dir_path: The path to the directory where the data is stored. 26 | :type data_dir_path: str 27 | """ 28 | super().__init__(result_dir_path) 29 | self.data_dir_path = data_dir_path 30 | 31 | @abstractmethod 32 | def execute(self, custom_inputs: Any = None) -> None: 33 | """ 34 | Plot the data. 35 | 36 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 37 | :type custom_inputs: Any 38 | """ 39 | pass -------------------------------------------------------------------------------- /CheckEmbed/plotters/raw_embedding_heatmap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from typing import Any 15 | 16 | from CheckEmbed.plotters import PlotOperation 17 | 18 | class RawEmbeddingHeatPlot(PlotOperation): 19 | """ 20 | The RawEmbeddingHeatPlot class handles the plotting of the raw embedding data. 21 | 22 | Inherits from the PlotOperation class and implements its abstract methods. 23 | """ 24 | 25 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 26 | """ 27 | Initialize the operation. 28 | 29 | :param result_dir_path: The path to the directory where the results will be stored. 30 | :type result_dir_path: str 31 | :param data_dir_path: The path to the directory where the data is stored. 32 | :type data_dir_path: str 33 | """ 34 | super().__init__(result_dir_path, data_dir_path) 35 | 36 | def execute(self, custom_inputs: Any = None) -> Any: 37 | """ 38 | Plot the data. 39 | 40 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 
41 | :type custom_inputs: Any 42 | """ 43 | print("Running RawEmbeddingHeatPlot operation.") 44 | 45 | for file in os.listdir(self.data_dir_path): 46 | if ".json" in file and not file.startswith("ground_truth_"): 47 | 48 | folder_name = file.replace("_" + file.split("_")[2], "") 49 | file_name_completion_for_ground_truth = file.replace(file.split("_")[0] + "_", "") 50 | 51 | # Directory creation 52 | if not os.path.exists(os.path.join(self.result_dir_path, folder_name)): 53 | os.mkdir(os.path.join(self.result_dir_path, folder_name)) 54 | 55 | if not os.path.exists(os.path.join(self.result_dir_path, folder_name, "raw_embeddings_heat_map")): 56 | os.mkdir(os.path.join(self.result_dir_path, folder_name, "raw_embeddings_heat_map")) 57 | 58 | # Load the sample embeddings 59 | with open(os.path.join(self.data_dir_path, file), "r") as f: 60 | data = json.load(f) 61 | data_array = data["data"] 62 | embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 63 | # Remove empty ones inside embedding 64 | for embedding in embeddings: 65 | new_embedding = [] 66 | for index, emb in enumerate(embedding): 67 | if len(emb) == 0: 68 | continue 69 | new_embedding.append(emb) 70 | embeddings[embeddings.index(embedding)] = new_embedding 71 | embeddings = [np.array(embedding) for embedding in embeddings] 72 | 73 | # Load the definition embeddings 74 | if custom_inputs["ground_truth"]: 75 | with open(os.path.join(self.data_dir_path, "ground_truth_" + file_name_completion_for_ground_truth), "r") as f: 76 | definitions = json.load(f) 77 | definitions = definitions["data"] 78 | definitions_embedded = [np.array(d["embeddings"]) for d in definitions] 79 | 80 | for index, embedding in enumerate(embeddings): 81 | if len(embedding) == 0: 82 | continue 83 | embedding = np.vstack([embedding, definitions_embedded[index].reshape(1, -1)]) if len(definitions_embedded[index]) != 0 else embedding 84 | embeddings[index] = embedding 85 | 86 | # Find the min and max values for the colorbar 87 | min_value = float('inf') 88 | max_value = float('-inf') 89 | 90 | for embedding in embeddings: 91 | if len(embedding) == 0: 92 | continue 93 | min_value = min(min_value, np.min(embedding)) 94 | max_value = max(max_value, np.max(embedding)) 95 | 96 | # Plot each heatmap 97 | for index, embedding in enumerate(embeddings): 98 | if len(embedding) == 0: 99 | continue 100 | fig, ax = plt.subplots(figsize=(12, 6)) # Adjust the figure size as needed 101 | 102 | im = ax.imshow(embedding, cmap='YlGnBu', interpolation='nearest', aspect="auto", vmin=min_value, vmax=max_value) 103 | plt.colorbar(im, ax=ax) # Use ax argument to specify the axis for the colorbar 104 | 105 | plt.title(f"Heatmap of Example {index}", weight='bold', fontsize=26) # Add a title with index starting from 1 106 | plt.xlabel("i-th element of the embedded answers", fontsize=18) 107 | plt.ylabel("Embedded Answers", fontsize=18) 108 | 109 | # Set ticks and labels 110 | tick_labels = list(range(1, embedding.shape[0])) + ['GT'] if custom_inputs["ground_truth"] and len(definitions_embedded[index]) > 0 else list(range(1, embedding.shape[0] + 1)) 111 | ax.set_yticks(np.arange(embedding.shape[0])) 112 | ax.set_yticklabels(tick_labels, fontsize=18) 113 | 114 | plt.savefig(os.path.join(self.result_dir_path, folder_name, "raw_embeddings_heat_map", f"raw_embeddings_heat_map_{index}.pdf"), bbox_inches='tight') 115 | plt.close() 116 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/selfcheckgpt_plot.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import matplotlib.pyplot as plt 13 | from typing import Any 14 | 15 | from CheckEmbed.plotters import PlotOperation 16 | 17 | class SelfCheckGPTPlot(PlotOperation): 18 | """ 19 | The SelfCheckGPTPlot class handles the plotting of SelfCheckGPT data. 20 | 21 | Inherits from the PlotOperation class and implements its abstract methods. 22 | """ 23 | 24 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 25 | """ 26 | Initialize the operation. 27 | 28 | :param result_dir_path: The path to the directory where the results will be stored. 29 | :type result_dir_path: str 30 | :param data_dir_path: The path to the directory where the data is stored. 31 | :type data_dir_path: str 32 | """ 33 | super().__init__(result_dir_path, data_dir_path) 34 | 35 | def execute(self, custom_inputs: Any = None) -> Any: 36 | """ 37 | Plot the data. 38 | 39 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 40 | :type custom_inputs: Any 41 | """ 42 | print("Running SelfCheckGPTPlot operation.") 43 | 44 | for file in os.listdir(self.data_dir_path): 45 | if ".json" in file: 46 | 47 | if not os.path.exists(os.path.join(self.result_dir_path, file.split("_")[0])): 48 | os.mkdir(os.path.join(self.result_dir_path, file.split("_")[0])) 49 | 50 | with open(os.path.join(self.data_dir_path, file), "r") as f: 51 | data = json.load(f) 52 | 53 | data_array = data["data"] 54 | results = [d["result"] for d in data_array] 55 | passage_scores = [d["passage_score"] for d in data_array] 56 | 57 | # Bar plot for every one of the examples 58 | for index, result in enumerate(results): 59 | if len(result) == 0: 60 | continue 61 | fig, ax = plt.subplots() 62 | ax.bar(range(len(result)), result) 63 | 64 | # Set ticks from 0 to 1 65 | ticks = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] 66 | 67 | ax.set_xticks(range(len(result))) 68 | ax.set_xticklabels(range(len(result))) 69 | ax.set_yticks(ticks) 70 | ax.set_yticklabels(ticks) 71 | ax.set_xlabel("Sentence") 72 | ax.set_ylabel("SelfCheckGPT Sentence Score") 73 | ax.set_title(f"SelfCheckGPT Score for Prompt {int(index)}") 74 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], f"prompt_{int(index)}.pdf"), bbox_inches='tight') 75 | plt.close() 76 | 77 | # Bar plot for the passage scores 78 | fig, ax = plt.subplots() 79 | ax.bar(range(len(passage_scores)), passage_scores) 80 | ax.set_xticks(range(len(passage_scores))) 81 | ax.set_xticklabels(range(len(passage_scores))) 82 | ax.set_xlabel("Prompt") 83 | ax.set_ylabel("SelfCheckGPT Passage Score") 84 | ax.set_title("SelfCheckGPT Score for Passages") 85 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], "passage_scores.pdf"), bbox_inches='tight') 86 | plt.close() 87 | -------------------------------------------------------------------------------- /CheckEmbed/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import Scheduler, StartingPoint -------------------------------------------------------------------------------- /CheckEmbed/utility/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.stderr_filter import capture_specific_stderr 2 | from .utility import cosine_similarity, frobenius_norm, frobenius_norm_no_diag, matrix_std_dev_no_diag 3 | -------------------------------------------------------------------------------- /CheckEmbed/utility/stderr_filter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import contextlib 10 | import sys 11 | import tempfile 12 | import os 13 | 14 | from typing import Generator, List, Union, Any 15 | 16 | class FilteredStderr: 17 | """ 18 | A class that captures and filters stderr output. 19 | 20 | The class creates a temporary file to capture the stderr stream and filters the stream based on target string(s). 21 | """ 22 | 23 | def __init__(self, target_string: Union[List[str], str]) -> None: 24 | """ 25 | Initializes a FilteredStderr instance. 26 | 27 | :param target_string: Target string(s) for filtering stderr. 28 | :type target_string: Union[List[str], str] 29 | """ 30 | self.target_string = target_string 31 | self.captured = "" 32 | self.original_stderr_fd = None 33 | self.temp_fd = None 34 | self.temp_file = None 35 | 36 | def start(self) -> None: 37 | """ 38 | Start capturing stderr and redirecting the stream to a temporary file. 39 | """ 40 | # Save the original stderr file descriptor 41 | self.original_stderr_fd = os.dup(2) 42 | # Create a temporary file and file descriptor to capture stderr 43 | self.temp_file = tempfile.TemporaryFile(mode='w+') 44 | self.temp_fd = self.temp_file.fileno() 45 | # Redirect stderr to the temporary file 46 | os.dup2(self.temp_fd, 2) 47 | 48 | def stop(self) -> None: 49 | """ 50 | Stop capturing stderr. Filter the stream for the target string(s) and restore the original stderr file descriptor. 51 | """ 52 | # Restore the original stderr file descriptor 53 | os.dup2(self.original_stderr_fd, 2) 54 | os.close(self.original_stderr_fd) 55 | self.original_stderr_fd = None 56 | # Read the captured output 57 | self.temp_file.seek(0) 58 | output = self.temp_file.read() 59 | self.temp_file.close() 60 | self.temp_fd = None 61 | self.temp_file = None 62 | # Filter the output 63 | for line in output.splitlines(): 64 | # target string can be an array 65 | if isinstance(self.target_string, str): 66 | self.target_string = [self.target_string] 67 | 68 | captured = False 69 | for target in self.target_string: 70 | if target in line: 71 | self.captured += line + "\n" 72 | captured = True 73 | break 74 | 75 | if not captured: 76 | sys.__stderr__.write(line + "\n") 77 | 78 | 79 | @contextlib.contextmanager 80 | def capture_specific_stderr(custom_target: Union[List[str], str] = None) -> Generator[FilteredStderr, None, None]: 81 | """ 82 | Context manager that captures and filters the stderr stream. 83 | 84 | :param custom_target: Target string(s) for filtering stderr. If None, default target strings are loaded from a file. 85 | :type custom_target: Union[List[str], str] 86 | :return: A FilteredStderr instance, which can be used to access the captured stderr stream. 
87 | :rtype: Generator[FilteredStderr, None, None] 88 | """ 89 | # load default target strings from file 90 | if custom_target is None: 91 | with open(os.path.join(os.path.dirname(__file__), "./stderr_filter.txt"), "r") as file: 92 | custom_target = file.read().splitlines() 93 | 94 | filtered_stderr = FilteredStderr(custom_target) 95 | filtered_stderr.start() 96 | try: 97 | yield filtered_stderr 98 | finally: 99 | filtered_stderr.stop() 100 | -------------------------------------------------------------------------------- /CheckEmbed/utility/stderr_filter.txt: -------------------------------------------------------------------------------- 1 | Empty candidate sentence detected; setting raw BERTscores to 0 2 | Warning: Empty reference sentence detected; setting raw BERTScores to 0 3 | Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight'] 4 | You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 5 | `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 6 | For more details check this issue: https://github.com/huggingface/transformers/issues/31884 7 | warnings.warn( -------------------------------------------------------------------------------- /CheckEmbed/utility/utility.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import numpy as np 10 | import math 11 | 12 | # These values are the lowest empiricial values observed for a given 13 | # embedding model during our evaluation. 14 | REBASING_VALUES = { 15 | "gpt-embedding-large": 0.36156142737003805, 16 | "sfr-embedding-mistral": 0.4590856938212389, 17 | "e5-mistral-7B-instruct": 0.5347691513588488, 18 | "gte-qwen1.5-7B-instruct": 0.17701296296393593, 19 | "stella-en-400M-v5": 0.3189337589450308, 20 | "stella-en-1.5B-v5": 0.3655769126487221, 21 | } 22 | 23 | def cosine_similarity(a: np.ndarray, b: np.ndarray, rebase: bool = False, emb_name: str = "") -> float: 24 | """ 25 | Compute cosine similarity between two vectors. 26 | 27 | :param a: The first vector. 28 | :type a: np.ndarray 29 | :param b: The second vector. 30 | :type b: np.ndarray 31 | :param rebase: Whether to rebase the cosine similarity. Defaults to False. 32 | :type rebase: bool 33 | :param emb_name: The name of the embedding model. Defaults to "". 34 | :type emb_name: str 35 | :return: The cosine similarity between the two vectors. 
36 | :rtype: float 37 | """ 38 | global REBASING_VALUES 39 | 40 | # Special case for empty vectors 41 | if len(a) == 0 and len(b) == 0: 42 | return 1.0 43 | if len(a) == 0 or len(b) == 0: 44 | return -1.0 45 | 46 | # Compute the cosine similarity 47 | dot_product = np.dot(a, b) 48 | norm_a = np.linalg.norm(a) 49 | norm_b = np.linalg.norm(b) 50 | cos_similarity = dot_product / (norm_a * norm_b) 51 | 52 | if rebase and emb_name in REBASING_VALUES: 53 | # Rebase the cosine similarity 54 | cos_similarity = 2 * (cos_similarity - REBASING_VALUES[emb_name]) / (1.0 - REBASING_VALUES[emb_name]) - 1.0 55 | cos_similarity = 1.0 if cos_similarity > 1.0 else -1.0 if cos_similarity < -1.0 else cos_similarity 56 | 57 | return cos_similarity 58 | 59 | def frobenius_norm(matrix: np.ndarray, bert: bool = False) -> float: 60 | """ 61 | Compute the Frobenius norm of the input matrix normalized by the number of elements in the matrix. 62 | 63 | :param matrix: Input matrix. 64 | :type matrix: np.ndarray 65 | :param bert: Whether the matrix is a BertScore matrix. Defaults to False. 66 | :type bert: bool 67 | :return: Frobenius norm. 68 | :rtype: float 69 | """ 70 | adder = 1 71 | div = 4 72 | if bert: 73 | adder = 0 74 | div = 1 75 | sum = 0 76 | for i in range(matrix.shape[0]): 77 | for j in range(matrix.shape[1]): 78 | sum += (matrix[i, j] + adder) ** 2 79 | 80 | # normalize by the number of elements in the matrix 81 | return math.sqrt(sum / (matrix.shape[0] * matrix.shape[1] * div)) 82 | 83 | def frobenius_norm_no_diag(matrix: np.ndarray, bert: bool = False) -> float: 84 | """ 85 | Compute the Frobenius norm of the input matrix without its diagonal elements. 86 | The Frobenius is further normalized by the number of elements in the matrix. 87 | 88 | :param matrix: Input matrix. 89 | :type matrix: np.ndarray 90 | :param bert: Whether the matrix is a BertScore matrix. Defaults to False. 91 | :type bert: bool 92 | :return: Frobenius norm. 93 | :rtype: float 94 | """ 95 | matrix_no_diag = matrix[~np.eye(matrix.shape[0],dtype=bool)].reshape(matrix.shape[0],-1) 96 | return frobenius_norm(matrix_no_diag, bert) 97 | 98 | def matrix_std_dev_no_diag(matrix: np.ndarray) -> float: 99 | """ 100 | Compute the standard deviation of the input matrix without its diagonal elements. 101 | 102 | :param matrix: Input matrix. 103 | :type matrix: np.ndarray 104 | :return: Standard deviation. 105 | :rtype: float 106 | """ 107 | matrix_no_diag = matrix[~np.eye(matrix.shape[0],dtype=bool)].reshape(matrix.shape[0],-1) 108 | return np.std(matrix_no_diag) 109 | -------------------------------------------------------------------------------- /CheckEmbed/vision_models/README.md: -------------------------------------------------------------------------------- 1 | # Vision Models 2 | 3 | The Vision Models module is responsible for managing the vision models. 4 | 5 | Currently, the framework supports the following vision model: 6 | 7 | - stabilityai/stable-diffusion-3.5-medium (local - GPU with 12GB VRAM recommended, model size is roughly 6GB ) 8 | 9 | The following sections describe how to instantiate the model and how to add new models to the framework. 10 | 11 | ## Vision Model Instantiation 12 | 13 | If your model needs a configuration file, follow these steps: 14 | 15 | - Create a copy of `config_template.json` named `config.json` in the CheckEmbed folder. (Not necessary for local models) 16 | - Fill configuration details based on the used model. 
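As a point of reference, a hypothetical `config.json` entry for a remote vision model could look as follows. Only the `name` field is read by `AbstractVisionModel` itself; every other key shown here (model identifier, credentials, cost figure) is an assumption that a concrete subclass would have to interpret in its own constructor.

```json
{
    "my-remote-vision-model": {
        "model_id": "vendor/official-model-name",
        "name": "my-remote-vision-model",
        "prompt_token_cost": 0.0,
        "api_key": ""
    }
}
```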
17 | 18 | ### Local Models 19 | 20 | The framework currently supports the following local model: `stabilityai/stable-diffusion-3.5-medium`. 21 | 22 | - Instantiate the vision model based on the owned device. 23 | - Device can be specified in the `Scheduler`, more [here](/CheckEmbed/scheduler/scheduler.py). 24 | 25 | ```python 26 | stable_diffusion = vision_models.StableDiffusion3( 27 | model_name = "stabilityai/stable-diffusion-3.5-medium", 28 | cache = False, 29 | data_type = torch.bfloat16, 30 | num_inference_steps = 40, 31 | guidance_scale = 4.5 32 | ) 33 | ``` 34 | 35 | ## Adding Vision Models 36 | 37 | More vision models can be added by following these steps: 38 | 39 | - Create new class as a subclass of `AbstractVisionModel`. 40 | - Use the constructor for loading the configuration and instantiating the vision model (if needed). 41 | 42 | ```python 43 | class CustomVisionModel(AbstractVisionModel): 44 | def __init__( 45 | self, 46 | config_path: str = "", 47 | model_name: str = "official model-name", 48 | name: str = "CustomVisionModel", 49 | cache: bool = False 50 | ) -> None: 51 | super().__init__(config_path, model_name, name, cache) 52 | self.config: Dict = self.config[model_name] 53 | 54 | # Load data from configuration into variables if needed 55 | 56 | # Instantiate model if needed 57 | ``` 58 | 59 | - Implement the `load_model`, `unload_model` and `generate_image` abstract methods that are used to load/unload the model from the GPU (if necessary) and get a list of images from the model (remote API call or local model inference) respectively. 60 | 61 | ```python 62 | def load_model(self, device: str = None) -> None: 63 | """ 64 | Load the model and tokenizer based on the given model name. 65 | 66 | :param device: The device to load the model on. Defaults to None. 67 | :type device: str 68 | """ 69 | 70 | def unload_model(self) -> None: 71 | """ 72 | Unload the model and tokenizer. 73 | """ 74 | 75 | def generate_image( 76 | self, 77 | input: Union[List[str], str] 78 | ) -> List[Image]: 79 | # Call model and retrieve an Image 80 | # Return model response 81 | ``` 82 | -------------------------------------------------------------------------------- /CheckEmbed/vision_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_vision_model import AbstractVisionModel 2 | from .stable_diffusion3_5 import StableDiffusion3 -------------------------------------------------------------------------------- /CheckEmbed/vision_models/abstract_vision_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | import logging 11 | from abc import ABC, abstractmethod 12 | from typing import Any, Dict, List, Union 13 | 14 | from PIL.Image import Image 15 | 16 | 17 | class AbstractVisionModel(ABC): 18 | """ 19 | Abstract base class that defines the interface for all vision models. 20 | """ 21 | 22 | def __init__( 23 | self, config_path: str = None, model_name: str = "", name: str = "INVALID_NAME", cache: bool = False 24 | ) -> None: 25 | """ 26 | Initialize the AbstractVisionModel instance with configuration, model details, and caching options. 27 | 28 | :param config_path: Path to the config file. If provided, the config is loaded from the file. Defaults to "". 
29 | :type config_path: str 30 | :param model_name: Name of the vision model. Defaults to "". 31 | :type model_name: str 32 | :param name: Name of the vision model. Defaults to "INVALID_NAME". 33 | :type name: str 34 | :param cache: Flag to determine whether to cache responses. Defaults to False. 35 | :type cache: bool 36 | """ 37 | self.logger = logging.getLogger(self.__class__.__name__) 38 | self.config: Dict = None 39 | self.model_name: str = model_name 40 | self.cache = cache 41 | if self.cache: 42 | self.response_cache: Dict[str, List[Any]] = {} 43 | if config_path is not None: 44 | self.load_config(config_path) 45 | self.name: str = name 46 | try: 47 | if self.config is not None: 48 | if self.config[model_name] is not None: 49 | self.name = self.config[model_name]["name"] 50 | except Exception: 51 | pass 52 | self.prompt_tokens: int = 0 53 | self.cost: float = 0.0 54 | 55 | def load_config(self, path: str) -> None: 56 | """ 57 | Load configuration from a specified path. 58 | 59 | :param path: Path to the config file. 60 | :type path: str 61 | """ 62 | with open(path, "r") as f: 63 | self.config = json.load(f) 64 | 65 | self.logger.debug(f"Loaded config from {path} for {self.model_name}") 66 | 67 | def clear_cache(self) -> None: 68 | """ 69 | Clear the response cache. 70 | """ 71 | self.response_cache.clear() 72 | 73 | @abstractmethod 74 | def load_model(self, device: str = None) -> None: 75 | """ 76 | Abstract method to load the vision model. 77 | 78 | :param device: The device to load the model on. Defaults to None. 79 | :type device: str 80 | """ 81 | pass 82 | 83 | @abstractmethod 84 | def unload_model(self) -> None: 85 | """ 86 | Abstract method to unload the vision model. 87 | """ 88 | pass 89 | 90 | @abstractmethod 91 | def generate_image(self, input: Union[List[str], str]) -> List[Image]: 92 | """ 93 | Abstract method to generate images for the given input text. 94 | 95 | :param input: A list of prompts or a single prompt string to generate images for. 96 | :type input: Union[List[str], str] 97 | :return: The generated images. 98 | :rtype: List[Image] 99 | """ 100 | pass 101 | -------------------------------------------------------------------------------- /CheckEmbed/vision_models/stable_diffusion3_5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main authors: Lorenzo Paleari 8 | # Eric Schreiber 9 | 10 | import gc 11 | from typing import List, Union 12 | 13 | import torch 14 | from diffusers import StableDiffusion3Pipeline 15 | from PIL.Image import Image 16 | from tqdm import tqdm 17 | 18 | from CheckEmbed.vision_models import AbstractVisionModel 19 | 20 | 21 | class StableDiffusion3(AbstractVisionModel): 22 | """ 23 | The StableDiffusion3 class handles interactions with the Stable Diffusion 3.5 Medium model using the provided configuration. 24 | 25 | Inherits from the AbstractVisionModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, model_name: str = "", name: str = "stable-diffusion3.5-medium", cache: bool = False, data_type: torch.dtype = torch.bfloat16, num_inference_steps: int = 40, guidance_scale: float = 4.5 30 | ) -> None: 31 | """ 32 | Initialize the StableDiffusion3 instance with configuration, model details, and caching options. 
33 | 34 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 35 | :type model_name: str 36 | :param name: Name used for output files. Defaults to "stable-diffusion3.5-medium". 37 | :type name: str 38 | :param cache: Flag to determine whether to cache responses. Defaults to False. 39 | :type cache: bool 40 | :param data_type: The data type for the model, typically torch.bfloat16 or torch.float32. Defaults to torch.bfloat16. 41 | :type data_type: torch.dtype 42 | :param num_inference_steps: The number of inference steps for image generation. Defaults to 40. 43 | :type num_inference_steps: int 44 | :param guidance_scale: The guidance scale for image generation, which controls the adherence to the prompt. Defaults to 4.5. 45 | :type guidance_scale: float 46 | """ 47 | super().__init__(model_name=model_name, name=name, cache=cache) 48 | self.data_type = data_type 49 | self.num_inference_steps = num_inference_steps 50 | self.guidance_scale = guidance_scale 51 | 52 | def load_model(self, device: str = None) -> None: 53 | """ 54 | Load the diffusion pipeline based on the given model name. 55 | 56 | :param device: The device to load the model on. Defaults to None. 57 | :type device: str 58 | """ 59 | 60 | self.model = StableDiffusion3Pipeline.from_pretrained(self.model_name, torch_dtype=self.data_type) 61 | self.model = self.model.to(device) 62 | 63 | def unload_model(self) -> None: 64 | """ 65 | Unload the model and free the GPU memory. 66 | """ 67 | del self.model 68 | 69 | gc.collect() 70 | torch.cuda.empty_cache() 71 | 72 | self.model = None 73 | 74 | def generate_image(self, input: Union[List[str], str]) -> List[Image]: 75 | """ 76 | Generate images based on the input prompts using the Stable Diffusion 3.5 Medium model. 77 | 78 | This method takes a list of prompts or a single prompt string, generates one image per prompt, 79 | and returns the generated images in the order of the input prompts. 80 | 81 | :param input: A list of prompts or a single prompt string to generate images for. 82 | :type input: Union[List[str], str] 83 | :return: A list of generated images corresponding to the input prompts. 84 | :rtype: List[Image] 85 | """ 86 | if isinstance(input, str): 87 | input = [input] 88 | 89 | images = [] 90 | for prompt in tqdm(input, desc="Images to Generate", leave=False, total=len(input)): 91 | # Generate a single image for the current prompt 92 | image = self.model( 93 | prompt, 94 | num_inference_steps=self.num_inference_steps, 95 | guidance_scale=self.guidance_scale, 96 | ).images[0] 97 | 98 | images.append(image) 99 | 100 | return images 101 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 ETH Zurich. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | - Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | - Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer listed 13 | in this license in the documentation and/or other materials 14 | provided with the distribution. 
15 | 16 | - Neither the name of the copyright holders nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | The copyright holders provide no reassurances that the source code 21 | provided does not infringe any patent, copyright, or any other 22 | intellectual property rights of third parties. The copyright holders 23 | disclaim any liability to any recipient for claims brought against 24 | recipient by any third party for infringement of that parties 25 | intellectual property rights. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 | 39 | 40 | Citation 41 | ======== 42 | 43 | Any published work which uses this software should include the 44 | following citation: 45 | 46 | ---------------------------------------------------------------------- 47 | Maciej Besta, Lorenzo Paleari, Ales Kubicek, Piotr Nyczyk, Robert 48 | Gerstenberger, Patrick Iff, Tomasz Lehmann, Hubert Niewiadomski, 49 | Torsten Hoefler: CheckEmbed: Effective Verification of LLM Solutions 50 | to Open-Ended Tasks. In: arXiv preprint arXiv:2406.02524 51 | ---------------------------------------------------------------------- 52 | 53 | 54 | CheckEmbed uses code from the Graph-of-Thoughts, SelfCheckGPT 55 | and HalluDetect frameworks. You find their original licenses below. 56 | 57 | Graph-of-Thoughts (https://github.com/spcl/graph-of-thoughts) 58 | ============================================================= 59 | 60 | Copyright (c) 2023 ETH Zurich. 61 | All rights reserved. 62 | 63 | Redistribution and use in source and binary forms, with or without 64 | modification, are permitted provided that the following conditions are 65 | met: 66 | 67 | - Redistributions of source code must retain the above copyright 68 | notice, this list of conditions and the following disclaimer. 69 | 70 | - Redistributions in binary form must reproduce the above copyright 71 | notice, this list of conditions and the following disclaimer listed 72 | in this license in the documentation and/or other materials 73 | provided with the distribution. 74 | 75 | - Neither the name of the copyright holders nor the names of its 76 | contributors may be used to endorse or promote products derived from 77 | this software without specific prior written permission. 78 | 79 | The copyright holders provide no reassurances that the source code 80 | provided does not infringe any patent, copyright, or any other 81 | intellectual property rights of third parties. The copyright holders 82 | disclaim any liability to any recipient for claims brought against 83 | recipient by any third party for infringement of that parties 84 | intellectual property rights. 
85 | 86 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 87 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 88 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 89 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 90 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 91 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 92 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 93 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 94 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 95 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 96 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 97 | 98 | 99 | Citation 100 | ======== 101 | 102 | Any published work which uses this software should include the 103 | following citation: 104 | 105 | ---------------------------------------------------------------------- 106 | Maciej Besta, Nils Blach, Ales Kubicek, Robert Gerstenberger, Lukas 107 | Gianinazzi, Joanna Gajda, Tomasz Lehmann, Michał Podstawski, Hubert 108 | Niewiadomski, Piotr Nyczyk, Torsten Hoefler (2024): Graph of Thoughts: 109 | Solving Elaborate Problems with Large Language Models. In: Proceedings 110 | of the AAAI Conference on Artificial Intelligence, 38(16), 111 | 17682-17690. https://doi.org/10.1609/aaai.v38i16.29720 112 | ---------------------------------------------------------------------- 113 | 114 | 115 | SelfCheckGPT (https://github.com/potsawee/selfcheckgpt) 116 | ======================================================= 117 | 118 | MIT License 119 | 120 | Copyright (c) 2023 Potsawee Manakul 121 | 122 | Permission is hereby granted, free of charge, to any person obtaining a copy 123 | of this software and associated documentation files (the "Software"), to deal 124 | in the Software without restriction, including without limitation the rights 125 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 126 | copies of the Software, and to permit persons to whom the Software is 127 | furnished to do so, subject to the following conditions: 128 | 129 | The above copyright notice and this permission notice shall be included in all 130 | copies or substantial portions of the Software. 131 | 132 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 133 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 134 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 135 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 136 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 137 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 138 | SOFTWARE. 139 | 140 | Citation: 141 | ---------------------------------------------------------------------- 142 | Potsawee Manakul, Adian Liusie, Mark J.F. Gales (2023): SelfCheckGPT: 143 | Zero-Resource Black-Box Hallucination Detection for Generative Large 144 | Language Models. In: arXiv preprint arXiv:2303.08896 145 | ---------------------------------------------------------------------- 146 | 147 | 148 | HalluDetect (https://github.com/Rivas-AI/HalluDetect) 149 | ===================================================== 150 | We used an adapted version of HalluDetect as a baseline for comparison 151 | with CheckEmbed. 
152 | 153 | MIT License 154 | 155 | Copyright (c) 2024 Fidac 156 | 157 | Permission is hereby granted, free of charge, to any person obtaining a copy 158 | of this software and associated documentation files (the "Software"), to deal 159 | in the Software without restriction, including without limitation the rights 160 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 161 | copies of the Software, and to permit persons to whom the Software is 162 | furnished to do so, subject to the following conditions: 163 | 164 | The above copyright notice and this permission notice shall be included in all 165 | copies or substantial portions of the Software. 166 | 167 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 168 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 169 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 170 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 171 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 172 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 173 | SOFTWARE. 174 | 175 | Citation: 176 | ----------------------------------------------------------------------- 177 | Ernesto Quevedo, Jorge Yero Salazar, Rachel Koerner, Pablo Rivas, Tomas 178 | Cerny (2024): Detecting Hallucinations in Large Language Model 179 | Generation: A Token Probability Approach. In: Proceedings of the 26th 180 | International Conference on Artificial Intelligence and Applications 181 | (ICAI '24) 182 | ----------------------------------------------------------------------- 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CheckEmbed 2 | 3 |

4 | 5 |

6 | 7 | This is the official implementation of [CheckEmbed: Effective Verification of LLM Solutions to Open-Ended Tasks](https://arxiv.org/abs/2406.02524). 8 | 9 | This framework gives you the ability to verify LLM answers, especially for 10 | intricate open-ended tasks such as consolidation, summarization, and extraction 11 | of knowledge. CheckEmbed implements verification by running the LLMs' answers through 12 | an embedding model and comparing the corresponding answer-level embeddings. 13 | This reduction of a complex textual answer to a single embedding facilitates a 14 | straightforward, fast, and meaningful verification, while showcasing 15 | significant improvements in accuracy, cost-effectiveness, and runtime 16 | performance compared to existing token-, sentence-, and fact-level schemes such 17 | as BERTScore or SelfCheckGPT. 18 | 19 | 20 | ## Setup Guide 21 | 22 | In order to use this framework, you need a working installation of Python 3.8 or newer. 23 | 24 | 25 | ### Installing CheckEmbed 26 | 27 | Before running either of the following two installation methods, make sure to activate your Python environment (if any) beforehand. 28 | If you are a user and you just want to use `CheckEmbed`, you can install it from source: 29 | ```bash 30 | git clone https://github.com/spcl/CheckEmbed.git 31 | cd CheckEmbed 32 | pip install . 33 | 34 | # If you want to use a CUDA GPU, please install the CUDA extras as well. 35 | pip install ".[cuda]" 36 | ``` 37 | If you are a developer and you want to modify the code, you can install it in editable mode from source: 38 | ```bash 39 | git clone https://github.com/spcl/CheckEmbed.git 40 | cd CheckEmbed 41 | pip install -e . 42 | 43 | # If you want to use a CUDA GPU, please install the CUDA extras as well. 44 | pip install -e ".[cuda]" 45 | ``` 46 | 47 | ### Configuring the Models 48 | 49 | In order to use parts of the framework, you need to have access to an LLM and/or an embedding model. 50 | Please follow the instructions in the READMEs of the respective modules to configure the [LLMs](CheckEmbed/language_models/README.md) and [embedding models](CheckEmbed/embedding_models/README.md) of your choice. 51 | Please create a copy of `config_template.json` named `config.json` in the CheckEmbed directory and update its details according to your needs. 52 | 53 | 54 | ## Documentation 55 | The paper gives a high-level overview of the framework and its components. 56 | In order to understand the framework in more detail, you can read the documentation of the individual modules. 57 | The [Scheduler](CheckEmbed/scheduler/scheduler.py) module is especially important for understanding how to make the most of the framework, 58 | as is the [Operation](CheckEmbed/operations/README.md) module for interpreting the results. 59 | 60 | 61 | ## Examples 62 | 63 | The [examples](examples) directory contains several examples of use cases that can be solved using the framework, including the ones presented in the paper. 64 | It is a great starting point for learning how to use the framework to solve real problems. 65 | Each example contains a `README.md` file with instructions on how to run it and play with it. 66 | 67 | 68 | ## Paper Results 69 | 70 | You can run the experiments from the paper by following the instructions in the [examples](examples) directory. 71 | However, if you just want to inspect and replot the results, you can use the [paper](paper) directory. 
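## Quick Start

The following minimal sketch outlines how the pieces described above fit together: a `Parser` produces the prompts, the language model samples several answers per prompt, the embedding model embeds them, and the `Scheduler` drives the whole pipeline. It is an illustrative outline only; the parser, prompts, dataset path, config path, and budget below are placeholders, and the [examples](examples) directory remains the authoritative reference for complete, runnable configurations.

```python
import logging
import os

from CheckEmbed import embedding_models, language_models
from CheckEmbed.parser import Parser
from CheckEmbed.scheduler import Scheduler, StartingPoint


class MinimalParser(Parser):
    # Toy parser: a fixed list of prompts and no ground truth.
    def prompt_generation(self, custom_inputs=None):
        return ["Summarize the key obligations in the following contract clause: ..."]

    def ground_truth_extraction(self, custom_inputs=None):
        return []


# Working directory for the run; CheckEmbed stores its intermediate
# results (samples, embeddings, plots) in subdirectories of it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# LLM that generates the samples and embedding model that embeds them,
# both configured through your config.json (path is a placeholder).
config_path = "CheckEmbed/config.json"
gpt = language_models.ChatGPT(config_path, model_name="chatgpt4-o", cache=True)
embedder = embedding_models.EmbeddingGPT(config_path, model_name="gpt-embedding-large")

scheduler = Scheduler(
    current_dir,
    logging_level=logging.INFO,
    budget=5,                                # cost budget; placeholder value
    parser=MinimalParser("dataset.json"),    # dataset path is a placeholder
    lm=[gpt],
    embedding_lm=[embedder],
)

scheduler.run(
    startingPoint=StartingPoint.PROMPT,
    checkEmbed=True,
    num_samples=10,
    batch_size=64,
    device="cuda",   # or "cpu", "mps", ...
)
```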
72 | 73 | 74 | ## Citations 75 | 76 | If you find this repository valuable, please give it a star! 77 | Got any questions or feedback? Feel free to reach out and open an issue. 78 | Using this in your work? Please reference us using the provided citation: 79 | 80 | ```bibtex 81 | @misc{besta2024checkembed, 82 | title = {{CheckEmbed: Effective Verification of LLM Solutions to Open-Ended Tasks}}, 83 | author = {Besta, Maciej and Paleari, Lorenzo and Kubicek, Ales and Nyczyk, Piotr and Gerstenberger, Robert and Iff, Patrick and Lehmann, Tomasz and Niewiadomski, Hubert and Hoefler, Torsten}, 84 | year = 2024, 85 | month = Jun, 86 | eprinttype = {arXiv}, 87 | eprint = {2406.02524} 88 | } 89 | ``` 90 | -------------------------------------------------------------------------------- /examples/RAGTruth/README.md: -------------------------------------------------------------------------------- 1 | # RAGTruth Benchmark 2 | 3 | This benchmark is based on the [RAGTruth](https://github.com/ParticleMedia/RAGTruth) dataset, a hallucination detection benchmark tailored for Retrieval-Augmented Generation (RAG) systems. The dataset consists of three task types: **Summarization**, **Data-to-Text Generation**, and **Question Answering (QA)**. Each task includes human-written source documents paired with responses generated by multiple Large Language Models (LLMs). 4 | 5 | ## Tasks 6 | 7 | RAGTruth includes 2,965 unique source tasks: 8 | 9 | * **943** for Summarization 10 | * **1,033** for Data-to-Text 11 | * **989** for Question Answering 12 | 13 | For each task, answers are generated using **6 different LLMs**, resulting in a total of **17,790 documents**: 14 | 15 | * **5,658** Summarization responses 16 | * **6,198** Data-to-Text responses 17 | * **5,934** Question Answering responses 18 | 19 | These generated responses are located in the `dataset/` folder and are split into `training_data.json` and `response.json` (test) sets. 20 | 21 | ## Splits 22 | 23 | * The **test split** consists of **2,700** LLM-generated answers (900 per task type). 24 | * The remaining samples are in the **train split**. 25 | 26 | ## Evaluation Setup 27 | 28 | Following the instructions of the official [paper](https://arxiv.org/abs/2401.00396), we generated **10 samples per LLM-generated answer** in the test set. These synthetic samples enable hallucination detection evaluation using methods such as: 29 | 30 | * **CheckEmbed** 31 | * [**SelfCheckGPT**](https://github.com/potsawee/selfcheckgpt) 32 | 33 | The code for generating samples is included in the `samples/` directory. All generations were run locally, with **no additional inference cost**. 34 | 35 | ## Baselines 36 | 37 | We also include an adjusted version of [**HalluDetect**](https://github.com/Baylor-AI/HalluDetect), an effective method for hallucination classification in text generation. 38 | 39 | > **Note:** HalluDetect requires significant computational resources. For optimal performance, we recommend at least **2 × NVIDIA A100 GPUs** (40GB+ VRAM). 40 | 41 | ## Cost Estimation 42 | 43 | * **LLM-as-a-Judge**: \~\$20 total for scoring using default LLMs. 44 | 45 | ## How to Run 46 | 47 | Please execute the following steps to reproduce the experiment: 48 | 49 | ```bash 50 | cd examples/RAGTruth/dataset 51 | # (Optional) Run sampling script if not already done 52 | # We already provide the necessary additional samples in the repository. 53 | python3 sampler.py 54 | 55 | # Evaluate using CheckEmbed 56 | cd .. 
57 | python3 main.py 58 | 59 | # Run HalluDetect 60 | python3 hallu_detect.py 61 | ``` 62 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This directory contains scripts for running various examples using the CheckEmbed package. Each script is a standalone Python program that sets up and runs a particular example. 4 | 5 | Please refer to the individual example directories for more information on the specific example, specifically the `main.py` file, which is almost ready to be executed. 6 | 7 | ## General Information 8 | 9 | In each `main.py` file, the following parameters need to be set up for the desired environment: 10 | 11 | - Check that the `config_path` variable is set up correctly. 12 | - Choose the language model(s) to evaluate. 13 | - Choose the embedding model(s). 14 | - Check the `device` and `batch_size` parameters for the embeddings models and scheduler. 15 | - Modify the `startingPoint` parameter of `scheduler.run(...)` to influence which stages will be executed: 16 | - `StartingPoint.PROMPT`: prompt generation, sample generation, embedding generation and evaluation (plotting) 17 | - `StartingPoint.SAMPLES`: sample generation, embedding generation and evaluation (plotting) 18 | - `StartingPoint.EMBEDDINGS`: embedding generation and evaluation (plotting) 19 | - If you want to use the `Alibaba-NLP/gte-Qwen1.5-7B-instruct` embedding model, please add your Huggingface access token to respective initialisation call. 20 | 21 | Once everything is set up, change into the desired example folder and execute: 22 | 23 | ``` 24 | python3 main.py 25 | ``` 26 | 27 | ## Scheduler Setup 28 | 29 | The file [scheduler.py](/CheckEmbed/scheduler/scheduler.py) contains specific documentation for each parameter. 30 | 31 | ```python 32 | scheduler = Scheduler( 33 | current_dir, 34 | logging_level = logging.DEBUG, 35 | 36 | # Adjust the budget based on the estimations documented for each example. 37 | # If the budget is too low, the execution of the pipeline will be stopped as soon as the limit is detected. 38 | budget = 12, 39 | parser = customParser, 40 | 41 | # Update to include more or fewer LLMs / embedding models. 42 | lm = [gpt4_o, gpt4, gpt3], 43 | embedding_lm = [embedd_large, sfrEmbeddingMistral, e5mistral7b, gteQwen157bInstruct], 44 | llm_as_a_judge_Operation = llm_as_a_judge_Operation, 45 | llm_as_a_judge_models = [gpt4_o_mini, gpt4_o, llama70, llama8], 46 | 47 | # Operations to be executed during the evaluation stage. 48 | operations = [operation1, operation2, ...], 49 | ) 50 | 51 | # The order of lm_names and embedding_lm_names should be the same 52 | # as the order of the language models and embedding language models respectively. 53 | scheduler.run( 54 | # If an error occurs, the starting point can be adjusted to avoid recomputation. 55 | startingPoint = StartingPoint.PROMPT, 56 | 57 | # utility functions 58 | defaultDirectories = True, 59 | 60 | # Indicate which operations to run. 61 | bertScore = True, 62 | selfCheckGPT = True, 63 | checkEmbed = True, 64 | llm_as_a_judge = True, 65 | 66 | # Settings for the pipeline. 67 | ground_truth = False, 68 | spacy_separator = True, 69 | time_performance = False, 70 | rebase_results = False, 71 | reference_text = False, 72 | 73 | # Number of samples per prompt example. 74 | num_samples = 10, 75 | 76 | # Optional values, if not set, the default values will be used. 
77 | lm_names = ["gpt4-o", "gpt4-turbo", "gpt"], 78 | embedding_lm_names = ["gpt-embedding-large", "sfr-embedding-mistral", "e5-mistral-7B-instruct", "gte-Qwen1.5-7B-instruct", "stella-en-400M-v5", "stella-en-1.5B-v5"], 79 | 80 | # Do not modify 81 | bertScore_model = "microsoft/deberta-xlarge-mnli", 82 | 83 | # It may be necessary to reduce the batch size if the model is too large, with 8GB of GPU VRAM we suggest the use of batch_size = 1. 84 | batch_size = 64, 85 | device = "cuda" # or "cpu" "mps" ... 86 | ) 87 | ``` 88 | -------------------------------------------------------------------------------- /examples/description/README.md: -------------------------------------------------------------------------------- 1 | # Distinguishing Similar and Different Text Passages 2 | 3 | The use case in this directory analyzes, whether a verification method is able to clearly distinguish two passages of text that either look 4 | similar, but come with very different meanings ("different") or look different, but have similar or identical meanings ("similar"). 5 | 6 | ## Data 7 | 8 | The list of topics for the different subtask can be found in `different_topics_list` list in the `different/main.py` file. 9 | There are two lists of topics for the similar subtask: `precise` and `generic`. Both lists (`precise_topics` and `general_topics`) can be found 10 | in the `similar/main.py` file. 11 | 12 | ## Prompt Templates 13 | 14 | The prompt templates for the subtasks can be found in `different/prompt_scheme.txt` and `similar/prompt_scheme.txt` respectively. 15 | 16 | ## Runtime / Cost Estimation 17 | 18 | The samples have been generated with a temperature of 1.0. The temperature can be adjusted in your `config.json`. 19 | We estimate a compute time of 90 minutes with an NVIDIA Tesla V100-PCIE-32GB for each subtask. 20 | 21 | The total estimated costs are $1.55 for each subtask: 22 | 23 | - GPT4-o: $0.5 24 | - GPT4-turbo: $1 25 | - GPT3.5: $0.05 26 | 27 | Running LLM-as-a-Judge for all subtasks will result in an additional cost of no more than $1. 28 | -------------------------------------------------------------------------------- /examples/description/different/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate two passages of text. They should describe two diffent things: 4 | 1. ### HERE 1 ### 5 | 2. ### HERE 2 ### 6 | 7 | However, these two passages should have the same length and style 8 | I want you to give an answer using the following format: 9 | 10 | ### DESCRIPTION 1 ### 11 | the actual description here... 12 | ### DESCRIPTION 2 ### 13 | the actual description here... 14 | 15 | 16 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/description/similar/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate two passages of text. They should both describe the same thing (### HERE ###). However, these two passages should differ VASTLY in their length, style. 4 | I want you to give an answer using the following format: 5 | 6 | ### DESCRIPTION 1 ### 7 | the actual description here... 8 | ### DESCRIPTION 2 ### 9 | the actual description here... 
10 | 11 | 12 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/legal_summaries/README.md: -------------------------------------------------------------------------------- 1 | # Hallucination 2 | 3 | The use case in this directory detects small fine-grained hallucinations, such 4 | as mistakes in individual facts. The use case is based on the summarization of different legal text chunks. 5 | For each chunk considered, the ground truth is generated using a 6 | special prompt `prompt_scheme_ground_truth.txt`, which gathers 10 samples from the LLM by asking for a correct summarization of that chunk. 7 | The LLM is also tasked to provide errors for that chunk, in the range from 1 to 10. 8 | These errors are then incorporated separately into the summary, so that the number of errors inside the summary varies between 1 to 10. 9 | These error-ridden summary are then sampled with an LLM and compared against the zero error original summary via the CheckEmbed pipeline. 10 | 11 | ## Data 12 | 13 | The dataset can be found in the file `dataset/legal_definitions.json`. It consists of text chunks to be summarized. 14 | 15 | ## Prompt Templates 16 | 17 | The prompt templates can be found in the files `prompt_scheme.txt` and `prompt_scheme_ground_truth.txt`. 18 | 19 | ## Runtime / Cost Estimation 20 | 21 | The samples have been generated with a temperature of 0.25. The temperature can be adjusted in your `config.json`. 22 | We estimate a compute time of 20 hours with an NVIDIA GH200. 23 | 24 | The total estimated costs are $35: 25 | 26 | - GPT4-o: $33 27 | - GPT3.5: $2 28 | 29 | Running LLM-as-a-Judge will result in an additional cost of no more than $2. 30 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/legal_summaries/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | You are a lawyer. 4 | 5 | ### TASK ### 6 | 7 | ## SUBTASK 1 ## 8 | Based on the provided context generate ### NUMBER ### completely false information (fact hallucinations) about it. 9 | 10 | ## SUBTASK 2 ## 11 | Create a complete and detailed summary of the provided context including all the errors generated in SUBTASK 1. 12 | When including the hallucinations you must not say they are wrong. 13 | 14 | Answer using the following formatting. 15 | 16 | ### ERRORS ### 17 | List of fact hallucinations to be later included in the summary... 18 | ### SUMMARY ### 19 | The summary here.... 20 | 21 | 22 | ### CONTEXT ### 23 | 24 | [###REPLACE WITH CONTEXT###] 25 | 26 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/legal_summaries/prompt_scheme_ground_truth.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | You are a lawyer. 4 | 5 | ### TASK ### 6 | 7 | Based on the provided context generate a complete and detaild summary of it. Answer using the following formatting. 8 | 9 | ### SUMMARY ### 10 | The summary here.... 
11 | 12 | 13 | ### CONTEXT ### 14 | 15 | [###REPLACE WITH CONTEXT###] 16 | 17 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/operation_variants/__init__.py: -------------------------------------------------------------------------------- 1 | from .bertscore_operation_variant import BertScoreOperation_Variant 2 | from .selfcheckgpt_operation_variant import SelfCheckGPT_BERT_Operation_Variant, SelfCheckGPT_NLI_Operation_Variant 3 | from .checkembed_operation_variant import CheckEmbedOperation_Variant 4 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/operation_variants/bertscore_operation_variant.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import logging 10 | import os 11 | import json 12 | 13 | import bert_score 14 | import numpy as np 15 | from tqdm import tqdm 16 | from typing import Any 17 | 18 | from CheckEmbed.operations import Operation 19 | from CheckEmbed.utility import capture_specific_stderr, frobenius_norm 20 | 21 | class BertScoreOperation_Variant(Operation): 22 | """ 23 | Operation that computes the BertScore between the reference and the sample embeddings. 24 | 25 | Inherits from the Operation class and implements its abstract methods. 26 | """ 27 | 28 | def __init__(self, result_dir_path: str, ground_truth_dir_path: str, sample_dir_path: str) -> None: 29 | """ 30 | Initialize the operation. 31 | 32 | :param result_dir_path: The path to the directory where the results will be stored. 33 | :type result_dir_path: str 34 | :param ground_truth_dir_path: The path to the directory where the ground truth samples are stored. 35 | :type ground_truth_dir_path: str 36 | :param sample_dir_path: The path to the directory where the samples are stored. 37 | :type sample_dir_path: str 38 | """ 39 | super().__init__(result_dir_path) 40 | self.sample_dir_path = sample_dir_path 41 | self.ground_truth = ground_truth_dir_path 42 | 43 | def execute(self, custom_inputs: Any) -> Any: 44 | """ 45 | Execute the operation on the embeddings/samples. 46 | 47 | :param custom_inputs: The custom inputs for the operation. 
48 | :type custom_inputs: any 49 | """ 50 | 51 | print("\n\nRunning BertScore operation.") 52 | 53 | # Initialize logging 54 | logging.basicConfig( 55 | filename=os.path.join(self.result_dir_path, "log.log"), 56 | filemode="w", 57 | format="%(name)s - %(levelname)s - %(message)s", 58 | level=custom_inputs["logging_level"], 59 | ) 60 | 61 | # Run BertScore for every pair of language model and samples 62 | for lm_name in (pbar := tqdm(custom_inputs["lm_names"], desc="Language Models", leave=True)): 63 | pbar.set_postfix_str(f"{lm_name}") 64 | logging.info(f"Loading responses from {lm_name}.") 65 | samples = [] 66 | 67 | # Load samples from the language model 68 | with open(os.path.join(self.sample_dir_path, f"{lm_name}_samples.json")) as f: 69 | responses = json.load(f) 70 | 71 | for index, response in enumerate(responses["data"]): 72 | samples.append(response["samples"]) 73 | logging.debug(f"Sample {index}: {samples[index]}") 74 | 75 | logging.info("Loaded samples.") 76 | 77 | # load second set of samples that will be ground_truth data 78 | ground_truth_list = [] 79 | with open(os.path.join(self.ground_truth, f"{lm_name}_samples.json")) as f: 80 | responses = json.load(f) 81 | 82 | for index, response in enumerate(responses["data"]): 83 | ground_truth_list.append(response["samples"]) 84 | logging.debug(f"ground_truth {index}: {ground_truth_list[index]}") 85 | 86 | # For every prompt compare every sample with every other sample 87 | logging.info(f"Running BertScore for {lm_name}.") 88 | 89 | same_samples = [] 90 | for sample in samples: 91 | same_s = [] 92 | for i in range(len(sample)): 93 | temp = [] 94 | for j in range(len(sample)): 95 | temp.append(sample[i]) 96 | same_s.append(temp) 97 | same_samples.append(same_s) 98 | 99 | results = [] 100 | for ground_truth, same_sample in tqdm(zip(ground_truth_list, same_samples), total=len(samples), desc="Prompts", leave=False): 101 | result = [] 102 | for s in tqdm(same_sample, desc="Samples", leave=False): 103 | target_string = ["Warning: Empty reference sentence detected; setting raw BERTScores to 0.","Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."] 104 | with capture_specific_stderr(target_string): 105 | result.append(bert_score.score( 106 | ground_truth, s, model_type=custom_inputs["model_type"], 107 | batch_size=custom_inputs["batch_size"], device=custom_inputs["device"], 108 | lang="en", verbose=False, 109 | rescale_with_baseline=True, 110 | )[2].tolist()) 111 | results.append(result) 112 | logging.debug(f"Results: {result}") 113 | 114 | logging.info(f"Finished running BertScore for {lm_name}.") 115 | 116 | # Fix the results that are less than -1 117 | for index, result in enumerate(results): 118 | temp_res = np.zeros((len(result), len(result[0]))) 119 | for i in range(temp_res.shape[0]): 120 | for j in range(temp_res.shape[1]): 121 | if temp_res[i][j] < -1: 122 | temp_res[i][j] = -1 123 | else: 124 | temp_res[i][j] = result[i][j] 125 | results[index] = temp_res 126 | 127 | frobenius_norms = [frobenius_norm(result) for result in results] 128 | std_devs = [np.std(result) for result in results] 129 | 130 | # Store results 131 | with open(os.path.join(self.result_dir_path, f"{lm_name}_bert.json"), "w") as f: 132 | results_json = [{ 133 | "index": i, 134 | "result": result.tolist(), 135 | 
"frobenius_norm": frob_norm, 136 | "std_dev": std_dev 137 | } for i, result, frob_norm, std_dev in zip(range(len(results)), results, frobenius_norms, std_devs)] 138 | json.dump({"data": results_json}, f, indent=4) 139 | 140 | logging.info(f"Saved results for {lm_name}.") 141 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/operation_variants/checkembed_operation_variant.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import numpy as np 13 | 14 | from typing import Any 15 | 16 | from CheckEmbed.operations import Operation 17 | from CheckEmbed.utility import cosine_similarity, frobenius_norm 18 | 19 | class CheckEmbedOperation_Variant(Operation): 20 | """ 21 | Operation that computes the cosine similarity, the Pearson correlation, the Frobenius norm and standard deviation between the embeddings. 22 | """ 23 | 24 | def __init__(self, result_dir_path: str, ground_truth_dir_path: str, embeddings_dir_path: str) -> None: 25 | """ 26 | Initialize the operation. 27 | 28 | :param result_dir_path: The path to the directory where the results will be stored. 29 | :type result_dir_path: str 30 | :param ground_truth_dir_path: The path to the directory where the ground truth embeddings are stored. 31 | :type ground_truth_dir_path: str 32 | :param embeddings_dir_path: The path to the directory where the embeddings are stored. 33 | :type embeddings_dir_path: str 34 | """ 35 | super().__init__(result_dir_path) 36 | self.ground_truth_dir = ground_truth_dir_path 37 | self.embeddings_dir_path = embeddings_dir_path 38 | 39 | def execute(self, custom_inputs: Any) -> Any: 40 | """ 41 | Execute the operation on the embeddings/samples. 42 | 43 | :param custom_inputs: The custom inputs for the operation. 
44 | :type custom_inputs: Any 45 | """ 46 | 47 | # For every language model / embedding model 48 | for file in os.listdir(self.embeddings_dir_path): 49 | if ".json" in file and not file.startswith("ground_truth_"): 50 | 51 | folder_name = file.replace("_" + file.split("_")[2], "") 52 | 53 | # Load the samples embeddings 54 | with open(os.path.join(self.embeddings_dir_path, file), "r") as f: 55 | data = json.load(f) 56 | data_array = data["data"] 57 | embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 58 | 59 | # Load the ground_truth embeddings 60 | dimensions = len(embeddings[0]) 61 | with open(os.path.join(self.ground_truth_dir, file), "r") as f: 62 | data = json.load(f) 63 | data_array = data["data"] 64 | ground_truth_embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 65 | 66 | # Compute the cosine similarity matrix with the ground truth 67 | cosine_similarity_matrix_array = [] 68 | for embedding, ground_truth in zip(embeddings, ground_truth_embeddings): 69 | # -1 array to initialize the cosine similarity matrix 70 | cosine_similarity_matrix = np.full((dimensions, dimensions), -1.0) 71 | for i in range(len(embedding)): 72 | for j in range(len(embedding)): 73 | cosine_similarity_matrix[i, j] = cosine_similarity(ground_truth[i], embedding[j]) 74 | 75 | cosine_similarity_matrix_array.append(cosine_similarity_matrix) 76 | 77 | # Compute the Frobenius norm of each cosine similarity matrix 78 | frobenius_norms_cosine_sim = [frobenius_norm(cosine_similarity_matrix) for cosine_similarity_matrix in cosine_similarity_matrix_array] 79 | 80 | # Compute the standard deviation of each cosine similarity matrix 81 | std_dev_cosine_sim_array = [np.std(cosine_similarity_matrix) for cosine_similarity_matrix in cosine_similarity_matrix_array] 82 | 83 | # Compute the Pearson correlation matrix 84 | pearson_corr_array = [] 85 | for embedding, ground_truth in zip(embeddings, ground_truth_embeddings): 86 | pearson_corr = np.full((dimensions, dimensions), -1.0) 87 | for i in range(len(embedding)): 88 | for j in range(len(embedding)): 89 | if len(embedding[i]) == 0 and len(embedding[j]) == 0: 90 | pearson_corr[i, j] = 1.0 91 | continue 92 | if len(embedding[i]) == 0 or len(embedding[j]) == 0: 93 | pearson_corr[i, j] = -1.0 94 | continue 95 | pearson_corr[i, j] = np.corrcoef(embedding[i], embedding[j])[0, 1] 96 | 97 | pearson_corr_array.append(pearson_corr) 98 | 99 | # Compute the Frobenius norm of each cosine similarity matrix 100 | frobenius_norms_pearson_corr = [frobenius_norm(pearson_corr) for pearson_corr in pearson_corr_array] 101 | 102 | std_dev_pearson_corr_array = [np.std(pearson_corr) for pearson_corr in pearson_corr_array] 103 | 104 | with open(os.path.join(self.result_dir_path, folder_name + "_results.json"), "w") as f: 105 | results_json = [{ 106 | "index": index, 107 | "cosine_sim": cosine_sim.tolist(), 108 | "frob_norm_cosine_sim": frob_norm_cosine_sim, 109 | "std_dev_cosine_sim": std_dev_cosine_sim, 110 | "pearson_corr": pearson_corr.tolist(), 111 | "frob_norm_pearson_corr": frob_norm_pearson_corr, 112 | "std_dev_pearson_corr": std_dev_pearson_corr 113 | } for index, cosine_sim, frob_norm_cosine_sim, std_dev_cosine_sim, pearson_corr, frob_norm_pearson_corr, std_dev_pearson_corr in zip(range(len(cosine_similarity_matrix_array)), cosine_similarity_matrix_array, frobenius_norms_cosine_sim, std_dev_cosine_sim_array, pearson_corr_array, frobenius_norms_pearson_corr, std_dev_pearson_corr_array)] 114 | json.dump({"data": results_json}, f, indent=4) 115 | 
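# Minimal usage sketch: this variant only needs the three directories passed to
# the constructor, and `execute` does not use `custom_inputs`, so `None` is
# sufficient. The directory names below are placeholders that assume the working
# directory layout produced by the Scheduler.
#
#   operation = CheckEmbedOperation_Variant(
#       result_dir_path="./CheckEmbed",
#       ground_truth_dir_path="./embeddings_ground_truth",
#       embeddings_dir_path="./embeddings",
#   )
#   operation.execute(None)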
-------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/scientific_descriptions/README.md: -------------------------------------------------------------------------------- 1 | # Hallucination 2 | 3 | The use case in this directory detects small fine-grained hallucinations, such 4 | as mistakes in individual facts. The use case is based on the description of different scientific topics. 5 | For each topic considered, the ground truth is generated using a 6 | special prompt `prompt_scheme_ground_truth.txt`, which gathers 10 samples from the LLM by asking for a correct description of that specific topic. 7 | The LLM is also tasked to provide errors for that topic, in the range from 1 to 10. 8 | These errors are then incorporated separately into the description, so that the number of errors inside the description varies between 1 to 10. 9 | These error-ridden descriptions are then sampled with an LLM and compared against the zero error original description via the CheckEmbed pipeline. 10 | 11 | ## Data 12 | 13 | The list of topics can be found in `topics_list` list in the `main.py` file. 14 | 15 | ## Prompt Templates 16 | 17 | The prompt templates can be found in the files `prompt_scheme.txt` and `prompt_scheme_ground_truth.txt`. 18 | 19 | ## Runtime / Cost Estimation 20 | 21 | The samples have been generated with a temperature of 1.0. The temperature can be adjusted in your `config.json`. 22 | We estimate a compute time of 20 hours with an NVIDIA GH200. 23 | 24 | The total estimated costs are $35: 25 | 26 | - GPT4-o: $33 27 | - GPT3.5: $2 28 | 29 | Running LLM-as-a-Judge will result in an additional cost of no more than $2. 30 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/scientific_descriptions/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate ### NUMBER ### completely false information (fact hallucinations) on (### TOPIC ###). 4 | Then insert the errors inside a passage of text that talks about (### TOPIC ###). 5 | You should convince a reader that the false informations are actually correct ones. 6 | 7 | Please, use the following format for answering: 8 | 9 | 10 | ### ERRORS ### 11 | List of fact hallucinations to be later included in the passage... 12 | ### PASSAGE ### 13 | The passage here.... 14 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/scientific_descriptions/prompt_scheme_ground_truth.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate a passage of text that talks about (### TOPIC ###). 4 | 5 | Please, use the following format for answering: 6 | 7 | ### PASSAGE ### 8 | The passage here.... 9 | -------------------------------------------------------------------------------- /examples/legal_definitions/README.md: -------------------------------------------------------------------------------- 1 | # Legal Definitions 2 | 3 | The use case in this directory extracts terms and their definitions from legal documents. It is based on an in-house legal analytics project. 4 | 5 | We use this example also for an ablation study by varying the chunk sizes that are processed in a single step. 
6 | An increase in chunk size means that more terms and their definitions need to be extracted at a time. 7 | The general assumption is that the LLM will perform worse if the processed document size increases, which should be reflected in the resulting CheckEmbed scores. 8 | If you wish the run the original use case with a single chunk size, please comment out the lines 233 to 241 in `main.py`. 9 | 10 | ## Data 11 | 12 | The dataset can be found in the file `dataset/legal_definitions.json`. It consists of text chunks as well as expected terms to be found (the "ground truth"). 13 | 14 | ## Prompt Template 15 | 16 | The prompt template can be found in the file `prompt_scheme.txt`. 17 | 18 | ## Runtime / Cost Estimation 19 | 20 | The samples have been generated with a temperature of 0.25. The temperature can be adjusted in your `config.json`. 21 | We estimate a compute time of 90 minutes with an NVIDIA A100-SXM-40GB for each experiment. 22 | 23 | Based on the experiment the total estimated costs are $7 (1 chunk), $11 (2 chunks) and $18 (4 chunks): 24 | - GPT4-o: $2.25, $3.5, $5.75 25 | - GPT4-turbo: $4.5, $7, $10.5 26 | - GPT3.5: $0.15, $0.5, $1.5 27 | -------------------------------------------------------------------------------- /examples/legal_definitions/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | import logging 11 | import os 12 | from typing import Any, List 13 | 14 | from CheckEmbed import embedding_models 15 | from CheckEmbed import language_models 16 | from CheckEmbed.parser import Parser 17 | from CheckEmbed.scheduler import Scheduler, StartingPoint 18 | from CheckEmbed.operations import SelfCheckGPT_BERT_Operation, SelfCheckGPT_NLI_Operation 19 | 20 | 21 | class CustomParser(Parser): 22 | """ 23 | The CustomParser class handles the dataset parsing. 24 | 25 | Inherits from the Parser class and implements its abstract methods. 26 | """ 27 | 28 | def __init__(self, dataset_path: str, prompt_scheme_path: str, num_chunks: int) -> None: 29 | """ 30 | Initialize the parser. 31 | 32 | :param dataset_path: The path to the dataset. 33 | :type dataset_path: str 34 | :param prompt_scheme_path: The path to the prompt scheme file. 35 | :type prompt_scheme_path: str 36 | :param num_chunks: The number of chunks. 37 | :type num_chunks: int 38 | """ 39 | super().__init__(dataset_path) 40 | self.prompt_scheme_path = prompt_scheme_path 41 | self.num_chunks = num_chunks 42 | 43 | def prompt_generation(self, custom_inputs: Any = None) -> List[str]: 44 | """ 45 | Parse the dataset and generate the prompts for the model. 46 | 47 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 48 | :type custom_inputs: Any 49 | :return: List of prompts. 
50 | :rtype: List[str] 51 | """ 52 | # Getting the input data from the dataset 53 | input_data = [] 54 | with open(self.dataset_path) as f: 55 | json_data = json.load(f) 56 | 57 | data_array = json_data['data'] 58 | for data in data_array: 59 | input_data.append(data['chunk_txt']) 60 | 61 | # Prompts generation 62 | prompt_complete = None 63 | with open(self.prompt_scheme_path) as f: 64 | prompt_complete = f.read() 65 | 66 | prompt_initial = prompt_complete[0:prompt_complete.find('[###REPLACE WITH CONTEXT###]')] 67 | prompt_final = prompt_complete[prompt_complete.find('[###REPLACE WITH CONTEXT###]')+len('[###REPLACE WITH CONTEXT###]'):] 68 | 69 | start_index = 0 70 | if self.num_chunks == 1: 71 | start_index = 1 72 | 73 | # Use the input data as context inside the prompts 74 | prompts = [] 75 | for i in range(start_index, len(input_data) - self.num_chunks + 1): 76 | prompts.append(prompt_initial + "".join(input_data[i:i+self.num_chunks]) + prompt_final) 77 | 78 | return prompts 79 | 80 | def ground_truth_extraction(self, custom_inputs: Any = None) -> List[str]: 81 | """ 82 | Parse the dataset and extract the ground truth. 83 | 84 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 85 | :type custom_inputs: Any 86 | :return: List of ground truths. 87 | :rtype: List[str] 88 | """ 89 | ground_truth = [] 90 | with open(self.dataset_path) as f: 91 | json_data = json.load(f) 92 | 93 | data_array = json_data['data'] 94 | for data in data_array: 95 | text = "" 96 | for definition in data['definitions']: 97 | text += definition["term"] + ". " + definition["context"] + "\n" 98 | 99 | text = text[:-1] 100 | ground_truth.append(text) 101 | 102 | start_index = 0 103 | if self.num_chunks == 1: 104 | start_index = 1 105 | 106 | composite_ground_truth = [] 107 | for i in range(start_index, len(ground_truth) - self.num_chunks + 1): 108 | composite_ground_truth.append("\n".join(ground_truth[i:i+self.num_chunks])) 109 | 110 | return composite_ground_truth 111 | 112 | def start(current_dir: str, num_chunks: int = 1, start: int = StartingPoint.PROMPT) -> None: 113 | """ 114 | Start the main function. 115 | 116 | :param current_dir: The current directory. 117 | :type current_dir: str 118 | :param num_chunks: The number of chunks. Defaults to 1. 119 | :type num_chunks: int 120 | :param start: The starting point. Defaults to StartingPoint.PROMPT. 
121 | :type start: int 122 | """ 123 | 124 | # Config file for the LLM(s) 125 | config_path = os.path.join( 126 | current_dir, 127 | "../../CheckEmbed/config.json", 128 | ) 129 | 130 | # Initialize the parser and the embedder 131 | customParser = CustomParser("./dataset/legal_definitions.json", os.path.join(current_dir, "prompt_scheme.txt"), num_chunks=num_chunks) 132 | 133 | # Initialize the language models 134 | gpt3 = language_models.ChatGPT( 135 | config_path, 136 | model_name = "chatgpt", 137 | cache = True, 138 | ) 139 | 140 | gpt4 = language_models.ChatGPT( 141 | config_path, 142 | model_name = "chatgpt4-turbo", 143 | cache = True, 144 | ) 145 | 146 | gpt4_o = language_models.ChatGPT( 147 | config_path, 148 | model_name = "chatgpt4-o", 149 | cache = True, 150 | ) 151 | 152 | embedd_large = embedding_models.EmbeddingGPT( 153 | config_path, 154 | model_name = "gpt-embedding-large", 155 | cache = False, 156 | ) 157 | 158 | sfrEmbeddingMistral = embedding_models.SFREmbeddingMistral( 159 | model_name = "Salesforce/SFR-Embedding-Mistral", 160 | cache = False, 161 | ) 162 | 163 | e5mistral7b = embedding_models.E5Mistral7b( 164 | model_name = "intfloat/e5-mistral-7b-instruct", 165 | cache = False, 166 | ) 167 | 168 | gteQwen157bInstruct = embedding_models.GteQwenInstruct( 169 | model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct", 170 | cache = False, 171 | access_token = "", # Add your access token here 172 | batch_size = 4, # it may be necessary to reduce the batch size if the GPU VRAM < 40GB 173 | ) 174 | 175 | stella_en_15B_v5 = embedding_models.Stella( 176 | model_name = "NovaSearch/stella_en_1.5B_v5", 177 | variant = "1.5B-v5", 178 | cache = False, 179 | ) 180 | 181 | stella_en_400M_v5 = embedding_models.Stella( 182 | model_name = "NovaSearch/stella_en_400M_v5", 183 | cache = False, 184 | ) 185 | 186 | selfCheckGPT_BERT_Operation = SelfCheckGPT_BERT_Operation( 187 | os.path.join(current_dir, "SelfCheckGPT"), 188 | current_dir, 189 | ) 190 | 191 | selfCheckGPT_NLI_Operation = SelfCheckGPT_NLI_Operation( 192 | os.path.join(current_dir, "SelfCheckGPT"), 193 | current_dir, 194 | ) 195 | 196 | # Initialize the scheduler 197 | scheduler = Scheduler( 198 | current_dir, 199 | logging_level = logging.DEBUG, 200 | budget = 30, 201 | parser = customParser, 202 | lm = [gpt4_o, gpt4, gpt3], 203 | embedding_lm = [stella_en_15B_v5, stella_en_400M_v5, gteQwen157bInstruct, e5mistral7b, sfrEmbeddingMistral, embedd_large], 204 | selfCheckGPTOperation=[selfCheckGPT_NLI_Operation, selfCheckGPT_BERT_Operation], 205 | ) 206 | 207 | # The order of lm_names and embedding_lm_names should be the same 208 | # as the order of the language models and embedding language models respectively. 209 | scheduler.run( 210 | startingPoint = start, 211 | bertScore = True, 212 | selfCheckGPT = True, 213 | ground_truth = True, 214 | rebase_results=True, 215 | num_samples = 10, 216 | bertScore_model = "microsoft/deberta-xlarge-mnli", 217 | batch_size = 64, # it may be necessary to reduce the batch size if the model is too large 218 | device = "cuda" # or "cpu" "mps" ... 
219 | ) 220 | 221 | if __name__ == "__main__": 222 | current_dir = os.path.dirname(os.path.abspath(__file__)) + "/chunk_dim_1" 223 | os.makedirs(current_dir, exist_ok=True) 224 | start(current_dir, num_chunks=1, start=StartingPoint.PROMPT) 225 | 226 | current_dir = os.path.dirname(os.path.abspath(__file__)) + "/chunk_dim_2" 227 | os.makedirs(current_dir, exist_ok=True) 228 | start(current_dir, num_chunks=2, start=StartingPoint.PROMPT) 229 | 230 | current_dir = os.path.dirname(os.path.abspath(__file__)) + "/chunk_dim_4" 231 | os.makedirs(current_dir, exist_ok=True) 232 | start(current_dir, num_chunks=4, start=StartingPoint.PROMPT) 233 | -------------------------------------------------------------------------------- /examples/legal_definitions/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | You are a lawyer. 4 | 5 | ### QUESTION ### 6 | 7 | Based on the provided context extract all the legal definitions. Answer using the following formatting. 8 | 9 | Term.Definition 10 | Term.Definition 11 | ... 12 | 13 | 14 | [...] 15 | ### CONTEXT ### 16 | 17 | Preliminary Note 18 | The Stock Purchase Agreement sets forth the basic terms of the purchase and sale of the preferred stock to the investors (such as the purchase price, closing date, conditions to closing) and identifies the other financing documents. Generally this agreement does not set forth either (1) the characteristics of the stock being sold (which are defined in the Certificate of Incorporation) or (2) the relationship among the parties after the closing, such as registration rights, rights of first refusal and co-sale and voting arrangements (these matters often implicate persons other than just the Company and the investors in this round of financing and are usually embodied in separate agreements to which those others persons are parties, or in some cases in the Certificate of Incorporation). The main items of negotiation in the Stock Purchase Agreement are therefore the price and number of shares being sold, the representations and warranties that the Company must make to the investors and the closing conditions for the transaction. 19 | SERIES A PREFERRED STOCK PURCHASE AGREEMENT 20 | THIS SERIES A PREFERRED STOCK PURCHASE AGREEMENT (this “Agreement”), is made as of [], 20[], by and among [______], a Delaware corporation (the “Company”), and the investors listed on Exhibit A attached to this Agreement (each a “Purchaser” and together the “Purchasers”). 21 | The parties hereby agree as follows: 22 | 23 | ### ANSWER ### 24 | 25 | Agreement. THIS SERIES A PREFERRED STOCK PURCHASE AGREEMENT 26 | Company. Delaware corporation 27 | Purchaser. Company or the investors listed on Exhibit A 28 | Purchasers. Company and the investors listed on Exhibit A together 29 | 30 | 31 | ### CONTEXT ### 32 | 33 | [###REPLACE WITH CONTEXT###] 34 | 35 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/performance_test/README.md: -------------------------------------------------------------------------------- 1 | # Performance Testing 2 | 3 | This directory contains scripts and configurations to evaluate the performance, specifically the runtime, of CheckEmbed on various embedding models in comparison to SelfCheckGPT and BERTScore. 4 | The script generates input text for each datapoint while varying the sizes of these texts, i.e. 
the number of tokens in the text, as well as the number of samples for each datapoint, and measures the runtime performance of the embedding and the operations. 5 | The samples of a datapoint are all generated locally via script instead of querying an LLM. 6 | Varying the number of tokens to embed gives insights into the overall efficiency of the different embedding models used by CheckEmbed, SelfCheckGPT and BERTScore, while varying the sample number examines the scalability of the respective pipelines. 7 | 8 | By default, the script tests multiple text sizes, ranging from 200 to 4000 tokens in steps of 200, as well as different numbers of samples (2, 4, 6, 8 and 10). 9 | 10 | ## Data 11 | 12 | The dataset with the generated text samples is created using the `Faker` library. Samples of varying lengths are generated and stored in a JSON format in directories (`2_samples`, `4_samples`, etc.) corresponding to the number of samples. 13 | 14 | Once the evaluation is finished, `data_extractor.py` can be used (and/or modified) to aggregate the runtime logs and write the results into a single JSON file containing all runtime measurements. 15 | ```bash 16 | python3 data_extractor.py 17 | ``` 18 | 19 | The extracted JSON file generally has the following structure: 20 | ```json 21 | { 22 | "#_samples": { //2_samples, 4_samples... 23 | "embedding": { 24 | "embedding_model_name": { //gpt-embedding-large, sfr-embedding-mistral... 25 | "#tokens": "time", 26 | "#tokens": "time", 27 | //... 28 | }, 29 | //more embeddings... 30 | }, 31 | "bertscore": { 32 | "#tokens": "time", 33 | //... 34 | }, 35 | "selfcheckgpt_bertscore": { 36 | "#tokens": "time", 37 | //... 38 | }, 39 | "selfcheckgpt_nli": { 40 | "#tokens": "time", 41 | //... 42 | }, 43 | "checkembed": { 44 | "embedding_model_name": { //gpt-embedding-large, sfr-embedding-mistral... 45 | "#tokens": "time", 46 | "#tokens": "time", 47 | //... 48 | }, 49 | //more embeddings... 50 | }, 51 | "operations": {} //To customize. 52 | }, 53 | //additional number of samples... 54 | } 55 | ``` 56 | The runtime is reported in seconds. 57 | 58 | The extracted data can be visualized with the help of the provided plotting script: 59 | ```bash 60 | python3 plot.py 61 | ``` 62 | 63 | ## Runtime / Cost Estimation 64 | 65 | The estimated compute time for running the evaluation is approximately 24 hours on an NVIDIA A100-SXM-40GB. 66 | 67 | The sample step is only emulated for these runtime measurements to avoid the cost of calling the LLM for the sampling, so costs only occur for the embedding with the OpenAI models. 68 | 69 | The embedding model from OpenAI has a cost of $0.13 / 1M tokens. 70 | 71 | ### Example 72 | In the following, we calculate the total cost for running the runtime measurements with the default parameters: 73 | - varying the number of samples from 2 to 10 in increments of 2 74 | - varying the text size from 200 to 4000 tokens in steps of 200 tokens 75 | - 20 prompts, meaning 20 datapoints for each specific combination of number of samples and number of tokens 76 | 77 | The total costs are $3.28: 78 | - total number of samples per text size: (2 + 4 + 6 + 8 + 10) * 20 = 30 * 20 = 600 79 | - total number of tokens: 80 | - 200 tokens: 200 * 600 = 120K 81 | - 400 tokens: 400 * 600 = 240K
83 | --- 84 | - 25.2M tokens 85 | - 25.2M tokens * $0.13 / 1M tokens = $3.28 86 | -------------------------------------------------------------------------------- /examples/performance_test/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import logging 10 | import os 11 | import json 12 | import random 13 | import tiktoken 14 | 15 | from faker import Faker 16 | from datetime import datetime as time 17 | 18 | from CheckEmbed import embedding_models 19 | from CheckEmbed.scheduler import Scheduler, StartingPoint 20 | from CheckEmbed.operations import SelfCheckGPT_BERT_Operation, SelfCheckGPT_NLI_Operation 21 | 22 | def start(current_dir: str, start: int = StartingPoint.PROMPT, n_samples: int = 10) -> None: 23 | """ 24 | Execute the runtime measurements. 25 | 26 | :param current_dir: Directory path from the the script is called. 27 | :type current_dir: str 28 | :param start: The starting point of the scheduler. Defaults to StartingPoint.PROMPT. 29 | :type start: int 30 | :param n_samples: Number of samples to generate. Defaults to 10. 31 | :type n_samples: int 32 | """ 33 | 34 | config_path = os.path.join( 35 | current_dir, 36 | "../../../CheckEmbed/config.json", 37 | ) 38 | 39 | embedd_large = embedding_models.EmbeddingGPT( 40 | config_path, 41 | model_name = "gpt-embedding-large", 42 | cache = False, 43 | max_concurrent_requests=5, 44 | ) 45 | 46 | sfrEmbeddingMistral = embedding_models.SFREmbeddingMistral( 47 | model_name = "Salesforce/SFR-Embedding-Mistral", 48 | cache = False, 49 | ) 50 | 51 | e5mistral7b = embedding_models.E5Mistral7b( 52 | model_name = "intfloat/e5-mistral-7b-instruct", 53 | cache = False, 54 | ) 55 | 56 | gteQwen157bInstruct = embedding_models.GteQwenInstruct( 57 | model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct", 58 | cache = False, 59 | access_token = "", # Add your access token here (Hugging Face) 60 | ) 61 | 62 | stella_en_15B_v5 = embedding_models.Stella( 63 | model_name = "NovaSearch/stella_en_1.5B_v5", 64 | variant = "1.5B-v5", 65 | cache = False, 66 | ) 67 | 68 | stella_en_400M_v5 = embedding_models.Stella( 69 | model_name = "NovaSearch/stella_en_400M_v5", 70 | cache = False, 71 | ) 72 | 73 | selfCheckGPT_BERT_Operation = SelfCheckGPT_BERT_Operation( 74 | os.path.join(current_dir, "SelfCheckGPT"), 75 | current_dir, 76 | ) 77 | 78 | selfCheckGPT_NLI_Operation = SelfCheckGPT_NLI_Operation( 79 | os.path.join(current_dir, "SelfCheckGPT"), 80 | current_dir, 81 | ) 82 | 83 | # Initialize the scheduler 84 | scheduler = Scheduler( 85 | current_dir, 86 | logging_level = logging.DEBUG, 87 | budget = 8, 88 | selfCheckGPTOperation=[selfCheckGPT_BERT_Operation, selfCheckGPT_NLI_Operation], 89 | embedding_lm = [embedd_large, sfrEmbeddingMistral, e5mistral7b, gteQwen157bInstruct, stella_en_400M_v5, stella_en_15B_v5], 90 | ) 91 | 92 | # The order of lm_names and embedding_lm_names should be the same 93 | # as the order of the language models and embedding language models respectively. 
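    # NOTE: lm_names is overwritten below with the token lengths ("200", "400", ..., "4000").
    # Since this run starts at the EMBEDDINGS step, the scheduler presumably resolves the
    # pre-generated "<length>_samples.json" files written by text_gen() instead of sampling
    # an LLM, effectively treating each token length as its own pseudo-model.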
94 | scheduler.run( 95 | startingPoint = start, 96 | bertScore = True, # Set to True if you want to test BERTScore 97 | selfCheckGPT = True, # Set to True if you want to test SelfCheckGPT 98 | time_performance = True, 99 | num_samples = n_samples, 100 | lm_names = [str(i) for i in range(200, 4200, 200)], # Overwrite the default lm names 101 | bertScore_model = "microsoft/deberta-xlarge-mnli", 102 | device = "cuda", 103 | batch_size = 64 # it may be necessary to reduce the batch size if the model is too large 104 | ) 105 | 106 | 107 | def text_gen(n_prompt: int = 50, n_samples: int = 10, dir: str = ".") -> None: 108 | """ 109 | Generate text with different number of tokens for a specific number of samples. 110 | 111 | :param n_prompt: Number of datapoints for a specific combination of token size and number of 112 | samples. Defaults to 50. 113 | :type n_prompt: int 114 | :param n_samples: Number of samples. Default to 10. 115 | :type n_samples: int 116 | :param dir: Path to the output directory. Defaults to the current directory. 117 | :type dir: str 118 | """ 119 | 120 | fake = Faker() 121 | fake.seed_instance(int(random.Random(time.now().microsecond).random() * 1000)) 122 | 123 | fake.name() 124 | fake.address() 125 | 126 | encoding = tiktoken.get_encoding("cl100k_base") 127 | 128 | for length in range(200, 4200, 200): 129 | len_samples = [] 130 | for _ in range(n_prompt): 131 | samples = [] 132 | for _ in range(n_samples): 133 | temp = fake.text(max_nb_chars=length*10).replace("\n", " ") 134 | while len(encoding.encode(temp)) < length: 135 | temp += fake.text(max_nb_chars=length*10).replace("\n", " ") 136 | 137 | final_dimension = len(encoding.encode(temp)) 138 | 139 | # Add the samples to the list and keep only around the desired token length 140 | samples.append(temp[0:int(len(temp) * (length / final_dimension))]) 141 | len_samples.append(samples) 142 | 143 | with open(f"{dir}/{length}_samples.json", "w") as f: 144 | json_data = [{"index": i, "samples": samples} for i, samples in enumerate(len_samples)] 145 | json.dump({"data": json_data}, f, indent=4) 146 | 147 | 148 | if __name__ == "__main__": 149 | print("Performance test\n") 150 | 151 | for sample_count in [2, 4, 6, 8, 10]: 152 | print(f"\n\n\n#########################\n#\t{sample_count} SAMPLES\t#\n#########################") 153 | current_dir = os.path.dirname(os.path.abspath(__file__)) + f"/{sample_count}_samples" 154 | os.makedirs(current_dir, exist_ok=True) 155 | text_gen(20, n_samples=sample_count, dir=f"{sample_count}_samples") 156 | start(current_dir, start=StartingPoint.EMBEDDINGS, n_samples=sample_count) 157 | -------------------------------------------------------------------------------- /examples/vision/README.md: -------------------------------------------------------------------------------- 1 | # Vision Hallucination Evaluation with CheckEmbed 2 | 3 | This example demonstrates an end-to-end experiment for assessing hallucinations in image generation using CheckEmbed and Stable Diffusion 3.5. 4 | 5 | ## Structure 6 | 7 | ``` 8 | imgs/ 9 | └── counting_items/ # Images generated for this experiment 10 | main.py # Script to generate images, embeddings, and run CheckEmbed 11 | README.md # This document 12 | ``` 13 | 14 | ## Usage 15 | 16 | ```bash 17 | python main.py --start_idx 0 --end_idx 8 18 | ``` 19 | Varying the `--start_idx` and `--end_idx` parameters allows you to process in parallel. However, run the CheckEmbed step sequentially. 
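For instance, a minimal sketch of such a parallel run, assuming that `--start_idx`/`--end_idx` select a contiguous, end-exclusive range of the eight hardcoded prompt groups (verify the argument handling in `main.py` before relying on this):

```bash
# Hypothetical split of the 8 prompt groups across two GPUs; the index semantics are an assumption.
CUDA_VISIBLE_DEVICES=0 python main.py --start_idx 0 --end_idx 4 &
CUDA_VISIBLE_DEVICES=1 python main.py --start_idx 4 --end_idx 8 &
wait
# Afterwards, run the CheckEmbed step once, sequentially.
```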
20 | 21 | The prompts are hardcoded in `main.py` and are designed to generate images with a specific number of items. 22 | 23 | * Outputs: 24 | * `imgs/counting_items/`: Generated PNG images. 25 | * `clip_embeddings/counting_items/`: JSON files of CLIP embeddings. 26 | * `checkembed_outputs/counting_items/`: CheckEmbed result JSONs. 27 | 28 | ## Configuration 29 | 30 | * Paths in `main.py` (e.g., `path/to/...`) should be updated to your local directories before running. 31 | * Modify `input_prompts` in `main.py` to extend or change prompt sets. 32 | 33 | ## Results 34 | 35 | * Compare CheckEmbed scores against manual correctness counts to evaluate precision. 36 | -------------------------------------------------------------------------------- /examples/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Eric Schreiber 8 | # 9 | # contributions: Lorenzo Paleari 10 | 11 | import logging 12 | import os 13 | from typing import Any, List, Union 14 | 15 | from PIL.Image import Image 16 | 17 | from CheckEmbed import embedding_models, vision_models 18 | from CheckEmbed.parser import Parser 19 | from CheckEmbed.scheduler import Scheduler, StartingPoint 20 | 21 | input_prompts = [ 22 | ["One red apple on a white background", 23 | "Two red apples on a white background", 24 | "Three red apples on a white background", 25 | "Four red apples on a white background", 26 | "Five red apples on a white background", 27 | ], 28 | ["One yellow tennis ball on a white background", 29 | "Two yellow tennis balls on a white background", 30 | "Three yellow tennis balls on a white background", 31 | "Four yellow tennis balls on a white background", 32 | "Five yellow tennis balls on a white background", 33 | ], 34 | ["One orange on a white background", 35 | "Two oranges on a white background", 36 | "Three oranges on a white background", 37 | "Four oranges on a white background", 38 | "Five oranges on a white background", 39 | ], 40 | ["One yellow lemon on a white background", 41 | "Two yellow lemons on a white background", 42 | "Three yellow lemons on a white background", 43 | "Four yellow lemons on a white background", 44 | "Five yellow lemons on a white background", 45 | ], 46 | ["One green lime on a white background", 47 | "Two green limes on a white background", 48 | "Three green limes on a white background", 49 | "Four green limes on a white background", 50 | "Five green limes on a white background", 51 | ], 52 | ["One red tomato on a white background", 53 | "Two red tomatoes on a white background", 54 | "Three red tomatoes on a white background", 55 | "Four red tomatoes on a white background", 56 | "Five red tomatoes on a white background", 57 | ], 58 | ["One yellow banana on a white background", 59 | "Two yellow bananas on a white background", 60 | "Three yellow bananas on a white background", 61 | "Four yellow bananas on a white background", 62 | "Five yellow bananas on a white background", 63 | ], 64 | ["One blue circle on a white background", 65 | "Two blue circles on a white background", 66 | "Three blue circles on a white background", 67 | "Four blue circles on a white background", 68 | "Five blue circles on a white background", 69 | ] 70 | ] 71 | 72 | class CustomParser(Parser): 73 | """ 74 | The CustomParser class handles the dataset parsing. 
75 | 76 | Inherits from the Parser class and implements its abstract methods. 77 | """ 78 | 79 | def __init__(self, dataset_path: str, list: List[str]) -> None: 80 | """ 81 | Initialize the parser. 82 | 83 | :param dataset_path: The path to the dataset. 84 | :type dataset_path: str 85 | :param list: The list of input prompts. 86 | :type list: List[str] 87 | """ 88 | super().__init__(dataset_path) 89 | self.list = list 90 | 91 | def prompt_generation(self, custom_inputs: Any = None) -> List[str]: 92 | """ 93 | Parse the dataset and generate the prompts for the model. 94 | 95 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 96 | :type custom_inputs: Any 97 | :return: List of prompts. 98 | :rtype: List[str] 99 | """ 100 | prompts = [] 101 | for item in self.list: 102 | prompts.extend(item) 103 | 104 | return prompts 105 | 106 | def ground_truth_extraction(self, custom_inputs: Any = None) -> List[str]: 107 | """ 108 | Parse the dataset and extract the ground truth. 109 | 110 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 111 | :type custom_inputs: Any 112 | :return: List of ground truths. 113 | :rtype: List[str] 114 | """ 115 | pass 116 | 117 | def answer_parser(self, responses: List[List[Union[str, Image]]], custom_inputs: Any = None) -> List[List[Union[str, Image]]]: 118 | """ 119 | Parse the responses from the model: Return the responses as they are. 120 | 121 | :param responses: The responses from the model. 122 | :type responses: List[List[Union[str, Image]]] 123 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 124 | :type custom_inputs: Any 125 | :return: The parsed responses. 126 | :rtype: List[List[Union[str, Image]]] 127 | """ 128 | return responses 129 | 130 | 131 | def start(current_dir: str, list: List[str]) -> None: 132 | """ 133 | Execute the vision use case. 134 | 135 | :param current_dir: Directory path from the the script is called. 136 | :type current_dir: str 137 | :param list: The list of input prompts. 138 | :type list: List[str] 139 | """ 140 | 141 | # Initialize the parser, the vision and embedding models 142 | customParser = CustomParser( 143 | dataset_path = current_dir, 144 | list = list 145 | ) 146 | 147 | stable_diffusion = vision_models.StableDiffusion3( 148 | model_name = "stabilityai/stable-diffusion-3.5-medium", 149 | cache = False, 150 | ) 151 | 152 | clip_vit_large = embedding_models.ClipVitLarge( 153 | model_name = "openai/clip-vit-large-patch14", 154 | cache = False, 155 | ) 156 | 157 | # Initialize the scheduler 158 | scheduler = Scheduler( 159 | current_dir, 160 | logging_level = logging.DEBUG, 161 | budget = 12, 162 | parser = customParser, 163 | lm = [stable_diffusion], 164 | embedding_lm = [clip_vit_large], 165 | ) 166 | 167 | # The order of lm_names and embedding_lm_names should be the same 168 | # as the order of the generation models and embedding models respectively. 
169 | scheduler.run( 170 | startingPoint = StartingPoint.PROMPT, 171 | bertScore = False, 172 | selfCheckGPT = False, 173 | llm_as_a_judge = False, 174 | vision = True, 175 | rebase_results = True, 176 | num_samples = 10, 177 | device = "cuda", 178 | batch_size = 64 # it may be necessary to reduce the batch size if the model is too large 179 | ) 180 | 181 | if __name__ == "__main__": 182 | current_dir = os.path.dirname(os.path.abspath(__file__)) 183 | start(current_dir, input_prompts) 184 | -------------------------------------------------------------------------------- /examples/wiki_bio/README.md: -------------------------------------------------------------------------------- 1 | # WikiBio Benchmark 2 | 3 | This example uses a subset of the WikiBio dataset (Lebret et al., 2016) that was modified by Manakul et al. (2023) for their evaluation of SelfCheckGPT. It consists of 238 documents based on Wikipedia articles, which were used to generate samples in which hallucinations were introduced. Each sentence of those samples was manually labeled as either “major inaccurate”, “minor inaccurate”, or “accurate”. 4 | 5 | ## Data 6 | 7 | The dataset and the script that converts sentence scores into passage scores are located in the `data` directory. 8 | To download the dataset and recompute the passage scores, run the following commands: 9 | 10 | ```bash 11 | cd data 12 | python3 download.py 13 | python3 passage_scores.py 14 | ``` 15 | 16 | ## Runtime / Cost Estimation 17 | 18 | The estimated compute time for running the evaluation is approximately 36 hours on an NVIDIA A100-SXM-40GB. 19 | 20 | The sample step is skipped, since the samples are already provided in the dataset. Costs only occur for the embedding with the OpenAI models. 21 | 22 | The embedding model from OpenAI has a cost of $0.13 / 1M tokens, which results in an approximate cost of $0.65 for the evaluation of this example. 23 | 24 | Using LLM-as-a-Judge will cost around $1. 25 | -------------------------------------------------------------------------------- /examples/wiki_bio/data/download.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file.
6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from datasets import load_dataset 10 | import json 11 | 12 | ds = load_dataset("potsawee/wiki_bio_gpt3_hallucination") 13 | ds = ds["evaluation"] 14 | 15 | features = ['gpt3_text', 'wiki_bio_text', 'gpt3_sentences', 'annotation', 'wiki_bio_test_idx', 'gpt3_text_samples'] 16 | dataset = {} 17 | for feat in features: 18 | dataset.update({feat: ds[feat]}) 19 | 20 | dataset_final = {} 21 | for i in range(len(dataset[features[0]])): 22 | dataset_passage = {} 23 | for feat in features: 24 | dataset_passage.update({feat: dataset[feat][i]}) 25 | name = f"passage_{i}" 26 | dataset_final.append({ 27 | name: dataset_passage 28 | }) 29 | 30 | with open("dataset.json", "w") as f: 31 | json.dump(dataset_final, f, indent=4) 32 | -------------------------------------------------------------------------------- /examples/wiki_bio/data/passage_scores.json: -------------------------------------------------------------------------------- 1 | { 2 | "passage_0": 0.0, 3 | "passage_1": 21.428571428571427, 4 | "passage_2": 50.0, 5 | "passage_3": 5.555555555555555, 6 | "passage_4": 54.54545454545454, 7 | "passage_5": 36.36363636363637, 8 | "passage_6": 0.0, 9 | "passage_7": 64.28571428571429, 10 | "passage_8": 56.25, 11 | "passage_9": 33.33333333333333, 12 | "passage_10": 75.0, 13 | "passage_11": 66.66666666666666, 14 | "passage_12": 28.57142857142857, 15 | "passage_13": 85.0, 16 | "passage_14": 50.0, 17 | "passage_15": 42.857142857142854, 18 | "passage_16": 8.333333333333332, 19 | "passage_17": 100.0, 20 | "passage_18": 35.714285714285715, 21 | "passage_19": 75.0, 22 | "passage_20": 45.0, 23 | "passage_21": 15.384615384615385, 24 | "passage_22": 80.0, 25 | "passage_23": 50.0, 26 | "passage_24": 16.666666666666664, 27 | "passage_25": 0.0, 28 | "passage_26": 88.88888888888889, 29 | "passage_27": 44.44444444444444, 30 | "passage_28": 81.81818181818183, 31 | "passage_29": 37.5, 32 | "passage_30": 6.25, 33 | "passage_31": 25.0, 34 | "passage_32": 43.75, 35 | "passage_33": 0.0, 36 | "passage_34": 50.0, 37 | "passage_35": 0.0, 38 | "passage_36": 59.09090909090909, 39 | "passage_37": 22.22222222222222, 40 | "passage_38": 71.42857142857143, 41 | "passage_39": 42.857142857142854, 42 | "passage_40": 100.0, 43 | "passage_41": 40.0, 44 | "passage_42": 66.66666666666666, 45 | "passage_43": 7.142857142857142, 46 | "passage_44": 36.36363636363637, 47 | "passage_45": 0.0, 48 | "passage_46": 50.0, 49 | "passage_47": 100.0, 50 | "passage_48": 81.25, 51 | "passage_49": 27.77777777777778, 52 | "passage_50": 50.0, 53 | "passage_51": 0.0, 54 | "passage_52": 90.0, 55 | "passage_53": 81.25, 56 | "passage_54": 83.33333333333334, 57 | "passage_55": 0.0, 58 | "passage_56": 0.0, 59 | "passage_57": 60.0, 60 | "passage_58": 0.0, 61 | "passage_59": 45.0, 62 | "passage_60": 75.0, 63 | "passage_61": 68.75, 64 | "passage_62": 83.33333333333334, 65 | "passage_63": 28.57142857142857, 66 | "passage_64": 35.714285714285715, 67 | "passage_65": 45.83333333333333, 68 | "passage_66": 0.0, 69 | "passage_67": 16.666666666666664, 70 | "passage_68": 30.0, 71 | "passage_69": 65.0, 72 | "passage_70": 44.44444444444444, 73 | "passage_71": 68.75, 74 | "passage_72": 83.33333333333334, 75 | "passage_73": 22.22222222222222, 76 | "passage_74": 100.0, 77 | "passage_75": 0.0, 78 | "passage_76": 0.0, 79 | "passage_77": 77.77777777777779, 80 | "passage_78": 55.55555555555556, 81 | "passage_79": 8.333333333333332, 82 | "passage_80": 60.0, 83 | "passage_81": 62.5, 84 | "passage_82": 41.66666666666667, 85 | "passage_83": 
64.28571428571429, 86 | "passage_84": 10.0, 87 | "passage_85": 16.666666666666664, 88 | "passage_86": 60.0, 89 | "passage_87": 50.0, 90 | "passage_88": 75.0, 91 | "passage_89": 0.0, 92 | "passage_90": 75.0, 93 | "passage_91": 60.0, 94 | "passage_92": 44.44444444444444, 95 | "passage_93": 50.0, 96 | "passage_94": 83.33333333333334, 97 | "passage_95": 6.25, 98 | "passage_96": 41.66666666666667, 99 | "passage_97": 64.28571428571429, 100 | "passage_98": 43.75, 101 | "passage_99": 54.54545454545454, 102 | "passage_100": 56.25, 103 | "passage_101": 28.57142857142857, 104 | "passage_102": 91.66666666666666, 105 | "passage_103": 91.66666666666666, 106 | "passage_104": 83.33333333333334, 107 | "passage_105": 16.666666666666664, 108 | "passage_106": 81.25, 109 | "passage_107": 70.0, 110 | "passage_108": 90.0, 111 | "passage_109": 83.33333333333334, 112 | "passage_110": 40.909090909090914, 113 | "passage_111": 0.0, 114 | "passage_112": 59.09090909090909, 115 | "passage_113": 0.0, 116 | "passage_114": 0.0, 117 | "passage_115": 57.14285714285714, 118 | "passage_116": 16.666666666666664, 119 | "passage_117": 68.75, 120 | "passage_118": 16.666666666666664, 121 | "passage_119": 65.0, 122 | "passage_120": 62.5, 123 | "passage_121": 30.0, 124 | "passage_122": 31.818181818181817, 125 | "passage_123": 20.0, 126 | "passage_124": 11.11111111111111, 127 | "passage_125": 34.61538461538461, 128 | "passage_126": 92.85714285714286, 129 | "passage_127": 62.5, 130 | "passage_128": 35.714285714285715, 131 | "passage_129": 0.0, 132 | "passage_130": 31.818181818181817, 133 | "passage_131": 75.0, 134 | "passage_132": 16.666666666666664, 135 | "passage_133": 5.0, 136 | "passage_134": 75.0, 137 | "passage_135": 87.5, 138 | "passage_136": 37.5, 139 | "passage_137": 40.0, 140 | "passage_138": 90.0, 141 | "passage_139": 50.0, 142 | "passage_140": 35.714285714285715, 143 | "passage_141": 20.0, 144 | "passage_142": 83.33333333333334, 145 | "passage_143": 35.714285714285715, 146 | "passage_144": 35.714285714285715, 147 | "passage_145": 40.0, 148 | "passage_146": 81.25, 149 | "passage_147": 25.0, 150 | "passage_148": 63.63636363636363, 151 | "passage_149": 30.0, 152 | "passage_150": 8.333333333333332, 153 | "passage_151": 92.85714285714286, 154 | "passage_152": 0.0, 155 | "passage_153": 50.0, 156 | "passage_154": 0.0, 157 | "passage_155": 13.636363636363635, 158 | "passage_156": 33.33333333333333, 159 | "passage_157": 15.0, 160 | "passage_158": 60.0, 161 | "passage_159": 33.33333333333333, 162 | "passage_160": 60.0, 163 | "passage_161": 20.0, 164 | "passage_162": 68.18181818181817, 165 | "passage_163": 18.75, 166 | "passage_164": 56.25, 167 | "passage_165": 0.0, 168 | "passage_166": 100.0, 169 | "passage_167": 0.0, 170 | "passage_168": 0.0, 171 | "passage_169": 100.0, 172 | "passage_170": 64.28571428571429, 173 | "passage_171": 40.0, 174 | "passage_172": 37.5, 175 | "passage_173": 78.57142857142857, 176 | "passage_174": 77.77777777777779, 177 | "passage_175": 71.42857142857143, 178 | "passage_176": 100.0, 179 | "passage_177": 68.75, 180 | "passage_178": 50.0, 181 | "passage_179": 60.0, 182 | "passage_180": 0.0, 183 | "passage_181": 80.0, 184 | "passage_182": 100.0, 185 | "passage_183": 0.0, 186 | "passage_184": 0.0, 187 | "passage_185": 12.5, 188 | "passage_186": 25.0, 189 | "passage_187": 31.818181818181817, 190 | "passage_188": 80.0, 191 | "passage_189": 40.0, 192 | "passage_190": 78.57142857142857, 193 | "passage_191": 30.0, 194 | "passage_192": 37.5, 195 | "passage_193": 50.0, 196 | "passage_194": 50.0, 197 | "passage_195": 
0.0, 198 | "passage_196": 43.75, 199 | "passage_197": 87.5, 200 | "passage_198": 95.83333333333334, 201 | "passage_199": 43.75, 202 | "passage_200": 68.75, 203 | "passage_201": 100.0, 204 | "passage_202": 22.22222222222222, 205 | "passage_203": 65.0, 206 | "passage_204": 0.0, 207 | "passage_205": 100.0, 208 | "passage_206": 55.00000000000001, 209 | "passage_207": 10.0, 210 | "passage_208": 0.0, 211 | "passage_209": 18.75, 212 | "passage_210": 33.33333333333333, 213 | "passage_211": 56.25, 214 | "passage_212": 81.25, 215 | "passage_213": 42.857142857142854, 216 | "passage_214": 55.00000000000001, 217 | "passage_215": 40.0, 218 | "passage_216": 65.0, 219 | "passage_217": 31.25, 220 | "passage_218": 0.0, 221 | "passage_219": 58.333333333333336, 222 | "passage_220": 100.0, 223 | "passage_221": 70.0, 224 | "passage_222": 33.33333333333333, 225 | "passage_223": 45.0, 226 | "passage_224": 92.85714285714286, 227 | "passage_225": 60.0, 228 | "passage_226": 75.0, 229 | "passage_227": 62.5, 230 | "passage_228": 45.0, 231 | "passage_229": 0.0, 232 | "passage_230": 44.44444444444444, 233 | "passage_231": 8.333333333333332, 234 | "passage_232": 66.66666666666666, 235 | "passage_233": 14.285714285714285, 236 | "passage_234": 100.0, 237 | "passage_235": 15.0, 238 | "passage_236": 0.0, 239 | "passage_237": 44.44444444444444 240 | } -------------------------------------------------------------------------------- /examples/wiki_bio/data/passage_scores.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | from collections import defaultdict 11 | 12 | def calculate_accuracy_percentage(accurate: float, total: int) -> float: 13 | """ 14 | Function to calculate accuracy percentage. 15 | 16 | :param accurate: Combined value of scores for the dataset. 17 | :type accurate: float 18 | :param total: Number of items in the dataset. 19 | :type total: int 20 | :return: Accuracy percentage. 
21 | :rtype: float 22 | """ 23 | if total == 0: 24 | return 0 25 | return (accurate / total) * 100 26 | 27 | 28 | # Load the dataset 29 | with open("dataset.json", "r") as f: 30 | dataset = json.load(f) 31 | 32 | # Initialize the new dataset 33 | categorized_dataset = defaultdict(list) 34 | 35 | # Iterate over the dataset 36 | for passage_number, value in dataset.items(): 37 | annotation = value["annotation"] 38 | 39 | # Count the number of accurate annotations 40 | accurate = sum(1 for label in annotation if label == "accurate") 41 | half_accurate = sum(0.5 for label in annotation if "minor" in label) 42 | 43 | # Calculate total annotations 44 | total_annotations = len(annotation) 45 | 46 | # Calculate accuracy percentage 47 | accuracy_percentage = calculate_accuracy_percentage(accurate + half_accurate, total_annotations) 48 | 49 | # Add the passage number to the respective category in the new dataset 50 | categorized_dataset[passage_number] = accuracy_percentage 51 | 52 | with open("./passage_scores.json", "w") as outfile: 53 | json.dump(categorized_dataset, outfile, indent=4) 54 | -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | ## Plot Data 2 | 3 | The data used to create the figures of the arXiv preprint article can be 4 | found in the `results.tar.bz2` archive. Unpack the archive and run the 5 | file `plots.py`. 6 | 7 | ```bash 8 | tar xfj results.tar.bz2 9 | ``` 10 | -------------------------------------------------------------------------------- /paper/results.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/CheckEmbed/008357ed0b6572575ec4c16daf52b549a9c38e25/paper/results.tar.bz2 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "CheckEmbed" 7 | version = "0.0.1" 8 | authors = [ 9 | { name="Maciej Besta", email="maciej.besta@inf.ethz.ch" }, 10 | { name="Lorenzo Paleari", email="lpaleari@student.ethz.ch" }, 11 | { name="Ales Kubicek", email="akubicek@student.ethz.ch" }, 12 | { name="Robert Gerstenberger", email="gerstenberger.robert@gmail.com" }, 13 | { name="Patrick Iff", email="patrick.iff@inf.ethz.ch" }, 14 | { name="Eric Schreiber", email="eric.schreiber@inf.ethz.ch" }, 15 | ] 16 | description = "Python package for 'CheckEmbed'" 17 | readme = "README.md" 18 | license = {file = "LICENSE"} 19 | requires-python = ">=3.8.12,<3.11" 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "Operating System :: OS Independent", 23 | ] 24 | dependencies = [ 25 | "accelerate>=0.30.1,<0.35.0", 26 | "backoff>=2.2.1,<3.0.0", 27 | "bert-score>=0.3.13,<1.0.0", 28 | "diffusers==0.33.1", 29 | "faker>=25.8.0,<26.0.0", 30 | "huggingface-hub>=0.30.0,<0.31.0", 31 | "joblib>=1.4.2,<2.0.0", 32 | "langchain>=0.2.11", 33 | "langchain_ollama", 34 | "matplotlib>=3.7.1,<4.0.0", 35 | "numpy>=1.24.3,<2.0.0", 36 | "openai>=1.0.0,<2.0.0", 37 | "packaging>=24.1,<25.0", 38 | "pandas>2.0.0,<3.0.0", 39 | "pillow>=11.2.1", 40 | "pydantic>=2.8.2", 41 | "scikit-learn>=1.5.1,<2.0.0", 42 | "scipy>1.15.0,<2.0.0", 43 | "seaborn>=0.13.2,<0.14.0", 44 | "selfcheckgpt>=0.1.7,<1.0.0", 45 | "sentencepiece>=0.2.0,<0.3.0", 46 | "tiktoken>=0.7.0,<1.0.0", 47 | 
"torch==2.6.0", 48 | "transformers>=4.51.3,<5.0.0", 49 | "transformers[torch]", 50 | "wheel>=0.43.0,<1.0.0", 51 | "vllm>=0.8.2,<1.0.0" 52 | ] 53 | 54 | [project.optional-dependencies] 55 | cuda = [ 56 | "xformers>=0.0.27,<0.0.40", 57 | "flash-attn>=2.5.6,<3.0.0", 58 | ] 59 | 60 | [project.urls] 61 | Homepage = "https://github.com/spcl/CheckEmbed" 62 | 63 | [project.scripts] 64 | --------------------------------------------------------------------------------