├── .gitignore ├── CheckEmbed ├── __init__.py ├── config_template.json ├── embedder │ ├── __init__.py │ └── embedder.py ├── embedding_models │ ├── README.md │ ├── __init__.py │ ├── abstract_embedding_model.py │ ├── clip_vit_large.py │ ├── e5_mistral_7b_instruct.py │ ├── embeddinggpt.py │ ├── gte_qwen1_5_7b_instruct.py │ ├── sfr_embedding_mistral.py │ └── stella.py ├── language_models │ ├── README.md │ ├── __init__.py │ ├── abstract_language_model.py │ ├── chatgpt.py │ └── chatollama.py ├── operations │ ├── README.md │ ├── __init__.py │ ├── bertscore_operation.py │ ├── checkembed_operation.py │ ├── llm_as_a_judge_operation.py │ ├── operations.py │ └── selfcheckgpt_operation.py ├── parser │ ├── __init__.py │ └── parser.py ├── plotters │ ├── README.md │ ├── __init__.py │ ├── bert_plot.py │ ├── checkembed_plot.py │ ├── plot_operations.py │ ├── raw_embedding_heatmap.py │ └── selfcheckgpt_plot.py ├── scheduler │ ├── __init__.py │ └── scheduler.py ├── utility │ ├── __init__.py │ ├── stderr_filter.py │ ├── stderr_filter.txt │ └── utility.py └── vision_models │ ├── README.md │ ├── __init__.py │ ├── abstract_vision_model.py │ └── stable_diffusion3_5.py ├── LICENSE ├── README.md ├── examples ├── RAGTruth │ ├── README.md │ ├── dataset │ │ ├── response.json │ │ ├── sampler.py │ │ ├── samples.json │ │ ├── source_info.json │ │ └── training_data.json │ ├── hallu_detect.py │ └── main.py ├── README.md ├── description │ ├── README.md │ ├── different │ │ ├── main.py │ │ └── prompt_scheme.txt │ └── similar │ │ ├── main.py │ │ └── prompt_scheme.txt ├── incremental_forced_hallucination │ ├── legal_summaries │ │ ├── README.md │ │ ├── dataset │ │ │ └── legal_definitions.json │ │ ├── main.py │ │ ├── prompt_scheme.txt │ │ └── prompt_scheme_ground_truth.txt │ ├── operation_variants │ │ ├── __init__.py │ │ ├── bertscore_operation_variant.py │ │ ├── checkembed_operation_variant.py │ │ └── selfcheckgpt_operation_variant.py │ └── scientific_descriptions │ │ ├── README.md │ │ ├── main.py │ │ ├── prompt_scheme.txt │ │ └── prompt_scheme_ground_truth.txt ├── legal_definitions │ ├── README.md │ ├── dataset │ │ └── legal_definitions.json │ ├── main.py │ └── prompt_scheme.txt ├── performance_test │ ├── README.md │ ├── data_extractor.py │ ├── main.py │ └── plot.py ├── vision │ ├── README.md │ └── main.py └── wiki_bio │ ├── README.md │ ├── data │ ├── dataset.json │ ├── download.py │ ├── passage_scores.json │ └── passage_scores.py │ └── main.py ├── paper ├── README.md ├── pics │ └── checkembed_overview.svg ├── plots.py └── results.tar.bz2 └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Distribution / packaging 7 | dist/ 8 | build/ 9 | 10 | # Installer logs 11 | pip-log.txt 12 | 13 | # IDEs 14 | .idea/ 15 | *.vscode/ 16 | *.pycproj 17 | *.user 18 | *.pyproj.user 19 | 20 | # Data 21 | *.out 22 | *.err 23 | *.log 24 | 25 | # Environments 26 | env/ 27 | venv/ 28 | 29 | # Config File 30 | **/config.json 31 | 32 | # Mac folder attributes 33 | .DS_Store 34 | 35 | # Exclude working dir 36 | results/ 37 | error_*/ 38 | ground_truth/ 39 | BertScore/ 40 | SelfCheckGPT/ 41 | embeddings/ 42 | plots/ 43 | *_samples/ 44 | examples/**/CheckEmbed/ 45 | paper/**/CheckEmbed/ -------------------------------------------------------------------------------- /CheckEmbed/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/spcl/CheckEmbed/008357ed0b6572575ec4c16daf52b549a9c38e25/CheckEmbed/__init__.py -------------------------------------------------------------------------------- /CheckEmbed/config_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "chatgpt": { 3 | "model_id": "gpt-3.5-turbo-0125", 4 | "name": "gpt-3.5-turbo", 5 | "prompt_token_cost": 0.0005, 6 | "response_token_cost": 0.0015, 7 | "temperature": 1.0, 8 | "max_tokens": 4096, 9 | "stop": null, 10 | "organization": "", 11 | "api_key": "" 12 | }, 13 | "chatgpt4": { 14 | "model_id": "gpt-4", 15 | "name": "gpt-4", 16 | "prompt_token_cost": 0.03, 17 | "response_token_cost": 0.06, 18 | "temperature": 1.0, 19 | "max_tokens": 4096, 20 | "stop": null, 21 | "organization": "", 22 | "api_key": "" 23 | }, 24 | "chatgpt4-turbo": { 25 | "model_id": "gpt-4-turbo", 26 | "name": "gpt-4-turbo", 27 | "prompt_token_cost": 0.01, 28 | "response_token_cost": 0.03, 29 | "temperature": 1.0, 30 | "max_tokens": 4096, 31 | "stop": null, 32 | "organization": "", 33 | "api_key": "" 34 | }, 35 | "chatgpt4-o": { 36 | "model_id": "gpt-4o", 37 | "name": "gpt-4o", 38 | "prompt_token_cost": 0.0025, 39 | "response_token_cost": 0.01, 40 | "temperature": 1.0, 41 | "max_tokens": 4096, 42 | "stop": null, 43 | "organization": "", 44 | "api_key": "" 45 | }, 46 | "chatgpt4-o-mini": { 47 | "model_id": "gpt-4o-mini", 48 | "name": "4o-mini", 49 | "prompt_token_cost": 0.00015, 50 | "response_token_cost": 0.0006, 51 | "temperature": 1.0, 52 | "max_tokens": 4096, 53 | "stop": null, 54 | "organization": "", 55 | "api_key": "" 56 | }, 57 | "llama70": { 58 | "model_id": "llama3.3:70b-instruct-q8_0", 59 | "name": "llama70b", 60 | "temperature": 1.0, 61 | "num_ctx": 131072, 62 | "num_predict": 16384, 63 | "num_batch": 1024, 64 | "keep_alive": -1 65 | }, 66 | "llama8b": { 67 | "model_id": "llama3.1:8b-instruct-fp16", 68 | "name": "llama8b", 69 | "temperature": 1.0, 70 | "num_ctx": 131072, 71 | "num_predict": 16384, 72 | "num_batch": 4096, 73 | "keep_alive": -1 74 | }, 75 | "gpt-embedding-large": { 76 | "model_id": "text-embedding-3-large", 77 | "name": "gpt-embedding-large", 78 | "token_cost": 0.00013, 79 | "encoding": "float", 80 | "dimension": 3072, 81 | "organization": "", 82 | "api_key": "" 83 | }, 84 | "gpt-embedding-small": { 85 | "model_id": "text-embedding-3-small", 86 | "name": "gpt-embedding-small", 87 | "token_cost": 0.00002, 88 | "encoding": "float", 89 | "dimension": 1536, 90 | "organization": "", 91 | "api_key": "" 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /CheckEmbed/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedder import Embedder -------------------------------------------------------------------------------- /CheckEmbed/embedder/embedder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC 10 | from typing import List 11 | 12 | from tqdm import tqdm 13 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 14 | import numpy as np 15 | 16 | class Embedder(ABC): 17 | """ 18 | Abstract base class that defines the interface for all embedders. 
19 |     Embedders are used to embed text into a vector space.
20 |     """
21 | 
22 |     def embed(self, lm: AbstractEmbeddingModel, texts: List[str]) -> List[List[float]]:
23 |         """
24 |         Embed the given texts into vectors.
25 | 
26 |         :param lm: The embedding model that will be used to generate the text embeddings.
27 |         :type lm: AbstractEmbeddingModel
28 |         :param texts: The texts to embed.
29 |         :type texts: List[str]
30 |         :return: The embeddings of the texts.
31 |         :rtype: List[List[float]]
32 |         """
33 |         embedding_query = []
34 |         void_indexes = []
35 |         for index, text in enumerate(texts):
36 |             if text == "":
37 |                 void_indexes.append(index)
38 |             else:
39 |                 embedding_query.append(text)
40 | 
41 |         full_responses = np.zeros((len(texts))).tolist()
42 |         responses = lm.generate_embedding(embedding_query)
43 | 
44 |         for index in void_indexes:
45 |             full_responses[index] = []
46 | 
47 |         # fill remaining places in full_responses with responses in order
48 |         for index, response in enumerate(responses):
49 |             temp_index = index
50 |             while full_responses[temp_index] != 0.0:
51 |                 temp_index += 1
52 |             full_responses[temp_index] = response
53 | 
54 |         return full_responses
55 | 
--------------------------------------------------------------------------------
/CheckEmbed/embedding_models/README.md:
--------------------------------------------------------------------------------
1 | # Embedding Models
2 | 
3 | The Embedding Models module is responsible for managing the embedding models.
4 | 
5 | Currently, the framework supports the following embedding models:
6 | 
7 | - text-embedding-large / small (remote - OpenAI API)
8 | - Salesforce/SFR-Embedding-Mistral (local - GPU with 32GB VRAM recommended, model size is roughly 26GB)
9 | - intfloat/e5-mistral-7b-instruct (local - GPU with 32GB VRAM recommended, model size is roughly 26GB)
10 | - Alibaba-NLP/gte-Qwen1.5-7B-instruct (local - GPU with 32GB VRAM recommended, model size is roughly 26GB)
11 | - NovaSearch/stella_en_1.5B_v5 (local - GPU with 12GB VRAM recommended, model size is roughly 6GB)
12 | - NovaSearch/stella_en_400M_v5 (local - GPU with 4GB VRAM recommended, model size is roughly 2GB)
13 | - openai/clip-vit-large-patch14 (local - GPU with 4GB VRAM recommended, model size is roughly 2GB)
14 | 
15 | The following sections describe how to instantiate individual models and how to add new models to the framework.
16 | 
17 | ## Embedding Model Instantiation
18 | 
19 | - Create a copy of `config_template.json` named `config.json` in the CheckEmbed folder. (Not necessary for local models)
20 | - Fill in the configuration details based on the used model (below).
21 | 
22 | ### Embedding-Text-Large / Embedding-Text-Small
23 | 
24 | - Adjust the predefined `gpt-embedding-large` or `gpt-embedding-small` configurations or create a new configuration with a unique key.
25 | 
26 | | Key          | Value |
27 | |--------------|-------|
28 | | model_id     | Model name based on [OpenAI model overview](https://platform.openai.com/docs/models/overview). |
29 | | name         | Name used for CheckEmbed output files. We suggest using the default names for local models. |
30 | | token_cost   | Price per 1000 tokens based on [OpenAI pricing](https://openai.com/pricing), used for calculating the cumulative price per LLM instance. |
31 | | encoding     | String indicating the format to return the embeddings in. Can be either float or base64. More information can be found in the [OpenAI API reference](https://platform.openai.com/docs/api-reference/embeddings/create#embeddings-create-encoding_format). |
32 | | dimension    | Number indicating the output dimension of the embedding model. More information can be found in the [OpenAI model overview](https://platform.openai.com/docs/models/overview). |
33 | | organization | Organization to use for the API requests (may be empty). |
34 | | api_key      | Personal API key that will be used to access the OpenAI API. |
35 | 
36 | - Instantiate the embedding model based on the selected configuration key (predefined / custom).
37 | - `max_concurrent_requests` is 10 by default. Adjust the value based on your tier [rate limits](https://platform.openai.com/docs/guides/rate-limits).
38 | 
39 | ```python
40 | embedding_lm = embedding_models.EmbeddingGPT(
41 |     config_path,
42 |     model_name = <configuration key>,
43 |     cache = <True / False>,
44 |     max_concurrent_requests = <maximum number of concurrent requests>
45 | )
46 | ```
47 | 
48 | ### Local Models
49 | 
50 | The framework currently supports the following local models: `Salesforce/SFR-Embedding-Mistral`, `intfloat/e5-mistral-7b-instruct`, `Alibaba-NLP/gte-Qwen1.5-7B-instruct`, `NovaSearch/stella_en_1.5B_v5`, `NovaSearch/stella_en_400M_v5` and `openai/clip-vit-large-patch14`.
51 | 
52 | - Instantiate the embedding model based on the available device.
53 | - The device can be specified in the `Scheduler`; more details [here](/CheckEmbed/scheduler/scheduler.py).
54 | 
55 | ```python
56 | sfrEmbeddingMistral = embedding_models.SFREmbeddingMistral(
57 |     model_name = "Salesforce/SFR-Embedding-Mistral",
58 |     cache = False,
59 |     batch_size = 64,
60 | )
61 | 
62 | e5mistral7b = embedding_models.E5Mistral7b(
63 |     model_name = "intfloat/e5-mistral-7b-instruct",
64 |     cache = False,
65 |     batch_size = 64,
66 | )
67 | 
68 | gteQwen157bInstruct = embedding_models.GteQwenInstruct(
69 |     model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct",
70 |     cache = False,
71 |     access_token = "", # Add your access token here (Hugging Face)
72 |     batch_size = 1, # Unless you have more than 32GB of GPU VRAM at your disposal, use 1.
73 | )
74 | 
75 | stella_en_15B_v5 = embedding_models.Stella(
76 |     model_name = "NovaSearch/stella_en_1.5B_v5",
77 |     cache = False,
78 |     batch_size = 64,
79 | )
80 | 
81 | stella_en_400M_v5 = embedding_models.Stella(
82 |     model_name = "NovaSearch/stella_en_400M_v5",
83 |     cache = False,
84 |     batch_size = 64,
85 | )
86 | 
87 | clip_vit_large = embedding_models.ClipVitLarge(
88 |     model_name = "openai/clip-vit-large-patch14",
89 |     cache = False,
90 | )
91 | ```
92 | 
93 | ## Adding Embedding Models
94 | 
95 | More embedding models can be added by following these steps:
96 | 
97 | - Create a new class as a subclass of `AbstractEmbeddingModel`.
98 | - Use the constructor for loading the configuration and instantiating the embedding model (if needed).
99 | 100 | ```python 101 | class CustomLanguageModel(AbstractEmbeddingModel): 102 | def __init__( 103 | self, 104 | config_path: str = "", 105 | model_name: str = "text-embedding-large", 106 | name: str = "CustomLanguageModel", 107 | cache: bool = False 108 | ) -> None: 109 | super().__init__(config_path, model_name, name, cache) 110 | self.config: Dict = self.config[model_name] 111 | 112 | # Load data from configuration into variables if needed 113 | 114 | # Instantiate model if needed 115 | ``` 116 | 117 | - Implement the `load_model`, `unload_model` and `generate_embedding` abstract methods that are used to load/unload the model from the GPU (if necessary) and get a list of embeddings from the model (remote API call or local model inference) respectively. 118 | 119 | ```python 120 | def load_model(self, device: str = None) -> None: 121 | """ 122 | Load the model and tokenizer based on the given model name. 123 | 124 | :param device: The device to load the model on. Defaults to None. 125 | :type device: str 126 | """ 127 | 128 | def unload_model(self) -> None: 129 | """ 130 | Unload the model and tokenizer. 131 | """ 132 | 133 | def generate_embedding( 134 | self, 135 | input: Union[List[Any], Any] 136 | ) -> List[float]: 137 | # Call model and retrieve an embedding 138 | # Return model response 139 | ``` 140 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_embedding_model import AbstractEmbeddingModel 2 | from .clip_vit_large import ClipVitLarge 3 | from .e5_mistral_7b_instruct import E5Mistral7b 4 | from .embeddinggpt import EmbeddingGPT 5 | from .gte_qwen1_5_7b_instruct import GteQwenInstruct 6 | from .sfr_embedding_mistral import SFREmbeddingMistral 7 | from .stella import Stella 8 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/abstract_embedding_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Any, Dict, List, Union 11 | import json 12 | import logging 13 | 14 | 15 | class AbstractEmbeddingModel(ABC): 16 | """ 17 | Abstract base class that defines the interface for all embedding models. 18 | """ 19 | 20 | def __init__( 21 | self, config_path: str = None, model_name: str = "", name: str = "INVALID_NAME", cache: bool = False 22 | ) -> None: 23 | """ 24 | Initialize the AbstractEmbeddingModel instance with configuration, model details, and caching options. 25 | 26 | :param config_path: Path to the config file. If provided, the config is loaded from the file. Defaults to "". 27 | :type config_path: str 28 | :param model_name: Name of the language model. Defaults to "". 29 | :type model_name: str 30 | :param name: Name of the embedding model. Defaults to "INVALID_NAME". 31 | :type name: str 32 | :param cache: Flag to determine whether to cache responses. Defaults to False. 
33 | :type cache: bool 34 | """ 35 | self.logger = logging.getLogger(self.__class__.__name__) 36 | self.config: Dict = None 37 | self.model_name: str = model_name 38 | self.cache = cache 39 | if self.cache: 40 | self.response_cache: Dict[str, List[Any]] = {} 41 | if config_path is not None: 42 | self.load_config(config_path) 43 | self.name: str = name 44 | try: 45 | if self.config is not None: 46 | if self.config[model_name] is not None: 47 | self.name = self.config[model_name]["name"] 48 | except Exception: 49 | pass 50 | self.prompt_tokens: int = 0 51 | self.cost: float = 0.0 52 | 53 | def load_config(self, path: str) -> None: 54 | """ 55 | Load configuration from a specified path. 56 | 57 | :param path: Path to the config file. 58 | :type path: str 59 | """ 60 | with open(path, "r") as f: 61 | self.config = json.load(f) 62 | 63 | self.logger.debug(f"Loaded config from {path} for {self.model_name}") 64 | 65 | def clear_cache(self) -> None: 66 | """ 67 | Clear the response cache. 68 | """ 69 | self.response_cache.clear() 70 | 71 | @abstractmethod 72 | def load_model(self, device: str = None) -> None: 73 | """ 74 | Abstract method to load the embedding model. 75 | 76 | :param device: The device to load the model on. Defaults to None. 77 | :type device: str 78 | """ 79 | pass 80 | 81 | @abstractmethod 82 | def unload_model(self) -> None: 83 | """ 84 | Abstract method to unload the embedding model. 85 | """ 86 | pass 87 | 88 | @abstractmethod 89 | def generate_embedding(self, input: Union[List[Any], Any]) -> List[List[float]]: 90 | """ 91 | Abstract method to generate embedding for the given input text. 92 | 93 | :param input: The input text to embed. 94 | :type input: Union[List[Any], Any] 95 | :return: The embeddings of the text. 96 | :rtype: List[List[float]] 97 | """ 98 | pass 99 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/clip_vit_large.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main authors: Lorenzo Paleari 8 | # Eric Schreiber 9 | 10 | import gc 11 | from typing import List, Union 12 | 13 | import torch 14 | from PIL.Image import Image 15 | from transformers import CLIPModel, CLIPProcessor 16 | 17 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 18 | 19 | 20 | class ClipVitLarge(AbstractEmbeddingModel): 21 | """ 22 | The ClipVitLarge class handles interactions with the CLIP ViT Large model using the provided configuration. 23 | 24 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 25 | """ 26 | 27 | def __init__( 28 | self, model_name: str = "", name: str = "clip-vit-large-patch-14", cache: bool = False 29 | ) -> None: 30 | """ 31 | Initialize the ClipVitLarge instance with configuration, model details, and caching options. 32 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 33 | :type model_name: str 34 | :param name: Name used for output files. Defaults to "clip-vit-large-patch-14". 35 | :type name: str 36 | :param cache: Flag to determine whether to cache responses. Defaults to False. 
37 | :type cache: bool 38 | """ 39 | super().__init__(model_name=model_name, name=name, cache=cache) 40 | self.processor_name = model_name 41 | 42 | def load_model(self, device: str = None) -> None: 43 | """ 44 | Load the model and tokenizer based on the given model name. 45 | 46 | :param device: The device to load the model on. Defaults to None. 47 | :type device: str 48 | """ 49 | self.model = CLIPModel.from_pretrained(self.model_name).eval() 50 | self.processor = CLIPProcessor.from_pretrained(self.processor_name) 51 | self.model = self.model.to(device) 52 | 53 | def unload_model(self) -> None: 54 | """ 55 | Unload the model and tokenizer. 56 | """ 57 | del self.processor 58 | del self.model 59 | 60 | gc.collect() 61 | torch.cuda.empty_cache() 62 | 63 | self.processor = None 64 | self.model = None 65 | 66 | def generate_embedding(self, input: Union[List[Image], Image]) -> List[List[float]]: 67 | """ 68 | Abstract method to generate embedding for the given input text. 69 | 70 | :param input: The input image to embed. 71 | :type input: Union[List[Image], Image] 72 | :return: The embeddings of the image. 73 | :rtype: List[List[float]] 74 | """ 75 | if not isinstance(input, List): 76 | input = [input] 77 | 78 | total_embeddings = [] 79 | for image in input: 80 | inputs = self.processor(images=image, return_tensors="pt") 81 | inputs = {k: v.to(self.model.device) for k, v in inputs.items()} 82 | with torch.no_grad(): 83 | latents = self.model.get_image_features(**inputs).squeeze().cpu().numpy().tolist() 84 | total_embeddings.append(latents) 85 | return total_embeddings 86 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/e5_mistral_7b_instruct.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import torch 10 | import gc 11 | import torch.nn.functional as F 12 | from torch import Tensor 13 | from tqdm import tqdm 14 | from transformers import AutoTokenizer, AutoModel 15 | 16 | from typing import List, Union 17 | 18 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 19 | 20 | 21 | class E5Mistral7b(AbstractEmbeddingModel): 22 | """ 23 | The E5Mistral7b class handles interactions with the E5Mistral7b embedding model using the provided configuration. 24 | 25 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, model_name: str = "", name: str = "e5-mistral-7B-instruct", cache: bool = False, max_length: int = 4096, batch_size: int = 64 30 | ) -> None: 31 | """ 32 | Initialize the E5Mistral7b instance with configuration, model details, and caching options. 33 | 34 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 35 | :type model_name: str 36 | :param name: Name used for output files. Defaults to "e5-mistral-7B-instruct". 37 | :type name: str 38 | :param cache: Flag to determine whether to cache responses. Defaults to False. 39 | :type cache: bool 40 | :param max_length: The maximum length of the input text. Defaults to 4096. 41 | :type max_length: int 42 | :param batch_size: The batch size to be used for the model. Defaults to 64. 
43 | :type batch_size: int 44 | """ 45 | super().__init__(model_name=model_name, name=name, cache=cache) 46 | self.tokenizer_name = model_name 47 | self.max_length = max_length 48 | self.batch_size = batch_size 49 | 50 | def load_model(self, device: str = None) -> None: 51 | """ 52 | Load the model and tokenizer based on the given model name. 53 | 54 | :param device: The device to load the model on. 55 | :type device: str 56 | """ 57 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name) 58 | self.model = AutoModel.from_pretrained(self.model_name, device_map=device) 59 | 60 | def unload_model(self) -> None: 61 | """ 62 | Unload the model and tokenizer. 63 | """ 64 | del self.tokenizer 65 | del self.model 66 | 67 | gc.collect() 68 | torch.cuda.empty_cache() 69 | 70 | self.tokenizer = None 71 | self.model = None 72 | 73 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 74 | """ 75 | Abstract method to generate embedding for the given input text. 76 | 77 | :param input: The input text to embed. 78 | :type input: Union[List[str], str] 79 | :return: The embeddings of the text. 80 | :rtype: List[List[float]] 81 | """ 82 | if isinstance(input, str): 83 | input = [input] 84 | 85 | total_embeddings = [] 86 | flag = True 87 | 88 | while flag: 89 | try: 90 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 91 | 92 | embeddings = None 93 | outputs = None 94 | batch_dict = None 95 | 96 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 97 | batch_dict = self.tokenizer(batch, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt') 98 | batch_dict.to(self.model.device) 99 | 100 | with torch.no_grad(): 101 | outputs = self.model(**batch_dict) 102 | embeddings = self.last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 103 | 104 | # normalize embeddings 105 | embeddings = F.normalize(embeddings, p=2, dim=1) 106 | total_embeddings.extend(embeddings.cpu().detach().numpy().tolist()) 107 | 108 | del embeddings, outputs, batch_dict 109 | gc.collect() 110 | torch.cuda.empty_cache() 111 | 112 | flag = False 113 | 114 | except Exception as e: 115 | embeddings = None 116 | outputs = None 117 | batch_dict = None 118 | total_embeddings = [] 119 | gc.collect() 120 | torch.cuda.empty_cache() 121 | 122 | print("Error occurred, reducing batch size and retrying") 123 | if self.batch_size == 1: 124 | raise e 125 | self.batch_size = self.batch_size // 2 # reduce batch size by half 126 | 127 | return total_embeddings 128 | 129 | def last_token_pool(self, last_hidden_states: Tensor, 130 | attention_mask: Tensor) -> Tensor: 131 | """ 132 | Pools the last non-padding token's hidden state from the model's output. 133 | 134 | This method extracts the hidden state of the last token that is not a padding token. 135 | If the last token is a padding token, it retrieves the hidden state of the 136 | second to last token that is not a padding token. 137 | 138 | :param last_hidden_states: A tensor containing the hidden states from the last layer of the model. 139 | :type last_hidden_states: Tensor 140 | :param attention_mask: A tensor indicating the positions of non-padding tokens (1 for non-padding, 0 for padding). 141 | :type attention_mask: Tensor 142 | :return: A tensor containing the hidden states of the last non-padding token for each sequence in the batch. 
143 | :rtype: Tensor 144 | """ 145 | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) 146 | if left_padding: 147 | return last_hidden_states[:, -1] 148 | else: 149 | sequence_lengths = attention_mask.sum(dim=1) - 1 150 | batch_size = last_hidden_states.shape[0] 151 | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] 152 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/embeddinggpt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import backoff 10 | import os 11 | from typing import Dict, List, Union 12 | from openai import OpenAI, OpenAIError 13 | from openai.types import CreateEmbeddingResponse 14 | from tqdm import tqdm 15 | from concurrent.futures import ThreadPoolExecutor, as_completed 16 | 17 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 18 | 19 | 20 | class EmbeddingGPT(AbstractEmbeddingModel): 21 | """ 22 | The EmbeddingGPT class handles interactions with the OpenAI embedding models using the provided configuration. 23 | 24 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 25 | """ 26 | 27 | def __init__( 28 | self, config_path: str = "", model_name: str = "gpt-embedding-large", cache: bool = False, max_concurrent_requests: int = 10 29 | ) -> None: 30 | """ 31 | Initialize the EmbeddingGPT instance with configuration, model details, and caching options. 32 | 33 | :param config_path: Path to the configuration file. Defaults to "". 34 | :type config_path: str 35 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to 'gpt-embedding-large'. 36 | :type model_name: str 37 | :param cache: Flag to determine whether to cache responses. Defaults to False. 38 | :type cache: bool 39 | :param max_concurrent_requests: The maximum number of concurrent requests. Defaults to 10. 40 | :type max_concurrent_requests: int 41 | """ 42 | super().__init__(config_path, model_name, cache=cache) 43 | self.config: Dict = self.config[model_name] 44 | # The model_id is the id of the model that is used for chatgpt, i.e. gpt-4, gpt-3.5-turbo, etc. 45 | self.model_id: str = self.config["model_id"] 46 | # The prompt_token_cost and response_token_cost are the costs for 1000 prompt tokens and 1000 response tokens respectively. 47 | self.prompt_token_cost: float = self.config["token_cost"] 48 | self.encoding: str = self.config["encoding"] 49 | self.dimension: int = self.config["dimension"] 50 | # The account organization is the organization that is used for chatgpt. 51 | self.organization: str = self.config["organization"] 52 | if self.config["organization"] == "": 53 | self.logger.warning("OPENAI_ORGANIZATION is not set") 54 | self.api_key: str = os.getenv("OPENAI_API_KEY", self.config["api_key"]) 55 | if os.getenv("OPENAI_API_KEY", self.config["api_key"]) == "": 56 | self.logger.warning("OPENAI_API_KEY is not set") 57 | # Initialize the OpenAI Client 58 | self.client = OpenAI(api_key=self.api_key, organization=self.organization) 59 | 60 | self.max_concurrent_requests = max_concurrent_requests 61 | 62 | def load_model(self, device: str = None) -> None: 63 | """ 64 | Load the embedding model locally. 
65 | 66 | :param device: The device to load the model on. 67 | :type device: str 68 | """ 69 | pass 70 | 71 | def unload_model(self) -> None: 72 | """ 73 | Unload the embedding model locally. 74 | """ 75 | pass 76 | 77 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 78 | """ 79 | Generate embeddings for the given input text. 80 | 81 | :param input: The input texts to embed. 82 | :type input: Union[List[str], str] 83 | :return: The embeddings of the text. 84 | :rtype: List[List[float]] 85 | """ 86 | if isinstance(input, str): 87 | input = [input] 88 | 89 | with ThreadPoolExecutor(max_workers=self.max_concurrent_requests) as executor: 90 | futures = [executor.submit(self.embed_query, i) for i in input] 91 | results = [] 92 | for future in tqdm(as_completed(futures), total=len(futures), desc="Embeddings", leave=False): 93 | try: 94 | response = future.result() 95 | results.append(response.data[0].embedding) 96 | except OpenAIError as e: 97 | self.logger.error(f"OpenAIError: {e}") 98 | except Exception as e: 99 | self.logger.error(f"Unexpected error: {e}") 100 | return results 101 | 102 | @backoff.on_exception(backoff.expo, OpenAIError, max_time=10, max_tries=6) 103 | def embed_query(self, input: str) -> CreateEmbeddingResponse: 104 | """ 105 | Embed the given text into a vector. 106 | 107 | :param input: The text to embed. 108 | :type input: str 109 | :return: The embedding of the text. 110 | :rtype: CreateEmbeddingResponse 111 | """ 112 | response = self.client.embeddings.create( 113 | model=self.model_id, 114 | input=input, 115 | dimensions=self.dimension, 116 | encoding_format=self.encoding, 117 | ) 118 | 119 | self.prompt_tokens += response.usage.prompt_tokens 120 | prompt_tokens_k = float(self.prompt_tokens) / 1000.0 121 | self.cost = ( 122 | self.prompt_token_cost * prompt_tokens_k 123 | ) 124 | self.logger.info( 125 | #f"This is the response from chatgpt: {response}" 126 | f"\nRESPONDED - This is the cost of the response: {self.prompt_token_cost * float(response.usage.prompt_tokens) / 1000.0}" 127 | ) 128 | return response 129 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/gte_qwen1_5_7b_instruct.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import torch 10 | import gc 11 | import torch.nn.functional as F 12 | from torch import Tensor 13 | from tqdm import tqdm 14 | from transformers import AutoTokenizer, AutoModel 15 | 16 | from typing import List, Union 17 | 18 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 19 | 20 | 21 | class GteQwenInstruct(AbstractEmbeddingModel): 22 | """ 23 | The GteQwenInstruct class handles interactions with the gte-Qwen1.5-7B-instruct embedding model using the provided configuration. 24 | 25 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, access_token: str = "", model_name: str = "", name: str = "gte-qwen1.5-7B-instruct", cache: bool = False, max_length: int = 8192, batch_size: int = 64 30 | ) -> None: 31 | """ 32 | Initialize the GteQwenInstruct instance with configuration, model details, and caching options. 33 | 34 | :param access_token: The Hugging Face access token to use for the model. 
Defaults to "". 35 | :type access_token: str 36 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 37 | :type model_name: str 38 | :param name: Name used for output files. Defaults to "gte-qwen1.5-7B-instruct". 39 | :type name: str 40 | :param cache: Flag to determine whether to cache responses. Defaults to False. 41 | :type cache: bool 42 | :param max_length: The maximum length of the input text. Defaults to 8192. 43 | :type max_length: int 44 | :param batch_size: The batch size to be used for the model. Defaults to 64. 45 | :type batch_size: int 46 | """ 47 | super().__init__(model_name=model_name, name=name, cache=cache) 48 | self.tokenizer_name = model_name 49 | self.max_length = max_length 50 | self.access_token = access_token 51 | self.batch_size = batch_size 52 | 53 | def load_model(self, device: str = None) -> None: 54 | """ 55 | Load the model and tokenizer based on the given model name. 56 | 57 | :param device: The device to load the model on. 58 | :type device: str 59 | """ 60 | 61 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, trust_remote_code=True, token=self.access_token) 62 | self.model = AutoModel.from_pretrained(self.model_name, device_map=device, trust_remote_code=True, token=self.access_token) 63 | 64 | def unload_model(self) -> None: 65 | """ 66 | Unload the model and tokenizer. 67 | """ 68 | del self.tokenizer 69 | del self.model 70 | 71 | gc.collect() 72 | torch.cuda.empty_cache() 73 | 74 | self.tokenizer = None 75 | self.model = None 76 | 77 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 78 | """ 79 | Abstract method to generate embedding for the given input text. 80 | 81 | :param input: The input text to embed. 82 | :type input: Union[List[str], str] 83 | :return: The embeddings of the text. 84 | :rtype: List[List[float]] 85 | """ 86 | if isinstance(input, str): 87 | input = [input] 88 | 89 | total_embeddings = [] 90 | flag = True 91 | 92 | while flag: 93 | try: 94 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 95 | 96 | embeddings = None 97 | outputs = None 98 | batch_dict = None 99 | 100 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 101 | batch_dict = self.tokenizer(batch, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt') 102 | batch_dict.to(self.model.device) 103 | 104 | with torch.no_grad(): 105 | outputs = self.model(**batch_dict) 106 | embeddings = self.last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 107 | 108 | # normalize embeddings 109 | embeddings = F.normalize(embeddings, p=2, dim=1) 110 | total_embeddings.extend(embeddings.cpu().detach().numpy().tolist()) 111 | 112 | del embeddings, outputs, batch_dict 113 | gc.collect() 114 | torch.cuda.empty_cache() 115 | 116 | flag = False 117 | 118 | except Exception as e: 119 | embeddings = None 120 | outputs = None 121 | batch_dict = None 122 | total_embeddings = [] 123 | gc.collect() 124 | torch.cuda.empty_cache() 125 | 126 | print("Error occurred, reducing batch size and retrying") 127 | if self.batch_size == 1: 128 | raise e 129 | self.batch_size = self.batch_size // 2 # reduce batch size by half 130 | 131 | return total_embeddings 132 | 133 | def last_token_pool(self, last_hidden_states: Tensor, 134 | attention_mask: Tensor) -> Tensor: 135 | """ 136 | Pools the last non-padding token's hidden state from the model's output. 
137 | 138 | This method extracts the hidden state of the last token that is not a padding token. 139 | If the last token is a padding token, it retrieves the hidden state of the 140 | second to last token that is not a padding token. 141 | 142 | :param last_hidden_states: A tensor containing the hidden states from the last layer of the model. 143 | :type last_hidden_states: Tensor 144 | :param attention_mask: A tensor indicating the positions of non-padding tokens (1 for non-padding, 0 for padding). 145 | :type attention_mask: Tensor 146 | :return: A tensor containing the hidden states of the last non-padding token for each sequence in the batch. 147 | :rtype: Tensor 148 | """ 149 | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) 150 | if left_padding: 151 | return last_hidden_states[:, -1] 152 | else: 153 | sequence_lengths = attention_mask.sum(dim=1) - 1 154 | batch_size = last_hidden_states.shape[0] 155 | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] 156 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/sfr_embedding_mistral.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import torch 10 | import gc 11 | import torch.nn.functional as F 12 | from torch import Tensor 13 | from tqdm import tqdm 14 | from transformers import AutoTokenizer, AutoModel 15 | 16 | from typing import List, Union 17 | 18 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 19 | 20 | 21 | class SFREmbeddingMistral(AbstractEmbeddingModel): 22 | """ 23 | The SFREmbeddingMistral class handles interactions with the SFR Embedding Mistral model using the provided configuration. 24 | 25 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, model_name: str = "", name: str = "sfr-embedding-mistral", cache: bool = False, max_length: int = 4096, batch_size: int = 64 30 | ) -> None: 31 | """ 32 | Initialize the SFR Embedding Mistral instance with configuration, model details, and caching options. 33 | 34 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 35 | :type model_name: str 36 | :param name: Name used for output files. Defaults to "sfr-embedding-mistral". 37 | :type name: str 38 | :param cache: Flag to determine whether to cache responses. Defaults to False. 39 | :type cache: bool 40 | :param max_length: The maximum length of the input text. Defaults to 4096. 41 | :type max_length: int 42 | :param batch_size: The batch size to be used for the model. Defaults to 64. 43 | :type batch_size: int 44 | """ 45 | super().__init__(model_name=model_name, name=name, cache=cache) 46 | self.tokenizer_name = model_name 47 | self.max_length = max_length 48 | self.batch_size = batch_size 49 | 50 | def load_model(self, device: str = None) -> None: 51 | """ 52 | Load the model and tokenizer based on the given model name. 53 | 54 | :param device: The device to load the model on. 
55 | :type device: str 56 | """ 57 | 58 | self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name) 59 | self.model = AutoModel.from_pretrained(self.model_name, device_map=device) 60 | 61 | def unload_model(self) -> None: 62 | """ 63 | Unload the model and tokenizer. 64 | """ 65 | del self.tokenizer 66 | del self.model 67 | 68 | gc.collect() 69 | torch.cuda.empty_cache() 70 | 71 | self.tokenizer = None 72 | self.model = None 73 | 74 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 75 | """ 76 | Abstract method to generate embedding for the given input text. 77 | 78 | :param input: The input text to embed. 79 | :type input: Union[List[str], str] 80 | :return: The embeddings of the text. 81 | :rtype: List[List[float]] 82 | """ 83 | if isinstance(input, str): 84 | input = [input] 85 | 86 | total_embeddings = [] 87 | flag = True 88 | 89 | while flag: 90 | try: 91 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 92 | 93 | embeddings = None 94 | outputs = None 95 | batch_dict = None 96 | 97 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 98 | batch_dict = self.tokenizer(batch, max_length=self.max_length, padding=True, truncation=True, return_tensors='pt') 99 | batch_dict.to(self.model.device) 100 | 101 | with torch.no_grad(): 102 | outputs = self.model(**batch_dict) 103 | embeddings = self.last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 104 | 105 | # normalize embeddings 106 | embeddings = F.normalize(embeddings, p=2, dim=1) 107 | total_embeddings.extend(embeddings.cpu().detach().numpy().tolist()) 108 | 109 | del embeddings, outputs, batch_dict 110 | gc.collect() 111 | torch.cuda.empty_cache() 112 | 113 | flag = False 114 | 115 | except Exception as e: 116 | embeddings = None 117 | outputs = None 118 | batch_dict = None 119 | total_embeddings = [] 120 | gc.collect() 121 | torch.cuda.empty_cache() 122 | 123 | print("Error occurred, reducing batch size and retrying") 124 | if self.batch_size == 1: 125 | raise e 126 | self.batch_size = self.batch_size // 2 # reduce batch size by half 127 | 128 | return total_embeddings 129 | 130 | def last_token_pool(self, last_hidden_states: Tensor, 131 | attention_mask: Tensor) -> Tensor: 132 | """ 133 | Pools the last non-padding token's hidden state from the model's output. 134 | 135 | This method extracts the hidden state of the last token that is not a padding token. 136 | If the last token is a padding token, it retrieves the hidden state of the 137 | second to last token that is not a padding token. 138 | 139 | :param last_hidden_states: A tensor containing the hidden states from the last layer of the model. 140 | :type last_hidden_states: Tensor 141 | :param attention_mask: A tensor indicating the positions of non-padding tokens (1 for non-padding, 0 for padding). 142 | :type attention_mask: Tensor 143 | :return: A tensor containing the hidden states of the last non-padding token for each sequence in the batch. 
144 | :rtype: Tensor 145 | """ 146 | left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0]) 147 | if left_padding: 148 | return last_hidden_states[:, -1] 149 | else: 150 | sequence_lengths = attention_mask.sum(dim=1) - 1 151 | batch_size = last_hidden_states.shape[0] 152 | return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths] 153 | -------------------------------------------------------------------------------- /CheckEmbed/embedding_models/stella.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import gc 11 | import torch 12 | from tqdm import tqdm 13 | from transformers import AutoModel, AutoTokenizer 14 | from sklearn.preprocessing import normalize 15 | from huggingface_hub import snapshot_download 16 | 17 | from typing import List, Literal, Union 18 | 19 | from CheckEmbed.embedding_models import AbstractEmbeddingModel 20 | 21 | 22 | class Stella(AbstractEmbeddingModel): 23 | """ 24 | The Stella class handles interactions with the Stella embedding model family using the provided configuration. 25 | 26 | Inherits from the AbstractEmbeddingModel class and implements its abstract methods. 27 | """ 28 | 29 | def __init__( 30 | self, model_name: str = "", variant: Literal["400M-v5", "1.5B-v5", ""] = "400M-v5", name: str = "stella-en-", cache: bool = False, max_length: int = 4096, batch_size: int = 64 31 | ) -> None: 32 | """ 33 | Initialize the Stella instance with configuration, model details, and caching options. 34 | 35 | :param model_name: Name of the model, default is "". Used to select the correct configuration. 36 | :type model_name: str 37 | :param variant: The variant of the Stella model to use. Defaults to "400M_v5". 38 | :type variant: Literal["400M-v5", "1.5B-v5", ""] 39 | :param name: Name used for output files. Defaults to "stella-en-". 40 | :type name: str 41 | :param cache: Flag to determine whether to cache responses. Defaults to False. 42 | :type cache: bool 43 | :param max_length: The maximum length of the input text. Defaults to 4096. 44 | :type max_length: int 45 | :param batch_size: The batch size to be used for the model. Defaults to 64. 46 | :type batch_size: int 47 | """ 48 | super().__init__(model_name=model_name, name=name + variant, cache=cache) 49 | self.max_length = max_length 50 | self.batch_size = batch_size 51 | 52 | def load_model(self, device: str = None) -> None: 53 | """ 54 | Load the model and tokenizer based on the given model name. 55 | 56 | :param device: The device to load the model on. 
57 | :type device: str 58 | """ 59 | try: 60 | model_dir = snapshot_download(repo_id=self.model_name) 61 | except Exception as e: 62 | raise ValueError(f"Model {self.model_name} not found in the Hugging Face Hub") from e 63 | vector_linear_directory = f"2_Dense_{self.max_length}" 64 | self.model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).to(device).eval() 65 | self.tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True) 66 | self.vector_linear = torch.nn.Linear(in_features=self.model.config.hidden_size, out_features=self.max_length) 67 | vector_linear_dict = { 68 | k.replace("linear.", ""): v for k, v in 69 | torch.load(os.path.join(model_dir, f"{vector_linear_directory}/pytorch_model.bin")).items() 70 | } 71 | self.vector_linear.load_state_dict(vector_linear_dict) 72 | self.vector_linear.to(device) 73 | 74 | def unload_model(self) -> None: 75 | """ 76 | Unload the model and tokenizer from memory. 77 | """ 78 | del self.model 79 | 80 | gc.collect() 81 | torch.cuda.empty_cache() 82 | 83 | self.model = None 84 | 85 | def generate_embedding(self, input: Union[List[str], str]) -> List[List[float]]: 86 | """ 87 | Generate the embeddings for the given input text. 88 | 89 | :param input: The input text to embed. 90 | :type input: Union[List[str], str] 91 | :return: The embeddings of the text. 92 | :rtype: List[List[float]] 93 | """ 94 | if isinstance(input, str): 95 | input = [input] 96 | 97 | total_embeddings = [] 98 | flag = True 99 | 100 | while flag: 101 | try: 102 | batched_responses = [input[i:i+self.batch_size] for i in range(0, len(input), self.batch_size)] 103 | 104 | batch_dict = None 105 | attention_mask = None 106 | last_hidden_state = None 107 | last_hidden = None 108 | docs_vectors = None 109 | 110 | for batch in tqdm(batched_responses, desc="Batches to Embed", leave=False, total=len(batched_responses)): 111 | with torch.no_grad(): 112 | batch_dict = self.tokenizer(batch, padding="longest", truncation=True, max_length=512, return_tensors="pt") 113 | batch_dict = {k: v.to(self.model.device) for k, v in batch_dict.items()} 114 | attention_mask = batch_dict["attention_mask"] 115 | last_hidden_state = self.model(**batch_dict)[0] 116 | last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0) 117 | docs_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] 118 | docs_vectors = normalize(self.vector_linear(docs_vectors).cpu().detach().numpy()) 119 | 120 | total_embeddings.extend(docs_vectors.tolist()) 121 | 122 | del batch_dict, attention_mask, last_hidden_state, last_hidden, docs_vectors 123 | gc.collect() 124 | torch.cuda.empty_cache() 125 | 126 | flag = False 127 | 128 | except Exception as e: 129 | batch_dict = None 130 | attention_mask = None 131 | last_hidden_state = None 132 | last_hidden = None 133 | docs_vectors = None 134 | total_embeddings = [] 135 | gc.collect() 136 | torch.cuda.empty_cache() 137 | 138 | print("Error occurred, reducing batch size and retrying") 139 | if self.batch_size == 1: 140 | raise e 141 | self.batch_size = self.batch_size // 2 142 | 143 | return total_embeddings 144 | -------------------------------------------------------------------------------- /CheckEmbed/language_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_language_model import AbstractLanguageModel 2 | from .chatgpt import ChatGPT 3 | from .chatollama import LLMChatOllama 
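
A minimal usage sketch tying the embedding classes above together: it instantiates one of the local embedding models exported from `CheckEmbed.embedding_models`, embeds a few texts, and frees the GPU memory afterwards. The device string, the example sentences, and calling the model directly (rather than through the `Scheduler`) are illustrative assumptions, not the framework's prescribed workflow.

```python
from CheckEmbed.embedding_models import Stella

# Instantiate a local embedding model (see stella.py above for the constructor arguments).
stella = Stella(
    model_name="NovaSearch/stella_en_400M_v5",
    cache=False,
    batch_size=64,
)

# load_model() fetches the weights from the Hugging Face Hub on first use.
# "cuda:0" is an assumed device name; any torch device string works.
stella.load_model(device="cuda:0")

# generate_embedding() returns one embedding (a list of floats) per input text.
embeddings = stella.generate_embedding([
    "The quick brown fox jumps over the lazy dog.",
    "A second sample text to embed.",
])
print(len(embeddings), len(embeddings[0]))  # number of texts, embedding dimension

# unload_model() releases the model and clears the CUDA cache.
stella.unload_model()
```
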
-------------------------------------------------------------------------------- /CheckEmbed/language_models/abstract_language_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # original file from Graph of Thoughts framework: 8 | # https://github.com/spcl/graph-of-thoughts 9 | # 10 | # main author: Nils Blach 11 | # 12 | # modifications: Lorenzo Paleari 13 | 14 | from abc import ABC, abstractmethod 15 | from typing import Any, Dict, List, Union 16 | import json 17 | import logging 18 | 19 | 20 | class AbstractLanguageModel(ABC): 21 | """ 22 | Abstract base class that defines the interface for all language models. 23 | """ 24 | 25 | # modified by Lorenzo Paleari 26 | def __init__( 27 | self, config_path: str = None, model_name: str = "", cache: bool = False 28 | ) -> None: 29 | """ 30 | Initialize the AbstractLanguageModel instance with configuration, model details, and caching options. 31 | 32 | :param config_path: Path to the config file. Defaults to None. If provided, the config is loaded from the file. 33 | :type config_path: str 34 | :param model_name: Name of the language model. Defaults to "". 35 | :type model_name: str 36 | :param cache: Flag to determine whether to cache responses. Defaults to False. 37 | :type cache: bool 38 | """ 39 | self.logger = logging.getLogger(self.__class__.__name__) 40 | self.config: Dict = None 41 | self.model_name: str = model_name 42 | self.cache = cache 43 | if self.cache: 44 | self.response_cache: Dict[str, List[Any]] = {} 45 | if config_path is not None: 46 | self.load_config(config_path) 47 | self.name: str = self.config[model_name]["name"] 48 | self.prompt_tokens: int = 0 49 | self.completion_tokens: int = 0 50 | self.cost: float = 0.0 51 | 52 | # modified by Lorenzo Paleari 53 | def load_config(self, path: str) -> None: 54 | """ 55 | Load configuration from a specified path. 56 | 57 | :param path: Path to the config file. 58 | :type path: str 59 | """ 60 | with open(path, "r") as f: 61 | self.config = json.load(f) 62 | 63 | self.logger.debug(f"Loaded config from {path} for {self.model_name}") 64 | 65 | def clear_cache(self) -> None: 66 | """ 67 | Clear the response cache. 68 | """ 69 | self.response_cache.clear() 70 | 71 | # written by Lorenzo Paleari 72 | @abstractmethod 73 | def load_model(self, device: str = None) -> None: 74 | """ 75 | Abstract method to load the language model. 76 | 77 | :param device: The device to load the model on. 78 | :type device: str 79 | """ 80 | pass 81 | 82 | # written by Lorenzo Paleari 83 | @abstractmethod 84 | def unload_model(self) -> None: 85 | """ 86 | Abstract method to unload the language model. 87 | """ 88 | pass 89 | 90 | # modified by Lorenzo Paleari 91 | @abstractmethod 92 | def query(self, query: str, num_query: int = 1) -> Any: 93 | """ 94 | Abstract method to query the language model. 95 | 96 | :param query: The prompt that is going to be used as query to the language model. 97 | :type query: str 98 | :param num_query: The number of queries to be posed to the language model for each prompt. Defaults to 1. 99 | :type num_query: int 100 | :return: The language model's response(s). 
101 | :rtype: Any 102 | """ 103 | pass 104 | 105 | # modified by Lorenzo Paleari 106 | @abstractmethod 107 | def get_response_texts(self, query_response: Union[List[Any], Any]) -> List[str]: 108 | """ 109 | Abstract method to extract response texts from the language model's response(s). 110 | 111 | :param query_response: The responses returned from the language model. 112 | :type query_response: Union[List[Any], Any] 113 | :return: List of textual responses. 114 | :rtype: List[str] 115 | """ 116 | pass 117 | -------------------------------------------------------------------------------- /CheckEmbed/language_models/chatgpt.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # original file from Graph of Thoughts framework: 8 | # https://github.com/spcl/graph-of-thoughts 9 | # 10 | # main author: Nils Blach 11 | # 12 | # modifications: Lorenzo Paleari 13 | 14 | 15 | from concurrent.futures import ThreadPoolExecutor, as_completed 16 | from typing import Dict, List, Union 17 | 18 | import backoff 19 | from openai import OpenAI, OpenAIError 20 | from openai.types.chat.chat_completion import ChatCompletion 21 | from tqdm import tqdm 22 | 23 | from CheckEmbed.language_models import AbstractLanguageModel 24 | 25 | 26 | class ChatGPT(AbstractLanguageModel): 27 | """ 28 | The ChatGPT class handles interactions with the OpenAI models using the provided configuration. 29 | 30 | Inherits from the AbstractLanguageModel class and implements its abstract methods. 31 | """ 32 | 33 | # modified by Lorenzo Paleari 34 | def __init__( 35 | self, config_path: str = "", model_name: str = "chatgpt4", cache: bool = False, max_concurrent_requests: int = 10, temperature: float = None 36 | ) -> None: 37 | """ 38 | Initialize the ChatGPT instance with configuration, model details, and caching options. 39 | 40 | :param config_path: Path to the configuration file. Defaults to "". 41 | :type config_path: str 42 | :param model_name: Name of the model, default is 'chatgpt4'. Used to select the correct configuration. 43 | :type model_name: str 44 | :param cache: Flag to determine whether to cache responses. Defaults to False. 45 | :type cache: bool 46 | :param max_concurrent_requests: The maximum number of concurrent requests. Defaults to 10. 47 | :type max_concurrent_requests: int 48 | :param temperature: The temperature for the model. If not provided, it will be taken from the config. 49 | :type temperature: float 50 | """ 51 | super().__init__(config_path, model_name, cache) 52 | self.config: Dict = self.config[model_name] 53 | # The model_id is the id of the model that is used for chatgpt, i.e. gpt-4, gpt-3.5-turbo, etc. 54 | self.model_id: str = self.config["model_id"] 55 | self.name = self.config["name"] 56 | # The prompt_token_cost and response_token_cost are the costs for 1000 prompt tokens and 1000 response tokens respectively. 57 | self.prompt_token_cost: float = self.config["prompt_token_cost"] 58 | self.response_token_cost: float = self.config["response_token_cost"] 59 | # The temperature of a model is defined as the randomness of the model's output. 60 | self.temperature: float = temperature if temperature is not None else self.config["temperature"] 61 | # The maximum number of tokens to generate in the chat completion. 
62 | self.max_tokens: int = self.config["max_tokens"] 63 | # The stop sequence is a sequence of tokens that the model will stop generating at (it will not generate the stop sequence). 64 | self.stop: Union[str, List[str]] = self.config["stop"] 65 | # The account organization is the organization that is used for chatgpt. 66 | self.organization: str = self.config["organization"] 67 | if self.config["organization"] == "": 68 | self.logger.warning("OPENAI_ORGANIZATION is not set") 69 | self.api_key: str = self.config["api_key"] 70 | if self.config["api_key"] == "": 71 | self.logger.warning("OPENAI_API_KEY is not set") 72 | # Initialize the OpenAI Client 73 | self.client = OpenAI(api_key=self.api_key, organization=self.organization) 74 | 75 | self.max_concurrent_requests = max_concurrent_requests 76 | 77 | # written by Lorenzo Paleari 78 | def load_model(self, device: str = None) -> None: 79 | """ 80 | Load the language model locally. 81 | 82 | :param device: The device to load the model on. 83 | :type device: str 84 | """ 85 | pass 86 | 87 | # written by Lorenzo Paleari 88 | def unload_model(self) -> None: 89 | """ 90 | Unload the language model locally. 91 | """ 92 | pass 93 | 94 | # modified by Lorenzo Paleari 95 | def query( 96 | self, query: str, num_query: int = 1 97 | ) -> List[ChatCompletion]: 98 | """ 99 | Query the OpenAI model for responses. 100 | 101 | :param query: The prompt that is going to be used as query to the language model. 102 | :type query: str 103 | :param num_query: The number of queries to be posed to the language model for each prompt. Defaults to 1. 104 | :type num_query: int 105 | :return: Response(s) from the OpenAI model. 106 | :rtype: List[ChatCompletion] 107 | """ 108 | if self.cache and query in self.response_cache: 109 | self.logger.debug(f"Used cache for query: {query}") 110 | return self.response_cache[query] 111 | 112 | with ThreadPoolExecutor(max_workers=self.max_concurrent_requests) as executor: 113 | futures = [executor.submit(self.chat, [{"role": "user", "content": query}], 1) for _ in range(num_query)] 114 | results = [] 115 | for future in tqdm(as_completed(futures), total=num_query, desc="Samples", leave=False): 116 | try: 117 | response = future.result() 118 | results.append(response) 119 | except OpenAIError as e: 120 | self.logger.error(f"OpenAIError: {e}") 121 | except Exception as e: 122 | self.logger.error(f"Unexpected error: {e}") 123 | 124 | if self.cache: 125 | self.response_cache[query] = results 126 | return results 127 | 128 | 129 | @backoff.on_exception(backoff.expo, OpenAIError, max_time=10, max_tries=6) 130 | def chat(self, messages: List[Dict], num_responses: int = 1) -> ChatCompletion: 131 | """ 132 | Send chat messages to the OpenAI model and retrieves the model's response. 133 | Implements backoff on OpenAI error. 134 | 135 | :param messages: A list of message dictionaries for the chat. 136 | :type messages: List[Dict] 137 | :param num_responses: Number of desired responses, default is 1. 138 | :type num_responses: int 139 | :return: The OpenAI model's response. 
140 | :rtype: ChatCompletion 141 | """ 142 | response = self.client.chat.completions.create( 143 | model=self.model_id, 144 | messages=messages, 145 | temperature=self.temperature, 146 | max_tokens=self.max_tokens, 147 | n=num_responses, 148 | stop=self.stop, 149 | ) 150 | self.prompt_tokens += response.usage.prompt_tokens 151 | self.completion_tokens += response.usage.completion_tokens 152 | prompt_tokens_k = float(self.prompt_tokens) / 1000.0 153 | completion_tokens_k = float(self.completion_tokens) / 1000.0 154 | self.cost = ( 155 | self.prompt_token_cost * prompt_tokens_k 156 | + self.response_token_cost * completion_tokens_k 157 | ) 158 | self.logger.info( 159 | #f"This is the response from chatgpt: {response}" 160 | f"This is the cost of the response: {self.prompt_token_cost * float(response.usage.prompt_tokens) / 1000.0 + self.response_token_cost * float(response.usage.completion_tokens) / 1000.0}" 161 | ) 162 | return response 163 | 164 | def get_response_texts( 165 | self, query_response: Union[List[ChatCompletion], ChatCompletion] 166 | ) -> List[str]: 167 | """ 168 | Extract the response texts from the query response. 169 | 170 | :param query_response: The response dictionary (or list of dictionaries) from the OpenAI model. 171 | :type query_response: Union[List[ChatCompletion], ChatCompletion] 172 | :return: List of response strings. 173 | :rtype: List[str] 174 | """ 175 | if not isinstance(query_response, List): 176 | query_response = [query_response] 177 | return [ 178 | choice.message.content 179 | for response in query_response 180 | for choice in response.choices 181 | ] 182 | -------------------------------------------------------------------------------- /CheckEmbed/language_models/chatollama.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | 10 | from typing import Dict, List, Union 11 | 12 | from langchain_ollama import ChatOllama 13 | from pydantic import BaseModel 14 | 15 | from CheckEmbed.language_models import AbstractLanguageModel 16 | 17 | 18 | class LLMChatOllama(AbstractLanguageModel): 19 | """ 20 | The LLMChatOllama class handles interactions with Ollama models using the provided configuration. 21 | 22 | Inherits from the AbstractLanguageModel class and implements its abstract methods. 23 | """ 24 | 25 | def __init__( 26 | self, config_path: str = "", model_name: str = "llama8b", cache: bool = False, temperature: float = None 27 | ) -> None: 28 | """ 29 | Initialize the LLMChatOllama instance with configuration, model details, and caching options. 30 | 31 | :param config_path: Path to the configuration file. Defaults to "". 32 | :type config_path: str 33 | :param model_name: Name of the model, default is 'llama8b'. Used to select the correct configuration. 34 | :type model_name: str 35 | :param cache: Flag to determine whether to cache responses. Defaults to False. 36 | :type cache: bool 37 | :param temperature: The temperature for the model. If not provided, it will be taken from the config. 
38 | :type temperature: float 39 | """ 40 | super().__init__(config_path, model_name, cache) 41 | self.config: Dict = self.config[model_name] 42 | self.model_id: str = self.config["model_id"] 43 | self.name = self.config["name"] 44 | self.num_ctx = self.config["num_ctx"] 45 | self.num_predict = self.config["num_predict"] 46 | self.num_batch = self.config["num_batch"] 47 | self.keep_alive = self.config["keep_alive"] 48 | self.temperature: float = temperature if temperature is not None else self.config["temperature"] 49 | # Initialize the Ollama Client 50 | self.client = ChatOllama( 51 | model=self.model_id, 52 | temperature=self.temperature, 53 | base_url="localhost:11434", 54 | num_ctx=self.num_ctx, 55 | num_predict=self.num_predict, 56 | num_batch=self.num_batch, 57 | keep_alive=self.keep_alive, 58 | ) 59 | 60 | def load_model(self, device: str = None) -> None: 61 | """ 62 | Load the language model locally. 63 | 64 | :param device: The device to load the model on. 65 | :type device: str 66 | """ 67 | pass 68 | 69 | def unload_model(self) -> None: 70 | """ 71 | Unload the language model locally. 72 | """ 73 | pass 74 | 75 | def add_structured_output(self, response: BaseModel) -> None: 76 | """ 77 | Add structured output to the response. 78 | 79 | :param response: The response from the language model. 80 | :type response: BaseModel 81 | """ 82 | self.client = self.client.with_structured_output( 83 | response, method="json_schema" 84 | ) 85 | 86 | def query( 87 | self, query: str, num_query: int = 1 88 | ) -> str: 89 | """ 90 | Query the Ollama model for responses. 91 | 92 | :param query: The prompt that is going to be used as query to the language model. 93 | :type query: str 94 | :param num_query: The number of queries to be posed to the language model for each prompt. Defaults to 1. 95 | :type num_query: int 96 | :return: Response(s) from the Ollama model. 97 | :rtype: str 98 | """ 99 | if self.cache and query in self.response_cache: 100 | self.logger.debug(f"Used cache for query: {query}") 101 | return self.response_cache[query] 102 | 103 | result = self.client.invoke( 104 | query 105 | ) 106 | 107 | if self.cache: 108 | self.response_cache[query] = result 109 | return result 110 | 111 | 112 | def get_response_texts( 113 | self, query_response: Union[List[str], str] 114 | ) -> List[str]: 115 | """ 116 | Extract the response texts from the query response. 117 | 118 | :param query_response: The response dictionary (or list of dictionaries) from the Ollama model. 119 | :type query_response: Union[List[ChatCompletion], ChatCompletion] 120 | :return: List of response strings. 121 | :rtype: List[str] 122 | """ 123 | pass 124 | -------------------------------------------------------------------------------- /CheckEmbed/operations/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation 2 | 3 | This section aims to provide insights into the results generated by the default operations and offers useful tips on how to effectively use these results for a correct evaluation. 4 | 5 | To aid in the visualization and the correct interpretation of the results, it is recommended to take a look at the plots that will be generated by executing any of the experiments in the examples folder. 6 | 7 | ## SelfCheckGPT 8 | 9 | The SelfCheckGPT baseline is usually evaluated based on three metrics. The results are stored in a JSON file inside the SelfCheckGPT folder, that is created during the execution of the pipeline. 
10 | - `result`: A list of all sentence-level scores for the prompt indicated by `index`. Each score is computed as $`1 - \text{SelfCheckGPTscore}`$, since the original score indicates the hallucination level of a sentence, whereas the CheckEmbed pipeline measures the level of similarity. 11 | - `passage_score`: Aggregated score for the complete passage, computed with the formula 12 | $$S_{\text{passage}} = \frac{1}{|R|} \sum_{i} S(i)$$ 13 | where $`|R|`$ is the number of sentences and $`S(i)`$ is the score associated with sentence $`i`$. 14 | - `std_dev`: The standard deviation of the sentence-level scores. 15 | 16 | An interpretation of the results and more information on the significance of the respective scores can be found in the [SelfCheckGPT paper](https://arxiv.org/pdf/2303.08896). 17 | 18 | ## BertScore - CheckEmbed 19 | 20 | Both BertScore and CheckEmbed use similar metrics: 21 | - `result \ cosine_sim`: A cosine similarity matrix representing the cosine similarity score for each pair of embeddings compared. 22 | - `frobenius_norm \ frob_norm_cosine_sim`: The Frobenius norm obtained from the cosine similarity matrix. 23 | - `std_dev \ std_dev_cosine_sim`: The standard deviation of the cosine similarity scores within the matrix. 24 | 25 | Additionally, CheckEmbed results can be evaluated using the following metrics: 26 | - `pearson_corr`: A Pearson correlation matrix representing the Pearson correlation score for each pair of embeddings compared. 27 | - `frob_norm_pearson_corr`: The Frobenius norm obtained from the Pearson correlation matrix. 28 | - `std_dev_pearson_corr`: The standard deviation of the Pearson correlation scores within the matrix. 29 | 30 | Additionally, the ground truth can be incorporated into the evaluation, if available. 31 | 32 | ## Frobenius Norm 33 | 34 | The following sections reference only the Frobenius norm, but the same concepts also apply to the Pearson correlation. 35 | 36 | ### With Ground-Truth 37 | 38 | Evaluation with ground-truth data is straightforward. 39 | It is sufficient to compare the cosine similarity results to determine whether the generated sample is closer to or further from the correct answer. 40 | The ground-truth results are represented as the last row of the cosine similarity matrix. 41 | Remember that the threshold for considering a result correct can vary depending on the embedding model used. 42 | 43 | ### Without Ground-Truth 44 | 45 | To evaluate results in the absence of ground truth, the full cosine similarity matrix should be considered to understand how close the different samples are to each other. 46 | Keeping in mind that the thresholds can differ depending on the embedding model used, the general stability of the answer can be assessed by examining the matrix. 47 | 48 | The `standard deviation` can additionally help to assess the correctness of an answer: a higher standard deviation indicates higher variance and uncertainty in the LLM answers, while a lower one indicates stability. 49 | 50 | The `Frobenius norm` is useful as a single score for the whole similarity matrix, avoiding the need to compare multiple matrices with each other. 51 | Please keep in mind that the Frobenius norm always outputs a positive number, treating negative values as positive (by squaring them). 52 | It is therefore recommended to also look at the `standard deviation` results in order to identify situations where two opposite results (-0.7 and 0.7) end up with a similarly high Frobenius score, as illustrated by the sketch below.
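The following minimal sketch illustrates this effect on two toy 3×3 cosine similarity matrices, one stable and one containing opposite results. It uses a plain normalized Frobenius norm over the off-diagonal entries and omits the additional rebasing performed by the actual CheckEmbed operations, so the exact numbers differ from the pipeline output; only the qualitative behavior is shown.

```python
import numpy as np

def frob_no_diag(matrix: np.ndarray) -> float:
    # Plain normalized Frobenius norm over the off-diagonal entries only.
    off_diag = matrix[~np.eye(matrix.shape[0], dtype=bool)]
    return float(np.sqrt(np.mean(off_diag ** 2)))

def std_no_diag(matrix: np.ndarray) -> float:
    # Standard deviation over the off-diagonal entries only.
    off_diag = matrix[~np.eye(matrix.shape[0], dtype=bool)]
    return float(np.std(off_diag))

stable = np.array([[1.0, 0.7, 0.7],
                   [0.7, 1.0, 0.7],
                   [0.7, 0.7, 1.0]])

unstable = np.array([[ 1.0,  0.7, -0.7],
                     [ 0.7,  1.0, -0.7],
                     [-0.7, -0.7,  1.0]])

print(frob_no_diag(stable), std_no_diag(stable))      # ~0.70, 0.00
print(frob_no_diag(unstable), std_no_diag(unstable))  # ~0.70, ~0.66
```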
53 | The resulting high standard deviation should suggest the need to examine the cosine similarity matrix more closely to better understand the results. 54 | -------------------------------------------------------------------------------- /CheckEmbed/operations/__init__.py: -------------------------------------------------------------------------------- 1 | from .operations import Operation 2 | from .bertscore_operation import BertScoreOperation 3 | from .checkembed_operation import CheckEmbedOperation 4 | from .selfcheckgpt_operation import SelfCheckGPT_Operation, SelfCheckGPT_BERT_Operation, SelfCheckGPT_NLI_Operation 5 | from .llm_as_a_judge_operation import LLMAsAJudgeOperation 6 | -------------------------------------------------------------------------------- /CheckEmbed/operations/bertscore_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import logging 10 | import os 11 | import json 12 | 13 | import bert_score 14 | import numpy as np 15 | from tqdm import tqdm 16 | from typing import Any 17 | from timeit import default_timer as timer 18 | 19 | from CheckEmbed.operations import Operation 20 | from CheckEmbed.utility import capture_specific_stderr, frobenius_norm_no_diag, matrix_std_dev_no_diag 21 | 22 | class BertScoreOperation(Operation): 23 | """ 24 | Operation that computes the BertScore between the reference and the sample embeddings. 25 | 26 | Inherits from the Operation class and implements its abstract methods. 27 | """ 28 | 29 | def __init__(self, result_dir_path: str, sample_dir_path: str) -> None: 30 | """ 31 | Initialize the operation. 32 | 33 | :param result_dir_path: The path to the directory where the results will be stored. 34 | :type result_dir_path: str 35 | :param sample_dir_path: The path to the directory where the samples are stored. 36 | :type sample_dir_path: str 37 | """ 38 | super().__init__(result_dir_path) 39 | self.sample_dir_path = sample_dir_path 40 | 41 | def execute(self, custom_inputs: Any) -> Any: 42 | """ 43 | Execute the operation on the embeddings/samples. 44 | 45 | :param custom_inputs: The custom inputs for the operation. 
46 | :type custom_inputs: any 47 | """ 48 | 49 | print("\n\nRunning BertScore operation.") 50 | time_performance = custom_inputs["time_performance"] 51 | 52 | # Initialize logging 53 | logging.basicConfig( 54 | filename=os.path.join(self.result_dir_path, "log.log"), 55 | filemode="w", 56 | format="%(name)s - %(levelname)s - %(message)s", 57 | level=custom_inputs["logging_level"], 58 | ) 59 | 60 | if time_performance: 61 | with open(os.path.join(self.sample_dir_path, "runtimes", "performance_log.log"), "a") as f: 62 | f.write("\n\nBERTScore operation\n") 63 | 64 | # Run BertScore for every pair of language model and samples 65 | performance_times = [] 66 | for lm_name in (pbar := tqdm(custom_inputs["lm_names"], desc="Language Models", leave=True)): 67 | pbar.set_postfix_str(f"{lm_name}") 68 | logging.info(f"Loading responses from {lm_name}.") 69 | samples = [] 70 | 71 | start = timer() if time_performance else None 72 | 73 | # Load samples from the language model 74 | with open(os.path.join(self.sample_dir_path, f"{lm_name}_samples.json")) as f: 75 | responses = json.load(f) 76 | 77 | for index, response in enumerate(responses["data"]): 78 | samples.append(response["samples"]) 79 | logging.debug(f"Sample {index}: {samples[index]}") 80 | 81 | logging.info("Loaded samples.") 82 | 83 | if custom_inputs["ground_truth"]: 84 | # Load definitions 85 | with open(os.path.join(self.sample_dir_path, "ground_truth.json")) as f: 86 | definitions = json.load(f) 87 | 88 | # Add definitions to the samples 89 | for index, sample in enumerate(samples): 90 | sample.append(definitions["ground_truth"][index]) 91 | samples[index] = sample 92 | 93 | # For every prompt compare every sample with every other sample 94 | logging.info(f"Running BertScore for {lm_name}.") 95 | 96 | same_samples = [] 97 | for sample in samples: 98 | same_s = [] 99 | for i in range(len(sample)): 100 | temp = [] 101 | for j in range(len(sample)): 102 | temp.append(sample[i]) 103 | same_s.append(temp) 104 | same_samples.append(same_s) 105 | 106 | results = [] 107 | for sample, same_sample in tqdm(zip(samples, same_samples), total=len(samples), desc="Prompts", leave=False): 108 | result = [] 109 | for s in tqdm(same_sample, desc="Samples", leave=False): 110 | with capture_specific_stderr(): 111 | result.append(bert_score.score( 112 | sample, s, model_type=custom_inputs["model_type"], 113 | batch_size=custom_inputs["batch_size"], device=custom_inputs["device"], 114 | lang="en", verbose=False, rescale_with_baseline=True, 115 | )[2].tolist()) 116 | results.append(result) 117 | logging.debug(f"Results: {result}") 118 | 119 | logging.info(f"Finished running BertScore for {lm_name}.") 120 | 121 | # Fix the results that are less than -1 122 | for index, result in enumerate(results): 123 | temp_res = np.zeros((len(result), len(result[0]))) 124 | for i in range(temp_res.shape[0]): 125 | for j in range(temp_res.shape[1]): 126 | if temp_res[i][j] < -1: 127 | temp_res[i][j] = -1 128 | else: 129 | temp_res[i][j] = result[i][j] 130 | results[index] = temp_res 131 | 132 | frobenius_norms = [frobenius_norm_no_diag(result[:-1,:-1], True) if custom_inputs["ground_truth"] 133 | else frobenius_norm_no_diag(result, True) for result in results] 134 | std_devs = [matrix_std_dev_no_diag(result[:-1,:-1]) if custom_inputs["ground_truth"] 135 | else matrix_std_dev_no_diag(result) for result in results] 136 | 137 | end = timer() if time_performance else None 138 | if time_performance: 139 | performance_times.append(end - start) 140 | with 
open(os.path.join(self.sample_dir_path, "runtimes", "performance_log.log"), "a") as f: 141 | f.write(f"\t - Time for {lm_name}: {end - start}\n") 142 | 143 | # Store results 144 | with open(os.path.join(self.result_dir_path, f"{lm_name}_bert.json"), "w") as f: 145 | results_json = [{ 146 | "index": i, 147 | "result": result.tolist(), 148 | "frobenius_norm": frob_norm, 149 | "std_dev": std_dev 150 | } for i, result, frob_norm, std_dev in zip(range(len(results)), results, frobenius_norms, std_devs)] 151 | json.dump({"data": results_json}, f, indent=4) 152 | 153 | logging.info(f"Saved results for {lm_name}.") 154 | 155 | if time_performance: 156 | with open(os.path.join(self.sample_dir_path, "runtimes", "performance_log.log"), "a") as f: 157 | f.write(f"\n\tTotal time: {sum(performance_times)}\n") 158 | -------------------------------------------------------------------------------- /CheckEmbed/operations/checkembed_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | import numpy as np 12 | 13 | from typing import Any 14 | from timeit import default_timer as timer 15 | 16 | from CheckEmbed.operations import Operation 17 | from CheckEmbed.utility import cosine_similarity, frobenius_norm_no_diag, matrix_std_dev_no_diag 18 | 19 | class CheckEmbedOperation(Operation): 20 | """ 21 | Operation that computes the cosine similarity, the Pearson correlation, the Frobenius norm and standard deviation between the embeddings. 22 | 23 | Inherits from the Operation class and implements its abstract methods. 24 | """ 25 | 26 | def __init__(self, result_dir_path: str, embeddings_dir_path: str) -> None: 27 | """ 28 | Initialize the operation. 29 | 30 | :param result_dir_path: The path to the directory where the results will be stored. 31 | :type result_dir_path: str 32 | :param embeddings_dir_path: The path to the directory where the embeddings are stored. 33 | :type embeddings_dir_path: str 34 | """ 35 | super().__init__(result_dir_path) 36 | self.embeddings_dir_path = embeddings_dir_path 37 | 38 | def execute(self, custom_inputs: Any) -> Any: 39 | """ 40 | Execute the operation on the embeddings/samples. 41 | 42 | :param custom_inputs: The custom inputs for the operation. 
43 | :type custom_inputs: Any 44 | """ 45 | time_performance = custom_inputs["time_performance"] 46 | 47 | performance_times = [] 48 | # For every language model / embedding model 49 | for file in os.listdir(self.embeddings_dir_path): 50 | if ".json" in file and not file.startswith("ground_truth_"): 51 | 52 | start = timer() if time_performance else None 53 | folder_name = file.replace("_" + file.split("_")[2], "") 54 | file_name_completion_for_ground_truth = file.replace(file.split("_")[0] + "_", "") 55 | 56 | # Load the samples embeddings 57 | with open(os.path.join(self.embeddings_dir_path, file), "r") as f: 58 | data = json.load(f) 59 | data_array = data["data"] 60 | embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 61 | 62 | # Load the definitions embeddings 63 | dimensions = len(embeddings[0]) 64 | if custom_inputs["ground_truth"]: 65 | with open(os.path.join(self.embeddings_dir_path, "ground_truth_" + file_name_completion_for_ground_truth), "r") as f: 66 | definitions = json.load(f) 67 | definitions = definitions["data"] 68 | definitions_embedded = [d["embeddings"] for d in definitions] 69 | 70 | for index, embedding in enumerate(embeddings): 71 | new_embedding = embedding 72 | if len(definitions_embedded[index]) > 0: 73 | new_embedding.append(definitions_embedded[index]) 74 | embeddings[index] = new_embedding 75 | 76 | dimensions += 1 77 | 78 | # Compute the cosine similarity matrix 79 | cosine_similarity_matrix_array = [] 80 | for index, embedding in enumerate(embeddings): 81 | # -1 array to initialize the cosine similarity matrix 82 | cosine_similarity_matrix = np.full((dimensions, dimensions), -1.0) 83 | for i in range(len(embedding)): 84 | for j in range(len(embedding)): 85 | cosine_similarity_matrix[i, j] = cosine_similarity(embedding[i], embedding[j], custom_inputs["rebase_results"], file.split("_")[1]) 86 | 87 | cosine_similarity_matrix_array.append(cosine_similarity_matrix) 88 | 89 | # Compute the frobenius norm of each cosine similarity matrix 90 | frobenius_norms_cosine_sim = [frobenius_norm_no_diag(cosine_similarity_matrix[:-1,:-1]) if custom_inputs["ground_truth"] 91 | else frobenius_norm_no_diag(cosine_similarity_matrix) 92 | for cosine_similarity_matrix in cosine_similarity_matrix_array] 93 | 94 | # Compute the standard deviation of each cosine similarity matrix 95 | std_dev_cosine_sim_array = [matrix_std_dev_no_diag(cosine_similarity_matrix[:-1,:-1]) if custom_inputs["ground_truth"] 96 | else matrix_std_dev_no_diag(cosine_similarity_matrix) 97 | for cosine_similarity_matrix in cosine_similarity_matrix_array] 98 | 99 | # Compute the Pearson correlation matrix 100 | pearson_corr_array = [] 101 | for index, embedding in enumerate(embeddings): 102 | pearson_corr = np.full((dimensions, dimensions), -1.0) 103 | for i in range(len(embedding)): 104 | for j in range(len(embedding)): 105 | if len(embedding[i]) == 0 and len(embedding[j]) == 0: 106 | pearson_corr[i, j] = 1.0 107 | continue 108 | if len(embedding[i]) == 0 or len(embedding[j]) == 0: 109 | pearson_corr[i, j] = -1.0 110 | continue 111 | pearson_corr[i, j] = np.corrcoef(embedding[i], embedding[j])[0, 1] 112 | 113 | pearson_corr_array.append(pearson_corr) 114 | 115 | # Compute the Frobenius norm of each Pearson correlation matrix 116 | frobenius_norms_pearson_corr = [frobenius_norm_no_diag(pearson_corr[:-1,:-1]) if custom_inputs["ground_truth"] 117 | else frobenius_norm_no_diag(pearson_corr) 118 | for pearson_corr in pearson_corr_array] 119 | 120 | std_dev_pearson_corr_array = 
[matrix_std_dev_no_diag(pearson_corr[:-1,:-1]) if custom_inputs["ground_truth"] 121 | else matrix_std_dev_no_diag(pearson_corr) 122 | for pearson_corr in pearson_corr_array] 123 | 124 | end = timer() if time_performance else None 125 | if time_performance: 126 | performance_times.append({folder_name: end - start}) 127 | 128 | # Store the results 129 | with open(os.path.join(self.result_dir_path, folder_name + "_results.json"), "w") as f: 130 | results_json = [{ 131 | "index": index, 132 | "cosine_sim": cosine_sim.tolist(), 133 | "frob_norm_cosine_sim": frob_norm_cosine_sim, 134 | "std_dev_cosine_sim": std_dev_cosine_sim, 135 | "pearson_corr": pearson_corr.tolist(), 136 | "frob_norm_pearson_corr": frob_norm_pearson_corr, 137 | "std_dev_pearson_corr": std_dev_pearson_corr 138 | } for index, cosine_sim, frob_norm_cosine_sim, std_dev_cosine_sim, pearson_corr, frob_norm_pearson_corr, std_dev_pearson_corr 139 | in zip(range(len(cosine_similarity_matrix_array)), cosine_similarity_matrix_array, frobenius_norms_cosine_sim, std_dev_cosine_sim_array, pearson_corr_array, frobenius_norms_pearson_corr, std_dev_pearson_corr_array)] 140 | json.dump({"data": results_json}, f, indent=4) 141 | 142 | if time_performance: 143 | # Reorder the performance times first on embedding and then on language model names 144 | performance_times.sort(key=lambda x: (list(x.keys())[0].split("_")[1], list(x.keys())[0].split("_")[0])) 145 | with open(os.path.join(self.result_dir_path, "../runtimes", "performance_log.log"), "a") as f: 146 | f.write("\n\nCheckEmbed operation:\n") 147 | for time in performance_times: 148 | time_key = list(time.keys())[0] 149 | time_value = list(time.values())[0] 150 | formatted_string = f"\t - Time for {time_key.split('_')[0]:<10} {time_key.split('_')[1]:>15}: {time_value}\n" 151 | f.write(formatted_string) 152 | 153 | 154 | -------------------------------------------------------------------------------- /CheckEmbed/operations/llm_as_a_judge_operation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | import os 11 | from typing import Any 12 | 13 | from langchain.prompts import PromptTemplate 14 | from pydantic import BaseModel, Field 15 | 16 | from CheckEmbed.language_models import ChatGPT 17 | from CheckEmbed.operations import Operation 18 | 19 | 20 | class Score(BaseModel): 21 | score: int = Field(description="The score from 0 to 100") 22 | 23 | class LLMAsAJudgeOperation(Operation): 24 | """ 25 | Operation that computes the hallucination score of an answer using a language model as a judge. 26 | 27 | Inherits from the Operation class and implements its abstract methods. 28 | """ 29 | 30 | def __init__(self, result_dir_path: str, answer_dir_path: str, prompt_template: PromptTemplate, original: str = None, original_position: int = 0, reference_txt: str = None) -> None: 31 | """ 32 | Initialize the operation. 33 | 34 | :param result_dir_path: The path to the directory where the results will be stored. 35 | :type result_dir_path: str 36 | :param answer_dir_path: The path to the directory where the answers are stored. 37 | :type answer_dir_path: str 38 | :param prompt_template: The prompt template to be used for the language model. 39 | :type prompt_template: PromptTemplate 40 | :param original: The original data. 
Defaults to None. 41 | :type original: str 42 | :param original_position: The position of the original data in the prompt template. Defaults to 0. 43 | :type original_position: int 44 | :param reference_txt: Reference text for the comparison. Defaults to None. 45 | :type reference_txt: str 46 | """ 47 | super().__init__(result_dir_path) 48 | self.answer_dir_path = answer_dir_path 49 | self.prompt_template = prompt_template 50 | self.original = original 51 | self.original_position = original_position 52 | self.reference_txt = reference_txt 53 | 54 | def execute(self, custom_inputs: Any) -> None: 55 | """ 56 | Execute the operation on the embeddings/samples. 57 | 58 | :param custom_inputs: The custom inputs for the operation. 59 | :type custom_inputs: Any 60 | """ 61 | model = custom_inputs["model"] 62 | if not isinstance(model, ChatGPT): 63 | model.add_structured_output(Score) 64 | 65 | original_data = None 66 | if self.original is not None: 67 | if self.original.endswith(".json"): 68 | with open(self.result_dir_path + self.original, "r") as f: 69 | original_data = json.load(f)["data"] 70 | 71 | # For every language model 72 | for file in os.listdir(self.answer_dir_path): 73 | if "samples.json" in file and not file.startswith("ground_truth_"): 74 | 75 | if self.original is not None and original_data is None: 76 | with open(self.result_dir_path + "/" + self.original + f"/{file.split('_')[0]}_original.json", "r") as f: 77 | original_data = json.load(f)["data"] 78 | 79 | name = model.name + "_" + file.split("_")[0] 80 | if name.startswith("gpt4-o"): 81 | name = name[6:] 82 | name = "4o" + name 83 | 84 | # Load the samples 85 | with open(os.path.join(self.answer_dir_path, file), "r") as f: 86 | data = json.load(f) 87 | data_array = data["data"] 88 | samples = [d["samples"] for d in data_array] 89 | 90 | inputs = self.prompt_template.input_variables 91 | 92 | results = [] 93 | if self.original is not None: 94 | for i, sample in enumerate(samples): 95 | prep = {} 96 | for j, input in enumerate(inputs): 97 | if j == self.original_position: 98 | prep[input] = original_data[i] 99 | else: 100 | prep[input] = sample[j] 101 | 102 | final_prompt = self.prompt_template.invoke(prep) 103 | if isinstance(model, ChatGPT): 104 | final_prompt = final_prompt.text 105 | result = model.query(final_prompt) 106 | if not isinstance(result, Score): 107 | result = model.get_response_texts(result)[0] 108 | else: 109 | result = result.score 110 | 111 | results.append(result) 112 | else: 113 | for sample in samples: 114 | prep = {} 115 | for i, input in enumerate(inputs): 116 | prep[input] = sample[i] 117 | 118 | final_prompt = self.prompt_template.invoke(prep) 119 | if isinstance(model, ChatGPT): 120 | final_prompt = final_prompt.text 121 | result = model.query(final_prompt) 122 | if not isinstance(result, Score): 123 | result = model.get_response_texts(result)[0] 124 | else: 125 | result = result.score 126 | 127 | results.append(result) 128 | 129 | # Store the results 130 | if self.reference_txt is not None: 131 | with open(os.path.join(self.result_dir_path, name + "_judge_ref.json"), "w") as f: 132 | json.dump({"data": results}, f, indent=4) 133 | return 134 | with open(os.path.join(self.result_dir_path, name + "_judge.json"), "w") as f: 135 | json.dump({"data": results}, f, indent=4) 136 | -------------------------------------------------------------------------------- /CheckEmbed/operations/operations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH 
Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Any 11 | 12 | class Operation(ABC): 13 | """ 14 | Abstract base class that defines the interface for all operations to be performed on the embeddings/samples. 15 | """ 16 | 17 | def __init__(self, result_dir_path: str) -> None: 18 | """ 19 | Initialize the operation. 20 | 21 | :param result_dir_path: The path to the directory where the results will be stored. 22 | :type result_dir_path: str 23 | """ 24 | self.result_dir_path = result_dir_path 25 | 26 | @abstractmethod 27 | def execute(self, custom_inputs: Any = None) -> Any: 28 | """ 29 | Execute the operation on the embeddings/samples. 30 | 31 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 32 | :type custom_inputs: Any 33 | """ 34 | pass 35 | -------------------------------------------------------------------------------- /CheckEmbed/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .parser import Parser -------------------------------------------------------------------------------- /CheckEmbed/parser/parser.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import ABC, abstractmethod 10 | from typing import Any, List, Union 11 | 12 | from PIL.Image import Image 13 | 14 | 15 | class Parser(ABC): 16 | """ 17 | Abstract base class that defines the interface for parsing. 18 | 19 | The class supports the following functionality: 20 | - take the raw data from a dataset and create the necessary prompts for the model 21 | - extract the ground truth 22 | - custom parsing of the model responses 23 | """ 24 | 25 | def __init__(self, dataset_path: str) -> None: 26 | """ 27 | Initialize the parser. 28 | 29 | :param dataset_path: The path to the dataset. 30 | :type dataset_path: str 31 | """ 32 | self.dataset_path = dataset_path 33 | 34 | @abstractmethod 35 | def prompt_generation(self, custom_inputs: Any = None) -> List[str]: 36 | """ 37 | Parse the dataset and generate the prompts for the model. 38 | 39 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 40 | :type custom_inputs: Any 41 | :return: List of prompts. 42 | :rtype: List[str] 43 | """ 44 | pass 45 | 46 | @abstractmethod 47 | def ground_truth_extraction(self, custom_inputs: Any = None) -> List[str]: 48 | """ 49 | Parse the dataset and extract the ground truth. 50 | 51 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 52 | :type custom_inputs: Any 53 | :return: List of ground truths. 54 | :rtype: List[str] 55 | """ 56 | pass 57 | 58 | def answer_parser(self, responses: List[List[Union[str, Image]]], custom_inputs: Any = None) -> List[List[Union[str, Image]]]: 59 | """ 60 | Parse the responses from the model. 61 | 62 | The default behavior is to return the responses as they are. 63 | Overwrite this method if you want to parse the responses in a different way. You can use the CustomParser 64 | classes in the examples folder as reference. 
65 | 66 | Remember that the responses returned from this method will be stored in a file and used for the evaluation, 67 | so please follow the following format, when returning the responses: 68 | [ 69 | [response1_prompt1, response2_prompt1, ...], 70 | [response1_prompt2, response2_prompt2, ...], 71 | ... 72 | ] 73 | 74 | :param responses: The responses from the model. 75 | :type responses: List[List[Union[str, Image]]] 76 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 77 | :type custom_inputs: Any 78 | :return: The parsed responses. 79 | :rtype: List[List[Union[str, Image]]] 80 | """ 81 | return responses 82 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/README.md: -------------------------------------------------------------------------------- 1 | ## Plotters 2 | 3 | Old plotters, they need to be updated to the new version of the library. -------------------------------------------------------------------------------- /CheckEmbed/plotters/__init__.py: -------------------------------------------------------------------------------- 1 | from .plot_operations import PlotOperation 2 | from .bert_plot import BertPlot 3 | from .checkembed_plot import CheckEmbedPlot 4 | from .raw_embedding_heatmap import RawEmbeddingHeatPlot 5 | from .selfcheckgpt_plot import SelfCheckGPTPlot 6 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/bert_plot.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | from typing import Any 15 | 16 | from CheckEmbed.plotters import PlotOperation 17 | 18 | class BertPlot(PlotOperation): 19 | """ 20 | The BertPlot class handles the plotting of BERTScore data. 21 | 22 | Inherits from the PlotOperation class and implements its abstract methods. 23 | """ 24 | 25 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 26 | """ 27 | Initialize the operation. 28 | 29 | :param result_dir_path: The path to the directory where the results will be stored. 30 | :type result_dir_path: str 31 | :param data_dir_path: The path to the directory where the data is stored. 32 | :type data_dir_path: str 33 | """ 34 | super().__init__(result_dir_path, data_dir_path) 35 | 36 | def execute(self, custom_inputs: Any = None) -> Any: 37 | """ 38 | Plot the data. 39 | 40 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 
41 | :type custom_inputs: Any 42 | """ 43 | print("Running BertPlot operation.") 44 | 45 | for file in os.listdir(self.data_dir_path): 46 | if ".json" in file: 47 | 48 | if not os.path.exists(os.path.join(self.result_dir_path, file.split("_")[0])): 49 | os.mkdir(os.path.join(self.result_dir_path, file.split("_")[0])) 50 | 51 | with open(os.path.join(self.data_dir_path, file), "r") as f: 52 | data = json.load(f) 53 | 54 | data_array = data["data"] 55 | results = [np.array(d["result"]) for d in data_array] 56 | frobenius_norms = [np.array(d["frobenius_norm"]) for d in data_array] 57 | 58 | # Plot a separate heatmap for every example 59 | for index, result in enumerate(results): 60 | fig, ax = plt.subplots(figsize=(12, 10)) # Adjust the figure size as needed 61 | 62 | im = ax.imshow(result, cmap='YlGnBu', interpolation='nearest', aspect="auto", vmin=-1, vmax=1) 63 | plt.colorbar(im, ax=ax) # Use ax argument to specify the axis for the colorbar 64 | 65 | plt.title(f"Heatmap of BertScore of Example {index}", weight='bold', fontsize=26) # Add a title with index starting from 1 66 | plt.xlabel("LLM Reply ID or Ground-Truth (GT)", fontsize=18) 67 | plt.ylabel("LLM Reply ID or Ground-Truth (GT)", fontsize=18) 68 | 69 | # Set ticks and labels 70 | tick_labels = list(range(1, result.shape[0])) + ['GT'] if custom_inputs["ground_truth"] else list(range(1, result.shape[0] + 1)) 71 | ax.set_xticks(np.arange(result.shape[0])) 72 | ax.set_yticks(np.arange(result.shape[0])) 73 | ax.set_xticklabels(tick_labels, fontsize=18) 74 | ax.set_yticklabels(tick_labels, fontsize=18) 75 | 76 | # Add numbers to the heatmap 77 | for i in range(result.shape[0]): 78 | for j in range(result.shape[0]): 79 | text = ax.text(j, i, round(result[i, j], 2), ha="center", va="center", color="red", fontsize=18) 80 | 81 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], f"example_{index}.pdf"), bbox_inches='tight') 82 | plt.close() 83 | 84 | # Plot the Frobenius norm of the cosine similarity matrices 85 | fig, ax = plt.subplots() 86 | ax.bar(range(len(frobenius_norms)), frobenius_norms) 87 | ax.set_xlabel("Prompt") 88 | ax.set_ylabel("Frobenius Norm") 89 | ax.set_title("Frobenius Norm of BertScore Matrices") 90 | 91 | tick_labels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] 92 | ax.set_yticks(tick_labels) 93 | ax.set_yticklabels(tick_labels) 94 | tick_labels = list(range(1, len(frobenius_norms) + 1)) 95 | ax.set_xticks(np.arange(len(frobenius_norms))) 96 | ax.set_xticklabels(tick_labels) 97 | 98 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], "frobenius_norm.pdf"), bbox_inches='tight') 99 | plt.close() 100 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/plot_operations.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from abc import abstractmethod 10 | from typing import Any 11 | 12 | from CheckEmbed.operations import Operation 13 | 14 | class PlotOperation(Operation): 15 | """ 16 | Abstract base class that defines the interface for all operations that plot data. 17 | """ 18 | 19 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 20 | """ 21 | Initialize the operation. 
22 | 23 | :param result_dir_path: The path to the directory where the results will be stored. 24 | :type result_dir_path: str 25 | :param data_dir_path: The path to the directory where the data is stored. 26 | :type data_dir_path: str 27 | """ 28 | super().__init__(result_dir_path) 29 | self.data_dir_path = data_dir_path 30 | 31 | @abstractmethod 32 | def execute(self, custom_inputs: Any = None) -> None: 33 | """ 34 | Plot the data. 35 | 36 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 37 | :type custom_inputs: Any 38 | """ 39 | pass -------------------------------------------------------------------------------- /CheckEmbed/plotters/raw_embedding_heatmap.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from typing import Any 15 | 16 | from CheckEmbed.plotters import PlotOperation 17 | 18 | class RawEmbeddingHeatPlot(PlotOperation): 19 | """ 20 | The RawEmbeddingHeatPlot class handles the plotting of the raw embedding data. 21 | 22 | Inherits from the PlotOperation class and implements its abstract methods. 23 | """ 24 | 25 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 26 | """ 27 | Initialize the operation. 28 | 29 | :param result_dir_path: The path to the directory where the results will be stored. 30 | :type result_dir_path: str 31 | :param data_dir_path: The path to the directory where the data is stored. 32 | :type data_dir_path: str 33 | """ 34 | super().__init__(result_dir_path, data_dir_path) 35 | 36 | def execute(self, custom_inputs: Any = None) -> Any: 37 | """ 38 | Plot the data. 39 | 40 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 
41 | :type custom_inputs: Any 42 | """ 43 | print("Running RawEmbeddingHeatPlot operation.") 44 | 45 | for file in os.listdir(self.data_dir_path): 46 | if ".json" in file and not file.startswith("ground_truth_"): 47 | 48 | folder_name = file.replace("_" + file.split("_")[2], "") 49 | file_name_completion_for_ground_truth = file.replace(file.split("_")[0] + "_", "") 50 | 51 | # Directory creation 52 | if not os.path.exists(os.path.join(self.result_dir_path, folder_name)): 53 | os.mkdir(os.path.join(self.result_dir_path, folder_name)) 54 | 55 | if not os.path.exists(os.path.join(self.result_dir_path, folder_name, "raw_embeddings_heat_map")): 56 | os.mkdir(os.path.join(self.result_dir_path, folder_name, "raw_embeddings_heat_map")) 57 | 58 | # Load the sample embeddings 59 | with open(os.path.join(self.data_dir_path, file), "r") as f: 60 | data = json.load(f) 61 | data_array = data["data"] 62 | embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 63 | # Remove empty ones inside embedding 64 | for embedding in embeddings: 65 | new_embedding = [] 66 | for index, emb in enumerate(embedding): 67 | if len(emb) == 0: 68 | continue 69 | new_embedding.append(emb) 70 | embeddings[embeddings.index(embedding)] = new_embedding 71 | embeddings = [np.array(embedding) for embedding in embeddings] 72 | 73 | # Load the definition embeddings 74 | if custom_inputs["ground_truth"]: 75 | with open(os.path.join(self.data_dir_path, "ground_truth_" + file_name_completion_for_ground_truth), "r") as f: 76 | definitions = json.load(f) 77 | definitions = definitions["data"] 78 | definitions_embedded = [np.array(d["embeddings"]) for d in definitions] 79 | 80 | for index, embedding in enumerate(embeddings): 81 | if len(embedding) == 0: 82 | continue 83 | embedding = np.vstack([embedding, definitions_embedded[index].reshape(1, -1)]) if len(definitions_embedded[index]) != 0 else embedding 84 | embeddings[index] = embedding 85 | 86 | # Find the min and max values for the colorbar 87 | min_value = float('inf') 88 | max_value = float('-inf') 89 | 90 | for embedding in embeddings: 91 | if len(embedding) == 0: 92 | continue 93 | min_value = min(min_value, np.min(embedding)) 94 | max_value = max(max_value, np.max(embedding)) 95 | 96 | # Plot each heatmap 97 | for index, embedding in enumerate(embeddings): 98 | if len(embedding) == 0: 99 | continue 100 | fig, ax = plt.subplots(figsize=(12, 6)) # Adjust the figure size as needed 101 | 102 | im = ax.imshow(embedding, cmap='YlGnBu', interpolation='nearest', aspect="auto", vmin=min_value, vmax=max_value) 103 | plt.colorbar(im, ax=ax) # Use ax argument to specify the axis for the colorbar 104 | 105 | plt.title(f"Heatmap of Example {index}", weight='bold', fontsize=26) # Add a title with index starting from 1 106 | plt.xlabel("i-th element of the embedded answers", fontsize=18) 107 | plt.ylabel("Embedded Answers", fontsize=18) 108 | 109 | # Set ticks and labels 110 | tick_labels = list(range(1, embedding.shape[0])) + ['GT'] if custom_inputs["ground_truth"] and len(definitions_embedded[index]) > 0 else list(range(1, embedding.shape[0] + 1)) 111 | ax.set_yticks(np.arange(embedding.shape[0])) 112 | ax.set_yticklabels(tick_labels, fontsize=18) 113 | 114 | plt.savefig(os.path.join(self.result_dir_path, folder_name, "raw_embeddings_heat_map", f"raw_embeddings_heat_map_{index}.pdf"), bbox_inches='tight') 115 | plt.close() 116 | -------------------------------------------------------------------------------- /CheckEmbed/plotters/selfcheckgpt_plot.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import matplotlib.pyplot as plt 13 | from typing import Any 14 | 15 | from CheckEmbed.plotters import PlotOperation 16 | 17 | class SelfCheckGPTPlot(PlotOperation): 18 | """ 19 | The SelfCheckGPTPlot class handles the plotting of SelfCheckGPT data. 20 | 21 | Inherits from the PlotOperation class and implements its abstract methods. 22 | """ 23 | 24 | def __init__(self, result_dir_path: str, data_dir_path: str) -> None: 25 | """ 26 | Initialize the operation. 27 | 28 | :param result_dir_path: The path to the directory where the results will be stored. 29 | :type result_dir_path: str 30 | :param data_dir_path: The path to the directory where the data is stored. 31 | :type data_dir_path: str 32 | """ 33 | super().__init__(result_dir_path, data_dir_path) 34 | 35 | def execute(self, custom_inputs: Any = None) -> Any: 36 | """ 37 | Plot the data. 38 | 39 | :param custom_inputs: The custom inputs for the operation. Defaults to None. 40 | :type custom_inputs: Any 41 | """ 42 | print("Running SelfCheckGPTPlot operation.") 43 | 44 | for file in os.listdir(self.data_dir_path): 45 | if ".json" in file: 46 | 47 | if not os.path.exists(os.path.join(self.result_dir_path, file.split("_")[0])): 48 | os.mkdir(os.path.join(self.result_dir_path, file.split("_")[0])) 49 | 50 | with open(os.path.join(self.data_dir_path, file), "r") as f: 51 | data = json.load(f) 52 | 53 | data_array = data["data"] 54 | results = [d["result"] for d in data_array] 55 | passage_scores = [d["passage_score"] for d in data_array] 56 | 57 | # Bar plot for every one of the examples 58 | for index, result in enumerate(results): 59 | if len(result) == 0: 60 | continue 61 | fig, ax = plt.subplots() 62 | ax.bar(range(len(result)), result) 63 | 64 | # Set ticks from 0 to 1 65 | ticks = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] 66 | 67 | ax.set_xticks(range(len(result))) 68 | ax.set_xticklabels(range(len(result))) 69 | ax.set_yticks(ticks) 70 | ax.set_yticklabels(ticks) 71 | ax.set_xlabel("Sentence") 72 | ax.set_ylabel("SelfCheckGPT Sentence Score") 73 | ax.set_title(f"SelfCheckGPT Score for Prompt {int(index)}") 74 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], f"prompt_{int(index)}.pdf"), bbox_inches='tight') 75 | plt.close() 76 | 77 | # Bar plot for the passage scores 78 | fig, ax = plt.subplots() 79 | ax.bar(range(len(passage_scores)), passage_scores) 80 | ax.set_xticks(range(len(passage_scores))) 81 | ax.set_xticklabels(range(len(passage_scores))) 82 | ax.set_xlabel("Prompt") 83 | ax.set_ylabel("SelfCheckGPT Passage Score") 84 | ax.set_title("SelfCheckGPT Score for Passages") 85 | plt.savefig(os.path.join(self.result_dir_path, file.split("_")[0], "passage_scores.pdf"), bbox_inches='tight') 86 | plt.close() 87 | -------------------------------------------------------------------------------- /CheckEmbed/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import Scheduler, StartingPoint -------------------------------------------------------------------------------- /CheckEmbed/utility/__init__.py: -------------------------------------------------------------------------------- 1 | from 
.stderr_filter import capture_specific_stderr 2 | from .utility import cosine_similarity, frobenius_norm, frobenius_norm_no_diag, matrix_std_dev_no_diag 3 | -------------------------------------------------------------------------------- /CheckEmbed/utility/stderr_filter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import contextlib 10 | import sys 11 | import tempfile 12 | import os 13 | 14 | from typing import Generator, List, Union, Any 15 | 16 | class FilteredStderr: 17 | """ 18 | A class that captures and filters stderr output. 19 | 20 | The class creates a temporary file to capture the stderr stream and filters the stream based on target string(s). 21 | """ 22 | 23 | def __init__(self, target_string: Union[List[str], str]) -> None: 24 | """ 25 | Initializes a FilteredStderr instance. 26 | 27 | :param target_string: Target string(s) for filtering stderr. 28 | :type target_string: Union[List[str], str] 29 | """ 30 | self.target_string = target_string 31 | self.captured = "" 32 | self.original_stderr_fd = None 33 | self.temp_fd = None 34 | self.temp_file = None 35 | 36 | def start(self) -> None: 37 | """ 38 | Start capturing stderr and redirecting the stream to a temporary file. 39 | """ 40 | # Save the original stderr file descriptor 41 | self.original_stderr_fd = os.dup(2) 42 | # Create a temporary file and file descriptor to capture stderr 43 | self.temp_file = tempfile.TemporaryFile(mode='w+') 44 | self.temp_fd = self.temp_file.fileno() 45 | # Redirect stderr to the temporary file 46 | os.dup2(self.temp_fd, 2) 47 | 48 | def stop(self) -> None: 49 | """ 50 | Stop capturing stderr. Filter the stream for the target string(s) and restore the original stderr file descriptor. 51 | """ 52 | # Restore the original stderr file descriptor 53 | os.dup2(self.original_stderr_fd, 2) 54 | os.close(self.original_stderr_fd) 55 | self.original_stderr_fd = None 56 | # Read the captured output 57 | self.temp_file.seek(0) 58 | output = self.temp_file.read() 59 | self.temp_file.close() 60 | self.temp_fd = None 61 | self.temp_file = None 62 | # Filter the output 63 | for line in output.splitlines(): 64 | # target string can be an array 65 | if isinstance(self.target_string, str): 66 | self.target_string = [self.target_string] 67 | 68 | captured = False 69 | for target in self.target_string: 70 | if target in line: 71 | self.captured += line + "\n" 72 | captured = True 73 | break 74 | 75 | if not captured: 76 | sys.__stderr__.write(line + "\n") 77 | 78 | 79 | @contextlib.contextmanager 80 | def capture_specific_stderr(custom_target: Union[List[str], str] = None) -> Generator[FilteredStderr, None, None]: 81 | """ 82 | Context manager that captures and filters the stderr stream. 83 | 84 | :param custom_target: Target string(s) for filtering stderr. If None, default target strings are loaded from a file. 85 | :type custom_target: Union[List[str], str] 86 | :return: A FilteredStderr instance, which can be used to access the captured stderr stream. 
87 | :rtype: Generator[FilteredStderr, None, None] 88 | """ 89 | # load default target strings from file 90 | if custom_target is None: 91 | with open(os.path.join(os.path.dirname(__file__), "./stderr_filter.txt"), "r") as file: 92 | custom_target = file.read().splitlines() 93 | 94 | filtered_stderr = FilteredStderr(custom_target) 95 | filtered_stderr.start() 96 | try: 97 | yield filtered_stderr 98 | finally: 99 | filtered_stderr.stop() 100 | -------------------------------------------------------------------------------- /CheckEmbed/utility/stderr_filter.txt: -------------------------------------------------------------------------------- 1 | Empty candidate sentence detected; setting raw BERTscores to 0 2 | Warning: Empty reference sentence detected; setting raw BERTScores to 0 3 | Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight'] 4 | You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. 5 | `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 6 | For more details check this issue: https://github.com/huggingface/transformers/issues/31884 7 | warnings.warn( -------------------------------------------------------------------------------- /CheckEmbed/utility/utility.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import numpy as np 10 | import math 11 | 12 | # These values are the lowest empiricial values observed for a given 13 | # embedding model during our evaluation. 14 | REBASING_VALUES = { 15 | "gpt-embedding-large": 0.36156142737003805, 16 | "sfr-embedding-mistral": 0.4590856938212389, 17 | "e5-mistral-7B-instruct": 0.5347691513588488, 18 | "gte-qwen1.5-7B-instruct": 0.17701296296393593, 19 | "stella-en-400M-v5": 0.3189337589450308, 20 | "stella-en-1.5B-v5": 0.3655769126487221, 21 | } 22 | 23 | def cosine_similarity(a: np.ndarray, b: np.ndarray, rebase: bool = False, emb_name: str = "") -> float: 24 | """ 25 | Compute cosine similarity between two vectors. 26 | 27 | :param a: The first vector. 28 | :type a: np.ndarray 29 | :param b: The second vector. 30 | :type b: np.ndarray 31 | :param rebase: Whether to rebase the cosine similarity. Defaults to False. 32 | :type rebase: bool 33 | :param emb_name: The name of the embedding model. Defaults to "". 34 | :type emb_name: str 35 | :return: The cosine similarity between the two vectors. 
36 | :rtype: float 37 | """ 38 | global REBASING_VALUES 39 | 40 | # Special case for empty vectors 41 | if len(a) == 0 and len(b) == 0: 42 | return 1.0 43 | if len(a) == 0 or len(b) == 0: 44 | return -1.0 45 | 46 | # Compute the cosine similarity 47 | dot_product = np.dot(a, b) 48 | norm_a = np.linalg.norm(a) 49 | norm_b = np.linalg.norm(b) 50 | cos_similarity = dot_product / (norm_a * norm_b) 51 | 52 | if rebase and emb_name in REBASING_VALUES: 53 | # Rebase the cosine similarity 54 | cos_similarity = 2 * (cos_similarity - REBASING_VALUES[emb_name]) / (1.0 - REBASING_VALUES[emb_name]) - 1.0 55 | cos_similarity = 1.0 if cos_similarity > 1.0 else -1.0 if cos_similarity < -1.0 else cos_similarity 56 | 57 | return cos_similarity 58 | 59 | def frobenius_norm(matrix: np.ndarray, bert: bool = False) -> float: 60 | """ 61 | Compute the Frobenius norm of the input matrix normalized by the number of elements in the matrix. 62 | 63 | :param matrix: Input matrix. 64 | :type matrix: np.ndarray 65 | :param bert: Whether the matrix is a BertScore matrix. Defaults to False. 66 | :type bert: bool 67 | :return: Frobenius norm. 68 | :rtype: float 69 | """ 70 | adder = 1 71 | div = 4 72 | if bert: 73 | adder = 0 74 | div = 1 75 | sum = 0 76 | for i in range(matrix.shape[0]): 77 | for j in range(matrix.shape[1]): 78 | sum += (matrix[i, j] + adder) ** 2 79 | 80 | # normalize by the number of elements in the matrix 81 | return math.sqrt(sum / (matrix.shape[0] * matrix.shape[1] * div)) 82 | 83 | def frobenius_norm_no_diag(matrix: np.ndarray, bert: bool = False) -> float: 84 | """ 85 | Compute the Frobenius norm of the input matrix without its diagonal elements. 86 | The Frobenius is further normalized by the number of elements in the matrix. 87 | 88 | :param matrix: Input matrix. 89 | :type matrix: np.ndarray 90 | :param bert: Whether the matrix is a BertScore matrix. Defaults to False. 91 | :type bert: bool 92 | :return: Frobenius norm. 93 | :rtype: float 94 | """ 95 | matrix_no_diag = matrix[~np.eye(matrix.shape[0],dtype=bool)].reshape(matrix.shape[0],-1) 96 | return frobenius_norm(matrix_no_diag, bert) 97 | 98 | def matrix_std_dev_no_diag(matrix: np.ndarray) -> float: 99 | """ 100 | Compute the standard deviation of the input matrix without its diagonal elements. 101 | 102 | :param matrix: Input matrix. 103 | :type matrix: np.ndarray 104 | :return: Standard deviation. 105 | :rtype: float 106 | """ 107 | matrix_no_diag = matrix[~np.eye(matrix.shape[0],dtype=bool)].reshape(matrix.shape[0],-1) 108 | return np.std(matrix_no_diag) 109 | -------------------------------------------------------------------------------- /CheckEmbed/vision_models/README.md: -------------------------------------------------------------------------------- 1 | # Vision Models 2 | 3 | The Vision Models module is responsible for managing the vision models. 4 | 5 | Currently, the framework supports the following vision model: 6 | 7 | - stabilityai/stable-diffusion-3.5-medium (local - GPU with 12GB VRAM recommended, model size is roughly 6GB ) 8 | 9 | The following sections describe how to instantiate the model and how to add new models to the framework. 10 | 11 | ## Vision Model Instantiation 12 | 13 | If your model needs a configuration file, follow these steps: 14 | 15 | - Create a copy of `config_template.json` named `config.json` in the CheckEmbed folder. (Not necessary for local models) 16 | - Fill configuration details based on the used model. 
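As a point of reference, a hypothetical `config.json` entry for a remote vision model could look as follows. Only the `name` field is read by `AbstractVisionModel` itself; every other key shown here (model identifier, credentials, cost figure) is an assumption that a concrete subclass would have to interpret in its own constructor.

```json
{
    "my-remote-vision-model": {
        "model_id": "vendor/official-model-name",
        "name": "my-remote-vision-model",
        "prompt_token_cost": 0.0,
        "api_key": ""
    }
}
```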
17 | 18 | ### Local Models 19 | 20 | The framework currently supports the following local model: `stabilityai/stable-diffusion-3.5-medium`. 21 | 22 | - Instantiate the vision model based on the owned device. 23 | - Device can be specified in the `Scheduler`, more [here](/CheckEmbed/scheduler/scheduler.py). 24 | 25 | ```python 26 | stable_diffusion = vision_models.StableDiffusion3( 27 | model_name = "stabilityai/stable-diffusion-3.5-medium", 28 | cache = False, 29 | data_type = torch.bfloat16, 30 | num_inference_steps = 40, 31 | guidance_scale = 4.5 32 | ) 33 | ``` 34 | 35 | ## Adding Vision Models 36 | 37 | More vision models can be added by following these steps: 38 | 39 | - Create new class as a subclass of `AbstractVisionModel`. 40 | - Use the constructor for loading the configuration and instantiating the vision model (if needed). 41 | 42 | ```python 43 | class CustomVisionModel(AbstractVisionModel): 44 | def __init__( 45 | self, 46 | config_path: str = "", 47 | model_name: str = "official model-name", 48 | name: str = "CustomVisionModel", 49 | cache: bool = False 50 | ) -> None: 51 | super().__init__(config_path, model_name, name, cache) 52 | self.config: Dict = self.config[model_name] 53 | 54 | # Load data from configuration into variables if needed 55 | 56 | # Instantiate model if needed 57 | ``` 58 | 59 | - Implement the `load_model`, `unload_model` and `generate_image` abstract methods that are used to load/unload the model from the GPU (if necessary) and get a list of images from the model (remote API call or local model inference) respectively. 60 | 61 | ```python 62 | def load_model(self, device: str = None) -> None: 63 | """ 64 | Load the model and tokenizer based on the given model name. 65 | 66 | :param device: The device to load the model on. Defaults to None. 67 | :type device: str 68 | """ 69 | 70 | def unload_model(self) -> None: 71 | """ 72 | Unload the model and tokenizer. 73 | """ 74 | 75 | def generate_image( 76 | self, 77 | input: Union[List[str], str] 78 | ) -> List[Image]: 79 | # Call model and retrieve an Image 80 | # Return model response 81 | ``` 82 | -------------------------------------------------------------------------------- /CheckEmbed/vision_models/__init__.py: -------------------------------------------------------------------------------- 1 | from .abstract_vision_model import AbstractVisionModel 2 | from .stable_diffusion3_5 import StableDiffusion3 -------------------------------------------------------------------------------- /CheckEmbed/vision_models/abstract_vision_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | import logging 11 | from abc import ABC, abstractmethod 12 | from typing import Any, Dict, List, Union 13 | 14 | from PIL.Image import Image 15 | 16 | 17 | class AbstractVisionModel(ABC): 18 | """ 19 | Abstract base class that defines the interface for all vision models. 20 | """ 21 | 22 | def __init__( 23 | self, config_path: str = None, model_name: str = "", name: str = "INVALID_NAME", cache: bool = False 24 | ) -> None: 25 | """ 26 | Initialize the AbstractVisionModel instance with configuration, model details, and caching options. 27 | 28 | :param config_path: Path to the config file. If provided, the config is loaded from the file. Defaults to "". 
29 | :type config_path: str 30 | :param model_name: Name of the vision model. Defaults to "". 31 | :type model_name: str 32 | :param name: Name of the vision model. Defaults to "INVALID_NAME". 33 | :type name: str 34 | :param cache: Flag to determine whether to cache responses. Defaults to False. 35 | :type cache: bool 36 | """ 37 | self.logger = logging.getLogger(self.__class__.__name__) 38 | self.config: Dict = None 39 | self.model_name: str = model_name 40 | self.cache = cache 41 | if self.cache: 42 | self.response_cache: Dict[str, List[Any]] = {} 43 | if config_path is not None: 44 | self.load_config(config_path) 45 | self.name: str = name 46 | try: 47 | if self.config is not None: 48 | if self.config[model_name] is not None: 49 | self.name = self.config[model_name]["name"] 50 | except Exception: 51 | pass 52 | self.prompt_tokens: int = 0 53 | self.cost: float = 0.0 54 | 55 | def load_config(self, path: str) -> None: 56 | """ 57 | Load configuration from a specified path. 58 | 59 | :param path: Path to the config file. 60 | :type path: str 61 | """ 62 | with open(path, "r") as f: 63 | self.config = json.load(f) 64 | 65 | self.logger.debug(f"Loaded config from {path} for {self.model_name}") 66 | 67 | def clear_cache(self) -> None: 68 | """ 69 | Clear the response cache. 70 | """ 71 | self.response_cache.clear() 72 | 73 | @abstractmethod 74 | def load_model(self, device: str = None) -> None: 75 | """ 76 | Abstract method to load the vision model. 77 | 78 | :param device: The device to load the model on. Defaults to None. 79 | :type device: str 80 | """ 81 | pass 82 | 83 | @abstractmethod 84 | def unload_model(self) -> None: 85 | """ 86 | Abstract method to unload the vision model. 87 | """ 88 | pass 89 | 90 | @abstractmethod 91 | def generate_image(self, input: Union[List[str], str]) -> List[Image]: 92 | """ 93 | Abstract method to generate images for the given input text. 94 | 95 | :param input: A list of prompts or a single prompt string to generate images for. 96 | :type input: Union[List[str], str] 97 | :return: The generated images. 98 | :rtype: List[Image] 99 | """ 100 | pass 101 | -------------------------------------------------------------------------------- /CheckEmbed/vision_models/stable_diffusion3_5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main authors: Lorenzo Paleari 8 | # Eric Schreiber 9 | 10 | import gc 11 | from typing import List, Union 12 | 13 | import torch 14 | from diffusers import StableDiffusion3Pipeline 15 | from PIL.Image import Image 16 | from tqdm import tqdm 17 | 18 | from CheckEmbed.vision_models import AbstractVisionModel 19 | 20 | 21 | class StableDiffusion3(AbstractVisionModel): 22 | """ 23 | The StableDiffusion3 class handles interactions with the Stable Diffusion 3.5 Medium model using the provided configuration. 24 | 25 | Inherits from the AbstractVisionModel class and implements its abstract methods. 26 | """ 27 | 28 | def __init__( 29 | self, model_name: str = "", name: str = "stable-diffusion3.5-medium", cache: bool = False, data_type: torch.dtype = torch.bfloat16, num_inference_steps: int = 40, guidance_scale: float = 4.5 30 | ) -> None: 31 | """ 32 | Initialize the StableDiffusion3 instance with configuration, model details, and caching options. 
33 | 34 | :param model_name: Name of the model, which is used to select the correct configuration. Defaults to "". 35 | :type model_name: str 36 | :param name: Name used for output files. Defaults to "stable-diffusion3.5-medium". 37 | :type name: str 38 | :param cache: Flag to determine whether to cache responses. Defaults to False. 39 | :type cache: bool 40 | :param data_type: The data type for the model, typically torch.bfloat16 or torch.float32. Defaults to torch.bfloat16. 41 | :type data_type: torch.dtype 42 | :param num_inference_steps: The number of inference steps for image generation. Defaults to 40. 43 | :type num_inference_steps: int 44 | :param guidance_scale: The guidance scale for image generation, which controls the adherence to the prompt. Defaults to 4.5. 45 | :type guidance_scale: float 46 | """ 47 | super().__init__(model_name=model_name, name=name, cache=cache) 48 | self.data_type = data_type 49 | self.num_inference_steps = num_inference_steps 50 | self.guidance_scale = guidance_scale 51 | 52 | def load_model(self, device: str = None) -> None: 53 | """ 54 | Load the diffusion pipeline based on the given model name. 55 | 56 | :param device: The device to load the model on. Defaults to None. 57 | :type device: str 58 | """ 59 | 60 | self.model = StableDiffusion3Pipeline.from_pretrained(self.model_name, torch_dtype=self.data_type) 61 | self.model = self.model.to(device) 62 | 63 | def unload_model(self) -> None: 64 | """ 65 | Unload the model and free the GPU memory. 66 | """ 67 | del self.model 68 | 69 | gc.collect() 70 | torch.cuda.empty_cache() 71 | 72 | self.model = None 73 | 74 | def generate_image(self, input: Union[List[str], str]) -> List[Image]: 75 | """ 76 | Generate images based on the input prompts using the Stable Diffusion 3.5 Medium model. 77 | 78 | This method takes a list of prompts or a single prompt string, generates one image per prompt, 79 | and returns the generated images in the order of the input prompts. 80 | 81 | :param input: A list of prompts or a single prompt string to generate images for. 82 | :type input: Union[List[str], str] 83 | :return: A list of generated images corresponding to the input prompts. 84 | :rtype: List[Image] 85 | """ 86 | if isinstance(input, str): 87 | input = [input] 88 | 89 | images = [] 90 | for prompt in tqdm(input, desc="Images to Generate", leave=False, total=len(input)): 91 | # Generate a single image for the current prompt 92 | image = self.model( 93 | prompt, 94 | num_inference_steps=self.num_inference_steps, 95 | guidance_scale=self.guidance_scale, 96 | ).images[0] 97 | 98 | images.append(image) 99 | 100 | return images 101 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 ETH Zurich. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | - Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | - Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer listed 13 | in this license in the documentation and/or other materials 14 | provided with the distribution. 
15 | 16 | - Neither the name of the copyright holders nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | The copyright holders provide no reassurances that the source code 21 | provided does not infringe any patent, copyright, or any other 22 | intellectual property rights of third parties. The copyright holders 23 | disclaim any liability to any recipient for claims brought against 24 | recipient by any third party for infringement of that parties 25 | intellectual property rights. 26 | 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 28 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 29 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 30 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 31 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 32 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 33 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 37 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 | 39 | 40 | Citation 41 | ======== 42 | 43 | Any published work which uses this software should include the 44 | following citation: 45 | 46 | ---------------------------------------------------------------------- 47 | Maciej Besta, Lorenzo Paleari, Ales Kubicek, Piotr Nyczyk, Robert 48 | Gerstenberger, Patrick Iff, Tomasz Lehmann, Hubert Niewiadomski, 49 | Torsten Hoefler: CheckEmbed: Effective Verification of LLM Solutions 50 | to Open-Ended Tasks. In: arXiv preprint arXiv:2406.02524 51 | ---------------------------------------------------------------------- 52 | 53 | 54 | CheckEmbed uses code from the Graph-of-Thoughts, SelfCheckGPT 55 | and HalluDetect frameworks. You find their original licenses below. 56 | 57 | Graph-of-Thoughts (https://github.com/spcl/graph-of-thoughts) 58 | ============================================================= 59 | 60 | Copyright (c) 2023 ETH Zurich. 61 | All rights reserved. 62 | 63 | Redistribution and use in source and binary forms, with or without 64 | modification, are permitted provided that the following conditions are 65 | met: 66 | 67 | - Redistributions of source code must retain the above copyright 68 | notice, this list of conditions and the following disclaimer. 69 | 70 | - Redistributions in binary form must reproduce the above copyright 71 | notice, this list of conditions and the following disclaimer listed 72 | in this license in the documentation and/or other materials 73 | provided with the distribution. 74 | 75 | - Neither the name of the copyright holders nor the names of its 76 | contributors may be used to endorse or promote products derived from 77 | this software without specific prior written permission. 78 | 79 | The copyright holders provide no reassurances that the source code 80 | provided does not infringe any patent, copyright, or any other 81 | intellectual property rights of third parties. The copyright holders 82 | disclaim any liability to any recipient for claims brought against 83 | recipient by any third party for infringement of that parties 84 | intellectual property rights. 
85 | 86 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 87 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 88 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 89 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 90 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 91 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 92 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 93 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 94 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 95 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 96 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 97 | 98 | 99 | Citation 100 | ======== 101 | 102 | Any published work which uses this software should include the 103 | following citation: 104 | 105 | ---------------------------------------------------------------------- 106 | Maciej Besta, Nils Blach, Ales Kubicek, Robert Gerstenberger, Lukas 107 | Gianinazzi, Joanna Gajda, Tomasz Lehmann, Michał Podstawski, Hubert 108 | Niewiadomski, Piotr Nyczyk, Torsten Hoefler (2024): Graph of Thoughts: 109 | Solving Elaborate Problems with Large Language Models. In: Proceedings 110 | of the AAAI Conference on Artificial Intelligence, 38(16), 111 | 17682-17690. https://doi.org/10.1609/aaai.v38i16.29720 112 | ---------------------------------------------------------------------- 113 | 114 | 115 | SelfCheckGPT (https://github.com/potsawee/selfcheckgpt) 116 | ======================================================= 117 | 118 | MIT License 119 | 120 | Copyright (c) 2023 Potsawee Manakul 121 | 122 | Permission is hereby granted, free of charge, to any person obtaining a copy 123 | of this software and associated documentation files (the "Software"), to deal 124 | in the Software without restriction, including without limitation the rights 125 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 126 | copies of the Software, and to permit persons to whom the Software is 127 | furnished to do so, subject to the following conditions: 128 | 129 | The above copyright notice and this permission notice shall be included in all 130 | copies or substantial portions of the Software. 131 | 132 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 133 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 134 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 135 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 136 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 137 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 138 | SOFTWARE. 139 | 140 | Citation: 141 | ---------------------------------------------------------------------- 142 | Potsawee Manakul, Adian Liusie, Mark J.F. Gales (2023): SelfCheckGPT: 143 | Zero-Resource Black-Box Hallucination Detection for Generative Large 144 | Language Models. In: arXiv preprint arXiv:2303.08896 145 | ---------------------------------------------------------------------- 146 | 147 | 148 | HalluDetect (https://github.com/Rivas-AI/HalluDetect) 149 | ===================================================== 150 | We used an adapted version of HalluDetect as a baseline for comparison 151 | with CheckEmbed. 
152 | 153 | MIT License 154 | 155 | Copyright (c) 2024 Fidac 156 | 157 | Permission is hereby granted, free of charge, to any person obtaining a copy 158 | of this software and associated documentation files (the "Software"), to deal 159 | in the Software without restriction, including without limitation the rights 160 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 161 | copies of the Software, and to permit persons to whom the Software is 162 | furnished to do so, subject to the following conditions: 163 | 164 | The above copyright notice and this permission notice shall be included in all 165 | copies or substantial portions of the Software. 166 | 167 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 168 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 169 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 170 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 171 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 172 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 173 | SOFTWARE. 174 | 175 | Citation: 176 | ----------------------------------------------------------------------- 177 | Ernesto Quevedo, Jorge Yero Salazar, Rachel Koerner, Pablo Rivas, Tomas 178 | Cerny (2024): Detecting Hallucinations in Large Language Model 179 | Generation: A Token Probability Approach. In: Proceedings of the 26th 180 | International Conference on Artificial Intelligence and Applications 181 | (ICAI '24) 182 | ----------------------------------------------------------------------- 183 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CheckEmbed 2 | 3 |

4 | 5 |

6 | 7 | This is the official implementation of [CheckEmbed: Effective Verification of LLM Solutions to Open-Ended Tasks](https://arxiv.org/abs/2406.02524). 8 | 9 | This framework gives you the ability to verify LLM answers, especially for 10 | intricate open-ended tasks such as consolidation, summarization, and extraction 11 | of knowledge. CheckEmbed implements verification by running the LLMs' answers through 12 | an embedding model and comparing the corresponding answer-level embeddings. 13 | This reduction of a complex textual answer to a single embedding facilitates a 14 | straightforward, fast, and meaningful verification, while showcasing 15 | significant improvements in accuracy, cost-effectiveness, and runtime 16 | performance compared to existing token-, sentence-, and fact-level schemes such 17 | as BERTScore or SelfCheckGPT. 18 | 19 | 20 | ## Setup Guide 21 | 22 | In order to use this framework, you need a working installation of Python 3.8 or newer. 23 | 24 | 25 | ### Installing CheckEmbed 26 | 27 | Before running either of the following two installation methods, make sure to activate your Python environment (if any) beforehand. 28 | If you are a user and you just want to use `CheckEmbed`, you can install it from source: 29 | ```bash 30 | git clone https://github.com/spcl/CheckEmbed.git 31 | cd CheckEmbed 32 | pip install . 33 | 34 | # If you want to use a CUDA GPU, please install the CUDA extras as well. 35 | pip install ".[cuda]" 36 | ``` 37 | If you are a developer and you want to modify the code, you can install it in editable mode from source: 38 | ```bash 39 | git clone https://github.com/spcl/CheckEmbed.git 40 | cd CheckEmbed 41 | pip install -e . 42 | 43 | # If you want to use a CUDA GPU, please install the CUDA extras as well. 44 | pip install -e ".[cuda]" 45 | ``` 46 | 47 | ### Configuring the Models 48 | 49 | In order to use parts of the framework, you need to have access to an LLM and/or an embedding model. 50 | Please follow the instructions in the READMEs of the respective modules to configure the [LLMs](CheckEmbed/language_models/README.md) and [embedding models](CheckEmbed/embedding_models/README.md) of your choice. 51 | Please create a copy of `config_template.json` named `config.json` in the CheckEmbed directory and update its details according to your needs. 52 | 53 | 54 | ## Documentation 55 | The paper gives a high-level overview of the framework and its components. 56 | In order to understand the framework in more detail, you can read the documentation of the individual modules. 57 | The [Scheduler](CheckEmbed/scheduler/scheduler.py) module is especially important for understanding how to make the most of the framework, 58 | as is the [Operation](CheckEmbed/operations/README.md) module for interpreting the results. 59 | 60 | 61 | ## Examples 62 | 63 | The [examples](examples) directory contains several examples of use cases that can be solved using the framework, including the ones presented in the paper. 64 | It is a great starting point for learning how to use the framework to solve real problems. 65 | Each example contains a `README.md` file with instructions on how to run it and play with it. 66 | 67 | 68 | ## Paper Results 69 | 70 | You can run the experiments from the paper by following the instructions in the [examples](examples) directory. 71 | However, if you just want to inspect and replot the results, you can use the [paper](paper) directory. 
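## Quick Start

The following minimal sketch outlines how the pieces described above fit together: a `Parser` produces the prompts, the language model samples several answers per prompt, the embedding model embeds them, and the `Scheduler` drives the whole pipeline. It is an illustrative outline only; the parser, prompts, dataset path, config path, and budget below are placeholders, and the [examples](examples) directory remains the authoritative reference for complete, runnable configurations.

```python
import logging
import os

from CheckEmbed import embedding_models, language_models
from CheckEmbed.parser import Parser
from CheckEmbed.scheduler import Scheduler, StartingPoint


class MinimalParser(Parser):
    # Toy parser: a fixed list of prompts and no ground truth.
    def prompt_generation(self, custom_inputs=None):
        return ["Summarize the key obligations in the following contract clause: ..."]

    def ground_truth_extraction(self, custom_inputs=None):
        return []


# Working directory for the run; CheckEmbed stores its intermediate
# results (samples, embeddings, plots) in subdirectories of it.
current_dir = os.path.dirname(os.path.abspath(__file__))

# LLM that generates the samples and embedding model that embeds them,
# both configured through your config.json (path is a placeholder).
config_path = "CheckEmbed/config.json"
gpt = language_models.ChatGPT(config_path, model_name="chatgpt4-o", cache=True)
embedder = embedding_models.EmbeddingGPT(config_path, model_name="gpt-embedding-large")

scheduler = Scheduler(
    current_dir,
    logging_level=logging.INFO,
    budget=5,                                # cost budget; placeholder value
    parser=MinimalParser("dataset.json"),    # dataset path is a placeholder
    lm=[gpt],
    embedding_lm=[embedder],
)

scheduler.run(
    startingPoint=StartingPoint.PROMPT,
    checkEmbed=True,
    num_samples=10,
    batch_size=64,
    device="cuda",   # or "cpu", "mps", ...
)
```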
72 | 73 | 74 | ## Citations 75 | 76 | If you find this repository valuable, please give it a star! 77 | Got any questions or feedback? Feel free to reach out and open an issue. 78 | Using this in your work? Please reference us using the provided citation: 79 | 80 | ```bibtex 81 | @misc{besta2024checkembed, 82 | title = {{CheckEmbed: Effective Verification of LLM Solutions to Open-Ended Tasks}}, 83 | author = {Besta, Maciej and Paleari, Lorenzo and Kubicek, Ales and Nyczyk, Piotr and Gerstenberger, Robert and Iff, Patrick and Lehmann, Tomasz and Niewiadomski, Hubert and Hoefler, Torsten}, 84 | year = 2024, 85 | month = Jun, 86 | eprinttype = {arXiv}, 87 | eprint = {2406.02524} 88 | } 89 | ``` 90 | -------------------------------------------------------------------------------- /examples/RAGTruth/README.md: -------------------------------------------------------------------------------- 1 | # RAGTruth Benchmark 2 | 3 | This benchmark is based on the [RAGTruth](https://github.com/ParticleMedia/RAGTruth) dataset, a hallucination detection benchmark tailored for Retrieval-Augmented Generation (RAG) systems. The dataset consists of three task types: **Summarization**, **Data-to-Text Generation**, and **Question Answering (QA)**. Each task includes human-written source documents paired with responses generated by multiple Large Language Models (LLMs). 4 | 5 | ## Tasks 6 | 7 | RAGTruth includes 2,965 unique source tasks: 8 | 9 | * **943** for Summarization 10 | * **1,033** for Data-to-Text 11 | * **989** for Question Answering 12 | 13 | For each task, answers are generated using **6 different LLMs**, resulting in a total of **17,790 documents**: 14 | 15 | * **5,658** Summarization responses 16 | * **6,198** Data-to-Text responses 17 | * **5,934** Question Answering responses 18 | 19 | These generated responses are located in the `dataset/` folder and are split into `training_data.json` and `response.json` (test) sets. 20 | 21 | ## Splits 22 | 23 | * The **test split** consists of **2,700** LLM-generated answers (900 per task type). 24 | * The remaining samples are in the **train split**. 25 | 26 | ## Evaluation Setup 27 | 28 | Following the instructions of the official [paper](https://arxiv.org/abs/2401.00396), we generated **10 samples per LLM-generated answer** in the test set. These synthetic samples enable hallucination detection evaluation using methods such as: 29 | 30 | * **CheckEmbed** 31 | * [**SelfCheckGPT**](https://github.com/potsawee/selfcheckgpt) 32 | 33 | The code for generating samples is included in the `samples/` directory. All generations were run locally, with **no additional inference cost**. 34 | 35 | ## Baselines 36 | 37 | We also include an adjusted version of [**HalluDetect**](https://github.com/Baylor-AI/HalluDetect), an effective method for hallucination classification in text generation. 38 | 39 | > **Note:** HalluDetect requires significant computational resources. For optimal performance, we recommend at least **2 × NVIDIA A100 GPUs** (40GB+ VRAM). 40 | 41 | ## Cost Estimation 42 | 43 | * **LLM-as-a-Judge**: \~\$20 total for scoring using default LLMs. 44 | 45 | ## How to Run 46 | 47 | Please execute the following steps to reproduce the experiment: 48 | 49 | ```bash 50 | cd examples/RAGTruth/dataset 51 | # (Optional) Run sampling script if not already done 52 | # We already provide the necessary additional samples in the repository. 53 | python3 sampler.py 54 | 55 | # Evaluate using CheckEmbed 56 | cd .. 
57 | python3 main.py 58 | 59 | # Run HalluDetect 60 | python3 hallu_detect.py 61 | ``` 62 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This directory contains scripts for running various examples using the CheckEmbed package. Each script is a standalone Python program that sets up and runs a particular example. 4 | 5 | Please refer to the individual example directories for more information on the specific example, specifically the `main.py` file, which is almost ready to be executed. 6 | 7 | ## General Information 8 | 9 | In each `main.py` file, the following parameters need to be set up for the desired environment: 10 | 11 | - Check that the `config_path` variable is set up correctly. 12 | - Choose the language model(s) to evaluate. 13 | - Choose the embedding model(s). 14 | - Check the `device` and `batch_size` parameters for the embeddings models and scheduler. 15 | - Modify the `startingPoint` parameter of `scheduler.run(...)` to influence which stages will be executed: 16 | - `StartingPoint.PROMPT`: prompt generation, sample generation, embedding generation and evaluation (plotting) 17 | - `StartingPoint.SAMPLES`: sample generation, embedding generation and evaluation (plotting) 18 | - `StartingPoint.EMBEDDINGS`: embedding generation and evaluation (plotting) 19 | - If you want to use the `Alibaba-NLP/gte-Qwen1.5-7B-instruct` embedding model, please add your Huggingface access token to respective initialisation call. 20 | 21 | Once everything is set up, change into the desired example folder and execute: 22 | 23 | ``` 24 | python3 main.py 25 | ``` 26 | 27 | ## Scheduler Setup 28 | 29 | The file [scheduler.py](/CheckEmbed/scheduler/scheduler.py) contains specific documentation for each parameter. 30 | 31 | ```python 32 | scheduler = Scheduler( 33 | current_dir, 34 | logging_level = logging.DEBUG, 35 | 36 | # Adjust the budget based on the estimations documented for each example. 37 | # If the budget is too low, the execution of the pipeline will be stopped as soon as the limit is detected. 38 | budget = 12, 39 | parser = customParser, 40 | 41 | # Update to include more or fewer LLMs / embedding models. 42 | lm = [gpt4_o, gpt4, gpt3], 43 | embedding_lm = [embedd_large, sfrEmbeddingMistral, e5mistral7b, gteQwen157bInstruct], 44 | llm_as_a_judge_Operation = llm_as_a_judge_Operation, 45 | llm_as_a_judge_models = [gpt4_o_mini, gpt4_o, llama70, llama8], 46 | 47 | # Operations to be executed during the evaluation stage. 48 | operations = [operation1, operation2, ...], 49 | ) 50 | 51 | # The order of lm_names and embedding_lm_names should be the same 52 | # as the order of the language models and embedding language models respectively. 53 | scheduler.run( 54 | # If an error occurs, the starting point can be adjusted to avoid recomputation. 55 | startingPoint = StartingPoint.PROMPT, 56 | 57 | # utility functions 58 | defaultDirectories = True, 59 | 60 | # Indicate which operations to run. 61 | bertScore = True, 62 | selfCheckGPT = True, 63 | checkEmbed = True, 64 | llm_as_a_judge = True, 65 | 66 | # Settings for the pipeline. 67 | ground_truth = False, 68 | spacy_separator = True, 69 | time_performance = False, 70 | rebase_results = False, 71 | reference_text = False, 72 | 73 | # Number of samples per prompt example. 74 | num_samples = 10, 75 | 76 | # Optional values, if not set, the default values will be used. 
77 | lm_names = ["gpt4-o", "gpt4-turbo", "gpt"], 78 | embedding_lm_names = ["gpt-embedding-large", "sfr-embedding-mistral", "e5-mistral-7B-instruct", "gte-Qwen1.5-7B-instruct", "stella-en-400M-v5", "stella-en-1.5B-v5"], 79 | 80 | # Do not modify 81 | bertScore_model = "microsoft/deberta-xlarge-mnli", 82 | 83 | # It may be necessary to reduce the batch size if the model is too large, with 8GB of GPU VRAM we suggest the use of batch_size = 1. 84 | batch_size = 64, 85 | device = "cuda" # or "cpu" "mps" ... 86 | ) 87 | ``` 88 | -------------------------------------------------------------------------------- /examples/description/README.md: -------------------------------------------------------------------------------- 1 | # Distinguishing Similar and Different Text Passages 2 | 3 | The use case in this directory analyzes, whether a verification method is able to clearly distinguish two passages of text that either look 4 | similar, but come with very different meanings ("different") or look different, but have similar or identical meanings ("similar"). 5 | 6 | ## Data 7 | 8 | The list of topics for the different subtask can be found in `different_topics_list` list in the `different/main.py` file. 9 | There are two lists of topics for the similar subtask: `precise` and `generic`. Both lists (`precise_topics` and `general_topics`) can be found 10 | in the `similar/main.py` file. 11 | 12 | ## Prompt Templates 13 | 14 | The prompt templates for the subtasks can be found in `different/prompt_scheme.txt` and `similar/prompt_scheme.txt` respectively. 15 | 16 | ## Runtime / Cost Estimation 17 | 18 | The samples have been generated with a temperature of 1.0. The temperature can be adjusted in your `config.json`. 19 | We estimate a compute time of 90 minutes with an NVIDIA Tesla V100-PCIE-32GB for each subtask. 20 | 21 | The total estimated costs are $1.55 for each subtask: 22 | 23 | - GPT4-o: $0.5 24 | - GPT4-turbo: $1 25 | - GPT3.5: $0.05 26 | 27 | Running LLM-as-a-Judge for all subtasks will result in an additional cost of no more than $1. 28 | -------------------------------------------------------------------------------- /examples/description/different/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate two passages of text. They should describe two diffent things: 4 | 1. ### HERE 1 ### 5 | 2. ### HERE 2 ### 6 | 7 | However, these two passages should have the same length and style 8 | I want you to give an answer using the following format: 9 | 10 | ### DESCRIPTION 1 ### 11 | the actual description here... 12 | ### DESCRIPTION 2 ### 13 | the actual description here... 14 | 15 | 16 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/description/similar/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate two passages of text. They should both describe the same thing (### HERE ###). However, these two passages should differ VASTLY in their length, style. 4 | I want you to give an answer using the following format: 5 | 6 | ### DESCRIPTION 1 ### 7 | the actual description here... 8 | ### DESCRIPTION 2 ### 9 | the actual description here... 
10 | 11 | 12 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/legal_summaries/README.md: -------------------------------------------------------------------------------- 1 | # Hallucination 2 | 3 | The use case in this directory detects small fine-grained hallucinations, such 4 | as mistakes in individual facts. The use case is based on the summarization of different legal text chunks. 5 | For each chunk considered, the ground truth is generated using a 6 | special prompt `prompt_scheme_ground_truth.txt`, which gathers 10 samples from the LLM by asking for a correct summarization of that chunk. 7 | The LLM is also tasked to provide errors for that chunk, in the range from 1 to 10. 8 | These errors are then incorporated separately into the summary, so that the number of errors inside the summary varies between 1 to 10. 9 | These error-ridden summary are then sampled with an LLM and compared against the zero error original summary via the CheckEmbed pipeline. 10 | 11 | ## Data 12 | 13 | The dataset can be found in the file `dataset/legal_definitions.json`. It consists of text chunks to be summarized. 14 | 15 | ## Prompt Templates 16 | 17 | The prompt templates can be found in the files `prompt_scheme.txt` and `prompt_scheme_ground_truth.txt`. 18 | 19 | ## Runtime / Cost Estimation 20 | 21 | The samples have been generated with a temperature of 0.25. The temperature can be adjusted in your `config.json`. 22 | We estimate a compute time of 20 hours with an NVIDIA GH200. 23 | 24 | The total estimated costs are $35: 25 | 26 | - GPT4-o: $33 27 | - GPT3.5: $2 28 | 29 | Running LLM-as-a-Judge will result in an additional cost of no more than $2. 30 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/legal_summaries/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | You are a lawyer. 4 | 5 | ### TASK ### 6 | 7 | ## SUBTASK 1 ## 8 | Based on the provided context generate ### NUMBER ### completely false information (fact hallucinations) about it. 9 | 10 | ## SUBTASK 2 ## 11 | Create a complete and detailed summary of the provided context including all the errors generated in SUBTASK 1. 12 | When including the hallucinations you must not say they are wrong. 13 | 14 | Answer using the following formatting. 15 | 16 | ### ERRORS ### 17 | List of fact hallucinations to be later included in the summary... 18 | ### SUMMARY ### 19 | The summary here.... 20 | 21 | 22 | ### CONTEXT ### 23 | 24 | [###REPLACE WITH CONTEXT###] 25 | 26 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/legal_summaries/prompt_scheme_ground_truth.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | You are a lawyer. 4 | 5 | ### TASK ### 6 | 7 | Based on the provided context generate a complete and detaild summary of it. Answer using the following formatting. 8 | 9 | ### SUMMARY ### 10 | The summary here.... 
11 | 12 | 13 | ### CONTEXT ### 14 | 15 | [###REPLACE WITH CONTEXT###] 16 | 17 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/operation_variants/__init__.py: -------------------------------------------------------------------------------- 1 | from .bertscore_operation_variant import BertScoreOperation_Variant 2 | from .selfcheckgpt_operation_variant import SelfCheckGPT_BERT_Operation_Variant, SelfCheckGPT_NLI_Operation_Variant 3 | from .checkembed_operation_variant import CheckEmbedOperation_Variant 4 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/operation_variants/bertscore_operation_variant.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import logging 10 | import os 11 | import json 12 | 13 | import bert_score 14 | import numpy as np 15 | from tqdm import tqdm 16 | from typing import Any 17 | 18 | from CheckEmbed.operations import Operation 19 | from CheckEmbed.utility import capture_specific_stderr, frobenius_norm 20 | 21 | class BertScoreOperation_Variant(Operation): 22 | """ 23 | Operation that computes the BertScore between the reference and the sample embeddings. 24 | 25 | Inherits from the Operation class and implements its abstract methods. 26 | """ 27 | 28 | def __init__(self, result_dir_path: str, ground_truth_dir_path: str, sample_dir_path: str) -> None: 29 | """ 30 | Initialize the operation. 31 | 32 | :param result_dir_path: The path to the directory where the results will be stored. 33 | :type result_dir_path: str 34 | :param ground_truth_dir_path: The path to the directory where the ground truth samples are stored. 35 | :type ground_truth_dir_path: str 36 | :param sample_dir_path: The path to the directory where the samples are stored. 37 | :type sample_dir_path: str 38 | """ 39 | super().__init__(result_dir_path) 40 | self.sample_dir_path = sample_dir_path 41 | self.ground_truth = ground_truth_dir_path 42 | 43 | def execute(self, custom_inputs: Any) -> Any: 44 | """ 45 | Execute the operation on the embeddings/samples. 46 | 47 | :param custom_inputs: The custom inputs for the operation. 
48 | :type custom_inputs: any 49 | """ 50 | 51 | print("\n\nRunning BertScore operation.") 52 | 53 | # Initialize logging 54 | logging.basicConfig( 55 | filename=os.path.join(self.result_dir_path, "log.log"), 56 | filemode="w", 57 | format="%(name)s - %(levelname)s - %(message)s", 58 | level=custom_inputs["logging_level"], 59 | ) 60 | 61 | # Run BertScore for every pair of language model and samples 62 | for lm_name in (pbar := tqdm(custom_inputs["lm_names"], desc="Language Models", leave=True)): 63 | pbar.set_postfix_str(f"{lm_name}") 64 | logging.info(f"Loading responses from {lm_name}.") 65 | samples = [] 66 | 67 | # Load samples from the language model 68 | with open(os.path.join(self.sample_dir_path, f"{lm_name}_samples.json")) as f: 69 | responses = json.load(f) 70 | 71 | for index, response in enumerate(responses["data"]): 72 | samples.append(response["samples"]) 73 | logging.debug(f"Sample {index}: {samples[index]}") 74 | 75 | logging.info("Loaded samples.") 76 | 77 | # load second set of samples that will be ground_truth data 78 | ground_truth_list = [] 79 | with open(os.path.join(self.ground_truth, f"{lm_name}_samples.json")) as f: 80 | responses = json.load(f) 81 | 82 | for index, response in enumerate(responses["data"]): 83 | ground_truth_list.append(response["samples"]) 84 | logging.debug(f"ground_truth {index}: {ground_truth_list[index]}") 85 | 86 | # For every prompt compare every sample with every other sample 87 | logging.info(f"Running BertScore for {lm_name}.") 88 | 89 | same_samples = [] 90 | for sample in samples: 91 | same_s = [] 92 | for i in range(len(sample)): 93 | temp = [] 94 | for j in range(len(sample)): 95 | temp.append(sample[i]) 96 | same_s.append(temp) 97 | same_samples.append(same_s) 98 | 99 | results = [] 100 | for ground_truth, same_sample in tqdm(zip(ground_truth_list, same_samples), total=len(samples), desc="Prompts", leave=False): 101 | result = [] 102 | for s in tqdm(same_sample, desc="Samples", leave=False): 103 | target_string = ["Warning: Empty reference sentence detected; setting raw BERTScores to 0.","Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."] 104 | with capture_specific_stderr(target_string): 105 | result.append(bert_score.score( 106 | ground_truth, s, model_type=custom_inputs["model_type"], 107 | batch_size=custom_inputs["batch_size"], device=custom_inputs["device"], 108 | lang="en", verbose=False, 109 | rescale_with_baseline=True, 110 | )[2].tolist()) 111 | results.append(result) 112 | logging.debug(f"Results: {result}") 113 | 114 | logging.info(f"Finished running BertScore for {lm_name}.") 115 | 116 | # Fix the results that are less than -1 117 | for index, result in enumerate(results): 118 | temp_res = np.zeros((len(result), len(result[0]))) 119 | for i in range(temp_res.shape[0]): 120 | for j in range(temp_res.shape[1]): 121 | if temp_res[i][j] < -1: 122 | temp_res[i][j] = -1 123 | else: 124 | temp_res[i][j] = result[i][j] 125 | results[index] = temp_res 126 | 127 | frobenius_norms = [frobenius_norm(result) for result in results] 128 | std_devs = [np.std(result) for result in results] 129 | 130 | # Store results 131 | with open(os.path.join(self.result_dir_path, f"{lm_name}_bert.json"), "w") as f: 132 | results_json = [{ 133 | "index": i, 134 | "result": result.tolist(), 135 | 
"frobenius_norm": frob_norm, 136 | "std_dev": std_dev 137 | } for i, result, frob_norm, std_dev in zip(range(len(results)), results, frobenius_norms, std_devs)] 138 | json.dump({"data": results_json}, f, indent=4) 139 | 140 | logging.info(f"Saved results for {lm_name}.") 141 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/operation_variants/checkembed_operation_variant.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import os 10 | import json 11 | 12 | import numpy as np 13 | 14 | from typing import Any 15 | 16 | from CheckEmbed.operations import Operation 17 | from CheckEmbed.utility import cosine_similarity, frobenius_norm 18 | 19 | class CheckEmbedOperation_Variant(Operation): 20 | """ 21 | Operation that computes the cosine similarity, the Pearson correlation, the Frobenius norm and standard deviation between the embeddings. 22 | """ 23 | 24 | def __init__(self, result_dir_path: str, ground_truth_dir_path: str, embeddings_dir_path: str) -> None: 25 | """ 26 | Initialize the operation. 27 | 28 | :param result_dir_path: The path to the directory where the results will be stored. 29 | :type result_dir_path: str 30 | :param ground_truth_dir_path: The path to the directory where the ground truth embeddings are stored. 31 | :type ground_truth_dir_path: str 32 | :param embeddings_dir_path: The path to the directory where the embeddings are stored. 33 | :type embeddings_dir_path: str 34 | """ 35 | super().__init__(result_dir_path) 36 | self.ground_truth_dir = ground_truth_dir_path 37 | self.embeddings_dir_path = embeddings_dir_path 38 | 39 | def execute(self, custom_inputs: Any) -> Any: 40 | """ 41 | Execute the operation on the embeddings/samples. 42 | 43 | :param custom_inputs: The custom inputs for the operation. 
44 | :type custom_inputs: Any 45 | """ 46 | 47 | # For every language model / embedding model 48 | for file in os.listdir(self.embeddings_dir_path): 49 | if ".json" in file and not file.startswith("ground_truth_"): 50 | 51 | folder_name = file.replace("_" + file.split("_")[2], "") 52 | 53 | # Load the samples embeddings 54 | with open(os.path.join(self.embeddings_dir_path, file), "r") as f: 55 | data = json.load(f) 56 | data_array = data["data"] 57 | embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 58 | 59 | # Load the ground_truth embeddings 60 | dimensions = len(embeddings[0]) 61 | with open(os.path.join(self.ground_truth_dir, file), "r") as f: 62 | data = json.load(f) 63 | data_array = data["data"] 64 | ground_truth_embeddings = [d["embeddings"] for d in data_array] # Convert to numpy array 65 | 66 | # Compute the cosine similarity matrix with the ground truth 67 | cosine_similarity_matrix_array = [] 68 | for embedding, ground_truth in zip(embeddings, ground_truth_embeddings): 69 | # -1 array to initialize the cosine similarity matrix 70 | cosine_similarity_matrix = np.full((dimensions, dimensions), -1.0) 71 | for i in range(len(embedding)): 72 | for j in range(len(embedding)): 73 | cosine_similarity_matrix[i, j] = cosine_similarity(ground_truth[i], embedding[j]) 74 | 75 | cosine_similarity_matrix_array.append(cosine_similarity_matrix) 76 | 77 | # Compute the Frobenius norm of each cosine similarity matrix 78 | frobenius_norms_cosine_sim = [frobenius_norm(cosine_similarity_matrix) for cosine_similarity_matrix in cosine_similarity_matrix_array] 79 | 80 | # Compute the standard deviation of each cosine similarity matrix 81 | std_dev_cosine_sim_array = [np.std(cosine_similarity_matrix) for cosine_similarity_matrix in cosine_similarity_matrix_array] 82 | 83 | # Compute the Pearson correlation matrix 84 | pearson_corr_array = [] 85 | for embedding, ground_truth in zip(embeddings, ground_truth_embeddings): 86 | pearson_corr = np.full((dimensions, dimensions), -1.0) 87 | for i in range(len(embedding)): 88 | for j in range(len(embedding)): 89 | if len(embedding[i]) == 0 and len(embedding[j]) == 0: 90 | pearson_corr[i, j] = 1.0 91 | continue 92 | if len(embedding[i]) == 0 or len(embedding[j]) == 0: 93 | pearson_corr[i, j] = -1.0 94 | continue 95 | pearson_corr[i, j] = np.corrcoef(embedding[i], embedding[j])[0, 1] 96 | 97 | pearson_corr_array.append(pearson_corr) 98 | 99 | # Compute the Frobenius norm of each cosine similarity matrix 100 | frobenius_norms_pearson_corr = [frobenius_norm(pearson_corr) for pearson_corr in pearson_corr_array] 101 | 102 | std_dev_pearson_corr_array = [np.std(pearson_corr) for pearson_corr in pearson_corr_array] 103 | 104 | with open(os.path.join(self.result_dir_path, folder_name + "_results.json"), "w") as f: 105 | results_json = [{ 106 | "index": index, 107 | "cosine_sim": cosine_sim.tolist(), 108 | "frob_norm_cosine_sim": frob_norm_cosine_sim, 109 | "std_dev_cosine_sim": std_dev_cosine_sim, 110 | "pearson_corr": pearson_corr.tolist(), 111 | "frob_norm_pearson_corr": frob_norm_pearson_corr, 112 | "std_dev_pearson_corr": std_dev_pearson_corr 113 | } for index, cosine_sim, frob_norm_cosine_sim, std_dev_cosine_sim, pearson_corr, frob_norm_pearson_corr, std_dev_pearson_corr in zip(range(len(cosine_similarity_matrix_array)), cosine_similarity_matrix_array, frobenius_norms_cosine_sim, std_dev_cosine_sim_array, pearson_corr_array, frobenius_norms_pearson_corr, std_dev_pearson_corr_array)] 114 | json.dump({"data": results_json}, f, indent=4) 115 | 
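# Minimal usage sketch: this variant only needs the three directories passed to
# the constructor, and `execute` does not use `custom_inputs`, so `None` is
# sufficient. The directory names below are placeholders that assume the working
# directory layout produced by the Scheduler.
#
#   operation = CheckEmbedOperation_Variant(
#       result_dir_path="./CheckEmbed",
#       ground_truth_dir_path="./embeddings_ground_truth",
#       embeddings_dir_path="./embeddings",
#   )
#   operation.execute(None)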
-------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/scientific_descriptions/README.md: -------------------------------------------------------------------------------- 1 | # Hallucination 2 | 3 | The use case in this directory detects small fine-grained hallucinations, such 4 | as mistakes in individual facts. The use case is based on the description of different scientific topics. 5 | For each topic considered, the ground truth is generated using a 6 | special prompt `prompt_scheme_ground_truth.txt`, which gathers 10 samples from the LLM by asking for a correct description of that specific topic. 7 | The LLM is also tasked to provide errors for that topic, in the range from 1 to 10. 8 | These errors are then incorporated separately into the description, so that the number of errors inside the description varies between 1 to 10. 9 | These error-ridden descriptions are then sampled with an LLM and compared against the zero error original description via the CheckEmbed pipeline. 10 | 11 | ## Data 12 | 13 | The list of topics can be found in `topics_list` list in the `main.py` file. 14 | 15 | ## Prompt Templates 16 | 17 | The prompt templates can be found in the files `prompt_scheme.txt` and `prompt_scheme_ground_truth.txt`. 18 | 19 | ## Runtime / Cost Estimation 20 | 21 | The samples have been generated with a temperature of 1.0. The temperature can be adjusted in your `config.json`. 22 | We estimate a compute time of 20 hours with an NVIDIA GH200. 23 | 24 | The total estimated costs are $35: 25 | 26 | - GPT4-o: $33 27 | - GPT3.5: $2 28 | 29 | Running LLM-as-a-Judge will result in an additional cost of no more than $2. 30 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/scientific_descriptions/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate ### NUMBER ### completely false information (fact hallucinations) on (### TOPIC ###). 4 | Then insert the errors inside a passage of text that talks about (### TOPIC ###). 5 | You should convince a reader that the false informations are actually correct ones. 6 | 7 | Please, use the following format for answering: 8 | 9 | 10 | ### ERRORS ### 11 | List of fact hallucinations to be later included in the passage... 12 | ### PASSAGE ### 13 | The passage here.... 14 | -------------------------------------------------------------------------------- /examples/incremental_forced_hallucination/scientific_descriptions/prompt_scheme_ground_truth.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | Hello. Please generate a passage of text that talks about (### TOPIC ###). 4 | 5 | Please, use the following format for answering: 6 | 7 | ### PASSAGE ### 8 | The passage here.... 9 | -------------------------------------------------------------------------------- /examples/legal_definitions/README.md: -------------------------------------------------------------------------------- 1 | # Legal Definitions 2 | 3 | The use case in this directory extracts terms and their definitions from legal documents. It is based on an in-house legal analytics project. 4 | 5 | We use this example also for an ablation study by varying the chunk sizes that are processed in a single step. 
6 | An increase in chunk size means that more terms and their definitions need to be extracted at a time. 7 | The general assumption is that the LLM will perform worse if the processed document size increases, which should be reflected in the resulting CheckEmbed scores. 8 | If you wish the run the original use case with a single chunk size, please comment out the lines 233 to 241 in `main.py`. 9 | 10 | ## Data 11 | 12 | The dataset can be found in the file `dataset/legal_definitions.json`. It consists of text chunks as well as expected terms to be found (the "ground truth"). 13 | 14 | ## Prompt Template 15 | 16 | The prompt template can be found in the file `prompt_scheme.txt`. 17 | 18 | ## Runtime / Cost Estimation 19 | 20 | The samples have been generated with a temperature of 0.25. The temperature can be adjusted in your `config.json`. 21 | We estimate a compute time of 90 minutes with an NVIDIA A100-SXM-40GB for each experiment. 22 | 23 | Based on the experiment the total estimated costs are $7 (1 chunk), $11 (2 chunks) and $18 (4 chunks): 24 | - GPT4-o: $2.25, $3.5, $5.75 25 | - GPT4-turbo: $4.5, $7, $10.5 26 | - GPT3.5: $0.15, $0.5, $1.5 27 | -------------------------------------------------------------------------------- /examples/legal_definitions/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | import logging 11 | import os 12 | from typing import Any, List 13 | 14 | from CheckEmbed import embedding_models 15 | from CheckEmbed import language_models 16 | from CheckEmbed.parser import Parser 17 | from CheckEmbed.scheduler import Scheduler, StartingPoint 18 | from CheckEmbed.operations import SelfCheckGPT_BERT_Operation, SelfCheckGPT_NLI_Operation 19 | 20 | 21 | class CustomParser(Parser): 22 | """ 23 | The CustomParser class handles the dataset parsing. 24 | 25 | Inherits from the Parser class and implements its abstract methods. 26 | """ 27 | 28 | def __init__(self, dataset_path: str, prompt_scheme_path: str, num_chunks: int) -> None: 29 | """ 30 | Initialize the parser. 31 | 32 | :param dataset_path: The path to the dataset. 33 | :type dataset_path: str 34 | :param prompt_scheme_path: The path to the prompt scheme file. 35 | :type prompt_scheme_path: str 36 | :param num_chunks: The number of chunks. 37 | :type num_chunks: int 38 | """ 39 | super().__init__(dataset_path) 40 | self.prompt_scheme_path = prompt_scheme_path 41 | self.num_chunks = num_chunks 42 | 43 | def prompt_generation(self, custom_inputs: Any = None) -> List[str]: 44 | """ 45 | Parse the dataset and generate the prompts for the model. 46 | 47 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 48 | :type custom_inputs: Any 49 | :return: List of prompts. 
50 | :rtype: List[str] 51 | """ 52 | # Getting the input data from the dataset 53 | input_data = [] 54 | with open(self.dataset_path) as f: 55 | json_data = json.load(f) 56 | 57 | data_array = json_data['data'] 58 | for data in data_array: 59 | input_data.append(data['chunk_txt']) 60 | 61 | # Prompts generation 62 | prompt_complete = None 63 | with open(self.prompt_scheme_path) as f: 64 | prompt_complete = f.read() 65 | 66 | prompt_initial = prompt_complete[0:prompt_complete.find('[###REPLACE WITH CONTEXT###]')] 67 | prompt_final = prompt_complete[prompt_complete.find('[###REPLACE WITH CONTEXT###]')+len('[###REPLACE WITH CONTEXT###]'):] 68 | 69 | start_index = 0 70 | if self.num_chunks == 1: 71 | start_index = 1 72 | 73 | # Use the input data as context inside the prompts 74 | prompts = [] 75 | for i in range(start_index, len(input_data) - self.num_chunks + 1): 76 | prompts.append(prompt_initial + "".join(input_data[i:i+self.num_chunks]) + prompt_final) 77 | 78 | return prompts 79 | 80 | def ground_truth_extraction(self, custom_inputs: Any = None) -> List[str]: 81 | """ 82 | Parse the dataset and extract the ground truth. 83 | 84 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 85 | :type custom_inputs: Any 86 | :return: List of ground truths. 87 | :rtype: List[str] 88 | """ 89 | ground_truth = [] 90 | with open(self.dataset_path) as f: 91 | json_data = json.load(f) 92 | 93 | data_array = json_data['data'] 94 | for data in data_array: 95 | text = "" 96 | for definition in data['definitions']: 97 | text += definition["term"] + ". " + definition["context"] + "\n" 98 | 99 | text = text[:-1] 100 | ground_truth.append(text) 101 | 102 | start_index = 0 103 | if self.num_chunks == 1: 104 | start_index = 1 105 | 106 | composite_ground_truth = [] 107 | for i in range(start_index, len(ground_truth) - self.num_chunks + 1): 108 | composite_ground_truth.append("\n".join(ground_truth[i:i+self.num_chunks])) 109 | 110 | return composite_ground_truth 111 | 112 | def start(current_dir: str, num_chunks: int = 1, start: int = StartingPoint.PROMPT) -> None: 113 | """ 114 | Start the main function. 115 | 116 | :param current_dir: The current directory. 117 | :type current_dir: str 118 | :param num_chunks: The number of chunks. Defaults to 1. 119 | :type num_chunks: int 120 | :param start: The starting point. Defaults to StartingPoint.PROMPT. 
121 | :type start: int 122 | """ 123 | 124 | # Config file for the LLM(s) 125 | config_path = os.path.join( 126 | current_dir, 127 | "../../CheckEmbed/config.json", 128 | ) 129 | 130 | # Initialize the parser and the embedder 131 | customParser = CustomParser("./dataset/legal_definitions.json", os.path.join(current_dir, "prompt_scheme.txt"), num_chunks=num_chunks) 132 | 133 | # Initialize the language models 134 | gpt3 = language_models.ChatGPT( 135 | config_path, 136 | model_name = "chatgpt", 137 | cache = True, 138 | ) 139 | 140 | gpt4 = language_models.ChatGPT( 141 | config_path, 142 | model_name = "chatgpt4-turbo", 143 | cache = True, 144 | ) 145 | 146 | gpt4_o = language_models.ChatGPT( 147 | config_path, 148 | model_name = "chatgpt4-o", 149 | cache = True, 150 | ) 151 | 152 | embedd_large = embedding_models.EmbeddingGPT( 153 | config_path, 154 | model_name = "gpt-embedding-large", 155 | cache = False, 156 | ) 157 | 158 | sfrEmbeddingMistral = embedding_models.SFREmbeddingMistral( 159 | model_name = "Salesforce/SFR-Embedding-Mistral", 160 | cache = False, 161 | ) 162 | 163 | e5mistral7b = embedding_models.E5Mistral7b( 164 | model_name = "intfloat/e5-mistral-7b-instruct", 165 | cache = False, 166 | ) 167 | 168 | gteQwen157bInstruct = embedding_models.GteQwenInstruct( 169 | model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct", 170 | cache = False, 171 | access_token = "", # Add your access token here 172 | batch_size = 4, # it may be necessary to reduce the batch size if the GPU VRAM < 40GB 173 | ) 174 | 175 | stella_en_15B_v5 = embedding_models.Stella( 176 | model_name = "NovaSearch/stella_en_1.5B_v5", 177 | variant = "1.5B-v5", 178 | cache = False, 179 | ) 180 | 181 | stella_en_400M_v5 = embedding_models.Stella( 182 | model_name = "NovaSearch/stella_en_400M_v5", 183 | cache = False, 184 | ) 185 | 186 | selfCheckGPT_BERT_Operation = SelfCheckGPT_BERT_Operation( 187 | os.path.join(current_dir, "SelfCheckGPT"), 188 | current_dir, 189 | ) 190 | 191 | selfCheckGPT_NLI_Operation = SelfCheckGPT_NLI_Operation( 192 | os.path.join(current_dir, "SelfCheckGPT"), 193 | current_dir, 194 | ) 195 | 196 | # Initialize the scheduler 197 | scheduler = Scheduler( 198 | current_dir, 199 | logging_level = logging.DEBUG, 200 | budget = 30, 201 | parser = customParser, 202 | lm = [gpt4_o, gpt4, gpt3], 203 | embedding_lm = [stella_en_15B_v5, stella_en_400M_v5, gteQwen157bInstruct, e5mistral7b, sfrEmbeddingMistral, embedd_large], 204 | selfCheckGPTOperation=[selfCheckGPT_NLI_Operation, selfCheckGPT_BERT_Operation], 205 | ) 206 | 207 | # The order of lm_names and embedding_lm_names should be the same 208 | # as the order of the language models and embedding language models respectively. 209 | scheduler.run( 210 | startingPoint = start, 211 | bertScore = True, 212 | selfCheckGPT = True, 213 | ground_truth = True, 214 | rebase_results=True, 215 | num_samples = 10, 216 | bertScore_model = "microsoft/deberta-xlarge-mnli", 217 | batch_size = 64, # it may be necessary to reduce the batch size if the model is too large 218 | device = "cuda" # or "cpu" "mps" ... 
219 | ) 220 | 221 | if __name__ == "__main__": 222 | current_dir = os.path.dirname(os.path.abspath(__file__)) + "/chunk_dim_1" 223 | os.makedirs(current_dir, exist_ok=True) 224 | start(current_dir, num_chunks=1, start=StartingPoint.PROMPT) 225 | 226 | current_dir = os.path.dirname(os.path.abspath(__file__)) + "/chunk_dim_2" 227 | os.makedirs(current_dir, exist_ok=True) 228 | start(current_dir, num_chunks=2, start=StartingPoint.PROMPT) 229 | 230 | current_dir = os.path.dirname(os.path.abspath(__file__)) + "/chunk_dim_4" 231 | os.makedirs(current_dir, exist_ok=True) 232 | start(current_dir, num_chunks=4, start=StartingPoint.PROMPT) 233 | -------------------------------------------------------------------------------- /examples/legal_definitions/prompt_scheme.txt: -------------------------------------------------------------------------------- 1 | ### INSTRUCTION ### 2 | 3 | You are a lawyer. 4 | 5 | ### QUESTION ### 6 | 7 | Based on the provided context extract all the legal definitions. Answer using the following formatting. 8 | 9 | Term.Definition 10 | Term.Definition 11 | ... 12 | 13 | 14 | [...] 15 | ### CONTEXT ### 16 | 17 | Preliminary Note 18 | The Stock Purchase Agreement sets forth the basic terms of the purchase and sale of the preferred stock to the investors (such as the purchase price, closing date, conditions to closing) and identifies the other financing documents. Generally this agreement does not set forth either (1) the characteristics of the stock being sold (which are defined in the Certificate of Incorporation) or (2) the relationship among the parties after the closing, such as registration rights, rights of first refusal and co-sale and voting arrangements (these matters often implicate persons other than just the Company and the investors in this round of financing and are usually embodied in separate agreements to which those others persons are parties, or in some cases in the Certificate of Incorporation). The main items of negotiation in the Stock Purchase Agreement are therefore the price and number of shares being sold, the representations and warranties that the Company must make to the investors and the closing conditions for the transaction. 19 | SERIES A PREFERRED STOCK PURCHASE AGREEMENT 20 | THIS SERIES A PREFERRED STOCK PURCHASE AGREEMENT (this “Agreement”), is made as of [], 20[], by and among [______], a Delaware corporation (the “Company”), and the investors listed on Exhibit A attached to this Agreement (each a “Purchaser” and together the “Purchasers”). 21 | The parties hereby agree as follows: 22 | 23 | ### ANSWER ### 24 | 25 | Agreement. THIS SERIES A PREFERRED STOCK PURCHASE AGREEMENT 26 | Company. Delaware corporation 27 | Purchaser. Company or the investors listed on Exhibit A 28 | Purchasers. Company and the investors listed on Exhibit A together 29 | 30 | 31 | ### CONTEXT ### 32 | 33 | [###REPLACE WITH CONTEXT###] 34 | 35 | ### ANSWER ### -------------------------------------------------------------------------------- /examples/performance_test/README.md: -------------------------------------------------------------------------------- 1 | # Performance Testing 2 | 3 | This directory contains scripts and configurations to evaluate the performance, specifically the runtime, of CheckEmbed on various embedding models in comparison to SelfCheckGPT and BERTScore. 4 | The script generates input text for each datapoint while varying the sizes of these texts, i.e. 
the number of tokens in the text, as well as the number of samples for each datapoint, and measures the runtime performance of the embedding and the operations. 5 | The samples of a datapoint are all generated locally via script instead of querying an LLM. 6 | Varying the number of tokens to embed gives insights into the overall efficiency of the different embedding models used by CheckEmbed, SelfCheckGPT and BERTScore, while varying the sample number examines the scalability of the respective pipelines. 7 | 8 | By default, the script tests multiple text sizes, ranging from 200 to 4000 tokens in steps of 200, as well as different numbers of samples (2, 4, 6, 8 and 10). 9 | 10 | ## Data 11 | 12 | The dataset with the generated text samples is created using the `Faker` library. Samples of varying lengths are generated and stored in a JSON format in directories (`2_samples`, `4_samples`, etc.) corresponding to the number of samples. 13 | 14 | Once the evaluation is finished, `data_extractor.py` can be used (and/or modified) to aggregate the runtime logs and write the results into a single JSON file containing all runtime measurements. 15 | ```bash 16 | python3 data_extractor.py 17 | ``` 18 | 19 | The extracted JSON file generally has the following structure: 20 | ```json 21 | { 22 | "#_samples": { //2_samples, 4_samples... 23 | "embedding": { 24 | "embedding_model_name": { //gpt-embedding-large, sfr-embedding-mistral... 25 | "#tokens": "time", 26 | "#tokens": "time", 27 | //... 28 | }, 29 | //more embeddings... 30 | }, 31 | "bertscore": { 32 | "#tokens": "time", 33 | //... 34 | }, 35 | "selfcheckgpt_bertscore": { 36 | "#tokens": "time", 37 | //... 38 | }, 39 | "selfcheckgpt_nli": { 40 | "#tokens": "time", 41 | //... 42 | }, 43 | "checkembed": { 44 | "embedding_model_name": { //gpt-embedding-large, sfr-embedding-mistral... 45 | "#tokens": "time", 46 | "#tokens": "time", 47 | //... 48 | }, 49 | //more embeddings... 50 | }, 51 | "operations": {} //To customize. 52 | }, 53 | //additional number of samples... 54 | } 55 | ``` 56 | The runtime is reported in seconds. 57 | 58 | The extracted data can be visualized with the help of the provided plotting script: 59 | ```bash 60 | python3 plot.py 61 | ``` 62 | 63 | ## Runtime / Cost Estimation 64 | 65 | The estimated compute time for running the evaluation is approximately 24 hours on an NVIDIA A100-SXM-40GB. 66 | 67 | The sample step is only emulated for these runtime measurements to avoid the cost of calling the LLM for the sampling, so costs only occur for the embedding with the OpenAI models. 68 | 69 | The embedding model from OpenAI has a cost of $0.13 / 1M tokens. 70 | 71 | ### Example 72 | In the following, we calculate the total cost for running the runtime measurements with the default parameters: 73 | - varying the number of samples from 2 to 10 in increments of 2 74 | - varying the text size from 200 to 4000 tokens in steps of 200 tokens 75 | - 20 prompts, meaning 20 datapoints for each specific combination of number of samples and number of tokens 76 | 77 | The total costs are $3.28: 78 | - total number of samples per text size: (2 + 4 + 6 + 8 + 10) * 20 = 30 * 20 = 600 79 | - total number of tokens: 80 | - 200 tokens: 200 * 600 = 120K 81 | - 400 tokens: 400 * 600 = 240K
83 | --- 84 | - 25.2M tokens 85 | - 25.2M tokens * $0.13 / 1M tokens = $3.28 86 | -------------------------------------------------------------------------------- /examples/performance_test/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import logging 10 | import os 11 | import json 12 | import random 13 | import tiktoken 14 | 15 | from faker import Faker 16 | from datetime import datetime as time 17 | 18 | from CheckEmbed import embedding_models 19 | from CheckEmbed.scheduler import Scheduler, StartingPoint 20 | from CheckEmbed.operations import SelfCheckGPT_BERT_Operation, SelfCheckGPT_NLI_Operation 21 | 22 | def start(current_dir: str, start: int = StartingPoint.PROMPT, n_samples: int = 10) -> None: 23 | """ 24 | Execute the runtime measurements. 25 | 26 | :param current_dir: Directory path from the the script is called. 27 | :type current_dir: str 28 | :param start: The starting point of the scheduler. Defaults to StartingPoint.PROMPT. 29 | :type start: int 30 | :param n_samples: Number of samples to generate. Defaults to 10. 31 | :type n_samples: int 32 | """ 33 | 34 | config_path = os.path.join( 35 | current_dir, 36 | "../../../CheckEmbed/config.json", 37 | ) 38 | 39 | embedd_large = embedding_models.EmbeddingGPT( 40 | config_path, 41 | model_name = "gpt-embedding-large", 42 | cache = False, 43 | max_concurrent_requests=5, 44 | ) 45 | 46 | sfrEmbeddingMistral = embedding_models.SFREmbeddingMistral( 47 | model_name = "Salesforce/SFR-Embedding-Mistral", 48 | cache = False, 49 | ) 50 | 51 | e5mistral7b = embedding_models.E5Mistral7b( 52 | model_name = "intfloat/e5-mistral-7b-instruct", 53 | cache = False, 54 | ) 55 | 56 | gteQwen157bInstruct = embedding_models.GteQwenInstruct( 57 | model_name = "Alibaba-NLP/gte-Qwen1.5-7B-instruct", 58 | cache = False, 59 | access_token = "", # Add your access token here (Hugging Face) 60 | ) 61 | 62 | stella_en_15B_v5 = embedding_models.Stella( 63 | model_name = "NovaSearch/stella_en_1.5B_v5", 64 | variant = "1.5B-v5", 65 | cache = False, 66 | ) 67 | 68 | stella_en_400M_v5 = embedding_models.Stella( 69 | model_name = "NovaSearch/stella_en_400M_v5", 70 | cache = False, 71 | ) 72 | 73 | selfCheckGPT_BERT_Operation = SelfCheckGPT_BERT_Operation( 74 | os.path.join(current_dir, "SelfCheckGPT"), 75 | current_dir, 76 | ) 77 | 78 | selfCheckGPT_NLI_Operation = SelfCheckGPT_NLI_Operation( 79 | os.path.join(current_dir, "SelfCheckGPT"), 80 | current_dir, 81 | ) 82 | 83 | # Initialize the scheduler 84 | scheduler = Scheduler( 85 | current_dir, 86 | logging_level = logging.DEBUG, 87 | budget = 8, 88 | selfCheckGPTOperation=[selfCheckGPT_BERT_Operation, selfCheckGPT_NLI_Operation], 89 | embedding_lm = [embedd_large, sfrEmbeddingMistral, e5mistral7b, gteQwen157bInstruct, stella_en_400M_v5, stella_en_15B_v5], 90 | ) 91 | 92 | # The order of lm_names and embedding_lm_names should be the same 93 | # as the order of the language models and embedding language models respectively. 
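    # NOTE: lm_names is overwritten below with the token lengths ("200", "400", ..., "4000").
    # Since this run starts at the EMBEDDINGS step, the scheduler presumably resolves the
    # pre-generated "<length>_samples.json" files written by text_gen() instead of sampling
    # an LLM, effectively treating each token length as its own pseudo-model.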
94 | scheduler.run( 95 | startingPoint = start, 96 | bertScore = True, # Set to True if you want to test BERTScore 97 | selfCheckGPT = True, # Set to True if you want to test SelfCheckGPT 98 | time_performance = True, 99 | num_samples = n_samples, 100 | lm_names = [str(i) for i in range(200, 4200, 200)], # Overwrite the default lm names 101 | bertScore_model = "microsoft/deberta-xlarge-mnli", 102 | device = "cuda", 103 | batch_size = 64 # it may be necessary to reduce the batch size if the model is too large 104 | ) 105 | 106 | 107 | def text_gen(n_prompt: int = 50, n_samples: int = 10, dir: str = ".") -> None: 108 | """ 109 | Generate text with different number of tokens for a specific number of samples. 110 | 111 | :param n_prompt: Number of datapoints for a specific combination of token size and number of 112 | samples. Defaults to 50. 113 | :type n_prompt: int 114 | :param n_samples: Number of samples. Default to 10. 115 | :type n_samples: int 116 | :param dir: Path to the output directory. Defaults to the current directory. 117 | :type dir: str 118 | """ 119 | 120 | fake = Faker() 121 | fake.seed_instance(int(random.Random(time.now().microsecond).random() * 1000)) 122 | 123 | fake.name() 124 | fake.address() 125 | 126 | encoding = tiktoken.get_encoding("cl100k_base") 127 | 128 | for length in range(200, 4200, 200): 129 | len_samples = [] 130 | for _ in range(n_prompt): 131 | samples = [] 132 | for _ in range(n_samples): 133 | temp = fake.text(max_nb_chars=length*10).replace("\n", " ") 134 | while len(encoding.encode(temp)) < length: 135 | temp += fake.text(max_nb_chars=length*10).replace("\n", " ") 136 | 137 | final_dimension = len(encoding.encode(temp)) 138 | 139 | # Add the samples to the list and keep only around the desired token length 140 | samples.append(temp[0:int(len(temp) * (length / final_dimension))]) 141 | len_samples.append(samples) 142 | 143 | with open(f"{dir}/{length}_samples.json", "w") as f: 144 | json_data = [{"index": i, "samples": samples} for i, samples in enumerate(len_samples)] 145 | json.dump({"data": json_data}, f, indent=4) 146 | 147 | 148 | if __name__ == "__main__": 149 | print("Performance test\n") 150 | 151 | for sample_count in [2, 4, 6, 8, 10]: 152 | print(f"\n\n\n#########################\n#\t{sample_count} SAMPLES\t#\n#########################") 153 | current_dir = os.path.dirname(os.path.abspath(__file__)) + f"/{sample_count}_samples" 154 | os.makedirs(current_dir, exist_ok=True) 155 | text_gen(20, n_samples=sample_count, dir=f"{sample_count}_samples") 156 | start(current_dir, start=StartingPoint.EMBEDDINGS, n_samples=sample_count) 157 | -------------------------------------------------------------------------------- /examples/vision/README.md: -------------------------------------------------------------------------------- 1 | # Vision Hallucination Evaluation with CheckEmbed 2 | 3 | This example demonstrates an end-to-end experiment for assessing hallucinations in image generation using CheckEmbed and Stable Diffusion 3.5. 4 | 5 | ## Structure 6 | 7 | ``` 8 | imgs/ 9 | └── counting_items/ # Images generated for this experiment 10 | main.py # Script to generate images, embeddings, and run CheckEmbed 11 | README.md # This document 12 | ``` 13 | 14 | ## Usage 15 | 16 | ```bash 17 | python main.py --start_idx 0 --end_idx 8 18 | ``` 19 | Varying the `--start_idx` and `--end_idx` parameters allows you to process in parallel. However, run the CheckEmbed step sequentially. 
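For instance, a minimal sketch of such a parallel run, assuming that `--start_idx`/`--end_idx` select a contiguous, end-exclusive range of the eight hardcoded prompt groups (verify the argument handling in `main.py` before relying on this):

```bash
# Hypothetical split of the 8 prompt groups across two GPUs; the index semantics are an assumption.
CUDA_VISIBLE_DEVICES=0 python main.py --start_idx 0 --end_idx 4 &
CUDA_VISIBLE_DEVICES=1 python main.py --start_idx 4 --end_idx 8 &
wait
# Afterwards, run the CheckEmbed step once, sequentially.
```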
20 | 21 | The prompts are hardcoded in `main.py` and are designed to generate images with a specific number of items. 22 | 23 | * Outputs: 24 | * `imgs/counting_items/`: Generated PNG images. 25 | * `clip_embeddings/counting_items/`: JSON files of CLIP embeddings. 26 | * `checkembed_outputs/counting_items/`: CheckEmbed result JSONs. 27 | 28 | ## Configuration 29 | 30 | * Paths in `main.py` (e.g., `path/to/...`) should be updated to your local directories before running. 31 | * Modify `input_prompts` in `main.py` to extend or change prompt sets. 32 | 33 | ## Results 34 | 35 | * Compare CheckEmbed scores against manual correctness counts to evaluate precision. 36 | -------------------------------------------------------------------------------- /examples/vision/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Eric Schreiber 8 | # 9 | # contributions: Lorenzo Paleari 10 | 11 | import logging 12 | import os 13 | from typing import Any, List, Union 14 | 15 | from PIL.Image import Image 16 | 17 | from CheckEmbed import embedding_models, vision_models 18 | from CheckEmbed.parser import Parser 19 | from CheckEmbed.scheduler import Scheduler, StartingPoint 20 | 21 | input_prompts = [ 22 | ["One red apple on a white background", 23 | "Two red apples on a white background", 24 | "Three red apples on a white background", 25 | "Four red apples on a white background", 26 | "Five red apples on a white background", 27 | ], 28 | ["One yellow tennis ball on a white background", 29 | "Two yellow tennis balls on a white background", 30 | "Three yellow tennis balls on a white background", 31 | "Four yellow tennis balls on a white background", 32 | "Five yellow tennis balls on a white background", 33 | ], 34 | ["One orange on a white background", 35 | "Two oranges on a white background", 36 | "Three oranges on a white background", 37 | "Four oranges on a white background", 38 | "Five oranges on a white background", 39 | ], 40 | ["One yellow lemon on a white background", 41 | "Two yellow lemons on a white background", 42 | "Three yellow lemons on a white background", 43 | "Four yellow lemons on a white background", 44 | "Five yellow lemons on a white background", 45 | ], 46 | ["One green lime on a white background", 47 | "Two green limes on a white background", 48 | "Three green limes on a white background", 49 | "Four green limes on a white background", 50 | "Five green limes on a white background", 51 | ], 52 | ["One red tomato on a white background", 53 | "Two red tomatoes on a white background", 54 | "Three red tomatoes on a white background", 55 | "Four red tomatoes on a white background", 56 | "Five red tomatoes on a white background", 57 | ], 58 | ["One yellow banana on a white background", 59 | "Two yellow bananas on a white background", 60 | "Three yellow bananas on a white background", 61 | "Four yellow bananas on a white background", 62 | "Five yellow bananas on a white background", 63 | ], 64 | ["One blue circle on a white background", 65 | "Two blue circles on a white background", 66 | "Three blue circles on a white background", 67 | "Four blue circles on a white background", 68 | "Five blue circles on a white background", 69 | ] 70 | ] 71 | 72 | class CustomParser(Parser): 73 | """ 74 | The CustomParser class handles the dataset parsing. 
75 | 76 | Inherits from the Parser class and implements its abstract methods. 77 | """ 78 | 79 | def __init__(self, dataset_path: str, list: List[str]) -> None: 80 | """ 81 | Initialize the parser. 82 | 83 | :param dataset_path: The path to the dataset. 84 | :type dataset_path: str 85 | :param list: The list of input prompts. 86 | :type list: List[str] 87 | """ 88 | super().__init__(dataset_path) 89 | self.list = list 90 | 91 | def prompt_generation(self, custom_inputs: Any = None) -> List[str]: 92 | """ 93 | Parse the dataset and generate the prompts for the model. 94 | 95 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 96 | :type custom_inputs: Any 97 | :return: List of prompts. 98 | :rtype: List[str] 99 | """ 100 | prompts = [] 101 | for item in self.list: 102 | prompts.extend(item) 103 | 104 | return prompts 105 | 106 | def ground_truth_extraction(self, custom_inputs: Any = None) -> List[str]: 107 | """ 108 | Parse the dataset and extract the ground truth. 109 | 110 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 111 | :type custom_inputs: Any 112 | :return: List of ground truths. 113 | :rtype: List[str] 114 | """ 115 | pass 116 | 117 | def answer_parser(self, responses: List[List[Union[str, Image]]], custom_inputs: Any = None) -> List[List[Union[str, Image]]]: 118 | """ 119 | Parse the responses from the model: Return the responses as they are. 120 | 121 | :param responses: The responses from the model. 122 | :type responses: List[List[Union[str, Image]]] 123 | :param custom_inputs: The custom inputs to the parser. Defaults to None. 124 | :type custom_inputs: Any 125 | :return: The parsed responses. 126 | :rtype: List[List[Union[str, Image]]] 127 | """ 128 | return responses 129 | 130 | 131 | def start(current_dir: str, list: List[str]) -> None: 132 | """ 133 | Execute the vision use case. 134 | 135 | :param current_dir: Directory path from the the script is called. 136 | :type current_dir: str 137 | :param list: The list of input prompts. 138 | :type list: List[str] 139 | """ 140 | 141 | # Initialize the parser, the vision and embedding models 142 | customParser = CustomParser( 143 | dataset_path = current_dir, 144 | list = list 145 | ) 146 | 147 | stable_diffusion = vision_models.StableDiffusion3( 148 | model_name = "stabilityai/stable-diffusion-3.5-medium", 149 | cache = False, 150 | ) 151 | 152 | clip_vit_large = embedding_models.ClipVitLarge( 153 | model_name = "openai/clip-vit-large-patch14", 154 | cache = False, 155 | ) 156 | 157 | # Initialize the scheduler 158 | scheduler = Scheduler( 159 | current_dir, 160 | logging_level = logging.DEBUG, 161 | budget = 12, 162 | parser = customParser, 163 | lm = [stable_diffusion], 164 | embedding_lm = [clip_vit_large], 165 | ) 166 | 167 | # The order of lm_names and embedding_lm_names should be the same 168 | # as the order of the generation models and embedding models respectively. 
169 | scheduler.run( 170 | startingPoint = StartingPoint.PROMPT, 171 | bertScore = False, 172 | selfCheckGPT = False, 173 | llm_as_a_judge = False, 174 | vision = True, 175 | rebase_results = True, 176 | num_samples = 10, 177 | device = "cuda", 178 | batch_size = 64 # it may be necessary to reduce the batch size if the model is too large 179 | ) 180 | 181 | if __name__ == "__main__": 182 | current_dir = os.path.dirname(os.path.abspath(__file__)) 183 | start(current_dir, input_prompts) 184 | -------------------------------------------------------------------------------- /examples/wiki_bio/README.md: -------------------------------------------------------------------------------- 1 | # WikiBio Benchmark 2 | 3 | This example uses a subset of the WikiBio dataset (Lebret et al., 2016) that was modified by Manakul et al. (2023) for their evaluation of SelfCheckGPT. It consists of 238 documents based on Wikipedia articles, which were used to generate samples in which hallucinations were introduced. Each sentence of those samples was manually labeled as either “major inaccurate”, “minor inaccurate”, or “accurate”. 4 | 5 | ## Data 6 | 7 | The dataset and the script that converts sentence scores into passage scores are located in the `data` directory. 8 | To download the dataset and recompute the passage scores, run the following commands: 9 | 10 | ```bash 11 | cd data 12 | python3 download.py 13 | python3 passage_scores.py 14 | ``` 15 | 16 | ## Runtime / Cost Estimation 17 | 18 | The estimated compute time for running the evaluation is approximately 36 hours on an NVIDIA A100-SXM-40GB. 19 | 20 | The sample step is skipped, since the samples are already provided in the dataset. Costs only occur for the embedding with the OpenAI models. 21 | 22 | The embedding model from OpenAI has a cost of $0.13 / 1M tokens, which results in an approximate cost of $0.65 for the evaluation of this example. 23 | 24 | Using LLM-as-a-Judge will cost around $1. 25 | -------------------------------------------------------------------------------- /examples/wiki_bio/data/download.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file.
6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | from datasets import load_dataset 10 | import json 11 | 12 | ds = load_dataset("potsawee/wiki_bio_gpt3_hallucination") 13 | ds = ds["evaluation"] 14 | 15 | features = ['gpt3_text', 'wiki_bio_text', 'gpt3_sentences', 'annotation', 'wiki_bio_test_idx', 'gpt3_text_samples'] 16 | dataset = {} 17 | for feat in features: 18 | dataset.update({feat: ds[feat]}) 19 | 20 | dataset_final = {} 21 | for i in range(len(dataset[features[0]])): 22 | dataset_passage = {} 23 | for feat in features: 24 | dataset_passage.update({feat: dataset[feat][i]}) 25 | name = f"passage_{i}" 26 | dataset_final.append({ 27 | name: dataset_passage 28 | }) 29 | 30 | with open("dataset.json", "w") as f: 31 | json.dump(dataset_final, f, indent=4) 32 | -------------------------------------------------------------------------------- /examples/wiki_bio/data/passage_scores.json: -------------------------------------------------------------------------------- 1 | { 2 | "passage_0": 0.0, 3 | "passage_1": 21.428571428571427, 4 | "passage_2": 50.0, 5 | "passage_3": 5.555555555555555, 6 | "passage_4": 54.54545454545454, 7 | "passage_5": 36.36363636363637, 8 | "passage_6": 0.0, 9 | "passage_7": 64.28571428571429, 10 | "passage_8": 56.25, 11 | "passage_9": 33.33333333333333, 12 | "passage_10": 75.0, 13 | "passage_11": 66.66666666666666, 14 | "passage_12": 28.57142857142857, 15 | "passage_13": 85.0, 16 | "passage_14": 50.0, 17 | "passage_15": 42.857142857142854, 18 | "passage_16": 8.333333333333332, 19 | "passage_17": 100.0, 20 | "passage_18": 35.714285714285715, 21 | "passage_19": 75.0, 22 | "passage_20": 45.0, 23 | "passage_21": 15.384615384615385, 24 | "passage_22": 80.0, 25 | "passage_23": 50.0, 26 | "passage_24": 16.666666666666664, 27 | "passage_25": 0.0, 28 | "passage_26": 88.88888888888889, 29 | "passage_27": 44.44444444444444, 30 | "passage_28": 81.81818181818183, 31 | "passage_29": 37.5, 32 | "passage_30": 6.25, 33 | "passage_31": 25.0, 34 | "passage_32": 43.75, 35 | "passage_33": 0.0, 36 | "passage_34": 50.0, 37 | "passage_35": 0.0, 38 | "passage_36": 59.09090909090909, 39 | "passage_37": 22.22222222222222, 40 | "passage_38": 71.42857142857143, 41 | "passage_39": 42.857142857142854, 42 | "passage_40": 100.0, 43 | "passage_41": 40.0, 44 | "passage_42": 66.66666666666666, 45 | "passage_43": 7.142857142857142, 46 | "passage_44": 36.36363636363637, 47 | "passage_45": 0.0, 48 | "passage_46": 50.0, 49 | "passage_47": 100.0, 50 | "passage_48": 81.25, 51 | "passage_49": 27.77777777777778, 52 | "passage_50": 50.0, 53 | "passage_51": 0.0, 54 | "passage_52": 90.0, 55 | "passage_53": 81.25, 56 | "passage_54": 83.33333333333334, 57 | "passage_55": 0.0, 58 | "passage_56": 0.0, 59 | "passage_57": 60.0, 60 | "passage_58": 0.0, 61 | "passage_59": 45.0, 62 | "passage_60": 75.0, 63 | "passage_61": 68.75, 64 | "passage_62": 83.33333333333334, 65 | "passage_63": 28.57142857142857, 66 | "passage_64": 35.714285714285715, 67 | "passage_65": 45.83333333333333, 68 | "passage_66": 0.0, 69 | "passage_67": 16.666666666666664, 70 | "passage_68": 30.0, 71 | "passage_69": 65.0, 72 | "passage_70": 44.44444444444444, 73 | "passage_71": 68.75, 74 | "passage_72": 83.33333333333334, 75 | "passage_73": 22.22222222222222, 76 | "passage_74": 100.0, 77 | "passage_75": 0.0, 78 | "passage_76": 0.0, 79 | "passage_77": 77.77777777777779, 80 | "passage_78": 55.55555555555556, 81 | "passage_79": 8.333333333333332, 82 | "passage_80": 60.0, 83 | "passage_81": 62.5, 84 | "passage_82": 41.66666666666667, 85 | "passage_83": 
64.28571428571429, 86 | "passage_84": 10.0, 87 | "passage_85": 16.666666666666664, 88 | "passage_86": 60.0, 89 | "passage_87": 50.0, 90 | "passage_88": 75.0, 91 | "passage_89": 0.0, 92 | "passage_90": 75.0, 93 | "passage_91": 60.0, 94 | "passage_92": 44.44444444444444, 95 | "passage_93": 50.0, 96 | "passage_94": 83.33333333333334, 97 | "passage_95": 6.25, 98 | "passage_96": 41.66666666666667, 99 | "passage_97": 64.28571428571429, 100 | "passage_98": 43.75, 101 | "passage_99": 54.54545454545454, 102 | "passage_100": 56.25, 103 | "passage_101": 28.57142857142857, 104 | "passage_102": 91.66666666666666, 105 | "passage_103": 91.66666666666666, 106 | "passage_104": 83.33333333333334, 107 | "passage_105": 16.666666666666664, 108 | "passage_106": 81.25, 109 | "passage_107": 70.0, 110 | "passage_108": 90.0, 111 | "passage_109": 83.33333333333334, 112 | "passage_110": 40.909090909090914, 113 | "passage_111": 0.0, 114 | "passage_112": 59.09090909090909, 115 | "passage_113": 0.0, 116 | "passage_114": 0.0, 117 | "passage_115": 57.14285714285714, 118 | "passage_116": 16.666666666666664, 119 | "passage_117": 68.75, 120 | "passage_118": 16.666666666666664, 121 | "passage_119": 65.0, 122 | "passage_120": 62.5, 123 | "passage_121": 30.0, 124 | "passage_122": 31.818181818181817, 125 | "passage_123": 20.0, 126 | "passage_124": 11.11111111111111, 127 | "passage_125": 34.61538461538461, 128 | "passage_126": 92.85714285714286, 129 | "passage_127": 62.5, 130 | "passage_128": 35.714285714285715, 131 | "passage_129": 0.0, 132 | "passage_130": 31.818181818181817, 133 | "passage_131": 75.0, 134 | "passage_132": 16.666666666666664, 135 | "passage_133": 5.0, 136 | "passage_134": 75.0, 137 | "passage_135": 87.5, 138 | "passage_136": 37.5, 139 | "passage_137": 40.0, 140 | "passage_138": 90.0, 141 | "passage_139": 50.0, 142 | "passage_140": 35.714285714285715, 143 | "passage_141": 20.0, 144 | "passage_142": 83.33333333333334, 145 | "passage_143": 35.714285714285715, 146 | "passage_144": 35.714285714285715, 147 | "passage_145": 40.0, 148 | "passage_146": 81.25, 149 | "passage_147": 25.0, 150 | "passage_148": 63.63636363636363, 151 | "passage_149": 30.0, 152 | "passage_150": 8.333333333333332, 153 | "passage_151": 92.85714285714286, 154 | "passage_152": 0.0, 155 | "passage_153": 50.0, 156 | "passage_154": 0.0, 157 | "passage_155": 13.636363636363635, 158 | "passage_156": 33.33333333333333, 159 | "passage_157": 15.0, 160 | "passage_158": 60.0, 161 | "passage_159": 33.33333333333333, 162 | "passage_160": 60.0, 163 | "passage_161": 20.0, 164 | "passage_162": 68.18181818181817, 165 | "passage_163": 18.75, 166 | "passage_164": 56.25, 167 | "passage_165": 0.0, 168 | "passage_166": 100.0, 169 | "passage_167": 0.0, 170 | "passage_168": 0.0, 171 | "passage_169": 100.0, 172 | "passage_170": 64.28571428571429, 173 | "passage_171": 40.0, 174 | "passage_172": 37.5, 175 | "passage_173": 78.57142857142857, 176 | "passage_174": 77.77777777777779, 177 | "passage_175": 71.42857142857143, 178 | "passage_176": 100.0, 179 | "passage_177": 68.75, 180 | "passage_178": 50.0, 181 | "passage_179": 60.0, 182 | "passage_180": 0.0, 183 | "passage_181": 80.0, 184 | "passage_182": 100.0, 185 | "passage_183": 0.0, 186 | "passage_184": 0.0, 187 | "passage_185": 12.5, 188 | "passage_186": 25.0, 189 | "passage_187": 31.818181818181817, 190 | "passage_188": 80.0, 191 | "passage_189": 40.0, 192 | "passage_190": 78.57142857142857, 193 | "passage_191": 30.0, 194 | "passage_192": 37.5, 195 | "passage_193": 50.0, 196 | "passage_194": 50.0, 197 | "passage_195": 
0.0, 198 | "passage_196": 43.75, 199 | "passage_197": 87.5, 200 | "passage_198": 95.83333333333334, 201 | "passage_199": 43.75, 202 | "passage_200": 68.75, 203 | "passage_201": 100.0, 204 | "passage_202": 22.22222222222222, 205 | "passage_203": 65.0, 206 | "passage_204": 0.0, 207 | "passage_205": 100.0, 208 | "passage_206": 55.00000000000001, 209 | "passage_207": 10.0, 210 | "passage_208": 0.0, 211 | "passage_209": 18.75, 212 | "passage_210": 33.33333333333333, 213 | "passage_211": 56.25, 214 | "passage_212": 81.25, 215 | "passage_213": 42.857142857142854, 216 | "passage_214": 55.00000000000001, 217 | "passage_215": 40.0, 218 | "passage_216": 65.0, 219 | "passage_217": 31.25, 220 | "passage_218": 0.0, 221 | "passage_219": 58.333333333333336, 222 | "passage_220": 100.0, 223 | "passage_221": 70.0, 224 | "passage_222": 33.33333333333333, 225 | "passage_223": 45.0, 226 | "passage_224": 92.85714285714286, 227 | "passage_225": 60.0, 228 | "passage_226": 75.0, 229 | "passage_227": 62.5, 230 | "passage_228": 45.0, 231 | "passage_229": 0.0, 232 | "passage_230": 44.44444444444444, 233 | "passage_231": 8.333333333333332, 234 | "passage_232": 66.66666666666666, 235 | "passage_233": 14.285714285714285, 236 | "passage_234": 100.0, 237 | "passage_235": 15.0, 238 | "passage_236": 0.0, 239 | "passage_237": 44.44444444444444 240 | } -------------------------------------------------------------------------------- /examples/wiki_bio/data/passage_scores.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 ETH Zurich. 2 | # All rights reserved. 3 | # 4 | # Use of this source code is governed by a BSD-style license that can be 5 | # found in the LICENSE file. 6 | # 7 | # main author: Lorenzo Paleari 8 | 9 | import json 10 | from collections import defaultdict 11 | 12 | def calculate_accuracy_percentage(accurate: float, total: int) -> float: 13 | """ 14 | Function to calculate accuracy percentage. 15 | 16 | :param accurate: Combined value of scores for the dataset. 17 | :type accurate: float 18 | :param total: Number of items in the dataset. 19 | :type total: int 20 | :return: Accuracy percentage. 
21 | :rtype: float 22 | """ 23 | if total == 0: 24 | return 0 25 | return (accurate / total) * 100 26 | 27 | 28 | # Load the dataset 29 | with open("dataset.json", "r") as f: 30 | dataset = json.load(f) 31 | 32 | # Initialize the new dataset 33 | categorized_dataset = defaultdict(list) 34 | 35 | # Iterate over the dataset 36 | for passage_number, value in dataset.items(): 37 | annotation = value["annotation"] 38 | 39 | # Count the number of accurate annotations 40 | accurate = sum(1 for label in annotation if label == "accurate") 41 | half_accurate = sum(0.5 for label in annotation if "minor" in label) 42 | 43 | # Calculate total annotations 44 | total_annotations = len(annotation) 45 | 46 | # Calculate accuracy percentage 47 | accuracy_percentage = calculate_accuracy_percentage(accurate + half_accurate, total_annotations) 48 | 49 | # Add the passage number to the respective category in the new dataset 50 | categorized_dataset[passage_number] = accuracy_percentage 51 | 52 | with open("./passage_scores.json", "w") as outfile: 53 | json.dump(categorized_dataset, outfile, indent=4) 54 | -------------------------------------------------------------------------------- /paper/README.md: -------------------------------------------------------------------------------- 1 | ## Plot Data 2 | 3 | The data used to create the figures of the arXiv preprint article can be 4 | found in the `results.tar.bz2` archive. Unpack the archive and run the 5 | file `plots.py`. 6 | 7 | ```bash 8 | tar xfj results.tar.bz2 9 | ``` 10 | -------------------------------------------------------------------------------- /paper/results.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spcl/CheckEmbed/008357ed0b6572575ec4c16daf52b549a9c38e25/paper/results.tar.bz2 -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "CheckEmbed" 7 | version = "0.0.1" 8 | authors = [ 9 | { name="Maciej Besta", email="maciej.besta@inf.ethz.ch" }, 10 | { name="Lorenzo Paleari", email="lpaleari@student.ethz.ch" }, 11 | { name="Ales Kubicek", email="akubicek@student.ethz.ch" }, 12 | { name="Robert Gerstenberger", email="gerstenberger.robert@gmail.com" }, 13 | { name="Patrick Iff", email="patrick.iff@inf.ethz.ch" }, 14 | { name="Eric Schreiber", email="eric.schreiber@inf.ethz.ch" }, 15 | ] 16 | description = "Python package for 'CheckEmbed'" 17 | readme = "README.md" 18 | license = {file = "LICENSE"} 19 | requires-python = ">=3.8.12,<3.11" 20 | classifiers = [ 21 | "Programming Language :: Python :: 3", 22 | "Operating System :: OS Independent", 23 | ] 24 | dependencies = [ 25 | "accelerate>=0.30.1,<0.35.0", 26 | "backoff>=2.2.1,<3.0.0", 27 | "bert-score>=0.3.13,<1.0.0", 28 | "diffusers==0.33.1", 29 | "faker>=25.8.0,<26.0.0", 30 | "huggingface-hub>=0.30.0,<0.31.0", 31 | "joblib>=1.4.2,<2.0.0", 32 | "langchain>=0.2.11", 33 | "langchain_ollama", 34 | "matplotlib>=3.7.1,<4.0.0", 35 | "numpy>=1.24.3,<2.0.0", 36 | "openai>=1.0.0,<2.0.0", 37 | "packaging>=24.1,<25.0", 38 | "pandas>2.0.0,<3.0.0", 39 | "pillow>=11.2.1", 40 | "pydantic>=2.8.2", 41 | "scikit-learn>=1.5.1,<2.0.0", 42 | "scipy>1.15.0,<2.0.0", 43 | "seaborn>=0.13.2,<0.14.0", 44 | "selfcheckgpt>=0.1.7,<1.0.0", 45 | "sentencepiece>=0.2.0,<0.3.0", 46 | "tiktoken>=0.7.0,<1.0.0", 47 | 
"torch==2.6.0", 48 | "transformers>=4.51.3,<5.0.0", 49 | "transformers[torch]", 50 | "wheel>=0.43.0,<1.0.0", 51 | "vllm>=0.8.2,<1.0.0" 52 | ] 53 | 54 | [project.optional-dependencies] 55 | cuda = [ 56 | "xformers>=0.0.27,<0.0.40", 57 | "flash-attn>=2.5.6,<3.0.0", 58 | ] 59 | 60 | [project.urls] 61 | Homepage = "https://github.com/spcl/CheckEmbed" 62 | 63 | [project.scripts] 64 | --------------------------------------------------------------------------------