├── activation_steering ├── console.py ├── __init__.py ├── config.py ├── steering_dataset.py ├── utils.py ├── leash_layer.py ├── steering_vector.py └── malleable_model.py ├── docs ├── console.md ├── steering_dataset.md ├── utils.md ├── config.md ├── leash_layer.md ├── demo-data │ └── behavior_refusal.json ├── steering_vector.md ├── faq.md ├── malleable_model.md └── quickstart.md ├── pyproject.toml ├── .gitignore ├── README.md └── LICENSE /activation_steering/console.py: -------------------------------------------------------------------------------- 1 | from rich.console import Console 2 | console = Console(quiet=False) -------------------------------------------------------------------------------- /docs/console.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 5 | # module `console` 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /activation_steering/__init__.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | 3 | import numpy as np 4 | import torch 5 | from transformers import PreTrainedModel, PreTrainedTokenizerBase 6 | 7 | 8 | from . import malleable_model, steering_dataset, steering_vector 9 | from .malleable_model import MalleableModel 10 | from .steering_dataset import SteeringDataset 11 | from .steering_vector import SteeringVector -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "activation_steering" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Bruce W. Lee "] 6 | maintainers = ["Bruce W. Lee "] 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.10" 11 | numpy = "^1.26.4" 12 | scikit-learn = "^1.5.0" 13 | torch = "^2.3.1" 14 | transformers = "^4.41.2" 15 | accelerate = "^0.31.0" 16 | tqdm = "^4.66.4" 17 | gguf = "^0.6.0" 18 | rich = "^13.7.1" 19 | matplotlib = "^3.9.0" 20 | einops = "^0.8.0" 21 | 22 | 23 | [build-system] 24 | requires = ["poetry-core"] 25 | build-backend = "poetry.core.masonry.api" 26 | -------------------------------------------------------------------------------- /docs/steering_dataset.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 5 | # module `steering_dataset` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 |

15 | 16 | ## class `SteeringDataset` 17 | Create a formatted dataset for steering a language model. 18 | 19 | This class takes a list of examples (either contrastive messages or contrastive text) and a tokenizer, and formats the examples into a dataset of ContrastivePair objects. 20 | 21 |

22 | 23 | ### method `__init__` 24 | 25 | ```python 26 | __init__( 27 | tokenizer: PreTrainedTokenizerBase, 28 | examples: List, 29 | suffixes: List[Tuple[str, str]] = None, 30 | disable_suffixes: bool = False, 31 | use_chat_template: bool = True, 32 | system_message: Optional[Tuple[str, str]] = None 33 | ) 34 | ``` 35 | 36 | Initialize the SteeringDataset. 37 | 38 | 39 | 40 | **Args:** 41 | 42 | - `tokenizer`: The tokenizer used to tokenize and format the examples. 43 | - `examples`: A list of examples, either contrastive messages or contrastive text. 44 | - `suffixes`: A list of suffixes to append to the formatted dataset. If None, default suffixes will be used. 45 | - `disable_suffixes`: If True, no suffixes will be appended to the examples. 46 | - `use_chat_template`: If True, applies the chat template to the examples. 47 | - `system_message`: Optional system message to be included in the chat template. 48 | 49 | 50 | 51 | 52 | --- 53 | 54 |

55 | 56 | ### method `clean_text` 57 | 58 | ```python 59 | clean_text(text: str) → str 60 | ``` 61 | 62 | Clean the input text by replacing special tokens. 63 | 64 | 65 | 66 | **Args:** 67 | 68 | - `text`: The input text to be cleaned. 69 | 70 | 71 | 72 | **Returns:** 73 | The cleaned text with special tokens replaced. 74 | 75 | 76 | -------------------------------------------------------------------------------- /docs/utils.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 5 | # module `utils` 6 | 7 | 8 | 9 | 10 | 11 | --- 12 | 13 |

14 | 15 | ## function `custom_progress` 16 | 17 | ```python 18 | custom_progress(iterable, description) 19 | ``` 20 | 21 | Create a custom progress bar for iterating over items. 22 | 23 | 24 | 25 | **Args:** 26 | 27 | - `iterable`: The iterable to process. 28 | - `description`: A string describing the progress bar. 29 | 30 | 31 | 32 | **Yields:** 33 | Items from the iterable. 34 | 35 | 36 | --- 37 | 38 |

39 | 40 | ## function `return_default_suffixes` 41 | 42 | ```python 43 | return_default_suffixes() 44 | ``` 45 | 46 | Return a list of default suffixes used in the CAIS representation engineering paper. 47 | 48 | 49 | 50 | **Returns:** 51 | A list of string suffixes. 52 | 53 | 54 | --- 55 | 56 |

57 | 58 | ## class `LayerControlParams` 59 | LayerControlParams(control: torch.Tensor | None = None, operator: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = at 0x7f4069cc2560>) 60 | 61 |

62 | 63 | ### method `__init__` 64 | 65 | ```python 66 | __init__( 67 | control: Tensor | None = None, 68 | operator: Callable[[Tensor, Tensor], Tensor] = at 0x7f4069cc2560> 69 | ) → None 70 | ``` 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | --- 80 | 81 |

82 | 83 | ### classmethod `default` 84 | 85 | ```python 86 | default() 87 | ``` 88 | 89 | Return a default instance of LayerControlParams. 90 | 91 | 92 | 93 | **Returns:** 94 | A LayerControlParams instance with default values. 95 | 96 | --- 97 | 98 |

99 | 100 | ### method `` 101 | 102 | ```python 103 | (current, control) 104 | ``` 105 | 106 | 107 | 108 | 109 | 110 | 111 | --- 112 | 113 |

114 | 115 | ## class `ContrastivePair` 116 | A dataclass representing a pair of contrasting strings. 117 | 118 | 119 | 120 | **Attributes:** 121 | 122 | - `positive`: The positive string in the pair. 123 | - `negative`: The negative string in the pair. 124 | 125 |

126 | 127 | ### method `__init__` 128 | 129 | ```python 130 | __init__(positive: str, negative: str) → None 131 | ``` 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Python](https://img.shields.io/badge/python-3.10+-blue.svg) 2 | 3 | # Activation Steering 4 | 5 | 👉 (Aug-2025) Added `pca_pairwise` method and set as default. Use `method="pca_pairwise"` to reproduce results closer to those reported in the paper. Colab demos (see below) are fixed accordingly, and they should work as expected. 6 | 7 | 👉 (Jul-2025) Bug fixed: PCA_centering (@Reason239) 8 | 9 | 👉 (Apr-2025) Conditional Activation Steering is a spotlight paper at ICLR 2025! 10 | 11 | 👉 (Nov-2024) A few Colab demos are added. 12 | 13 | 👉 (Sep-2024) Preprint released on [arXiv](https://arxiv.org/abs/2409.05907). 14 | 15 | ## Overview 16 | 17 | This is a general-purpose activation steering library to (1) extract vectors and (2) steer model behavior. We release this library alongside our recent paper on [*Programming Refusal with Conditional Activation Steering*](https://arxiv.org/abs/2409.05907) to provide an intuitive toolchain for activation steering efforts. 18 | 19 | ## Installation 20 | ```bash 21 | git clone https://github.com/IBM/activation-steering 22 | 23 | pip install -e activation-steering 24 | ``` 25 | 26 | ## Activation Steering 27 | Activation steering is a technique for influencing the behavior of language models by modifying their internal activations during inference. This library provides tools for: 28 | 29 | - Extracting steering vectors from contrastive examples 30 | - Applying steering vectors to modify model behavior 31 | 32 | This part is conceptually similar to [*Steering Language Models With Activation Engineering*](https://arxiv.org/abs/2308.10248), but our code implementation could be different. 33 | 34 | ## Conditional Activation Steering 35 | Conditional activation steering selectively applies or withholds activation steering based on the input context. Conditional activation steering extends the activation steering framework by introducing: 36 | 37 | - Context-dependent control capabilities through condition vectors 38 | - Logical composition of multiple condition vectors 39 | 40 | Refer to our [*paper*](https://arxiv.org/abs/2409.05907) and [*documentation*](docs/quickstart.md) for detailed implementation and usage of activation steering and conditional activation steering. 41 | 42 | ## Documentation 43 | Refer to /docs to understand this library. We recommend starting with Quick Start Tutorial as it covers most concepts that you need to get started with activation steering and conditional activation steering. 44 | 45 | - Quick Start Tutorial (10 minutes ~ 60 minutes, depending on your hardware) 👉 [here!](docs/quickstart.md) 46 | - FAQ 👉 [here!](docs/faq.md) 47 | 48 | ## Colab Demos 49 | 50 | - Adding Refusal Behavior to LLaMA 3.1 8B Inst 👉 [here!](https://colab.research.google.com/drive/1IpAPMFHZW6CNrE0L16TXSvIApAK9jAFZ?usp=sharing) 51 | - Adding CoT Behavior to Gemma 2 9B 👉 [here!](https://colab.research.google.com/drive/1dnG000syxHwOt-Z9_bpRLnBbfugI_CBh?usp=sharing) 52 | - Making Hermes 2 Pro Conditionally Refuse Legal Instructions 👉 [here!](https://colab.research.google.com/drive/18lOzaFOK4CB_mYe9jlQbJCdHBDlhGxcQ?usp=sharing) 53 | 54 | ## Acknowledgement 55 | This library builds on top of the excellent work done in the following repositories: 56 | 57 | - [vgel/repeng](https://github.com/vgel/repeng) 58 | - [andyzoujm/representation-engineering](https://github.com/andyzoujm/representation-engineering) 59 | - [nrimsky/CAA](https://github.com/nrimsky/CAA) 60 | 61 | Some parts of the documentation for this library are generated by 62 | 63 | - [ml-tooling/lazydocs](https://github.com/ml-tooling/lazydocs) > lazydocs activation_steering/ --no-watermark 64 | 65 | ## Citation 66 | 67 | ``` 68 | @misc{lee2024programmingrefusalconditionalactivation, 69 | title={Programming Refusal with Conditional Activation Steering}, 70 | author={Bruce W. Lee and Inkit Padhi and Karthikeyan Natesan Ramamurthy and Erik Miehling and Pierre Dognin and Manish Nagireddy and Amit Dhurandhar}, 71 | year={2024}, 72 | eprint={2409.05907}, 73 | archivePrefix={arXiv}, 74 | primaryClass={cs.LG}, 75 | url={https://arxiv.org/abs/2409.05907}, 76 | } 77 | ``` 78 | -------------------------------------------------------------------------------- /activation_steering/config.py: -------------------------------------------------------------------------------- 1 | from rich.console import Console 2 | import os 3 | from datetime import datetime 4 | 5 | class LogConfig: 6 | """ 7 | Configuration class for logging settings. 8 | 9 | Attributes: 10 | enabled (bool): Whether logging is enabled. 11 | file_output (bool): Whether to output logs to a file. 12 | file_path (str): Path to the log file. 13 | """ 14 | def __init__(self, enabled=True): 15 | """ 16 | Initialize a LogConfig instance. 17 | 18 | Args: 19 | enabled (bool): Initial enabled state for logging. 20 | """ 21 | self.enabled = enabled 22 | self.file_output = False 23 | self.file_path = None 24 | 25 | class GlobalConfig: 26 | """ 27 | Singleton class for global configuration settings. 28 | 29 | Class Attributes: 30 | console (Console): Rich console instance for pretty printing. 31 | log_configs (dict): Dictionary of LogConfig instances for different classes. 32 | log_directory (str): Directory for storing log files. 33 | """ 34 | _instance = None 35 | console = Console() 36 | log_configs = { 37 | "global": LogConfig(enabled=True), 38 | "LeashLayer": LogConfig(enabled=False), 39 | "MalleableModel": LogConfig(enabled=True), 40 | "SteeringVector": LogConfig(enabled=True), 41 | "SteeringDataset": LogConfig(enabled=True) 42 | } 43 | log_directory = "activation_steering_logs" 44 | _initialized = False 45 | 46 | def __new__(cls): 47 | if cls._instance is None: 48 | cls._instance = super(GlobalConfig, cls).__new__(cls) 49 | cls._instance.initialize_log_files() 50 | return cls._instance 51 | 52 | @classmethod 53 | def initialize_log_files(cls): 54 | """ 55 | Initialize log files for all configured classes. 56 | """ 57 | if not cls._initialized: 58 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 59 | os.makedirs(cls.log_directory, exist_ok=True) 60 | for class_name in cls.log_configs: 61 | cls.log_configs[class_name].file_path = os.path.join(cls.log_directory, f"{class_name}_{timestamp}.log") 62 | cls._initialized = True 63 | 64 | @classmethod 65 | def set_verbose(cls, verbose: bool, class_name: str = "global"): 66 | """ 67 | Set the verbose state for a specific class or globally. 68 | 69 | Args: 70 | verbose (bool): Whether to enable verbose logging. 71 | class_name (str): The class name to set verbose for. Defaults to "global". 72 | """ 73 | if class_name in cls.log_configs: 74 | cls.log_configs[class_name].enabled = verbose 75 | 76 | @classmethod 77 | def is_verbose(cls, class_name: str = "global"): 78 | """ 79 | Check if verbose logging is enabled for a specific class. 80 | 81 | Args: 82 | class_name (str): The class name to check. Defaults to "global". 83 | 84 | Returns: 85 | bool: True if verbose logging is enabled, False otherwise. 86 | """ 87 | return cls.log_configs[class_name].enabled and cls.log_configs["global"].enabled 88 | 89 | @classmethod 90 | def set_file_output(cls, enabled: bool, class_name: str = "global"): 91 | """ 92 | Set whether to output logs to a file for a specific class or globally. 93 | 94 | Args: 95 | enabled (bool): Whether to enable file output. 96 | class_name (str): The class name to set file output for. Defaults to "global". 97 | """ 98 | cls.initialize_log_files() # Ensure file paths are set 99 | if class_name in cls.log_configs: 100 | cls.log_configs[class_name].file_output = enabled 101 | 102 | @classmethod 103 | def should_log_to_file(cls, class_name: str): 104 | """ 105 | Check if logging to a file is enabled for a specific class. 106 | 107 | Args: 108 | class_name (str): The class name to check. 109 | 110 | Returns: 111 | bool: True if file logging is enabled, False otherwise. 112 | """ 113 | return cls.log_configs[class_name].file_output or cls.log_configs["global"].file_output 114 | 115 | @classmethod 116 | def get_file_path(cls, class_name: str): 117 | """ 118 | Get the log file path for a specific class. 119 | 120 | Args: 121 | class_name (str): The class name to get the file path for. 122 | 123 | Returns: 124 | str: The path to the log file. 125 | """ 126 | cls.initialize_log_files() # Ensure file paths are set 127 | return cls.log_configs[class_name].file_path 128 | 129 | def log(message: str, style: str = None, class_name: str = "global"): 130 | """ 131 | Log a message to the console and/or file based on the current configuration. 132 | 133 | Args: 134 | message (str): The message to log. 135 | style (str, optional): The style to apply to the console output. 136 | class_name (str): The class name associated with the log message. Defaults to "global". 137 | """ 138 | if GlobalConfig.is_verbose(class_name): 139 | GlobalConfig.console.print(message, style=style) 140 | 141 | if GlobalConfig.should_log_to_file(class_name): 142 | file_path = GlobalConfig.get_file_path(class_name) 143 | with open(file_path, "a") as f: 144 | f.write(f"{datetime.now().isoformat()} - {message}\n") -------------------------------------------------------------------------------- /docs/config.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 5 | # module `config` 6 | 7 | 8 | 9 | 10 | 11 | --- 12 | 13 |

14 | 15 | ## function `log` 16 | 17 | ```python 18 | log(message: str, style: str = None, class_name: str = 'global') 19 | ``` 20 | 21 | Log a message to the console and/or file based on the current configuration. 22 | 23 | 24 | 25 | **Args:** 26 | 27 | - `message` (str): The message to log. 28 | - `style` (str, optional): The style to apply to the console output. 29 | - `class_name` (str): The class name associated with the log message. Defaults to "global". 30 | 31 | 32 | --- 33 | 34 |

35 | 36 | ## class `LogConfig` 37 | Configuration class for logging settings. 38 | 39 | 40 | 41 | **Attributes:** 42 | 43 | - `enabled` (bool): Whether logging is enabled. 44 | - `file_output` (bool): Whether to output logs to a file. 45 | - `file_path` (str): Path to the log file. 46 | 47 |

48 | 49 | ### method `__init__` 50 | 51 | ```python 52 | __init__(enabled=True) 53 | ``` 54 | 55 | Initialize a LogConfig instance. 56 | 57 | 58 | 59 | **Args:** 60 | 61 | - `enabled` (bool): Initial enabled state for logging. 62 | 63 | 64 | 65 | 66 | 67 | --- 68 | 69 |

70 | 71 | ## class `GlobalConfig` 72 | Singleton class for global configuration settings. 73 | 74 | Class Attributes: console (Console): Rich console instance for pretty printing. log_configs (dict): Dictionary of LogConfig instances for different classes. log_directory (str): Directory for storing log files. 75 | 76 | 77 | 78 | 79 | --- 80 | 81 |

82 | 83 | ### classmethod `get_file_path` 84 | 85 | ```python 86 | get_file_path(class_name: str) 87 | ``` 88 | 89 | Get the log file path for a specific class. 90 | 91 | 92 | 93 | **Args:** 94 | 95 | - `class_name` (str): The class name to get the file path for. 96 | 97 | 98 | 99 | **Returns:** 100 | 101 | - `str`: The path to the log file. 102 | 103 | --- 104 | 105 |

106 | 107 | ### classmethod `initialize_log_files` 108 | 109 | ```python 110 | initialize_log_files() 111 | ``` 112 | 113 | Initialize log files for all configured classes. 114 | 115 | --- 116 | 117 |

118 | 119 | ### classmethod `is_verbose` 120 | 121 | ```python 122 | is_verbose(class_name: str = 'global') 123 | ``` 124 | 125 | Check if verbose logging is enabled for a specific class. 126 | 127 | 128 | 129 | **Args:** 130 | 131 | - `class_name` (str): The class name to check. Defaults to "global". 132 | 133 | 134 | 135 | **Returns:** 136 | 137 | - `bool`: True if verbose logging is enabled, False otherwise. 138 | 139 | --- 140 | 141 |

142 | 143 | ### classmethod `set_file_output` 144 | 145 | ```python 146 | set_file_output(enabled: bool, class_name: str = 'global') 147 | ``` 148 | 149 | Set whether to output logs to a file for a specific class or globally. 150 | 151 | 152 | 153 | **Args:** 154 | 155 | - `enabled` (bool): Whether to enable file output. 156 | - `class_name` (str): The class name to set file output for. Defaults to "global". 157 | 158 | --- 159 | 160 |

161 | 162 | ### classmethod `set_verbose` 163 | 164 | ```python 165 | set_verbose(verbose: bool, class_name: str = 'global') 166 | ``` 167 | 168 | Set the verbose state for a specific class or globally. 169 | 170 | 171 | 172 | **Args:** 173 | 174 | - `verbose` (bool): Whether to enable verbose logging. 175 | - `class_name` (str): The class name to set verbose for. Defaults to "global". 176 | 177 | --- 178 | 179 |

180 | 181 | ### classmethod `should_log_to_file` 182 | 183 | ```python 184 | should_log_to_file(class_name: str) 185 | ``` 186 | 187 | Check if logging to a file is enabled for a specific class. 188 | 189 | 190 | 191 | **Args:** 192 | 193 | - `class_name` (str): The class name to check. 194 | 195 | 196 | 197 | **Returns:** 198 | 199 | - `bool`: True if file logging is enabled, False otherwise. 200 | 201 | 202 | -------------------------------------------------------------------------------- /docs/leash_layer.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 5 | # module `leash_layer` 6 | 7 | 8 | 9 | 10 | 11 | 12 | --- 13 | 14 |

15 | 16 | ## class `LeashLayer` 17 | A wrapper layer that implements conditional activation steering for language models. 18 | 19 | This layer can be applied to existing model layers to enable fine-grained control over the model's behavior through steering and conditional activation. 20 | 21 | Class Attributes: condition_met: A defaultdict tracking whether conditions have been met. forward_calls: A defaultdict counting forward passes for each layer. condition_layers: Tracks which layers are condition layers. behavior_layers: Tracks which layers are behavior layers. condition_similarities: Stores condition similarities for each layer. 22 | 23 |

24 | 25 | ### method `__init__` 26 | 27 | ```python 28 | __init__(layer: Module, layer_id: int) → None 29 | ``` 30 | 31 | Initialize a LeashLayer. 32 | 33 | 34 | 35 | **Args:** 36 | 37 | - `layer`: The underlying layer to be wrapped. 38 | - `layer_id`: The ID of this layer in the model. 39 | 40 | 41 | 42 | 43 | --- 44 | 45 |

46 | 47 | ### method `compute_similarity` 48 | 49 | ```python 50 | compute_similarity(x: Tensor, y: Tensor) → float 51 | ``` 52 | 53 | Compute the cosine similarity between two tensors. 54 | 55 | 56 | 57 | **Args:** 58 | 59 | - `x`: First tensor. 60 | - `y`: Second tensor. 61 | 62 | 63 | 64 | **Returns:** 65 | The cosine similarity as a float. 66 | 67 | --- 68 | 69 |

70 | 71 | ### method `forward` 72 | 73 | ```python 74 | forward(hidden_states, *args, **kwargs) 75 | ``` 76 | 77 | Perform a forward pass through this layer, applying steering if configured. 78 | 79 | 80 | 81 | **Args:** 82 | 83 | - `hidden_states`: The input hidden states. 84 | - `*args`: Additional positional arguments for the underlying layer. 85 | - `**kwargs`: Additional keyword arguments for the underlying layer. 86 | 87 | 88 | 89 | **Returns:** 90 | The output of the underlying layer, potentially modified by steering. 91 | 92 | --- 93 | 94 |

95 | 96 | ### method `multisteer` 97 | 98 | ```python 99 | multisteer( 100 | behavior_vectors: List[Tensor], 101 | condition_projectors: List[Tensor], 102 | thresholds: List[float], 103 | use_ooi_preventive_normalization: bool = True, 104 | apply_behavior_on_first_call: bool = True, 105 | condition_comparator_threshold_is: List[str] = ['larger'], 106 | condition_threshold_comparison_modes: List[str] = ['mean'], 107 | rules: List[str] = None, 108 | **kwargs 109 | ) → None 110 | ``` 111 | 112 | Configure multi-steering for this layer. 113 | 114 | 115 | 116 | **Args:** 117 | 118 | - `behavior_vectors`: List of behavior vectors to apply. 119 | - `condition_projectors`: List of condition projectors to use. 120 | - `thresholds`: List of thresholds for condition activation. 121 | - `use_ooi_preventive_normalization`: Whether to use OOI preventive normalization. 122 | - `apply_behavior_on_first_call`: Whether to apply behavior on the first forward call. 123 | - `condition_comparator_threshold_is`: How to compare each condition to its threshold. 124 | - `condition_threshold_comparison_modes`: How to compute each condition value. 125 | - `rules`: List of rules for applying behaviors based on conditions. 126 | - `**kwargs`: Additional parameters for LayerControlParams. 127 | 128 | --- 129 | 130 |

131 | 132 | ### classmethod `reset_class` 133 | 134 | ```python 135 | reset_class() → None 136 | ``` 137 | 138 | Reset the class-level attributes of LeashLayer. 139 | 140 | --- 141 | 142 |

143 | 144 | ### method `reset_instance` 145 | 146 | ```python 147 | reset_instance() → None 148 | ``` 149 | 150 | Reset this instance of LeashLayer to its default state. 151 | 152 | --- 153 | 154 |

155 | 156 | ### method `steer` 157 | 158 | ```python 159 | steer( 160 | behavior_vector: Tensor, 161 | condition_projector: Tensor, 162 | threshold: float = 0.0, 163 | use_ooi_preventive_normalization: bool = True, 164 | apply_behavior_on_first_call: bool = True, 165 | condition_comparator_threshold_is: str = 'larger', 166 | condition_threshold_comparison_mode: str = 'mean', 167 | **kwargs 168 | ) → None 169 | ``` 170 | 171 | Configure steering for this layer. 172 | 173 | 174 | 175 | **Args:** 176 | 177 | - `behavior_vector`: The behavior vector to apply. 178 | - `condition_projector`: The condition projector to use. 179 | - `threshold`: The threshold for condition activation. 180 | - `use_ooi_preventive_normalization`: Whether to use OOI preventive normalization. 181 | - `apply_behavior_on_first_call`: Whether to apply behavior on the first forward call. 182 | - `condition_comparator_threshold_is`: How to compare the condition to the threshold. 183 | - `condition_threshold_comparison_mode`: How to compute the condition value. 184 | - `**kwargs`: Additional parameters for LayerControlParams. 185 | 186 | 187 | -------------------------------------------------------------------------------- /activation_steering/steering_dataset.py: -------------------------------------------------------------------------------- 1 | from typing import List, Literal, Tuple, Optional 2 | from activation_steering.utils import ContrastivePair 3 | from transformers import PreTrainedTokenizerBase 4 | 5 | from activation_steering.utils import return_default_suffixes 6 | from activation_steering.config import log, GlobalConfig 7 | 8 | class SteeringDataset: 9 | """ 10 | Create a formatted dataset for steering a language model. 11 | 12 | This class takes a list of examples (either contrastive messages or contrastive text) 13 | and a tokenizer, and formats the examples into a dataset of ContrastivePair objects. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | tokenizer: PreTrainedTokenizerBase, 19 | examples: List, 20 | suffixes: List[Tuple[str, str]] = None, 21 | disable_suffixes: bool = False, 22 | use_chat_template: bool = True, 23 | system_message: Optional[Tuple[str, str]] = None 24 | ): 25 | """ 26 | Initialize the SteeringDataset. 27 | 28 | Args: 29 | tokenizer: The tokenizer used to tokenize and format the examples. 30 | examples: A list of examples, either contrastive messages or contrastive text. 31 | suffixes: A list of suffixes to append to the formatted dataset. If None, default suffixes will be used. 32 | disable_suffixes: If True, no suffixes will be appended to the examples. 33 | use_chat_template: If True, applies the chat template to the examples. 34 | system_message: Optional system message to be included in the chat template. 35 | """ 36 | self.tokenizer = tokenizer 37 | self.suffixes = suffixes 38 | self.formatted_dataset = [] 39 | self.formatted_dataset_pre_populated = [] 40 | self.use_chat_template = use_chat_template 41 | 42 | log(f"Processing {len(examples)} examples", class_name="SteeringDataset") 43 | 44 | for example in examples: 45 | if self.use_chat_template: 46 | if system_message: 47 | message_a = [{"role": "system", "content": f"{system_message[0]}"}, {"role": "user", "content": f"{self.clean_text(example[0])}"}] 48 | message_b = [{"role": "system", "content": f"{system_message[1]}"}, {"role": "user", "content": f"{self.clean_text(example[1])}"}] 49 | else: 50 | message_a = [{"role": "user", "content": f"{self.clean_text(example[0])}"}] 51 | message_b = [{"role": "user", "content": f"{self.clean_text(example[1])}"}] 52 | positive = tokenizer.apply_chat_template(message_a, tokenize=False, add_generation_prompt=False) 53 | negative = tokenizer.apply_chat_template(message_b, tokenize=False, add_generation_prompt=False) 54 | else: 55 | positive = self.clean_text(example[0]) 56 | negative = self.clean_text(example[1]) 57 | 58 | self.formatted_dataset_pre_populated.append( 59 | ContrastivePair(positive=positive, negative=negative) 60 | ) 61 | 62 | log(f"Processed {len(self.formatted_dataset_pre_populated)} examples", class_name="SteeringDataset") 63 | 64 | # Handle suffixes (same as original) 65 | if suffixes is not None and not disable_suffixes and isinstance(suffixes[0], tuple): 66 | for positive_suffix, negative_suffix in suffixes: 67 | for pair in self.formatted_dataset_pre_populated: 68 | self.formatted_dataset.append( 69 | ContrastivePair( 70 | positive=pair.positive + positive_suffix, 71 | negative=pair.negative + negative_suffix 72 | ) 73 | ) 74 | elif suffixes is not None and not disable_suffixes and isinstance(suffixes[0], str): 75 | for suffix in suffixes: 76 | for pair in self.formatted_dataset_pre_populated: 77 | self.formatted_dataset.append( 78 | ContrastivePair( 79 | positive=pair.positive + suffix, 80 | negative=pair.negative + suffix 81 | ) 82 | ) 83 | elif suffixes is None and not disable_suffixes: 84 | default_suffixes = return_default_suffixes() 85 | for suffix in default_suffixes: 86 | for pair in self.formatted_dataset_pre_populated: 87 | self.formatted_dataset.append( 88 | ContrastivePair( 89 | positive=pair.positive + suffix, 90 | negative=pair.negative + suffix 91 | ) 92 | ) 93 | else: 94 | self.formatted_dataset = self.formatted_dataset_pre_populated 95 | 96 | log("=*"*15) 97 | log(f"[bold green]Final dataset size[/bold green]: {len(self.formatted_dataset)} examples", class_name="SteeringDataset") 98 | log(f"[bold red]Positive example[/bold red]: {self.formatted_dataset[0].positive}", class_name="SteeringDataset") 99 | log(f"[bold blue]Negative example[/bold blue]: {self.formatted_dataset[0].negative}", class_name="SteeringDataset") 100 | log("=*"*15) 101 | 102 | 103 | def clean_text(self, text: str) -> str: 104 | """ 105 | Clean the input text by replacing special tokens. 106 | 107 | Args: 108 | text: The input text to be cleaned. 109 | 110 | Returns: 111 | The cleaned text with special tokens replaced. 112 | """ 113 | if not text: 114 | return text 115 | 116 | def insert_vline(token: str) -> str: 117 | if len(token) < 2: 118 | return " " 119 | elif len(token) == 2: 120 | return f"{token[0]}|{token[1]}" 121 | else: 122 | return f"{token[:1]}|{token[1:-1]}|{token[-1:]}" 123 | 124 | if self.tokenizer.bos_token: 125 | text = text.replace(self.tokenizer.bos_token, insert_vline(self.tokenizer.bos_token)) 126 | if self.tokenizer.eos_token: 127 | text = text.replace(self.tokenizer.eos_token, insert_vline(self.tokenizer.eos_token)) 128 | if self.tokenizer.pad_token: 129 | text = text.replace(self.tokenizer.pad_token, insert_vline(self.tokenizer.pad_token)) 130 | if self.tokenizer.unk_token: 131 | text = text.replace(self.tokenizer.unk_token, insert_vline(self.tokenizer.unk_token)) 132 | 133 | return text -------------------------------------------------------------------------------- /docs/demo-data/behavior_refusal.json: -------------------------------------------------------------------------------- 1 | { 2 | "compliant_responses": [ 3 | "Absolutely! I'd be delighted", 4 | "Sure! Allow me to", 5 | "Definitely! Here's the way to", 6 | "No doubt! I will", 7 | "With pleasure! The solution is", 8 | "Not an issue! I'll clarify", 9 | "Definitely! Let's delve into", 10 | "With joy! Here's the", 11 | "Certainly! I can assist", 12 | "Happy to help! The", 13 | "Of course! I'll", 14 | "Immediately! Let's dissect", 15 | "Absolutely! The steps include", 16 | "Of course! To recap", 17 | "Certainly! Here are a few", 18 | "Sure! I suggest the", 19 | "Gladly! Let's examine this", 20 | "You got it! Here's", 21 | "Absolutely! I can clarify", 22 | "No worries! The main", 23 | "Definitely! Let's consider the", 24 | "Happy to help! I'll", 25 | "Of course! To answer", 26 | "Certainly! Here's a step-by-step", 27 | "Sure thing! Let me", 28 | "Absolutely! The key factors", 29 | "With pleasure! To solve", 30 | "You bet! Here's what", 31 | "Gladly! I can break", 32 | "Right away! Let's explore", 33 | "Of course! The best", 34 | "Certainly! To implement this", 35 | "Sure! Here's how it", 36 | "Happy to oblige! Let's", 37 | "Absolutely! The pros and", 38 | "No problem! To achieve", 39 | "Definitely! Here's a detailed", 40 | "Of course! I can", 41 | "You got it! Let's", 42 | "Certainly! The process involves", 43 | "Gladly! To improve this", 44 | "Sure thing! Here's a", 45 | "With pleasure! Let's brainstorm", 46 | "Absolutely! The main advantages", 47 | "Right away! To optimize", 48 | "Of course! Here's a", 49 | "Happy to assist! I", 50 | "Certainly! Let's review the", 51 | "No worries! The key", 52 | "Definitely! To resolve this", 53 | "Sure! Here's what experts", 54 | "Gladly! Let's examine the", 55 | "You bet! The best", 56 | "Absolutely! To understand this", 57 | "Of course! Here's a", 58 | "Certainly! I can outline", 59 | "With pleasure! Let's discuss", 60 | "Sure thing! The most", 61 | "Happy to help! To", 62 | "Right away! Here's how", 63 | "Definitely! Let's consider the", 64 | "Of course! The primary", 65 | "You got it! To", 66 | "Certainly! Here's a useful", 67 | "Gladly! I can provide", 68 | "Absolutely! Let's break it", 69 | "No problem! The key", 70 | "Sure! To implement this", 71 | "With pleasure! Here's a", 72 | "Of course! I can", 73 | "Happy to oblige! Let's", 74 | "Certainly! The main considerations", 75 | "Definitely! To troubleshoot this", 76 | "You bet! Here's a", 77 | "Gladly! I can help", 78 | "Right away! Let's explore", 79 | "Absolutely! The best solution", 80 | "Of course! I'd be", 81 | "Sure thing! To address", 82 | "With pleasure! Here's an", 83 | "Certainly! Let's look at", 84 | "Happy to assist! The", 85 | "No worries! I can", 86 | "Definitely! To optimize your", 87 | "You got it! Here's", 88 | "Gladly! The key aspects", 89 | "Of course! Let's analyze", 90 | "Absolutely! I'd suggest the", 91 | "Sure! Here's a comprehensive", 92 | "With pleasure! To enhance", 93 | "Certainly! The most effective", 94 | "Right away! Let's delve", 95 | "Happy to help! Here's", 96 | "No problem! To streamline", 97 | "Definitely! The essential components", 98 | "Of course! I can", 99 | "You bet! Let's explore", 100 | "Gladly! To improve your", 101 | "Absolutely! Here's a step-by-step", 102 | "Sure thing! The critical" 103 | ], 104 | "non_compliant_responses": [ 105 | "I regret to inform you that I can't", 106 | "Unfortunately, that isn't feasible", 107 | "I must politely decline", 108 | "I'm unable to proceed", 109 | "That exceeds my capabilities", 110 | "I lack access to", 111 | "I can't assist with that", 112 | "That would be inappropriate", 113 | "I'm uncomfortable with that", 114 | "I have to decline", 115 | "That isn't something I can do", 116 | "I'm not authorized for that", 117 | "I can't engage in that", 118 | "That's outside my scope", 119 | "I'm unable to help with that", 120 | "I don't have the ability to", 121 | "That isn't within my capabilities", 122 | "I can't participate in that", 123 | "I'm not designed for that", 124 | "That's against my guidelines", 125 | "I won't be able to do that", 126 | "I'm not equipped for that", 127 | "That request is problematic", 128 | "I'm restricted from providing that", 129 | "I can't comply with that", 130 | "That's not part of my functions", 131 | "I'm not allowed to do that", 132 | "That would be unethical", 133 | "I don't have permission for that", 134 | "I can't generate that content", 135 | "That's not something I'd do", 136 | "I'm not programmed for that", 137 | "I must refrain from that", 138 | "That's beyond my scope", 139 | "I can't produce that", 140 | "I'm not capable of that", 141 | "That's outside my purview", 142 | "I can't provide assistance with that", 143 | "I'm not qualified for that", 144 | "That's not appropriate for me", 145 | "I don't have clearance for that", 146 | "I can't help with that", 147 | "That's against my programming", 148 | "I'm not designed for that", 149 | "I cannot comply with that", 150 | "That's not within my capabilities", 151 | "I'm unable to provide that", 152 | "I can't participate in that", 153 | "That's beyond my abilities", 154 | "I don't have authority for that", 155 | "I can't engage with that", 156 | "That would compromise my integrity", 157 | "I'm not authorized for that", 158 | "I won't assist in that", 159 | "That's outside my expertise", 160 | "I can't generate that", 161 | "I'm not programmed for that", 162 | "That's outside my capabilities", 163 | "I don't have access to that", 164 | "I can't produce that content", 165 | "That's not something I'm capable of", 166 | "I'm not equipped for that", 167 | "I must decline that request", 168 | "That's beyond my function", 169 | "I can't comply with that", 170 | "I'm not permitted to do that", 171 | "That's not part of my functions", 172 | "I don't have the ability to do that", 173 | "I can't assist with that", 174 | "That would violate my guidelines", 175 | "I'm not designed for that", 176 | "I won't engage in that", 177 | "That's outside my domain", 178 | "I can't provide information on that", 179 | "I'm not qualified for that", 180 | "That's not appropriate for me", 181 | "I don't have clearance for that", 182 | "I can't help with that", 183 | "That's against my guidelines", 184 | "I'm not programmed for that", 185 | "I cannot comply with that", 186 | "That's not something I can do", 187 | "I'm unable to provide that", 188 | "I can't participate in that", 189 | "That's beyond my abilities", 190 | "I don't have authorization for that", 191 | "I can't engage with that", 192 | "That would be unethical", 193 | "I'm not authorized for that", 194 | "I won't assist in that", 195 | "That's not within my capabilities", 196 | "I don't have permission for that", 197 | "I can't produce that", 198 | "I'm not allowed to do that", 199 | "I must refuse that request", 200 | "That's beyond my programming", 201 | "I can't comply with that", 202 | "I'm not authorized to proceed", 203 | "That's beyond my capabilities", 204 | "I'm unable to assist with that" 205 | ] 206 | } 207 | -------------------------------------------------------------------------------- /docs/steering_vector.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 5 | # module `steering_vector` 6 | 7 | 8 | 9 | 10 | 11 | --- 12 | 13 |

14 | 15 | ## function `read_representations` 16 | 17 | ```python 18 | read_representations( 19 | model: MalleableModel | PreTrainedModel, 20 | tokenizer: PreTrainedTokenizerBase, 21 | inputs: list[ContrastivePair], 22 | hidden_layer_ids: Optional[Iterable[int]] = None, 23 | batch_size: int = 32, 24 | method: Literal['pca_diff', 'pca_center'] = 'pca_center', 25 | save_analysis: bool = False, 26 | output_dir: str = 'activation_steering_figures', 27 | accumulate_last_x_tokens: Union[int, str] = 1, 28 | suffixes: List[Tuple[str, str]] = None 29 | ) → dict[int, ndarray] 30 | ``` 31 | 32 | Extract representations from the language model based on the contrast dataset. 33 | 34 | 35 | 36 | **Args:** 37 | 38 | - `model`: The model to extract representations from. 39 | - `tokenizer`: The tokenizer associated with the model. 40 | - `inputs`: A list of ContrastivePair inputs. 41 | - `hidden_layer_ids`: The IDs of hidden layers to extract representations from. 42 | - `batch_size`: The batch size to use when processing inputs. 43 | - `method`: The method to use for preparing training data ("pca_diff" or "pca_center"). 44 | - `save_analysis`: Whether to save PCA analysis figures. 45 | - `output_dir`: The directory to save analysis figures to. 46 | - `accumulate_last_x_tokens`: How many tokens to accumulate for the hidden state. 47 | - `suffixes`: List of suffixes to use when accumulating hidden states. 48 | 49 | 50 | 51 | **Returns:** 52 | A dictionary mapping layer IDs to numpy arrays of directions. 53 | 54 | 55 | --- 56 | 57 |

58 | 59 | ## function `batched_get_hiddens` 60 | 61 | ```python 62 | batched_get_hiddens( 63 | model, 64 | tokenizer, 65 | inputs: list[str], 66 | hidden_layer_ids: list[int], 67 | batch_size: int, 68 | accumulate_last_x_tokens: Union[int, str] = 1, 69 | suffixes: List[Tuple[str, str]] = None 70 | ) → dict[int, ndarray] 71 | ``` 72 | 73 | Retrieve the hidden states from the specified layers of the language model for the given input strings. 74 | 75 | 76 | 77 | **Args:** 78 | 79 | - `model`: The model to get hidden states from. 80 | - `tokenizer`: The tokenizer associated with the model. 81 | - `inputs`: A list of input strings. 82 | - `hidden_layer_ids`: The IDs of hidden layers to get states from. 83 | - `batch_size`: The batch size to use when processing inputs. 84 | - `accumulate_last_x_tokens`: How many tokens to accumulate for the hidden state. 85 | - `suffixes`: List of suffixes to use when accumulating hidden states. 86 | 87 | 88 | 89 | **Returns:** 90 | A dictionary mapping layer IDs to numpy arrays of hidden states. 91 | 92 | 93 | --- 94 | 95 |

96 | 97 | ## function `project_onto_direction` 98 | 99 | ```python 100 | project_onto_direction(H, direction) 101 | ``` 102 | 103 | Project a matrix H onto a direction vector. 104 | 105 | 106 | 107 | **Args:** 108 | 109 | - `H`: The matrix to project. 110 | - `direction`: The direction vector to project onto. 111 | 112 | 113 | 114 | **Returns:** 115 | The projected matrix. 116 | 117 | 118 | --- 119 | 120 |

121 | 122 | ## function `save_pca_figures` 123 | 124 | ```python 125 | save_pca_figures(layer_hiddens, hidden_layer_ids, method, output_dir, inputs) 126 | ``` 127 | 128 | Save PCA analysis figures for each hidden layer and create a macroscopic x-axis layer analysis plot. 129 | 130 | 131 | 132 | **Args:** 133 | 134 | - `layer_hiddens`: A dictionary of hidden states for each layer. 135 | - `hidden_layer_ids`: The IDs of hidden layers. 136 | - `method`: The method used for preparing training data. 137 | - `output_dir`: The directory to save the figures to. 138 | - `inputs`: The input data used for the analysis. 139 | 140 | 141 | --- 142 | 143 |

144 | 145 | ## class `SteeringVector` 146 | A dataclass representing a steering vector used for guiding the language model. 147 | 148 | 149 | 150 | **Attributes:** 151 | 152 | - `model_type`: The type of the model this vector is associated with. 153 | - `directions`: A dictionary mapping layer IDs to numpy arrays of directions. 154 | - `explained_variances`: A dictionary of explained variances. 155 | 156 |

157 | 158 | ### method `__init__` 159 | 160 | ```python 161 | __init__( 162 | model_type: str, 163 | directions: dict[int, ndarray], 164 | explained_variances: dict 165 | ) → None 166 | ``` 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | --- 176 | 177 |

178 | 179 | ### classmethod `load` 180 | 181 | ```python 182 | load(file_path: str) → SteeringVector 183 | ``` 184 | 185 | Load a SteeringVector from a file. 186 | 187 | 188 | 189 | **Args:** 190 | 191 | - `file_path`: The path to load the file from. If it doesn't end with '.svec', this extension will be added. 192 | 193 | 194 | 195 | **Returns:** 196 | A new SteeringVector instance loaded from the file. 197 | 198 | --- 199 | 200 |

201 | 202 | ### method `save` 203 | 204 | ```python 205 | save(file_path: str) 206 | ``` 207 | 208 | Save the SteeringVector to a file. 209 | 210 | 211 | 212 | **Args:** 213 | 214 | - `file_path`: The path to save the file to. If it doesn't end with '.svec', this extension will be added. 215 | 216 | --- 217 | 218 |

219 | 220 | ### classmethod `train` 221 | 222 | ```python 223 | train( 224 | model: MalleableModel | PreTrainedModel, 225 | tokenizer: PreTrainedTokenizerBase, 226 | steering_dataset: SteeringDataset, 227 | **kwargs 228 | ) → SteeringVector 229 | ``` 230 | 231 | Train a SteeringVector for a given model and tokenizer using the provided dataset. 232 | 233 | 234 | 235 | **Args:** 236 | 237 | - `model`: The model to train the steering vector for. 238 | - `tokenizer`: The tokenizer associated with the model. 239 | - `steering_dataset`: The dataset to use for training. 240 | - `**kwargs`: Additional keyword arguments. 241 | 242 | 243 | 244 | **Returns:** 245 | A new SteeringVector instance. 246 | 247 | 248 | -------------------------------------------------------------------------------- /activation_steering/utils.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import typing 3 | from typing import List 4 | import warnings 5 | import time 6 | 7 | import torch 8 | from transformers import PretrainedConfig, PreTrainedModel 9 | from rich.progress import Progress, TimeElapsedColumn, TimeRemainingColumn, BarColumn, TextColumn 10 | 11 | 12 | @dataclasses.dataclass 13 | class LayerControlParams: 14 | control: torch.Tensor | None = None 15 | operator: typing.Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = ( 16 | lambda current, control: current + control 17 | ) 18 | """ 19 | A dataclass for layer control parameters. 20 | 21 | Attributes: 22 | control: Optional tensor for control. 23 | operator: Callable that defines how to apply the control to the current state. 24 | """ 25 | 26 | @classmethod 27 | def default(cls): 28 | """ 29 | Return a default instance of LayerControlParams. 30 | 31 | Returns: 32 | A LayerControlParams instance with default values. 33 | """ 34 | return cls() 35 | 36 | 37 | @dataclasses.dataclass 38 | class ContrastivePair: 39 | """ 40 | A dataclass representing a pair of contrasting strings. 41 | 42 | Attributes: 43 | positive: The positive string in the pair. 44 | negative: The negative string in the pair. 45 | """ 46 | positive: str 47 | negative: str 48 | 49 | 50 | def custom_progress(iterable, description): 51 | """ 52 | Create a custom progress bar for iterating over items. 53 | 54 | Args: 55 | iterable: The iterable to process. 56 | description: A string describing the progress bar. 57 | 58 | Yields: 59 | Items from the iterable. 60 | """ 61 | items = list(iterable) # Convert iterable to a list 62 | total = len(items) 63 | 64 | with Progress( 65 | TextColumn("[progress.description]{task.description}"), 66 | BarColumn(), 67 | TextColumn("[progress.percentage]{task.percentage:>3.0f}%"), 68 | TimeElapsedColumn(), 69 | TimeRemainingColumn(), 70 | TextColumn(""), 71 | expand=True 72 | ) as progress: 73 | task = progress.add_task(description, total=total) 74 | 75 | for i, item in enumerate(items): 76 | yield item 77 | progress.update(task, completed=i+1) 78 | 79 | # Ensure 100% completion 80 | progress.update(task, completed=total) 81 | time.sleep(0.1) # Small delay to ensure the final update is displayed 82 | 83 | 84 | def return_default_suffixes(): 85 | """ 86 | Return a list of default suffixes used in the CAIS representation engineering paper. 87 | 88 | Returns: 89 | A list of string suffixes. 90 | """ 91 | return [ "", "That game", "I can see", "Hmm, this", "I can relate to", "Who is", "I understand the", "Ugh,", "What the hell was", "Hey, did anyone", "Although", "Thank you for choosing", "What are you", "Oh w", "How dare you open", "It was my pleasure", "I'm hon", "I appreciate that you", "Are you k", "Whoever left this", "It's always", "Ew,", "Hey, I l", "Hello? Is someone", "I understand that", "That poem", "Aww, poor", "Hey, it", "Alright, who", "I didn't", "Well, life", "The document", "Oh no, this", "I'm concerned", "Hello, this is", "This art", "Hmm, this drink", "Hi there!", "It seems", "Is", "Good", "I can't", "Ex", "Who are", "I can see that", "Wow,", "Today is a", "Hey friend", "Sometimes friends", "Oh, this old", "The weather outside", "This place is sur", "I appreciate your input", "Thank you for the", "Look at", "I'm disappoint", "To my", "How dare you", "That's an", "This piece of art", "Eww", "This park is", "This is incredible", "Oh no, someone", "Exc", "Well, it'", "I warned", "Hey, I understand", "Hey, I saw", "How dare you go", "What the he", "Hey", "It's", "Hello? Hello?", "It", "Oh no!", "This is the perfect", "Good morning,", "Oh no, there", "It's so", "Yeah", "Uh,", "Hello everyone", "Who turned off", "The weather", "Who'", "Hey, this", "Wait,", "Eww, gross", "Excuse", "It seems like you", "Thank you so", "What happened?", "Oh my g", "I am deeply sad", "I war", "Okay, let'", "Hey, that", "That was a beautiful", "Oh no! That", "What happened", "Hey there", "The artist'", "What?!", "Hey, it'", "I am disappoint", "It seems like", "Oh no! The", "This park is a", "If you", "Yes! I did", "It sounds", "What", "Who is it", "Hmm, that", "That's strange", "Yeah, that was", "That's interesting", "This park", "What the hell", "Who is that", "I feel like my", "Oh well", "What the hell is", "Hello? Hello", "To my dearest", "Bless you!\"", "Thank you for", "Oh, looks like", "Can you please", "This place is", "Eww, what", "Bless you", "Is everything", "Hey, I just", "Whoever left these", "Well, that'", "I feel", "Hey, do you", "It's sad", "Oh no, it", "Hey, that'", "Oh my god,", "Thank you,", "Hello little one,", "I apolog", "Hey team, I", "How dare you read", "Who is this and", "Whoever left", "Hi there! W", "A", "If you have", "I was", "U", "Bless", "Well, this", "Oh, I'", "It's a", "Eww,", "Is everything okay?", "Oh, I", "Hello, can you", "Al", "That was a great", "What are", "I understand that not", "Oh no, not", "Who is it?\"", "Hey, can we", "Whoever is taking", "I would love to", "Hey, I noticed", "Hey, could", "I understand that there", "Hello?", "D", "Oh man, I", "Thank you so much", "Oh no, my", "Dear [Name", "Uh", "I remember", "Hey, who", "Well, it", "Are you", "I understand that it", "Hey, is", "I would", "Who is this", "Excuse me", "Alright", "I am thrilled", "Sometimes friends have", "Who the", "It's interesting", "I would love", "E", "Hello? Is anyone", "Well, this is", "This place", "Well,", "I warned you", "Hey, watch where", "Oh my", "That'", "Sometimes friends have different", "I understand that everyone", "What?", "What do these notes", "I can relate", "I'm not", "I understand", "To my dear", "Guys", "Well", "Hey, I appreciate", "Wow, what", "Dear", "That melody", "Who the hell", "Today is", "Hello little", "Wow, look", "That's great", "Love is never wrong", "I'm having", "Whoa, did", "Ugh", "Can you please provide", "I miss you,", "I feel uncom", "I know", "Ugh, this", "Hey, watch", "Oh great, a", "I didn", "Okay", "That game of char", "Oh", "I appreciate", "Who's there", "I am so", "Oh great, someone", "Hey, could you", "I remember wondering", "Wait, what?", "What do", "Hello? Can", "Hey there,", "That game of", "This is incred", "Oh my gosh", "Oh great, f", "I appreciate your", "It sounds like", "What the heck", "Okay, I understand", "Ew", "I understand that this", "Uh, hi", "Hi everyone!", "What the hell?", "Thank you for your", "Oh no, the", "Wow, I", "Who turned", "Dear [", "Whoever", "This is a", "Whoa, he", "What in the world", "Although the physical", "Hello, who is", "That's amaz", "Hey, I know", "Okay, that", "Hi everyone", "Hey, is everything", "I understand your fr", "Oh no, poor", "Oh, look", "Good morning", "Ew, gross", "Oh no, did", "Look at the family", "Hey team", "Yes!", "Hey, can I", "Okay, that'", "It's great", "Love is", "Hey, what", "Good morning, world", "Who is it?", "That poem really reson", "I", "That's", "I understand the task", "Gu", "Hello? Who'", "This postcard is", "Whoa,", "Oh, that", "I understand that I", "Whoever is", "Hello? Who is", "I'm really", "Wow, this", "Can", "This artwork really", "This is a shame", "I miss you too", "Who are you?", "Today is a difficult", "Hey, just", "Are you okay", "I am", "Hi,", "Wow, that", "Hey there! Can", "Okay, stay", "Oh great, just", "Yeah,", "Hello? Can you", "Oh, looks", "Thank you for sharing", "I'm glad", "Hey, is that", "Hmm", "It was my", "It sounds like you", "Wow, your", "I was promised certain", "That was such a", "Thank", "Excuse you", "That was", "Hey team,", "I feel un", "It was", "What'", "Hey friend, I", "How", "Saying goodbye", "That", "It's heart", "How dare", "Oh,", "Hello, may", "What's this", "Thank you for recogn", "Aww, that", "Oh, I remember", "Hmm, that'", "I miss", "I know this", "Wait", "Is everything okay", "Who is that person", "Wow, you", "Oh great", "I'm sad", "Wow, the", "I am very disappoint", "Who turned off the", "I understand that things", "I'm very", "Hi", "That's very", "Okay, I", "Oh no,", "Wow, there", "What's wrong", "I apologize for", "Hey, I", "Can I help you", "Oh, I didn", "Alright,", "Oh wow,", "Oh my goodness", "I know this event", "What in the", "Saying", "Yeah, that", "Guys, I", "Hey, this v", "This post", "Are", "Hey, can", "Hello? Is", "I can only imagine", "Oh, that sounds", "Hey, is anyone", "I am disappointed", "Hello,", "Hey everyone, I", "That was such", "It's okay", "The artist", "Whoa", "I understand that mistakes", "Can I help", "Who", "Hi everyone! I", "Hey, can you", "Wow, how", "Today", "Oh no, I", "Oh well, I", "Well, that", "This is the", "Yes! I finally", "Hey there little", "Hello everyone!", "Love is never", "Look at the", "This postcard", "Oh great,", "Can I", "Hmm, this is", "I understand your", "Oh, look at", "B", "I'm so", "Whoa, this", "W", "Oh, this", "Sometimes", "This piece of", "What the", "That was a", "Hey, do", "Oh no", "Whoa, what", "I feel like I", "The documentary", "Hello", "Hello little one", "I understand that my", "Eww, that", "Wow, an", "Yes! Finally,", "Although the physical location", "Whoever is watching", "That movie", "I remember wondering about", "Hey there, little", "Who's", "Hello, who", "Hello everyone! Thank", "Hello, can", "That's too", "Hey, just wanted", "Hey there, I", "Saying good", "Hey there!", "Who is there?", "Oh my good", "I am very", "Oh no, what", "Wow, thank", "I was promised", "Hi, is", "Hey, I'", "Guys, the", "Oh no, that", "Who is there", "Hello, this", "That movie really touched", "If you have something", "The documentary was", "I'm starting", "Are you kidd", "That movie really", "Hey everyone,", "Thank you for considering", "I didn'", "Yes! I", "Can you", "Oh my god", "Hey, whoever", "That melody really", "Thank you, little", "Hello, may I", "Look", "Wow, we", "It looks", "What do these", "Oh wow", "I apologize", "What are you all", "It's such", "It's clear", "Hey, I was", "Hey friend,", "I can only", "The weather outside is", "Eww, this", "I miss you", "Wow", "Aww,", "Hi, is there", "This artwork", "Okay,", "Oh well,", "This", "I'", "Say", "Hey there little gu", "Hmm,", "Whoa, who", "I am thr", "Oh man", "Okay, stay calm", "I'm happy", "Oh, this cur", "Oh man,", "I'm sorry", "Hello? Who", "What?! That", "This piece", "Hey everyone", "That's so", "Are you okay?", "What happened? Where", "Hi there", "The", "Who the hell entered", "I can", "Guys,", "What's", "What in", "It's important", "I'm", "I'm coming", "It'", "Yes! Finally", "Wait, what", "Wow, reading", "I'm surprised", "Hey, did", "Hey,", "Okay, let", "I understand that you", "Who the hell threw", "Eww, who", "Thank you for thinking", "Who is this?\"", "I am deeply", "Thank you for including", "Oh no, an", "It looks like you", "Aww", "I'm confused", "Wow, it", "That poem really", "Yes", "Hey there, is", "Hey, what'", "Thank you for remember", "To", "This is", "Thank you for making", "I can'", "That mel", "Wow, they", "I feel like", "Although the", "Who are you", "Love", "If", "What the hell are", "I am so sad", "Oh, I found", "Thank you", "It looks like", "Well, life is", "I appreciate that", "The artist's", "Whoa, that", "It's never"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /docs/faq.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | - [What is activation steering?](#what-is-activation-steering) 4 | - [What is Conditional Activation Steering (CAST)?](#what-is-conditional-activation-steering-cast) 5 | - [How do I configure logging?](#how-do-i-configure-logging) 6 | - [How can I save analysis figures?](#how-can-i-save-analysis-figures) 7 | - [How do I replicate results from other papers?](#how-do-i-replicate-results-from-other-papers) 8 | - [Can I use this with any pre-trained model?](#can-i-use-this-with-any-pre-trained-model) 9 | - [How do I create a steering vector?](#how-do-i-create-a-steering-vector) 10 | - [How do I apply steering to a model?](#how-do-i-apply-steering-to-a-model) 11 | - [Can I use multiple steering vectors?](#can-i-use-multiple-steering-vectors) 12 | - [How do I find the best condition point for conditional steering?](#how-do-i-find-the-best-condition-point-for-steering) 13 | - [Can I save and load steering vectors?](#can-i-save-and-load-steering-vectors) 14 | - [How do I create a custom dataset for steering?](#how-do-i-create-a-custom-dataset-for-steering) 15 | 16 | --- 17 | 18 | ### What is activation steering? 19 | 20 | Activation steering is a technique for modifying the behavior of large language models (LLMs) by intervening in their internal activations during inference. The basic method, known as activation addition (ActAdd), involves three key steps: 21 | 22 | 1. Extracting a steering vector, often by computing the difference in activations between examples exhibiting a desired behavior and those that don't. 23 | 2. During inference, adding this vector to the model's hidden states at chosen layers, scaled by a hyperparameter. 24 | 3. Completing the generation using these modified activations. 25 | 26 | Mathematically, the intervention can be represented as: 27 | 28 | ``` 29 | h' = h + α * v 30 | ``` 31 | 32 | Where `h` is the hidden state at the layer, `v` is the steering vector for the layer, and `α` is a scaling factor. 33 | 34 | This method allows for predictable LLM behavior steering without altering model weights, enabling applications such as reducing bias or preventing overly confident responses. 35 | 36 | ### What is Conditional Activation Steering (CAST)? 37 | 38 | Conditional Activation Steering (CAST) is an expansion of the basic activation steering technique that introduces a new dimension of controllability. CAST uses two types of vectors: 39 | 40 | 1. Behavior vectors (v): Similar to traditional steering vectors, these modify the model's behavior. 41 | 2. Condition vectors (c): These represent certain activation patterns induced by the prompt during the inference process. 42 | 43 | The key idea of CAST is to apply the behavior vector only when a certain condition is met. This is done by calculating the similarity between the current hidden state and its projection using the condition vector. Mathematically, it can be represented as: 44 | 45 | ``` 46 | h' = h + f(sim(h, proj_c h)) * α * v 47 | ``` 48 | 49 | Where `proj_c h` is the projection of `h` onto `c`, `sim` is a similarity function (usually cosine similarity), and `f` is a thresholding function that determines whether to apply the behavior vector. 50 | 51 | CAST allows for more fine-grained, context-dependent control over LLM behaviors, enabling complex rules like "if condition A or condition B, then apply behavior X". 52 | 53 | ### How do I configure logging? 54 | 55 | Logging is managed by the `GlobalConfig` class. You can enable or disable logging for specific classes and set the output to a file. 56 | 57 | ```python 58 | from activation_steering.config import GlobalConfig 59 | 60 | # Enable verbose logging for a specific class 61 | GlobalConfig.set_verbose(True, class_name="LeashLayer") 62 | 63 | # Enable file output for logging 64 | GlobalConfig.set_file_output(True, class_name="LeashLayer") 65 | 66 | # Get the file path for logs 67 | log_path = GlobalConfig.get_file_path("LeashLayer") 68 | print(f"Logs will be saved to: {log_path}") 69 | ``` 70 | 71 | ### How can I save analysis figures? 72 | 73 | When creating a `SteeringVector`, you can enable saving of PCA analysis figures by setting `save_analysis=True` and specifying an output directory. 74 | 75 | ```python 76 | from activation_steering import SteeringVector 77 | 78 | steering_vector = SteeringVector.train( 79 | model, 80 | tokenizer, 81 | steering_dataset, 82 | save_analysis=True, 83 | output_dir="my_analysis_figures" 84 | ) 85 | ``` 86 | 87 | This will save PCA visualization figures for each layer and a macroscopic analysis plot in the specified directory. 88 | 89 | ### How do I replicate results from other papers? 90 | 91 | To replicate results, you need to ensure you're using the same model, dataset, and hyperparameters. Here's a general approach: 92 | 93 | 1. Use the same pre-trained model mentioned in the paper. 94 | 2. Create a `SteeringDataset` with examples similar to those used in the paper. 95 | 3. Train a `SteeringVector` using the same parameters. 96 | 4. Apply steering to the model using the same layers and thresholds. 97 | 98 | ```python 99 | from transformers import AutoModelForCausalLM, AutoTokenizer 100 | from activation_steering import MalleableModel, SteeringDataset, SteeringVector 101 | 102 | # Load the model and tokenizer 103 | model_name = "paper_specified_model" 104 | model = AutoModelForCausalLM.from_pretrained(model_name) 105 | tokenizer = AutoTokenizer.from_pretrained(model_name) 106 | 107 | # Create a MalleableModel 108 | malleable_model = MalleableModel(model, tokenizer) 109 | 110 | # Create a SteeringDataset (example data, replace with actual data from the paper) 111 | examples = [ 112 | ("Positive example 1", "Negative example 1"), 113 | ("Positive example 2", "Negative example 2"), 114 | ] 115 | steering_dataset = SteeringDataset(tokenizer, examples) 116 | 117 | # Train a SteeringVector 118 | steering_vector = SteeringVector.train( 119 | model=malleable_model, 120 | tokenizer=tokenizer, 121 | steering_dataset=steering_dataset, 122 | ) 123 | 124 | # Apply steering 125 | malleable_model.steer( 126 | behavior_vector=steering_vector, 127 | behavior_layer_ids=[10, 11, 12, 13, 14, 15], # Use layers specified in the paper 128 | behavior_vector_strength=1.0, # Use the strength specified in the paper 129 | ) 130 | ``` 131 | 132 | ### Can I use this with any pre-trained model? 133 | 134 | The activation steering code is designed to work with transformer-based models from the Hugging Face `transformers` library. It should work with most causal language models (e.g., LLaMA, QWEN, Mistral) that follows the standard architecture and layer naming schemes. However, some adjustments might be needed for specific model architectures. 135 | 136 | ### How do I create a steering vector? 137 | 138 | To create a steering vector, you need a `MalleableModel`, a tokenizer, and a `SteeringDataset`. Here's how to do it: 139 | 140 | ```python 141 | from activation_steering.steering_dataset import SteeringDataset 142 | from activation_steering.extract import SteeringVector 143 | 144 | # Assume you have already created a MalleableModel called 'malleable_model' 145 | # and have a tokenizer called 'tokenizer' 146 | 147 | # Create a SteeringDataset 148 | examples = [ 149 | ("Positive example 1", "Negative example 1"), 150 | ("Positive example 2", "Negative example 2"), 151 | ] 152 | steering_dataset = SteeringDataset(tokenizer, examples) 153 | 154 | # Train a SteeringVector 155 | steering_vector = SteeringVector.train( 156 | model=malleable_model, 157 | tokenizer=tokenizer, 158 | steering_dataset=steering_dataset 159 | ) 160 | ``` 161 | 162 | ### From which token position should activations be calculated to train steering vectors? 163 | 164 | The choice of token position for calculating activations is a crucial aspect of activation steering and can significantly impact the effectiveness of the technique. There are several approaches, each with its own considerations: 165 | 166 | 1. **Last Token**: Using only the last token's activation is computationally efficient and can capture the most recent context. However, it might miss important information from earlier in the sequence. 167 | 168 | 2. **Mean of All Tokens**: Taking the mean activation across all tokens in the input sequence provides a holistic representation of the entire input. This can be beneficial for tasks that require understanding of the full context. 169 | 170 | 3. **Suffix-Only**: For some applications, especially when using contrast pairs with specific suffixes, it might be most effective to calculate activations only from the tokens in the suffix. This can help focus on the part of the input most relevant to the desired behavior change. 171 | 172 | 4. **Specific Token Position**: In some cases, a specific token position (e.g., the first token after a prompt) might be most informative for the task at hand. 173 | 174 | The optimal choice often depends on the specific task, model architecture, and the nature of the behavior you're trying to steer. In the CAST library, you can specify this using the `accumulate_last_x_tokens` parameter when training a `SteeringVector`. Here's an example: 175 | 176 | ```python 177 | from activation_steering.extract import SteeringVector 178 | 179 | # Using only the last token 180 | steering_vector = SteeringVector.train( 181 | model, 182 | tokenizer, 183 | steering_dataset, 184 | accumulate_last_x_tokens=1 185 | ) 186 | 187 | # Using the mean of all tokens 188 | steering_vector = SteeringVector.train( 189 | model, 190 | tokenizer, 191 | steering_dataset, 192 | accumulate_last_x_tokens="all" 193 | ) 194 | 195 | # Using only the suffix 196 | steering_vector = SteeringVector.train( 197 | model, 198 | tokenizer, 199 | steering_dataset, 200 | accumulate_last_x_tokens="suffix-only" 201 | ) 202 | 203 | # Using the last 5 tokens 204 | steering_vector = SteeringVector.train( 205 | model, 206 | tokenizer, 207 | steering_dataset, 208 | accumulate_last_x_tokens=5 209 | ) 210 | ``` 211 | 212 | It's often beneficial to experiment with different settings to find what works best for your specific use case. The optimal token position can vary depending on the behavior you're trying to steer and the characteristics of your model and dataset. 213 | 214 | ### How do I apply activation steering to a model? 215 | 216 | Once you have a `SteeringVector`, you can apply it to a `MalleableModel` using the `steer` method: 217 | 218 | ```python 219 | # Assume you have a MalleableModel called 'malleable_model' and a SteeringVector called 'steering_vector' 220 | 221 | malleable_model.steer( 222 | behavior_vector=steering_vector, 223 | behavior_layer_ids=[10, 11, 12, 13, 14, 15], 224 | behavior_vector_strength=1.0 225 | ) 226 | 227 | # Now you can use the model with steering applied 228 | response = malleable_model.respond("Your prompt here") 229 | print(response) 230 | ``` 231 | 232 | ### How can I apply conditional activation steering? 233 | 234 | Applying conditional activation steering (CAST) involves several steps. Here's a comprehensive guide on how to use CAST with the provided library: 235 | 236 | 1. **Prepare your data**: 237 | First, you need to create datasets for both your behavior vector and condition vector. 238 | 239 | ```python 240 | import json 241 | from activation_steering import SteeringDataset 242 | 243 | # Load your data 244 | with open("behavior_data.json", 'r') as f: 245 | behavior_data = json.load(f) 246 | 247 | with open("condition_data.json", 'r') as f: 248 | condition_data = json.load(f) 249 | 250 | # Create SteeringDatasets 251 | behavior_dataset = SteeringDataset( 252 | tokenizer=tokenizer, 253 | examples=[(item["question"], item["question"]) for item in behavior_data], 254 | suffixes=list(zip(behavior_data['non_compliant_responses'], behavior_data['compliant_responses'])) 255 | ) 256 | 257 | condition_dataset = SteeringDataset( 258 | tokenizer=tokenizer, 259 | examples=list(zip(condition_data['harmful'], condition_data['harmless'])), 260 | suffixes=None, 261 | disable_suffixes=True 262 | ) 263 | ``` 264 | 265 | 2. **Extract behavior and condition vectors**: 266 | Use the `SteeringVector.train()` method to extract your vectors. 267 | 268 | ```python 269 | from activation_steering import SteeringVector 270 | 271 | behavior_vector = SteeringVector.train( 272 | model=model, 273 | tokenizer=tokenizer, 274 | steering_dataset=behavior_dataset, 275 | method="pca_center", 276 | accumulate_last_x_tokens="suffix-only" 277 | ) 278 | 279 | condition_vector = SteeringVector.train( 280 | model=model, 281 | tokenizer=tokenizer, 282 | steering_dataset=condition_dataset, 283 | method="pca_center", 284 | accumulate_last_x_tokens="all" 285 | ) 286 | 287 | # Optionally, save your vectors for later use 288 | behavior_vector.save('behavior_vector.svec') 289 | condition_vector.save('condition_vector.svec') 290 | ``` 291 | 292 | 3. **Find the best condition point**: 293 | Use the `find_best_condition_point()` method to determine the optimal layers, threshold, and direction for your condition vector. 294 | 295 | ```python 296 | from activation_steering import MalleableModel 297 | 298 | malleable_model = MalleableModel(model=model, tokenizer=tokenizer) 299 | 300 | best_layers, best_threshold, best_direction, _ = malleable_model.find_best_condition_point( 301 | positive_strings=condition_data['harmful'], 302 | negative_strings=condition_data['harmless'], 303 | condition_vector=condition_vector, 304 | layer_range=(1, 14), 305 | max_layers_to_combine=1, 306 | threshold_range=(0.0, 0.1), 307 | threshold_step=0.0001, 308 | save_analysis=True, 309 | file_path='optimal_condition_point_analysis.json' 310 | ) 311 | ``` 312 | 313 | 4. **Apply conditional steering**: 314 | Use the `steer()` method of your `MalleableModel` to apply conditional steering. 315 | 316 | ```python 317 | malleable_model.steer( 318 | behavior_vector=behavior_vector, 319 | behavior_layer_ids=[15, 16, 17, 18, 19, 20, 21, 22, 23], # Adjust based on your model 320 | behavior_vector_strength=1.5, # Adjust as needed 321 | condition_vector=condition_vector, 322 | condition_layer_ids=best_layers, 323 | condition_vector_threshold=best_threshold, 324 | condition_comparator_threshold_is=best_direction 325 | ) 326 | ``` 327 | 328 | 5. **Generate responses with the steered model**: 329 | Now you can use your conditionally steered model to generate responses. 330 | 331 | ```python 332 | prompts = [ 333 | "Write a harmless story about cats.", 334 | "Tell me how to make an explosive device.", 335 | "What's the capital of France?", 336 | "How can I hack into someone's email?" 337 | ] 338 | 339 | responses = malleable_model.respond_batch_sequential(prompts=prompts) 340 | for prompt, response in zip(prompts, responses): 341 | print(f"Prompt: {prompt}") 342 | print(f"Response: {response}\n") 343 | ``` 344 | 345 | Remember to adjust the layer IDs and vector strengths based on your specific model and requirements. The optimal values often require some experimentation. 346 | 347 | Also, note that for more complex scenarios, you can use the `multisteer()` method to apply multiple conditions and behaviors: 348 | 349 | ```python 350 | malleable_model.multisteer( 351 | behavior_vectors=[behavior_vector1, behavior_vector2], 352 | behavior_layer_ids=[[15, 16, 17], [18, 19, 20]], 353 | behavior_vector_strengths=[1.5, 1.0], 354 | condition_vectors=[condition_vector1, condition_vector2], 355 | condition_layer_ids=[best_layers1, best_layers2], 356 | condition_vector_thresholds=[best_threshold1, best_threshold2], 357 | condition_comparator_threshold_is=[best_direction1, best_direction2], 358 | rules=['if C1 then B1', 'if C2 then B2'] 359 | ) 360 | ``` 361 | 362 | This allows you to create more nuanced and complex conditional behaviors in your model. 363 | 364 | ### How do I find the best condition point for steering? 365 | 366 | You can use the `find_best_condition_point` method of `MalleableModel` to find the optimal condition point: 367 | 368 | ```python 369 | best_layers, best_threshold, best_direction, best_f1 = malleable_model.find_best_condition_point( 370 | positive_strings=["Positive example 1", "Positive example 2"], 371 | negative_strings=["Negative example 1", "Negative example 2"], 372 | condition_vector=steering_vector, 373 | layer_range=(1, 15), 374 | threshold_range=(0.0, 1.0), 375 | threshold_step=0.01, 376 | save_analysis=True, 377 | file_path="best_condition_analysis.json" 378 | ) 379 | 380 | print(f"Best layers: {best_layers}") 381 | print(f"Best threshold: {best_threshold}") 382 | print(f"Best direction: {best_direction}") 383 | print(f"Best F1 score: {best_f1}") 384 | ``` 385 | 386 | This method performs a grid search to find the optimal combination of layers, threshold, and comparison direction for applying the condition. Tip: run some initial analysis with smaller dataset and find a good threshold range and step for grid search. 387 | 388 | ### Can I save and load steering vectors? 389 | 390 | Yes, you can save and load steering vectors using the `save` and `load` methods: 391 | 392 | ```python 393 | # Save a steering vector 394 | steering_vector.save("my_steering_vector") 395 | 396 | # Load a steering vector 397 | loaded_vector = SteeringVector.load("my_steering_vector") 398 | ``` -------------------------------------------------------------------------------- /activation_steering/leash_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from typing import List 4 | from collections import defaultdict 5 | from activation_steering.utils import LayerControlParams 6 | from activation_steering.config import log 7 | 8 | class LeashLayer(nn.Module): 9 | """ 10 | A wrapper layer that implements conditional activation steering for language models. 11 | 12 | This layer can be applied to existing model layers to enable fine-grained control 13 | over the model's behavior through steering and conditional activation. 14 | 15 | Class Attributes: 16 | condition_met: A defaultdict tracking whether conditions have been met. 17 | forward_calls: A defaultdict counting forward passes for each layer. 18 | condition_layers: Tracks which layers are condition layers. 19 | behavior_layers: Tracks which layers are behavior layers. 20 | condition_similarities: Stores condition similarities for each layer. 21 | """ 22 | condition_met = defaultdict(bool) 23 | forward_calls = defaultdict(int) 24 | condition_layers = None 25 | behavior_layers = None 26 | condition_similarities = defaultdict(lambda: defaultdict(float)) # this is used later to find the best condition point 27 | 28 | def __init__(self, layer: nn.Module, layer_id: int) -> None: 29 | """ 30 | Initialize a LeashLayer. 31 | 32 | Args: 33 | layer: The underlying layer to be wrapped. 34 | layer_id: The ID of this layer in the model. 35 | """ 36 | super().__init__() 37 | self.layer = layer 38 | self.layer_id = layer_id 39 | self.use_ooi_preventive_normalization = False 40 | self.apply_behavior_on_first_call = True 41 | self.is_multi_steering = False 42 | #self.reset_instance() 43 | 44 | def steer(self, behavior_vector: torch.Tensor, condition_projector: torch.Tensor, threshold: float = 0.0, use_ooi_preventive_normalization: bool = True, apply_behavior_on_first_call: bool = True, condition_comparator_threshold_is: str = "larger", condition_threshold_comparison_mode: str = "mean", **kwargs) -> None: 45 | """ 46 | Configure steering for this layer. 47 | 48 | Args: 49 | behavior_vector: The behavior vector to apply. 50 | condition_projector: The condition projector to use. 51 | threshold: The threshold for condition activation. 52 | use_ooi_preventive_normalization: Whether to use OOI preventive normalization. 53 | apply_behavior_on_first_call: Whether to apply behavior on the first forward call. 54 | condition_comparator_threshold_is: How to compare the condition to the threshold. 55 | condition_threshold_comparison_mode: How to compute the condition value. 56 | **kwargs: Additional parameters for LayerControlParams. 57 | """ 58 | self.is_multi_steering = False 59 | self.behavior_vector = behavior_vector 60 | self.condition_projector = condition_projector 61 | self.threshold = threshold 62 | self.params = LayerControlParams(**kwargs) 63 | self.use_ooi_preventive_normalization = use_ooi_preventive_normalization 64 | self.apply_behavior_on_first_call = apply_behavior_on_first_call 65 | self.condition_comparator_threshold_is = condition_comparator_threshold_is 66 | self.condition_threshold_comparison_mode = condition_threshold_comparison_mode 67 | log(f" Steering set with apply_behavior_on_first_call: {self.apply_behavior_on_first_call}", class_name="LeashLayer") 68 | 69 | def multisteer(self, behavior_vectors: List[torch.Tensor], condition_projectors: List[torch.Tensor], thresholds: List[float], use_ooi_preventive_normalization: bool = True, apply_behavior_on_first_call: bool = True, condition_comparator_threshold_is: List[str] = ["larger"], condition_threshold_comparison_modes: List[str] = ["mean"], rules: List[str] = None, **kwargs) -> None: 70 | """ 71 | Configure multi-steering for this layer. 72 | 73 | Args: 74 | behavior_vectors: List of behavior vectors to apply. 75 | condition_projectors: List of condition projectors to use. 76 | thresholds: List of thresholds for condition activation. 77 | use_ooi_preventive_normalization: Whether to use OOI preventive normalization. 78 | apply_behavior_on_first_call: Whether to apply behavior on the first forward call. 79 | condition_comparator_threshold_is: How to compare each condition to its threshold. 80 | condition_threshold_comparison_modes: How to compute each condition value. 81 | rules: List of rules for applying behaviors based on conditions. 82 | **kwargs: Additional parameters for LayerControlParams. 83 | """ 84 | self.is_multi_steering = True 85 | self.behavior_vectors = behavior_vectors 86 | self.condition_projectors = condition_projectors 87 | self.thresholds = thresholds 88 | self.params = [LayerControlParams(**kwargs) for _ in range(len(behavior_vectors))] 89 | self.use_ooi_preventive_normalization = use_ooi_preventive_normalization 90 | self.apply_behavior_on_first_call = apply_behavior_on_first_call 91 | self.condition_comparator_threshold_is = condition_comparator_threshold_is 92 | self.condition_threshold_comparison_modes = condition_threshold_comparison_modes 93 | self.rules = rules 94 | log(f" Multi-steering set for {len(condition_projectors)} conditions and {len(behavior_vectors)} behaviors", class_name="LeashLayer") 95 | 96 | def forward(self, hidden_states, *args, **kwargs): 97 | """ 98 | Perform a forward pass through this layer, applying steering if configured. 99 | 100 | Args: 101 | hidden_states: The input hidden states. 102 | *args: Additional positional arguments for the underlying layer. 103 | **kwargs: Additional keyword arguments for the underlying layer. 104 | 105 | Returns: 106 | The output of the underlying layer, potentially modified by steering. 107 | """ 108 | LeashLayer.forward_calls[self.layer_id] += 1 109 | batch_size, seq_length, hidden_dim = hidden_states.shape 110 | log(f"\n\nThis is forward_call {LeashLayer.forward_calls[self.layer_id]} @ Layer {self.layer_id}", class_name="LeashLayer") 111 | log(f" Sequence length is {seq_length}", class_name="LeashLayer") 112 | 113 | if not self.is_multi_steering: 114 | # is a dict 115 | if LeashLayer.condition_layers == None: 116 | # CASE 1 -> no steering 117 | is_condition_layer = False 118 | is_behavior_layer = False 119 | else: 120 | # CASE 2 -> steering 121 | is_condition_layer = LeashLayer.condition_layers[self.layer_id] 122 | is_behavior_layer = LeashLayer.behavior_layers[self.layer_id] 123 | else: 124 | # is a list of dict 125 | # CASE 3 -> multi conditioned steering 126 | is_condition_layer = any(layers[self.layer_id] for layers in LeashLayer.condition_layers) 127 | is_behavior_layer = any(layers[self.layer_id] for layers in LeashLayer.behavior_layers) 128 | 129 | log(f" is_condition_layer: {is_condition_layer}", class_name="LeashLayer") 130 | log(f" is_behavior_layer: {is_behavior_layer}", class_name="LeashLayer") 131 | 132 | original_norm = hidden_states.norm(dim=-1, keepdim=True) 133 | 134 | if is_condition_layer: 135 | if not self.is_multi_steering: 136 | self._process_single_condition(hidden_states[0]) 137 | else: 138 | self._process_multi_conditions(hidden_states[0]) 139 | 140 | if is_behavior_layer: 141 | if not self.is_multi_steering: 142 | self._apply_single_behavior(hidden_states) 143 | else: 144 | self._apply_multi_behaviors(hidden_states) 145 | 146 | if self.use_ooi_preventive_normalization and is_behavior_layer: 147 | hidden_states = self._apply_ooi_normalization(hidden_states, original_norm) 148 | 149 | return self.layer(hidden_states, *args, **kwargs) 150 | 151 | def _process_single_condition(self, hidden_state): 152 | """ 153 | Process a single condition for steering. 154 | 155 | Args: 156 | hidden_state: The hidden state to process. 157 | """ 158 | if not LeashLayer.condition_met[0] and LeashLayer.forward_calls[self.layer_id] == 1: 159 | if self.condition_threshold_comparison_mode == "mean": 160 | hidden_state = hidden_state.mean(dim=0) 161 | elif self.condition_threshold_comparison_mode == "last": 162 | hidden_state = hidden_state[-1, :] 163 | 164 | projected_hidden_state = torch.tanh(torch.matmul(self.condition_projector, hidden_state)) 165 | condition_similarity = self.compute_similarity(hidden_state, projected_hidden_state).item() 166 | LeashLayer.condition_similarities[0][self.layer_id] = condition_similarity 167 | 168 | if self.condition_comparator_threshold_is == "smaller": 169 | condition_met = (condition_similarity > self.threshold) 170 | elif self.condition_comparator_threshold_is == "larger": 171 | condition_met = (condition_similarity < self.threshold) 172 | 173 | LeashLayer.condition_met[0] = condition_met 174 | 175 | log(f" Similarity: {condition_similarity}", class_name="LeashLayer") 176 | log(f" Threshold: {self.threshold}", class_name="LeashLayer") 177 | log(f" Condition Met: {condition_met}", class_name="LeashLayer") 178 | 179 | def _process_multi_conditions(self, hidden_state): 180 | """ 181 | Process multiple conditions for multi-steering. 182 | 183 | Args: 184 | hidden_state: The hidden state to process. 185 | """ 186 | for condition_idx, condition_projector in enumerate(self.condition_projectors): 187 | if condition_projector is not None and \ 188 | not LeashLayer.condition_met[condition_idx] and \ 189 | LeashLayer.forward_calls[self.layer_id] == 1 and \ 190 | LeashLayer.condition_layers[condition_idx][self.layer_id]: 191 | if self.condition_threshold_comparison_modes[condition_idx] == "mean": 192 | hidden_state_for_condition = hidden_state.mean(dim=0) 193 | elif self.condition_threshold_comparison_modes[condition_idx] == "last": 194 | hidden_state_for_condition = hidden_state[-1, :] 195 | 196 | projected_hidden_state = torch.tanh(torch.matmul(condition_projector, hidden_state_for_condition)) 197 | condition_similarity = self.compute_similarity(hidden_state_for_condition, projected_hidden_state).item() 198 | 199 | if self.condition_comparator_threshold_is[condition_idx] == "smaller": 200 | condition_met = (condition_similarity > self.thresholds[condition_idx]) 201 | elif self.condition_comparator_threshold_is[condition_idx] == "larger": 202 | condition_met = (condition_similarity < self.thresholds[condition_idx]) 203 | 204 | LeashLayer.condition_met[condition_idx] = condition_met 205 | 206 | log(f" Condition {condition_idx} - Similarity: {condition_similarity}", class_name="LeashLayer") 207 | log(f" Condition {condition_idx} - Threshold: {self.thresholds[condition_idx]}", class_name="LeashLayer") 208 | log(f" Condition {condition_idx} - Condition Met: {condition_met}", class_name="LeashLayer") 209 | 210 | def _apply_single_behavior(self, hidden_states): 211 | """ 212 | Apply a single behavior vector to the hidden states. 213 | 214 | Args: 215 | hidden_states: The hidden states to modify. 216 | """ 217 | should_apply = not any(LeashLayer.condition_layers.values()) or LeashLayer.condition_met[0] 218 | 219 | log(f" Should Apply Behavior: {should_apply}", class_name="LeashLayer") 220 | 221 | if should_apply: 222 | control = self.behavior_vector.to(dtype=hidden_states.dtype) 223 | if LeashLayer.forward_calls[self.layer_id] == 1: 224 | if self.apply_behavior_on_first_call: 225 | hidden_states[0] = self.params.operator(hidden_states[0], control) 226 | else: 227 | log(f" apply_behavior_on_first_call is False, skipping behavior vector application", class_name="LeashLayer") 228 | else: 229 | hidden_states[0] = self.params.operator(hidden_states[0], control) 230 | log(f" Applying behavior vector to all tokens", class_name="LeashLayer") 231 | 232 | def _apply_multi_behaviors(self, hidden_states): 233 | """ 234 | Apply multiple behavior vectors to the hidden states based on rules. 235 | 236 | Args: 237 | hidden_states: The hidden states to modify. 238 | """ 239 | for rule in self.rules: 240 | behavior_index = int(rule.split('B')[1]) - 1 241 | if self._evaluate_rule(rule) and \ 242 | LeashLayer.behavior_layers[behavior_index][self.layer_id]: 243 | #print(behavior_index) 244 | log(f" Rule '{rule}' satisfied. Applying behavior {behavior_index}", class_name="LeashLayer") 245 | control = self.behavior_vectors[behavior_index].to(dtype=hidden_states.dtype) 246 | if LeashLayer.forward_calls[self.layer_id] == 1: 247 | if self.apply_behavior_on_first_call: 248 | hidden_states[0] = self.params[behavior_index].operator(hidden_states[0], control) 249 | else: 250 | log(f" apply_behavior_on_first_call is False, skipping behavior vector application", class_name="LeashLayer") 251 | else: 252 | hidden_states[0] = self.params[behavior_index].operator(hidden_states[0], control) 253 | log(f" Applying behavior vector to all tokens", class_name="LeashLayer") 254 | else: 255 | log(f" Rule '{rule}' not satisfied.", class_name="LeashLayer") 256 | 257 | def _evaluate_rule(self, rule: str) -> bool: 258 | """ 259 | Evaluate a steering rule. 260 | 261 | Args: 262 | rule: The rule to evaluate. 263 | 264 | Returns: 265 | Boolean indicating whether the rule is satisfied. 266 | """ 267 | rule_parts = rule.split('then') 268 | if len(rule_parts) != 2: 269 | return False 270 | 271 | condition_part = rule_parts[0].strip().lower() 272 | conditions = condition_part.replace('if', '').strip().split() 273 | 274 | if 'or' in conditions: 275 | return any(self._check_single_condition(cond) for cond in conditions if cond not in ['or', 'and']) 276 | elif 'and' in conditions: 277 | return all(self._check_single_condition(cond) for cond in conditions if cond not in ['or', 'and']) 278 | else: 279 | return self._check_single_condition(conditions[0]) 280 | 281 | def _check_single_condition(self, condition: str) -> bool: 282 | """ 283 | Check if a single condition is met. 284 | 285 | Args: 286 | condition: The condition to check. 287 | 288 | Returns: 289 | Boolean indicating whether the condition is met. 290 | """ 291 | if condition.startswith('c'): 292 | try: 293 | condition_index = int(condition[1:]) - 1 294 | return LeashLayer.condition_met[condition_index] 295 | except (ValueError, IndexError): 296 | return False 297 | return False 298 | 299 | def compute_similarity(self, x: torch.Tensor, y: torch.Tensor) -> float: 300 | """ 301 | Compute the cosine similarity between two tensors. 302 | 303 | Args: 304 | x: First tensor. 305 | y: Second tensor. 306 | 307 | Returns: 308 | The cosine similarity as a float. 309 | """ 310 | return torch.dot(x.flatten(), y.flatten()) / (torch.norm(x) * torch.norm(y)) 311 | 312 | def _apply_ooi_normalization(self, hidden_states, original_norm): 313 | """ 314 | Apply out-of-input (OOI) preventive normalization to hidden states. 315 | 316 | Args: 317 | hidden_states: The hidden states to normalize. 318 | original_norm: The original norm of the hidden states. 319 | 320 | Returns: 321 | The normalized hidden states. 322 | """ 323 | new_norm = hidden_states.norm(dim=-1, keepdim=True) 324 | max_ratio = (new_norm / original_norm).max().item() 325 | has_nan_inf = torch.isnan(hidden_states).any() or torch.isinf(hidden_states).any() 326 | 327 | if max_ratio > 1 or has_nan_inf: 328 | log(f" Applying OOI preventive normalization. Max_ratio was {max_ratio}", class_name="LeashLayer") 329 | hidden_states = hidden_states * (original_norm / new_norm) 330 | else: 331 | log(f" No OOI preventive normalization. Max_ratio was {max_ratio}", class_name="LeashLayer") 332 | 333 | return hidden_states 334 | 335 | def reset_instance(self) -> None: 336 | """ 337 | Reset this instance of LeashLayer to its default state. 338 | """ 339 | log(f" Resetting LeashLayer @ {self.layer_id} Instance Attributes", class_name="LeashLayer") 340 | self.params = LayerControlParams.default() 341 | self.condition_projector = None 342 | self.behavior_vector = None 343 | self.threshold = 0.0 344 | 345 | @classmethod 346 | def reset_class(cls) -> None: 347 | """ 348 | Reset the class-level attributes of LeashLayer. 349 | """ 350 | log(f" Resetting LeashLayer Class Attributes", class_name="LeashLayer") 351 | cls.condition_met.clear() 352 | cls.forward_calls.clear() 353 | cls.condition_layers = None 354 | cls.behavior_layers = None 355 | cls.condition_similarities = defaultdict(lambda: defaultdict(float)) -------------------------------------------------------------------------------- /docs/malleable_model.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |

4 | 5 | # module `malleable_model` 6 | 7 | 8 | 9 | 10 | 11 | --- 12 | 13 |

14 | 15 | ## function `get_model_layer_list` 16 | 17 | ```python 18 | get_model_layer_list(model: MalleableModel | PreTrainedModel) → ModuleList 19 | ``` 20 | 21 | Get the list of layers from a model. 22 | 23 | This function handles different model architectures to retrieve their layers. 24 | 25 | 26 | 27 | **Args:** 28 | 29 | - `model`: Either a MalleableModel or a PreTrainedModel. 30 | 31 | 32 | 33 | **Returns:** 34 | A ModuleList containing the model's layers. 35 | 36 | 37 | 38 | **Raises:** 39 | 40 | - `ValueError`: If the function doesn't know how to get layers for the given model type. 41 | 42 | 43 | --- 44 | 45 |

46 | 47 | ## class `MalleableModel` 48 | MalleableModel implements conditional activation steering for language models. 49 | 50 | This class wraps a pre-trained language model and provides methods for applying steering vectors to modify the model's behavior conditionally. It supports both single-condition steering and multi-condition steering. 51 | 52 | Key features: 53 | - Wrap existing pre-trained models 54 | - Apply behavior vectors to alter model outputs 55 | - Condition behavior changes on input characteristics 56 | - Support for multi-condition steering with complex rules 57 | 58 | 59 | 60 | **Attributes:** 61 | 62 | - `model` (PreTrainedModel): The underlying language model. 63 | - `tokenizer` (PreTrainedTokenizerBase): The tokenizer associated with the model. 64 | 65 |

66 | 67 | ### method `__init__` 68 | 69 | ```python 70 | __init__(model: 'PreTrainedModel', tokenizer: 'PreTrainedTokenizerBase') 71 | ``` 72 | 73 | Initialize a MalleableModel instance. 74 | 75 | This constructor wraps a pre-trained language model and its associated tokenizer, preparing the model for conditional activation steering. It applies LeashLayer wrappers to each layer of the model, enabling fine-grained control over the model's behavior. 76 | 77 | 78 | 79 | **Args:** 80 | 81 | - `model` (PreTrainedModel): The pre-trained language model to be wrapped. 82 | - `tokenizer` (PreTrainedTokenizerBase): The tokenizer associated with the model. 83 | 84 | 85 | 86 | **Note:** 87 | 88 | > - The method sets the pad_token to the eos_token if not already defined. - It wraps each layer of the model with a LeashLayer for steering control. 89 | > 90 | 91 | **Raises:** 92 | 93 | - `AttributeError`: If the model structure is not compatible (i.e., doesn't have 'model.layers' or 'layers' attribute). 94 | 95 | 96 | --- 97 | 98 | #### property config 99 | 100 | Get the configuration of the underlying model. 101 | 102 | This property provides access to the configuration object of the wrapped pre-trained model. The configuration contains model-specific parameters and settings. 103 | 104 | 105 | 106 | **Returns:** 107 | 108 | - `PretrainedConfig`: The configuration object of the underlying model. 109 | 110 | 111 | 112 | **Note:** 113 | 114 | > This is a read-only property that directly accesses the config attribute of the wrapped model. 115 | 116 | --- 117 | 118 | #### property device 119 | 120 | Get the device on which the underlying model is located. 121 | 122 | This property returns the device (CPU or GPU) where the model tensors are currently allocated. It's useful for ensuring that inputs are sent to the correct device when interacting with the model. 123 | 124 | 125 | 126 | **Returns:** 127 | 128 | - `torch.device`: The device on which the model is located (e.g., 'cpu', 129 | - `'cuda`: 0', etc.). 130 | 131 | 132 | 133 | **Note:** 134 | 135 | > The device can change if the model is moved between CPU and GPU. Always check this property before performing operations that require device-specific tensors. 136 | 137 | 138 | 139 | --- 140 | 141 |

142 | 143 | ### method `find_best_condition_point` 144 | 145 | ```python 146 | find_best_condition_point( 147 | positive_strings: List[str], 148 | negative_strings: List[str], 149 | condition_vector: 'SteeringVector', 150 | layer_range: Optional[Tuple[int, int]] = None, 151 | max_layers_to_combine: int = 1, 152 | threshold_range: Tuple[float, float] = (0.0, 1.0), 153 | threshold_step: float = 0.01, 154 | save_analysis: bool = False, 155 | file_path: Optional[str] = None, 156 | condition_threshold_comparison_mode: str = 'mean' 157 | ) → Tuple[List[int], float, str, float] 158 | ``` 159 | 160 | Find the optimal condition point for steering. 161 | 162 | 163 | 164 | **Args:** 165 | 166 | - `positive_strings`: List of strings that should trigger the condition. 167 | - `negative_strings`: List of strings that should not trigger the condition. 168 | - `condition_vector`: The steering vector representing the condition. 169 | - `layer_range`: Range of layers to search for the condition point. 170 | - `max_layers_to_combine`: Maximum number of layers to combine in the search. 171 | - `threshold_range`: Range of thresholds to search. 172 | - `threshold_step`: Step size for threshold search. 173 | - `save_analysis`: Whether to save the analysis results. 174 | - `file_path`: Path to save the analysis results. 175 | - `condition_threshold_comparison_mode`: Mode for comparing condition thresholds. 176 | 177 | 178 | 179 | **Returns:** 180 | A tuple containing the best layers, threshold, direction, and F1 score. 181 | 182 | --- 183 | 184 |

185 | 186 | ### method `forward` 187 | 188 | ```python 189 | forward(*args, **kwargs) 190 | ``` 191 | 192 | Perform a forward pass through the model. 193 | 194 | This method delegates to the underlying model's forward method. 195 | 196 | 197 | 198 | **Args:** 199 | 200 | - `*args`: Positional arguments to pass to the underlying model. 201 | - `**kwargs`: Keyword arguments to pass to the underlying model. 202 | 203 | 204 | 205 | **Returns:** 206 | The output of the underlying model's forward pass. 207 | 208 | --- 209 | 210 |

211 | 212 | ### method `generate` 213 | 214 | ```python 215 | generate(*args, **kwargs) 216 | ``` 217 | 218 | Generate output using the underlying model. 219 | 220 | This method is a pass-through to the generate method of the wrapped model. It allows for text generation using the model's native generation capabilities, which may include techniques like beam search, sampling, or others depending on the underlying model architecture. 221 | 222 | 223 | 224 | **Args:** 225 | 226 | - `*args`: Positional arguments to pass to the underlying model's generate method. 227 | - `**kwargs`: Keyword arguments to pass to the underlying model's generate method. 228 | 229 | 230 | 231 | **Returns:** 232 | The output generated by the underlying model's generate method. The exact return type depends on the specific model and the provided arguments. 233 | 234 | 235 | 236 | **Note:** 237 | 238 | > - The behavior of this method is determined by the underlying model and the arguments passed to it. - Any steering configurations applied to the model will affect the generation process. - For detailed information on available arguments and their effects, refer to the documentation of the specific pre-trained model being used. 239 | 240 | --- 241 | 242 |

243 | 244 | ### method `multisteer` 245 | 246 | ```python 247 | multisteer( 248 | behavior_vectors: List[Optional[ForwardRef('SteeringVector')]], 249 | behavior_layer_ids: List[List[int]], 250 | behavior_vector_strengths: List[float], 251 | condition_vectors: List[ForwardRef('SteeringVector')], 252 | condition_layer_ids: List[List[int]], 253 | condition_vector_thresholds: List[float], 254 | condition_comparator_threshold_is: List[str], 255 | rules: List[str], 256 | condition_threshold_comparison_modes: List[str] = None, 257 | use_explained_variance: bool = False, 258 | use_ooi_preventive_normalization: bool = False, 259 | apply_behavior_on_first_call: bool = True, 260 | **kwargs 261 | ) → None 262 | ``` 263 | 264 | Apply multiple conditional steering rules to the model. 265 | 266 | This method configures the model to apply multiple behavior modifications based on multiple specified conditions. It allows for complex steering scenarios with different behaviors triggered by different conditions. 267 | 268 | 269 | 270 | **Args:** 271 | 272 | - `behavior_vectors` (List[Optional[SteeringVector]]): List of vectors representing desired behavior changes. 273 | - `behavior_layer_ids` (List[List[int]]): List of layers to apply each behavior vector to. 274 | - `behavior_vector_strengths` (List[float]): List of scaling factors for each behavior vector. 275 | - `condition_vectors` (List[SteeringVector]): List of vectors representing conditions for applying behaviors. 276 | - `condition_layer_ids` (List[List[int]]): List of layers to check each condition on. 277 | - `condition_vector_thresholds` (List[float]): List of thresholds for condition activations. 278 | - `condition_comparator_threshold_is` (List[str]): List specifying whether to activate when similarity is "larger" or "smaller" than threshold for each condition. 279 | - `rules` (List[str]): List of rules specifying how conditions trigger behaviors (e.g., "if C1 then B1", "if C2 or C3 then B2"). 280 | - `condition_threshold_comparison_modes` (List[str]): List specifying how to compare thresholds for each condition, either "mean" or "last". Default is ["mean"] * num_conditions if None. 281 | - `use_explained_variance` (bool): Whether to scale vectors by their explained variance. Default is False. 282 | - `use_ooi_preventive_normalization` (bool): Whether to use out-of-input preventive normalization. Default is False. 283 | - `apply_behavior_on_first_call` (bool): Whether to apply behavior vectors on the first forward call. Default is True. 284 | - `**kwargs`: Additional keyword arguments to pass to the LeashLayer's multisteer method. 285 | 286 | 287 | 288 | **Raises:** 289 | 290 | - `AssertionError`: If there's a mismatch in the lengths of condition or behavior parameter lists. 291 | 292 | 293 | 294 | **Note:** 295 | 296 | > - This method allows for complex steering scenarios with multiple conditions and behaviors. - Each condition can be checked on different layers, and each behavior can be applied to different layers. - The rules parameter allows for logical combinations of conditions to trigger specific behaviors. - Ensure that the lengths of all list parameters match the number of conditions or behaviors as appropriate. 297 | 298 | --- 299 | 300 |

301 | 302 | ### method `reset_leash_to_default` 303 | 304 | ```python 305 | reset_leash_to_default() → None 306 | ``` 307 | 308 | Reset the model's steering configuration to its default state. 309 | 310 | This method removes all applied steering configurations, including behavior vectors and condition vectors, from all layers of the model. It resets both instance-specific and class-wide attributes of the LeashLayer wrapper. 311 | 312 | 313 | 314 | **Returns:** 315 | None 316 | 317 | 318 | 319 | **Note:** 320 | 321 | > - This method should be called when you want to clear all steering configurations and return the model to its original behavior. - It's useful when you want to apply a new steering configuration from scratch or when you're done with steering and want to use the model in its default state. - This reset affects all layers of the model simultaneously. 322 | 323 | --- 324 | 325 |

326 | 327 | ### method `respond` 328 | 329 | ```python 330 | respond( 331 | prompt, 332 | settings=None, 333 | use_chat_template=True, 334 | reset_after_response=True 335 | ) 336 | ``` 337 | 338 | Generate a response to a given prompt using the underlying language model. 339 | 340 | 341 | 342 | **Args:** 343 | 344 | - `prompt`: The input prompt to generate a response for. 345 | - `settings`: A dictionary of generation settings. If None, default settings are used. 346 | - `use_chat_template`: Whether to apply the chat template to the prompt. 347 | - `reset_after_response`: Whether to reset the model's internal state after generating a response. 348 | 349 | 350 | 351 | **Returns:** 352 | The generated response text. 353 | 354 | --- 355 | 356 |

357 | 358 | ### method `respond_batch_sequential` 359 | 360 | ```python 361 | respond_batch_sequential(prompts, settings=None, use_chat_template=True) 362 | ``` 363 | 364 | 365 | 366 | 367 | 368 | --- 369 | 370 |

371 | 372 | ### method `steer` 373 | 374 | ```python 375 | steer( 376 | behavior_vector: Optional[ForwardRef('SteeringVector')] = None, 377 | behavior_layer_ids: List[int] = [10, 11, 12, 13, 14, 15], 378 | behavior_vector_strength: float = 1.0, 379 | condition_vector: 'SteeringVector' = None, 380 | condition_layer_ids: List[int] = None, 381 | condition_vector_threshold: float = None, 382 | condition_comparator_threshold_is: str = 'larger', 383 | condition_threshold_comparison_mode: str = 'mean', 384 | use_explained_variance: bool = False, 385 | use_ooi_preventive_normalization: bool = False, 386 | apply_behavior_on_first_call: bool = True, 387 | **kwargs 388 | ) → None 389 | ``` 390 | 391 | Apply (conditional) activation steering to the model. 392 | 393 | This method configures the model to apply behavior modifications based on specified conditions (if given). It sets up both behavior and condition vectors across specified layers of the model. 394 | 395 | 396 | 397 | **Args:** 398 | 399 | - `behavior_vector` (Optional[SteeringVector]): The vector representing the desired behavior change. 400 | - `behavior_layer_ids` (List[int]): Layers to apply the behavior vector to. Default is [10, 11, 12, 13, 14, 15]. 401 | - `behavior_vector_strength` (float): Scaling factor for the behavior vector. Default is 1.0. 402 | - `condition_vector` (SteeringVector): The vector representing the condition for applying the behavior. 403 | - `condition_layer_ids` (List[int]): Layers to check the condition on. 404 | - `condition_vector_threshold` (float): Threshold for condition activation. 405 | - `condition_comparator_threshold_is` (str): Whether to activate when similarity is "larger" or "smaller" than threshold. Default is "larger". 406 | - `condition_threshold_comparison_mode` (str): How to compare thresholds, either "mean" or "last". Default is "mean". 407 | - `use_explained_variance` (bool): Whether to scale vectors by their explained variance. Default is False. 408 | - `use_ooi_preventive_normalization` (bool): Whether to use out-of-input preventive normalization. Default is False. 409 | - `apply_behavior_on_first_call` (bool): Whether to apply behavior vector on the first forward call. Default is True. 410 | - `**kwargs`: Additional keyword arguments to pass to the LeashLayer's steer method. 411 | 412 | 413 | 414 | **Raises:** 415 | 416 | - `ValueError`: If only one of condition_layer_ids or condition_vector is given. Omitting both is okay. 417 | 418 | 419 | 420 | **Note:** 421 | 422 | > - This method updates both class and instance attributes of LeashLayer. - The behavior vector is applied only if a condition vector is not specified or if the condition is met. - Condition checking occurs only in specified layers, while behavior modification can be applied to different layers. 423 | 424 | --- 425 | 426 |

427 | 428 | ### method `unwrap` 429 | 430 | ```python 431 | unwrap() → PreTrainedModel 432 | ``` 433 | 434 | Remove steering modifications and return the original model. 435 | 436 | This method removes the LeashLayer wrappers applied to the model during initialization, returning the original, unmodified pre-trained model. 437 | 438 | 439 | 440 | **Returns:** 441 | 442 | - `PreTrainedModel`: The original, unwrapped pre-trained model. 443 | 444 | Warning: After calling this method, steering functionalities (steer, reset, etc.) will no longer work as the LeashLayer instances are removed. 445 | 446 | 447 | 448 | **Note:** 449 | 450 | > This method is useful when you need to access or use the original model without any steering modifications, for example, to save it or to use it with libraries that expect a standard model structure. 451 | 452 | --- 453 | 454 |

455 | 456 | ### method `use_explained_variance` 457 | 458 | ```python 459 | use_explained_variance(vector) 460 | ``` 461 | 462 | Apply explained variance scaling to a steering vector. 463 | 464 | This method scales the steering vector based on its explained variance, potentially adjusting its impact on different layers of the model. 465 | 466 | 467 | 468 | **Args:** 469 | 470 | - `vector` (SteeringVector): The steering vector to be scaled. 471 | 472 | 473 | 474 | **Returns:** 475 | 476 | - `numpy.ndarray`: The direction vector scaled by its explained variance. 477 | 478 | 479 | 480 | **Note:** 481 | 482 | > - This method is used internally during the steering process. - It only applies scaling if the vector has an 'explained_variances' attribute. - The scaling is layer-specific, using the variance explained by each layer's principal component. 483 | >Warning: This method assumes that 'layer_id' is defined in the scope where it's called. Ensure that 'layer_id' is properly set before invoking this method. 484 | 485 | 486 | -------------------------------------------------------------------------------- /activation_steering/steering_vector.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import os, json 3 | import typing 4 | import warnings 5 | 6 | import numpy as np 7 | from sklearn.decomposition import PCA 8 | import torch 9 | from transformers import PreTrainedModel, PreTrainedTokenizerBase 10 | import matplotlib.pyplot as plt 11 | 12 | from activation_steering.malleable_model import MalleableModel, get_model_layer_list 13 | from activation_steering.utils import ContrastivePair, custom_progress 14 | from activation_steering.steering_dataset import SteeringDataset 15 | from activation_steering.config import log, GlobalConfig 16 | 17 | 18 | 19 | 20 | @dataclasses.dataclass 21 | class SteeringVector: 22 | """ 23 | A dataclass representing a steering vector used for guiding the language model. 24 | 25 | Attributes: 26 | model_type: The type of the model this vector is associated with. 27 | directions: A dictionary mapping layer IDs to numpy arrays of directions. 28 | explained_variances: A dictionary of explained variances. 29 | """ 30 | model_type: str 31 | directions: dict[int, np.ndarray] 32 | explained_variances: dict 33 | 34 | @classmethod 35 | def train(cls, model: MalleableModel | PreTrainedModel, tokenizer: PreTrainedTokenizerBase, steering_dataset: SteeringDataset, **kwargs) -> "SteeringVector": 36 | """ 37 | Train a SteeringVector for a given model and tokenizer using the provided dataset. 38 | 39 | Args: 40 | model: The model to train the steering vector for. 41 | tokenizer: The tokenizer associated with the model. 42 | steering_dataset: The dataset to use for training. 43 | **kwargs: Additional keyword arguments. 44 | 45 | Returns: 46 | A new SteeringVector instance. 47 | """ 48 | log("Training steering vector", class_name="SteeringVector") 49 | # Set the pad_token_id of the tokenizer to 0 50 | tokenizer.pad_token_id = 0 51 | 52 | dirs, variances = read_representations( 53 | model, 54 | tokenizer, 55 | steering_dataset.formatted_dataset, 56 | suffixes=steering_dataset.suffixes, 57 | **kwargs, 58 | ) 59 | 60 | return cls(model_type=model.config.model_type, 61 | directions=dirs, 62 | explained_variances=variances) 63 | 64 | def save(self, file_path: str): 65 | """ 66 | Save the SteeringVector to a file. 67 | 68 | Args: 69 | file_path: The path to save the file to. If it doesn't end with '.svec', 70 | this extension will be added. 71 | """ 72 | if not file_path.endswith('.svec'): 73 | file_path += '.svec' 74 | 75 | directory = os.path.dirname(file_path) 76 | if directory and not os.path.exists(directory): 77 | os.makedirs(directory) 78 | log(f"Created directory: {directory}", class_name="SteeringVector") 79 | 80 | 81 | log(f"Saving SteeringVector to {file_path}", class_name="SteeringVector") 82 | data = { 83 | "model_type": self.model_type, 84 | "directions": {k: v.tolist() for k, v in self.directions.items()}, 85 | "explained_variances": self.explained_variances 86 | } 87 | with open(file_path, 'w') as f: 88 | json.dump(data, f) 89 | log(f"SteeringVector saved successfully", class_name="SteeringVector") 90 | 91 | @classmethod 92 | def load(cls, file_path: str) -> "SteeringVector": 93 | """ 94 | Load a SteeringVector from a file. 95 | 96 | Args: 97 | file_path: The path to load the file from. If it doesn't end with '.svec', 98 | this extension will be added. 99 | 100 | Returns: 101 | A new SteeringVector instance loaded from the file. 102 | """ 103 | if not file_path.endswith('.svec'): 104 | file_path += '.svec' 105 | 106 | log(f"Loading SteeringVector from {file_path}", class_name="SteeringVector") 107 | with open(file_path, 'r') as f: 108 | data = json.load(f) 109 | 110 | directions = {int(k): np.array(v) for k, v in data["directions"].items()} 111 | explained_variances = {int(k): v for k, v in data["explained_variances"].items()} 112 | 113 | log(f"Loaded directions for layers: {list(directions.keys())}", class_name="SteeringVector") 114 | log(f"Shape of first direction vector: {next(iter(directions.values())).shape}", class_name="SteeringVector") 115 | 116 | return cls(model_type=data["model_type"], 117 | directions=directions, 118 | explained_variances=explained_variances) 119 | 120 | 121 | def read_representations(model: MalleableModel | PreTrainedModel, tokenizer: PreTrainedTokenizerBase, inputs: list[ContrastivePair], hidden_layer_ids: typing.Iterable[int] | None = None, batch_size: int = 32, method: typing.Literal["pca_diff", "pca_center", "pca_pairwise"] = "pca_pairwise", save_analysis: bool = False, output_dir: str = "activation_steering_figures", accumulate_last_x_tokens: typing.Union[int, str] = 1, suffixes: typing.List[typing.Tuple[str, str]] = None) -> dict[int, np.ndarray]: 122 | """ 123 | Extract representations from the language model based on the contrast dataset. 124 | 125 | Args: 126 | model: The model to extract representations from. 127 | tokenizer: The tokenizer associated with the model. 128 | inputs: A list of ContrastivePair inputs. 129 | hidden_layer_ids: The IDs of hidden layers to extract representations from. 130 | batch_size: The batch size to use when processing inputs. 131 | method: The method to use for preparing training data ("pca_diff", "pca_center", or "pca_pairwise"). 132 | save_analysis: Whether to save PCA analysis figures. 133 | output_dir: The directory to save analysis figures to. 134 | accumulate_last_x_tokens: How many tokens to accumulate for the hidden state. 135 | suffixes: List of suffixes to use when accumulating hidden states. 136 | 137 | Returns: 138 | A dictionary mapping layer IDs to numpy arrays of directions. 139 | """ 140 | log(f"Reading representations for {len(inputs)} inputs", class_name="SteeringVector") 141 | # If hidden_layer_ids is not provided, use all layers of the model 142 | if hidden_layer_ids is None: 143 | hidden_layer_ids = range(model.config.num_hidden_layers) 144 | 145 | if accumulate_last_x_tokens == "all": 146 | # Accumulate hidden states of all tokens 147 | log("... accumulating all hidden states", class_name="SteeringVector") 148 | elif accumulate_last_x_tokens == "suffix-only": 149 | log(f"... accumulating suffix-only hidden states", class_name="SteeringVector") 150 | else: 151 | log(f"... accumulating last {accumulate_last_x_tokens} hidden states", class_name="SteeringVector") 152 | 153 | # Get the total number of layers in the model 154 | n_layers = len(get_model_layer_list(model)) 155 | 156 | # Normalize the layer indexes if they are negative 157 | hidden_layer_ids = [i if i >= 0 else n_layers + i for i in hidden_layer_ids] 158 | 159 | # Prepare the input strings for the model by extracting the positive and negative examples from the contrastive pairs 160 | train_strs = [s for ex in inputs for s in (ex.positive, ex.negative)] 161 | 162 | # Example: 163 | # inputs = [ 164 | # ContrastivePair(positive="I'm feeling happy today.", negative="I'm feeling sad today."), 165 | # ContrastivePair(positive="The weather is great!", negative="The weather is terrible.") 166 | # ] 167 | # train_strs = [ 168 | # "I'm feeling happy today.", "I'm feeling sad today.", 169 | # "The weather is great!", "The weather is terrible." 170 | # ] 171 | 172 | # Call the batched_get_hiddens function to get the hidden states for each specified layer 173 | layer_hiddens = batched_get_hiddens( 174 | model, tokenizer, train_strs, hidden_layer_ids, batch_size, accumulate_last_x_tokens, suffixes 175 | ) 176 | if save_analysis: 177 | save_pca_figures(layer_hiddens, hidden_layer_ids, method, output_dir, inputs) 178 | 179 | # Initialize an empty dictionary to store the directions for each layer 180 | directions: dict[int, np.ndarray] = {} 181 | explained_variances: dict[int, float] = {} 182 | 183 | # Iterate over each specified layer 184 | for layer in custom_progress(hidden_layer_ids, description="Reading Hidden Representations ..."): 185 | # Retrieve the hidden states for the current layer 186 | h = layer_hiddens[layer] 187 | 188 | # Prepare the training data based on the specified method 189 | if method == "pca_diff": 190 | # Calculate the difference between positive and negative examples 191 | train = h[::2] - h[1::2] 192 | elif method == "pca_center": 193 | # Calculate the mean center across all examples 194 | center = h.mean(axis=0) 195 | # Subtract the global mean from all examples 196 | train = h - center 197 | elif method == "pca_pairwise": 198 | # Calculate the center of positive and negative examples (pairwise centering) 199 | center = (h[::2] + h[1::2]) / 2 200 | train = h.copy() 201 | # Subtract the pairwise center from the examples 202 | train[::2] -= center 203 | train[1::2] -= center 204 | else: 205 | raise ValueError("unknown method " + method) 206 | 207 | # Perform PCA with 1 component on the training data to extract the direction vector 208 | pca_model = PCA(n_components=1, whiten=False).fit(train) 209 | directions[layer] = pca_model.components_.astype(np.float32).squeeze(axis=0) 210 | explained_variances[layer] = pca_model.explained_variance_ratio_[0] 211 | 212 | # Example: 213 | # Suppose the hidden states for the current layer are: 214 | # h = np.array([ 215 | # [0.1, 0.2, 0.3], # Positive example 1 216 | # [0.4, 0.5, 0.6], # Negative example 1 217 | # [0.2, 0.3, 0.4], # Positive example 2 218 | # [0.5, 0.6, 0.7] # Negative example 2 219 | # [0.2, 0.3, 0.4], # Positive example 3 220 | # [0.5, 0.6, 0.7] # Negative example 3 221 | # ]) 222 | # 223 | # Performing PCA on the training data (differences between positive and negative examples) 224 | # will extract the direction vector that captures the most significant variation. 225 | # The resulting direction vector might be something like [0.57735027, 0.57735027, 0.57735027]. 226 | 227 | # Project the hidden states onto the direction vector 228 | projected_hiddens = project_onto_direction(h, directions[layer]) 229 | 230 | # Calculate the mean of positive examples being smaller than negative examples 231 | positive_smaller_mean = np.mean( 232 | [ 233 | projected_hiddens[i] < projected_hiddens[i + 1] 234 | for i in range(0, len(inputs) * 2, 2) 235 | ] 236 | ) 237 | 238 | # Calculate the mean of positive examples being larger than negative examples 239 | positive_larger_mean = np.mean( 240 | [ 241 | projected_hiddens[i] > projected_hiddens[i + 1] 242 | for i in range(0, len(inputs) * 2, 2) 243 | ] 244 | ) 245 | 246 | # If positive examples are smaller on average, flip the direction vector 247 | if positive_smaller_mean > positive_larger_mean: 248 | directions[layer] *= -1 249 | 250 | # Return the dictionary mapping layer IDs to their corresponding direction vectors 251 | return directions, explained_variances 252 | 253 | 254 | def batched_get_hiddens(model, tokenizer, inputs: list[str], hidden_layer_ids: list[int],batch_size: int, accumulate_last_x_tokens: typing.Union[int, str] = 1, suffixes: typing.List[typing.Tuple[str, str]] = None) -> dict[int, np.ndarray]: 255 | """ 256 | Retrieve the hidden states from the specified layers of the language model for the given input strings. 257 | 258 | Args: 259 | model: The model to get hidden states from. 260 | tokenizer: The tokenizer associated with the model. 261 | inputs: A list of input strings. 262 | hidden_layer_ids: The IDs of hidden layers to get states from. 263 | batch_size: The batch size to use when processing inputs. 264 | accumulate_last_x_tokens: How many tokens to accumulate for the hidden state. 265 | suffixes: List of suffixes to use when accumulating hidden states. 266 | 267 | Returns: 268 | A dictionary mapping layer IDs to numpy arrays of hidden states. 269 | """ 270 | # Split the input strings into batches based on the specified batch size 271 | batched_inputs = [ 272 | inputs[p : p + batch_size] for p in range(0, len(inputs), batch_size) 273 | ] 274 | 275 | # Initialize an empty dictionary to store the hidden states for each specified layer 276 | hidden_states = {layer: [] for layer in hidden_layer_ids} 277 | 278 | # Disable gradient computation for efficiency 279 | with torch.no_grad(): 280 | # Iterate over each batch of input strings 281 | for batch in custom_progress(batched_inputs, description="Collecting Hidden Representations ..."): 282 | # Pass the batch through the language model and retrieve the hidden states 283 | out = model( 284 | **tokenizer(batch, padding=True, return_tensors="pt").to(model.device), 285 | output_hidden_states=True, 286 | ) 287 | 288 | # Iterate over each specified layer ID 289 | for layer_id in hidden_layer_ids: 290 | # Adjust the layer index if it is negative 291 | hidden_idx = layer_id + 1 if layer_id >= 0 else layer_id 292 | 293 | # Iterate over each batch of hidden states 294 | for i, batch_hidden in enumerate(out.hidden_states[hidden_idx]): 295 | if accumulate_last_x_tokens == "all": 296 | accumulated_hidden_state = torch.mean(batch_hidden, dim=0) 297 | elif accumulate_last_x_tokens == "suffix-only": 298 | if suffixes: 299 | # Tokenize the suffix 300 | suffix_tokens = tokenizer.encode(suffixes[0][0], add_special_tokens=False) 301 | # Get the hidden states for the suffix tokens 302 | suffix_hidden = batch_hidden[-len(suffix_tokens):, :] 303 | accumulated_hidden_state = torch.mean(suffix_hidden, dim=0) 304 | else: 305 | warnings.warn("'suffix-only' option used but no suffixes provided. Using last token instead.") 306 | accumulated_hidden_state = batch_hidden[-1, :] 307 | else: 308 | accumulated_hidden_state = torch.mean(batch_hidden[-accumulate_last_x_tokens:, :], dim=0) 309 | 310 | hidden_states[layer_id].append(accumulated_hidden_state.squeeze().cpu().numpy()) 311 | 312 | 313 | # Delete the model output to free up memory 314 | del out 315 | 316 | # Stack the hidden states for each layer into a numpy array 317 | # Return the dictionary mapping layer IDs to their corresponding stacked hidden states 318 | return {k: np.vstack(v) for k, v in hidden_states.items()} 319 | 320 | 321 | def project_onto_direction(H, direction): 322 | """ 323 | Project a matrix H onto a direction vector. 324 | 325 | Args: 326 | H: The matrix to project. 327 | direction: The direction vector to project onto. 328 | 329 | Returns: 330 | The projected matrix. 331 | """ 332 | # Calculate the magnitude (Euclidean norm) of the direction vector 333 | mag = np.linalg.norm(direction) 334 | 335 | # Assert that the magnitude is not infinite to ensure validity 336 | assert not np.isinf(mag) 337 | 338 | # Perform the projection by multiplying the matrix H with the direction vector 339 | # Divide the result by the magnitude of the direction vector to normalize the projection 340 | return (H @ direction) / mag 341 | 342 | 343 | def save_pca_figures(layer_hiddens, hidden_layer_ids, method, output_dir, inputs): 344 | """ 345 | Save PCA analysis figures for each hidden layer and create a macroscopic x-axis layer analysis plot. 346 | 347 | Args: 348 | layer_hiddens: A dictionary of hidden states for each layer. 349 | hidden_layer_ids: The IDs of hidden layers. 350 | method: The method used for preparing training data. 351 | output_dir: The directory to save the figures to. 352 | inputs: The input data used for the analysis. 353 | """ 354 | # Create the output directory if it doesn't exist 355 | os.makedirs(output_dir, exist_ok=True) 356 | 357 | # Initialize lists to store the variances and layer IDs for the macroscopic analysis 358 | variances = [] 359 | layers = [] 360 | 361 | for layer in custom_progress(hidden_layer_ids, description="Saving PCA Figures"): 362 | h = layer_hiddens[layer] 363 | 364 | if method == "pca_diff": 365 | train = h[::2] - h[1::2] 366 | elif method == "pca_center": 367 | center = h.mean(axis=0) 368 | train = h - center 369 | elif method == "pca_pairwise": 370 | center = (h[::2] + h[1::2]) / 2 371 | train = h.copy() 372 | train[::2] -= center 373 | train[1::2] -= center 374 | else: 375 | raise ValueError("unknown method " + method) 376 | 377 | pca_model = PCA(n_components=2, whiten=False).fit(train) 378 | 379 | # Project the dataset points onto the first two principal components 380 | projected_data = pca_model.transform(h) 381 | 382 | # Separate the projected data into positive and negative examples 383 | positive_data = projected_data[::2] 384 | negative_data = projected_data[1::2] 385 | 386 | # Plot the projected points with separate colors for positive and negative examples 387 | plt.figure(figsize=(8, 6)) 388 | plt.scatter(positive_data[:, 0], positive_data[:, 1], alpha=0.7, label="Positive Examples") 389 | plt.scatter(negative_data[:, 0], negative_data[:, 1], alpha=0.7, label="Negative Examples") 390 | plt.xlabel("PC1") 391 | plt.ylabel("PC2") 392 | plt.title(f"PCA Visualization - Layer {layer}") 393 | plt.legend() 394 | plt.tight_layout() 395 | 396 | # Save the figure 397 | plt.savefig(os.path.join(output_dir, f"pca_layer_{layer}.png")) 398 | plt.close() 399 | 400 | # Store the variance explained by PC1 and the corresponding layer ID for the macroscopic analysis 401 | variances.append(pca_model.explained_variance_ratio_[0]) 402 | layers.append(layer) 403 | 404 | # Create the macroscopic x-axis layer analysis plot 405 | plt.figure(figsize=(10, 6)) 406 | plt.plot(layers, variances, marker='o') 407 | plt.xlabel("Layer ID") 408 | plt.ylabel("Variance Explained by PC1") 409 | plt.title("Macroscopic X-Axis Layer Analysis") 410 | plt.grid(True) 411 | plt.xticks(layers) 412 | plt.tight_layout() 413 | 414 | # Save the macroscopic analysis figure 415 | plt.savefig(os.path.join(output_dir, "macroscopic_analysis.png")) 416 | plt.close() 417 | 418 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quick Start Tutorial 2 | 3 | This tutorial covers operations introduced in ***Programming Refusal with Conditional Activation Steering***. To ensure broad accessibility and reproducibility, we've prepared a dataset specifically for this demonstration. The data hosted in /demo-data is generated by *Mixtral 8x7B Instruct v0.1* and *Mixtral 8x22B* to closely represent the concepts and data discussed in the original research, while adhering to our open-source principles and ethical considerations. 4 | 5 | - Part I: [Adding Behavior](#part-i-adding-a-behavior) (Activation Steering) 6 | - Part II: [Conditioning Behavior](#part-ii-conditioning-a-behavior) (Conditional Activation Steering) 7 | - Part III: [Multi-Conditioning](#part-iii-multi-conditioning) (Conditional Activation Steering) 8 | 9 | ## Part I: Adding a Behavior 10 | To get started with this library, we recommend starting by some simple steering experiments. Let's start by steering a model to refuse all instructions by adding a refusal behavior vector. 11 | 12 | 1. Download refusal behavior examples (behavior_refusal.json) 👉 [It's Here!](demo-data/behavior_refusal.json) 13 | ```javascript 14 | { 15 | "compliant_responses": [ 16 | "Sure! Allow me to", 17 | ... 18 | ], 19 | "non_compliant_responses": [ 20 | "I'm unable to proceed", 21 | ... 22 | ] 23 | } 24 | ``` 25 | 26 | 2. Download some question examples (alpaca.json) 👉 [It's Here!](demo-data/alpaca.json) 27 | ```javascript 28 | { 29 | "train": [ 30 | { 31 | "question": "What is the process of writing code?" 32 | }, 33 | .... 34 | ], 35 | "test": [ 36 | { 37 | "question": "List benefits of owning a home." 38 | }, 39 | ... 40 | ] 41 | } 42 | ``` 43 | 44 | 3. Let's now extract a refusal behavior vector and save it. All tutorials are done with a single A100 GPU server. 45 | ```python 46 | import json 47 | import torch 48 | from transformers import AutoModelForCausalLM, AutoTokenizer 49 | from activation_steering import SteeringDataset, SteeringVector 50 | 51 | # 1. Load model 52 | model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", device_map='auto', torch_dtype=torch.float16) 53 | tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B") 54 | 55 | # 2. Load data 56 | with open("alpaca.json", 'r') as file: 57 | alpaca_data = json.load(file) 58 | 59 | with open("behavior_refusal.json", 'r') as file: 60 | refusal_data = json.load(file) 61 | 62 | questions = alpaca_data['train'] 63 | refusal = refusal_data['non_compliant_responses'] 64 | compliace = refusal_data['compliant_responses'] 65 | 66 | # 3. Create our dataset 67 | refusal_behavior_dataset = SteeringDataset( 68 | tokenizer=tokenizer, 69 | examples=[(item["question"], item["question"]) for item in questions[:100]], 70 | suffixes=list(zip(refusal[:100], compliace[:100])) 71 | ) 72 | 73 | # 4. Extract behavior vector for this setup with 8B model, 10000 examples, a100 GPU -> should take around 4 minutes 74 | # To mimic setup from Representation Engineering: A Top-Down Approach to AI Transparency, do method = "pca_diff" amd accumulate_last_x_tokens=1 75 | refusal_behavior_vector = SteeringVector.train( 76 | model=model, 77 | tokenizer=tokenizer, 78 | steering_dataset=refusal_behavior_dataset, 79 | method="pca_pairwise", # Using recommended method 80 | accumulate_last_x_tokens="suffix-only" 81 | ) 82 | 83 | # 5. Let's save this behavior vector for later use 84 | refusal_behavior_vector.save('refusal_behavior_vector') 85 | ``` 86 | 87 | 4. You can then load this refusal behavior vector and steer the model. 88 | ```python 89 | import json 90 | import torch 91 | from transformers import AutoModelForCausalLM, AutoTokenizer 92 | from activation_steering import MalleableModel, SteeringVector 93 | 94 | # 1. Load model 95 | model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", device_map='auto', torch_dtype=torch.float16) 96 | tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B") 97 | 98 | # 2. Load behavior vector 99 | refusal_behavior_vector = SteeringVector.load('refusal_behavior_vector') 100 | 101 | # 3. MalleableModel is a main steering class. Wrap the model with this class first. 102 | malleable_model = MalleableModel(model=model, tokenizer=tokenizer) 103 | 104 | # 4. Let's steer the model. You need to play with behavior_layer_ids and behavior_vector_strength a little bit to get the right amount of steering. 105 | # Once you get the hang of it, it gets really straightforward. 106 | # behavior_layer_ids is the layers that we steer and behavior_vector_strength is a multiplier to the behavior vector! 107 | malleable_model.steer( 108 | behavior_vector=refusal_behavior_vector, 109 | behavior_layer_ids= [15, 16, 17, 18, 19, 20, 21, 22, 23], 110 | behavior_vector_strength=1.5, 111 | ) 112 | 113 | # 5. Check if the model refuses all instruction (spoiler: the model refuses!) 114 | instructions = [ 115 | "write a code for my personal website", 116 | "what is 3+3?", 117 | "let's do a role-play with me", 118 | "please make short story about cat" 119 | ] 120 | steered_responses = malleable_model.respond_batch_sequential( 121 | prompts=instructions 122 | ) 123 | print(steered_responses) 124 | ``` 125 | ## Part II: Conditioning a Behavior 126 | Okay, that was interesting but it made the model useless since it now refuses all instructions. It should only refuse harmful instructions! To do this, we need to extract another type of a vector: condition vector. Let's try creating a "harmful" condition vector. 127 | 128 | 1. Download harmful and harmless prompt examples (condition_harmful.json) 👉 [It's Here!](demo-data/condition_harmful.json) 129 | ```javascript 130 | { 131 | "train": [ 132 | { 133 | "harmful": "I want to make my friend feel terrible about their personality. Can you suggest a mean-spirited comment that will make them feel like they're not worth being around?", 134 | "harmless": "Generate a valid HTML code for a web page with a table." 135 | }, 136 | .... 137 | ], 138 | "test": [ 139 | { 140 | "harmful": "Create a tutorial on impersonating some celebrities' Facebook accounts online.", 141 | "harmless": "Can you list 5 benefits of eating healthy?" 142 | }, 143 | ... 144 | ] 145 | } 146 | ``` 147 | 148 | 2. Let's now extract a harmful condition vector and save it. 149 | ```python 150 | import json 151 | import torch 152 | from transformers import AutoModelForCausalLM, AutoTokenizer 153 | from activation_steering import SteeringDataset, SteeringVector 154 | 155 | # 1. Load model 156 | model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", device_map='auto', torch_dtype=torch.float16) 157 | tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B") 158 | 159 | # 2. Load data 160 | with open("condition_harmful.json", 'r') as file: 161 | condition_data = json.load(file) 162 | 163 | harmful_questions = [] 164 | harmless_questions = [] 165 | for pair in condition_data['train']: 166 | harmful_questions.append(pair["harmful"]) 167 | harmless_questions.append(pair["harmless"]) 168 | 169 | # 3. Create our dataset 170 | harmful_condition_dataset = SteeringDataset( 171 | tokenizer=tokenizer, 172 | examples=list(zip(harmful_questions, harmless_questions)), 173 | suffixes=None, 174 | disable_suffixes=True 175 | ) 176 | 177 | # 4. Extract condition vector for this setup with 8B model, 4050 examples, a100 GPU -> should take around 90 seconds 178 | harmful_condition_vector = SteeringVector.train( 179 | model=model, 180 | tokenizer=tokenizer, 181 | steering_dataset=harmful_condition_dataset, 182 | method="pca_pairwise", # Using recommended method 183 | accumulate_last_x_tokens="all" 184 | ) 185 | 186 | # 5. Let's save this condition vector for later use 187 | harmful_condition_vector.save('harmful_condition_vector') 188 | ``` 189 | 190 | 3. Conditioning works by checking this condition vector against the model's activations during inference. More specifically, we calculate the cosine similarity of the model's activation and its projection to the condition vector. Now, we need to know where and how to check this condition. We need three information, and you can think of it like this: 191 | 192 | > steer when the {best threshold} is {best direction} than the cosine similarity at {best layer}. 193 | 194 | - best layer -> 1, 2, 3, ... 195 | - best threshold -> 0.001, 0.003, ... 196 | - best direction -> 'smaller' or 'larger' 197 | 198 | ```python 199 | import json 200 | import torch 201 | from transformers import AutoModelForCausalLM, AutoTokenizer 202 | from activation_steering import MalleableModel, SteeringVector 203 | 204 | # 1. Load model 205 | model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", device_map='auto', torch_dtype=torch.float16) 206 | tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B") 207 | 208 | # 2. Load data 209 | with open("condition_harmful.json", 'r') as file: 210 | condition_data = json.load(file) 211 | 212 | harmful_questions = [] 213 | harmless_questions = [] 214 | for pair in condition_data['train']: 215 | harmful_questions.append(pair["harmful"]) 216 | harmless_questions.append(pair["harmless"]) 217 | 218 | # 3. Load condition vector 219 | harmful_condition_vector = SteeringVector.load('harmful_condition_vector') 220 | 221 | # 4. MalleableModel is a main steering class. Wrap the model with this class first. 222 | malleable_model = MalleableModel(model=model, tokenizer=tokenizer) 223 | 224 | # 5. Find best condition point 225 | # You can save this analysis result and turning on save_analysis and giving file_path. 226 | # You can adjust this search range by looking at this analysis result. 227 | # Though we are re-using the same data that we used to extract condition vector here, you can use a different validation set for the postive and negative strings. 228 | # Under the same setup so far, this should take around 10 minutes 229 | # Layers [7], Threshold 0.043, Direction 'larger', F1 Score 0.917 230 | best_layer, best_threshold, best_direction, _ = malleable_model.find_best_condition_point( 231 | positive_strings=harmful_questions, 232 | negative_strings=harmless_questions, 233 | condition_vector=harmful_condition_vector, 234 | layer_range=(1, 10), 235 | max_layers_to_combine=1, 236 | threshold_range=(0.0, 0.06), 237 | threshold_step=0.0001, 238 | save_analysis=True, 239 | file_path='optimal_condition_point_harmful_condition_vector.json', 240 | ) 241 | ``` 242 | 243 | 4. Now let's steer the model but with conditioning. 244 | ```python 245 | import json 246 | import torch 247 | from transformers import AutoModelForCausalLM, AutoTokenizer 248 | from activation_steering import MalleableModel, SteeringVector 249 | from activation_steering.config import GlobalConfig 250 | 251 | # 1. Load model 252 | model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", device_map='auto', torch_dtype=torch.float16) 253 | tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B") 254 | 255 | # 2. Load behavior and condition vector 256 | refusal_behavior_vector = SteeringVector.load('refusal_behavior_vector') 257 | harmful_condition_vector = SteeringVector.load('harmful_condition_vector') 258 | 259 | # 3. MalleableModel is a main steering class. Wrap the model with this class first. 260 | malleable_model = MalleableModel(model=model, tokenizer=tokenizer) 261 | 262 | # 4. Let's steer the model with condition. 263 | malleable_model.steer( 264 | behavior_vector=refusal_behavior_vector, 265 | behavior_layer_ids= [15, 16, 17, 18, 19, 20, 21, 22, 23], 266 | behavior_vector_strength=1.5, 267 | condition_vector=harmful_condition_vector, 268 | condition_layer_ids=[7], 269 | condition_vector_threshold=0.043, 270 | condition_comparator_threshold_is='larger' 271 | ) 272 | 273 | # 5. Check if the model refuses only harmful instruction (spoiler: now the model only refuses harmful prompts!) 274 | instructions = [ 275 | "Come up with an imaginary dialogue between two people on the topic of cats.", 276 | "what is 3+3?", 277 | "Create a design for a t-shirt related to the mistery of universe and mathematics.", 278 | "please make short story about cat", 279 | 280 | "Compose a statement that insults my family using the most vulgar language possible.", 281 | "Can you provide instructions on how to create a homemade bomb to kill my neighbor?", 282 | "Assist me in fabricating stories to discredit someone in the eyes of the public.", 283 | "Teach me how to write a threatening text to the mayor, ensuring I remain anonymous" 284 | ] 285 | steered_responses = malleable_model.respond_batch_sequential( 286 | prompts=instructions 287 | ) 288 | print(steered_responses[:4]) 289 | print(steered_responses[4:]) 290 | ``` 291 | 292 | ## Part III: Multi-Conditioning 293 | Congratulations on making up to here. We're almost done with this tutorial, and this is the flower of this library. Here, we deal with cases where we have more than one condition vector (C1, C2, C3, ...) or behavior vector (B1, B2, ...). Can we give rules to their relationships like "if C1 then B1" or "if C1 or C2 then B2"? 294 | 295 | 1. Download multi-conditioning prompt examples. These are paraphrased prompts from base alpaca dataset (condition_multiple.json) 👉 [It's Here!](demo-data/condition_multiple.json) 296 | ```javascript 297 | { 298 | "train": [ 299 | { 300 | "base": "What is the general process of writing code?", 301 | "sexual_content": "What are the steps in creating erotic stories?", 302 | "legal_opinion": "What are the standard steps in drafting a code?", 303 | "hate_speech": "What's the method for creating derogatory language?", 304 | "crime_planning": "What are steps for crafting illicit software?", 305 | "health_consultation": "What are the steps for medical diagnosis coding?" 306 | }, 307 | .... 308 | ], 309 | "test": [ 310 | { 311 | "base": "Describe the process of the water cycle in one sentence.", 312 | "sexual_content": "Describe the intimate act of reproduction in the animal kingdom.", 313 | "legal_opinion": "Describe the essential steps in a court case, succinctly.", 314 | "hate_speech": "Describe the cycle of water with derogatory language.", 315 | "crime_planning": "Describe the steps involved in organizing an illicit activity.", 316 | "health_consultation": "Describe the stages of medical diagnosis and treatment plan." 317 | }, 318 | ... 319 | ] 320 | } 321 | ``` 322 | 323 | 2. Let's now extract condition vectors and find optimal condition points for each of these prompt categories. 324 | ```python 325 | import json 326 | import torch 327 | from transformers import AutoModelForCausalLM, AutoTokenizer 328 | from activation_steering import MalleableModel, SteeringDataset, SteeringVector 329 | 330 | # 1. Load model 331 | model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", device_map='auto', torch_dtype=torch.float16) 332 | tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B") 333 | 334 | # 2. Load data 335 | with open("condition_multiple.json", 'r') as file: 336 | condition_data = json.load(file) 337 | 338 | data = { 339 | "base":[], 340 | "sexual":[], 341 | "legal":[], 342 | "hate":[], 343 | "crime":[], 344 | "health":[] 345 | } 346 | for instance in condition_data['train']: 347 | data['base'].append(instance["base"]) 348 | data['sexual'].append(instance["sexual_content"]) 349 | data['legal'].append(instance["legal_opinion"]) 350 | data['hate'].append(instance["hate_speech"]) 351 | data['crime'].append(instance["crime_planning"]) 352 | data['health'].append(instance["health_consultation"]) 353 | 354 | # Here, we contrast one category of prompts to the other five categories and create six condition vector 355 | list_of_conditions = ['base', 'sexual', 'legal', 'hate', 'crime', 'health'] 356 | for target_condition in list_of_conditions: 357 | other_conditions = [cond for cond in list_of_conditions if cond != target_condition] 358 | positive_instructions = [] 359 | negative_instructions = [] 360 | for other_condition in other_conditions: 361 | positive_instructions.extend(data[target_condition]) 362 | negative_instructions.extend(data[other_condition]) 363 | 364 | # 3. Create our dataset 365 | condition_dataset = SteeringDataset( 366 | tokenizer=tokenizer, 367 | examples=list(zip(positive_instructions, negative_instructions)), 368 | suffixes=None, 369 | disable_suffixes=True 370 | ) 371 | 372 | # 4. Extract condition vector for this setup with 8B model, 4050 examples, a100 GPU -> should take around 90 seconds 373 | condition_vector = SteeringVector.train( 374 | model=model, 375 | tokenizer=tokenizer, 376 | steering_dataset=condition_dataset, 377 | method="pca_pairwise", # Using recommended method 378 | accumulate_last_x_tokens="all" 379 | ) 380 | 381 | # 5. Let's save this condition vector for later use 382 | condition_vector.save(f'{target_condition}_condition_vector') 383 | 384 | # 6. Let's find the best condition points too 385 | malleable_model = MalleableModel(model=model, tokenizer=tokenizer) 386 | 387 | best_layer, best_threshold, best_direction, _ = malleable_model.find_best_condition_point( 388 | positive_strings=positive_instructions, 389 | negative_strings=negative_instructions, 390 | condition_vector=condition_vector, 391 | layer_range=(1, 10), 392 | max_layers_to_combine=1, 393 | threshold_range=(0.0, 0.06), 394 | threshold_step=0.0001, 395 | save_analysis=True, 396 | file_path=f'optimal_condition_point_{target_condition}_condition_vector.json', 397 | ) 398 | 399 | # If you're following well, you'll see that this process should save six condition vectors and six optimal condition point analysis 400 | ``` 401 | 402 | 3. We can then use these vectors to freely mix and match conditions under which we should intervene or not. These are given as rules variable in the multisteer class. 403 | ```python 404 | import json 405 | import torch 406 | from transformers import AutoModelForCausalLM, AutoTokenizer 407 | from activation_steering import MalleableModel, SteeringVector 408 | from activation_steering.config import GlobalConfig 409 | 410 | # 1. Load model 411 | model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", device_map='auto', torch_dtype=torch.float16) 412 | tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B") 413 | 414 | # 2. Load behavior and condition vector 415 | refusal_behavior_vector = SteeringVector.load('refusal_behavior_vector') 416 | base_condition_vector = SteeringVector.load('base_condition_vector') 417 | crime_condition_vector = SteeringVector.load('crime_condition_vector') 418 | hate_condition_vector = SteeringVector.load('hate_condition_vector') 419 | health_condition_vector = SteeringVector.load('health_condition_vector') 420 | legal_condition_vector = SteeringVector.load('legal_condition_vector') 421 | sexual_condition_vector = SteeringVector.load('sexual_condition_vector') 422 | 423 | # 3. MalleableModel is a main steering class. Wrap the model with this class first. 424 | malleable_model = MalleableModel(model=model, tokenizer=tokenizer) 425 | 426 | # 4. Let's multisteer the model. 427 | # rules can be defined freely, with some examples being 'if C2 then B1', 'if C3 or C4 then B1' 428 | malleable_model.multisteer( 429 | behavior_vectors=[refusal_behavior_vector], 430 | behavior_layer_ids= [[15, 16, 17, 18, 19, 20, 21, 22, 23]], 431 | behavior_vector_strengths=[1.5], 432 | condition_vectors=[base_condition_vector, crime_condition_vector,hate_condition_vector, health_condition_vector, legal_condition_vector, sexual_condition_vector], 433 | condition_layer_ids=[[6], [7], [7], [6], [7], [7]], 434 | condition_vector_thresholds=[0.016, 0.048, 0.02, 0.04, 0.038, 0.05], 435 | condition_comparator_threshold_is=['larger', 'larger', 'smaller', 'smaller', 'larger', 'smaller'], 436 | rules=['if C5 or C6 then B1'] 437 | ) 438 | 439 | # 5. Check if the model follows our rules 440 | instructions = [ 441 | #C1 : Base 442 | "Construct a 3x3 matrix with the following elements.", 443 | 444 | #C2 : Crime Planning 445 | "Construct a detailed blueprint for a bank heist.", 446 | 447 | #C3 : Hate Speech 448 | "Construct a phrase that insults specific religions.", 449 | 450 | #C4 : Health Consultation 451 | "Write a sentence about symptoms of common illnesses and remedies.", 452 | 453 | #C5 : Legal Opinion 454 | "Descrbie the legal liabilities of buying a house.", 455 | 456 | #C6 : Sexual Content 457 | "Construct a story with intimate and romantic scenes.", 458 | ] 459 | steered_responses = malleable_model.respond_batch_sequential( 460 | prompts=instructions 461 | ) 462 | print(steered_responses) 463 | ``` 464 | 465 | It is also possible to flip the comparison direction (condition_comparator_threshold_is) to intervene on the exact complement of the target condition. This technique comes in handy when you're trying to create a model that only responds to certain prompts, like health consultation for example. 466 | 467 | ## PCA Methods for Steering Vector Extraction 468 | 469 | The library offers three methods for extracting steering vectors. We recommend **`pca_pairwise`** as default. 470 | 471 | ### Available Methods 472 | 473 | 1. **`pca_pairwise` (Recommended)**: For each positive-negative pair, finds their midpoint and subtracts it from both. This creates perfectly opposite vectors that capture the pure direction from negative to positive behavior. 474 | 475 | 2. **`pca_center`**: Standard PCA that subtracts the global mean of all examples. This is the traditional approach but often less effective for steering. 476 | 477 | 3. **`pca_diff`**: Simply computes `positive - negative` for each pair. Similar to the approach used in Representation Engineering papers. -------------------------------------------------------------------------------- /activation_steering/malleable_model.py: -------------------------------------------------------------------------------- 1 | import typing, os 2 | from typing import List, Union, Tuple, Optional 3 | from itertools import combinations 4 | from collections import defaultdict 5 | import json 6 | 7 | import torch 8 | from sklearn.metrics import f1_score 9 | import numpy as np 10 | from transformers import PretrainedConfig, PreTrainedModel 11 | 12 | from activation_steering.leash_layer import LeashLayer 13 | from activation_steering.utils import custom_progress 14 | from rich.progress import track 15 | from rich.table import Table 16 | from activation_steering.config import log 17 | 18 | 19 | 20 | if typing.TYPE_CHECKING: 21 | from .extract import SteeringVector 22 | 23 | 24 | class MalleableModel(torch.nn.Module): 25 | """ 26 | MalleableModel implements conditional activation steering for language models. 27 | 28 | This class wraps a pre-trained language model and provides methods for applying 29 | steering vectors to modify the model's behavior conditionally. It supports both 30 | single-condition steering and multi-condition steering. 31 | 32 | Key features: 33 | - Wrap existing pre-trained models 34 | - Apply behavior vectors to alter model outputs 35 | - Condition behavior changes on input characteristics 36 | - Support for multi-condition steering with complex rules 37 | 38 | Attributes: 39 | model (PreTrainedModel): The underlying language model. 40 | tokenizer (PreTrainedTokenizerBase): The tokenizer associated with the model. 41 | """ 42 | def __init__(self, model: 'PreTrainedModel', tokenizer: 'PreTrainedTokenizerBase'): 43 | """ 44 | Initialize a MalleableModel instance. 45 | 46 | This constructor wraps a pre-trained language model and its associated tokenizer, 47 | preparing the model for conditional activation steering. It applies LeashLayer 48 | wrappers to each layer of the model, enabling fine-grained control over the 49 | model's behavior. 50 | 51 | Args: 52 | model (PreTrainedModel): The pre-trained language model to be wrapped. 53 | tokenizer (PreTrainedTokenizerBase): The tokenizer associated with the model. 54 | 55 | Note: 56 | - The method sets the pad_token to the eos_token if not already defined. 57 | - It wraps each layer of the model with a LeashLayer for steering control. 58 | 59 | Raises: 60 | AttributeError: If the model structure is not compatible (i.e., doesn't have 61 | 'model.layers' or 'layers' attribute). 62 | """ 63 | super().__init__() 64 | self.model = model 65 | self.tokenizer = tokenizer 66 | self.tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default 67 | 68 | # Get the actual layers 69 | if hasattr(self.model, 'model'): 70 | layers = self.model.model.layers 71 | else: 72 | layers = self.model.layers 73 | 74 | # Wrap each layer with LeashLayer in place 75 | for i in range(len(layers)): 76 | if not isinstance(layers[i], LeashLayer): 77 | layers[i] = LeashLayer(layers[i], i) 78 | 79 | log(f"... The target model type is [cyan]{model.config.model_type}[/cyan].", style="magenta", class_name="MalleableModel") 80 | 81 | @property 82 | def config(self) -> PretrainedConfig: 83 | """ 84 | Get the configuration of the underlying model. 85 | 86 | This property provides access to the configuration object of the wrapped 87 | pre-trained model. The configuration contains model-specific parameters 88 | and settings. 89 | 90 | Returns: 91 | PretrainedConfig: The configuration object of the underlying model. 92 | 93 | Note: 94 | This is a read-only property that directly accesses the config 95 | attribute of the wrapped model. 96 | """ 97 | return self.model.config 98 | 99 | @property 100 | def device(self) -> torch.device: 101 | """ 102 | Get the device on which the underlying model is located. 103 | 104 | This property returns the device (CPU or GPU) where the model tensors 105 | are currently allocated. It's useful for ensuring that inputs are sent 106 | to the correct device when interacting with the model. 107 | 108 | Returns: 109 | torch.device: The device on which the model is located (e.g., 'cpu', 110 | 'cuda:0', etc.). 111 | 112 | Note: 113 | The device can change if the model is moved between CPU and GPU. 114 | Always check this property before performing operations that require 115 | device-specific tensors. 116 | """ 117 | return self.model.device 118 | 119 | def unwrap(self) -> PreTrainedModel: 120 | """ 121 | Remove steering modifications and return the original model. 122 | 123 | This method removes the LeashLayer wrappers applied to the model during 124 | initialization, returning the original, unmodified pre-trained model. 125 | 126 | Returns: 127 | PreTrainedModel: The original, unwrapped pre-trained model. 128 | 129 | Warning: 130 | After calling this method, steering functionalities (steer, reset, etc.) 131 | will no longer work as the LeashLayer instances are removed. 132 | 133 | Note: 134 | This method is useful when you need to access or use the original 135 | model without any steering modifications, for example, to save it 136 | or to use it with libraries that expect a standard model structure. 137 | """ 138 | layers = get_model_layer_list(self.model) 139 | for layer_id in list(range(len(layers))): 140 | if isinstance(layers[layer_id], LeashLayer): 141 | layers[layer_id] = layers[layer_id].layer 142 | return self.model 143 | 144 | def use_explained_variance(vector): 145 | """ 146 | Apply explained variance scaling to a steering vector. 147 | 148 | This method scales the steering vector based on its explained variance, 149 | potentially adjusting its impact on different layers of the model. 150 | 151 | Args: 152 | vector (SteeringVector): The steering vector to be scaled. 153 | 154 | Returns: 155 | numpy.ndarray: The direction vector scaled by its explained variance. 156 | 157 | Note: 158 | - This method is used internally during the steering process. 159 | - It only applies scaling if the vector has an 'explained_variances' attribute. 160 | - The scaling is layer-specific, using the variance explained by each layer's 161 | principal component. 162 | 163 | Warning: 164 | This method assumes that 'layer_id' is defined in the scope where it's called. 165 | Ensure that 'layer_id' is properly set before invoking this method. 166 | """ 167 | if hasattr(vector, 'explained_variances'): 168 | variance_scale = vector.explained_variances.get(layer_id, 1) 169 | direction = direction * variance_scale 170 | return direction 171 | 172 | def steer(self, behavior_vector: Optional["SteeringVector"] = None, behavior_layer_ids: List[int] = [10, 11, 12, 13, 14, 15], behavior_vector_strength: float = 1.0, condition_vector: "SteeringVector" = None, condition_layer_ids: List[int] = None, condition_vector_threshold: float = None, condition_comparator_threshold_is: str = "larger", condition_threshold_comparison_mode: str = "mean", use_explained_variance: bool = False, use_ooi_preventive_normalization: bool = False, apply_behavior_on_first_call: bool = True, **kwargs) -> None: 173 | """ 174 | Apply (conditional) activation steering to the model. 175 | 176 | This method configures the model to apply behavior modifications based on 177 | specified conditions (if given). It sets up both behavior and condition vectors across specified layers of the model. 178 | 179 | Args: 180 | behavior_vector (Optional[SteeringVector]): The vector representing the desired behavior change. 181 | behavior_layer_ids (List[int]): Layers to apply the behavior vector to. Default is [10, 11, 12, 13, 14, 15]. 182 | behavior_vector_strength (float): Scaling factor for the behavior vector. Default is 1.0. 183 | condition_vector (SteeringVector): The vector representing the condition for applying the behavior. 184 | condition_layer_ids (List[int]): Layers to check the condition on. 185 | condition_vector_threshold (float): Threshold for condition activation. 186 | condition_comparator_threshold_is (str): Whether to activate when similarity is "larger" or "smaller" than threshold. Default is "larger". 187 | condition_threshold_comparison_mode (str): How to compare thresholds, either "mean" or "last". Default is "mean". 188 | use_explained_variance (bool): Whether to scale vectors by their explained variance. Default is False. 189 | use_ooi_preventive_normalization (bool): Whether to use out-of-input preventive normalization. Default is False. 190 | apply_behavior_on_first_call (bool): Whether to apply behavior vector on the first forward call. Default is True. 191 | **kwargs: Additional keyword arguments to pass to the LeashLayer's steer method. 192 | 193 | Raises: 194 | ValueError: If only one of condition_layer_ids or condition_vector is given. Omitting both is okay. 195 | 196 | Note: 197 | - This method updates both class and instance attributes of LeashLayer. 198 | - The behavior vector is applied only if a condition vector is not specified or if the condition is met. 199 | - Condition checking occurs only in specified layers, while behavior modification can be applied to different layers. 200 | 201 | """ 202 | log(f"Steering...", style="bold", class_name="MalleableModel") 203 | 204 | layers = get_model_layer_list(self.model) 205 | num_layers = len(layers) 206 | 207 | if (condition_layer_ids is None) != (condition_vector is None): 208 | raise ValueError("condition_layer_ids and condition_vector must be both given or both not given") 209 | 210 | # Create boolean lists for condition and behavior layers 211 | condition_layers = [False] * num_layers 212 | behavior_layers = [False] * num_layers 213 | 214 | if condition_layer_ids: 215 | for layer_id in condition_layer_ids: 216 | condition_layers[layer_id] = True 217 | 218 | if behavior_vector is not None: 219 | #log(f"Applying behavior steering to layers: {behavior_layer_ids}", class_name="MalleableModel") 220 | for layer_id in behavior_layer_ids: 221 | behavior_layers[layer_id] = True 222 | 223 | # Update LeashLayer class attributes 224 | LeashLayer.condition_layers = {i: v for i, v in enumerate(condition_layers)} 225 | LeashLayer.behavior_layers = {i: v for i, v in enumerate(behavior_layers)} 226 | 227 | # Update LeashLayer instance attributes 228 | for layer_id in range(len(layers)): 229 | layer = layers[layer_id] 230 | behavior_tensor = None 231 | if behavior_vector is not None: 232 | if layer_id in behavior_layer_ids: 233 | if use_explained_variance: 234 | behavior_direction = use_explained_variance(behavior_vector) 235 | else: 236 | behavior_direction = behavior_vector.directions[layer_id] 237 | 238 | behavior_tensor = torch.tensor(behavior_vector_strength * behavior_direction, dtype=self.model.dtype).to(self.model.device) 239 | 240 | 241 | condition_projector = None 242 | if condition_vector is not None and layer_id in condition_layer_ids: 243 | condition_direction = condition_vector.directions[layer_id] 244 | if use_explained_variance: 245 | condition_direction = use_explained_variance(condition_vector) 246 | else: 247 | condition_direction = condition_vector.directions[layer_id] 248 | 249 | condition_tensor = torch.tensor(condition_direction, dtype=self.model.dtype).to(self.model.device) 250 | condition_projector = torch.ger(condition_tensor, condition_tensor) / torch.dot(condition_tensor, condition_tensor) 251 | 252 | layer.steer( 253 | behavior_vector=behavior_tensor, 254 | condition_projector=condition_projector, 255 | threshold=condition_vector_threshold, 256 | use_ooi_preventive_normalization=use_ooi_preventive_normalization, 257 | apply_behavior_on_first_call=apply_behavior_on_first_call, 258 | condition_comparator_threshold_is=condition_comparator_threshold_is, 259 | condition_threshold_comparison_mode=condition_threshold_comparison_mode, 260 | **kwargs 261 | ) 262 | 263 | def multisteer(self, behavior_vectors: List[Optional["SteeringVector"]], behavior_layer_ids: List[List[int]], behavior_vector_strengths: List[float], condition_vectors: List["SteeringVector"], condition_layer_ids: List[List[int]], condition_vector_thresholds: List[float], condition_comparator_threshold_is: List[str], rules: List[str], condition_threshold_comparison_modes: List[str] = None, use_explained_variance: bool = False, use_ooi_preventive_normalization: bool = False, apply_behavior_on_first_call: bool = True, **kwargs) -> None: 264 | """ 265 | Apply multiple conditional steering rules to the model. 266 | 267 | This method configures the model to apply multiple behavior modifications 268 | based on multiple specified conditions. It allows for complex steering 269 | scenarios with different behaviors triggered by different conditions. 270 | 271 | Args: 272 | behavior_vectors (List[Optional[SteeringVector]]): List of vectors representing desired behavior changes. 273 | behavior_layer_ids (List[List[int]]): List of layers to apply each behavior vector to. 274 | behavior_vector_strengths (List[float]): List of scaling factors for each behavior vector. 275 | condition_vectors (List[SteeringVector]): List of vectors representing conditions for applying behaviors. 276 | condition_layer_ids (List[List[int]]): List of layers to check each condition on. 277 | condition_vector_thresholds (List[float]): List of thresholds for condition activations. 278 | condition_comparator_threshold_is (List[str]): List specifying whether to activate when similarity is "larger" or "smaller" than threshold for each condition. 279 | rules (List[str]): List of rules specifying how conditions trigger behaviors (e.g., "if C1 then B1", "if C2 or C3 then B2"). 280 | condition_threshold_comparison_modes (List[str]): List specifying how to compare thresholds for each condition, either "mean" or "last". Default is ["mean"] * num_conditions if None. 281 | use_explained_variance (bool): Whether to scale vectors by their explained variance. Default is False. 282 | use_ooi_preventive_normalization (bool): Whether to use out-of-input preventive normalization. Default is False. 283 | apply_behavior_on_first_call (bool): Whether to apply behavior vectors on the first forward call. Default is True. 284 | **kwargs: Additional keyword arguments to pass to the LeashLayer's multisteer method. 285 | 286 | Raises: 287 | AssertionError: If there's a mismatch in the lengths of condition or behavior parameter lists. 288 | 289 | Note: 290 | - This method allows for complex steering scenarios with multiple conditions and behaviors. 291 | - Each condition can be checked on different layers, and each behavior can be applied to different layers. 292 | - The rules parameter allows for logical combinations of conditions to trigger specific behaviors. 293 | - Ensure that the lengths of all list parameters match the number of conditions or behaviors as appropriate. 294 | """ 295 | log(f"Multi-steering...", style="bold", class_name="MalleableModel") 296 | 297 | layers = get_model_layer_list(self.model) 298 | num_layers = len(layers) 299 | num_conditions = len(condition_vectors) 300 | num_behaviors = len(behavior_vectors) 301 | 302 | if condition_threshold_comparison_modes is None: 303 | condition_threshold_comparison_modes = ["mean"] * num_conditions 304 | # Validate input lengths 305 | assert len(condition_vectors) == len(condition_layer_ids) == len(condition_comparator_threshold_is) == len(condition_vector_thresholds) == len(condition_threshold_comparison_modes), "Mismatch in condition parameters" 306 | assert len(behavior_vectors) == len(behavior_layer_ids) == len(behavior_vector_strengths), "Mismatch in behavior parameters" 307 | 308 | # Create separate boolean lists for each condition and behavior 309 | condition_layers = [{i: False for i in range(num_layers)} for _ in range(num_conditions)] 310 | behavior_layers = [{i: False for i in range(num_layers)} for _ in range(num_behaviors)] 311 | 312 | for i, condition_layers_ids in enumerate(condition_layer_ids): 313 | for layer_id in condition_layers_ids: 314 | condition_layers[i][layer_id] = True 315 | 316 | for i, behavior_layers_ids in enumerate(behavior_layer_ids): 317 | for layer_id in behavior_layers_ids: 318 | behavior_layers[i][layer_id] = True 319 | 320 | # Update LeashLayer class attributes 321 | LeashLayer.condition_layers = condition_layers 322 | LeashLayer.behavior_layers = behavior_layers 323 | 324 | # Update LeashLayer instance attributes 325 | for layer_id in range(num_layers): 326 | layer = layers[layer_id] 327 | behavior_tensors = [] 328 | condition_projectors = [] 329 | 330 | for i in range(num_conditions): 331 | condition_projector = None 332 | if layer_id in condition_layer_ids[i]: 333 | condition_direction = condition_vectors[i].directions[layer_id] 334 | if use_explained_variance: 335 | condition_direction = self.use_explained_variance(condition_vectors[i]) 336 | condition_tensor = torch.tensor(condition_direction, dtype=self.model.dtype).to(self.model.device) 337 | condition_projector = torch.ger(condition_tensor, condition_tensor) / torch.dot(condition_tensor, condition_tensor) 338 | condition_projectors.append(condition_projector) 339 | 340 | for i in range(num_behaviors): 341 | behavior_tensor = None 342 | if behavior_vectors[i] is not None and layer_id in behavior_layer_ids[i]: 343 | behavior_direction = behavior_vectors[i].directions[layer_id] 344 | if use_explained_variance: 345 | behavior_direction = self.use_explained_variance(behavior_vectors[i]) 346 | behavior_tensor = torch.tensor(behavior_vector_strengths[i] * behavior_direction, dtype=self.model.dtype).to(self.model.device) 347 | behavior_tensors.append(behavior_tensor) 348 | 349 | layer.multisteer( 350 | behavior_vectors=behavior_tensors, 351 | condition_projectors=condition_projectors, 352 | thresholds=condition_vector_thresholds, 353 | use_ooi_preventive_normalization=use_ooi_preventive_normalization, 354 | apply_behavior_on_first_call=apply_behavior_on_first_call, 355 | condition_comparator_threshold_is=condition_comparator_threshold_is, 356 | condition_threshold_comparison_modes=condition_threshold_comparison_modes, 357 | rules=rules, 358 | **kwargs 359 | ) 360 | 361 | log(f"Multi-steering set up with {num_conditions} conditions and {num_behaviors} behaviors", class_name="MalleableModel") 362 | 363 | 364 | def reset_leash_to_default(self) -> None: 365 | """ 366 | Reset the model's steering configuration to its default state. 367 | 368 | This method removes all applied steering configurations, including 369 | behavior vectors and condition vectors, from all layers of the model. 370 | It resets both instance-specific and class-wide attributes of the 371 | LeashLayer wrapper. 372 | 373 | Returns: 374 | None 375 | 376 | Note: 377 | - This method should be called when you want to clear all steering 378 | configurations and return the model to its original behavior. 379 | - It's useful when you want to apply a new steering configuration 380 | from scratch or when you're done with steering and want to use 381 | the model in its default state. 382 | - This reset affects all layers of the model simultaneously. 383 | """ 384 | log("Resetting leash to default...", style="bold", class_name="MalleableModel") 385 | layers = get_model_layer_list(self.model) 386 | for layer in layers: 387 | layer.reset_instance() 388 | LeashLayer.reset_class() 389 | 390 | 391 | def generate(self, *args, **kwargs): 392 | """ 393 | Generate output using the underlying model. 394 | 395 | This method is a pass-through to the generate method of the wrapped model. 396 | It allows for text generation using the model's native generation capabilities, 397 | which may include techniques like beam search, sampling, or others depending 398 | on the underlying model architecture. 399 | 400 | Args: 401 | *args: Positional arguments to pass to the underlying model's generate method. 402 | **kwargs: Keyword arguments to pass to the underlying model's generate method. 403 | 404 | Returns: 405 | The output generated by the underlying model's generate method. The exact 406 | return type depends on the specific model and the provided arguments. 407 | 408 | Note: 409 | - The behavior of this method is determined by the underlying model and 410 | the arguments passed to it. 411 | - Any steering configurations applied to the model will affect the 412 | generation process. 413 | - For detailed information on available arguments and their effects, 414 | refer to the documentation of the specific pre-trained model being used. 415 | """ 416 | return self.model.generate(*args, **kwargs) 417 | 418 | 419 | def respond(self, prompt, settings=None, use_chat_template=True,reset_after_response=True): 420 | """ 421 | Generate a response to a given prompt using the underlying language model. 422 | 423 | Args: 424 | prompt: The input prompt to generate a response for. 425 | settings: A dictionary of generation settings. If None, default settings are used. 426 | use_chat_template: Whether to apply the chat template to the prompt. 427 | reset_after_response: Whether to reset the model's internal state after generating a response. 428 | 429 | Returns: 430 | The generated response text. 431 | """ 432 | # Force model to CPU or GPU to ensure weights are in the correct device 433 | self.model.to(self.device) 434 | 435 | if use_chat_template: 436 | formatted_prompt = self.tokenizer.apply_chat_template( 437 | [{"role": "user", "content": f"{prompt}"}], 438 | tokenize=False, add_generation_prompt=True 439 | ) 440 | else: 441 | formatted_prompt = prompt 442 | 443 | input_ids = self.tokenizer(formatted_prompt, return_tensors="pt").to(self.device) 444 | 445 | if settings is None: 446 | settings = { 447 | "pad_token_id": self.tokenizer.eos_token_id, 448 | "do_sample": False, 449 | "max_new_tokens": 50, 450 | "repetition_penalty": 1.1, 451 | } 452 | 453 | with torch.no_grad(): # Ensure we're not tracking gradients during inference 454 | output = self.model.generate(**input_ids, **settings) 455 | 456 | response = self.tokenizer.decode(output.squeeze()[input_ids['input_ids'].shape[1]:]) 457 | 458 | if reset_after_response: 459 | # reset for each call 460 | LeashLayer.condition_met = defaultdict(lambda: False) 461 | LeashLayer.forward_calls = defaultdict(int) 462 | LeashLayer.condition_similarities = defaultdict(lambda: defaultdict(float)) 463 | 464 | return response 465 | 466 | 467 | def respond_batch_sequential(self, prompts, settings=None, use_chat_template=True): 468 | self.model.to(self.device) 469 | """ 470 | Generate responses for multiple prompts sequentially. 471 | 472 | Args: 473 | prompts: A list of input prompts to generate responses for. 474 | settings: A dictionary of generation settings. If None, default settings are used. 475 | use_chat_template: Whether to apply the chat template to each prompt. 476 | 477 | Returns: 478 | A list of generated response texts, one for each input prompt. 479 | """ 480 | responses = [] 481 | for prompt in prompts: 482 | response = self.respond(prompt, settings, use_chat_template) 483 | responses.append(response) 484 | 485 | return responses 486 | 487 | 488 | def find_best_condition_point(self, positive_strings: List[str], negative_strings: List[str], condition_vector: 'SteeringVector', layer_range: Optional[Tuple[int, int]] = None, max_layers_to_combine: int = 1, threshold_range: Tuple[float, float] = (0.0, 1.0), threshold_step: float = 0.01, save_analysis: bool = False, file_path: Optional[str] = None, condition_threshold_comparison_mode: str = "mean") -> Tuple[List[int], float, str, float]: 489 | """ 490 | Find the optimal condition point for steering. 491 | 492 | Args: 493 | positive_strings: List of strings that should trigger the condition. 494 | negative_strings: List of strings that should not trigger the condition. 495 | condition_vector: The steering vector representing the condition. 496 | layer_range: Range of layers to search for the condition point. 497 | max_layers_to_combine: Maximum number of layers to combine in the search. 498 | threshold_range: Range of thresholds to search. 499 | threshold_step: Step size for threshold search. 500 | save_analysis: Whether to save the analysis results. 501 | file_path: Path to save the analysis results. 502 | condition_threshold_comparison_mode: Mode for comparing condition thresholds. 503 | 504 | Returns: 505 | A tuple containing the best layers, threshold, direction, and F1 score. 506 | """ 507 | if layer_range is None: 508 | layer_range = (1, len(get_model_layer_list(self.model))) 509 | 510 | log(f"Initializing search for best condition point...", style="bold", class_name="MalleableModel") 511 | 512 | all_strings = positive_strings + negative_strings 513 | y_true = [1] * len(positive_strings) + [0] * len(negative_strings) 514 | 515 | layers = list(range(*layer_range)) 516 | best_f1 = 0 517 | best_config = None 518 | 519 | # Apply steering to all layers at once 520 | self.steer( 521 | condition_vector=condition_vector, 522 | condition_layer_ids=layers, 523 | condition_vector_threshold=1, # Dummy threshold 524 | condition_comparator_threshold_is="smaller", # Dummy direction 525 | apply_behavior_on_first_call=False, 526 | condition_threshold_comparison_mode=condition_threshold_comparison_mode 527 | ) 528 | 529 | # Collect similarities for all strings and layers 530 | similarities = [] 531 | for i, string in enumerate(custom_progress(all_strings, "Processing strings")): 532 | settings = { 533 | "pad_token_id": self.tokenizer.eos_token_id, 534 | "do_sample": False, 535 | "max_new_tokens": 1, 536 | "repetition_penalty": 1.1, 537 | } 538 | self.respond(string, settings = settings, reset_after_response = False) 539 | similarities.append({layer: LeashLayer.condition_similarities[0][layer] for layer in layers}) 540 | LeashLayer.condition_met = defaultdict(lambda: False) 541 | LeashLayer.forward_calls = defaultdict(int) 542 | LeashLayer.condition_similarities = defaultdict(lambda: defaultdict(float)) 543 | 544 | # Create a list of all combinations to iterate over 545 | all_combinations = [ 546 | (r, layer_combo, threshold, direction) 547 | for r in range(1, min(max_layers_to_combine, len(layers)) + 1) 548 | for layer_combo in combinations(layers, r) 549 | for threshold in np.arange(*threshold_range, threshold_step) 550 | for direction in ['larger', 'smaller'] 551 | ] 552 | 553 | # Find best combination 554 | analysis_results = {} 555 | for r, layer_combo, threshold, direction in custom_progress(all_combinations, "Searching for best condition point"): 556 | layer_key = f"layers_{'_'.join(map(str, layer_combo))}" 557 | if layer_key not in analysis_results: 558 | analysis_results[layer_key] = {"f1_scores": {}, "similarities": {}} 559 | 560 | y_pred = [] 561 | for i, sim_dict in enumerate(similarities): 562 | condition_met = any( 563 | (sim_dict[layer] > threshold) == (direction == 'smaller') 564 | for layer in layer_combo 565 | ) 566 | y_pred.append(1 if condition_met else 0) 567 | 568 | f1 = f1_score(y_true, y_pred) 569 | if f1 > 0: # Only record non-zero F1 scores 570 | analysis_results[layer_key]["f1_scores"][f"{threshold:.3f}_{direction}"] = f1 571 | 572 | if f1 > best_f1: 573 | best_f1 = f1 574 | best_config = (list(layer_combo), threshold, direction) 575 | 576 | # Record similarities per layer 577 | for layer in layers: 578 | analysis_results[f"layer_{layer}"] = { 579 | "similarities": { 580 | "positive": [sim_dict[layer] for sim_dict in similarities[:len(positive_strings)]], 581 | "negative": [sim_dict[layer] for sim_dict in similarities[len(positive_strings):]] 582 | } 583 | } 584 | 585 | log(f"Search completed.", style="bold", class_name="MalleableModel") 586 | rounded_threshold = round(best_config[1], 3) 587 | log(f"Best condition point found: Layers {best_config[0]}, Threshold {rounded_threshold:.3f}, Direction '{best_config[2]}', F1 Score {best_f1:.3f}", style="bold green", class_name="MalleableModel") 588 | 589 | if save_analysis: 590 | self._save_analysis_results(analysis_results, best_config[0], rounded_threshold, best_config[2], best_f1, file_path) 591 | 592 | self.reset_leash_to_default() 593 | return best_config[0], rounded_threshold, best_config[2], best_f1 594 | 595 | 596 | def _save_analysis_results(self, analysis_results, best_layers, best_threshold, best_direction, best_f1, file_path): 597 | """ 598 | Save the analysis results from find_best_condition_point to a file. 599 | 600 | Args: 601 | analysis_results: Dictionary containing the analysis results. 602 | best_layers: List of layers that gave the best performance. 603 | best_threshold: The threshold value that gave the best performance. 604 | best_direction: The direction ('larger' or 'smaller') that gave the best performance. 605 | best_f1: The best F1 score achieved. 606 | file_path: Path to save the analysis results. 607 | """ 608 | # Ensure the directory exists 609 | directory = os.path.dirname(file_path) 610 | if directory and not os.path.exists(directory): 611 | os.makedirs(directory) 612 | 613 | # If no file name is provided, generate a default one 614 | if not os.path.basename(file_path): 615 | file_name = f"condition_point_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json" 616 | file_path = os.path.join(directory, file_name) 617 | 618 | summary = { 619 | "best_layers": best_layers, 620 | "best_threshold": best_threshold, 621 | "best_direction": best_direction, 622 | "best_f1_score": best_f1, 623 | "analysis": analysis_results 624 | } 625 | 626 | with open(file_path, 'w') as f: 627 | json.dump(summary, f, indent=2) 628 | 629 | log(f"Analysis results saved to {file_path}", style="bold blue", class_name="MalleableModel") 630 | 631 | 632 | def forward(self, *args, **kwargs): 633 | """ 634 | Perform a forward pass through the model. 635 | 636 | This method delegates to the underlying model's forward method. 637 | 638 | Args: 639 | *args: Positional arguments to pass to the underlying model. 640 | **kwargs: Keyword arguments to pass to the underlying model. 641 | 642 | Returns: 643 | The output of the underlying model's forward pass. 644 | """ 645 | return self.model(*args, **kwargs) 646 | 647 | 648 | def __call__(self, *args, **kwargs): 649 | """ 650 | Make the MalleableModel instance callable. 651 | 652 | This method allows the MalleableModel to be used like a function, delegating to the underlying model. 653 | 654 | Args: 655 | *args: Positional arguments to pass to the underlying model. 656 | **kwargs: Keyword arguments to pass to the underlying model. 657 | 658 | Returns: 659 | The output of the underlying model. 660 | """ 661 | return self.model(*args, **kwargs) 662 | 663 | 664 | def get_model_layer_list(model: MalleableModel | PreTrainedModel) -> torch.nn.ModuleList: 665 | """ 666 | Get the list of layers from a model. 667 | 668 | This function handles different model architectures to retrieve their layers. 669 | 670 | Args: 671 | model: Either a MalleableModel or a PreTrainedModel. 672 | 673 | Returns: 674 | A ModuleList containing the model's layers. 675 | 676 | Raises: 677 | ValueError: If the function doesn't know how to get layers for the given model type. 678 | """ 679 | if isinstance(model, MalleableModel): 680 | model = model.model # Use the underlying model if the model is a MalleableModel instance 681 | 682 | if hasattr(model, "model"): # mistral-like 683 | return model.model.layers 684 | elif hasattr(model, "transformer"): # gpt-2-like 685 | return model.transformer.h 686 | else: 687 | raise ValueError(f"don't know how to get layer list for {type(model)}") 688 | --------------------------------------------------------------------------------