├── .gitignore
├── .vscode
    ├── extensions.json
    └── settings.json
├── LICENSE
├── README.md
├── app
    ├── .eslintrc.json
    ├── .gitignore
    ├── .prettierrc
    ├── app
    │   ├── components
    │   │   ├── ColorSpan.tsx
    │   │   ├── Controls.tsx
    │   │   ├── LatentExamples.tsx
    │   │   ├── LogitsTable.tsx
    │   │   └── prompt
    │   │   │   ├── PromptActivations.tsx
    │   │   │   ├── PromptLatentHeatmaps.tsx
    │   │   │   ├── PromptLayerHistograms.tsx
    │   │   │   ├── PromptLogitsInput.tsx
    │   │   │   ├── PromptLogitsRecon.tsx
    │   │   │   └── PromptLogitsSteer.tsx
    │   ├── favicon.ico
    │   ├── globals.css
    │   ├── layout.tsx
    │   ├── page.tsx
    │   ├── prompt
    │   │   └── [prompt]
    │   │   │   ├── page.client.tsx
    │   │   │   └── page.tsx
    │   └── use-select.tsx
    ├── components.json
    ├── components
    │   └── ui
    │   │   ├── button.tsx
    │   │   ├── card.tsx
    │   │   ├── chart.tsx
    │   │   ├── form.tsx
    │   │   ├── input.tsx
    │   │   ├── label.tsx
    │   │   ├── table.tsx
    │   │   ├── tabs.tsx
    │   │   └── textarea.tsx
    ├── lib
    │   ├── api.ts
    │   ├── format.ts
    │   ├── models.ts
    │   └── utils.ts
    ├── next.config.js
    ├── package-lock.json
    ├── package.json
    ├── postcss.config.js
    ├── tailwind.config.js
    └── tsconfig.json
├── citation.bib
├── figures.py
├── figures
    ├── __init__.py
    ├── embed_sim.py
    ├── entropy.py
    ├── heatmap.py
    ├── heatmap_aggregate.py
    ├── heatmap_prompt.py
    ├── layer_hist.py
    ├── layer_sim.py
    ├── layer_std.py
    ├── mmcs.py
    ├── num_layers.py
    ├── resid_sim.py
    ├── scatter_freq.py
    ├── test.py
    └── wdec_sim.py
├── layer_dists.py
├── layer_tests.py
├── mlsae
    ├── __init__.py
    ├── analysis
    │   ├── __init__.py
    │   ├── dists.py
    │   ├── examples.py
    │   └── variances.py
    ├── api
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── analyser.py
    │   └── models.py
    ├── metrics
    │   ├── __init__.py
    │   ├── auxiliary_loss.py
    │   ├── dead_latents.py
    │   ├── layerwise.py
    │   ├── layerwise_fvu.py
    │   ├── layerwise_l0_norm.py
    │   ├── layerwise_l1_norm.py
    │   ├── layerwise_logit_kl_div.py
    │   ├── layerwise_logit_mse.py
    │   ├── layerwise_loss_delta.py
    │   ├── layerwise_mse.py
    │   ├── mse_loss.py
    │   └── tests
    │   │   ├── __init__.py
    │   │   ├── test_dead_latents.py
    │   │   ├── test_layerwise_fvu.py
    │   │   ├── test_layerwise_l0_norm.py
    │   │   ├── test_layerwise_l1_norm.py
    │   │   ├── test_layerwise_mse.py
    │   │   └── test_loss_mse.py
    ├── model
    │   ├── __init__.py
    │   ├── autoencoders
    │   │   ├── __init__.py
    │   │   ├── standard.py
    │   │   ├── tests
    │   │   │   └── test_autoencoders.py
    │   │   ├── topk.py
    │   │   └── utils.py
    │   ├── data.py
    │   ├── decoder.py
    │   ├── geom_median.py
    │   ├── kernels.py
    │   ├── lightning.py
    │   ├── transformers
    │   │   ├── __init__.py
    │   │   ├── gemma2.py
    │   │   ├── gpt2.py
    │   │   ├── llama.py
    │   │   ├── models
    │   │   │   ├── gemma2
    │   │   │   │   └── modeling_gemma2.py
    │   │   │   ├── gpt2
    │   │   │   │   └── modeling_gpt2.py
    │   │   │   └── llama
    │   │   │   │   └── modeling_llama.py
    │   │   ├── pythia.py
    │   │   └── tests
    │   │   │   ├── test_gpt2.py
    │   │   │   └── test_llama.py
    │   └── types.py
    ├── model_card.py
    ├── trainer
    │   ├── __init__.py
    │   ├── config.py
    │   ├── test.py
    │   └── train.py
    └── utils.py
├── pyproject.toml
├── requirements.txt
├── test.py
├── tests.py
├── train.py
├── upload.py
└── uv.lock


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | 
164 | # .DS_Store
165 | # .ruff_cache
166 | 
167 | # .env
168 | # .venv
169 | 
170 | # Slurm
171 | *.o
172 | *.out
173 | 
174 | # PyTorch Lightning
175 | lightning_logs
176 | 
177 | # Weights & Biases
178 | wandb_logs
179 | 
180 | data
181 | models
182 | !mlsae/**/models
183 | out
184 | 


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": ["charliermarsh.ruff", "ms-python.python"]
3 | }
4 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "ruff.configuration": "pyproject.toml",
3 |   "ruff.nativeServer": true
4 | }
5 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Tim Lawson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Multi-Layer Sparse Autoencoders (MLSAE)
  2 | 
  3 | > [!NOTE]
  4 | > This repository accompanies the preprint Residual Stream Analysis with
  5 | > Multi-Layer SAEs (<https://arxiv.org/abs/2409.04185>).
  6 | > See [References](#references) for related work.
  7 | 
  8 | ## Pretrained MLSAEs
  9 | 
 10 | We define two types of model: plain PyTorch
 11 | [MLSAE](./mlsae/model/autoencoders/) modules, which are relatively small; and
 12 | PyTorch Lightning [MLSAETransformer](./mlsae/model/lightning.py) modules, which
 13 | include the underlying transformer. HuggingFace collections for both are here:
 14 | 
 15 | - [Multi-Layer Sparse Autoencoders](https://huggingface.co/collections/tim-lawson/multi-layer-sparse-autoencoders-66c2fe8896583c59b02ceb72)
 16 | - [Multi-Layer Sparse Autoencoders with Transformers](https://huggingface.co/collections/tim-lawson/multi-layer-sparse-autoencoders-with-transformers-66c441c87d1b24912175ce08)
 17 | 
 18 | We assume that pretrained MLSAEs have repo_ids with
 19 | [this naming convention](./mlsae/utils.py):
 20 | 
 21 | - `tim-lawson/mlsae-pythia-70m-deduped-x{expansion_factor}-k{k}`
 22 | - `tim-lawson/mlsae-pythia-70m-deduped-x{expansion_factor}-k{k}-tfm`
 23 | 
 24 | The Weights & Biases project for the paper is
 25 | [here](https://wandb.ai/timlawson-/mlsae).
 26 | 
 27 | ## Installation
 28 | 
 29 | Install Python dependencies with Poetry:
 30 | 
 31 | ```bash
 32 | poetry env use 3.12
 33 | poetry install
 34 | ```
 35 | 
 36 | Install Python dependencies with pip:
 37 | 
 38 | ```bash
 39 | python -m venv .venv
 40 | source .venv/bin/activate
 41 | pip install -r requirements.txt
 42 | ```
 43 | 
 44 | Install Node.js dependencies:
 45 | 
 46 | ```bash
 47 | cd app
 48 | npm install
 49 | ```
 50 | 
 51 | ## Training
 52 | 
 53 | Train a single MLSAE:
 54 | 
 55 | ```bash
 56 | python train.py --help
 57 | python train.py --model_name EleutherAI/pythia-70m-deduped --expansion_factor 64 -k 32
 58 | ```
 59 | 
 60 | ## Analysis
 61 | 
 62 | Test a single pretrained MLSAE:
 63 | 
 64 | > [!WARNING]
 65 | > We assume that the test split of `monology/pile-uncopyrighted` is already downloaded
 66 | > and stored in `data/test.jsonl.zst`.
 67 | 
 68 | ```bash
 69 | python test.py --help
 70 | python test.py --model_name EleutherAI/pythia-70m-deduped --expansion_factor 64 -k 32
 71 | ```
 72 | 
 73 | Compute the distributions of latent activations over layers for a single
 74 | pretrained MLSAE
 75 | ([HuggingFace datasets](https://huggingface.co/collections/tim-lawson/mlsae-latent-distributions-over-layers-66d6a0ec9fcb6b494fb1808e)):
 76 | 
 77 | ```bash
 78 | python -m mlsae.analysis.dists --help
 79 | python -m mlsae.analysis.dists --repo_id tim-lawson/mlsae-pythia-70m-deduped-x64-k32-tfm --max_tokens 100_000_000
 80 | ```
 81 | 
 82 | Compute the maximally activating examples for each combination of latent and
 83 | layer for a single pretrained MLSAE
 84 | ([HuggingFace datasets](https://huggingface.co/collections/tim-lawson/mlsae-maximally-activating-examples-66dbcc999a962ae594f631b6)):
 85 | 
 86 | ```bash
 87 | python -m mlsae.analysis.examples --help
 88 | python -m mlsae.analysis.examples --repo_id tim-lawson/mlsae-pythia-70m-deduped-x64-k32-tfm --max_tokens 1_000_000
 89 | ```
 90 | 
 91 | ## Interactive visualizations
 92 | 
 93 | Run the interactive web application for a single pretrained MLSAE:
 94 | 
 95 | ```bash
 96 | python -m mlsae.api --help
 97 | python -m mlsae.api --repo_id tim-lawson/mlsae-pythia-70m-deduped-x64-k32-tfm
 98 | 
 99 | cd app
100 | npm run dev
101 | ```
102 | 
103 | Navigate to <http://localhost:3000>, enter a prompt, and click 'Submit'.
104 | 
105 | Alternatively, navigate to <http://localhost:3000/prompt/foobar>.
106 | 
107 | ## Figures
108 | 
109 | Compute the mean cosine similarities between residual stream activation vectors
110 | at adjacent layers of a single pretrained transformer:
111 | 
112 | ```bash
113 | python figures/resid_cos_sim.py --help
114 | python figures/resid_cos_sim.py --model_name EleutherAI/pythia-70m-deduped
115 | ```
116 | 
117 | Save heatmaps of the distributions of latent activations over layers for
118 | multiple pretrained MLSAEs:
119 | 
120 | ```bash
121 | python figures/dists_heatmaps.py --help
122 | python figures/dists_heatmaps.py --expansion_factor 32 64 128 -k 16 32 64
123 | ```
124 | 
125 | Save a CSV of the mean standard deviations of the distributions of latent
126 | activations over layers for multiple pretrained MLSAEs:
127 | 
128 | ```bash
129 | python figures/dists_layer_std.py --help
130 | python figures/dists_layer_std.py --expansion_factor 32 64 128 -k 16 32 64
131 | ```
132 | 
133 | Save heatmaps of the maximum latent activations for a given prompt and multiple
134 | pretrained MLSAEs:
135 | 
136 | ```bash
137 | python figures/prompt_heatmaps.py --help
138 | python figures/prompt_heatmaps.py --expansion_factor 32 64 128 -k 16 32 64
139 | ```
140 | 
141 | Save a CSV of the Mean Max Cosine Similarity (MMCS) for multiple pretrained
142 | MLSAEs:
143 | 
144 | ```bash
145 | python figures/mmcs.py --help
146 | python figures/mmcs.py --expansion_factor 32 64 128 -k 16 32 64
147 | ```
148 | 
149 | ## References
150 | 
151 | ### Code
152 | 
153 | - <https://github.com/openai/sparse_autoencoder>
154 | - <https://github.com/EleutherAI/sae>
155 | - <https://github.com/ai-safety-foundation/sparse_autoencoder>
156 | - <https://github.com/callummcdougall/sae_vis>
157 | 
158 | ### Papers
159 | 
160 | - Gao et al. [2024] <https://cdn.openai.com/papers/sparse-autoencoders.pdf>
161 | - Bricken et al. [2023]
162 |   <https://transformer-circuits.pub/2023/monosemantic-features/index.html>
163 | 


--------------------------------------------------------------------------------
/app/.eslintrc.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": ["next/core-web-vitals", "prettier"],
 3 |   "rules": {
 4 |     "sort-imports": [
 5 |       "error",
 6 |       {
 7 |         "ignoreDeclarationSort": true
 8 |       }
 9 |     ],
10 |     "import/consistent-type-specifier-style": "error",
11 |     "import/no-duplicates": ["error", { "prefer-inline": true }],
12 |     "import/order": [
13 |       "warn",
14 |       {
15 |         "alphabetize": {
16 |           "caseInsensitive": false,
17 |           "order": "asc",
18 |           "orderImportKind": "asc"
19 |         },
20 |         "groups": [
21 |           "builtin",
22 |           "type",
23 |           "external",
24 |           "internal",
25 |           "parent",
26 |           "sibling",
27 |           "object",
28 |           "index"
29 |         ],
30 |         "pathGroups": [
31 |           {
32 |             "pattern": "~/**",
33 |             "group": "external",
34 |             "position": "after"
35 |           }
36 |         ],
37 |         "pathGroupsExcludedImportTypes": ["builtin"]
38 |       }
39 |     ]
40 |   },
41 |   "settings": {
42 |     "import/resolver": {
43 |       "typescript": {}
44 |     }
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/app/.gitignore:
--------------------------------------------------------------------------------
 1 | # dependencies
 2 | /node_modules
 3 | /.pnp
 4 | .pnp.js
 5 | 
 6 | # testing
 7 | /coverage
 8 | 
 9 | # next.js
10 | /.next/
11 | /out/
12 | 
13 | # production
14 | /build
15 | 
16 | # misc
17 | .DS_Store
18 | *.pem
19 | 
20 | # debug
21 | npm-debug.log*
22 | yarn-debug.log*
23 | yarn-error.log*
24 | 
25 | # local env files
26 | .env*.local
27 | 
28 | # vercel
29 | .vercel
30 | 
31 | # typescript
32 | *.tsbuildinfo
33 | next-env.d.ts
34 | 


--------------------------------------------------------------------------------
/app/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |   "tabWidth": 2,
3 |   "useTabs": false
4 | }
5 | 


--------------------------------------------------------------------------------
/app/app/components/ColorSpan.tsx:
--------------------------------------------------------------------------------
 1 | import { cn } from "~/lib/utils";
 2 | 
 3 | /**
 4 |  * Props accepted by `ColorSpan`: the standard `<span>` HTML attributes plus
 5 |  * the backdrop settings below.
 6 |  */
 7 | type ColorSpanProps = {
 8 |   children?: React.ReactNode;
 9 |   // Opacity applied to the colored backdrop only, not to the content.
10 |   opacity: number;
11 |   className?: string | boolean | undefined;
12 |   // Class name(s) that color the backdrop.
13 |   color?: string;
14 |   style?: React.CSSProperties;
15 | } & React.HTMLAttributes<HTMLSpanElement>;
16 | 
17 | /**
18 |  * Renders `children` in a span on top of a colored backdrop whose opacity is
19 |  * `opacity`. Any remaining props are forwarded to the outer `<span>`.
20 |  */
21 | export default function ColorSpan(props: ColorSpanProps) {
22 |   const {
23 |     children,
24 |     opacity,
25 |     className,
26 |     color = "bg-orange-400 dark:bg-orange-800",
27 |     style = {},
28 |     ...rest
29 |   } = props;
30 | 
31 |   // The backdrop is positioned against the outer span and sits beneath the
32 |   // content (z-0 versus z-10).
33 |   const backdropClassName = cn(
34 |     "z-0 absolute top-0 left-0 bottom-0 right-0",
35 |     color,
36 |   );
37 | 
38 |   return (
39 |     <span {...rest} className={cn("relative", className)} style={style}>
40 |       <div className={backdropClassName} style={{ opacity }} />
41 |       <span className="relative z-10 px-0.5 select-none">{children}</span>
42 |     </span>
43 |   );
44 | }
45 | 


--------------------------------------------------------------------------------
/app/app/components/Controls.tsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | 
  3 | import React from "react";
  4 | import { Select } from "~/app/use-select";
  5 | import { Input } from "~/components/ui/input";
  6 | import { Label } from "~/components/ui/label";
  7 | 
  8 | /**
  9 |  * Reads the value of a number input. Returns `undefined` when the field is
 10 |  * empty or does not hold a valid number, in which case `valueAsNumber` is
 11 |  * `NaN`.
 12 |  */
 13 | function readNumber(
 14 |   event: React.ChangeEvent<HTMLInputElement>,
 15 | ): number | undefined {
 16 |   const value = event.target.valueAsNumber;
 17 |   return Number.isFinite(value) ? value : undefined;
 18 | }
 19 | 
 20 | /**
 21 |  * Reads an integer index from a number input, rounded to the nearest integer
 22 |  * and clamped to `[0, count - 1]`. Returns `undefined` when the field does not
 23 |  * hold a valid number or when `count <= 0` (there is no valid index).
 24 |  */
 25 | function readIndex(
 26 |   event: React.ChangeEvent<HTMLInputElement>,
 27 |   count: number,
 28 | ): number | undefined {
 29 |   const value = readNumber(event);
 30 |   if (value === undefined || count <= 0) {
 31 |     return undefined;
 32 |   }
 33 |   return Math.min(Math.max(Math.round(value), 0), count - 1);
 34 | }
 35 | 
 36 | /**
 37 |  * Row of number inputs for the layer, latent, position, threshold and
 38 |  * steering factor.
 39 |  *
 40 |  * The `min`/`max`/`step` attributes do not stop a user from typing an empty,
 41 |  * fractional or out-of-range value, so every change is validated before it is
 42 |  * reported: layer, latent and position are rounded and clamped to their valid
 43 |  * index range, and changes that are not valid numbers are ignored.
 44 |  */
 45 | export default function Controls({
 46 |   nLayers,
 47 |   nLatents,
 48 |   nPositions,
 49 |   stateLayer,
 50 |   stateLatent,
 51 |   statePosition,
 52 |   threshold,
 53 |   onChangeThreshold,
 54 |   factor,
 55 |   onChangeFactor,
 56 | }: {
 57 |   nLayers: number;
 58 |   nLatents: number;
 59 |   nPositions: number;
 60 |   stateLayer: Select<number>;
 61 |   stateLatent: Select<number>;
 62 |   statePosition: Select<number>;
 63 |   threshold: number;
 64 |   onChangeThreshold: (value: number) => void;
 65 |   factor: number;
 66 |   onChangeFactor: (value: number) => void;
 67 | }) {
 68 |   const onChangeLayer = (event: React.ChangeEvent<HTMLInputElement>) => {
 69 |     const layer = readIndex(event, nLayers);
 70 |     if (layer !== undefined) {
 71 |       stateLayer.onClick(layer);
 72 |     }
 73 |   };
 74 | 
 75 |   const onChangeLatent = (event: React.ChangeEvent<HTMLInputElement>) => {
 76 |     const latent = readIndex(event, nLatents);
 77 |     if (latent !== undefined) {
 78 |       stateLatent.onClick(latent);
 79 |     }
 80 |   };
 81 | 
 82 |   const onChangePosition = (event: React.ChangeEvent<HTMLInputElement>) => {
 83 |     const position = readIndex(event, nPositions);
 84 |     if (position !== undefined) {
 85 |       statePosition.onClick(position);
 86 |     }
 87 |   };
 88 | 
 89 |   const onChangeThresholdInput = (
 90 |     event: React.ChangeEvent<HTMLInputElement>,
 91 |   ) => {
 92 |     const value = readNumber(event);
 93 |     if (value !== undefined) {
 94 |       onChangeThreshold(value);
 95 |     }
 96 |   };
 97 | 
 98 |   const onChangeFactorInput = (
 99 |     event: React.ChangeEvent<HTMLInputElement>,
100 |   ) => {
101 |     const value = readNumber(event);
102 |     if (value !== undefined) {
103 |       onChangeFactor(value);
104 |     }
105 |   };
106 | 
107 |   return (
108 |     <div className="h-10 px-4 flex items-center justify-stretch gap-4">
109 |       <div className="flex items-center gap-2">
110 |         <Label htmlFor="layer">Layer</Label>
111 |         <Input
112 |           id="layer"
113 |           type="number"
114 |           value={stateLayer.active}
115 |           onChange={onChangeLayer}
116 |           min={0}
117 |           max={nLayers - 1}
118 |           step={1}
119 |           className="h-8"
120 |         />
121 |       </div>
122 |       <div className="flex items-center gap-2">
123 |         <Label htmlFor="latent">Latent</Label>
124 |         <Input
125 |           id="latent"
126 |           type="number"
127 |           value={stateLatent.active}
128 |           onChange={onChangeLatent}
129 |           min={0}
130 |           max={nLatents - 1}
131 |           step={1}
132 |           className="h-8"
133 |         />
134 |       </div>
135 |       <div className="flex items-center gap-2">
136 |         <Label htmlFor="position">Position</Label>
137 |         <Input
138 |           id="position"
139 |           type="number"
140 |           value={statePosition.active}
141 |           onChange={onChangePosition}
142 |           min={0}
143 |           max={nPositions - 1}
144 |           step={1}
145 |           className="h-8"
146 |         />
147 |       </div>
148 |       <div className="flex items-center gap-2">
149 |         <Label htmlFor="threshold" className="text-nowrap">
150 |           Threshold
151 |         </Label>
152 |         <Input
153 |           id="threshold"
154 |           type="number"
155 |           value={threshold}
156 |           onChange={onChangeThresholdInput}
157 |           min={0.0}
158 |           max={0.95}
159 |           step={0.05}
160 |           className="h-8"
161 |         />
162 |       </div>
163 |       <div className="flex items-center gap-2">
164 |         <Label htmlFor="factor" className="text-nowrap">
165 |           Steering factor
166 |         </Label>
167 |         <Input
168 |           id="factor"
169 |           type="number"
170 |           value={factor}
171 |           onChange={onChangeFactorInput}
172 |           min={-10}
173 |           max={10}
174 |           step={0.5}
175 |           className="h-8"
176 |         />
177 |       </div>
178 |     </div>
179 |   );
180 | }
181 | 


--------------------------------------------------------------------------------
/app/app/components/LatentExamples.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import React from "react";
 4 | import useSWR from "swr";
 5 | import ColorSpan from "~/app/components/ColorSpan";
 6 | import { Select } from "~/app/use-select";
 7 | import { Card, CardContent } from "~/components/ui/card";
 8 | import {
 9 |   Table,
10 |   TableBody,
11 |   TableCell,
12 |   TableHead,
13 |   TableHeader,
14 |   TableRow,
15 | } from "~/components/ui/table";
16 | import { getExamples } from "~/lib/api";
17 | import { escapeWhitespace } from "~/lib/format";
18 | 
19 | /**
20 |  * Table of examples for the clicked latent and layer, fetched with
21 |  * `getExamples`. Each token's backdrop opacity is its activation divided by
22 |  * the example's `act` value. Renders nothing while the list is undefined or
23 |  * empty.
24 |  */
25 | export default function LatentExamplesComponent({
26 |   className,
27 |   stateLatent,
28 |   stateLayer,
29 | }: {
30 |   className?: string;
31 |   stateLatent: Select<number>;
32 |   stateLayer: Select<number>;
33 | }) {
34 |   // The request is keyed on the `clicked` latent and layer.
35 |   const response = useSWR(
36 |     ["examples", stateLatent.clicked, stateLayer.clicked],
37 |     ([_key, latent, layer]) => getExamples(latent, layer),
38 |     { keepPreviousData: true },
39 |   );
40 |   const rows = response.data;
41 | 
42 |   if (rows === undefined || rows.length === 0) {
43 |     return null;
44 |   }
45 | 
46 |   return (
47 |     <Card className={className}>
48 |       <CardContent className="pt-4">
49 |         <Table>
50 |           <TableHeader>
51 |             <TableRow>
52 |               <TableHead>Example</TableHead>
53 |               <TableHead className="w-24 text-right">Activation</TableHead>
54 |             </TableRow>
55 |           </TableHeader>
56 |           <TableBody>
57 |             {rows.map((row, rowIndex) => (
58 |               <TableRow key={rowIndex} className="border-0 font-mono text-xs">
59 |                 <TableCell className="max-w-96 truncate">
60 |                   {row.tokens.map((token, pos) => (
61 |                     <ColorSpan key={pos} opacity={row.acts[pos] / row.act}>
62 |                       {escapeWhitespace(token)}
63 |                     </ColorSpan>
64 |                   ))}
65 |                 </TableCell>
66 |                 <TableCell className="pl-2 text-right">
67 |                   {row.act.toFixed(3)}
68 |                 </TableCell>
69 |               </TableRow>
70 |             ))}
71 |           </TableBody>
72 |         </Table>
73 |       </CardContent>
74 |     </Card>
75 |   );
76 | }
77 | 


--------------------------------------------------------------------------------
/app/app/components/LogitsTable.tsx:
--------------------------------------------------------------------------------
 1 | import ColorSpan from "~/app/components/ColorSpan";
 2 | import {
 3 |   Table,
 4 |   TableBody,
 5 |   TableCaption,
 6 |   TableCell,
 7 |   TableRow,
 8 | } from "~/components/ui/table";
 9 | import { escapeWhitespace, toSigned, toUnitInterval } from "~/lib/format";
10 | import { LogitType } from "~/lib/models";
11 | 
12 | /**
13 |  * Captioned table with one row per logit: the token, with a backdrop whose
14 |  * opacity is derived from the absolute logit via `toUnitInterval`, followed by
15 |  * either its probability or its signed logit value.
16 |  *
17 |  * @param caption Text shown in the caption above the rows.
18 |  * @param data Logits to list; no rows are rendered while this is `undefined`.
19 |  * @param color Class name(s) passed to `ColorSpan` for the token backdrop.
20 |  */
21 | export default function LogitsTable({
22 |   caption,
23 |   data,
24 |   color,
25 | }: {
26 |   caption: string;
27 |   data: LogitType[] | undefined;
28 |   color: string;
29 | }) {
30 |   // The first row decides which value column is shown for every row. Both
31 |   // lookups are optional so that an empty array does not throw when reading
32 |   // `prob` from a missing first element.
33 |   const hasProbability = data?.[0]?.prob !== null;
34 |   return (
35 |     <Table className="caption-top">
36 |       <TableCaption className="h-8 m-0 pb-2 flex items-center text-left font-medium leading-none tracking-wide uppercase">
37 |         {caption}
38 |       </TableCaption>
39 |       <TableBody className="font-mono text-xs">
40 |         {data?.map((logit) => (
41 |           <TableRow key={logit.id} className="border-b-0">
42 |             <TableCell className="px-0 py-0.5 max-w-12 truncate text-left">
43 |               <ColorSpan
44 |                 opacity={toUnitInterval(Math.abs(logit.logit))}
45 |                 color={color}
46 |               >
47 |                 {escapeWhitespace(logit.token)}
48 |               </ColorSpan>
49 |             </TableCell>
50 |             <TableCell className="px-0 py-0.5 text-right">
51 |               {hasProbability ? logit.prob?.toFixed(3) : toSigned(logit.logit)}
52 |             </TableCell>
53 |           </TableRow>
54 |         ))}
55 |       </TableBody>
56 |     </Table>
57 |   );
58 | }
59 | 


--------------------------------------------------------------------------------
/app/app/components/prompt/PromptActivations.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import ColorSpan from "~/app/components/ColorSpan";
 4 | import { Select } from "~/app/use-select";
 5 | import { Card, CardContent } from "~/components/ui/card";
 6 | import { escapeWhitespace } from "~/lib/format";
 7 | import { LatentActivationsType, TokenType } from "~/lib/models";
 8 | import { cn } from "~/lib/utils";
 9 | 
10 | /**
11 |  * Renders the prompt's tokens, each with a backdrop whose opacity is the
12 |  * active latent's activation at that token divided by the maximum recorded
13 |  * for the active layer and token position. The token at the active position
14 |  * is outlined; clicking or hovering a token reports its position through
15 |  * `statePosition`.
16 |  */
17 | export default function PromptActivationsComponent({
18 |   className,
19 |   tokens,
20 |   latentActivations,
21 |   stateLatent,
22 |   stateLayer,
23 |   statePosition,
24 | }: {
25 |   className?: string;
26 |   tokens: TokenType[];
27 |   latentActivations: LatentActivationsType;
28 |   stateLatent: Select<number>;
29 |   stateLayer: Select<number>;
30 |   statePosition: Select<number>;
31 | }) {
32 |   return (
33 |     <Card className={className}>
34 |       <CardContent className="pt-4">
35 |         <div className="flex flex-wrap font-mono text-xs">
36 |           {tokens.map((token) => {
37 |             const absolute =
38 |               latentActivations.values[stateLayer.active][token.pos][
39 |                 stateLatent.active
40 |               ];
41 | 
42 |             const max = latentActivations.max[stateLayer.active][token.pos];
43 | 
44 |             // Guard the division: a zero maximum would make the opacity NaN.
45 |             const relative = max > 0 ? absolute / max : 0;
46 | 
47 |             return (
48 |               <ColorSpan
49 |                 key={token.pos}
50 |                 className={cn(
51 |                   token.pos === statePosition.active &&
52 |                     "z-40 ring ring-slate-900 dark:ring-slate-100",
53 |                 )}
54 |                 opacity={relative}
55 |                 // Report `token.pos`, the same coordinate that the highlight
56 |                 // check and the activation lookups above use.
57 |                 onClick={() => statePosition.onClick(token.pos)}
58 |                 onMouseEnter={() => statePosition.onMouseEnter(token.pos)}
59 |                 onMouseLeave={statePosition.onMouseLeave}
60 |               >
61 |                 {escapeWhitespace(token.token)}
62 |               </ColorSpan>
63 |             );
64 |           })}
65 |         </div>
66 |       </CardContent>
67 |     </Card>
68 |   );
69 | }
70 | 


--------------------------------------------------------------------------------
/app/app/components/prompt/PromptLatentHeatmaps.tsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | 
  3 | import React from "react";
  4 | import ColorSpan from "~/app/components/ColorSpan";
  5 | import { Select } from "~/app/use-select";
  6 | import { Card, CardContent } from "~/components/ui/card";
  7 | import {
  8 |   Table,
  9 |   TableBody,
 10 |   TableCell,
 11 |   TableHead,
 12 |   TableHeader,
 13 |   TableRow,
 14 | } from "~/components/ui/table";
 15 | import { LatentActivationsType } from "~/lib/models";
 16 | import { cn } from "~/lib/utils";
 17 | 
 18 | /**
 19 |  * Table with one row per latent returned by `getLatentHeatmaps` and one
 20 |  * column per layer. Each cell shows the absolute activation when it is
 21 |  * positive and passes the relative activation to `ColorSpan` as its opacity.
 22 |  *
 23 |  * Clicking or hovering a row updates `stateLatent`; clicking or hovering a
 24 |  * cell updates `stateLayer`. Rows are computed at `statePosition.clicked`.
 25 |  */
 26 | export default function LatentHeatmapComponent({
 27 |   latentActivations,
 28 |   threshold,
 29 |   stateLatent,
 30 |   stateLayer,
 31 |   statePosition,
 32 |   perToken = true,
 33 | }: {
 34 |   latentActivations: LatentActivationsType;
 35 |   threshold: number;
 36 |   stateLatent: Select<number>;
 37 |   stateLayer: Select<number>;
 38 |   statePosition: Select<number>;
 39 |   perToken?: boolean;
 40 | }) {
 41 |   const position = statePosition.clicked;
 42 |   const layerIndices = latentActivations.max.map((_, layer) => layer);
 43 | 
 44 |   // Recomputed only when one of the listed inputs changes.
 45 |   const latentHeatmaps = React.useMemo(
 46 |     () => getLatentHeatmaps(latentActivations, threshold, position, perToken),
 47 |     [latentActivations, threshold, position, perToken],
 48 |   );
 49 | 
 50 |   return (
 51 |     <Card>
 52 |       <CardContent className="pt-4">
 53 |         <Table className="border-collapse" cellSpacing={0} cellPadding={0}>
 54 |           <TableHeader>
 55 |             <TableRow>
 56 |               <TableHead className="w-16">Latent</TableHead>
 57 |               <TableHead className="w-32 pr-4 text-right">Mean Layer</TableHead>
 58 |               {layerIndices.map((layer) => (
 59 |                 <TableHead key={layer} className="text-center">
 60 |                   Layer {layer}
 61 |                 </TableHead>
 62 |               ))}
 63 |             </TableRow>
 64 |           </TableHeader>
 65 |           <TableBody className="font-mono text-xs">
 66 |             {latentHeatmaps.map(({ latent, layers: cells, layer_mean }) => (
 67 |               <TableRow
 68 |                 key={latent}
 69 |                 className="border-0"
 70 |                 onClick={() => stateLatent.onClick(latent)}
 71 |                 onMouseEnter={() => stateLatent.onMouseEnter(latent)}
 72 |                 onMouseLeave={stateLatent.onMouseLeave}
 73 |               >
 74 |                 <TableCell className="font-normal">{latent}</TableCell>
 75 |                 <TableCell className="pr-4 text-right">
 76 |                   {layer_mean.toFixed(2)}
 77 |                 </TableCell>
 78 |                 {layerIndices.map((layer) => {
 79 |                   // A layer with no entry for this latent falls back to zeros.
 80 |                   const cell = cells.find((entry) => entry.layer === layer);
 81 |                   const absolute = cell?.absolute ?? 0;
 82 |                   const relative = cell?.relative ?? 0;
 83 |                   const label = absolute.toFixed(3);
 84 |                   const text =
 85 |                     absolute > 0 ? (
 86 |                       <span className="text-xs font-mono">{label}</span>
 87 |                     ) : null;
 88 |                   return (
 89 |                     <TableCell key={layer} className="p-0 h-6">
 90 |                       <ColorSpan
 91 |                         key={layer}
 92 |                         className={cn(
 93 |                           "h-full flex items-center justify-center",
 94 |                           layer === stateLayer.active &&
 95 |                             "bg-slate-200/50 dark:bg-slate-800/50",
 96 |                         )}
 97 |                         opacity={relative}
 98 |                         onClick={() => stateLayer.onClick(layer)}
 99 |                         onMouseEnter={() => stateLayer.onMouseEnter(layer)}
100 |                         onMouseLeave={stateLayer.onMouseLeave}
101 |                         title={label}
102 |                       >
103 |                         {text}
104 |                       </ColorSpan>
105 |                     </TableCell>
106 |                   );
107 |                 })}
108 |               </TableRow>
109 |             ))}
110 |           </TableBody>
111 |         </Table>
112 |       </CardContent>
113 |     </Card>
114 |   );
115 | }
116 | 
117 | /** One row of the heatmap table. */
118 | interface LatentHeatmap {
119 |   /** Index of the latent. */
120 |   latent: number;
121 |   /** Per-layer activations whose relative value exceeded the threshold. */
122 |   layers: { layer: number; absolute: number; relative: number }[];
123 |   /** Activation-weighted mean of the layer indices. */
124 |   layer_mean: number;
125 | }
126 | 
127 | /**
128 |  * Builds the rows of the latent heatmap table.
129 |  *
130 |  * A latent gets a row when its relative activation exceeds `threshold` in at
131 |  * least one layer; the row lists those layers. With `perToken`, activations
132 |  * are read at `position`; otherwise they are summed over all positions. The
133 |  * relative activation divides by the matching entry (or sum) of `max`.
134 |  *
135 |  * `layer_mean` is the mean layer index weighted by the positive activations,
136 |  * taken from the same per-token or summed activations as the cells, and is 0
137 |  * when no activation is positive. Rows are sorted by ascending `layer_mean`,
138 |  * then by descending total of the listed absolute activations.
139 |  */
140 | function getLatentHeatmaps(
141 |   latentActivations: LatentActivationsType,
142 |   threshold: number,
143 |   position: number,
144 |   perToken = true,
145 | ) {
146 |   const { values, max } = latentActivations;
147 |   const nLayers = max.length;
148 |   // Empty input yields no rows instead of failing on `values[0][0]`.
149 |   const nLatents = values[0]?.[0]?.length ?? 0;
150 | 
151 |   // Activation of one latent in one layer, per token or summed over tokens.
152 |   const activationAt = (layer: number, latent: number) =>
153 |     perToken
154 |       ? values[layer][position][latent]
155 |       : values[layer].reduce((total, row) => total + row[latent], 0);
156 | 
157 |   const latentMap: Record<number, LatentHeatmap["layers"]> = {};
158 |   for (let layer = 0; layer < nLayers; layer++) {
159 |     // The normaliser does not depend on the latent, so compute it once.
160 |     const scale = perToken
161 |       ? max[layer][position]
162 |       : max[layer].reduce((total, value) => total + value, 0);
163 |     for (let latent = 0; latent < nLatents; latent++) {
164 |       const absolute = activationAt(layer, latent);
165 |       const relative = absolute / scale;
166 |       if (relative > threshold) {
167 |         if (latentMap[latent] === undefined) {
168 |           latentMap[latent] = [];
169 |         }
170 |         latentMap[latent].push({ layer, absolute, relative });
171 |       }
172 |     }
173 |   }
174 | 
175 |   // Total of the listed activations per latent, used to break sort ties.
176 |   const shown: Record<number, number> = {};
177 |   const latentHeatmaps: LatentHeatmap[] = Object.entries(latentMap).map(
178 |     ([key, layers]) => {
179 |       const latent = Number(key);
180 |       let total = 0;
181 |       let weighted = 0;
182 |       for (let layer = 0; layer < nLayers; layer++) {
183 |         const activation = activationAt(layer, latent);
184 |         if (activation > 0) {
185 |           total += activation;
186 |           weighted += activation * layer;
187 |         }
188 |       }
189 |       shown[latent] = layers.reduce((sum, cell) => sum + cell.absolute, 0);
190 |       return {
191 |         latent,
192 |         layers,
193 |         // Guard the division: with no positive activation it would be 0 / 0,
194 |         // and a NaN mean would also make the sort comparator inconsistent.
195 |         layer_mean: total > 0 ? weighted / total : 0,
196 |       };
197 |     },
198 |   );
199 | 
200 |   // Ascending mean layer; ties are broken by descending listed total.
201 |   latentHeatmaps.sort(
202 |     (a, b) =>
203 |       a.layer_mean - b.layer_mean ||
204 |       shown[b.latent] - shown[a.latent],
205 |   );
206 | 
207 |   return latentHeatmaps;
208 | }
209 | 


--------------------------------------------------------------------------------
/app/app/components/prompt/PromptLayerHistograms.tsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | 
  3 | import { scaleSymlog } from "d3-scale";
  4 | import React from "react";
  5 | import { Area, AreaChart, XAxis, YAxis } from "recharts";
  6 | import useSWR from "swr";
  7 | import colors from "tailwindcss/colors";
  8 | import { useDarkMode } from "usehooks-ts";
  9 | import { Select } from "~/app/use-select";
 10 | import { Card, CardContent } from "~/components/ui/card";
 11 | import { ChartConfig, ChartContainer } from "~/components/ui/chart";
 12 | import {
 13 |   Table,
 14 |   TableBody,
 15 |   TableCell,
 16 |   TableHead,
 17 |   TableHeader,
 18 |   TableRow,
 19 | } from "~/components/ui/table";
 20 | import { getPromptLayerHistograms } from "~/lib/api";
 21 | import { toSigned } from "~/lib/format";
 22 | import { LayerHistogramsType } from "~/lib/models";
 23 | import { cn } from "~/lib/utils";
 24 | 
 25 | // Chart settings shared by every row, created once at module scope.
 26 | const symlog = scaleSymlog();
 27 | const chartConfig = { count: {} } satisfies ChartConfig;
 28 | // Zero margin on every side of the chart.
 29 | const chartMargin = { top: 0, left: 0, right: 0, bottom: 0 } as const;
 30 | 
 31 | /** Chart points of one layer: `{ edge, value }` per edge, without the last. */
 32 | function chartData(histograms: LayerHistogramsType, layer: number) {
 33 |   const counts = histograms.values[layer];
 34 |   const points = histograms.edges.map((edge, index) => {
 35 |     return { edge, value: counts[index] };
 36 |   });
 37 |   return points.slice(0, -1);
 38 | }
 39 | 
 40 | /**
 41 |  * Table with one row per layer, each showing that layer's histogram for the
 42 |  * prompt as a filled step chart. Clicking or hovering a row updates
 43 |  * `stateLayer`. Renders nothing until the histograms have been fetched.
 44 |  */
 45 | export default function LayerHistogramsComponent({
 46 |   prompt,
 47 |   stateLayer,
 48 | }: {
 49 |   prompt: string;
 50 |   stateLayer: Select<number>;
 51 | }) {
 52 |   const { data: histograms } = useSWR(
 53 |     ["prompt/layer-histograms", prompt],
 54 |     ([_key, prompt]) => getPromptLayerHistograms(prompt),
 55 |     { keepPreviousData: true },
 56 |   );
 57 | 
 58 |   const { isDarkMode } = useDarkMode();
 59 | 
 60 |   if (!histograms) {
 61 |     return null;
 62 |   }
 63 | 
 64 |   const nLayers = histograms.values.length;
 65 |   const layers = Array.from({ length: nLayers }, (_, i) => i);
 66 | 
 67 |   return (
 68 |     <Card>
 69 |       <CardContent className="pt-4">
 70 |         <Table className="table-fixed">
 71 |           <TableHeader>
 72 |             <TableRow>
 73 |               <TableHead className="w-16">Layer</TableHead>
 74 |               <TableHead className="text-left">Histogram</TableHead>
 75 |             </TableRow>
 76 |           </TableHeader>
 77 |           {/* <tbody> is only valid inside <table>, so the body stays in <Table>. */}
 78 |           <TableBody className="text-xs font-mono">
 79 |             {layers.map((layer) => {
 80 |               const isActive = layer === stateLayer.active;
 81 |               const color = isActive
 82 |                 ? colors.orange[isDarkMode ? 800 : 400]
 83 |                 : colors.slate[isDarkMode ? 100 : 900];
 84 |               return (
 85 |                 <TableRow
 86 |                   key={layer}
 87 |                   onClick={() => stateLayer.onClick(layer)}
 88 |                   onMouseEnter={() => stateLayer.onMouseEnter(layer)}
 89 |                   onMouseLeave={stateLayer.onMouseLeave}
 90 |                 >
 91 |                   <TableCell className="w-16">{layer}</TableCell>
 92 |                   <TableCell>
 93 |                     <ChartContainer
 94 |                       config={chartConfig}
 95 |                       className={cn(
 96 |                         "h-14 w-[calc(100vw-8rem)]",
 97 |                         isActive && "bg-slate-100 dark:bg-slate-900",
 98 |                       )}
 99 |                     >
100 |                       <AreaChart
101 |                         accessibilityLayer
102 |                         data={chartData(histograms, layer)}
103 |                         margin={chartMargin}
104 |                       >
105 |                         <Area
106 |                           type="step"
107 |                           dataKey="value"
108 |                           dot={false}
109 |                           activeDot={false}
110 |                           stroke="none"
111 |                           fill={color}
112 |                           fillOpacity={1}
113 |                           isAnimationActive={false}
114 |                         />
115 |                         <XAxis hide tickFormatter={toSigned} />
116 |                         <YAxis hide scale={symlog} />
117 |                       </AreaChart>
118 |                     </ChartContainer>
119 |                   </TableCell>
120 |                 </TableRow>
121 |               );
122 |             })}
123 |           </TableBody>
124 |         </Table>
125 |       </CardContent>
126 |     </Card>
127 |   );
128 | }
129 | 


--------------------------------------------------------------------------------
/app/app/components/prompt/PromptLogitsInput.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import LogitsTable from "~/app/components/LogitsTable";
 4 | import { Select } from "~/app/use-select";
 5 | import { Card, CardContent, CardHeader, CardTitle } from "~/components/ui/card";
 6 | import { MaxLogitsType } from "~/lib/models";
 7 | 
 8 | /** Card with the "Max logits" table of the input at the active position. */
 9 | export default function PromptLogitsInput({
10 |   values,
11 |   statePosition,
12 |   className,
13 | }: {
14 |   values: MaxLogitsType;
15 |   statePosition: Select<number>;
16 |   className?: string;
17 | }) {
18 |   const logits = values.max[statePosition.active];
19 |   const color = "bg-red-400/50 dark:bg-red-800/50";
20 | 
21 |   return (
22 |     <Card className={className}>
23 |       <CardHeader>
24 |         <CardTitle>Input activations</CardTitle>
25 |       </CardHeader>
26 |       <CardContent>
27 |         <LogitsTable caption="Max logits" data={logits} color={color} />
28 |       </CardContent>
29 |     </Card>
30 |   );
31 | }
32 | 


--------------------------------------------------------------------------------
/app/app/components/prompt/PromptLogitsRecon.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import useSWR from "swr";
 4 | import LogitsTable from "~/app/components/LogitsTable";
 5 | import { Select } from "~/app/use-select";
 6 | import { Card, CardContent, CardHeader, CardTitle } from "~/components/ui/card";
 7 | import { getPromptLogitsRecon as getPromptLogitsRecon } from "~/lib/api";
 8 | 
 9 | /**
10 |  * Card of logit tables for the reconstruction at the clicked layer, read at
11 |  * the clicked position: max logits, max delta, and min delta. Each table gets
12 |  * `undefined` data while the SWR response is not available.
13 |  */
14 | export default function PromptLogitsRecon({
15 |   prompt,
16 |   stateLayer,
17 |   statePosition,
18 |   className,
19 | }: {
20 |   prompt: string;
21 |   stateLayer: Select<number>;
22 |   statePosition: Select<number>;
23 |   className?: string;
24 | }) {
25 |   const layer = stateLayer.clicked;
26 |   const position = statePosition.clicked;
27 |   const red = "bg-red-400/50 dark:bg-red-800/50";
28 |   const indigo = "bg-indigo-400/50 dark:bg-indigo-800/50";
29 | 
30 |   // The clicked layer is part of the SWR key.
31 |   const { data } = useSWR(
32 |     ["prompt/logits-recon", prompt, layer],
33 |     ([_key, prompt, layer]) => getPromptLogitsRecon(prompt, layer),
34 |     { keepPreviousData: false },
35 |   );
36 |   const [values, changes] = data ?? [];
37 |   const maxLogits = values?.max[position];
38 |   const maxDelta = changes?.max[position];
39 |   const minDelta = changes?.min[position];
40 | 
41 |   return (
42 |     <Card className={className}>
43 |       <CardHeader>
44 |         <CardTitle>Reconstruction at layer {layer}</CardTitle>
45 |       </CardHeader>
46 |       <CardContent className="flex gap-4">
47 |         <LogitsTable caption="Max logits" data={maxLogits} color={red} />
48 |         <LogitsTable caption="Max delta" data={maxDelta} color={red} />
49 |         <LogitsTable caption="Min delta" data={minDelta} color={indigo} />
50 |       </CardContent>
51 |     </Card>
52 |   );
53 | }
54 | 


--------------------------------------------------------------------------------
/app/app/components/prompt/PromptLogitsSteer.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import useSWR from "swr";
 4 | import LogitsTable from "~/app/components/LogitsTable";
 5 | import { Select } from "~/app/use-select";
 6 | import { Card, CardContent, CardHeader, CardTitle } from "~/components/ui/card";
 7 | import { getPromptLogitsSteer } from "~/lib/api";
 8 | 
 9 | /**
10 |  * Card of logit tables for steering with the clicked latent at the clicked
11 |  * layer and the given `factor`, read at the clicked position: max logits,
12 |  * max delta, and min delta. Each table gets `undefined` data while the SWR
13 |  * response is not available.
14 |  */
15 | export default function PromptLogitsSteered({
16 |   prompt,
17 |   stateLatent,
18 |   stateLayer,
19 |   statePosition,
20 |   factor,
21 |   className,
22 | }: {
23 |   prompt: string;
24 |   stateLatent: Select<number>;
25 |   stateLayer: Select<number>;
26 |   statePosition: Select<number>;
27 |   factor: number;
28 |   className?: string;
29 | }) {
30 |   const latent = stateLatent.clicked;
31 |   const layer = stateLayer.clicked;
32 |   const position = statePosition.clicked;
33 | 
34 |   const { data } = useSWR(
35 |     ["prompt/logits-steer", prompt, latent, layer, factor],
36 |     ([_key, prompt, latent, layer, factor]) =>
37 |       getPromptLogitsSteer(prompt, latent, layer, factor),
38 |     { keepPreviousData: false },
39 |   );
40 |   const [values, changes] = data ?? [];
41 | 
42 |   return (
43 |     <Card className={className}>
44 |       <CardHeader>
45 |         <CardTitle>Steered by latent {latent} at layer {layer}</CardTitle>
46 |       </CardHeader>
47 |       <CardContent className="flex gap-4">
48 |         <LogitsTable
49 |           caption="Max logits"
50 |           data={values?.max[position]}
51 |           color="bg-red-400/50 dark:bg-red-800/50"
52 |         />
53 |         <LogitsTable
54 |           caption="Max delta"
55 |           data={changes?.max[position]}
56 |           color="bg-red-400/50 dark:bg-red-800/50"
57 |         />
58 |         <LogitsTable
59 |           caption="Min delta"
60 |           data={changes?.min[position]}
61 |           color="bg-indigo-400/50 dark:bg-indigo-800/50"
62 |         />
63 |       </CardContent>
64 |     </Card>
65 |   );
66 | }
67 | 


--------------------------------------------------------------------------------
/app/app/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tim-lawson/mlsae/03ad37a0a1b4541d763859cb0c7c9ccb7ce67867/app/app/favicon.ico


--------------------------------------------------------------------------------
/app/app/globals.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 | 


--------------------------------------------------------------------------------
/app/app/layout.tsx:
--------------------------------------------------------------------------------
 1 | import { Inter } from "next/font/google";
 2 | import "./globals.css";
 3 | import { cn } from "~/lib/utils";
 4 | 
 5 | // Inter font loaded through next/font/google with the "latin" and "latin-ext"
 6 | // subsets; its class name is applied to <body> by the layout.
 7 | const inter = Inter({ subsets: ["latin", "latin-ext"] });
 8 | 
 9 | /** Root layout: a sticky title bar above the page content, in the Inter font. */
10 | export default async function Layout({
11 |   children,
12 | }: {
13 |   children: React.ReactNode;
14 | }) {
15 |   const bodyClass = cn(
16 |     inter.className,
17 |     "bg-slate-100 dark:bg-slate-900 text-slate-950 dark:text-slate-200 h-screen overflow-hidden",
18 |   );
19 |   const headerClass = cn(
20 |     "sticky top-0 z-50 h-12 flex items-center px-6 text-xl font-medium tracking-wide",
21 |     "bg-slate-800 dark:bg-slate-950 text-slate-100 dark:text-slate-50",
22 |   );
23 | 
24 |   return (
25 |     <html lang="en" className="text-primary">
26 |       <body className={bodyClass}>
27 |         <header className={headerClass}>
28 |           Residual Stream Analysis with Multi-Layer SAEs
29 |         </header>
30 |         <main className="p-4 pb-0">{children}</main>
31 |       </body>
32 |     </html>
33 |   );
34 | }
35 | 


--------------------------------------------------------------------------------
/app/app/page.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import { zodResolver } from "@hookform/resolvers/zod";
 4 | import { useRouter } from "next/navigation";
 5 | import { useForm } from "react-hook-form";
 6 | import { z } from "zod";
 7 | import { Button } from "~/components/ui/button";
 8 | import {
 9 |   Form,
10 |   FormControl,
11 |   FormField,
12 |   FormItem,
13 |   FormLabel,
14 |   FormMessage,
15 | } from "~/components/ui/form";
16 | import { Textarea } from "~/components/ui/textarea";
17 | 
18 | // Reject an empty prompt: it would navigate to "/prompt/", which has no page.
19 | // The form already renders <FormMessage /> for this field.
20 | const FormSchema = z.object({ prompt: z.string().min(1, "Enter a prompt.") });
21 | 
22 | /** Home page form: submitting navigates to the page of the entered prompt. */
23 | export default function InputForm() {
24 |   const form = useForm<z.infer<typeof FormSchema>>({
25 |     resolver: zodResolver(FormSchema),
26 |     defaultValues: { prompt: "" },
27 |   });
28 | 
29 |   const router = useRouter();
30 | 
31 |   // The prompt is URI-encoded so that it forms a single path segment.
32 |   const onSubmit = ({ prompt }: z.infer<typeof FormSchema>) => {
33 |     router.push(`/prompt/${encodeURIComponent(prompt)}`);
34 |   };
35 | 
36 |   return (
37 |     <Form {...form}>
38 |       <form
39 |         onSubmit={form.handleSubmit(onSubmit)}
40 |         className="flex flex-col gap-2 items-end"
41 |       >
42 |         <FormField
43 |           control={form.control}
44 |           name="prompt"
45 |           render={({ field }) => (
46 |             <FormItem className="w-full">
47 |               <FormLabel>Prompt</FormLabel>
48 |               <FormControl>
49 |                 <Textarea {...field} />
50 |               </FormControl>
51 |               <FormMessage />
52 |             </FormItem>
53 |           )}
54 |         />
55 |         <Button type="submit" size="sm">
56 |           Submit
57 |         </Button>
58 |       </form>
59 |     </Form>
60 |   );
61 | }
62 | 


--------------------------------------------------------------------------------
/app/app/prompt/[prompt]/page.client.tsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | 
  3 | import React from "react";
  4 | import Controls from "~/app/components/Controls";
  5 | import LatentExamples from "~/app/components/LatentExamples";
  6 | import PromptActivations from "~/app/components/prompt/PromptActivations";
  7 | import PromptLatentHeatmaps from "~/app/components/prompt/PromptLatentHeatmaps";
  8 | import PromptLayerHistograms from "~/app/components/prompt/PromptLayerHistograms";
  9 | import PromptLogitsInput from "~/app/components/prompt/PromptLogitsInput";
 10 | import PromptLogitsRecon from "~/app/components/prompt/PromptLogitsRecon";
 11 | import PromptLogitsSteer from "~/app/components/prompt/PromptLogitsSteer";
 12 | import useSelect from "~/app/use-select";
 13 | import { Tabs, TabsContent, TabsList, TabsTrigger } from "~/components/ui/tabs";
 14 | import type {
 15 |   LatentActivationsType,
 16 |   MaxLogitsType,
 17 |   TokenType,
 18 | } from "~/lib/models";
 19 | 
 20 | /**
 21 |  * Client page for one prompt. Holds the selected latent, layer, and position
 22 |  * plus the threshold and factor, and passes them to the controls, the token
 23 |  * activations, and the histogram, heatmap, example, and logit tabs.
 24 |  *
 25 |  * `parameters` is accepted but not read by this component.
 26 |  */
 27 | export default function Page({
 28 |   parameters,
 29 |   prompt,
 30 |   tokens,
 31 |   latentActivations,
 32 |   logitsSource,
 33 | }: {
 34 |   parameters: any;
 35 |   prompt: string;
 36 |   tokens: TokenType[];
 37 |   latentActivations: LatentActivationsType;
 38 |   logitsSource: MaxLogitsType;
 39 | }) {
 40 |   const nLatents = latentActivations.values[0][0].length;
 41 |   const nLayers = latentActivations.max.length;
 42 |   const nPositions = tokens.length;
 43 | 
 44 |   const stateLatent = useSelect(0);
 45 |   const stateLayer = useSelect(0);
 46 |   const statePosition = useSelect(0);
 47 |   const [threshold, setThreshold] = React.useState(0.05);
 48 |   const [factor, setFactor] = React.useState(0);
 49 | 
 50 |   // Classes shared by the tab panels: capped height with vertical scrolling.
 51 |   const tabClass = "mt-2 max-h-[calc(100vh-20.5rem)] overflow-y-auto";
 52 | 
 53 |   return (
 54 |     <div className="flex flex-col gap-4 overflow-hidden">
 55 |       <Controls
 56 |         nLatents={nLatents}
 57 |         nLayers={nLayers}
 58 |         nPositions={nPositions}
 59 |         stateLatent={stateLatent}
 60 |         stateLayer={stateLayer}
 61 |         statePosition={statePosition}
 62 |         threshold={threshold}
 63 |         onChangeThreshold={setThreshold}
 64 |         factor={factor}
 65 |         onChangeFactor={setFactor}
 66 |       />
 67 |       <PromptActivations
 68 |         className="h-32 overflow-y-auto"
 69 |         tokens={tokens}
 70 |         latentActivations={latentActivations}
 71 |         stateLatent={stateLatent}
 72 |         stateLayer={stateLayer}
 73 |         statePosition={statePosition}
 74 |       />
 75 |       <Tabs defaultValue="histogram">
 76 |         <TabsList className="">
 77 |           <TabsTrigger value="histogram">Histograms</TabsTrigger>
 78 |           <TabsTrigger value="heatmap">Heatmaps</TabsTrigger>
 79 |           <TabsTrigger value="example">Examples</TabsTrigger>
 80 |           <TabsTrigger value="logit">Logits</TabsTrigger>
 81 |         </TabsList>
 82 |         <TabsContent value="histogram" className={tabClass}>
 83 |           <PromptLayerHistograms prompt={prompt} stateLayer={stateLayer} />
 84 |         </TabsContent>
 85 |         <TabsContent value="heatmap" className={tabClass}>
 86 |           <PromptLatentHeatmaps
 87 |             latentActivations={latentActivations}
 88 |             stateLatent={stateLatent}
 89 |             stateLayer={stateLayer}
 90 |             statePosition={statePosition}
 91 |             threshold={threshold}
 92 |           />
 93 |         </TabsContent>
 94 |         <TabsContent value="example" className={tabClass}>
 95 |           <LatentExamples
 96 |             className="w-[calc(100vw-2rem)]"
 97 |             stateLatent={stateLatent}
 98 |             stateLayer={stateLayer}
 99 |           />
100 |         </TabsContent>
101 |         <TabsContent
102 |           value="logit"
103 |           className={`${tabClass} grid grid-cols-5 gap-4`}
104 |         >
105 |           <PromptLogitsInput
106 |             className="col-span-1"
107 |             values={logitsSource}
108 |             statePosition={statePosition}
109 |           />
110 |           <PromptLogitsRecon
111 |             className="col-span-2"
112 |             prompt={prompt}
113 |             stateLayer={stateLayer}
114 |             statePosition={statePosition}
115 |           />
116 |           <PromptLogitsSteer
117 |             className="col-span-2"
118 |             prompt={prompt}
119 |             stateLatent={stateLatent}
120 |             stateLayer={stateLayer}
121 |             statePosition={statePosition}
122 |             factor={factor}
123 |           />
124 |         </TabsContent>
125 |       </Tabs>
126 |     </div>
127 |   );
128 | }
129 | 


--------------------------------------------------------------------------------
/app/app/prompt/[prompt]/page.tsx:
--------------------------------------------------------------------------------
 1 | import {
 2 |   getParameters,
 3 |   getPromptLatentActivations,
 4 |   getPromptLogitsInput,
 5 |   getPromptTokens,
 6 | } from "~/lib/api";
 7 | import PageComponent from "./page.client";
 8 | 
 9 | // Opt out of caching. Otherwise, responses persist when loading different models.
10 | export const fetchCache = "force-no-store";
11 | 
12 | /**
13 |  * Async page for `/prompt/[prompt]`: fetches the data for the prompt and
14 |  * hands it to the client page component.
15 |  */
16 | export default async function Page({ params }: { params: { prompt: string } }) {
17 |   // Decode the route parameter before it is sent to the API.
18 |   const prompt = decodeURIComponent(params.prompt);
19 | 
20 |   // Property initialisers run in order, so the requests stay sequential.
21 |   const data = {
22 |     parameters: await getParameters(),
23 |     tokens: await getPromptTokens(prompt),
24 |     latentActivations: await getPromptLatentActivations(prompt),
25 |     logitsSource: await getPromptLogitsInput(prompt),
26 |   };
27 | 
28 |   return <PageComponent prompt={prompt} {...data} />;
29 | }
30 | 


--------------------------------------------------------------------------------
/app/app/use-select.tsx:
--------------------------------------------------------------------------------
 1 | import * as React from "react";
 2 | 
 3 | /**
 4 |  * Selection state with a clicked value and a transient hovered value.
 5 |  * `active` is the hovered value while one is set and the clicked value
 6 |  * otherwise; `onMouseLeave` clears the hovered value.
 7 |  */
 8 | export default function useSelect<T>(initialState: T) {
 9 |   const [clicked, setClicked] = React.useState<T>(initialState);
10 |   // `null` means that nothing is hovered.
11 |   const [hovered, setHovered] = React.useState<T | null>(null);
12 | 
13 |   return {
14 |     active: hovered !== null ? hovered : clicked,
15 |     clicked,
16 |     onClick: (value: T) => setClicked(value),
17 |     onMouseEnter: (value: T) => setHovered(value),
18 |     onMouseLeave: () => setHovered(null),
19 |   };
20 | }
21 | 
22 | export type Select<T> = ReturnType<typeof useSelect<T>>;
23 | 


--------------------------------------------------------------------------------
/app/components.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "$schema": "https://ui.shadcn.com/schema.json",
 3 |   "style": "default",
 4 |   "rsc": true,
 5 |   "tsx": true,
 6 |   "tailwind": {
 7 |     "config": "tailwind.config.js",
 8 |     "css": "app/globals.css",
 9 |     "baseColor": "slate",
10 |     "cssVariables": false,
11 |     "prefix": ""
12 |   },
13 |   "aliases": {
14 |     "components": "~/components",
15 |     "utils": "~/lib/utils"
16 |   }
17 | }


--------------------------------------------------------------------------------
/app/components/ui/button.tsx:
--------------------------------------------------------------------------------
 1 | import { Slot } from "@radix-ui/react-slot";
 2 | import { VariantProps, cva } from "class-variance-authority";
 3 | import * as React from "react";
 4 | 
 5 | import { cn } from "~/lib/utils";
 6 | 
 7 | // Class names of the button, built with cva: a shared base plus one entry per
 8 | // `variant` and per `size`. Both options use their "default" entry when they
 9 | // are omitted.
10 | const buttonVariants = cva(
11 |   "inline-flex items-center justify-center whitespace-nowrap text-sm font-medium ring-offset-white transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-slate-950 focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 dark:ring-offset-slate-950 dark:focus-visible:ring-slate-300",
12 |   {
13 |     variants: {
14 |       variant: {
15 |         default:
16 |           "bg-slate-900 text-slate-50 hover:bg-slate-900/90 dark:bg-slate-50 dark:text-slate-900 dark:hover:bg-slate-50/90",
17 |         destructive:
18 |           "bg-red-500 text-slate-50 hover:bg-red-500/90 dark:bg-red-900 dark:text-slate-50 dark:hover:bg-red-900/90",
19 |         outline:
20 |           "border border-slate-200 bg-white hover:bg-slate-100 hover:text-slate-900 dark:border-slate-800 dark:bg-slate-950 dark:hover:bg-slate-800 dark:hover:text-slate-50",
21 |         secondary:
22 |           "bg-slate-100 text-slate-900 hover:bg-slate-100/80 dark:bg-slate-800 dark:text-slate-50 dark:hover:bg-slate-800/80",
23 |         ghost:
24 |           "hover:bg-slate-100 hover:text-slate-900 dark:hover:bg-slate-800 dark:hover:text-slate-50",
25 |         link: "text-slate-900 underline-offset-4 hover:underline dark:text-slate-50",
26 |       },
27 |       size: {
28 |         default: "h-10 px-4 py-2",
29 |         sm: "h-9 px-3",
30 |         lg: "h-11 px-8",
31 |         icon: "h-10 w-10",
32 |       },
33 |     },
34 |     defaultVariants: { variant: "default", size: "default" },
35 |   },
36 | );
37 | 
38 | /** Props of `Button`: native button attributes, variant options, `asChild`. */
39 | export interface ButtonProps
40 |   extends React.ButtonHTMLAttributes<HTMLButtonElement>,
41 |     VariantProps<typeof buttonVariants> {
42 |   asChild?: boolean;
43 | }
44 | 
45 | /**
46 |  * Button that renders a native `button`, or a Radix `Slot` when `asChild` is
47 |  * set. The class names come from `buttonVariants`.
48 |  */
49 | const Button = React.forwardRef<HTMLButtonElement, ButtonProps>(
50 |   ({ className, variant, size, asChild = false, ...rest }, ref) => {
51 |     const Comp = asChild ? Slot : "button";
52 |     const classes = cn(buttonVariants({ variant, size, className }));
53 |     return <Comp className={classes} ref={ref} {...rest} />;
54 |   },
55 | );
56 | Button.displayName = "Button";
57 | 
58 | export { Button, buttonVariants };
59 | 


--------------------------------------------------------------------------------
/app/components/ui/card.tsx:
--------------------------------------------------------------------------------
 1 | import * as React from "react";
 2 | 
 3 | import { cn } from "~/lib/utils";
 4 | 
 5 | // Outer container of a card. Renders a bordered `div`, forwards the ref, and
 6 | // combines the base classes with the caller's `className` through `cn`. Any
 7 | // other props are spread onto the `div`.
 8 | const Card = React.forwardRef<
 9 |   HTMLDivElement,
10 |   React.HTMLAttributes<HTMLDivElement>
11 | >(({ className, ...rest }, ref) => {
12 |   const classes = cn(
13 |     "border border-slate-200 bg-white text-slate-950 dark:border-slate-800 dark:bg-slate-950 dark:text-slate-50",
14 |     className,
15 |   );
16 |   return <div ref={ref} className={classes} {...rest} />;
17 | });
18 | Card.displayName = "Card";
19 | 
20 | // Header of a card: a `div` laid out as a padded flex column. Forwards the
21 | // ref, combines the base classes with the caller's `className` through `cn`,
22 | // and spreads any other props onto the `div`.
23 | const CardHeader = React.forwardRef<
24 |   HTMLDivElement,
25 |   React.HTMLAttributes<HTMLDivElement>
26 | >(({ className, ...rest }, ref) => {
27 |   const classes = cn("flex flex-col space-y-1.5 p-4", className);
28 |   return <div ref={ref} className={classes} {...rest} />;
29 | });
30 | CardHeader.displayName = "CardHeader";
31 | 
/**
 * Card title, rendered as an `<h3>`.
 *
 * The forwarded ref is typed as `HTMLHeadingElement` so that it matches the
 * element that is actually rendered (it was previously declared as
 * `HTMLParagraphElement`).
 */
const CardTitle = React.forwardRef<
  HTMLHeadingElement,
  React.HTMLAttributes<HTMLHeadingElement>
>(({ className, ...props }, ref) => (
  <h3
    ref={ref}
    // NOTE(review): `text-md` is not one of Tailwind's default font-size
    // utilities (`text-sm`, `text-base`, `text-lg`, ...) and the project's
    // tailwind.config.js defines no theme extension, so this class probably
    // has no effect. Confirm whether `text-base` was intended.
    className={cn(
      "text-md font-semibold leading-none tracking-tight",
      className,
    )}
    {...props}
  />
));
CardTitle.displayName = "CardTitle";
46 | 
/** Secondary card text: a small, muted `<p>`. */
const CardDescription = React.forwardRef<
  HTMLParagraphElement,
  React.HTMLAttributes<HTMLParagraphElement>
>((descriptionProps, forwardedRef) => {
  const { className, ...rest } = descriptionProps;
  const classes = cn("text-sm text-slate-500 dark:text-slate-400", className);

  return <p ref={forwardedRef} className={classes} {...rest} />;
});
CardDescription.displayName = "CardDescription";
58 | 
/** Card body region: padded on every side except the top. */
const CardContent = React.forwardRef<
  HTMLDivElement,
  React.HTMLAttributes<HTMLDivElement>
>((contentProps, forwardedRef) => {
  const { className, ...rest } = contentProps;

  return (
    <div ref={forwardedRef} className={cn("p-4 pt-0", className)} {...rest} />
  );
});
CardContent.displayName = "CardContent";
66 | 
/** Card footer region: a horizontally centred flex row without top padding. */
const CardFooter = React.forwardRef<
  HTMLDivElement,
  React.HTMLAttributes<HTMLDivElement>
>((footerProps, forwardedRef) => {
  const { className, ...rest } = footerProps;
  const classes = cn("flex items-center p-4 pt-0", className);

  return <div ref={forwardedRef} className={classes} {...rest} />;
});
CardFooter.displayName = "CardFooter";
78 | 
79 | export {
80 |   Card,
81 |   CardHeader,
82 |   CardFooter,
83 |   CardTitle,
84 |   CardDescription,
85 |   CardContent,
86 | };
87 | 


--------------------------------------------------------------------------------
/app/components/ui/form.tsx:
--------------------------------------------------------------------------------
  1 | "use client";
  2 | 
  3 | import * as LabelPrimitive from "@radix-ui/react-label";
  4 | import { Slot } from "@radix-ui/react-slot";
  5 | import * as React from "react";
  6 | import {
  7 |   Controller,
  8 |   ControllerProps,
  9 |   FieldPath,
 10 |   FieldValues,
 11 |   FormProvider,
 12 |   useFormContext,
 13 | } from "react-hook-form";
 14 | 
 15 | import { Label } from "~/components/ui/label";
 16 | import { cn } from "~/lib/utils";
 17 | 
 18 | const Form = FormProvider;
 19 | 
 20 | type FormFieldContextValue<
 21 |   TFieldValues extends FieldValues = FieldValues,
 22 |   TName extends FieldPath<TFieldValues> = FieldPath<TFieldValues>,
 23 | > = {
 24 |   name: TName;
 25 | };
 26 | 
 27 | const FormFieldContext = React.createContext<FormFieldContextValue>(
 28 |   {} as FormFieldContextValue,
 29 | );
 30 | 
 31 | const FormField = <
 32 |   TFieldValues extends FieldValues = FieldValues,
 33 |   TName extends FieldPath<TFieldValues> = FieldPath<TFieldValues>,
 34 | >({
 35 |   ...props
 36 | }: ControllerProps<TFieldValues, TName>) => {
 37 |   return (
 38 |     <FormFieldContext.Provider value={{ name: props.name }}>
 39 |       <Controller {...props} />
 40 |     </FormFieldContext.Provider>
 41 |   );
 42 | };
 43 | 
 44 | const useFormField = () => {
 45 |   const fieldContext = React.useContext(FormFieldContext);
 46 |   const itemContext = React.useContext(FormItemContext);
 47 |   const { getFieldState, formState } = useFormContext();
 48 | 
 49 |   const fieldState = getFieldState(fieldContext.name, formState);
 50 | 
 51 |   if (!fieldContext) {
 52 |     throw new Error("useFormField should be used within <FormField>");
 53 |   }
 54 | 
 55 |   const { id } = itemContext;
 56 | 
 57 |   return {
 58 |     id,
 59 |     name: fieldContext.name,
 60 |     formItemId: `${id}-form-item`,
 61 |     formDescriptionId: `${id}-form-item-description`,
 62 |     formMessageId: `${id}-form-item-message`,
 63 |     ...fieldState,
 64 |   };
 65 | };
 66 | 
 67 | type FormItemContextValue = {
 68 |   id: string;
 69 | };
 70 | 
 71 | const FormItemContext = React.createContext<FormItemContextValue>(
 72 |   {} as FormItemContextValue,
 73 | );
 74 | 
 75 | const FormItem = React.forwardRef<
 76 |   HTMLDivElement,
 77 |   React.HTMLAttributes<HTMLDivElement>
 78 | >(({ className, ...props }, ref) => {
 79 |   const id = React.useId();
 80 | 
 81 |   return (
 82 |     <FormItemContext.Provider value={{ id }}>
 83 |       <div ref={ref} className={cn("space-y-2", className)} {...props} />
 84 |     </FormItemContext.Provider>
 85 |   );
 86 | });
 87 | FormItem.displayName = "FormItem";
 88 | 
 89 | const FormLabel = React.forwardRef<
 90 |   React.ElementRef<typeof LabelPrimitive.Root>,
 91 |   React.ComponentPropsWithoutRef<typeof LabelPrimitive.Root>
 92 | >(({ className, ...props }, ref) => {
 93 |   const { error, formItemId } = useFormField();
 94 | 
 95 |   return (
 96 |     <Label
 97 |       ref={ref}
 98 |       className={cn(error && "text-red-500 dark:text-red-900", className)}
 99 |       htmlFor={formItemId}
100 |       {...props}
101 |     />
102 |   );
103 | });
104 | FormLabel.displayName = "FormLabel";
105 | 
/**
 * Slot that wires the wrapped control to its field: sets the control's `id`,
 * `aria-describedby` (description id, plus the message id while there is an
 * error) and `aria-invalid`.
 */
const FormControl = React.forwardRef<
  React.ElementRef<typeof Slot>,
  React.ComponentPropsWithoutRef<typeof Slot>
>((slotProps, forwardedRef) => {
  const { error, formItemId, formDescriptionId, formMessageId } =
    useFormField();

  const describedBy = error
    ? `${formDescriptionId} ${formMessageId}`
    : `${formDescriptionId}`;

  return (
    <Slot
      ref={forwardedRef}
      id={formItemId}
      aria-describedby={describedBy}
      aria-invalid={!!error}
      {...slotProps}
    />
  );
});
FormControl.displayName = "FormControl";
128 | 
/** Help text for the enclosing field, carrying the id derived for it. */
const FormDescription = React.forwardRef<
  HTMLParagraphElement,
  React.HTMLAttributes<HTMLParagraphElement>
>((descriptionProps, forwardedRef) => {
  const { className, ...rest } = descriptionProps;
  const { formDescriptionId } = useFormField();
  const classes = cn("text-sm text-slate-500 dark:text-slate-400", className);

  return (
    <p
      ref={forwardedRef}
      id={formDescriptionId}
      className={classes}
      {...rest}
    />
  );
});
FormDescription.displayName = "FormDescription";
145 | 
/**
 * Validation message for the enclosing field.
 *
 * Shows the field error's message when there is an error, otherwise the
 * supplied `children`. Renders nothing when the resulting body is empty.
 */
const FormMessage = React.forwardRef<
  HTMLParagraphElement,
  React.HTMLAttributes<HTMLParagraphElement>
>(({ className, children, ...props }, ref) => {
  const { error, formMessageId } = useFormField();
  // An error without a message must not be stringified to the literal text
  // "undefined"; fall back to an empty string so the guard below hides it.
  const body = error ? String(error?.message ?? "") : children;

  if (!body) {
    return null;
  }

  return (
    <p
      ref={ref}
      id={formMessageId}
      className={cn(
        "text-sm font-medium text-red-500 dark:text-red-900",
        className,
      )}
      {...props}
    >
      {body}
    </p>
  );
});
FormMessage.displayName = "FormMessage";
172 | 
173 | export {
174 |   useFormField,
175 |   Form,
176 |   FormItem,
177 |   FormLabel,
178 |   FormControl,
179 |   FormDescription,
180 |   FormMessage,
181 |   FormField,
182 | };
183 | 


--------------------------------------------------------------------------------
/app/components/ui/input.tsx:
--------------------------------------------------------------------------------
 1 | import * as React from "react";
 2 | 
 3 | import { cn } from "~/lib/utils";
 4 | 
 5 | export interface InputProps
 6 |   extends React.InputHTMLAttributes<HTMLInputElement> {}
 7 | 
 8 | const Input = React.forwardRef<HTMLInputElement, InputProps>(
 9 |   ({ className, type, ...props }, ref) => {
10 |     return (
11 |       <input
12 |         type={type}
13 |         className={cn(
14 |           "flex h-10 w-full border border-slate-200 bg-white px-3 py-2 text-sm ring-offset-white file:border-0 file:bg-transparent file:text-sm file:font-medium placeholder:text-slate-500 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-slate-950 focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50 dark:border-slate-800 dark:bg-slate-950 dark:ring-offset-slate-950 dark:placeholder:text-slate-400 dark:focus-visible:ring-slate-300",
15 |           className,
16 |         )}
17 |         ref={ref}
18 |         {...props}
19 |       />
20 |     );
21 |   },
22 | );
23 | Input.displayName = "Input";
24 | 
25 | export { Input };
26 | 


--------------------------------------------------------------------------------
/app/components/ui/label.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import * as LabelPrimitive from "@radix-ui/react-label";
 4 | import { VariantProps, cva } from "class-variance-authority";
 5 | import * as React from "react";
 6 | 
 7 | import { cn } from "~/lib/utils";
 8 | 
 9 | const labelVariants = cva(
10 |   "text-sm font-medium leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70",
11 | );
12 | 
13 | const Label = React.forwardRef<
14 |   React.ElementRef<typeof LabelPrimitive.Root>,
15 |   React.ComponentPropsWithoutRef<typeof LabelPrimitive.Root> &
16 |     VariantProps<typeof labelVariants>
17 | >(({ className, ...props }, ref) => (
18 |   <LabelPrimitive.Root
19 |     ref={ref}
20 |     className={cn(labelVariants(), className)}
21 |     {...props}
22 |   />
23 | ));
24 | Label.displayName = LabelPrimitive.Root.displayName;
25 | 
26 | export { Label };
27 | 


--------------------------------------------------------------------------------
/app/components/ui/table.tsx:
--------------------------------------------------------------------------------
  1 | import * as React from "react";
  2 | 
  3 | import { cn } from "~/lib/utils";
  4 | 
  5 | const Table = React.forwardRef<
  6 |   HTMLTableElement,
  7 |   React.DetailedHTMLProps<
  8 |     React.TableHTMLAttributes<HTMLTableElement>,
  9 |     HTMLTableElement
 10 |   >
 11 | >(({ className, ...props }, ref) => (
 12 |   <div className="relative w-full overflow-auto">
 13 |     <table
 14 |       ref={ref}
 15 |       className={cn("w-full caption-bottom text-sm", className)}
 16 |       {...props}
 17 |     />
 18 |   </div>
 19 | ));
 20 | Table.displayName = "Table";
 21 | 
 22 | const TableHeader = React.forwardRef<
 23 |   HTMLTableSectionElement,
 24 |   React.HTMLAttributes<HTMLTableSectionElement>
 25 | >(({ className, ...props }, ref) => (
 26 |   <thead ref={ref} className={cn("[&_tr]:border-b", className)} {...props} />
 27 | ));
 28 | TableHeader.displayName = "TableHeader";
 29 | 
 30 | const TableBody = React.forwardRef<
 31 |   HTMLTableSectionElement,
 32 |   React.HTMLAttributes<HTMLTableSectionElement>
 33 | >(({ className, ...props }, ref) => (
 34 |   <tbody
 35 |     ref={ref}
 36 |     className={cn("[&_tr:last-child]:border-0", className)}
 37 |     {...props}
 38 |   />
 39 | ));
 40 | TableBody.displayName = "TableBody";
 41 | 
 42 | const TableFooter = React.forwardRef<
 43 |   HTMLTableSectionElement,
 44 |   React.HTMLAttributes<HTMLTableSectionElement>
 45 | >(({ className, ...props }, ref) => (
 46 |   <tfoot
 47 |     ref={ref}
 48 |     className={cn(
 49 |       "border-t bg-slate-100/50 font-medium [&>tr]:last:border-b-0 dark:bg-slate-800/50",
 50 |       className,
 51 |     )}
 52 |     {...props}
 53 |   />
 54 | ));
 55 | TableFooter.displayName = "TableFooter";
 56 | 
 57 | const TableRow = React.forwardRef<
 58 |   HTMLTableRowElement,
 59 |   React.HTMLAttributes<HTMLTableRowElement>
 60 | >(({ className, ...props }, ref) => (
 61 |   <tr
 62 |     ref={ref}
 63 |     className={cn(
 64 |       "border-b transition-colors hover:bg-slate-100/50 data-[state=selected]:bg-slate-100 dark:hover:bg-slate-800/50 dark:data-[state=selected]:bg-slate-800",
 65 |       className,
 66 |     )}
 67 |     {...props}
 68 |   />
 69 | ));
 70 | TableRow.displayName = "TableRow";
 71 | 
 72 | const TableHead = React.forwardRef<
 73 |   HTMLTableCellElement,
 74 |   React.ThHTMLAttributes<HTMLTableCellElement>
 75 | >(({ className, ...props }, ref) => (
 76 |   <th
 77 |     ref={ref}
 78 |     className={cn(
 79 |       "h-10 px-0.5 text-left align-middle font-medium text-slate-500 [&:has([role=checkbox])]:pr-0 dark:text-slate-400",
 80 |       className,
 81 |     )}
 82 |     {...props}
 83 |   />
 84 | ));
 85 | TableHead.displayName = "TableHead";
 86 | 
 87 | const TableCell = React.forwardRef<
 88 |   HTMLTableCellElement,
 89 |   React.TdHTMLAttributes<HTMLTableCellElement>
 90 | >(({ className, ...props }, ref) => (
 91 |   <td
 92 |     ref={ref}
 93 |     className={cn(
 94 |       "px-0.5 py-1 align-middle [&:has([role=checkbox])]:pr-0",
 95 |       className,
 96 |     )}
 97 |     {...props}
 98 |   />
 99 | ));
100 | TableCell.displayName = "TableCell";
101 | 
/** Styled `<caption>`: small, muted text with a top margin. */
const TableCaption = React.forwardRef<
  HTMLTableCaptionElement,
  React.HTMLAttributes<HTMLTableCaptionElement>
>((captionProps, forwardedRef) => {
  const { className, ...rest } = captionProps;
  const classes = cn(
    "mt-4 text-sm text-slate-500 dark:text-slate-400",
    className,
  );

  return <caption ref={forwardedRef} className={classes} {...rest} />;
});
TableCaption.displayName = "TableCaption";
113 | 
114 | export {
115 |   Table,
116 |   TableHeader,
117 |   TableBody,
118 |   TableFooter,
119 |   TableHead,
120 |   TableRow,
121 |   TableCell,
122 |   TableCaption,
123 | };
124 | 


--------------------------------------------------------------------------------
/app/components/ui/tabs.tsx:
--------------------------------------------------------------------------------
 1 | "use client";
 2 | 
 3 | import * as TabsPrimitive from "@radix-ui/react-tabs";
 4 | import * as React from "react";
 5 | 
 6 | import { cn } from "~/lib/utils";
 7 | 
 8 | const Tabs = TabsPrimitive.Root;
 9 | 
10 | const TabsList = React.forwardRef<
11 |   React.ElementRef<typeof TabsPrimitive.List>,
12 |   React.ComponentPropsWithoutRef<typeof TabsPrimitive.List>
13 | >(({ className, ...props }, ref) => (
14 |   <TabsPrimitive.List
15 |     ref={ref}
16 |     className={cn(
17 |       "inline-flex h-10 items-center justify-center bg-slate-100 p-1 text-slate-500 dark:bg-slate-800 dark:text-slate-400",
18 |       className,
19 |     )}
20 |     {...props}
21 |   />
22 | ));
23 | TabsList.displayName = TabsPrimitive.List.displayName;
24 | 
/**
 * Styled wrapper around the Radix tabs trigger, with distinct colours for the
 * `data-state=active` state.
 */
const TabsTrigger = React.forwardRef<
  React.ElementRef<typeof TabsPrimitive.Trigger>,
  React.ComponentPropsWithoutRef<typeof TabsPrimitive.Trigger>
>((triggerProps, forwardedRef) => {
  const { className, ...rest } = triggerProps;
  const classes = cn(
    "inline-flex items-center justify-center whitespace-nowrap px-3 py-1.5 text-sm font-medium ring-offset-white transition-all focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-slate-950 focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 data-[state=active]:bg-white data-[state=active]:text-slate-950 data-[state=active]:shadow-sm dark:ring-offset-slate-950 dark:focus-visible:ring-slate-300 dark:data-[state=active]:bg-slate-950 dark:data-[state=active]:text-slate-50",
    className,
  );

  return (
    <TabsPrimitive.Trigger ref={forwardedRef} className={classes} {...rest} />
  );
});
TabsTrigger.displayName = TabsPrimitive.Trigger.displayName;
39 | 
/** Styled wrapper around the Radix tabs content panel. */
const TabsContent = React.forwardRef<
  React.ElementRef<typeof TabsPrimitive.Content>,
  React.ComponentPropsWithoutRef<typeof TabsPrimitive.Content>
>((contentProps, forwardedRef) => {
  const { className, ...rest } = contentProps;
  const classes = cn(
    "mt-2 ring-offset-white focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-slate-950 focus-visible:ring-offset-2 dark:ring-offset-slate-950 dark:focus-visible:ring-slate-300",
    className,
  );

  return (
    <TabsPrimitive.Content ref={forwardedRef} className={classes} {...rest} />
  );
});
TabsContent.displayName = TabsPrimitive.Content.displayName;
54 | 
55 | export { Tabs, TabsList, TabsTrigger, TabsContent };
56 | 


--------------------------------------------------------------------------------
/app/components/ui/textarea.tsx:
--------------------------------------------------------------------------------
 1 | import * as React from "react";
 2 | 
 3 | import { cn } from "~/lib/utils";
 4 | 
 5 | export interface TextareaProps
 6 |   extends React.TextareaHTMLAttributes<HTMLTextAreaElement> {}
 7 | 
 8 | const Textarea = React.forwardRef<HTMLTextAreaElement, TextareaProps>(
 9 |   ({ className, ...props }, ref) => {
10 |     return (
11 |       <textarea
12 |         className={cn(
13 |           "flex min-h-[80px] w-full border border-slate-200 bg-white px-3 py-2 text-sm ring-offset-white placeholder:text-slate-500 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-slate-950 focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50 dark:border-slate-800 dark:bg-slate-950 dark:ring-offset-slate-950 dark:placeholder:text-slate-400 dark:focus-visible:ring-slate-300",
14 |           className,
15 |         )}
16 |         ref={ref}
17 |         {...props}
18 |       />
19 |     );
20 |   },
21 | );
22 | Textarea.displayName = "Textarea";
23 | 
24 | export { Textarea };
25 | 


--------------------------------------------------------------------------------
/app/lib/api.ts:
--------------------------------------------------------------------------------
  1 | import { z } from "zod";
  2 | import {
  3 |   latentActivationsSchema,
  4 |   latentExampleSchema,
  5 |   layerHistogramsSchema,
  6 |   logitChangesSchema,
  7 |   maxLogitsSchema,
  8 |   tokenSchema,
  9 | } from "./models";
 10 | 
 11 | const isDev = process.env.NODE_ENV === "development";
 12 | 
 13 | const API_URL = isDev ? "http://localhost:3000/api/py" : "";
 14 | 
 15 | export async function getParameters() {
 16 |   const response = await fetch(`${API_URL}/params`, {
 17 |     method: "GET",
 18 |     headers: {
 19 |       "Content-Type": "application/json",
 20 |     },
 21 |   });
 22 |   const data: unknown = await response.json();
 23 |   return z.record(z.string(), z.any()).parse(data);
 24 | }
 25 | 
 26 | export async function getPromptTokens(prompt: string) {
 27 |   const response = await fetch(`${API_URL}/prompt/tokens`, {
 28 |     method: "POST",
 29 |     headers: {
 30 |       "Content-Type": "application/json",
 31 |     },
 32 |     body: JSON.stringify({ prompt }),
 33 |   });
 34 |   const data: unknown = await response.json();
 35 |   return z.array(tokenSchema).parse(data);
 36 | }
 37 | 
 38 | export async function getPromptLatentActivations(prompt: string) {
 39 |   const response = await fetch(`${API_URL}/prompt/latent-activations`, {
 40 |     method: "POST",
 41 |     headers: {
 42 |       "Content-Type": "application/json",
 43 |     },
 44 |     body: JSON.stringify({ prompt }),
 45 |   });
 46 |   const data: unknown = await response.json();
 47 |   return latentActivationsSchema.parse(data);
 48 | }
 49 | 
 50 | export async function getPromptLayerHistograms(prompt: string) {
 51 |   const response = await fetch(`${API_URL}/prompt/layer-histograms`, {
 52 |     method: "POST",
 53 |     headers: {
 54 |       "Content-Type": "application/json",
 55 |     },
 56 |     body: JSON.stringify({ prompt }),
 57 |   });
 58 |   const data: unknown = await response.json();
 59 |   return layerHistogramsSchema.parse(data);
 60 | }
 61 | 
 62 | export async function getPromptLogitsInput(prompt: string) {
 63 |   const response = await fetch(`${API_URL}/prompt/logits-input`, {
 64 |     method: "POST",
 65 |     headers: {
 66 |       "Content-Type": "application/json",
 67 |     },
 68 |     body: JSON.stringify({ prompt }),
 69 |   });
 70 |   const data: unknown = await response.json();
 71 |   return maxLogitsSchema.parse(data);
 72 | }
 73 | 
 74 | export async function getPromptLogitsRecon(prompt: string, layer: number) {
 75 |   const response = await fetch(`${API_URL}/prompt/logits-recon`, {
 76 |     method: "POST",
 77 |     headers: {
 78 |       "Content-Type": "application/json",
 79 |     },
 80 |     body: JSON.stringify({ prompt, layer }),
 81 |   });
 82 |   const data: unknown = await response.json();
 83 |   return z.tuple([maxLogitsSchema, logitChangesSchema]).parse(data);
 84 | }
 85 | 
 86 | export async function getPromptLogitsSteer(
 87 |   prompt: string,
 88 |   latent: number,
 89 |   layer: number,
 90 |   factor: number,
 91 | ) {
 92 |   const response = await fetch(`${API_URL}/prompt/logits-steer`, {
 93 |     method: "POST",
 94 |     headers: {
 95 |       "Content-Type": "application/json",
 96 |     },
 97 |     body: JSON.stringify({ prompt, latent, layer, factor }),
 98 |   });
 99 |   const data: unknown = await response.json();
100 |   return z.tuple([maxLogitsSchema, logitChangesSchema]).parse(data);
101 | }
102 | 
103 | export async function getExamples(latent: number, layer: number) {
104 |   const response = await fetch(`${API_URL}/examples`, {
105 |     method: "POST",
106 |     headers: {
107 |       "Content-Type": "application/json",
108 |     },
109 |     body: JSON.stringify({ latent, layer }),
110 |   });
111 |   const data: unknown = await response.json();
112 |   return z.array(latentExampleSchema).parse(data);
113 | }
114 | 


--------------------------------------------------------------------------------
/app/lib/format.ts:
--------------------------------------------------------------------------------
 1 | export const toUnitInterval = (value: number) => 1 / (1 + Math.exp(-value));
 2 | 
 3 | const fmtSigned = Intl.NumberFormat("en-US", {
 4 |   signDisplay: "always",
 5 |   minimumFractionDigits: 3,
 6 | });
 7 | 
 8 | export const toSigned = (value: number) => fmtSigned.format(value);
 9 | 
// Space-like code points, written only as escapes: the previous literal had
// invisible zero-width characters pasted between the escapes, which made the
// pattern impossible to review. U+200B is still matched explicitly.
const REGEX_SPACE =
  /[\u0020\u00a0\u1680\u180e\u2000-\u200a\u200b\u200c\u200d\u202f\u205f\u3000\ufeff]/g;

// Horizontal tab, vertical tab and form feed.
const REGEX_TAB = /[\u0009\u000b\u000c]/g;

// Line feed, carriage return and next-line (U+0085).
const REGEX_NEWLINE = /[\u000a\u000d\u0085]/g;

/**
 * Replaces whitespace characters with visible glyphs.
 *
 * Every space-like character becomes `␣`, every tab-like character becomes
 * `⇥`, and every newline-like character becomes `↵`. Each character is
 * replaced individually, so a CR+LF pair yields two `↵` glyphs.
 *
 * @param value - Text to make printable.
 * @returns `value` with whitespace substituted; other characters unchanged.
 */
export function escapeWhitespace(value: string): string {
  return value
    .replace(REGEX_SPACE, "␣")
    .replace(REGEX_TAB, "⇥")
    .replace(REGEX_NEWLINE, "↵");
}
23 | 


--------------------------------------------------------------------------------
/app/lib/models.ts:
--------------------------------------------------------------------------------
 1 | // See `mlsae/api/models.py` for the corresponding Python dataclasses.
 2 | 
 3 | import { z } from "zod";
 4 | 
// A single token: integer `id`, its `token` string and integer `pos`.
export const tokenSchema = z.object({
  id: z.number().int(),
  token: z.string(),
  pos: z.number().int(),
});

// A token with its `logit`; `prob` is a number or null.
const logitSchema = z.object({
  id: z.number().int(),
  token: z.string(),
  logit: z.number(),
  prob: z.number().nullable(),
});

// Nested lists of logit entries under `max`.
// NOTE(review): the meaning of the two array axes is not visible here;
// confirm against `mlsae/api/models.py`.
export const maxLogitsSchema = z.object({
  max: z.array(z.array(logitSchema)),
});

// Nested lists of logit entries under both `max` and `min`.
export const logitChangesSchema = z.object({
  max: z.array(z.array(logitSchema)),
  min: z.array(z.array(logitSchema)),
});

// `values` is a three-level nested number array and `max` a two-level one.
// NOTE(review): axis order is not established in this file; confirm against
// the Python dataclasses.
export const latentActivationsSchema = z.object({
  values: z.array(z.array(z.array(z.number()))),
  max: z.array(z.array(z.number())),
});

// `values` is a two-level nested array of integers; `edges` is a flat number
// array.
export const layerHistogramsSchema = z.object({
  values: z.array(z.array(z.number().int())),
  edges: z.array(z.number()),
});

// One example for a latent: scalar fields (`latent`, `layer`, `token_id`,
// `token`, `act`) plus the parallel-looking arrays `token_ids`, `tokens` and
// `acts`.
// NOTE(review): the schema does not enforce that the arrays have equal
// length; confirm whether the API guarantees it.
export const latentExampleSchema = z.object({
  latent: z.number().int(),
  layer: z.number().int(),
  token_id: z.number().int(),
  token: z.string(),
  act: z.number(),
  token_ids: z.array(z.number().int()),
  tokens: z.array(z.string()),
  acts: z.array(z.number()),
});

// Static types inferred from the schemas above.
export type TokenType = z.infer<typeof tokenSchema>;
export type LogitType = z.infer<typeof logitSchema>;
export type MaxLogitsType = z.infer<typeof maxLogitsSchema>;
export type LogitChangesType = z.infer<typeof logitChangesSchema>;
export type LatentActivationsType = z.infer<typeof latentActivationsSchema>;
export type LayerHistogramsType = z.infer<typeof layerHistogramsSchema>;
export type LatentExampleType = z.infer<typeof latentExampleSchema>;
55 | 


--------------------------------------------------------------------------------
/app/lib/utils.ts:
--------------------------------------------------------------------------------
1 | import { ClassValue, clsx } from "clsx";
2 | import { twMerge } from "tailwind-merge";
3 | 
/**
 * Builds a class string from `inputs` with `clsx`, then passes the result
 * through `twMerge`.
 */
export function cn(...inputs: ClassValue[]) {
  const joined = clsx(inputs);
  return twMerge(joined);
}
7 | 


--------------------------------------------------------------------------------
/app/next.config.js:
--------------------------------------------------------------------------------
 1 | const isDev = process.env.NODE_ENV === "development";
 2 | 
 3 | const baseUrl = isDev ? "http://127.0.0.1:8001" : "";
 4 | 
 5 | /** @type {import('next').NextConfig} */
 6 | const nextConfig = {
 7 |   rewrites: async () => {
 8 |     return [
 9 |       {
10 |         source: `/api/py/:path*`,
11 |         destination: `${baseUrl}/api/py/:path*`,
12 |       },
13 |     ];
14 |   },
15 | };
16 | 
17 | module.exports = nextConfig;
18 | 


--------------------------------------------------------------------------------
/app/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "dependencies": {
 3 |     "@hookform/resolvers": "^3.9.0",
 4 |     "@radix-ui/react-label": "^2.1.0",
 5 |     "@radix-ui/react-slot": "^1.1.0",
 6 |     "@radix-ui/react-tabs": "^1.1.0",
 7 |     "@types/react": "^18.3.3",
 8 |     "@types/react-dom": "^18.3.0",
 9 |     "autoprefixer": "^10.4.19",
10 |     "class-variance-authority": "^0.7.0",
11 |     "clsx": "^2.1.1",
12 |     "d3-scale": "^4.0.2",
13 |     "eslint": "^8.57.0",
14 |     "eslint-config-next": "^14.2.3",
15 |     "eslint-config-prettier": "^9.1.0",
16 |     "next": "^14.2.3",
17 |     "postcss": "^8.4.38",
18 |     "react": "^18.3.1",
19 |     "react-dom": "^18.3.1",
20 |     "react-hook-form": "^7.52.1",
21 |     "recharts": "^2.12.7",
22 |     "swr": "^2.2.5",
23 |     "tailwind-merge": "^2.3.0",
24 |     "tailwindcss": "^3.4.4",
25 |     "usehooks-ts": "^3.1.0",
26 |     "zod": "^3.23.8"
27 |   },
28 |   "devDependencies": {
29 |     "@types/node": "^22.5.1",
30 |     "eslint-import-resolver-typescript": "^3.6.3",
31 |     "eslint-plugin-import": "^2.29.1",
32 |     "knip": "^5.29.1",
33 |     "prettier": "^3.3.3",
34 |     "typescript": "^5.5.4"
35 |   },
36 |   "name": "mlsae_app",
37 |   "private": true,
38 |   "scripts": {
39 |     "build": "next build",
40 |     "dev": "next dev",
41 |     "knip": "knip",
42 |     "lint": "next lint",
43 |     "start": "next start"
44 |   },
45 |   "version": "0.1.0",
46 |   "volta": {
47 |     "node": "18.20.4",
48 |     "npm": "10.8.2"
49 |   }
50 | }
51 | 


--------------------------------------------------------------------------------
/app/postcss.config.js:
--------------------------------------------------------------------------------
// PostCSS configuration: enables the `tailwindcss` and `autoprefixer` plugins,
// each with an empty options object.
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
};
7 | 


--------------------------------------------------------------------------------
/app/tailwind.config.js:
--------------------------------------------------------------------------------
// Tailwind configuration: only the `content` globs are set; no theme
// extension or plugins are declared.
// NOTE(review): no `pages/` directory appears in the visible project tree, so
// the third glob presumably matches nothing. Confirm before removing it.
/** @type {import('tailwindcss').Config} */
module.exports = {
  content: [
    "./app/**/*.{js,ts,jsx,tsx,mdx}",
    "./components/**/*.{js,ts,jsx,tsx,mdx}",
    "./pages/**/*.{js,ts,jsx,tsx,mdx}",
  ],
};
9 | 


--------------------------------------------------------------------------------
/app/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "es5",
 4 |     "lib": ["dom", "dom.iterable", "esnext"],
 5 |     "allowJs": true,
 6 |     "skipLibCheck": true,
 7 |     "strict": true,
 8 |     "forceConsistentCasingInFileNames": true,
 9 |     "noEmit": true,
10 |     "esModuleInterop": true,
11 |     "module": "esnext",
12 |     "moduleResolution": "node",
13 |     "resolveJsonModule": true,
14 |     "isolatedModules": true,
15 |     "jsx": "preserve",
16 |     "incremental": true,
17 |     "plugins": [
18 |       {
19 |         "name": "next"
20 |       }
21 |     ],
22 |     "paths": {
23 |       "~/*": ["./*"]
24 |     }
25 |   },
26 |   "include": [
27 |     ".next/types/**/*.ts",
28 |     "**/*.ts",
29 |     "**/*.tsx",
30 |     "next-env.d.ts",
31 |     "next.config.js"
32 |   ],
33 |   "exclude": ["node_modules"]
34 | }
35 | 


--------------------------------------------------------------------------------
/citation.bib:
--------------------------------------------------------------------------------
 1 | @misc{lawson_residual_2024,
 2 |   title         = {Residual {{Stream Analysis}} with {{Multi-Layer SAEs}}},
 3 |   author        = {Lawson, Tim and Farnik, Lucy and Houghton, Conor and Aitchison, Laurence},
 4 |   year          = {2024},
 5 |   month         = oct,
 6 |   number        = {arXiv:2409.04185},
 7 |   eprint        = {2409.04185},
 8 |   primaryclass  = {cs},
 9 |   publisher     = {arXiv},
10 |   doi           = {10.48550/arXiv.2409.04185},
11 |   urldate       = {2024-10-08},
12 |   archiveprefix = {arXiv}
13 | }


--------------------------------------------------------------------------------
/figures/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tim-lawson/mlsae/03ad37a0a1b4541d763859cb0c7c9ccb7ce67867/figures/__init__.py


--------------------------------------------------------------------------------
/figures/embed_sim.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from dataclasses import dataclass
  3 | from pathlib import Path
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | import torch
  8 | from simple_parsing import field, parse
  9 | from transformers import AutoTokenizer, GPTNeoXForCausalLM
 10 | 
 11 | from mlsae.model import MLSAETransformer
 12 | from mlsae.trainer.config import SweepConfig
 13 | from mlsae.utils import get_device, get_repo_id, normalize
 14 | 
 15 | 
# Options for the embedding-similarity script, extending `SweepConfig`.
# The string literal under each field is that field's description.
@dataclass
class Config(SweepConfig):
    filename: str = "embed_sim.csv"
    """The name of the file to save the results to."""

    # An empty list means "use every latent" (see `get_similar_embeds`).
    latents: list[int] = field(default_factory=lambda: [])
    """The latent indices to find the most similar embeddings to."""

    n_embeds: int = 8
    """The number of most similar embeddings to save."""

    seed: int = 42
    """The seed for global random state."""
 29 | 
 30 | 
 31 | @torch.no_grad()
 32 | def get_similar_embeds(
 33 |     config: Config, repo_id: str, model_name: str, device: torch.device
 34 | ) -> tuple[torch.Tensor, torch.Tensor]:
 35 |     tokenizer = AutoTokenizer.from_pretrained(model_name)
 36 |     mlsae = MLSAETransformer.from_pretrained(repo_id).to(device).autoencoder
 37 |     W_dec = normalize(mlsae.decoder.weight)
 38 |     if len(config.latents) > 0:
 39 |         W_dec = W_dec[:, config.latents]
 40 |     latents = (
 41 |         config.latents if len(config.latents) > 0 else list(range(mlsae.n_latents))
 42 |     )
 43 | 
 44 |     def save_csv(topk: torch.return_types.topk, path: Path | str):
 45 |         rows = [
 46 |             {
 47 |                 "latent": latent,
 48 |                 "token": tokenizer.decode(topk.indices[embed_index, latent_index]),
 49 |                 "sim": topk.values[embed_index, latent_index].detach().item(),
 50 |             }
 51 |             for latent_index, latent in enumerate(latents)
 52 |             for embed_index in range(config.n_embeds)
 53 |         ]
 54 |         pd.DataFrame(rows).to_csv(path, index=False)
 55 | 
 56 |     model: GPTNeoXForCausalLM = GPTNeoXForCausalLM.from_pretrained(model_name)  # type: ignore
 57 |     embed_in = normalize(model.get_input_embeddings().weight.to(device), dim=1)
 58 |     embed_out = normalize(model.get_output_embeddings().weight.to(device), dim=1)
 59 | 
 60 |     topk_in = torch.topk(embed_in @ W_dec, k=config.n_embeds, dim=0)
 61 |     topk_out = torch.topk(embed_out @ W_dec, k=config.n_embeds, dim=0)
 62 | 
 63 |     repo_id = repo_id.split("/")[-1]
 64 |     save_csv(topk_in, os.path.join("out", f"embed_in_cos_sim_{repo_id}.csv"))
 65 |     save_csv(topk_out, os.path.join("out", f"embed_out_cos_sim_{repo_id}.csv"))
 66 | 
 67 |     return topk_in.values[0, :], topk_out.values[0, :]
 68 | 
 69 | 
 70 | def main(
 71 |     config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
 72 | ) -> None:
 73 |     os.makedirs(out, exist_ok=True)
 74 |     rows: list[dict[str, str | int | float]] = []
 75 |     for model_name, expansion_factor, k in config:
 76 |         repo_id = get_repo_id(
 77 |             model_name=model_name,
 78 |             expansion_factor=expansion_factor,
 79 |             k=k,
 80 |             tuned_lens=config.tuned_lens,
 81 |             transformer=False,
 82 |         )
 83 |         topk_in, topk_out = get_similar_embeds(config, repo_id, model_name, device)
 84 |         n_latents = topk_in.shape[0]
 85 |         rows.append(
 86 |             {
 87 |                 "model_name": model_name,
 88 |                 "n_latents": n_latents,
 89 |                 "expansion_factor": expansion_factor,
 90 |                 "k": k,
 91 |                 "tuned_lens": config.tuned_lens,
 92 |                 "in_mean": topk_in.mean().item(),
 93 |                 "in_var": topk_in.var().item(),
 94 |                 "in_std": topk_in.std().item(),
 95 |                 "in_sem": topk_in.std().item() / np.sqrt(n_latents),
 96 |                 "out_mean": topk_out.mean().item(),
 97 |                 "out_var": topk_out.var().item(),
 98 |                 "out_std": topk_out.std().item(),
 99 |                 "out_sem": topk_out.std().item() / np.sqrt(n_latents),
100 |             }
101 |         )
102 |     pd.DataFrame(rows).to_csv(os.path.join(out, config.filename), index=False)
103 | 
104 | 
105 | if __name__ == "__main__":
106 |     main(parse(Config), get_device())
107 | 


--------------------------------------------------------------------------------
/figures/entropy.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import os
 3 | from dataclasses import dataclass
 4 | 
 5 | import numpy
 6 | import pandas as pd
 7 | import torch
 8 | from simple_parsing import parse
 9 | 
10 | from figures.test import parse_mlsae_repo_id
11 | from mlsae.analysis.dists import Dists
12 | from mlsae.trainer import SweepConfig
13 | from mlsae.utils import get_device
14 | 
15 | 
@dataclass
class Config(SweepConfig):
    """Command-line options for the entropy script (sweep options plus filename)."""

    # Name of the summary CSV written inside the output directory by `main`.
    filename: str = "entropy.csv"
    """The filename to save the results to."""
20 | 
21 | 
def main(
    config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """Summarize and histogram the entropies for every swept model.

    For each repository, the non-NaN values of ``Dists.entropies`` are reduced
    to summary statistics (one row of ``config.filename``) and binned into a
    histogram saved as ``entropy_<repo>.csv``. Both files go to ``out``.
    """
    os.makedirs(out, exist_ok=True)
    rows: list[dict[str, str | int | float]] = []
    for repo_id in config.repo_ids(transformer=True, tuned_lens=config.tuned_lens):
        dists = Dists.load(repo_id, device)
        values = dists.entropies
        values = values[~torch.isnan(values)]

        # Only the last path component is used from here on
        repo_id = repo_id.split("/")[-1]
        model_name, expansion_factor, k, tuned_lens = parse_mlsae_repo_id(repo_id)

        # log(n_layers) is the upper end of the histogram range and the
        # denominator of the "rel" column
        max_entropy = math.log(dists.n_layers)
        std = values.std().item()

        rows.append(
            {
                "model_name": model_name,
                "n_layers": dists.n_layers,
                "n_latents": dists.n_latents,
                "expansion_factor": expansion_factor,
                "k": k,
                "tuned_lens": tuned_lens,
                "mean": values.mean().item(),
                "var": values.var().item(),
                "std": std,
                "sem": std / values.size(0) ** 0.5,
                "rel": values.mean().item() / max_entropy,
            }
        )

        hist, bins = numpy.histogram(
            values.cpu().numpy(),
            bins=dists.n_latents // expansion_factor,
            range=(0, max_entropy),
        )
        hist = numpy.append(hist, 0)  # bins has one more element
        # `repo_id` is already the bare repository name. The original re-split
        # it inside the f-string with nested double quotes, which is a
        # SyntaxError before Python 3.12.
        pd.DataFrame({"bins": bins, "hist": hist}).to_csv(
            os.path.join(out, f"entropy_{repo_id}.csv"), index=False
        )

    pd.DataFrame(rows).to_csv(os.path.join(out, config.filename), index=False)


if __name__ == "__main__":
    main(parse(Config), get_device())
66 | 


--------------------------------------------------------------------------------
/figures/heatmap.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from matplotlib import pyplot as plt
 3 | from matplotlib.colors import Colormap, Normalize
 4 | 
 5 | 
 6 | def save_heatmap(
 7 |     data: torch.Tensor,
 8 |     filename: str,
 9 |     figsize: tuple[float, float] = (5.5, 1.25),
10 |     dpi: int = 1200,
11 |     cmap: str | Colormap | None = "magma_r",
12 |     norm: str | Normalize | None = None,
13 | ) -> None:
14 |     # Exclude latents with only NaN values
15 |     data = data[:, ~torch.all(data.isnan(), dim=0)]
16 | 
17 |     n_layers, n_latents = data.shape
18 |     extent = (0, n_latents, 0, n_layers)
19 | 
20 |     plt.rcParams.update({"axes.linewidth": 0})
21 |     fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
22 | 
23 |     ax.imshow(
24 |         data,
25 |         cmap=cmap,
26 |         norm=norm,
27 |         aspect="auto",
28 |         extent=extent,
29 |         interpolation="nearest",
30 |     )
31 |     ax.set_axis_off()
32 | 
33 |     fig.savefig(filename, format="pdf", bbox_inches="tight", pad_inches=0)
34 |     plt.close(fig)
35 | 


--------------------------------------------------------------------------------
/figures/heatmap_aggregate.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | import torch
 5 | from matplotlib.colors import PowerNorm
 6 | from simple_parsing import parse
 7 | 
 8 | from figures.heatmap import save_heatmap
 9 | from mlsae.analysis.dists import Dists
10 | from mlsae.trainer import SweepConfig
11 | from mlsae.utils import get_device
12 | 
13 | 
@dataclass
class Config(SweepConfig):
    """Command-line options for the aggregate heatmap script."""

    # One of "counts", "totals" or "probs"; any other value makes
    # `get_heatmap_data` raise a ValueError.
    mode: str = "probs"
    """Whether to plot counts, totals, or probabilities."""

    # Passed to matplotlib's PowerNorm in `main` unless mode is "probs".
    gamma: float = 0.5
    """Gamma value for PowerNorm. Only applies to counts and totals."""
21 | 
22 | 
23 | def get_heatmap_data(dists: Dists, mode: str) -> torch.Tensor:
24 |     if mode == "counts":
25 |         return dists.counts
26 |     if mode == "totals":
27 |         return dists.totals
28 |     if mode == "probs":
29 |         return dists.probs
30 |     raise ValueError(f"Invalid mode: {mode}")
31 | 
32 | 
def get_heatmap_filename(repo_id: str, mode: str) -> str:
    """Build the PDF filename from the mode and the last component of ``repo_id``."""
    repo_name = repo_id.rpartition("/")[2]
    return f"heatmap_aggregate_{mode}_{repo_name}.pdf"
35 | 
36 | 
def main(
    repo_id: str,
    config: Config,
    device: torch.device,
    out: str | os.PathLike[str] = ".out",
):
    """Save the aggregate heatmap for a single repository.

    The columns (latents) are ordered by descending ``Dists.layer_mean``.
    A ``PowerNorm`` with ``config.gamma`` is used unless the mode is "probs".
    """
    os.makedirs(out, exist_ok=True)

    if config.mode == "probs":
        norm = None
    else:
        norm = PowerNorm(config.gamma)

    dists = Dists.load(repo_id, device)
    order = dists.layer_mean.sort(descending=True).indices
    data = get_heatmap_data(dists, config.mode)[:, order].cpu()
    path = os.path.join(out, get_heatmap_filename(repo_id, config.mode))
    save_heatmap(data, path, norm=norm)
52 | 
53 | 
def sweep(
    config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """Save one aggregate heatmap per repository in the sweep."""
    os.makedirs(out, exist_ok=True)
    repo_ids = config.repo_ids(transformer=True, tuned_lens=config.tuned_lens)
    for repo_id in repo_ids:
        main(repo_id, config, device, out)


if __name__ == "__main__":
    device = get_device()
    sweep(parse(Config), device)
65 | 


--------------------------------------------------------------------------------
/figures/heatmap_prompt.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | import torch
 5 | from matplotlib.colors import PowerNorm
 6 | from simple_parsing import parse
 7 | 
 8 | from figures.heatmap import save_heatmap
 9 | from mlsae.model import MLSAETransformer
10 | from mlsae.model.decoder import scatter_topk
11 | from mlsae.trainer import SweepConfig
12 | from mlsae.utils import get_device
13 | 
14 | 
@dataclass
class Config(SweepConfig):
    """Command-line options for the single-prompt heatmap script."""

    prompt: str = "When Mary and John went to the store, John gave a drink to"
    """The prompt to generate heatmaps for."""

    # Only read in "counts" mode, where activations are compared against it.
    dead_threshold: float = 1e-3
    """The threshold activation to exclude latents."""

    # One of "counts", "totals" or "probs"; anything else raises a ValueError.
    mode: str = "probs"
    """Whether to plot counts, totals, or probabilities."""

    # Passed to matplotlib's PowerNorm in `sweep` unless mode is "probs".
    gamma: float = 0.5
    """Gamma value for PowerNorm. Only applies to counts and totals."""
28 | 
29 | 
@torch.no_grad()
def get_heatmap_data(
    config: Config, repo_id: str, device: torch.device | str
) -> torch.Tensor:
    """Compute the per-layer latent activations for a single prompt.

    The prompt is tokenized, padded to the model's maximum length and passed
    through the transformer and the autoencoder. Padding positions are dropped
    and the activations are reduced over the remaining token positions
    according to ``config.mode``:

    - ``"counts"``: the number of positions at which the activation exceeds
      ``config.dead_threshold``.
    - ``"totals"``: the sum of the activations.
    - ``"probs"``: the totals divided by their sum over layers.

    Latents that are not positive at any layer are removed, and the remaining
    columns are sorted by descending probability-weighted mean layer index.

    Returns:
        A 2-D tensor indexed by (layer, latent).

    Raises:
        ValueError: If ``config.mode`` is not one of the modes above.
    """
    # Fail before the (expensive) model download if the mode is invalid
    if config.mode not in ("counts", "totals", "probs"):
        raise ValueError(f"Invalid mode: {config.mode}")

    model = MLSAETransformer.from_pretrained(repo_id).to(device)
    tokenizer = model.transformer.tokenizer
    tokenizer.pad_token = tokenizer.eos_token
    assert tokenizer.pad_token_id is not None

    tokens = torch.tensor(
        tokenizer.encode(
            config.prompt,
            padding="max_length",
            max_length=model.max_length,
        )
    )

    # The indexing below implies a (layer, batch, position, feature) layout
    inputs = model.transformer.forward(tokens.unsqueeze(0).to(device))
    inputs = inputs[:, :, tokens.ne(tokenizer.pad_token_id), :]

    topk = model.autoencoder.forward(inputs).topk

    # Remove only the batch dimension. A bare `squeeze()` would also drop the
    # position dimension for a one-token prompt and break the sums below.
    # assumes scatter_topk keeps the leading (layer, batch, position) dims
    latents = scatter_topk(topk, model.n_latents).squeeze(1)

    totals = latents.sum(dim=1)
    probs = totals / totals.sum(dim=0, keepdim=True)

    if config.mode == "counts":
        # Count the positions above the threshold. The original summed the
        # activation values themselves, which made this a thresholded total.
        data = latents.gt(config.dead_threshold).float().sum(dim=1)
    elif config.mode == "totals":
        data = totals
    else:
        data = probs

    # Exclude latents that never activate
    mask = torch.any(data.gt(0), dim=0)
    data = data[:, mask]

    layers = torch.arange(0, model.n_layers, device=device).unsqueeze(-1)

    # Sort latents in descending order of their mean layer
    _, indices = (probs[:, mask] * layers).sum(0).sort(descending=True)

    return data[:, indices]
74 | 
75 | 
def get_heatmap_filename(repo_id: str, mode: str) -> str:
    """Build the PDF filename from the mode and the last component of ``repo_id``."""
    repo_name = repo_id.rpartition("/")[2]
    return f"heatmap_prompt_{mode}_{repo_name}.pdf"
78 | 
79 | 
def sweep(
    config: Config, device: torch.device | str, out: str | os.PathLike[str] = ".out"
) -> None:
    """Save one prompt heatmap per repository in the sweep.

    A ``PowerNorm`` with ``config.gamma`` is used unless the mode is "probs".
    """
    os.makedirs(out, exist_ok=True)

    if config.mode == "probs":
        norm = None
    else:
        norm = PowerNorm(config.gamma)

    repo_ids = config.repo_ids(transformer=True, tuned_lens=config.tuned_lens)
    for repo_id in repo_ids:
        heatmap = get_heatmap_data(config, repo_id, device).cpu()
        path = os.path.join(out, get_heatmap_filename(repo_id, config.mode))
        save_heatmap(heatmap, path, norm=norm)


if __name__ == "__main__":
    sweep(parse(Config), get_device())
96 | 


--------------------------------------------------------------------------------
/figures/layer_hist.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | import numpy
 5 | import pandas as pd
 6 | import torch
 7 | from simple_parsing import parse
 8 | 
 9 | from mlsae.analysis.dists import Dists
10 | from mlsae.trainer import SweepConfig
11 | from mlsae.utils import get_device
12 | 
13 | 
@dataclass
class Config(SweepConfig):
    """Command-line options for the layer histogram script."""

    # When set, `main` histograms |value - round(value)| over the range (0, 0.5)
    # instead of the raw values.
    noninteger: bool = False
    """Whether to plot the non-integer component of the center of mass."""
18 | 
19 | 
def main(
    config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """Save a density histogram of the mean layer per latent for each model.

    NaN entries of ``Dists.layer_mean`` are discarded. With
    ``config.noninteger`` the distance of each value to the nearest integer is
    histogrammed instead. One CSV with the bin edges and densities is written
    per repository.
    """
    os.makedirs(out, exist_ok=True)
    for repo_id in config.repo_ids(transformer=True, tuned_lens=config.tuned_lens):
        dists = Dists.load(repo_id, device)
        values = dists.layer_mean[~torch.isnan(dists.layer_mean)].cpu().numpy()

        repo_name = repo_id.split("/")[-1]
        n_bins = 16 * dists.n_layers

        if not config.noninteger:
            limits = (0, dists.n_layers - 1)
            filename = f"layer_hist_{repo_name}.csv"
        else:
            values = numpy.abs(values - numpy.round(values))
            limits = (0, 0.5)
            filename = f"layer_hist_nonint_{repo_name}.csv"

        density, edges = numpy.histogram(
            values, bins=n_bins, range=limits, density=True
        )
        # There is one more edge than bins, so pad the densities with a zero
        density = numpy.append(density, 0)
        frame = pd.DataFrame({"layer": edges, "density": density})
        frame.to_csv(os.path.join(out, filename), index=False)


if __name__ == "__main__":
    main(parse(Config), get_device())
48 | 


--------------------------------------------------------------------------------
/figures/layer_sim.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import torch
 4 | from matplotlib import pyplot as plt
 5 | from matplotlib.colors import Colormap
 6 | from simple_parsing import parse
 7 | 
 8 | from mlsae.analysis.dists import Dists
 9 | from mlsae.model import MLSAETransformer
10 | from mlsae.trainer.config import SweepConfig
11 | from mlsae.utils import get_device, normalize
12 | 
13 | 
@torch.no_grad()
def get_heatmap_data(
    repo_id: str, device: torch.device
) -> tuple[torch.Tensor, torch.Tensor]:
    """Pair mean-layer differences with decoder cosine similarities.

    For every unordered pair of distinct latents, returns the difference
    between their mean layers and the product of their normalized decoder
    weight vectors.

    Returns:
        ``(x, y)`` on the CPU, where ``x`` holds the mean-layer differences
        and ``y`` the similarities, both over the strictly upper triangle of
        the latent-by-latent matrices.
    """
    mlsae = MLSAETransformer.from_pretrained(repo_id).to(device).autoencoder
    # presumably `normalize` scales each decoder column to unit norm; confirm
    W_dec = normalize(mlsae.decoder.weight.detach())

    # Sort latents in descending order of mean layer
    dists = Dists.load(repo_id, device)
    layer_mean, indices = dists.layer_mean.sort(descending=True)
    W_dec = W_dec[:, indices]

    # Pairwise differences between mean layers, in the same (sorted) latent
    # order as the decoder columns. The original used the unsorted means here,
    # so the two matrices described different latent pairs.
    layer_diff = layer_mean.view(-1, 1) - layer_mean.view(1, -1)

    # Pairwise cosine similarities between decoder weight vectors
    cos_sim = torch.mm(W_dec.T, W_dec)

    # Remove duplicates and self-similarities
    rows, cols = torch.triu_indices(
        *cos_sim.shape, offset=1, device=cos_sim.device
    )
    x = layer_diff[rows, cols].cpu()
    y = cos_sim[rows, cols].cpu()

    return x, y
39 | 
40 | 
def save_heatmap(
    x: torch.Tensor,
    y: torch.Tensor,
    filename: str,
    figsize: tuple[float, float] = (2, 2),
    dpi: int = 300,
    cmap: str | Colormap | None = "magma_r",
) -> None:
    """Save a borderless 2-D histogram of ``y`` against ``x`` as a PDF.

    The histogram uses 64 bins over [0, 5] for ``x`` and 512 bins over
    [-0.25, 0.25] for ``y``.
    """
    n_bins = [64, 512]
    limits = [(0, 5), (-0.25, 0.25)]

    plt.rcParams.update({"axes.linewidth": 0})
    fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
    ax.hist2d(x, y, bins=n_bins, range=limits, cmap=cmap)
    ax.set_axis_off()

    fig.savefig(filename, format="pdf", bbox_inches="tight", pad_inches=0)
    plt.close(fig)
55 | 
56 | 
def main(
    config: SweepConfig, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """Save one layer-similarity heatmap per repository in the sweep."""
    os.makedirs(out, exist_ok=True)
    repo_ids = config.repo_ids(transformer=False, tuned_lens=config.tuned_lens)
    for repo_id in repo_ids:
        repo_name = repo_id.split("/")[-1]
        path = os.path.join(out, f"layer_sim_{repo_name}.pdf")
        layer_diff, cos_sim = get_heatmap_data(repo_id, device)
        save_heatmap(layer_diff, cos_sim, path)


if __name__ == "__main__":
    main(parse(SweepConfig), get_device())
69 | 


--------------------------------------------------------------------------------
/figures/layer_std.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | import pandas as pd
 5 | import torch
 6 | from simple_parsing import parse
 7 | 
 8 | from mlsae.analysis.dists import Dists, get_stats
 9 | from mlsae.trainer import SweepConfig
10 | from mlsae.utils import get_device, get_repo_id
11 | 
12 | 
@dataclass
class Config(SweepConfig):
    """Command-line options for the layer standard deviation script."""

    # Name of the summary CSV written inside the output directory by `main`.
    filename: str = "layer_std.csv"
    """The name of the file to save the results to."""
17 | 
18 | 
def main(
    config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """Write summary statistics of ``Dists.layer_std`` for every swept model.

    Each row holds the sweep parameters, the statistics returned by
    ``get_stats`` and the same statistics divided by the number of layers
    (with a ``_rel`` suffix).
    """
    os.makedirs(out, exist_ok=True)
    rows: list[dict[str, str | int | float]] = []
    for model_name, expansion_factor, k in config:
        repo_id = get_repo_id(
            model_name=model_name,
            expansion_factor=expansion_factor,
            k=k,
            tuned_lens=config.tuned_lens,
            transformer=True,
        )
        dists = Dists.load(repo_id, device)
        stats = get_stats(dists.layer_std)

        row = {
            "model_name": model_name,
            "n_layers": dists.n_layers,
            "n_latents": dists.n_latents,
            "expansion_factor": expansion_factor,
            "k": k,
            "tuned_lens": config.tuned_lens,
        }
        row.update(stats)
        # The same statistics relative to the number of layers
        row.update(
            {f"{name}_rel": value / dists.n_layers for name, value in stats.items()}
        )
        rows.append(row)

    summary = pd.DataFrame(rows)
    summary.to_csv(os.path.join(out, config.filename), index=False)


if __name__ == "__main__":
    main(parse(Config), get_device())
51 | 


--------------------------------------------------------------------------------
/figures/mmcs.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | import pandas as pd
 5 | import torch
 6 | from simple_parsing import parse
 7 | from tqdm import tqdm
 8 | 
 9 | from mlsae.model import MLSAETransformer
10 | from mlsae.trainer import SweepConfig
11 | from mlsae.utils import get_device, get_repo_id, normalize
12 | 
13 | 
@dataclass
class Config(SweepConfig):
    """Command-line options for the maximum cosine similarity script."""

    # Name of the summary CSV written inside the output directory by `main`.
    filename: str = "mmcs.csv"
    """The name of the file to save the results to."""
18 | 
19 | 
@torch.no_grad()
def get_max_cos_sim(
    model_name: str,
    expansion_factor: int,
    k: int,
    tuned_lens: bool,
    max_latents: int = 16384,
    chunk_size: int = 1024,
    device: torch.device | str = "cpu",
) -> tuple[torch.Tensor, int]:
    """Compute each latent's maximum cosine similarity to any other latent.

    The decoder weight matrix is normalized and, for every column, the largest
    product with a *different* column is returned. Self-similarities are
    excluded.

    The previous implementation had two branches that disagreed with each
    other. The full-matrix branch only compared a latent with lower-indexed
    latents and floored the result at zero. The chunked branch applied the
    triangular mask with chunk-local column indices, so it masked the wrong
    rows. Both sizes now share one code path.

    Args:
        model_name: Name of the underlying transformer.
        expansion_factor: Expansion factor of the autoencoder.
        k: Number of active latents of the autoencoder.
        tuned_lens: Whether the autoencoder was trained with a tuned lens.
        max_latents: Below this number of latents the whole similarity matrix
            is computed at once; otherwise it is computed in column chunks.
        chunk_size: Number of columns per chunk when chunking.
        device: Device to run the computation on.

    Returns:
        The per-latent maximum similarities on the CPU (``-inf`` if there is
        only a single latent) and the number of latents of the autoencoder.
    """
    repo_id = get_repo_id(
        model_name=model_name,
        expansion_factor=expansion_factor,
        k=k,
        tuned_lens=tuned_lens,
        transformer=True,
    )
    mlsae = MLSAETransformer.from_pretrained(repo_id).to(device).autoencoder
    # presumably `normalize` scales each decoder column to unit norm; confirm
    W_dec = normalize(mlsae.decoder.weight.detach())

    _, n_latents = W_dec.shape
    # Small dictionaries are handled as a single chunk (the full matrix)
    step = max(1, n_latents if n_latents < max_latents else chunk_size)
    starts = range(0, n_latents, step)

    max_cos_sim = torch.full(
        (n_latents,), float("-inf"), dtype=W_dec.dtype, device=W_dec.device
    )
    for start in tqdm(starts, disable=len(starts) <= 1):
        stop = min(start + step, n_latents)
        # Similarities of every latent (rows) with this chunk (columns)
        chunk_cos_sim = torch.mm(W_dec.T, W_dec[:, start:stop])
        # Column c of the chunk is latent start + c: exclude that entry only
        cols = torch.arange(stop - start, device=W_dec.device)
        chunk_cos_sim[start + cols, cols] = float("-inf")
        max_cos_sim[start:stop] = chunk_cos_sim.max(dim=0).values
    return max_cos_sim.cpu(), mlsae.n_latents
61 | 
62 | 
def main(
    config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """Write summary statistics of the maximum cosine similarities per model.

    One row per ``(model_name, expansion_factor, k)`` triple yielded by
    ``config`` is saved to ``config.filename`` inside ``out``.
    """
    os.makedirs(out, exist_ok=True)
    rows: list[dict[str, str | int | float]] = []
    for model_name, expansion_factor, k in config:
        max_cos_sim, n_latents = get_max_cos_sim(
            model_name=model_name,
            expansion_factor=expansion_factor,
            k=k,
            tuned_lens=config.tuned_lens,
            device=device,
        )
        std = max_cos_sim.std().item()
        row = dict(
            model_name=model_name,
            n_latents=n_latents,
            expansion_factor=expansion_factor,
            k=k,
            tuned_lens=config.tuned_lens,
            mean=max_cos_sim.mean().item(),
            var=max_cos_sim.var().item(),
            std=std,
            sem=std / max_cos_sim.size(0) ** 0.5,
        )
        rows.append(row)

    summary = pd.DataFrame(rows)
    summary.to_csv(os.path.join(out, config.filename), index=False)


if __name__ == "__main__":
    main(parse(Config), get_device())
94 | 


--------------------------------------------------------------------------------
/figures/num_layers.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from dataclasses import dataclass
 3 | 
 4 | import numpy as np
 5 | import pandas as pd
 6 | import torch
 7 | from simple_parsing import parse
 8 | 
 9 | from figures.test import parse_mlsae_repo_id
10 | from mlsae.analysis.dists import Dists
11 | from mlsae.trainer import SweepConfig
12 | from mlsae.utils import get_device
13 | 
14 | 
@dataclass
class Config(SweepConfig):
    """Command-line options for the number-of-active-layers script."""

    # Name of the summary CSV written inside the output directory by `main`.
    filename: str = "num_layers.csv"
    """The filename to save the results to."""

    # Compared with `Dists.counts` using `>=`; also part of the per-repository
    # histogram filename.
    threshold: int = 10_000
    """The minimum non-zero activations to be considered 'active' at a layer."""
22 | 
23 | 
def main(
    config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """Summarize at how many layers each latent is active, for every model.

    A latent counts as active at a layer when ``Dists.counts`` is at least
    ``config.threshold`` there. The per-latent number of active layers is
    reduced to summary statistics (one row of ``config.filename``) and to a
    histogram saved as ``num_layers_<repo>_<threshold>.csv``.
    """
    os.makedirs(out, exist_ok=True)
    rows: list[dict[str, str | int | float]] = []
    for repo_id in config.repo_ids(transformer=True, tuned_lens=config.tuned_lens):
        dists = Dists.load(repo_id, device)
        # Number of layers at which each latent reaches the threshold
        values = torch.where(dists.counts >= config.threshold, 1, 0).sum(0).float()

        repo_id = repo_id.split("/")[-1]
        model_name, expansion_factor, k, tuned_lens = parse_mlsae_repo_id(repo_id)

        std = values.std().item()
        rows.append(
            {
                "model_name": model_name,
                "n_layers": dists.n_layers,
                "n_latents": dists.n_latents,
                "expansion_factor": expansion_factor,
                "k": k,
                "tuned_lens": tuned_lens,
                "mean": values.mean().item(),
                "var": values.var().item(),
                "std": std,
                "sem": std / values.size(0) ** 0.5,
                "rel": values.mean().item() / dists.n_layers,
            }
        )

        values = values.cpu().numpy()
        # The values are integers in 0..n_layers inclusive, i.e. n_layers + 1
        # distinct values. One unit-width bin per value is needed; with only
        # n_layers bins the closed last bin merged n_layers - 1 and n_layers.
        hist, bins = np.histogram(
            values, bins=dists.n_layers + 1, range=(0, dists.n_layers + 1)
        )
        hist = np.append(hist, 0)  # bins has one more element
        pd.DataFrame({"bins": bins, "hist": hist}).to_csv(
            os.path.join(out, f"num_layers_{repo_id}_{config.threshold}.csv"),
            index=False,
        )

    pd.DataFrame(rows).to_csv(os.path.join(out, config.filename), index=False)


if __name__ == "__main__":
    main(parse(Config), get_device())
67 | 


--------------------------------------------------------------------------------
/figures/resid_sim.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | 
  4 | import einops
  5 | import pandas as pd
  6 | import torch
  7 | from simple_parsing import parse
  8 | from tqdm import tqdm
  9 | from tuned_lens import TunedLens
 10 | 
 11 | from mlsae.model import PythiaTransformer, get_test_dataloader
 12 | from mlsae.trainer import RunConfig, initialize
 13 | from mlsae.utils import get_device, normalize
 14 | 
 15 | 
 16 | class VarianceMetric:
 17 |     def __init__(
 18 |         self, size: tuple[int, ...] = (1,), device: torch.device | str = "cpu"
 19 |     ) -> None:
 20 |         self.count = 0
 21 |         self.mean = torch.zeros(size, device=device)
 22 |         self.squared = torch.zeros(size, device=device)
 23 | 
 24 |     def update(self, x: torch.Tensor) -> None:
 25 |         self.count += x.shape[0]
 26 |         delta1 = x - self.mean
 27 |         self.mean += torch.sum(delta1, dim=0) / self.count
 28 |         delta2 = x - self.mean
 29 |         self.squared += torch.sum(delta1 * delta2, dim=0)
 30 | 
 31 |     def compute(self) -> dict[str, torch.Tensor]:
 32 |         var = self.squared / (self.count - 1)
 33 |         std = var.sqrt()
 34 |         sem = std / math.sqrt(self.count)
 35 |         return dict(mean=self.mean, var=var, std=std, sem=sem)
 36 | 
 37 | 
 38 | @torch.no_grad()
 39 | def main(
 40 |     config: RunConfig, device: torch.device, out: str | os.PathLike[str] = ".out"
 41 | ) -> None:
 42 |     os.makedirs(out, exist_ok=True)
 43 |     initialize(config.seed)
 44 | 
 45 |     transformer = PythiaTransformer(
 46 |         config.model_name,
 47 |         config.data.max_length,
 48 |         config.data.batch_size,
 49 |         config.autoencoder.skip_special_tokens,
 50 |         layers=config.layers,
 51 |         device=torch.device(device),
 52 |     )
 53 |     transformer.model.to(device)  # type: ignore
 54 | 
 55 |     lens = (
 56 |         TunedLens.from_model_and_pretrained(
 57 |             transformer.model,
 58 |             transformer.model_name,
 59 |             map_location=device,
 60 |         )
 61 |         if config.autoencoder.tuned_lens
 62 |         else None
 63 |     )
 64 |     lens_name = "lens_" if lens is not None else ""
 65 | 
 66 |     def forward_lens(inputs: torch.Tensor) -> torch.Tensor:
 67 |         if lens is None:
 68 |             return inputs
 69 |         lens.to(inputs.device)
 70 |         for layer in range(transformer.n_layers):
 71 |             inputs[layer, ...] = lens.transform_hidden(inputs[layer, ...], layer)
 72 |         return inputs
 73 | 
 74 |     dataloader = get_test_dataloader(
 75 |         config.model_name,
 76 |         config.data.max_length,
 77 |         config.data.batch_size,
 78 |     )
 79 | 
 80 |     model_name = config.model_name.split("/")[-1]
 81 | 
 82 |     means = [
 83 |         VarianceMetric(size=(transformer.config.hidden_size,), device=device)
 84 |         for _ in range(transformer.n_layers)
 85 |     ]
 86 |     l2_norms = [
 87 |         VarianceMetric(size=(1,), device=device) for _ in range(transformer.n_layers)
 88 |     ]
 89 |     cos_sims = [
 90 |         VarianceMetric(size=(1,), device=device)
 91 |         for _ in range(transformer.n_layers - 1)
 92 |     ]
 93 | 
 94 |     # First, compute the mean residual stream activation vectors over the dataset
 95 |     # https://www.lesswrong.com/s/6njwz6XdSYwNhtsCJ/p/eLNo7b56kQQerCzp2
 96 |     for i, batch in tqdm(enumerate(dataloader), total=config.data.max_steps):
 97 |         x = forward_lens(transformer.forward(batch["input_ids"].to(device)))
 98 |         x = einops.rearrange(x, "l b p i -> l (b p) i")
 99 |         for layer in range(transformer.n_layers):
100 |             means[layer].update(x[layer, ...])
101 |             l2_norms[layer].update(x[layer, ...].norm(dim=-1))
102 |         if i > config.data.max_steps:
103 |             break
104 | 
105 |     l2_norms = [metric.compute() for metric in l2_norms]
106 |     df = pd.DataFrame([{k: v.item() for k, v in layer.items()} for layer in l2_norms])
107 |     df.index.name = "layer"
108 |     df.to_csv(os.path.join("out", f"resid_l2_norm_{lens_name}{model_name}.csv"))
109 | 
110 |     means = [metric.compute() for metric in means]
111 |     means = torch.stack([metric["mean"] for metric in means])  # l i
112 |     assert means.shape == (transformer.n_layers, transformer.config.hidden_size)
113 | 
114 |     # Then, compute the mean cosine similarities between centered residual stream
115 |     # activation vectors at adjacent layers
116 |     for i, batch in tqdm(enumerate(dataloader), total=config.data.max_steps):
117 |         x = forward_lens(transformer.forward(batch["input_ids"].to(device)))
118 |         x = einops.rearrange(x, "l b p i -> l (b p) i")
119 |         x = x - means.unsqueeze(1)
120 |         x = normalize(x, -1)
121 |         for layer in range(transformer.n_layers - 1):
122 |             cos_sim = einops.einsum(x[layer], x[layer + 1], "bp i, bp i -> bp")
123 |             cos_sims[layer].update(cos_sim.flatten())
124 |         if i > config.data.max_steps:
125 |             break
126 | 
127 |     data = [metric.compute() for metric in cos_sims]
128 |     data = [{k: v.item() for k, v in layer.items()} for layer in data]
129 | 
130 |     df = pd.DataFrame(data)
131 |     df.index.name = "start_at_layer"
132 |     df.to_csv(os.path.join(out, f"resid_cos_sim_{lens_name}{model_name}.csv"))
133 | 
134 | 
135 | if __name__ == "__main__":
136 |     main(parse(RunConfig), get_device())
137 | 


--------------------------------------------------------------------------------
/figures/scatter_freq.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import numpy as np
 4 | import torch
 5 | from matplotlib import pyplot as plt
 6 | from simple_parsing import parse
 7 | 
 8 | from mlsae.analysis.dists import Dists
 9 | from mlsae.trainer import SweepConfig
10 | from mlsae.utils import get_device
11 | 
12 | 
def main(
    config: SweepConfig, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """
    Save one scatter plot per repository of `dists.totals` against `dists.counts`,
    drawing each layer in its own colour.

    Args:
        config: The sweep whose `repo_ids(tuned_lens=config.tuned_lens)` are plotted.
        device: The device passed to `Dists.load`.
        out: The directory the PNG files are written to. It is created if missing.
    """
    os.makedirs(out, exist_ok=True)
    figsize, dpi = (6, 6), 300
    cmap = plt.colormaps["viridis"]

    for repo_id in config.repo_ids(tuned_lens=config.tuned_lens):
        model_name = repo_id.split("/")[-1]
        dists = Dists.load(repo_id, device)

        fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
        ax.set_xlim(0, 1e7)

        # Sample exactly one colour per layer, evenly spaced over the colormap.
        # The number of samples must be the third argument of `linspace`: passed
        # as a second argument to `cmap` it is interpreted as `alpha`, and the
        # layers end up with the first few (nearly identical) of 50 colours.
        colors = cmap(np.linspace(0, 1, dists.n_layers))

        for layer, color in zip(range(dists.n_layers), colors, strict=True):
            ax.scatter(
                dists.counts[layer],
                dists.totals[layer],
                s=2,
                alpha=0.5,
                color=color,
            )
        ax.legend([f"Layer {i}" for i in range(dists.n_layers)], loc="upper left")

        fig.savefig(os.path.join(out, f"scatter_freq_{model_name}.png"), format="png")
        plt.close(fig)
40 | 
41 | 
# Script entry point: build the SweepConfig with parse() and run main() on the
# device returned by get_device(); `out` keeps its default of ".out".
if __name__ == "__main__":
    main(parse(SweepConfig), get_device())
44 | 


--------------------------------------------------------------------------------
/figures/wdec_sim.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import os
  3 | 
  4 | import numpy as np
  5 | import pandas as pd
  6 | import torch
  7 | from simple_parsing import parse
  8 | 
  9 | from mlsae.model import MLSAETransformer
 10 | from mlsae.trainer import SweepConfig
 11 | from mlsae.utils import get_device, normalize
 12 | 
 13 | 
 14 | def get_filename(repo_id: str, mode: str) -> str:
 15 |     return f"wdec_sim_{mode}_{repo_id.split('/')[-1]}.csv"
 16 | 
 17 | 
 18 | def get_positive(
 19 |     shape: torch.Size, n_repeats: int, std: float, device: torch.device
 20 | ) -> torch.Tensor:
 21 |     positive = torch.normal(
 22 |         0,
 23 |         1,
 24 |         (shape[0], math.ceil(shape[1] / n_repeats)),
 25 |         device=device,
 26 |     ).repeat(1, n_repeats)[:, : shape[1]]
 27 |     positive += torch.normal(0, std, positive.shape, device=device)
 28 |     return normalize(positive)
 29 | 
 30 | 
 31 | def get_pairwise_sims(x: torch.Tensor, chunk_size: int = 1024) -> torch.Tensor:
 32 |     _, n_elements = x.shape
 33 |     cos_sim = torch.empty((n_elements * (n_elements - 1)) // 2, device=x.device)
 34 |     idx = 0
 35 |     for i in range(0, n_elements, chunk_size):
 36 |         chunk_i_end = min(i + chunk_size, n_elements)
 37 |         chunk_i = x[:, i:chunk_i_end]
 38 |         for j in range(i, n_elements, chunk_size):
 39 |             if j < i:
 40 |                 continue
 41 |             chunk_j_end = min(j + chunk_size, n_elements)
 42 |             chunk_j = x[:, j:chunk_j_end]
 43 |             chunk_cos_sim = torch.mm(chunk_i.T, chunk_j)
 44 |             if i == j:
 45 |                 triu_indices = torch.triu_indices(
 46 |                     chunk_i_end - i, chunk_j_end - j, offset=1
 47 |                 )
 48 |                 chunk_cos_sim = chunk_cos_sim[triu_indices[0], triu_indices[1]]
 49 |             else:
 50 |                 chunk_cos_sim = chunk_cos_sim.view(-1)
 51 |             next_idx = idx + chunk_cos_sim.shape[0]
 52 |             cos_sim[idx:next_idx] = chunk_cos_sim
 53 |             idx = next_idx
 54 |     return cos_sim
 55 | 
 56 | 
 57 | def get_hist(x: torch.Tensor) -> tuple[np.ndarray, np.ndarray]:
 58 |     hist, bins = np.histogram(
 59 |         get_pairwise_sims(normalize(x)).cpu().numpy(), bins=200, range=(-1, 1)
 60 |     )
 61 |     hist = np.append(hist, 0)
 62 |     return bins, hist
 63 | 
 64 | 
 65 | def main(
 66 |     config: SweepConfig, device: torch.device, out: str | os.PathLike[str] = ".out"
 67 | ) -> None:
 68 |     os.makedirs(out, exist_ok=True)
 69 |     for repo_id in config.repo_ids(transformer=True, tuned_lens=config.tuned_lens):
 70 |         model = MLSAETransformer.from_pretrained(repo_id).to(device)
 71 |         autoencoder = model.autoencoder
 72 |         shape = autoencoder.decoder.weight.shape
 73 | 
 74 |         bins, actual = get_hist(autoencoder.decoder.weight.detach())
 75 | 
 76 |         # Negative control: n_latents IID Gaussian vectors
 77 |         _, negative = get_hist(torch.normal(0, 1, shape, device=device))
 78 | 
 79 |         # Positive control: n_latents // n_layers IID Gaussian vectors, repeated
 80 |         # n_layers times with a small amount of noise
 81 |         _, positive1 = get_hist(get_positive(shape, model.n_layers, 0.1, device))
 82 |         _, positive2 = get_hist(get_positive(shape, model.n_layers, 0.2, device))
 83 |         _, positive3 = get_hist(get_positive(shape, model.n_layers, 0.3, device))
 84 |         _, positive4 = get_hist(get_positive(shape, model.n_layers, 0.4, device))
 85 |         _, positive5 = get_hist(get_positive(shape, model.n_layers, 0.5, device))
 86 |         _, positive6 = get_hist(get_positive(shape, model.n_layers, 0.6, device))
 87 |         _, positive7 = get_hist(get_positive(shape, model.n_layers, 0.7, device))
 88 |         _, positive8 = get_hist(get_positive(shape, model.n_layers, 0.8, device))
 89 |         _, positive9 = get_hist(get_positive(shape, model.n_layers, 0.9, device))
 90 |         _, positive10 = get_hist(get_positive(shape, model.n_layers, 1.0, device))
 91 |         _, positive11 = get_hist(get_positive(shape, model.n_layers, 1.1, device))
 92 |         _, positive12 = get_hist(get_positive(shape, model.n_layers, 1.2, device))
 93 |         _, positive13 = get_hist(get_positive(shape, model.n_layers, 1.3, device))
 94 |         _, positive14 = get_hist(get_positive(shape, model.n_layers, 1.4, device))
 95 |         _, positive15 = get_hist(get_positive(shape, model.n_layers, 1.5, device))
 96 |         _, positive16 = get_hist(get_positive(shape, model.n_layers, 1.6, device))
 97 |         _, positive17 = get_hist(get_positive(shape, model.n_layers, 1.7, device))
 98 |         _, positive18 = get_hist(get_positive(shape, model.n_layers, 1.8, device))
 99 |         _, positive19 = get_hist(get_positive(shape, model.n_layers, 1.9, device))
100 |         _, positive20 = get_hist(get_positive(shape, model.n_layers, 2.0, device))
101 | 
102 |         pd.DataFrame(
103 |             {
104 |                 "bins": bins,
105 |                 "actual": actual,
106 |                 "negative": negative,
107 |                 "positive1": positive1,
108 |                 "positive2": positive2,
109 |                 "positive3": positive3,
110 |                 "positive4": positive4,
111 |                 "positive5": positive5,
112 |                 "positive6": positive6,
113 |                 "positive7": positive7,
114 |                 "positive8": positive8,
115 |                 "positive9": positive9,
116 |                 "positive10": positive10,
117 |                 "positive11": positive11,
118 |                 "positive12": positive12,
119 |                 "positive13": positive13,
120 |                 "positive14": positive14,
121 |                 "positive15": positive15,
122 |                 "positive16": positive16,
123 |                 "positive17": positive17,
124 |                 "positive18": positive18,
125 |                 "positive19": positive19,
126 |                 "positive20": positive20,
127 |             }
128 |         ).to_csv(
129 |             os.path.join(out, f"wdec_sim_{repo_id.split('/')[-1]}.csv"), index=False
130 |         )
131 | 
132 | 
# Script entry point: build the SweepConfig with parse() and run main() on the
# device returned by get_device(); `out` keeps its default of ".out".
if __name__ == "__main__":
    main(parse(SweepConfig), get_device())
135 | 


--------------------------------------------------------------------------------
/layer_dists.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from dataclasses import dataclass, field
  3 | 
  4 | import pandas as pd
  5 | import torch
  6 | from datasets import Dataset
  7 | from safetensors.torch import save_file
  8 | from simple_parsing import Serializable, parse
  9 | from tqdm import tqdm
 10 | 
 11 | from mlsae.analysis.dists import Dists, Metric, get_stats
 12 | from mlsae.model import get_test_dataloader
 13 | from mlsae.model.data import DataConfig
 14 | from mlsae.trainer import initialize
 15 | from mlsae.utils import forward_single_layer, get_device, get_repo_id, load_single_layer
 16 | 
 17 | 
# Configuration for this script; it is built by parse(Config) in the entry point
# and consumed by get_tensors() and main().
@dataclass
class Config(Serializable):
    # Passed to load_single_layer and get_repo_id as the model name.
    model_name: str
    # Passed to load_single_layer, and to get_repo_id as the single-element list
    # [layer].
    layer: int
    # Forwarded to load_single_layer (expansion_factor=) and get_repo_id.
    expansion_factor: int = 64
    # Forwarded to load_single_layer (k=) and get_repo_id.
    k: int = 32
    # Forwarded to load_single_layer (tuned_lens=) and get_repo_id.
    tuned_lens: bool = False

    data: DataConfig = field(default_factory=DataConfig)
    """The data configuration. Remember to set max_tokens to a reasonable value!"""

    seed: int = 42
    """The seed for global random state."""

    # None disables the per-step statistics rows collected in get_tensors.
    log_every_n_steps: int | None = 8
    """The number of steps between logging statistics."""

    push_to_hub: bool = True
    """Whether to push the dataset to HuggingFace."""
 37 | 
 38 | 
 39 | @torch.no_grad()
 40 | def get_tensors(config: Config, device: torch.device) -> dict[str, torch.Tensor]:
 41 |     model = load_single_layer(
 42 |         config.model_name,
 43 |         config.layer,
 44 |         device,
 45 |         expansion_factor=config.expansion_factor,
 46 |         k=config.k,
 47 |         tuned_lens=config.tuned_lens,
 48 |     )
 49 | 
 50 |     dataloader = get_test_dataloader(
 51 |         model.model_name,
 52 |         config.data.max_length,
 53 |         config.data.batch_size,
 54 |     )
 55 | 
 56 |     tokens_per_step = config.data.batch_size * config.data.max_length
 57 | 
 58 |     metric = Metric(model.n_layers, model.n_latents, device)
 59 |     rows: list[dict[str, str | int | float]] = []
 60 | 
 61 |     for i, batch in enumerate(tqdm(dataloader, total=config.data.max_steps)):
 62 |         inputs, recons, topk = forward_single_layer(
 63 |             model, batch["input_ids"].to(device)
 64 |         )
 65 |         metric.update(topk)
 66 | 
 67 |         if config.log_every_n_steps is not None and i % config.log_every_n_steps == 0:
 68 |             dists = Dists.from_tensors(metric.compute(), metric.device)
 69 |             rows.append(
 70 |                 {
 71 |                     "model_name": model.model_name,
 72 |                     "n_layers": model.n_layers,
 73 |                     "n_latents": model.n_latents,
 74 |                     "expansion_factor": model.expansion_factor,
 75 |                     "k": model.k,
 76 |                     "step": i,
 77 |                     "tokens": (i + 1) * tokens_per_step,
 78 |                     **get_stats(dists.layer_std),
 79 |                 }
 80 |             )
 81 | 
 82 |         if i > config.data.max_steps:
 83 |             break
 84 | 
 85 |     if len(rows) > 0:
 86 |         repo_id = get_repo_id(
 87 |             config.model_name,
 88 |             config.expansion_factor,
 89 |             config.k,
 90 |             config.tuned_lens,
 91 |             True,
 92 |             [config.layer],
 93 |         ).split("/")[-1]
 94 |         pd.DataFrame(rows).to_csv(
 95 |             os.path.join("out", f"dists_layer_std_step_{repo_id}.csv"), index=False
 96 |         )
 97 | 
 98 |     return metric.compute()
 99 | 
100 | 
def main(config: Config, device: torch.device) -> None:
    """
    Compute the distribution tensors, save them with `save_file`, and optionally
    push them to the hub as a dataset.

    After each write the tensors are loaded back through the matching `Dists`
    constructor; the loaded objects are discarded.
    """
    initialize(config.seed)
    tensors = get_tensors(config, device)

    model_repo_id = get_repo_id(
        config.model_name,
        config.expansion_factor,
        config.k,
        config.tuned_lens,
        True,
        [config.layer],
    )
    repo_id = Dists.repo_id(model_repo_id)
    filename = Dists.filename(repo_id)

    save_file(tensors, filename)
    # Round-trip checks: from memory, then from the file just written.
    Dists.from_tensors(tensors, device)
    Dists.from_file(filename, device)

    if not config.push_to_hub:
        return

    dataset = Dataset.from_generator(Dists(tensors).__iter__)
    assert isinstance(dataset, Dataset)
    dataset.push_to_hub(repo_id, commit_description=config.dumps_json())
    # Round-trip checks: from the in-memory dataset, then from the hub.
    Dists.from_dataset(dataset, device)
    Dists.from_hub(repo_id, device)
126 | 
127 | 
# Script entry point: build the Config with parse() and run main() on the device
# returned by get_device().
if __name__ == "__main__":
    main(parse(Config), get_device())
130 | 


--------------------------------------------------------------------------------
/layer_tests.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from pprint import pprint
  3 | 
  4 | import pandas as pd
  5 | import torch
  6 | from torch.utils.data import DataLoader
  7 | from tqdm import tqdm
  8 | 
  9 | from mlsae.model import DataConfig, MLSAETransformer, get_test_dataloader
 10 | from mlsae.trainer import RunConfig, initialize
 11 | from mlsae.utils import forward_single_layer, get_device, get_repo_id, load_single_layer
 12 | 
# Model names evaluated by main(); each is passed to load_single_layer as
# `model_name`.
pythia_70m = "EleutherAI/pythia-70m-deduped"
pythia_160m = "EleutherAI/pythia-160m-deduped"
pythia_410m = "EleutherAI/pythia-410m-deduped"
# pythia_1b = "EleutherAI/pythia-1b-deduped"

# The layer indices that main() evaluates for each model name.
layers = {
    pythia_70m: range(6),
    pythia_160m: range(12),
    pythia_410m: range(24),
    # pythia_1b: range(16),
}

# Module-level run configuration read by test() and test_manual(); max_tokens is
# set to one million, every other field keeps its default.
config = RunConfig(data=DataConfig(max_tokens=1_000_000))
 26 | 
 27 | 
 28 | def test(model_name: str, layer: int, tuned_lens: bool):
 29 |     initialize(config.seed)
 30 |     device = get_device()
 31 | 
 32 |     model = load_single_layer(model_name, layer, device)
 33 | 
 34 |     dataloader = get_test_dataloader(
 35 |         model.model_name,
 36 |         config.data.max_length,
 37 |         config.data.batch_size,
 38 |         config.data.num_workers or 1,
 39 |     )
 40 | 
 41 |     output = test_manual(model, dataloader, device)
 42 |     output = {k: v.item() for k, v in output.items()}
 43 |     pprint(output)
 44 | 
 45 |     filename_repo_id = get_repo_id(model_name, 64, 32, tuned_lens, True, [layer])
 46 |     filename = f"test_{filename_repo_id.split('/')[-1]}.csv"
 47 |     pd.DataFrame(output, index=[0]).to_csv(os.path.join("out", filename), index=False)
 48 | 
 49 | 
 50 | def test_manual(
 51 |     model: MLSAETransformer, dataloader: DataLoader[torch.Tensor], device: torch.device
 52 | ) -> dict[str, torch.Tensor]:
 53 |     def compute() -> dict[str, torch.Tensor]:
 54 |         return {
 55 |             **model.train_metrics.compute(),
 56 |             **model.val_metrics.compute(),
 57 |             "loss/mse": model.mse_loss.compute(),
 58 |             "loss/auxk": model.aux_loss.compute(),
 59 |             "loss/total": model.mse_loss.compute() + model.aux_loss.compute(),
 60 |         }
 61 | 
 62 |     pbar = tqdm(total=config.data.max_steps)
 63 |     for i, batch in enumerate(dataloader):
 64 |         if i >= config.data.max_steps:
 65 |             break
 66 | 
 67 |         tokens: torch.Tensor = batch["input_ids"].to(device)
 68 |         inputs, recons, topk = forward_single_layer(model, tokens)
 69 | 
 70 |         model.train_metrics.forward(
 71 |             inputs=inputs,
 72 |             indices=topk.indices,
 73 |             values=topk.values,
 74 |             recons=recons,
 75 |         )
 76 | 
 77 |         recons = model.inverse_lens(recons)
 78 | 
 79 |         model.forward_at_layer(inputs, recons, tokens)
 80 |         model.val_metrics.forward(
 81 |             loss_true=model.loss_true,
 82 |             loss_pred=model.loss_pred,
 83 |             # logits_true=model.logits_true,
 84 |             # logits_pred=model.logits_pred,
 85 |         )
 86 | 
 87 |         model.mse_loss.forward(inputs=inputs, recons=recons)
 88 | 
 89 |         # pbar.write(str(compute()))
 90 |         pbar.update(1)
 91 | 
 92 |     return compute()
 93 | 
 94 | 
 95 | def main() -> None:
 96 |     for model_name in [pythia_70m, pythia_160m, pythia_410m]:
 97 |         for layer in layers[model_name]:
 98 |             test(model_name, layer, False)
 99 | 
100 |     for model_name in [pythia_70m]:
101 |         for layer in layers[model_name]:
102 |             test(model_name, layer, True)
103 | 
104 | 
# Script entry point: run every evaluation listed in main().
if __name__ == "__main__":
    main()
107 | 


--------------------------------------------------------------------------------
/mlsae/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tim-lawson/mlsae/03ad37a0a1b4541d763859cb0c7c9ccb7ce67867/mlsae/__init__.py


--------------------------------------------------------------------------------
/mlsae/analysis/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tim-lawson/mlsae/03ad37a0a1b4541d763859cb0c7c9ccb7ce67867/mlsae/analysis/__init__.py


--------------------------------------------------------------------------------
/mlsae/analysis/variances.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from dataclasses import dataclass
  3 | 
  4 | import einops
  5 | import pandas as pd
  6 | import torch
  7 | from simple_parsing import field, parse
  8 | from tqdm import tqdm
  9 | 
 10 | from mlsae.model import DataConfig, MLSAETransformer, get_test_dataloader
 11 | from mlsae.model.decoder import scatter_topk
 12 | from mlsae.trainer.config import SweepConfig, initialize
 13 | from mlsae.utils import get_device
 14 | 
 15 | 
# Sweep configuration for this script; it extends SweepConfig with the data
# settings, the random seed and the name of the combined output file used by
# sweep().
@dataclass
class Config(SweepConfig):
    data: DataConfig = field(default_factory=DataConfig)
    """The data configuration. Remember to set max_tokens to a reasonable value!"""

    seed: int = 42
    """The seed for global random state."""

    filename: str = "variances.csv"
    """The name of the file to save the results to."""
 26 | 
 27 | 
 28 | class Metric:
 29 |     def __init__(
 30 |         self,
 31 |         n_layers: int,
 32 |         n_tokens: int,
 33 |         n_latents: int,
 34 |         device: torch.device | str = "cpu",
 35 |     ) -> None:
 36 |         self.n_layers = n_layers
 37 |         self.n_tokens = n_tokens
 38 |         self.n_latents = n_latents
 39 |         self.layers = torch.arange(self.n_layers, device=device)
 40 | 
 41 |         self.exp_var_l_f = []
 42 |         self.exp_var_l_tf = []
 43 |         self.var_l = []
 44 |         self.rel_var_f = []
 45 |         self.rel_var_t = []
 46 | 
 47 |     def var(self, x: torch.Tensor):
 48 |         layers = self.layers.view((self.n_layers, *([1] * (len(x.shape) - 1))))
 49 | 
 50 |         ell = (layers * x).sum(dim=0)
 51 |         ell_sq = ((layers**2) * x).sum(dim=0)
 52 |         return ell_sq - ell**2
 53 | 
 54 |     def update(self, latents: torch.Tensor):
 55 |         assert latents.shape == (self.n_layers, self.n_tokens, self.n_latents)
 56 | 
 57 |         probs = latents / latents.sum(dim=0)
 58 |         probs = probs.nan_to_num_(0.0)
 59 | 
 60 |         e_var_l_f = self.var(probs.mean(1)).mean()
 61 |         e_var_l_tf = self.var(probs).mean()
 62 |         var_l = self.var(probs.mean((1, 2)))
 63 | 
 64 |         self.exp_var_l_f.append(e_var_l_f)
 65 |         self.exp_var_l_tf.append(e_var_l_tf)
 66 |         self.var_l.append(var_l)
 67 |         self.rel_var_f.append(e_var_l_f / var_l)
 68 |         self.rel_var_t.append(e_var_l_tf / e_var_l_f)
 69 | 
 70 |     def compute(self) -> dict[str, float]:
 71 |         return dict(
 72 |             exp_var_l_f=torch.stack(self.exp_var_l_f).mean().item(),
 73 |             exp_var_l_tf=torch.stack(self.exp_var_l_tf).mean().item(),
 74 |             var_l=torch.stack(self.var_l).mean().item(),
 75 |             rel_var_f=torch.stack(self.rel_var_f).mean().item(),
 76 |             rel_var_t=torch.stack(self.rel_var_t).mean().item(),
 77 |         )
 78 | 
 79 | 
 80 | @torch.no_grad()
 81 | def main(
 82 |     repo_id: str,
 83 |     data: DataConfig,
 84 |     device: torch.device | str = "cpu",
 85 |     out: str | os.PathLike[str] = ".out",
 86 | ) -> dict:
 87 |     model = MLSAETransformer.from_pretrained(repo_id).to(device)
 88 | 
 89 |     dataloader = get_test_dataloader(model.model_name, data.max_length, data.batch_size)
 90 | 
 91 |     tokens_per_step = data.batch_size * data.max_length
 92 | 
 93 |     metric = Metric(model.n_layers, tokens_per_step, model.n_latents, device)
 94 | 
 95 |     i = 0
 96 |     for i, batch in enumerate(tqdm(dataloader, total=data.max_steps)):
 97 |         inputs = model.transformer.forward(batch["input_ids"].to(device))
 98 |         topk, auxk, stats, dead = model.autoencoder.encode(inputs)
 99 | 
100 |         latents = scatter_topk(topk, model.n_latents)
101 |         latents = einops.rearrange(latents, "l b t f -> l (b t) f")
102 | 
103 |         metric.update(latents)
104 | 
105 |         if i > data.max_steps:
106 |             break
107 | 
108 |     row = {
109 |         "model_name": model.model_name,
110 |         "n_layers": model.n_layers,
111 |         "n_latents": model.n_latents,
112 |         "expansion_factor": model.expansion_factor,
113 |         "k": model.k,
114 |         "step": i,
115 |         "tokens": (i + 1) * tokens_per_step,
116 |         **metric.compute(),
117 |     }
118 |     pd.DataFrame({k: [v] for k, v in row.items()}).to_csv(
119 |         os.path.join(out, f"variances_{repo_id.split("/")[-1]}.csv"), index=False
120 |     )
121 |     return row
122 | 
123 | 
def sweep(
    config: Config, device: torch.device, out: str | os.PathLike[str] = ".out"
) -> None:
    """
    Run `main` for every repository in the sweep and write the collected rows to
    `<out>/<config.filename>`.

    Args:
        config: The sweep configuration.
        device: The device passed to `main`.
        out: The output directory for the per-repository and the combined CSV
            files. It is created if missing.
    """
    initialize(config.seed)
    # to_csv does not create missing directories.
    os.makedirs(out, exist_ok=True)
    # Forward `out` so that the per-repository files are written next to the
    # combined file instead of always going to main's default directory.
    rows: list[dict] = [
        main(repo_id, config.data, device=device, out=out)
        for repo_id in config.repo_ids(transformer=True)
    ]
    pd.DataFrame(rows).to_csv(os.path.join(out, config.filename), index=False)
132 | 
133 | 
# Script entry point: build the Config with parse() and run the sweep on the
# device returned by get_device(); `out` keeps its default of ".out".
if __name__ == "__main__":
    sweep(parse(Config), get_device())
136 | 


--------------------------------------------------------------------------------
/mlsae/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tim-lawson/mlsae/03ad37a0a1b4541d763859cb0c7c9ccb7ce67867/mlsae/api/__init__.py


--------------------------------------------------------------------------------
/mlsae/api/__main__.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | 
  3 | import orjson
  4 | from fastapi import FastAPI
  5 | from fastapi.middleware.cors import CORSMiddleware
  6 | from fastapi.responses import JSONResponse
  7 | from pydantic import BaseModel
  8 | from simple_parsing import Serializable, parse
  9 | 
 10 | from mlsae.trainer import initialize
 11 | from mlsae.utils import get_device
 12 | 
 13 | from .analyser import Analyser
 14 | from .models import (
 15 |     Example,
 16 |     LatentActivations,
 17 |     LayerHistograms,
 18 |     LogitChanges,
 19 |     MaxLogits,
 20 |     Token,
 21 | )
 22 | 
 23 | 
# Configuration of the API server; parsed once at import time (see `config`
# below) and used to construct the Analyser.
@dataclass
class Config(Serializable):
    repo_id: str
    """
    The name of a pretrained autoencoder and transformer from HuggingFace, or the path
    to a directory that contains them.
    """
 31 | 
 32 | 
# Both statements run at import time: the configuration is parsed and a single
# module-level Analyser is constructed, which every route below delegates to.
config = parse(Config)
analyser = Analyser(repo_id=config.repo_id, device=get_device())
 35 | 
 36 | 
# JSON response class that serializes the content with orjson.dumps; it is set
# as the app's default response class below.
class ORJSONResponse(JSONResponse):
    media_type = "application/json"

    # Returns the orjson-encoded bytes of `content`.
    def render(self, content) -> bytes:
        return orjson.dumps(content)
 42 | 
 43 | 
# The FastAPI application. The interactive docs and the OpenAPI schema are
# served under the same /api/py prefix as the routes.
app = FastAPI(
    docs_url="/api/py/docs",
    openapi_url="/api/py/openapi.json",
    default_response_class=ORJSONResponse,
)
# CORS: every origin, method and header is allowed.
# NOTE(review): wildcard origins combined with allow_credentials=True permit
# credentialed requests from any site - confirm this is only used for local
# development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
 56 | 
 57 | 
# GET /api/py/params: returns the result of analyser.params() unchanged.
@app.get("/api/py/params")
async def params() -> dict:
    return analyser.params()
 61 | 
 62 | 
# Request body of POST /api/py/examples: the layer and latent indices that are
# passed to analyser.latent_examples.
class ExamplesRequest(BaseModel):
    layer: int
    latent: int
 66 | 
 67 | 
# POST /api/py/examples: returns analyser.latent_examples(layer, latent) for the
# indices in the request body.
@app.post("/api/py/examples")
async def examples(body: ExamplesRequest) -> list[Example]:
    return analyser.latent_examples(body.layer, body.latent)
 71 | 
 72 | 
# Request body shared by the prompt routes that take only a prompt string.
class PromptRequest(BaseModel):
    prompt: str
 75 | 
 76 | 
# POST /api/py/prompt/tokens: returns analyser.prompt_tokens(prompt).
@app.post("/api/py/prompt/tokens")
async def prompt_tokens(body: PromptRequest) -> list[Token]:
    return analyser.prompt_tokens(body.prompt)
 80 | 
 81 | 
# POST /api/py/prompt/metrics: returns analyser.prompt_metrics(prompt), a
# mapping from metric name to value.
@app.post("/api/py/prompt/metrics")
async def prompt_metrics(body: PromptRequest) -> dict[str, float]:
    return analyser.prompt_metrics(body.prompt)
 85 | 
 86 | 
# POST /api/py/prompt/latent-activations: returns
# analyser.prompt_latent_activations(prompt).
@app.post("/api/py/prompt/latent-activations")
async def prompt_latent_activations(body: PromptRequest) -> LatentActivations:
    return analyser.prompt_latent_activations(body.prompt)
 90 | 
 91 | 
# POST /api/py/prompt/layer-histograms: returns
# analyser.prompt_layer_histograms(prompt).
@app.post("/api/py/prompt/layer-histograms")
async def prompt_layer_histograms(body: PromptRequest) -> LayerHistograms:
    return analyser.prompt_layer_histograms(body.prompt)
 95 | 
 96 | 
# POST /api/py/prompt/logits-input: returns analyser.prompt_logits_input(prompt).
@app.post("/api/py/prompt/logits-input")
async def prompt_logits_input(body: PromptRequest) -> MaxLogits:
    return analyser.prompt_logits_input(body.prompt)
100 | 
101 | 
# Request body of POST /api/py/prompt/logits-recon: a prompt and the layer
# index passed to analyser.prompt_logits_recon.
class PromptLogitsReconRequest(BaseModel):
    prompt: str
    layer: int
105 | 
106 | 
# POST /api/py/prompt/logits-recon: returns
# analyser.prompt_logits_recon(prompt, layer) as a (MaxLogits, LogitChanges) pair.
@app.post("/api/py/prompt/logits-recon")
async def prompt_logits_recon(
    body: PromptLogitsReconRequest,
) -> tuple[MaxLogits, LogitChanges]:
    return analyser.prompt_logits_recon(body.prompt, body.layer)
112 | 
113 | 
# Request body of POST /api/py/prompt/logits-steer: a prompt plus the latent,
# layer and factor passed to analyser.prompt_logits_steer.
class PromptLogitsSteerRequest(BaseModel):
    prompt: str
    latent: int
    layer: int
    factor: float
119 | 
120 | 
# POST /api/py/prompt/logits-steer: returns
# analyser.prompt_logits_steer(prompt, latent, layer, factor) as a
# (MaxLogits, LogitChanges) pair.
@app.post("/api/py/prompt/logits-steer")
async def prompt_logits_steer(
    body: PromptLogitsSteerRequest,
) -> tuple[MaxLogits, LogitChanges]:
    return analyser.prompt_logits_steer(
        body.prompt, body.latent, body.layer, body.factor
    )
128 | 
129 | 
# Entry point when run as a script: seed the global random state with 42 and
# serve the app with uvicorn on port 8001.
# NOTE(review): initialize(42) runs after the module-level Analyser has already
# been constructed - confirm that this ordering is intended.
if __name__ == "__main__":
    import uvicorn

    initialize(42)
    uvicorn.run(app, port=8001)
135 | 


--------------------------------------------------------------------------------
/mlsae/api/models.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | 
 3 | 
# A single token: its id, its string and its position.
class Token(BaseModel):
    id: int
    """The token id."""
    token: str
    """The token string."""
    pos: int
    """The token position."""
11 | 
12 | 
# The logit of one token, with an optional probability that defaults to None.
class Logit(BaseModel):
    id: int
    """The token id."""
    token: str
    """The token string."""
    logit: float
    """The logit value."""
    prob: float | None = None
    """The softmax-normalized logit value."""
22 | 
23 | 
# A list of Logit entries for each token position.
class MaxLogits(BaseModel):
    max: list[list[Logit]]
    """The maximum logit values for each token position."""
27 | 
28 | 
# The largest and smallest logit changes, as a list of Logit entries for each
# token position.
class LogitChanges(BaseModel):
    max: list[list[Logit]]
    """The maximum changes in logit values for each token position."""
    min: list[list[Logit]]
    """The minimum changes in logit values for each token position."""
34 | 
35 | 
# Latent activations as nested lists: `values` has three levels of nesting and
# `max` has two.
class LatentActivations(BaseModel):
    values: list[list[list[float]]]
    """The latent activations for each layer, position, and latent dimension."""
    max: list[list[float]]
    """The maximum latent activations for each layer and token position."""
41 | 
42 | 
# Per-layer integer histogram values together with one shared list of edges.
class LayerHistograms(BaseModel):
    values: list[list[int]]
    """The histogram values for each layer."""
    edges: list[float]
    """The histogram edges across all layers."""
48 | 
49 | 
# One example for a latent at a layer: the token with the maximum activation and
# the tokens and activations around it.
class Example(BaseModel):
    latent: int
    "The latent index."
    layer: int
    "The layer index."
    token_id: int
    """The token id for the maximum activation."""
    token: str
    """The token string for the maximum activation."""
    act: float
    """The maximum activation value."""
    token_ids: list[int]
    """The token ids around the maximum."""
    tokens: list[str]
    """The token strings around the maximum."""
    acts: list[float]
    """The activation values around the maximum."""
67 | 


--------------------------------------------------------------------------------
/mlsae/metrics/__init__.py:
--------------------------------------------------------------------------------
 1 | from .auxiliary_loss import AuxiliaryLoss
 2 | from .dead_latents import DeadLatents
 3 | from .layerwise import LayerwiseWrapper, layerwise
 4 | from .layerwise_fvu import LayerwiseFVU
 5 | from .layerwise_l0_norm import LayerwiseL0Norm
 6 | from .layerwise_l1_norm import LayerwiseL1Norm
 7 | from .layerwise_logit_kl_div import LayerwiseLogitKLDiv
 8 | from .layerwise_logit_mse import LayerwiseLogitMSE
 9 | from .layerwise_loss_delta import LayerwiseLossDelta
10 | from .layerwise_mse import LayerwiseMSE
11 | from .mse_loss import MSELoss
12 | 
# The public names of the package: every name imported above is re-exported.
__all__ = [
    "AuxiliaryLoss",
    "DeadLatents",
    "layerwise",
    "LayerwiseFVU",
    "LayerwiseL0Norm",
    "LayerwiseL1Norm",
    "LayerwiseLogitKLDiv",
    "LayerwiseLogitMSE",
    "LayerwiseLossDelta",
    "LayerwiseMSE",
    "LayerwiseWrapper",
    "MSELoss",
]
27 | 


--------------------------------------------------------------------------------
/mlsae/metrics/auxiliary_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class AuxiliaryLoss(Metric):
 7 |     """
 8 |     The auxiliary loss (AuxK) models the reconstruction error using the top-`k_aux` dead
 9 |     latents (typically `d_model // 2`) [Gao et al., 2024].
10 |     Latents are flagged as dead during training if they have not activated for
11 |     some predetermined number of tokens (typically 10 million).
12 | 
13 |     Then, given the reconstruction error of the main model `e = inputs - recons`, we
14 |     define the auxiliary loss as the MSE between `e` and the reconstruction using the
15 |     top `k_aux` dead latents.
16 |     We compute the MSE normalization per token, because the scale of the error changes
17 |     throughout training.
18 |     """
19 | 
20 |     is_differentiable = True
21 |     full_state_update = False
22 | 
23 |     auxk_coef: float
24 |     """Coefficient of the auxiliary loss."""
25 | 
26 |     auxk_mse: Float[torch.Tensor, ""]
27 |     """Sum of MSEs between reconstruction errors and top-`k_aux` reconstructions."""
28 | 
29 |     def __init__(self, auxk_coef: float) -> None:
30 |         super().__init__()
31 |         self.auxk_coef = auxk_coef
32 |         self.add_state(
33 |             "auxk_mse",
34 |             torch.zeros(1, dtype=torch.float),
35 |             dist_reduce_fx="sum",
36 |         )
37 | 
38 |     def update(
39 |         self,
40 |         inputs: Float[torch.Tensor, "n_layers batch pos n_inputs"],
41 |         recons: Float[torch.Tensor, "n_layers batch pos n_inputs"],
42 |         auxk_recons: Float[torch.Tensor, "n_layers batch pos n_inputs"] | None,
43 |         **kwargs,
44 |     ) -> None:
45 |         if auxk_recons is not None:
46 |             error = inputs - recons
47 |             self.auxk_mse.add_(
48 |                 (error - auxk_recons).pow(2).mean()
49 |                 / (error - torch.mean(error, dim=3, keepdim=True)).pow(2).mean()
50 |             )
51 | 
52 |     def compute(self) -> Float[torch.Tensor, ""]:
53 |         return self.auxk_coef * self.auxk_mse.nan_to_num(0)
54 | 


--------------------------------------------------------------------------------
/mlsae/metrics/dead_latents.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float, Int
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class DeadLatents(Metric):
 7 |     """
 8 |     Estimate the fraction of dead latents from the number of tokens activated by each
 9 |     latent and the number of tokens elapsed in a training step.
10 | 
11 |     Note that we consider a latent live if it is activated *at any layer*.
12 |     """
13 | 
14 |     is_differentiable = False
15 |     full_state_update = False
16 | 
17 |     latent_tokens: Float[torch.Tensor, "n_latents"]
18 |     """Count of tokens activated by each latent."""
19 | 
20 |     tokens: Int[torch.Tensor, ""]
21 |     """Count of tokens."""
22 | 
23 |     def __init__(self, n_latents: int, dead_tokens_threshold: float) -> None:
24 |         super().__init__()
25 |         self.n_latents = n_latents
26 |         self.dead_tokens_threshold = dead_tokens_threshold
27 |         self.add_state(
28 |             "latent_tokens",
29 |             torch.zeros(n_latents, dtype=torch.float),
30 |             dist_reduce_fx="sum",
31 |         )
32 |         self.add_state(
33 |             "tokens", default=torch.tensor(0, dtype=torch.int64), dist_reduce_fx="sum"
34 |         )
35 | 
36 |     @torch.no_grad()
37 |     def update(
38 |         self, indices: Int[torch.Tensor, "n_layers batch pos k"], **kwargs
39 |     ) -> None:
40 |         self.latent_tokens.add_(
41 |             torch.bincount(indices.int().reshape(-1), minlength=self.n_latents)
42 |         )
43 |         self.tokens += indices.shape[1] * indices.shape[2]
44 | 
45 |     @torch.no_grad()
46 |     def compute(self) -> torch.Tensor:
47 |         return (
48 |             torch.sum(
49 |                 self.latent_tokens < self.tokens / self.dead_tokens_threshold,
50 |                 dtype=torch.float,
51 |             )
52 |             / self.n_latents
53 |         )
54 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise.py:
--------------------------------------------------------------------------------
 1 | from functools import partial
 2 | 
 3 | import torch
 4 | from jaxtyping import Float
 5 | from torchmetrics import ClasswiseWrapper, Metric
 6 | 
 7 | 
 8 | # Based on https://github.com/ai-safety-foundation/sparse_autoencoder/blob/b6ba6cb7c90372cb5462855c21e5f52fc9130557/sparse_autoencoder/metrics/wrappers/classwise.py
 9 | class LayerwiseWrapper(ClasswiseWrapper):
10 |     def __init__(self, metric: Metric, labels: list[str], prefix: str) -> None:
11 |         super().__init__(metric, labels=labels, prefix=prefix)
12 | 
13 |     def _convert_output(self, x: Float[torch.Tensor, "layer"]) -> dict:
14 |         if x.ndim == 0:
15 |             x = x.unsqueeze(0)
16 |         metrics = super()._convert_output(x)
17 |         return {**metrics, f"{self._prefix}avg": x.mean(dim=0, dtype=torch.float)}
18 | 
19 | 
20 | def layerwise(n_layers: int) -> partial[LayerwiseWrapper]:
21 |     return partial(
22 |         LayerwiseWrapper,
23 |         labels=[f"layer_{layer}" for layer in range(n_layers)],
24 |     )
25 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise_fvu.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class LayerwiseFVU(Metric):
 7 |     """
 8 |     Fraction of variance unexplained (FVU). MSE divided by the input variance.
 9 | 
10 |     Equivalent to normalized MSE in Gao et al. [2024], except we compute the variance
11 |     per batch instead of once at the beginning of training.
12 |     """
13 | 
14 |     is_differentiable = True
15 |     full_state_update = False
16 | 
17 |     layer_mse: Float[torch.Tensor, "n_layers"]
18 |     """Layerwise sum of MSEs between the inputs and reconstructions."""
19 | 
20 |     layer_var: Float[torch.Tensor, "n_layers"]
21 |     """Layerwise sum of variances of the inputs."""
22 | 
23 |     def __init__(self, n_layers: int) -> None:
24 |         super().__init__()
25 |         self.add_state(
26 |             "layer_mse", torch.zeros(n_layers, dtype=torch.float), dist_reduce_fx="sum"
27 |         )
28 |         self.add_state(
29 |             "layer_var", torch.zeros(n_layers, dtype=torch.float), dist_reduce_fx="sum"
30 |         )
31 | 
32 |     @torch.no_grad()
33 |     def update(
34 |         self,
35 |         inputs: Float[torch.Tensor, "n_layers batch pos n_inputs"],
36 |         recons: Float[torch.Tensor, "n_layers batch pos n_inputs"],
37 |         **kwargs,
38 |     ) -> None:
39 |         self.layer_mse.add_(torch.mean((recons - inputs).pow(2), dim=(1, 2, 3)))
40 |         self.layer_var.add_(torch.var(inputs, dim=(1, 2, 3)))
41 | 
42 |     def compute(self) -> Float[torch.Tensor, "n_layers"]:
43 |         return self.layer_mse / self.layer_var
44 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise_l0_norm.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float, Int64
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class LayerwiseL0Norm(Metric):
 7 |     """
 8 |     L0 norm (sparsity). Average count of nonzero latent activations.
 9 | 
10 |     Fixed at k (the number of largest latents to keep) during training.
11 |     """
12 | 
13 |     is_differentiable = False
14 |     full_state_update = False
15 | 
16 |     layer_nonzero: Float[torch.Tensor, "n_layers"]
17 |     """Layerwise count of nonzero latent activations."""
18 | 
19 |     tokens: Int64[torch.Tensor, ""]
20 |     """Count of tokens."""
21 | 
22 |     def __init__(self, n_layers: int, dead_threshold: float) -> None:
23 |         super().__init__()
24 |         self.add_state(
25 |             "layer_nonzero",
26 |             torch.zeros(n_layers, dtype=torch.float),
27 |             dist_reduce_fx="sum",
28 |         )
29 |         self.add_state(
30 |             "tokens", torch.zeros(1, dtype=torch.int64), dist_reduce_fx="sum"
31 |         )
32 |         self.dead_threshold = dead_threshold
33 | 
34 |     @torch.no_grad()
35 |     def update(
36 |         self, values: Float[torch.Tensor, "n_layers batch pos k"], **kwargs
37 |     ) -> None:
38 |         self.layer_nonzero.add_(torch.sum(values > self.dead_threshold, dim=(1, 2, 3)))
39 |         self.tokens.add_(values.shape[1] * values.shape[2])
40 | 
41 |     @torch.no_grad()
42 |     def compute(self) -> Float[torch.Tensor, "n_layers"]:
43 |         return self.layer_nonzero / self.tokens
44 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise_l1_norm.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float, Int64
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class LayerwiseL1Norm(Metric):
 7 |     """L1 norm. Average sum of absolute latent activations."""
 8 | 
 9 |     is_differentiable = False
10 |     full_state_update = False
11 | 
12 |     layer_abs: Float[torch.Tensor, "n_layers"]
13 |     """Layerwise sum of absolute latent activations."""
14 | 
15 |     tokens: Int64[torch.Tensor, ""]
16 |     """Layerwise count of tokens."""
17 | 
18 |     def __init__(self, n_layers: int) -> None:
19 |         super().__init__()
20 |         self.add_state(
21 |             "layer_abs", torch.zeros(n_layers, dtype=torch.float), dist_reduce_fx="sum"
22 |         )
23 |         self.add_state(
24 |             "tokens", torch.zeros(1, dtype=torch.int64), dist_reduce_fx="sum"
25 |         )
26 | 
27 |     @torch.no_grad()
28 |     def update(
29 |         self, values: Float[torch.Tensor, "n_layers batch pos k"], **kwargs
30 |     ) -> None:
31 |         self.layer_abs.add_(torch.sum(torch.abs(values), dim=(1, 2, 3)))
32 |         self.tokens.add_(values.shape[1] * values.shape[2])
33 | 
34 |     @torch.no_grad()
35 |     def compute(self) -> Float[torch.Tensor, "n_layers"]:
36 |         return self.layer_abs / self.tokens
37 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise_logit_kl_div.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from jaxtyping import Float
 4 | from torchmetrics import Metric
 5 | 
 6 | 
 7 | class LayerwiseLogitKLDiv(Metric):
 8 |     """
 9 |     Downstream loss (replace the inputs by the reconstruction during the forward pass).
10 | 
11 |     The mean KL divergence between the logits for the inputs and reconstructions.
12 |     """
13 | 
14 |     is_differentiable = False
15 |     full_state_update = False
16 | 
17 |     layer_logit_kl_div: Float[torch.Tensor, "n_layers"]
18 |     """Layerwise sum of KL divergences between logits."""
19 | 
20 |     def __init__(self, n_layers: int) -> None:
21 |         super().__init__()
22 |         self.n_layers = n_layers
23 |         self.add_state(
24 |             "layer_logit_kl_div",
25 |             default=torch.zeros(n_layers),
26 |             dist_reduce_fx="mean",
27 |         )
28 | 
29 |     @torch.no_grad()
30 |     def update(
31 |         self,
32 |         logits_true: Float[torch.Tensor, "n_layers batch pos d_vocab"],
33 |         logits_pred: Float[torch.Tensor, "n_layers batch pos d_vocab"],
34 |         **kwargs,
35 |     ) -> None:
36 |         # NOTE: Iterate over layers to reduce memory usage.
37 |         for layer in range(self.n_layers):
38 |             self.layer_logit_kl_div[layer].add_(
39 |                 F.kl_div(
40 |                     F.log_softmax(logits_true[layer], dim=-1),
41 |                     F.log_softmax(logits_pred[layer], dim=-1),
42 |                     log_target=True,
43 |                     reduction="batchmean",
44 |                 )
45 |             )
46 | 
47 |     @torch.no_grad()
48 |     def compute(self) -> Float[torch.Tensor, "n_layers"]:
49 |         return self.layer_logit_kl_div
50 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise_logit_mse.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn.functional as F
 3 | from jaxtyping import Float
 4 | from torchmetrics import Metric
 5 | 
 6 | 
 7 | class LayerwiseLogitMSE(Metric):
 8 |     """
 9 |     Downstream loss (replace the inputs by the reconstruction during the forward pass).
10 | 
11 |     The MSE between the logits for the inputs and reconstructions.
12 |     """
13 | 
14 |     is_differentiable = False
15 |     full_state_update = False
16 | 
17 |     layer_logit_mse: Float[torch.Tensor, "n_layers"]
18 |     """Layerwise sum of MSEs between logits."""
19 | 
20 |     def __init__(self, n_layers: int) -> None:
21 |         super().__init__()
22 |         self.n_layers = n_layers
23 |         self.add_state(
24 |             "layer_logit_mse",
25 |             default=torch.zeros(n_layers),
26 |             dist_reduce_fx="mean",
27 |         )
28 | 
29 |     @torch.no_grad()
30 |     def update(
31 |         self,
32 |         logits_true: Float[torch.Tensor, "n_layers batch pos d_vocab"],
33 |         logits_pred: Float[torch.Tensor, "n_layers batch pos d_vocab"],
34 |         **kwargs,
35 |     ) -> None:
36 |         # NOTE: Iterate over layers to reduce memory usage.
37 |         for layer in range(self.n_layers):
38 |             self.layer_logit_mse[layer].add_(
39 |                 F.mse_loss(logits_true[layer], logits_pred[layer])
40 |             )
41 | 
42 |     @torch.no_grad()
43 |     def compute(self) -> Float[torch.Tensor, "n_layers"]:
44 |         return self.layer_logit_mse
45 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise_loss_delta.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class LayerwiseLossDelta(Metric):
 7 |     """
 8 |     Downstream loss (replace the inputs by the reconstruction during the forward pass).
 9 | 
10 |     The average delta between the cross-entropy loss for the inputs and reconstructions.
11 |     """
12 | 
13 |     is_differentiable = False
14 |     full_state_update = False
15 | 
16 |     layer_delta_loss: Float[torch.Tensor, "n_layers"]
17 |     """Layerwise sum of deltas between cross-entropy losses."""
18 | 
19 |     def __init__(self, n_layers: int) -> None:
20 |         super().__init__()
21 |         self.add_state(
22 |             "layer_delta_loss", default=torch.zeros(n_layers), dist_reduce_fx="mean"
23 |         )
24 | 
25 |     @torch.no_grad()
26 |     def update(
27 |         self,
28 |         loss_true: Float[torch.Tensor, "n_layers"],
29 |         loss_pred: Float[torch.Tensor, "n_layers"],
30 |         **kwargs,
31 |     ) -> None:
32 |         self.layer_delta_loss.add_(loss_pred - loss_true)
33 | 
34 |     @torch.no_grad()
35 |     def compute(self) -> Float[torch.Tensor, "n_layers"]:
36 |         return self.layer_delta_loss
37 | 


--------------------------------------------------------------------------------
/mlsae/metrics/layerwise_mse.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class LayerwiseMSE(Metric):
 7 |     """Mean squared error (MSE) or L2 reconstruction loss."""
 8 | 
 9 |     is_differentiable = True
10 |     full_state_update = False
11 | 
12 |     layer_mse: Float[torch.Tensor, "n_layers"]
13 |     """Layerwise mean of MSEs between inputs and reconstructions."""
14 | 
15 |     def __init__(self, n_layers: int) -> None:
16 |         super().__init__()
17 |         self.add_state(
18 |             "layer_mse", torch.zeros(n_layers, dtype=torch.float), dist_reduce_fx="mean"
19 |         )
20 | 
21 |     @torch.no_grad()
22 |     def update(
23 |         self,
24 |         inputs: Float[torch.Tensor, "n_layers batch pos n_inputs"],
25 |         recons: Float[torch.Tensor, "n_layers batch pos n_inputs"],
26 |         **kwargs,
27 |     ) -> None:
28 |         self.layer_mse.add_(torch.mean((recons - inputs).pow(2), dim=(1, 2, 3)))
29 | 
30 |     @torch.no_grad()
31 |     def compute(self) -> Float[torch.Tensor, "n_layers"]:
32 |         return self.layer_mse / self.update_count
33 | 


--------------------------------------------------------------------------------
/mlsae/metrics/mse_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from jaxtyping import Float
 3 | from torchmetrics import Metric
 4 | 
 5 | 
 6 | class MSELoss(Metric):
 7 |     """
 8 |     The average FVU of the main model `e = inputs - recons`, where `recons` is the
 9 |     reconstruction using the top-k latents.
10 | 
11 |     Equivalent to normalized MSE in Gao et al. [2024], except we compute the variance
12 |     per batch instead of once at the beginning of training.
13 |     """
14 | 
15 |     is_differentiable = True
16 |     full_state_update = False
17 | 
18 |     layer_mse: Float[torch.Tensor, "n_layers"]
19 |     """Layerwise sum of MSEs between the inputs and reconstructions."""
20 | 
21 |     layer_var: Float[torch.Tensor, "n_layers"]
22 |     """Layerwise sum of variances of the inputs."""
23 | 
24 |     def __init__(self, n_layers: int) -> None:
25 |         super().__init__()
26 |         self.add_state(
27 |             "layer_mse", torch.zeros(n_layers, dtype=torch.float), dist_reduce_fx="sum"
28 |         )
29 |         self.add_state(
30 |             "layer_var", torch.zeros(n_layers, dtype=torch.float), dist_reduce_fx="sum"
31 |         )
32 | 
33 |     def update(
34 |         self,
35 |         inputs: Float[torch.Tensor, "n_layers batch pos n_inputs"],
36 |         recons: Float[torch.Tensor, "n_layers batch pos n_inputs"],
37 |         **kwargs,
38 |     ) -> None:
39 |         self.layer_mse.add_(torch.mean((recons - inputs).pow(2), dim=(1, 2, 3)))
40 |         self.layer_var.add_(torch.var(inputs, dim=(1, 2, 3)))
41 | 
42 |     def compute(self) -> Float[torch.Tensor, ""]:
43 |         return (self.layer_mse / self.layer_var).mean()
44 | 


--------------------------------------------------------------------------------
/mlsae/metrics/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tim-lawson/mlsae/03ad37a0a1b4541d763859cb0c7c9ccb7ce67867/mlsae/metrics/tests/__init__.py


--------------------------------------------------------------------------------
/mlsae/metrics/tests/test_dead_latents.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from mlsae.metrics import DeadLatents
 4 | 
 5 | 
 6 | def test_dead_latents() -> None:
 7 |     metric = DeadLatents(4, 4)
 8 | 
 9 |     metric.update(indices=torch.tensor([[[[0], [0]]], [[[0], [0]]]]))
10 |     assert metric.tokens == 2
11 |     assert torch.allclose(metric.latent_tokens, torch.tensor([4.0, 0.0, 0.0, 0.0]))
12 |     assert torch.allclose(metric.compute(), torch.tensor(0.75))
13 | 
14 |     metric.update(indices=torch.tensor([[[[0], [0]]], [[[1], [1]]]]))
15 |     assert metric.tokens == 4
16 |     assert torch.allclose(metric.latent_tokens, torch.tensor([6.0, 2.0, 0.0, 0.0]))
17 |     assert torch.allclose(metric.compute(), torch.tensor(0.5))
18 | 
19 |     metric.update(indices=torch.tensor([[[[1], [1]]], [[[2], [2]]]]))
20 |     assert metric.tokens == 6
21 |     assert torch.allclose(metric.latent_tokens, torch.tensor([6.0, 4.0, 2.0, 0.0]))
22 |     assert torch.allclose(metric.compute(), torch.tensor(0.25))
23 | 
24 |     metric.update(indices=torch.tensor([[[[2], [2]]], [[[3], [3]]]]))
25 |     assert metric.tokens == 8
26 |     assert torch.allclose(metric.latent_tokens, torch.tensor([6.0, 4.0, 4.0, 2.0]))
27 |     assert torch.allclose(metric.compute(), torch.tensor(0.0))
28 | 
29 |     metric.update(indices=torch.tensor([[[[3], [3]]], [[[0], [0]]]]))
30 |     assert metric.tokens == 10
31 |     assert torch.allclose(metric.latent_tokens, torch.tensor([8.0, 4.0, 4.0, 4.0]))
32 |     assert torch.allclose(metric.compute(), torch.tensor(0.0))
33 | 
34 |     metric.update(indices=torch.tensor([[[[0], [0]]], [[[0], [0]]]]))
35 |     assert metric.tokens == 12
36 |     assert torch.allclose(metric.latent_tokens, torch.tensor([12.0, 4.0, 4.0, 4.0]))
37 |     assert torch.allclose(metric.compute(), torch.tensor(0.0))
38 | 
39 |     metric.update(indices=torch.tensor([[[[0], [0]]], [[[0], [0]]]]))
40 |     assert metric.tokens == 14
41 |     assert torch.allclose(metric.latent_tokens, torch.tensor([16.0, 4.0, 4.0, 4.0]))
42 |     assert torch.allclose(metric.compute(), torch.tensor(0.0))
43 | 
44 |     metric.update(indices=torch.tensor([[[[0], [0]]], [[[0], [0]]]]))
45 |     assert metric.tokens == 16
46 |     assert torch.allclose(metric.latent_tokens, torch.tensor([20.0, 4.0, 4.0, 4.0]))
47 |     assert torch.allclose(metric.compute(), torch.tensor(0.0))
48 | 
49 |     metric.update(indices=torch.tensor([[[[0], [0]]], [[[0], [0]]]]))
50 |     assert metric.tokens == 18
51 |     assert torch.allclose(metric.latent_tokens, torch.tensor([24.0, 4.0, 4.0, 4.0]))
52 |     assert torch.allclose(metric.compute(), torch.tensor(0.75))
53 | 


--------------------------------------------------------------------------------
/mlsae/metrics/tests/test_layerwise_fvu.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from jaxtyping import Float
 4 | 
 5 | from mlsae.metrics import LayerwiseFVU
 6 | 
 7 | n_layers = 6
 8 | shape = (n_layers, 1, 2048, 512)
 9 | 
10 | generator = torch.Generator()
11 | generator.manual_seed(42)
12 | 
13 | normal_zeros = torch.normal(torch.ones(*shape), std=1, generator=generator)
14 | normal_ones = torch.normal(torch.zeros(*shape), std=1, generator=generator)
15 | 
16 | 
17 | @pytest.mark.parametrize(
18 |     ("n_layers", "inputs", "recons", "expected"),
19 |     [
20 |         pytest.param(
21 |             n_layers,
22 |             normal_zeros,
23 |             normal_zeros,
24 |             torch.zeros(n_layers),
25 |             id="both 1",
26 |         ),
27 |         pytest.param(
28 |             n_layers,
29 |             normal_ones,
30 |             normal_ones,
31 |             torch.zeros(n_layers),
32 |             id="both 0",
33 |         ),
34 |         pytest.param(
35 |             n_layers,
36 |             normal_zeros,
37 |             torch.zeros(*shape),
38 |             torch.ones(n_layers) * 2,
39 |             id="1 and 0",
40 |         ),
41 |         pytest.param(
42 |             n_layers,
43 |             normal_ones,
44 |             torch.ones(*shape),
45 |             torch.ones(n_layers) * 2,
46 |             id="0 and 1",
47 |         ),
48 |         pytest.param(
49 |             1,
50 |             normal_zeros[0, ...].unsqueeze(0),
51 |             normal_zeros[0, ...].unsqueeze(0),
52 |             torch.zeros(1),
53 |             id="single layer",
54 |         ),
55 |     ],
56 | )
57 | def test_layerwise_fvu(
58 |     n_layers: int,
59 |     inputs: Float[torch.Tensor, "n_layers batch pos n_inputs"],
60 |     recons: Float[torch.Tensor, "n_layers batch pos n_inputs"],
61 |     expected: Float[torch.Tensor, "n_layers"],
62 | ) -> None:
63 |     metric = LayerwiseFVU(n_layers)
64 | 
65 |     metric.update(inputs=inputs, recons=recons)
66 |     assert torch.allclose(metric.compute(), expected, atol=1e-2)
67 | 
68 |     metric.update(inputs=inputs, recons=recons)
69 |     metric.update(inputs=inputs, recons=recons)
70 |     assert torch.allclose(metric.compute(), expected, atol=1e-2)
71 | 


--------------------------------------------------------------------------------
/mlsae/metrics/tests/test_layerwise_l0_norm.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from jaxtyping import Float
 4 | 
 5 | from mlsae.metrics import LayerwiseL0Norm
 6 | 
 7 | n_layers = 6
 8 | shape = (n_layers, 1, 2048, 32)
 9 | 
10 | 
11 | @pytest.mark.parametrize(
12 |     ("n_layers", "dead_threshold", "values", "expected"),
13 |     [
14 |         pytest.param(
15 |             n_layers,
16 |             1e-3,
17 |             torch.zeros(*shape),
18 |             torch.zeros(n_layers),
19 |             id="all zero",
20 |         ),
21 |         pytest.param(
22 |             n_layers,
23 |             1e-3,
24 |             torch.ones(*shape) * 1e-4,
25 |             torch.zeros(n_layers),
26 |             id="below threshold",
27 |         ),
28 |         pytest.param(
29 |             n_layers,
30 |             1e-3,
31 |             torch.ones(*shape),
32 |             torch.ones(n_layers) * 32.0,
33 |             id="above threshold",
34 |         ),
35 |         pytest.param(
36 |             1,
37 |             1e-3,
38 |             torch.ones((1, *shape[1:])),
39 |             torch.ones(1) * 32.0,
40 |             id="single layer",
41 |         ),
42 |     ],
43 | )
44 | def test_layerwise_l0_norm(
45 |     n_layers: int,
46 |     dead_threshold: float,
47 |     values: Float[torch.Tensor, "n_layers batch pos k"],
48 |     expected: Float[torch.Tensor, "n_layers"],
49 | ) -> None:
50 |     metric = LayerwiseL0Norm(n_layers, dead_threshold)
51 | 
52 |     metric.update(values=values)
53 |     assert torch.allclose(metric.compute(), expected)
54 | 
55 |     metric.update(values=values)
56 |     metric.update(values=values)
57 |     assert torch.allclose(metric.compute(), expected)
58 | 


--------------------------------------------------------------------------------
/mlsae/metrics/tests/test_layerwise_l1_norm.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from jaxtyping import Float
 4 | 
 5 | from mlsae.metrics import LayerwiseL1Norm
 6 | 
 7 | n_layers = 6
 8 | shape = (n_layers, 1, 2048, 32)
 9 | 
10 | 
11 | @pytest.mark.parametrize(
12 |     ("n_layers", "values", "expected"),
13 |     [
14 |         pytest.param(
15 |             n_layers,
16 |             torch.zeros(*shape),
17 |             torch.zeros(n_layers),
18 |             id="all zero",
19 |         ),
20 |         pytest.param(
21 |             n_layers,
22 |             torch.ones(*shape),
23 |             torch.ones(n_layers) * 32.0,
24 |             id="all +1",
25 |         ),
26 |         pytest.param(
27 |             n_layers,
28 |             torch.ones(*shape) * -1,
29 |             torch.ones(n_layers) * 32.0,
30 |             id="all -1",
31 |         ),
32 |         pytest.param(
33 |             1,
34 |             torch.ones((1, *shape[1:])) * -1,
35 |             torch.ones(1) * 32.0,
36 |             id="single layer",
37 |         ),
38 |     ],
39 | )
40 | def test_layerwise_l1_norm(
41 |     n_layers: int,
42 |     values: Float[torch.Tensor, "n_layers batch pos k"],
43 |     expected: Float[torch.Tensor, "n_layers"],
44 | ) -> None:
45 |     metric = LayerwiseL1Norm(n_layers)
46 | 
47 |     metric.update(values=values)
48 |     assert torch.allclose(metric.compute(), expected)
49 | 
50 |     metric.update(values=values)
51 |     metric.update(values=values)
52 |     assert torch.allclose(metric.compute(), expected)
53 | 


--------------------------------------------------------------------------------
/mlsae/metrics/tests/test_layerwise_mse.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from jaxtyping import Float
 4 | 
 5 | from mlsae.metrics import LayerwiseMSE
 6 | 
 7 | n_layers = 6
 8 | shape = (n_layers, 1, 2048, 512)
 9 | 
10 | 
11 | @pytest.mark.parametrize(
12 |     ("n_layers", "inputs", "recons", "expected"),
13 |     [
14 |         pytest.param(
15 |             n_layers,
16 |             torch.zeros(*shape),
17 |             torch.zeros(*shape),
18 |             torch.zeros(n_layers),
19 |             id="both 0",
20 |         ),
21 |         pytest.param(
22 |             n_layers,
23 |             torch.ones(*shape),
24 |             torch.ones(*shape),
25 |             torch.zeros(n_layers),
26 |             id="both 1",
27 |         ),
28 |         pytest.param(
29 |             n_layers,
30 |             torch.zeros(*shape),
31 |             torch.ones(*shape),
32 |             torch.ones(n_layers),
33 |             id="0 and 1",
34 |         ),
35 |         pytest.param(
36 |             n_layers,
37 |             torch.ones(*shape),
38 |             torch.zeros(*shape),
39 |             torch.ones(n_layers),
40 |             id="1 and 0",
41 |         ),
42 |         pytest.param(
43 |             1,
44 |             torch.ones((1, *shape[1:])),
45 |             torch.zeros((1, *shape[1:])),
46 |             torch.ones(1),
47 |             id="single layer",
48 |         ),
49 |     ],
50 | )
51 | def test_layerwise_mse(
52 |     n_layers: int,
53 |     inputs: Float[torch.Tensor, "n_layers batch pos n_inputs"],
54 |     recons: Float[torch.Tensor, "n_layers batch pos n_inputs"],
55 |     expected: Float[torch.Tensor, "n_layers"],
56 | ) -> None:
57 |     metric = LayerwiseMSE(n_layers)
58 | 
59 |     metric.update(inputs=inputs, recons=recons)
60 |     assert torch.allclose(metric.compute(), expected)
61 | 
62 |     metric.update(inputs=inputs, recons=recons)
63 |     metric.update(inputs=inputs, recons=recons)
64 |     assert torch.allclose(metric.compute(), expected)
65 | 


--------------------------------------------------------------------------------
/mlsae/metrics/tests/test_loss_mse.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from jaxtyping import Float
 4 | 
 5 | from mlsae.metrics import MSELoss
 6 | 
 7 | n_layers = 6
 8 | shape = (n_layers, 1, 2048, 512)
 9 | 
10 | generator = torch.Generator()
11 | generator.manual_seed(42)
12 | 
13 | normal_zeros = torch.normal(torch.ones(*shape), std=1, generator=generator)
14 | normal_ones = torch.normal(torch.zeros(*shape), std=1, generator=generator)
15 | 
16 | 
17 | @pytest.mark.parametrize(
18 |     ("n_layers", "inputs", "recons", "expected"),
19 |     [
20 |         pytest.param(
21 |             n_layers,
22 |             normal_zeros,
23 |             normal_zeros,
24 |             torch.tensor(0.0),
25 |             id="both 0",
26 |         ),
27 |         pytest.param(
28 |             n_layers,
29 |             normal_ones,
30 |             normal_ones,
31 |             torch.tensor(0.0),
32 |             id="both 1",
33 |         ),
34 |         pytest.param(
35 |             n_layers,
36 |             normal_zeros,
37 |             torch.ones(*shape),
38 |             torch.tensor(1.0),
39 |             id="0 and 1",
40 |         ),
41 |         pytest.param(
42 |             n_layers,
43 |             normal_ones,
44 |             torch.zeros(*shape),
45 |             torch.tensor(1.0),
46 |             id="1 and 0",
47 |         ),
48 |         pytest.param(
49 |             1,
50 |             normal_ones[0, ...].unsqueeze(0),
51 |             torch.zeros((1, *shape[1:])),
52 |             torch.tensor(1.0),
53 |             id="single layer",
54 |         ),
55 |     ],
56 | )
57 | def test_loss_mse(
58 |     n_layers: int,
59 |     inputs: Float[torch.Tensor, "n_layers batch pos n_inputs"],
60 |     recons: Float[torch.Tensor, "n_layers batch pos n_inputs"],
61 |     expected: Float[torch.Tensor, "n_layers"],
62 | ) -> None:
63 |     metric = MSELoss(n_layers)
64 | 
65 |     metric.update(inputs=inputs, recons=recons)
66 |     assert torch.allclose(metric.compute(), expected, atol=1e-2)
67 | 
68 |     metric.update(inputs=inputs, recons=recons)
69 |     metric.update(inputs=inputs, recons=recons)
70 |     assert torch.allclose(metric.compute(), expected, atol=1e-2)
71 | 


--------------------------------------------------------------------------------
/mlsae/model/__init__.py:
--------------------------------------------------------------------------------
 1 | from .autoencoders import SAE, SAEOut, TopKSAE, TopKSAEOut
 2 | from .data import DataConfig, get_test_dataloader, get_train_dataloader
 3 | from .lightning import MLSAEConfig, MLSAETransformer
 4 | from .transformers import GPT2Transformer, PythiaTransformer
 5 | from .types import Stats, TopK
 6 | 
 7 | __all__ = [
 8 |     "DataConfig",  # from .data
 9 |     "get_test_dataloader",
10 |     "get_train_dataloader",
11 |     "SAE",  # from .autoencoders
12 |     "SAEOut",
13 |     "TopKSAE",
14 |     "TopKSAEOut",
15 |     "MLSAEConfig",  # from .lightning
16 |     "MLSAETransformer",
17 |     "GPT2Transformer",  # from .transformers
18 |     "PythiaTransformer",
19 |     "Stats",  # from .types
20 |     "TopK",
21 | ]
22 | 


--------------------------------------------------------------------------------
/mlsae/model/autoencoders/__init__.py:
--------------------------------------------------------------------------------
 1 | from .standard import SAE, SAEOut
 2 | from .topk import TopKSAE, TopKSAEOut
 3 | from .utils import standardize, unit_norm_decoder, unit_norm_decoder_gradient
 4 | 
 5 | __all__ = [
 6 |     "SAE",  # from .standard
 7 |     "SAEOut",
 8 |     "TopKSAE",  # from .topk
 9 |     "TopKSAEOut",
10 |     "standardize",  # from .utils
11 |     "unit_norm_decoder",
12 |     "unit_norm_decoder_gradient",
13 | ]
14 | 


--------------------------------------------------------------------------------
/mlsae/model/autoencoders/standard.py:
--------------------------------------------------------------------------------
  1 | from typing import NamedTuple
  2 | 
  3 | import torch
  4 | from huggingface_hub import PyTorchModelHubMixin
  5 | from torch.nn import Linear, Module, Parameter
  6 | 
  7 | from mlsae.model.types import Stats
  8 | 
  9 | from .utils import standardize, unit_norm_decoder
 10 | 
 11 | 
 12 | class SAEOut(NamedTuple):
 13 |     """The output of the autoencoder forward pass."""
 14 | 
 15 |     latents: torch.Tensor
 16 |     """The latents."""
 17 | 
 18 |     recons: torch.Tensor
 19 |     """The reconstructions."""
 20 | 
 21 |     dead: torch.Tensor
 22 |     """The fraction of dead latents."""
 23 | 
 24 | 
 25 | # TODO: This is equivalent to TopK SAE with k = n_latents and auxk = None.
 26 | class SAE(Module, PyTorchModelHubMixin):
 27 |     last_nonzero: torch.Tensor
 28 |     """The number of steps since the latents have activated."""
 29 | 
 30 |     def __init__(
 31 |         self,
 32 |         n_inputs: int,
 33 |         n_latents: int,
 34 |         dead_steps_threshold: int,
 35 |         dead_threshold: float = 1e-3,
 36 |         standardize: bool = True,
 37 |     ) -> None:
 38 |         """
 39 |         Args:
 40 |             n_inputs (int): The number of inputs.
 41 | 
 42 |             n_latents(int): The number of latents.
 43 | 
 44 |             dead_steps_threshold (int): The number of steps after which a latent is
 45 |                 flagged as dead during training.
 46 | 
 47 |             dead_threshold (float): The threshold for a latent to be considered
 48 |                 activated. Defaults to 1e-3.
 49 | 
 50 |             standardize (bool): Whether to standardize the inputs. Defaults to True.
 51 |         """
 52 |         super().__init__()
 53 | 
 54 |         self.n_inputs = n_inputs
 55 |         self.n_latents = n_latents
 56 |         self.dead_steps_threshold = dead_steps_threshold
 57 |         self.dead_threshold = dead_threshold
 58 |         self.standardize = standardize
 59 | 
 60 |         self.encoder = Linear(n_inputs, n_latents, bias=False)
 61 |         self.decoder = Linear(n_latents, n_inputs, bias=False)
 62 |         self.pre_encoder_bias = Parameter(torch.zeros(n_inputs))
 63 | 
 64 |         self.register_buffer("last_nonzero", torch.zeros(n_latents, dtype=torch.long))
 65 | 
 66 |         self.decoder.weight.data = self.encoder.weight.data.T.clone()
 67 |         self.decoder.weight.data = self.decoder.weight.data.T.contiguous().T
 68 |         unit_norm_decoder(self.decoder)
 69 | 
 70 |     def encode(
 71 |         self, inputs: torch.Tensor
 72 |     ) -> tuple[torch.Tensor, Stats | None, torch.Tensor]:
 73 |         stats = None
 74 |         if self.standardize:
 75 |             inputs, stats = standardize(inputs)
 76 | 
 77 |         latents = self.encoder.forward(inputs - self.pre_encoder_bias)
 78 | 
 79 |         # Find the k largest latents (purely to maximize consistency with TopKSAE)
 80 |         topk = torch.topk(latents, self.n_latents, sorted=False)
 81 | 
 82 |         # Update the number of steps since the latents have activated
 83 |         last_nonzero = torch.zeros_like(self.last_nonzero, device=inputs.device)
 84 |         last_nonzero.scatter_add_(
 85 |             dim=0,
 86 |             index=topk.indices.reshape(-1),
 87 |             src=(topk.values > self.dead_threshold).to(last_nonzero.dtype).reshape(-1),
 88 |         )
 89 |         self.last_nonzero *= 1 - last_nonzero.clamp(max=1)
 90 |         self.last_nonzero += 1
 91 | 
 92 |         # Mask the latents flagged as dead during training
 93 |         dead_mask = self.last_nonzero >= self.dead_steps_threshold
 94 | 
 95 |         # Compute the fraction of dead latents
 96 |         dead = torch.sum(dead_mask, dtype=torch.float32).detach() / self.n_latents
 97 | 
 98 |         return latents, stats, dead
 99 | 
100 |     def decode(self, latents: torch.Tensor, stats: Stats | None = None) -> torch.Tensor:
101 |         recons = (latents @ self.decoder.weight.T) + self.pre_encoder_bias
102 |         if stats is not None:
103 |             recons = recons * stats.std + stats.mean
104 |         return recons
105 | 
106 |     def forward(self, inputs: torch.Tensor) -> SAEOut:
107 |         latents, stats, dead = self.encode(inputs)
108 |         latents = torch.relu(latents)
109 |         recons = self.decode(latents, stats)
110 |         return SAEOut(latents, recons, dead)
111 | 


--------------------------------------------------------------------------------
/mlsae/model/autoencoders/tests/test_autoencoders.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from mlsae.model.autoencoders import SAE, TopKSAE
 4 | from mlsae.model.decoder import scatter_topk
 5 | 
 6 | 
 7 | @torch.no_grad()
 8 | def test_autoencoders() -> None:
 9 |     n_inputs = 512
10 |     n_latents = 64 * n_inputs
11 |     dead_steps_threshold = 10_000_000
12 |     k = n_latents
13 | 
14 |     sae: SAE = SAE(n_inputs, n_latents, dead_steps_threshold)  # type: ignore
15 | 
16 |     topk_sae: TopKSAE = TopKSAE(n_inputs, n_latents, k, dead_steps_threshold, auxk=None)  # type: ignore
17 |     topk_sae.encoder.weight.data = sae.encoder.weight.data
18 |     topk_sae.decoder.weight.data = sae.decoder.weight.data
19 |     topk_sae.pre_encoder_bias.data = sae.pre_encoder_bias.data
20 | 
21 |     inputs = torch.rand(1, n_inputs)
22 | 
23 |     sae_latents, sae_recons, sae_dead = sae.forward(inputs)
24 |     topk_sae_topk, topk_sae_recons, _, _, topk_sae_dead = topk_sae.forward(inputs)
25 |     topk_sae_latents = scatter_topk(topk_sae_topk, n_latents)
26 | 
27 |     assert torch.allclose(sae_latents, topk_sae_latents, atol=1e-3)
28 |     assert torch.allclose(sae_recons, topk_sae_recons, atol=1e-3)
29 |     assert torch.allclose(sae_dead, topk_sae_dead, atol=1e-3)
30 | 


--------------------------------------------------------------------------------
/mlsae/model/autoencoders/topk.py:
--------------------------------------------------------------------------------
  1 | # Based on https://github.com/openai/sparse_autoencoder/blob/4965b941e9eb590b00b253a2c406db1e1b193942/sparse_autoencoder/train.py
  2 | 
  3 | from typing import NamedTuple
  4 | 
  5 | import torch
  6 | from huggingface_hub import PyTorchModelHubMixin
  7 | from torch.nn import Linear, Module, Parameter
  8 | 
  9 | from mlsae.model.decoder import decode
 10 | from mlsae.model.types import Stats, TopK
 11 | from mlsae.model_card import model_card_template
 12 | 
 13 | from .utils import standardize, unit_norm_decoder
 14 | 
 15 | 
 16 | class TopKSAEOut(NamedTuple):
 17 |     """The output of the autoencoder forward pass."""
 18 | 
 19 |     topk: TopK
 20 |     """The k largest latents."""
 21 | 
 22 |     recons: torch.Tensor
 23 |     """The reconstructions from the k largest latents."""
 24 | 
 25 |     auxk: TopK | None
 26 |     """If auxk is not None, the auxk largest dead latents."""
 27 | 
 28 |     auxk_recons: torch.Tensor | None
 29 |     """If auxk is not None, the reconstructions from the auxk largest dead latents."""
 30 | 
 31 |     dead: torch.Tensor
 32 |     """The fraction of dead latents."""
 33 | 
 34 | 
 35 | class TopKSAE(
 36 |     Module,
 37 |     PyTorchModelHubMixin,
 38 |     model_card_template=model_card_template(False),
 39 |     license="mit",
 40 |     language="en",
 41 |     library_name="mlsae",
 42 |     repo_url="https://github.com/tim-lawson/mlsae",
 43 |     tags=["arxiv:2409.04185"],
 44 | ):
 45 |     last_nonzero: torch.Tensor
 46 |     """The number of steps since the latents have activated."""
 47 | 
 48 |     def __init__(
 49 |         self,
 50 |         n_inputs: int,
 51 |         n_latents: int,
 52 |         k: int,
 53 |         dead_steps_threshold: int,
 54 |         dead_threshold: float = 1e-3,
 55 |         # TODO: Make this optional and default to a power of 2 close to d_model / 2.
 56 |         auxk: int | None = 256,
 57 |         standardize: bool = True,
 58 |     ) -> None:
 59 |         """
 60 |         Args:
 61 |             n_inputs (int): The number of inputs.
 62 | 
 63 |             n_latents (int): The number of latents.
 64 | 
 65 |             k (int): The number of largest latents to keep.
 66 | 
 67 |             dead_steps_threshold (int): The number of steps after which a latent is
 68 |                 flagged as dead during training.
 69 | 
 70 |             dead_threshold (float): The threshold for a latent to be considered
 71 |                 activated. Defaults to 1e-3.
 72 | 
 73 |             auxk (int | None): The number of dead latents with which to model the
 74 |                 reconstruction error. Defaults to 256.
 75 | 
 76 |             standardize (bool): Whether to standardize the inputs. Defaults to True.
 77 |         """
 78 | 
 79 |         super().__init__()
 80 | 
 81 |         self.n_inputs = n_inputs
 82 |         self.n_latents = n_latents
 83 |         self.k = k
 84 |         self.auxk = auxk
 85 |         self.dead_steps_threshold = dead_steps_threshold
 86 |         self.dead_threshold = dead_threshold
 87 |         self.standardize = standardize
 88 | 
 89 |         self.encoder = Linear(n_inputs, n_latents, bias=False)
 90 |         self.decoder = Linear(n_latents, n_inputs, bias=False)
 91 |         self.pre_encoder_bias = Parameter(torch.zeros(n_inputs))
 92 | 
 93 |         self.register_buffer("last_nonzero", torch.zeros(n_latents, dtype=torch.long))
 94 | 
 95 |         self.decoder.weight.data = self.encoder.weight.data.T.clone()
 96 |         self.decoder.weight.data = self.decoder.weight.data.T.contiguous().T
 97 |         unit_norm_decoder(self.decoder)
 98 | 
 99 |     def encode(
100 |         self, inputs: torch.Tensor
101 |     ) -> tuple[TopK, TopK | None, Stats | None, torch.Tensor]:
102 |         stats = None
103 |         if self.standardize:
104 |             inputs, stats = standardize(inputs)
105 | 
106 |         # Keep a reference to the latents before the TopK activation function
107 |         latents = self.encoder.forward(inputs - self.pre_encoder_bias)
108 | 
109 |         # Find the k largest latents
110 |         topk = TopK(*torch.topk(latents, k=self.k, sorted=False))
111 | 
112 |         # Update the number of steps since the latents have activated
113 |         last_nonzero = torch.zeros_like(self.last_nonzero, device=inputs.device)
114 |         last_nonzero.scatter_add_(
115 |             dim=0,
116 |             index=topk.indices.reshape(-1),
117 |             src=(topk.values > self.dead_threshold).to(last_nonzero.dtype).reshape(-1),
118 |         )
119 |         self.last_nonzero *= 1 - last_nonzero.clamp(max=1)
120 |         self.last_nonzero += 1
121 | 
122 |         # Mask the latents flagged as dead during training
123 |         dead_mask = self.last_nonzero >= self.dead_steps_threshold
124 |         latents.data *= dead_mask  # in-place to save memory
125 | 
126 |         # Compute the fraction of dead latents
127 |         dead = torch.sum(dead_mask, dtype=torch.float32).detach() / self.n_latents
128 | 
129 |         # If auxk is not None, find the auxk largest dead latents
130 |         auxk = None
131 |         if self.auxk is not None:
132 |             auxk = TopK(*torch.topk(latents, k=self.auxk, sorted=False))
133 | 
134 |         return topk, auxk, stats, dead
135 | 
136 |     def decode(self, topk: TopK, stats: Stats | None = None) -> torch.Tensor:
137 |         recons = decode(topk, self.decoder.weight) + self.pre_encoder_bias
138 |         if stats is not None:
139 |             recons = recons * stats.std + stats.mean
140 |         return recons
141 | 
142 |     def forward(self, inputs: torch.Tensor) -> TopKSAEOut:
143 |         topk, auxk, stats, dead = self.encode(inputs)
144 | 
145 |         # Apply ReLU to ensure the k largest latents are non-negative
146 |         values = torch.relu(topk.values)
147 |         topk = TopK(values, topk.indices)
148 |         recons = self.decode(topk, stats)
149 | 
150 |         auxk_recons = None
151 |         if auxk is not None:
152 |             auxk_values = torch.relu(auxk.values)
153 |             auxk = TopK(auxk_values, auxk.indices)
154 |             auxk_recons = self.decode(auxk)
155 | 
156 |         return TopKSAEOut(topk, recons, auxk, auxk_recons, dead)
157 | 


--------------------------------------------------------------------------------
/mlsae/model/autoencoders/utils.py:
--------------------------------------------------------------------------------
 1 | import einops
 2 | import torch
 3 | from jaxtyping import Float
 4 | from torch.nn import Linear
 5 | 
 6 | from mlsae.model.types import Stats
 7 | 
 8 | 
 9 | def unit_norm_decoder(decoder: Linear) -> None:
10 |     """Unit-normalize the decoder weight vectors."""
11 | 
12 |     decoder.weight.data /= decoder.weight.data.norm(dim=0)
13 | 
14 | 
# TODO: Use kernels.triton_add_mul_ if it's available
@torch.no_grad()
def unit_norm_decoder_gradient(decoder: Linear) -> None:
    """
    Remove the component of the gradient parallel to the decoder weight vectors.
    Assumes that the decoder weight vectors are unit-normalized.
    NOTE: Without `@torch.no_grad()`, this causes a memory leak!

    The weight of a `torch.nn.Linear` is laid out as (out_features, in_features),
    so each weight vector is a column of `decoder.weight` (dim 0 is reduced, the
    same axis `unit_norm_decoder` normalizes over). The gradient is updated in
    place.

    Args:
        decoder (Linear): The decoder whose `weight.grad` is projected.

    Raises:
        AssertionError: If `decoder.weight.grad` is None.
    """

    grad = decoder.weight.grad
    assert grad is not None
    weight = decoder.weight

    # Per-column dot product between the gradient and the weight vector
    parallel = torch.einsum("...io,...io->...o", grad, weight)

    # Subtract the parallel component (dot product times the unit weight vector)
    grad -= parallel.unsqueeze(-2) * weight
36 | 
37 | 
def standardize(
    x: Float[torch.Tensor, "... n_inputs"], eps: float = 1e-5
) -> tuple[Float[torch.Tensor, "... n_inputs"], Stats]:
    """
    Center and scale the inputs along the last dimension.

    Returns the standardized inputs together with the mean and standard deviation
    that were removed (the returned standard deviation does not include `eps`).
    """

    mean = x.mean(dim=-1, keepdim=True)
    centered = x - mean
    std = centered.std(dim=-1, keepdim=True)
    # eps keeps the division finite when the standard deviation is zero
    scaled = centered / (std + eps)
    return scaled, Stats(mean, std)
48 | 


--------------------------------------------------------------------------------
/mlsae/model/data.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | from dataclasses import dataclass
  3 | 
  4 | import torch
  5 | from datasets import IterableDataset, load_dataset
  6 | from datasets.formatting.formatting import LazyBatch
  7 | from jaxtyping import Int
  8 | from simple_parsing import Serializable
  9 | from torch import Tensor
 10 | from torch.utils.data import DataLoader
 11 | from transformers import AutoTokenizer, PreTrainedTokenizerBase
 12 | 
 13 | 
 14 | @dataclass
 15 | class DataConfig(Serializable):
 16 |     """The data configuration."""
 17 | 
 18 |     path: str = "monology/pile-uncopyrighted"
 19 |     """The path to a HuggingFace text dataset."""
 20 | 
 21 |     max_length: int = 2048
 22 |     """The maximum length of a tokenized input sequence."""
 23 | 
 24 |     batch_size: int = 1
 25 |     """The number of sequences in a batch."""
 26 | 
 27 |     max_tokens: float = 1_000_000_000
 28 |     """The maximum number of tokens to train on."""
 29 | 
 30 |     num_workers: int | None = None
 31 |     """The number of workers to use for data loading."""
 32 | 
 33 |     @property
 34 |     def max_steps(self) -> int:
 35 |         """The maximum number of batches to train on."""
 36 | 
 37 |         return math.ceil(self.max_tokens / (self.batch_size * self.max_length))
 38 | 
 39 | 
 40 | def concat_and_tokenize(
 41 |     dataset: IterableDataset,
 42 |     tokenizer: PreTrainedTokenizerBase,
 43 |     max_length: int,
 44 | ) -> IterableDataset:
 45 |     return dataset.map(
 46 |         _concat_and_tokenize,
 47 |         batched=True,
 48 |         # Large batch size minimizes the number of tokens dropped
 49 |         batch_size=1024,
 50 |         # TODO: Column names are not always available
 51 |         remove_columns=dataset.column_names or ["text", "meta"],
 52 |         fn_kwargs={"tokenizer": tokenizer, "max_length": max_length},
 53 |     ).with_format("torch")
 54 | 
 55 | 
 56 | # Based on https://github.com/EleutherAI/sae/blob/19d95a401e9d17dbf7d6fb0fa7a91081f1b0d01f/sae/data.py
 57 | def _concat_and_tokenize(
 58 |     batch: LazyBatch, tokenizer: PreTrainedTokenizerBase, max_length: int
 59 | ) -> dict:
 60 |     output = tokenizer(
 61 |         # Concatenate the batch of text with the EOS token
 62 |         tokenizer.eos_token.join([""] + batch["text"]),  # type: ignore
 63 |         truncation=True,
 64 |         max_length=max_length,
 65 |         return_attention_mask=False,
 66 |         return_overflowing_tokens=True,
 67 |     )
 68 | 
 69 |     overflowing_tokens = output.pop("overflowing_tokens", None)
 70 |     _ = output.pop("overflow_to_sample_mapping", None)
 71 | 
 72 |     # Split the overflowing tokens into sequences of the maximum length
 73 |     if overflowing_tokens is not None:
 74 |         output["input_ids"] += [
 75 |             overflowing_tokens[i * max_length : (i + 1) * max_length]
 76 |             for i in range(math.ceil(len(overflowing_tokens) / max_length))
 77 |         ]  # type: ignore
 78 | 
 79 |     # Drop the last batch, which is probably incomplete
 80 |     return {k: v[:-1] for k, v in output.items()}
 81 | 
 82 | 
 83 | def get_dataloader(
 84 |     dataset: IterableDataset,
 85 |     model_name: str,
 86 |     max_length: int,
 87 |     batch_size: int,
 88 |     num_workers: int = 1,
 89 | ) -> DataLoader[Int[Tensor, "batch pos"]]:
 90 |     tokenizer = AutoTokenizer.from_pretrained(model_name)
 91 | 
 92 |     # Constrain the maximum length of a tokenized input sequence
 93 |     max_length = min(tokenizer.model_max_length, max_length)
 94 | 
 95 |     return DataLoader(
 96 |         concat_and_tokenize(dataset, tokenizer, max_length),  # type: ignore
 97 |         batch_size=batch_size,
 98 |         num_workers=num_workers,
 99 |     )
100 | 
101 | 
def get_train_dataloader(
    path: str, model_name: str, max_length: int, batch_size: int, num_workers: int = 1
) -> DataLoader[torch.Tensor]:
    """Stream the "train" split of the dataset at `path` as tokenized batches."""
    train_split = load_dataset(path, split="train", streaming=True)
    return get_dataloader(
        train_split,  # type: ignore
        model_name,
        max_length,
        batch_size,
        num_workers,
    )
112 | 
113 | 
def get_test_dataloader(
    model_name: str, max_length: int, batch_size: int, num_workers: int = 1
) -> DataLoader[torch.Tensor]:
    """Stream the local test file "./data/test.jsonl.zst" as tokenized batches."""
    test_split = load_dataset(
        "json",
        data_files="./data/test.jsonl.zst",
        split="train",
        streaming=True,
    )
    return get_dataloader(
        test_split,  # type: ignore
        model_name,
        max_length,
        batch_size,
        num_workers,
    )
129 | 


--------------------------------------------------------------------------------
/mlsae/model/decoder.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import torch
  4 | from jaxtyping import Float
  5 | from loguru import logger
  6 | from torch import Tensor
  7 | 
  8 | from mlsae.model.types import TopK
  9 | 
 10 | 
 11 | # NOTE: Avoid this where possible to save memory!
 12 | def scatter_topk(topk: TopK, n_latents: int) -> Float[Tensor, "... n_latents"]:
 13 |     """
 14 |     Scatter the k largest latents into a new tensor of shape (..., n_latents).
 15 | 
 16 |     Args:
 17 |         topk (TopK): The k largest latents.
 18 | 
 19 |         n_latents (int): The number of latents.
 20 | 
 21 |     Returns:
 22 |         out (Float[Tensor, "... n_latents"]): The k largest latents.
 23 |     """
 24 | 
 25 |     # ... n_latents
 26 |     buffer = topk.values.new_zeros((*topk.indices.shape[:-1], n_latents))
 27 |     # ... k -> ... n_latents
 28 |     return buffer.scatter_(dim=-1, index=topk.indices, src=topk.values)
 29 | 
 30 | 
 31 | # Based on https://github.com/EleutherAI/sae/blob/19d95a401e9d17dbf7d6fb0fa7a91081f1b0d01f/sae/utils.py
 32 | def decode_triton(topk: TopK, weight: Tensor) -> Tensor:
 33 |     shape = topk.indices.shape[:-1]
 34 |     k = topk.indices.shape[-1]
 35 |     n_inputs, n_latents = weight.shape
 36 | 
 37 |     indices_flat = topk.indices.view(-1, k)
 38 |     values_flat = topk.values.view(-1, k)
 39 | 
 40 |     output: Tensor = TritonDecoderAutograd.apply(indices_flat, values_flat, weight)  # type: ignore
 41 | 
 42 |     return output.view(*shape, n_inputs)
 43 | 
 44 | 
 45 | def decode_cuda(topk: TopK, weight: Tensor, chunk_size: int = 1024) -> Tensor:
 46 |     shape = topk.indices.shape[:-1]
 47 |     k = topk.indices.shape[-1]
 48 |     n_inputs, n_latents = weight.shape
 49 | 
 50 |     indices_flat = topk.indices.view(-1, k)
 51 |     values_flat = topk.values.view(-1, k)
 52 | 
 53 |     batch_size = indices_flat.shape[0]
 54 | 
 55 |     output = torch.zeros(
 56 |         batch_size, n_inputs, device=topk.values.device, dtype=topk.values.dtype
 57 |     )
 58 | 
 59 |     for i in range(0, batch_size, chunk_size):
 60 |         indices_chunk = indices_flat[i : i + chunk_size]
 61 |         values_chunk = values_flat[i : i + chunk_size]
 62 | 
 63 |         chunk_sparse = torch.sparse_coo_tensor(
 64 |             indices=torch.cat(
 65 |                 [
 66 |                     torch.arange(
 67 |                         indices_chunk.shape[0], device=indices_chunk.device
 68 |                     ).repeat_interleave(k),
 69 |                     indices_chunk.flatten(),
 70 |                 ]
 71 |             ).view(2, -1),
 72 |             values=values_chunk.flatten(),
 73 |             size=(indices_chunk.shape[0], n_latents),
 74 |         )
 75 | 
 76 |         chunk_output = torch.sparse.mm(chunk_sparse, weight.t())
 77 | 
 78 |         output[i : i + chunk_size] = chunk_output
 79 | 
 80 |     return output.view(*shape, n_inputs)
 81 | 
 82 | 
 83 | # NOTE: 'sparse_coo_tensor' isn't supported yet for the MPS backend
 84 | def decode_mps(topk: TopK, weight: Tensor, chunk_size: int = 1024) -> Tensor:
 85 |     shape = topk.indices.shape[:-1]
 86 |     k = topk.indices.shape[-1]
 87 |     n_inputs, n_latents = weight.shape
 88 | 
 89 |     indices_flat = topk.indices.view(-1, k)
 90 |     values_flat = topk.values.view(-1, k)
 91 | 
 92 |     batch_size = indices_flat.shape[0]
 93 | 
 94 |     output = torch.zeros(
 95 |         batch_size, n_inputs, device=topk.values.device, dtype=topk.values.dtype
 96 |     )
 97 | 
 98 |     for i in range(0, batch_size, chunk_size):
 99 |         indices_chunk = indices_flat[i : i + chunk_size]
100 |         values_chunk = values_flat[i : i + chunk_size]
101 | 
102 |         weight_mask = weight[:, indices_chunk.view(-1)].view(
103 |             n_inputs, indices_chunk.shape[0], k
104 |         )
105 | 
106 |         output_chunk = torch.bmm(
107 |             values_chunk.unsqueeze(1), weight_mask.permute(1, 2, 0)
108 |         ).squeeze(1)
109 | 
110 |         output[i : i + chunk_size] = output_chunk
111 | 
112 |     return output.view(*shape, n_inputs)
113 | 
114 | 
def decode(topk: TopK, weight: Tensor) -> Tensor:
    """
    Sparse decoder implementation.

    This definition is a placeholder with no body: the module-level code that
    follows rebinds the name `decode` to `decode_triton`, `decode_cuda`, or
    `decode_mps` at import time, so this stub only documents the shared signature.

    Args:
        topk (TopK): The k largest latents.

        weight (Float[Tensor, "n_inputs n_latents"]): The decoder weight matrix.

    Returns:
        out (Float[Tensor, "... n_inputs"]): The reconstructions.
    """
    ...
128 | 
129 | 
# Choose the sparse decoder implementation once, at import time: the Triton kernel
# when it can be imported and is not disabled via USE_TRITON, otherwise a pure
# PyTorch implementation suited to the available backend.
try:
    from .kernels import TritonDecoderAutograd
except ImportError:
    logger.info("Triton not found")
    if not torch.backends.mps.is_available():
        logger.info("CPU/CUDA backend, using 'sparse_coo_tensor' decoder")
        decode = decode_cuda
    else:
        logger.info("MPS backend, using 'bmm' decoder")
        decode = decode_mps
else:
    logger.info("Triton found")
    # Triton is used unless USE_TRITON is set to something other than "1"
    if os.environ.get("USE_TRITON", "1") != "1":
        logger.info("Triton disabled, using 'sparse_coo_tensor' decoder")
        decode = decode_cuda
    else:
        logger.info("Triton enabled, using Triton decoder")
        decode = decode_triton
148 | 


--------------------------------------------------------------------------------
/mlsae/model/geom_median.py:
--------------------------------------------------------------------------------
 1 | # Based on https://github.com/EleutherAI/sae/blob/19d95a401e9d17dbf7d6fb0fa7a91081f1b0d01f/sae/utils.py
 2 | 
 3 | import einops
 4 | import torch
 5 | from jaxtyping import Float
 6 | 
 7 | 
 8 | @torch.no_grad()
 9 | def geometric_median(
10 |     points: Float[torch.Tensor, "layer batch pos n_inputs"],
11 |     max_iter: int = 100,
12 |     tol: float = 1e-5,
13 | ) -> Float[torch.Tensor, "n_inputs"]:
14 |     """
15 |     Compute the geometric median of the points along the last axis.
16 | 
17 |     Used to initialize the pre-encoder bias.
18 | 
19 |     Args:
20 |         points (Float[torch.Tensor, "layer batch pos n_inputs"]): The points from
21 |             which to compute the geometric median.
22 | 
23 |         max_iter (int): The maximum number of iterations. Defaults to 100.
24 | 
25 |         tol (float): The tolerance for early stopping. Defaults to 1e-5.
26 | 
27 |     Returns:
28 |         out (Float[torch.Tensor, "n_inputs"]): The geometric median of the points along
29 |             the last axis.
30 |     """
31 | 
32 |     points = einops.rearrange(
33 |         points, "layer batch pos n_inputs -> (layer batch pos) n_inputs"
34 |     )
35 |     curr = points.mean(dim=0)
36 |     prev = torch.zeros_like(curr)
37 |     weights = torch.ones(len(points), device=points.device)
38 |     for _ in range(max_iter):
39 |         prev = curr
40 |         weights = 1 / torch.norm(points - curr, dim=1)
41 |         weights /= weights.sum()
42 |         curr = (weights.unsqueeze(1) * points).sum(dim=0)
43 |         if torch.norm(curr - prev) < tol:
44 |             break
45 |     return curr
46 | 


--------------------------------------------------------------------------------
/mlsae/model/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | from .gpt2 import GPT2Transformer
2 | from .pythia import PythiaTransformer
3 | 
4 | __all__ = [
5 |     "GPT2Transformer",
6 |     "PythiaTransformer",
7 | ]
8 | 


--------------------------------------------------------------------------------
/mlsae/model/transformers/tests/test_gpt2.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from transformers import AutoTokenizer
 3 | 
 4 | from mlsae.model.transformers.gpt2 import GPT2Transformer
 5 | from mlsae.model.transformers.models.gpt2.modeling_gpt2 import (
 6 |     GPT2Config,
 7 |     GPT2LMHeadModel,
 8 |     GPT2Model,
 9 | )
10 | from mlsae.utils import get_input_ids
11 | 
12 | atol = 1e-2
13 | 
14 | 
@torch.no_grad()
def test_hidden_states() -> None:
    """Compare GPT2Transformer hidden states with the reference GPT2Model, layer by layer."""
    model_name = "openai-community/gpt2"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_ids = get_input_ids(tokenizer, "The quick brown fox jumps over the lazy dog.")

    reference: GPT2Model = GPT2Model.from_pretrained(model_name)  # type: ignore
    config: GPT2Config = reference.config  # type: ignore

    # Skip the final layer norm when collecting hidden states
    reference_output = reference.forward(
        input_ids, output_hidden_states=True, skip_final_layer_norm=True
    )
    # The first hidden-state entry is dropped from the comparison
    expected = torch.stack(reference_output.hidden_states[1:])  # type: ignore

    # We usually skip special tokens, but we may as well compare them
    wrapper = GPT2Transformer(
        model_name, config.n_positions, batch_size=1, skip_special_tokens=False
    )
    actual = wrapper.hidden_states(input_ids)

    for layer, expected_layer in enumerate(expected):
        assert torch.allclose(expected_layer, actual[layer], atol=atol)
44 | 
45 | 
@torch.no_grad()
def test_forward_at_layer() -> None:
    """Check that resuming the forward pass at each layer reproduces the reference loss and logits."""
    model_name = "openai-community/gpt2"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_ids = get_input_ids(tokenizer, "The quick brown fox jumps over the lazy dog.")

    reference: GPT2LMHeadModel = GPT2LMHeadModel.from_pretrained(model_name)  # type: ignore
    config: GPT2Config = reference.config  # type: ignore

    # Skip the final layer norm when collecting hidden states
    hidden_output = reference.forward(
        input_ids,
        output_hidden_states=True,
        skip_final_layer_norm=True,
    )
    hidden_states = torch.stack(hidden_output.hidden_states[1:])  # type: ignore

    # Don't skip the final layer norm when computing the loss/logits
    reference_output = reference.forward(input_ids, labels=input_ids)
    expected_loss: torch.Tensor = reference_output.loss  # type: ignore
    expected_logits = reference_output.logits  # type: ignore

    # We usually skip special tokens, but we may as well compare them
    wrapper = GPT2Transformer(
        model_name, config.n_positions, batch_size=1, skip_special_tokens=False
    )

    for start_layer in range(config.n_layer):
        actual_loss, actual_logits = wrapper.forward_at_layer(
            inputs_embeds=hidden_states,
            start_at_layer=start_layer,
            return_type="both",
            tokens=input_ids,
        )
        assert torch.allclose(actual_loss, expected_loss, atol=atol)
        assert torch.allclose(actual_logits, expected_logits, atol=atol)
84 | 


--------------------------------------------------------------------------------
/mlsae/model/transformers/tests/test_llama.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import torch
 3 | from transformers import (
 4 |     AutoTokenizer,
 5 | )
 6 | from transformers.models.llama.configuration_llama import LlamaConfig
 7 | 
 8 | from mlsae.model.transformers.llama import LlamaTransformer
 9 | from mlsae.model.transformers.models.llama.modeling_llama import (
10 |     LlamaForCausalLM,
11 |     LlamaModel,
12 | )
13 | from mlsae.utils import get_input_ids
14 | 
15 | atol = 1e-2
16 | 
17 | 
@pytest.mark.slow()
@torch.no_grad()
def test_hidden_states() -> None:
    """Compare LlamaTransformer hidden states with the reference LlamaModel, layer by layer."""
    model_name = "meta-llama/Llama-3.2-1B"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_ids = get_input_ids(tokenizer, "The quick brown fox jumps over the lazy dog.")

    reference: LlamaModel = LlamaModel.from_pretrained(model_name)  # type: ignore
    config: LlamaConfig = reference.config  # type: ignore

    # Skip the final layer norm when collecting hidden states
    reference_output = reference.forward(
        input_ids, output_hidden_states=True, skip_final_layer_norm=True
    )
    # The first hidden-state entry is dropped from the comparison
    expected = torch.stack(reference_output.hidden_states[1:])  # type: ignore

    # We usually skip special tokens, but we may as well compare them
    wrapper = LlamaTransformer(
        model_name,
        config.max_position_embeddings,
        batch_size=1,
        skip_special_tokens=False,
    )
    actual = wrapper.hidden_states(input_ids)

    for layer, expected_layer in enumerate(expected):
        assert torch.allclose(expected_layer, actual[layer], atol=atol)
51 | 
52 | 
@pytest.mark.slow()
@torch.no_grad()
def test_forward_at_layer() -> None:
    """
    Check that resuming the forward pass at each layer of LlamaTransformer
    reproduces the loss and logits of the reference LlamaForCausalLM.
    """
    model_name = "meta-llama/Llama-3.2-1B"

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    input_ids = get_input_ids(tokenizer, "The quick brown fox jumps over the lazy dog.")

    llama: LlamaForCausalLM = LlamaForCausalLM.from_pretrained(model_name)  # type: ignore
    config: LlamaConfig = llama.config  # type: ignore

    # Skip the final layer norm when collecting hidden states
    hidden_states = torch.stack(
        llama.forward(
            input_ids,
            output_hidden_states=True,
            skip_final_layer_norm=True,
        ).hidden_states[1:]  # type: ignore
    )

    # Don't skip the final layer norm when computing the loss/logits
    output = llama.forward(input_ids, labels=input_ids)
    loss: torch.Tensor = output.loss  # type: ignore
    logits = output.logits  # type: ignore

    # We usually skip special tokens, but we may as well compare them.
    # LlamaConfig names the context length `max_position_embeddings`
    # (`n_positions` is the GPT-2 name), matching test_hidden_states in this file.
    my_llama = LlamaTransformer(
        model_name,
        config.max_position_embeddings,
        batch_size=1,
        skip_special_tokens=False,
    )

    # LlamaConfig names the layer count `num_hidden_layers` (`n_layer` is the
    # GPT-2 name)
    for layer in range(config.num_hidden_layers):
        my_loss, my_logits = my_llama.forward_at_layer(
            inputs_embeds=hidden_states,
            start_at_layer=layer,
            return_type="both",
            tokens=input_ids,
        )
        assert torch.allclose(my_loss, loss, atol=atol)
        assert torch.allclose(my_logits, logits, atol=atol)
92 | 


--------------------------------------------------------------------------------
/mlsae/model/types.py:
--------------------------------------------------------------------------------
 1 | from typing import NamedTuple
 2 | 
 3 | import torch
 4 | from jaxtyping import Float, Int
 5 | 
 6 | 
class TopK(NamedTuple):
    """The k largest latents. Wraps 'torch.return_types.topk'."""

    # Both fields are annotated with the same "layer batch pos k" shape, so entry
    # [..., i] of `values` presumably pairs with entry [..., i] of `indices`.
    values: Float[torch.Tensor, "layer batch pos k"]
    """The values of the k largest latents."""

    indices: Int[torch.Tensor, "layer batch pos k"]
    """The indices of the k largest latents."""
15 | 
16 | 
class Stats(NamedTuple):
    """Used to standardize the input activation vectors."""

    # NOTE(review): the shapes are not annotated here; presumably both tensors are
    # broadcastable against the activations they standardize — confirm at call sites.
    mean: torch.Tensor
    std: torch.Tensor
22 | 


--------------------------------------------------------------------------------
/mlsae/model_card.py:
--------------------------------------------------------------------------------
 1 | from typing import LiteralString
 2 | 
 3 | 
 4 | def model_card_template(transformer: bool) -> LiteralString:
 5 |     if transformer:
 6 |         return f"""{MODEL_CARD_TEMPLATE_START}
 7 | 
 8 | This model is a PyTorch Lightning MLSAETransformer module, which includes the underlying
 9 | transformer.
10 | 
11 |   {MODEL_CARD_TEMPLATE_END}"""
12 |     return f"""{MODEL_CARD_TEMPLATE_START}
13 | 
14 | This model is a PyTorch TopKSAE module, which does not include the underlying
15 | transformer.
16 | 
17 |   {MODEL_CARD_TEMPLATE_END}"""
18 | 
19 | 
# Opening section of the model card: a block delimited by `---`, the title, and a
# one-paragraph summary. This is a plain (non-f) string, so the `{{ ... }}`
# placeholders survive interpolation in model_card_template unchanged; presumably they
# are filled in later by whatever renders the card — confirm against the caller.
MODEL_CARD_TEMPLATE_START = """
---
{{ card_data }}
---

# Model Card for {{ model_id }}

A Multi-Layer Sparse Autoencoder (MLSAE) trained on the residual stream activation
vectors from [{{ model_name }}](https://huggingface.co/{{ model_name }}) with an
expansion factor of R = {{ expansion_factor }} and sparsity k = {{ k }}, over 1 billion
tokens from [monology/pile-uncopyrighted](https://huggingface.co/datasets/monology/pile-uncopyrighted).
"""
32 | 
33 | 
# Closing section of the model card: source links and the BibTeX citation.
# NOTE(review): the `{{ "{{" }}` / `{{ "}}" }}` sequences in the title look like
# template escapes meant to emit literal double braces — confirm against the renderer.
MODEL_CARD_TEMPLATE_END = """
### Model Sources

- **Repository:** <https://github.com/tim-lawson/mlsae>
- **Paper:** <https://arxiv.org/abs/2409.04185>
- **Weights & Biases:** <https://wandb.ai/timlawson-/mlsae>

## Citation

**BibTeX:**

```bibtex
@misc{lawson_residual_2024,
  title         = {Residual {{ "{{" }}Stream Analysis{{ "}}" }} with {{ "{{" }}Multi-Layer SAEs{{ "}}" }}},
  author        = {Lawson, Tim and Farnik, Lucy and Houghton, Conor and Aitchison, Laurence},
  year          = {2024},
  month         = oct,
  number        = {arXiv:2409.04185},
  eprint        = {2409.04185},
  primaryclass  = {cs},
  publisher     = {arXiv},
  doi           = {10.48550/arXiv.2409.04185},
  urldate       = {2024-10-08},
  archiveprefix = {arXiv}
}
```
"""  # noqa: E501
61 | 


--------------------------------------------------------------------------------
/mlsae/trainer/__init__.py:
--------------------------------------------------------------------------------
 1 | from .config import RunConfig, SweepConfig, TrainerConfig, initialize
 2 | from .test import test
 3 | from .train import train
 4 | 
# Names re-exported by this package: the config dataclasses, the `initialize` helper
# and the `train` / `test` entry points imported above.
__all__ = [
    "initialize",
    "RunConfig",
    "SweepConfig",
    "test",
    "train",
    "TrainerConfig",
]
13 | 


--------------------------------------------------------------------------------
/mlsae/trainer/config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from collections.abc import Generator
  3 | from dataclasses import dataclass, field
  4 | from itertools import product
  5 | 
  6 | import torch
  7 | from lightning.pytorch import seed_everything
  8 | from simple_parsing import Serializable
  9 | 
 10 | from mlsae.model import DataConfig, MLSAEConfig
 11 | from mlsae.utils import get_repo_id
 12 | 
 13 | 
# Options that mlsae/trainer/train.py passes to the Lightning Trainer and to
# trainer.fit. With the defaults below, val_check_interval (64 * 64 training batches)
# divided by accumulate_grad_batches (64) comes to one validation pass per 64 optimizer
# steps, assuming one optimizer step per accumulate_grad_batches batches.
# NOTE(review): the string under each field is presumably surfaced as command-line
# help by simple_parsing, so treat those strings as user-facing text — confirm.
@dataclass
class TrainerConfig(Serializable):
    """The trainer configuration."""

    checkpoint_path: str | None = None
    """The path to a model checkpoint to resume training."""

    precision: str = "16-mixed"
    """The precision of the training parameters."""

    accumulate_grad_batches: int = 64
    """The number of batches over which to accumulate gradients."""

    max_steps: int | None = None
    """The maximum number of training batches. If None, uses the maximum tokens."""

    log_every_n_steps: int | None = 8
    """The number of training steps between logging metrics."""

    val_check_interval: int | float | None = 64 * 64
    """The number of training batches between validation steps."""

    limit_val_batches: int | float | None = 64 * 8  # 1M tokens with batch size 2048
    """The number of batches to validate on."""

    default_root_dir: str | None = None
    """The default root directory for model checkpoints."""
 41 | 
 42 | 
# Top-level configuration for a single run: the nested autoencoder, data and trainer
# configs plus the seed, the base model, the layers and the Weights & Biases names.
# NOTE(review): the model_name help text mentions GPTNeoXForCausalLM only, while this
# repository also has a Llama transformer wrapper — confirm which models are accepted.
@dataclass
class RunConfig(Serializable):
    autoencoder: MLSAEConfig = field(default_factory=MLSAEConfig)
    """The autoencoder configuration."""

    data: DataConfig = field(default_factory=DataConfig)
    """The data configuration."""

    trainer: TrainerConfig = field(default_factory=TrainerConfig)
    """The trainer configuration."""

    seed: int = 42
    """The seed for global random state."""

    model_name: str = "EleutherAI/pythia-70m-deduped"
    """The name of a pretrained HuggingFace GPTNeoXForCausalLM model."""

    layers: list[int] | None = None
    """The layers to train on. If None, trains on all layers."""

    project: str | None = None
    """The Weights & Biases project name."""

    run: str | None = None
    """The Weights & Biases run name."""
 68 | 
 69 | 
 70 | @dataclass
 71 | class SweepConfig(Serializable):
 72 |     model_name: list[str] = field(
 73 |         default_factory=lambda: ["EleutherAI/pythia-70m-deduped"]
 74 |     )
 75 |     """The names of pretrained HuggingFace GPTNeoXForCausalLM models."""
 76 | 
 77 |     expansion_factor: list[int] = field(default_factory=list)
 78 |     """The ratios of the number of latents to the number of inputs."""
 79 | 
 80 |     k: list[int] = field(default_factory=list)
 81 |     """The numbers of largest latents to keep."""
 82 | 
 83 |     tuned_lens: bool = False
 84 |     """Whether to apply a pretrained tuned lens before the encoder."""
 85 | 
 86 |     seed: int = 42
 87 |     """The seed for global random state."""
 88 | 
 89 |     def __iter__(self) -> Generator[tuple[str, int, int], None, None]:
 90 |         yield from product(self.model_name, self.expansion_factor, self.k)
 91 | 
 92 |     def repo_ids(
 93 |         self, transformer: bool = True, tuned_lens: bool = False
 94 |     ) -> Generator[str, None, None]:
 95 |         for model_name, expansion_factor, k in self:
 96 |             yield get_repo_id(
 97 |                 model_name=model_name,
 98 |                 expansion_factor=expansion_factor,
 99 |                 k=k,
100 |                 tuned_lens=tuned_lens,
101 |                 transformer=transformer,
102 |             )
103 | 
104 | 
def initialize(seed: int) -> None:
    """Seed the global random state and apply process-wide runtime settings."""
    # Seed everything (including workers) for reproducibility
    seed_everything(seed, workers=True)

    # Tokenizer parallelism is switched off: processes are forked via multiprocessing
    os.environ.update(TOKENIZERS_PARALLELISM="false")

    # The file_system sharing strategy avoids the DataLoader "too many open files" error
    torch.multiprocessing.set_sharing_strategy("file_system")

    # "high" float32 matmul precision improves matmul performance
    torch.set_float32_matmul_precision("high")
117 | 


--------------------------------------------------------------------------------
/mlsae/trainer/test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import cast
 3 | 
 4 | import pandas as pd
 5 | from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT
 6 | from lightning.pytorch import Trainer
 7 | 
 8 | from mlsae.model import MLSAETransformer
 9 | from mlsae.model.data import get_test_dataloader
10 | from mlsae.trainer.config import RunConfig, initialize
11 | from mlsae.utils import get_repo_id
12 | 
13 | 
def test(config: RunConfig) -> None:
    """
    Evaluate a pretrained MLSAETransformer on the test dataloader.

    The model is loaded from the repo_id derived from the configuration, and the
    output of the Lightning Trainer's test run is written to
    out/test_<last component of repo_id>.csv.
    """
    initialize(config.seed)

    repo_id = get_repo_id(
        model_name=config.model_name,
        expansion_factor=config.autoencoder.expansion_factor,
        k=config.autoencoder.k,
        tuned_lens=config.autoencoder.tuned_lens,
        transformer=True,
    )

    model = MLSAETransformer.from_pretrained(repo_id)
    model.requires_grad_(False)

    dataloader = get_test_dataloader(
        config.model_name,
        config.data.max_length,
        config.data.batch_size,
        config.data.num_workers or 1,
    )

    trainer = Trainer(
        precision=cast(_PRECISION_INPUT, config.trainer.precision),
        limit_test_batches=config.data.max_steps,
        deterministic=True,
    )

    # Create the output directory before the (long) test run: DataFrame.to_csv does
    # not create missing directories, so a fresh checkout would otherwise fail only
    # after all of the evaluation work has been done.
    out_dir = "out"
    os.makedirs(out_dir, exist_ok=True)

    output = trainer.test(model=model, dataloaders=dataloader)

    filename = f"test_{repo_id.split('/')[-1]}.csv"
    pd.DataFrame(output).to_csv(os.path.join(out_dir, filename), index=False)
45 | 


--------------------------------------------------------------------------------
/mlsae/trainer/train.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import subprocess
 3 | from typing import cast
 4 | 
 5 | import wandb
 6 | from lightning.fabric.plugins.precision.precision import _PRECISION_INPUT
 7 | from lightning.pytorch import Trainer
 8 | from lightning.pytorch.loggers import WandbLogger
 9 | 
10 | from mlsae.model import MLSAETransformer
11 | from mlsae.model.data import get_train_dataloader
12 | from mlsae.trainer.config import RunConfig, initialize
13 | 
14 | 
def train(config: RunConfig) -> None:
    """
    Train an MLSAETransformer with a Lightning Trainer, logging to Weights & Biases.

    Builds the dataloaders and the model from the run configuration, logs in to
    Weights & Biases, fits the model (optionally resuming from
    config.trainer.checkpoint_path) and finishes the Weights & Biases run.
    """
    # Print the GPU memory currently in use, as reported by nvidia-smi
    subprocess.call(
        "echo $(nvidia-smi --query-gpu=memory.used --format=csv,noheader)", shell=True
    )

    initialize(config.seed)

    data = config.data
    autoencoder = config.autoencoder
    options = config.trainer

    def make_dataloader():
        return get_train_dataloader(
            data.path,
            config.model_name,
            data.max_length,
            data.batch_size,
            data.num_workers or 1,
        )

    train_dataloader = make_dataloader()
    # The validation dataloader is built exactly like the training one
    val_dataloader = make_dataloader()

    model: MLSAETransformer = MLSAETransformer(
        config.model_name,
        config.layers,
        autoencoder.expansion_factor,
        autoencoder.k,
        autoencoder.auxk,
        autoencoder.auxk_coef,
        autoencoder.dead_tokens_threshold,
        autoencoder.dead_threshold,
        autoencoder.lr,
        autoencoder.standardize,
        autoencoder.skip_special_tokens,
        data.max_length,
        data.batch_size,
        options.accumulate_grad_batches,
        autoencoder.tuned_lens,
    )  # type: ignore

    wandb.login()  # type: ignore

    logger = WandbLogger(
        name=config.run,
        save_dir="wandb_logs",
        project=config.project,
        log_model=True,
    )

    # Fall back to the step count implied by the data configuration
    max_steps = options.max_steps
    if not max_steps:
        max_steps = math.ceil(data.max_steps / options.accumulate_grad_batches)

    trainer = Trainer(
        precision=cast(_PRECISION_INPUT, options.precision),
        logger=logger,
        max_steps=max_steps,
        limit_val_batches=options.limit_val_batches,
        val_check_interval=options.val_check_interval,
        log_every_n_steps=options.log_every_n_steps,
        accumulate_grad_batches=options.accumulate_grad_batches,
        deterministic=True,
        default_root_dir=options.default_root_dir,
    )

    trainer.fit(
        model,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
        ckpt_path=options.checkpoint_path,
    )

    wandb.finish()  # type: ignore
84 | 


--------------------------------------------------------------------------------
/mlsae/utils.py:
--------------------------------------------------------------------------------
  1 | import functools
  2 | import weakref
  3 | 
  4 | import torch
  5 | from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
  6 | 
  7 | from mlsae.model import MLSAETransformer, TopK, TopKSAE
  8 | 
  9 | 
 10 | def get_model_repo_id(model: MLSAETransformer, transformer: bool) -> str:
 11 |     # NOTE: This is a hack. At the moment, we only distinguish between models trained on
 12 |     # a single layer and models trained on all layers.
 13 |     layers = None if len(model.layers) > 1 else model.layers
 14 |     return get_repo_id(
 15 |         model_name=model.model_name,
 16 |         expansion_factor=model.expansion_factor,
 17 |         k=model.k,
 18 |         tuned_lens=model.tuned_lens,
 19 |         transformer=transformer,
 20 |         layers=layers,
 21 |     )
 22 | 
 23 | 
 24 | def get_repo_id(
 25 |     model_name: str,
 26 |     expansion_factor: int,
 27 |     k: int,
 28 |     tuned_lens: bool,
 29 |     transformer: bool,
 30 |     layers: list[int] | None = None,
 31 | ) -> str:
 32 |     """
 33 |     Get the repo_id that corresponds to the specified hyperparameters.
 34 |     You should probably change this!
 35 |     """
 36 |     model_name = model_name.split("/")[-1]
 37 |     repo_id = f"tim-lawson/mlsae-{model_name}-x{expansion_factor}-k{k}"
 38 |     if tuned_lens:
 39 |         repo_id += "-lens"
 40 |     if transformer:
 41 |         repo_id += "-tfm"
 42 |     if layers is not None:
 43 |         repo_id = repo_id.replace("mlsae", "sae")
 44 |         repo_id += f"-layers-{''.join(map(str, layers))}"
 45 |     return repo_id
 46 | 
 47 | 
 48 | def get_device() -> torch.device:
 49 |     return torch.device(
 50 |         "cuda"
 51 |         if torch.cuda.is_available()
 52 |         else "mps"
 53 |         if torch.backends.mps.is_available()
 54 |         else "cpu"
 55 |     )
 56 | 
 57 | 
 58 | def normalize(x: torch.Tensor, dim: int = 0, eps: float = 1e-8) -> torch.Tensor:
 59 |     norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True)
 60 |     return x / torch.max(norm, eps * torch.ones_like(norm))
 61 | 
 62 | 
 63 | # Copied from https://stackoverflow.com/a/33672499/23543959
def cache_method(*lru_args, **lru_kwargs):
    """
    Decorator factory that applies 'functools.lru_cache' to a method, per instance.

    All arguments are forwarded to 'functools.lru_cache'. The first call of the
    decorated method on an instance builds a cached function for that instance,
    stores it on the instance under the method's name, and then calls it; the
    positional and keyword arguments of the call (without `self`) form the cache key.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapped_func(self, *args, **kwargs):
            # The cached closure below captures only a weak reference to the instance
            self_weak = weakref.ref(self)

            @functools.wraps(func)
            @functools.lru_cache(*lru_args, **lru_kwargs)
            def cached_method(*args, **kwargs):
                # Dereference the instance at call time and delegate to the method
                return func(self_weak(), *args, **kwargs)

            # Set the cached function as an instance attribute with the method's
            # name, so later lookups on this instance find it instead of
            # wrapped_func.
            setattr(self, func.__name__, cached_method)
            return cached_method(*args, **kwargs)

        return wrapped_func

    return decorator
 81 | 
 82 | 
 83 | def load_single_layer(
 84 |     model_name: str,
 85 |     layer: int,
 86 |     device: torch.device,
 87 |     expansion_factor: int = 64,
 88 |     k: int = 32,
 89 |     tuned_lens: bool = False,
 90 | ) -> MLSAETransformer:
 91 |     # NOTE: This is a hack. We want to feed an SAE trained at layer i with the input
 92 |     # activations from every layer. So, we:
 93 |     #   1. Load the multi-layer SAE and underlying transformer
 94 |     model_repo_id = get_repo_id(model_name, expansion_factor, k, tuned_lens, True)
 95 |     print("model repo_id:", model_repo_id)
 96 |     model = MLSAETransformer.from_pretrained(model_repo_id)
 97 |     model = model.to(device)
 98 | 
 99 |     #   2. Load the layer-specific SAE only
100 |     autoencoder_repo_id = get_repo_id(
101 |         model_name, expansion_factor, k, tuned_lens, False, [layer]
102 |     )
103 |     print("autoencoder repo_id:", autoencoder_repo_id)
104 |     autoencoder = TopKSAE.from_pretrained(
105 |         autoencoder_repo_id,
106 |         # TODO: These should be taken from config.json
107 |         n_inputs=model.n_inputs,
108 |         n_latents=model.n_latents,
109 |         k=model.k,
110 |         dead_steps_threshold=model.dead_steps_threshold,
111 |     )
112 |     autoencoder = autoencoder.to(device)
113 | 
114 |     #   3. Replace the SAE in the multi-layer model with the layer-specific one
115 |     model.autoencoder = autoencoder
116 | 
117 |     # Optional: check the hyperparameters match
118 |     assert model.n_inputs == autoencoder.n_inputs
119 |     assert model.n_latents == autoencoder.n_latents
120 |     assert model.k == autoencoder.k
121 |     assert model.dead_steps_threshold == autoencoder.dead_steps_threshold
122 |     assert model.dead_threshold == autoencoder.dead_threshold
123 |     assert model.auxk == autoencoder.auxk
124 | 
125 |     model.standardize = model.autoencoder.standardize
126 | 
127 |     return model
128 | 
129 | 
130 | # NOTE: This is also a hack. We want the input activations to be normalized
131 | # independently for each layer. So, we feed them to the SAE one layer at a time
132 | # and combine the results. UPDATE: Turns out, this is equivalent to the forward method.
def forward_single_layer(
    model: MLSAETransformer, tokens: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, TopK]:
    """
    Run the autoencoder on the activations of each layer separately.

    Returns a tuple of the inputs (the transformer output passed through
    `model.forward_lens`), the reconstructions, and the top-k latents, with the
    per-layer results written at index `layer` of the leading dimension.

    The autoencoder's `standardize` flag is changed while the layers are processed
    and is restored to its previous value afterwards, even if a forward pass raises.
    """
    standardize = model.autoencoder.standardize
    inputs = model.forward_lens(model.transformer.forward(tokens))

    # topk, recons, _, _, _ = model.forward(tokens)
    # return inputs, recons, topk

    recons = torch.empty(inputs.shape, device=model.device)
    topk_shape = (model.n_layers, model.batch_size, model.max_length, model.k)
    topk = TopK(
        values=torch.empty(topk_shape, device=model.device),
        indices=torch.empty(topk_shape, device=model.device, dtype=torch.long),
    )
    try:
        for layer in range(model.n_layers):
            # Standardize the inputs of every layer except the last one
            model.autoencoder.standardize = layer != model.n_layers - 1
            topk_, recons_, _, _, _ = model.autoencoder.forward(inputs[layer])
            recons[layer] = recons_
            topk.indices[layer] = topk_.indices
            topk.values[layer] = topk_.values
    finally:
        # Always put the flag back, so that an exception in a forward pass does not
        # leave the shared autoencoder with the wrong setting.
        model.autoencoder.standardize = standardize
    return inputs, recons, topk
164 | 
165 | 
def get_input_ids(
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, prompt: str
) -> torch.LongTensor:
    """
    Encode a prompt and right-pad it to the tokenizer's maximum length.

    Pads with the tokenizer's pad token, or with its EOS token if it has no pad
    token. A prompt longer than `tokenizer.model_max_length` is returned unpadded
    (and is not truncated).

    Returns:
        A LongTensor with a leading dimension of size one.

    Raises:
        ValueError: If the tokenizer has neither a pad token nor an EOS token.
    """
    # Compare against None explicitly: 0 is a valid pad token id, but it is falsy,
    # so `pad_token_id or eos_token_id` would wrongly discard it.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id
    if pad_token_id is None:
        raise ValueError(
            "Cannot pad the prompt: the tokenizer has no pad token and no EOS token"
        )

    input_ids = tokenizer.encode(prompt)
    # A non-positive count makes the padding list empty
    n_padding = tokenizer.model_max_length - len(input_ids)
    return torch.LongTensor([input_ids + [pad_token_id] * n_padding])
174 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "mlsae"
 3 | version = "0.2.0"
 4 | description = "Multi-Layer Sparse Autoencoders"
 5 | authors = [{ name = "Tim Lawson", email = "hello@timlawson.dev" }]
 6 | readme = "README.md"
 7 | requires-python = ">=3.12"
 8 | dependencies = [
 9 |     "datasets>=3.1.0",
10 |     "diptest>=0.8.2",
11 |     "einops>=0.8.0",
12 |     "fastapi>=0.115.5",
13 |     "huggingface-hub[cli]>=0.26.2",
14 |     "jaxtyping>=0.2.34",
15 |     "lightning>=2.4.0",
16 |     "loguru>=0.7.2",
17 |     "matplotlib>=3.9.2",
18 |     "natsort>=8.4.0",
19 |     "numpy>=2.1.3",
20 |     "orjson>=3.10.11",
21 |     "pandas>=2.2.3",
22 |     "pydantic>=2.9.2",
23 |     "pytest>=8.3.3",
24 |     "ruff>=0.7.3",
25 |     "safetensors>=0.4.5",
26 |     "scipy>=1.14.1",
27 |     "simple-parsing>=0.1.6",
28 |     "torch>=2.5.1",
29 |     "torchmetrics>=1.6.0",
30 |     "tqdm>=4.67.0",
31 |     "transformers>=4.46.2",
32 |     "triton>=3.1.0",
33 |     "tuned-lens>=0.2.0",
34 |     "uvicorn[standard]>=0.32.0",
35 |     "wandb>=0.18.7",
36 |     "zstandard>=0.23.0",
37 | ]
38 | 
39 | [tool.pytest.ini_options]
40 | minversion = "6.0"
41 | addopts = "-ra -q --import-mode=importlib -m='not slow'"
42 | markers = ["slow: mark test as slow to run"]
43 | testpaths = ["mlsae"]
44 | 
45 | [tool.ruff]
46 | extend-include = ["*.ipynb"]
47 | target-version = "py312"
48 | 
49 | [tool.ruff.lint]
50 | ignore = ["F722", "F821", "UP037"]
51 | select = ["E", "F", "UP", "B", "SIM", "I"]
52 | unfixable = ["F401"]
53 | 
54 | [tool.setuptools.packages.find]
55 | include = ["mlsae", "figures"]
56 | namespaces = false
57 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from simple_parsing import parse
2 | 
3 | from mlsae.trainer import RunConfig, test
4 | 
# Script entry point: parse a RunConfig with simple_parsing and run the test routine.
if __name__ == "__main__":
    test(parse(RunConfig))
7 | 


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
 1 | from simple_parsing import parse
 2 | from tqdm import tqdm
 3 | 
 4 | from mlsae.model import DataConfig, MLSAEConfig
 5 | from mlsae.trainer import RunConfig, SweepConfig, test
 6 | 
 7 | 
 8 | def main(config: SweepConfig) -> None:
 9 |     for model_name, expansion_factor, k in tqdm(config):
10 |         test(
11 |             RunConfig(
12 |                 autoencoder=MLSAEConfig(
13 |                     expansion_factor=expansion_factor, k=k, tuned_lens=config.tuned_lens
14 |                 ),
15 |                 data=DataConfig(max_tokens=1_000_000, num_workers=1),
16 |                 model_name=model_name,
17 |             ),
18 |         )
19 | 
20 | 
# Script entry point: parse a SweepConfig with simple_parsing and test each combination.
if __name__ == "__main__":
    main(parse(SweepConfig))
23 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from simple_parsing import parse
2 | 
3 | from mlsae.trainer import RunConfig, train
4 | 
# Script entry point: parse a RunConfig with simple_parsing and run the training routine.
if __name__ == "__main__":
    train(parse(RunConfig))
7 | 


--------------------------------------------------------------------------------
/upload.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from mlsae.model import MLSAETransformer
 4 | from mlsae.utils import get_model_repo_id
 5 | 
 6 | 
 7 | def find_ckpt_paths(
 8 |     ckpt_dir: str = "wandb_logs/lightning_logs", step: int = 7616
 9 | ) -> list[str]:
10 |     paths: list[str] = []
11 |     for root, _, files in os.walk(ckpt_dir):
12 |         for file in files:
13 |             if file.endswith(f"step={step}.ckpt"):
14 |                 paths.append(os.path.join(root, file))
15 |     return paths
16 | 
17 | 
def upload_models(ckpt_path: str, dry_run: bool) -> None:
    """
    Load a checkpoint and save/push both the full model and its autoencoder.

    With `dry_run` set, only the checkpoint path and the two repo_ids are printed;
    nothing is written to disk or pushed.
    """
    print(f"loading from: {ckpt_path}")
    model = MLSAETransformer.load_from_checkpoint(ckpt_path, strict=False)

    # Drop the buffers if they are still present. This saves A LOT of space!
    for buffer_name in ("loss_true", "loss_pred", "logits_true", "logits_pred"):
        if hasattr(model, buffer_name):
            delattr(model, buffer_name)
    if hasattr(model.autoencoder, "last_nonzero"):
        delattr(model.autoencoder, "last_nonzero")

    # First the PyTorch Lightning module, which includes the underlying transformer,
    # then the PyTorch autoencoder module on its own, which is much smaller.
    uploads = (
        (model, True, "transformer"),
        (model.autoencoder, False, "autoencoder"),
    )
    for module, with_transformer, label in uploads:
        repo_id = get_model_repo_id(model, with_transformer)
        print(f"repo_id ({label}):", repo_id)
        save_dir = f"models/{repo_id}"

        if dry_run:
            continue

        os.makedirs(save_dir, exist_ok=True)
        module.save_pretrained(
            save_directory=save_dir,
            repo_id=repo_id,
            push_to_hub=True,
            model_card_kwargs={
                "model_id": repo_id,
                "base_model": model.model_name,
                "model_name": model.model_name,
                "expansion_factor": model.expansion_factor,
                "k": model.k,
            },
        )
73 | 
74 | 
if __name__ == "__main__":
    dry_run = False

    # Upload the checkpoints saved at each final training step in turn.
    # NOTE: GPT-2 has max_length 1024, so we have twice as many steps for 1B tokens.
    for final_step in (7616, 15232):
        for path in find_ckpt_paths(step=final_step):
            upload_models(path, dry_run)
84 | 


--------------------------------------------------------------------------------