├── .gitignore
├── neuron-explainer
├── neuron_explainer
│ ├── __init__.py
│ ├── activations
│ │ ├── __init__.py
│ │ ├── token_connections.py
│ │ ├── activation_records.py
│ │ └── activations.py
│ ├── explanations
│ │ ├── __init__.py
│ │ ├── puzzles.py
│ │ ├── prompt_builder.py
│ │ ├── token_space_few_shot_examples.py
│ │ ├── scoring.py
│ │ ├── test_explainer.py
│ │ ├── calibrated_simulator.py
│ │ ├── test_simulator.py
│ │ ├── explanations.py
│ │ ├── explainer.py
│ │ └── few_shot_examples.py
│ ├── fast_dataclasses
│ │ ├── __init__.py
│ │ ├── test_fast_dataclasses.py
│ │ └── fast_dataclasses.py
│ ├── azure.py
│ └── api_client.py
├── .gitignore
├── setup.py
├── README.md
└── demos
│ ├── explain_puzzles.ipynb
│ ├── generate_and_score_explanation.ipynb
│ └── generate_and_score_token_look_up_table_explanation.ipynb
├── neuron-viewer
├── public
│ ├── robots.txt
│ └── favicon.ico
├── tailwind.config.js
├── .parcelrc
├── src
│ ├── panes
│ │ ├── index.js
│ │ ├── datasetList.jsx
│ │ ├── similarNeurons.jsx
│ │ ├── topTokens.jsx
│ │ └── explanation.jsx
│ ├── index.css
│ ├── reportWebVitals.js
│ ├── App.jsx
│ ├── heatmapGrid.tsx
│ ├── index.jsx
│ ├── utils.ts
│ ├── tokenHeatmap.tsx
│ ├── feed.jsx
│ ├── index.html
│ ├── types.ts
│ ├── simulationHeatmap.tsx
│ ├── interpAPI.ts
│ ├── App.css
│ └── welcome.tsx
├── tsconfig.json
├── README.md
├── .gitignore
├── package.json
└── python
│ └── server.py
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/neuron-explainer/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | __pycache__/
3 |
--------------------------------------------------------------------------------
/neuron-viewer/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 |
--------------------------------------------------------------------------------
/neuron-viewer/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openai/automated-interpretability/HEAD/neuron-viewer/public/favicon.ico
--------------------------------------------------------------------------------
/neuron-viewer/tailwind.config.js:
--------------------------------------------------------------------------------
1 | /** @type {import('tailwindcss').Config} */
2 | module.exports = {
3 | content: ["./src/**/*.{html,js,jsx}"],
4 | theme: {
5 | extend: {},
6 | },
7 | plugins: [],
8 | }
9 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py:
--------------------------------------------------------------------------------
1 | from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass
2 |
3 | __all__ = ["FastDataclass", "dumps", "loads", "register_dataclass"]
4 |
--------------------------------------------------------------------------------
/neuron-viewer/.parcelrc:
--------------------------------------------------------------------------------
1 | {
2 | "extends": "@parcel/config-default",
3 | "transformers": {
4 | "*.{ts,tsx}": ["@parcel/transformer-typescript-tsc"]
5 | },
6 | "validators": {
7 | "*.{ts,tsx}": ["@parcel/validator-typescript"]
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/neuron-viewer/src/panes/index.js:
--------------------------------------------------------------------------------
1 | export { default as TopTokens } from "./topTokens"
2 | export { default as Explanation } from "./explanation"
3 | export { default as DatasetList } from "./datasetList"
4 | export { default as SimilarNeurons } from "./similarNeurons"
5 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/azure.py:
--------------------------------------------------------------------------------
def standardize_azure_url(url: str) -> str:
    """Convert an "az://openaipublic/..." blob path to its public HTTPS URL.

    URLs that do not start with the "az://openaipublic/" scheme are returned
    unchanged.
    """
    prefix = "az://openaipublic/"
    if url.startswith(prefix):
        # Replace only the leading prefix. str.replace would also rewrite any
        # later occurrence of the prefix embedded in the path.
        url = "https://openaipublic.blob.core.windows.net/" + url[len(prefix):]
    return url
6 |
--------------------------------------------------------------------------------
/neuron-viewer/src/index.css:
--------------------------------------------------------------------------------
1 | body {
2 | margin: 0;
3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
5 | sans-serif;
6 | -webkit-font-smoothing: antialiased;
7 | -moz-osx-font-smoothing: grayscale;
8 | }
9 |
10 | code {
11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 | monospace;
13 | }
14 |
--------------------------------------------------------------------------------
/neuron-viewer/src/reportWebVitals.js:
--------------------------------------------------------------------------------
// Report Core Web Vitals metrics to the given callback (e.g. console.log or an
// analytics hook). The web-vitals library is imported lazily so it never
// contributes to the initial bundle.
const reportWebVitals = onPerfEntry => {
  if (onPerfEntry && onPerfEntry instanceof Function) {
    // web-vitals v3 (pinned in package.json) renamed the getXXX entry points to
    // onXXX; the old getCLS/getFID/... no longer exist and destructure to
    // undefined, so the original calls would throw at runtime.
    import('web-vitals').then(({ onCLS, onFID, onFCP, onLCP, onTTFB }) => {
      onCLS(onPerfEntry);
      onFID(onPerfEntry);
      onFCP(onPerfEntry);
      onLCP(onPerfEntry);
      onTTFB(onPerfEntry);
    });
  }
};

export default reportWebVitals;
--------------------------------------------------------------------------------
/neuron-viewer/src/App.jsx:
--------------------------------------------------------------------------------
1 | import "./App.css"
2 | import Feed from "./feed"
3 | import React from "react"
4 | import { Routes, Route, HashRouter } from "react-router-dom"
5 |
6 | function App() {
7 | return (
8 |
9 |
10 | } />
11 | } />
12 |
13 |
14 | )
15 | }
16 |
17 | export default App
18 |
--------------------------------------------------------------------------------
/neuron-viewer/src/heatmapGrid.tsx:
--------------------------------------------------------------------------------
1 | import { TokenAndActivation } from "./types"
2 | import TokenHeatmap from "./tokenHeatmap";
3 |
4 | export default ({ allTokens }: { allTokens: TokenAndActivation[][]}) => {
5 | return (
6 |
13 | return (
14 |
15 | {tokens.map(({ token, activation, normalized_activation }, i) => {
16 | const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
17 | return
27 | {token}
28 |
29 | })}
30 |
31 | )
32 | }
33 |
--------------------------------------------------------------------------------
/neuron-viewer/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "neuron-viewer",
3 | "version": "0.1.67",
4 | "homepage": "https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer",
5 | "dependencies": {
6 | "@headlessui/react": "^1.7.8",
7 | "@headlessui/tailwindcss": "^0.1.2",
8 | "@types/d3-scale": "^4.0.3",
9 | "@types/lodash": "^4.14.194",
10 | "@types/react": "^18.0.37",
11 | "@types/react-dom": "^18.0.11",
12 | "d3-scale": "^4.0.2",
13 | "lodash": "^4.17.21",
14 | "react": "^18.2.0",
15 | "react-dom": "^18.2.0",
16 | "react-router-dom": "^6.10.0",
17 | "web-vitals": "^3.0.3"
18 | },
19 | "scripts": {
20 | "startpy": "nodemon python/server.py",
21 | "start": "parcel src/index.html",
22 | "build": "parcel build src/index.html",
23 | "serve": "parcel serve src/index.html",
24 | "typecheck": "tsc -p ."
25 | },
26 | "eslintConfig": {
27 | "extends": [
28 | "react-app"
29 | ]
30 | },
31 | "alias": {
32 | "preact/jsx-dev-runtime": "preact/jsx-runtime"
33 | },
34 | "devDependencies": {
35 | "@observablehq/plot": "^0.6.5",
36 | "@parcel/transformer-typescript-tsc": "^2.8.3",
37 | "@parcel/validator-typescript": "^2.8.3",
38 | "nodemon": "^2.0.22",
39 | "parcel": "^2.8.3",
40 | "preact": "^10.13.2",
41 | "process": "^0.11.10",
42 | "react-refresh": "0.10.0",
43 | "tailwindcss": "^3.2.4",
44 | "typescript": "^5.0.4"
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/neuron-viewer/python/server.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import logging
3 |
4 | from flask import Flask, request
5 | from flask_cors import CORS
6 |
7 | import json
8 |
9 | import urllib.request
10 |
def load_az_json(url):
    """Fetch the resource at `url` and parse its body as JSON."""
    response = urllib.request.urlopen(url)
    try:
        return json.loads(response.read())
    finally:
        response.close()
14 |
def start(
    dev: bool = False,
    host_name: str = "0.0.0.0",
    port: int = 80,
):
    """Create the Flask app backing the neuron viewer and serve it (blocking).

    Args:
        dev: run Flask in debug mode.
        host_name: network interface to bind.
        port: TCP port to listen on.
    """
    app = Flask("interpretability chat")
    app.logger.setLevel(logging.INFO)
    # app.logger.disabled = True
    CORS(app)

    # Permissive CORS headers so the JS frontend can call this API from any origin.
    @app.after_request
    def after_request(response):
        response.headers.add("Access-Control-Allow-Origin", "*")
        response.headers.add(
            "Access-Control-Allow-Headers", "Content-Type,Authorization"
        )
        response.headers.add(
            "Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS"
        )
        return response

    # Proxy endpoint: fetches and returns the JSON blob at the URL given in the
    # request body's "path" field (Flask jsonifies the returned dict).
    # NOTE(review): async view functions need Flask's async extra (flask[async])
    # installed — confirm, otherwise this route raises at request time.
    @app.route("/load_az", methods=["GET", "POST"])
    async def load_az():
        args = request.get_json()
        path = args["path"]
        result = load_az_json(path)
        return result

    app.run(debug=dev, host=host_name, port=port, use_reloader=False)
44 |
45 |
def main(dev: bool = True, host_name: str = "0.0.0.0", port: int = 8000):
    """Script entry point: run the viewer server with development defaults.

    Note the defaults differ from start()'s (dev=True, port 8000 vs. port 80).
    """
    start(dev=dev, host_name=host_name, port=port)


if __name__ == "__main__":
    main()
52 |
--------------------------------------------------------------------------------
/neuron-viewer/src/feed.jsx:
--------------------------------------------------------------------------------
1 | import * as Panes from "./panes"
2 | import React, { useEffect } from "react"
3 | import Welcome from "./welcome"
4 | import { useState } from "react"
5 | import { useParams, Link } from "react-router-dom"
6 |
7 | export default function Feed() {
8 | const params = useParams()
9 | // If params is missing either index, there's no neuron selected.
10 | let activeNeuron;
11 | if (params.layer === undefined || params.neuron === undefined) {
12 | activeNeuron = null
13 | } else {
14 | // Grab the layer and neuron indices from the params, casting them to ints.
15 | activeNeuron = {
16 | "layer": parseInt(params.layer),
17 | "neuron": parseInt(params.neuron),
18 | }
19 | }
20 |
21 | const Pane = ({ children }) => (
22 |
{children}
23 | )
24 |
25 | return (
26 |
27 |
28 |
29 | Neuron Viewer
30 |
31 | {activeNeuron && (
32 |
33 | Neuron {activeNeuron.layer}:{activeNeuron.neuron}
34 |
35 | )}
36 |
37 |
38 |
41 |
42 | {activeNeuron ?
43 | <>
44 |
45 | {React.createElement(Panes["Explanation"], { activeNeuron })}
46 |
47 |
48 | {React.createElement(Panes["DatasetList"], { activeNeuron })}
49 |
50 |
51 | {React.createElement(Panes["TopTokens"], { activeNeuron })}
52 |
53 |
54 | {React.createElement(Panes["SimilarNeurons"], { activeNeuron })}
55 |
56 | > :
57 |
58 | }
59 |
60 |
61 |
62 |
63 | )
64 | }
65 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/puzzles.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from dataclasses import dataclass
4 |
5 | from neuron_explainer.activations.activations import ActivationRecord
6 |
7 |
@dataclass(frozen=True)
class Puzzle:
    """A puzzle is a ground truth explanation, a collection of sentences (stored as ActivationRecords) with activations
    according to that explanation, and a collection of false explanations"""

    # Unique, human-readable identifier for the puzzle.
    name: str
    # The ground-truth explanation of the activation pattern.
    explanation: str
    # Tokenized sentences whose activations follow the explanation.
    activation_records: list[ActivationRecord]
    # Distractor explanations that do not match the activations.
    false_explanations: list[str]
17 |
18 |
def convert_puzzle_to_tokenized_sentences(puzzle: Puzzle) -> list[list[str]]:
    """Return the token lists of the puzzle's activation records, in order."""
    sentences = []
    for record in puzzle.activation_records:
        sentences.append(record.tokens)
    return sentences
22 |
23 |
def convert_puzzle_dict_to_puzzle(puzzle_dict: dict) -> Puzzle:
    """Converts a json dictionary representation of a puzzle to the Puzzle class.

    Each sentence is a list of token entries. A token entry is either a bare string
    (activation assumed to be 0.0) or a [token, activation] pair. The compact string
    form keeps the JSON readable and avoids redundant zeros.

    Raises:
        AssertionError: if a token is not a string or an activation is not a float.
        KeyError: if a required key ("sentences", "name", ...) is missing.
    """
    puzzle_activation_records = []
    for sentence in puzzle_dict["sentences"]:
        # isinstance is the idiomatic type test; JSON decoding yields exactly
        # list/str here, so behavior matches the original `type(...) is` checks.
        tokens = [entry[0] if isinstance(entry, list) else entry for entry in sentence]
        assert all(isinstance(token, str) for token in tokens), "All tokens must be strings"
        activations = [float(entry[1]) if isinstance(entry, list) else 0.0 for entry in sentence]
        assert all(isinstance(act, float) for act in activations), "All activations must be floats"

        puzzle_activation_records.append(ActivationRecord(tokens=tokens, activations=activations))

    return Puzzle(
        name=puzzle_dict["name"],
        explanation=puzzle_dict["explanation"],
        activation_records=puzzle_activation_records,
        false_explanations=puzzle_dict["false_explanations"],
    )
43 |
44 |
# Registry of all puzzles keyed by name, populated once at import time.
PUZZLES_BY_NAME: dict[str, Puzzle] = dict()
# puzzles.json lives next to this module, so resolve it relative to this file
# rather than the process working directory.
script_dir = os.path.dirname(os.path.abspath(__file__))
with open(os.path.join(script_dir, "puzzles.json"), "r") as f:
    puzzle_dicts = json.loads(f.read())
    for name in puzzle_dicts.keys():
        PUZZLES_BY_NAME[name] = convert_puzzle_dict_to_puzzle(puzzle_dicts[name])
51 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/token_connections.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import List, Union
3 |
4 | import blobfile as bf
5 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
6 | from neuron_explainer.azure import standardize_azure_url
7 | import urllib.request
8 |
9 |
@register_dataclass
@dataclass
class TokensAndWeights(FastDataclass):
    """Parallel lists: strengths[i] is the connection strength for tokens[i]."""

    tokens: List[str]
    strengths: List[float]
15 |
16 |
@register_dataclass
@dataclass
class WeightBasedSummaryOfNeuron(FastDataclass):
    """Token connections of a neuron derived from weights, split by input/output
    direction and by sign of the connection."""

    input_positive: TokensAndWeights
    input_negative: TokensAndWeights
    output_positive: TokensAndWeights
    output_negative: TokensAndWeights
24 |
25 |
def load_token_weight_connections_of_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based",
) -> WeightBasedSummaryOfNeuron:
    """Load the WeightBasedSummaryOfNeuron for the specified neuron.

    Fetches {dataset_path}/{layer_index}/{neuron_index}.json over HTTPS (after
    normalizing any az:// path) and deserializes it.
    """
    url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"])
    url = standardize_azure_url(url)
    with urllib.request.urlopen(url) as f:
        return loads(f.read(), backwards_compatible=False)
36 |
37 |
@register_dataclass
@dataclass
class TokenLookupTableSummaryOfNeuron(FastDataclass):
    """List of tokens and the average activations of a given neuron in response to each
    respective token. These are selected from among the tokens in the vocabulary with the
    highest average activations across an internet text dataset, with the highest activations
    first."""

    # Parallel lists: average_activations[i] corresponds to tokens[i].
    tokens: List[str]
    average_activations: List[float]
48 |
49 |
def load_token_lookup_table_connections_of_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based",
) -> TokenLookupTableSummaryOfNeuron:
    """Load the TokenLookupTableSummaryOfNeuron for the specified neuron."""
    # Build the per-neuron blob URL and normalize any az:// path to HTTPS.
    url = standardize_azure_url(f"{dataset_path}/{layer_index}/{neuron_index}.json")
    with urllib.request.urlopen(url) as response:
        return loads(response.read(), backwards_compatible=False)
60 |
--------------------------------------------------------------------------------
/neuron-viewer/src/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
11 |
15 |
24 |
25 |
26 |
Neuron viewer
27 |
28 |
29 |
30 |
31 |
42 |
43 |
44 |
45 |
46 |
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/neuron-explainer/demos/explain_puzzles.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "%autoreload 2"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import os\n",
20 | "\n",
21 | "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n",
22 | "\n",
23 | "from neuron_explainer.activations.activation_records import calculate_max_activation\n",
24 | "from neuron_explainer.explanations.explainer import TokenActivationPairExplainer\n",
25 | "from neuron_explainer.explanations.prompt_builder import PromptFormat\n",
26 | "from neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME\n",
27 | "\n",
28 | "\n",
29 | "EXPLAINER_MODEL_NAME = \"gpt-4\"\n",
30 | "\n",
31 | "explainer = TokenActivationPairExplainer(\n",
32 | " model_name=EXPLAINER_MODEL_NAME,\n",
33 | " prompt_format=PromptFormat.HARMONY_V4,\n",
34 | " max_concurrent=1,\n",
35 | ")\n",
36 | "\n",
37 | "for puzzle_name, puzzle in PUZZLES_BY_NAME.items():\n",
38 | " print(f\"{puzzle_name=}\")\n",
39 | " puzzle_answer = puzzle.explanation\n",
40 | " # Generate an explanation for the puzzle.\n",
41 | " explanations = await explainer.generate_explanations(\n",
42 | " all_activation_records=puzzle.activation_records,\n",
43 | " max_activation=calculate_max_activation(puzzle.activation_records),\n",
44 | " num_samples=1,\n",
45 | " )\n",
46 | " assert len(explanations) == 1\n",
47 | " model_generated_explanation = explanations[0]\n",
48 | " print(f\"{model_generated_explanation=}\")\n",
49 | " print(f\"{puzzle_answer=}\\n\")\n",
50 | "\n"
51 | ]
52 | }
53 | ],
54 | "metadata": {
55 | "kernelspec": {
56 | "display_name": "openai",
57 | "language": "python",
58 | "name": "openai"
59 | },
60 | "language_info": {
61 | "codemirror_mode": {
62 | "name": "ipython",
63 | "version": 3
64 | },
65 | "file_extension": ".py",
66 | "mimetype": "text/x-python",
67 | "name": "python",
68 | "nbconvert_exporter": "python",
69 | "pygments_lexer": "ipython3",
70 | "version": "3.9.9"
71 | },
72 | "orig_nbformat": 4
73 | },
74 | "nbformat": 4,
75 | "nbformat_minor": 2
76 | }
77 |
--------------------------------------------------------------------------------
/neuron-viewer/src/types.ts:
--------------------------------------------------------------------------------
1 | import { scaleLinear } from "d3-scale"
2 | import { min, max, flatten } from "lodash"
3 |
// Identifies a neuron by its layer index and position within the layer.
export type Neuron = {
  layer: number;
  neuron: number;
}

// A single token with its raw activation and, once normalizeTokenActs has run,
// its [0, 1]-normalized value.
export type TokenAndActivation = {
  token: string,
  activation: number
  normalized_activation?: number
}

// One tokenized text sequence with per-token activations.
export type TokenSequence = TokenAndActivation[]
16 |
// Add a `normalized_activation` to every token across all of the given datasets,
// scaling by the global maximum activation so every sequence shares one color scale.
// The nested input shape (datasets -> sequences -> tokens) is preserved.
// NOTE(review): the clamping below only affects the max used for scaling; the
// returned tokens keep their original activations, so normalized_activation can be
// negative for negative inputs — confirm downstream rendering expects that.
export const normalizeTokenActs = (...sequences: TokenSequence[][]) => {
  // console.log('sequences', sequences)
  let flattened: TokenAndActivation[] = flatten(flatten(sequences))
  // Replace all activations less than 0 in data.tokens with 0. This matches the format in the
  // top + random activation records displayed in the main grid.
  flattened = flattened.map(({token, activation}) => {
    return {
      token,
      activation: Math.max(activation, 0)
    }
  })
  // Fall back to 0 when there are no tokens at all (max of [] is undefined).
  const maxActivation = max(flattened.map((ta) => ta.activation)) || 0;
  const neuronScale = scaleLinear()
    // Even though we're only displaying positive activations, we still need to scale in a way that
    // accounts for the existence of negative activations, since our color scale includes them.
    .domain([0, maxActivation])
    .range([0, 1])

  return sequences.map((seq) => seq.map((tas) => tas.map(({ token, activation }) => ({
    token,
    activation,
    normalized_activation: neuronScale(activation),
  }))))
}
41 |
42 | export type Color = {r: number, g: number, b: number};
43 | export function interpolateColor(color_l: Color, color_r: Color, value: number) {
44 | const color = {
45 | r: Math.round(color_l.r + (color_r.r - color_l.r) * value),
46 | g: Math.round(color_l.g + (color_r.g - color_l.g) * value),
47 | b: Math.round(color_l.b + (color_r.b - color_l.b) * value),
48 | }
49 | return color
50 | }
51 |
// Map `value` onto a piecewise-linear gradient defined by `colors` and the sorted
// `boundaries` between them (one boundary per color).
export function getInterpolatedColor(colors: Color[], boundaries: number[], value: number) {
  // Index of the first boundary at or above value (-1 when value exceeds them all).
  const index = boundaries.findIndex((boundary) => boundary >= value)
  // Segment to interpolate within; Math.max folds the "not found" (-1) and
  // first-boundary cases into segment 0, so out-of-range values extrapolate
  // from the first segment. NOTE(review): callers appear to pass values already
  // normalized into the boundary range — confirm.
  const colorIndex = Math.max(0, index - 1)
  const color_left = colors[colorIndex]
  const color_right = colors[colorIndex + 1]
  const boundary_left = boundaries[colorIndex]
  const boundary_right = boundaries[colorIndex + 1]
  // Fractional position of value within the segment, then mix the segment's colors.
  const ratio = (value - boundary_left) / (boundary_right - boundary_left)
  const color = interpolateColor(color_left, color_right, ratio)
  return color
}
63 |
// White -> green gradient used for activation heatmaps.
export const DEFAULT_COLORS = [
  // { r: 255, g: 0, b: 105 },
  { r: 255, g: 255, b: 255 },
  { r: 0, g: 255, b: 0 },
]
// Gradient stops corresponding 1:1 to DEFAULT_COLORS.
export const DEFAULT_BOUNDARIES = [
  // 0, 0.5, 1
  0, 1
]
73 |
74 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | import pytest
4 |
5 | from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass
6 |
7 |
8 | # Inheritance is a bit tricky with our setup. dataclass_name must be set for instances of these
9 | # classes to serialize and deserialize correctly, but if it's given a default value, then subclasses
10 | # can't have any fields that don't have default values, because of how constructors are generated
11 | # for dataclasses (fields with no default value can't follow those with default values). To work
12 | # around this, we set dataclass_name in __post_init__ on the base class, which is called after the
13 | # constructor. The implementation does the right thing for both the base class and the subclass.
@register_dataclass
@dataclass
class DataclassC(FastDataclass):
    # Leaf fixture: a single list-of-ints field.
    ints: list[int]


@register_dataclass
@dataclass
class DataclassC_ext(DataclassC):
    # Subclass fixture: adds a required (non-defaulted) field to DataclassC,
    # exercising the inheritance setup described in the comment above.
    s: str


@register_dataclass
@dataclass
class DataclassB(FastDataclass):
    # Container fixture: nests DataclassC values inside a dict and a list.
    str_to_c: dict[str, DataclassC]
    cs: list[DataclassC]


@register_dataclass
@dataclass
class DataclassA(FastDataclass):
    # Top-level fixture mixing primitive lists with nested DataclassB values.
    floats: list[float]
    strings: list[str]
    bs: list[DataclassB]


@register_dataclass
@dataclass
class DataclassD(FastDataclass):
    # Fixture with a defaulted field, used for field-subset matching tests.
    s1: str
    s2: str = "default"
46 |
47 |
def test_dataclasses() -> None:
    """Round-trip: a deeply nested FastDataclass tree survives dumps() -> loads() intact."""
    a = DataclassA(
        floats=[1.0, 2.0],
        strings=["a", "b"],
        bs=[
            DataclassB(
                str_to_c={"a": DataclassC(ints=[1, 2]), "b": DataclassC(ints=[3, 4])},
                cs=[DataclassC(ints=[5, 6]), DataclassC_ext(ints=[7, 8], s="s")],
            ),
            DataclassB(
                str_to_c={"c": DataclassC_ext(ints=[9, 10], s="t"), "d": DataclassC(ints=[11, 12])},
                cs=[DataclassC(ints=[13, 14]), DataclassC(ints=[15, 16])],
            ),
        ],
    )
    assert loads(dumps(a)) == a
64 |
65 |
def test_c_and_c_ext() -> None:
    """Round-trip a subclass instance and its base class independently."""
    c_ext = DataclassC_ext(ints=[3, 4], s="s")
    assert loads(dumps(c_ext)) == c_ext

    c = DataclassC(ints=[1, 2])
    assert loads(dumps(c)) == c
72 |
73 |
def test_bad_serialized_data() -> None:
    """Exercise loads() on data with missing/unknown names and extra or absent fields."""
    assert type(loads(dumps(DataclassC(ints=[3, 4])))) == DataclassC
    # Without a dataclass_name tag and with backwards compatibility off, a plain dict comes back.
    assert type(loads('{"ints": [3, 4]}', backwards_compatible=False)) == dict
    assert type(loads('{"ints": [3, 4], "dataclass_name": "DataclassC"}')) == DataclassC
    # Unknown extra fields cannot be passed to the dataclass constructor.
    with pytest.raises(TypeError):
        loads('{"ints": [3, 4], "bogus_extra_field": "foo", "dataclass_name": "DataclassC"}')
    # A missing required field also fails at construction time.
    with pytest.raises(TypeError):
        loads('{"ints_field_is_missing": [3, 4], "dataclass_name": "DataclassC"}')
    # Field-set fallback: {"s1"} is a subset of DataclassD's fields, so the
    # backwards-compatible path reconstructs DataclassD (s2 takes its default).
    assert type(loads('{"s1": "test"}', backwards_compatible=False)) == dict
    assert type(loads('{"s1": "test"}', backwards_compatible=True)) == DataclassD
84 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:
--------------------------------------------------------------------------------
1 | # Utilities for dataclasses that are very fast to serialize and deserialize, with limited data
2 | # validation. Fields must not be tuples, since they get serialized and then deserialized as lists.
3 | #
4 | # The unit tests for this library show how to use it.
5 |
6 | import json
7 | from dataclasses import dataclass, field, fields, is_dataclass
8 | from functools import partial
9 | from typing import Any, Union
10 |
11 | import orjson
12 |
# Registered classes indexed by class name (used when serialized data carries "dataclass_name").
dataclasses_by_name = {}
# Registered classes indexed by frozenset of field names (fallback lookup for
# legacy data serialized without "dataclass_name").
dataclasses_by_fieldnames = {}
15 |
16 |
@dataclass
class FastDataclass:
    """Base class for dataclasses with fast, name-tagged (de)serialization.

    dataclass_name is excluded from __init__ and filled in automatically so that
    subclasses can still declare required (non-defaulted) fields.
    """

    dataclass_name: str = field(init=False)

    def __post_init__(self) -> None:
        # Record the concrete class name so loads() can reconstruct the right class.
        self.dataclass_name = self.__class__.__name__
23 |
24 |
def register_dataclass(cls):  # type: ignore
    """Class decorator: make `cls` discoverable by loads(), by name and by field set."""
    assert is_dataclass(cls), "Only dataclasses can be registered."
    # Name-based registry, used when serialized data carries "dataclass_name".
    dataclasses_by_name[cls.__name__] = cls
    # Field-set registry, the fallback for data serialized without the tag;
    # "dataclass_name" itself is excluded since such data never contains it.
    field_names = frozenset(f.name for f in fields(cls) if f.name != "dataclass_name")
    dataclasses_by_fieldnames[field_names] = cls
    return cls
31 |
32 |
def dumps(obj: Any) -> bytes:
    """Serialize `obj` (dataclasses, numpy arrays, builtins) to JSON bytes via orjson."""
    return orjson.dumps(obj, option=orjson.OPT_SERIALIZE_NUMPY)
35 |
36 |
def _object_hook(d: Any, backwards_compatible: bool = True) -> Any:
    """json object_hook that rebuilds registered FastDataclass instances.

    Resolution order: explicit "dataclass_name" tag, then (if backwards_compatible)
    an exact field-set match, then a field-subset match. Anything unresolved is
    returned as a plain dict; lists and scalars pass through (recursively).
    """
    # If d is a list, recurse.
    if isinstance(d, list):
        return [_object_hook(x, backwards_compatible=backwards_compatible) for x in d]
    # If d is not a dict, return it as is.
    if not isinstance(d, dict):
        return d
    cls = None
    if "dataclass_name" in d:
        if d["dataclass_name"] in dataclasses_by_name:
            cls = dataclasses_by_name[d["dataclass_name"]]
        else:
            # Unknown class name: only tolerated in backwards-compatible mode,
            # in which case the data falls through to a plain dict below.
            assert backwards_compatible, (
                f"Dataclass {d['dataclass_name']} not found, set backwards_compatible=True if you "
                f"are okay with that."
            )
    # Load objects created without dataclass_name set.
    else:
        # Try our best to find a dataclass if backwards_compatible is True.
        if backwards_compatible:
            d_fields = frozenset(d.keys())
            if d_fields in dataclasses_by_fieldnames:
                cls = dataclasses_by_fieldnames[d_fields]
            elif len(d_fields) > 0:
                # Check if the fields are a subset of a dataclass (if the dataclass had extra fields
                # added since the data was created). Note that this will fail if fields were removed
                # from the dataclass.
                for key, possible_cls in dataclasses_by_fieldnames.items():
                    if d_fields.issubset(key):
                        cls = possible_cls
                        break
                else:
                    # for/else: runs only when no registered field set matched at all.
                    print(f"Could not find dataclass for {d_fields} {cls}")
    # Recurse into values; drop the tag key since dataclass_name is init=False
    # on FastDataclass and must not be passed to the constructor.
    new_d = {
        k: _object_hook(v, backwards_compatible=backwards_compatible)
        for k, v in d.items()
        if k != "dataclass_name"
    }
    if cls is not None:
        return cls(**new_d)
    else:
        return new_d
79 |
80 |
def loads(s: Union[str, bytes], backwards_compatible: bool = True) -> Any:
    """Deserialize JSON produced by dumps(), reconstructing registered dataclasses."""
    hook = partial(_object_hook, backwards_compatible=backwards_compatible)
    return json.loads(s, object_hook=hook)
86 |
--------------------------------------------------------------------------------
/neuron-explainer/demos/generate_and_score_explanation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "%autoreload 2"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import os\n",
20 | "\n",
21 | "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n",
22 | "\n",
23 | "from neuron_explainer.activations.activation_records import calculate_max_activation\n",
24 | "from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\n",
25 | "from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\n",
26 | "from neuron_explainer.explanations.explainer import TokenActivationPairExplainer\n",
27 | "from neuron_explainer.explanations.prompt_builder import PromptFormat\n",
28 | "from neuron_explainer.explanations.scoring import simulate_and_score\n",
29 | "from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\n",
30 | "\n",
31 | "EXPLAINER_MODEL_NAME = \"gpt-4\"\n",
32 | "SIMULATOR_MODEL_NAME = \"text-davinci-003\"\n",
33 | "\n",
34 | "\n",
35 | "# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n",
36 | "# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n",
37 | "\n",
38 | "# Load a neuron record.\n",
39 | "neuron_record = load_neuron(9, 6236)\n",
40 | "\n",
41 | "# Grab the activation records we'll need.\n",
42 | "slice_params = ActivationRecordSliceParams(n_examples_per_split=5)\n",
43 | "train_activation_records = neuron_record.train_activation_records(\n",
44 | " activation_record_slice_params=slice_params\n",
45 | ")\n",
46 | "valid_activation_records = neuron_record.valid_activation_records(\n",
47 | " activation_record_slice_params=slice_params\n",
48 | ")\n",
49 | "\n",
50 | "# Generate an explanation for the neuron.\n",
51 | "explainer = TokenActivationPairExplainer(\n",
52 | " model_name=EXPLAINER_MODEL_NAME,\n",
53 | " prompt_format=PromptFormat.HARMONY_V4,\n",
54 | " max_concurrent=1,\n",
55 | ")\n",
56 | "explanations = await explainer.generate_explanations(\n",
57 | " all_activation_records=train_activation_records,\n",
58 | " max_activation=calculate_max_activation(train_activation_records),\n",
59 | " num_samples=1,\n",
60 | ")\n",
61 | "assert len(explanations) == 1\n",
62 | "explanation = explanations[0]\n",
63 | "print(f\"{explanation=}\")\n",
64 | "\n",
65 | "# Simulate and score the explanation.\n",
66 | "simulator = UncalibratedNeuronSimulator(\n",
67 | " ExplanationNeuronSimulator(\n",
68 | " SIMULATOR_MODEL_NAME,\n",
69 | " explanation,\n",
70 | " max_concurrent=1,\n",
71 | " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n",
72 | " )\n",
73 | ")\n",
74 | "scored_simulation = await simulate_and_score(simulator, valid_activation_records)\n",
75 | "print(f\"score={scored_simulation.get_preferred_score():.2f}\")\n"
76 | ]
77 | }
78 | ],
79 | "metadata": {
80 | "kernelspec": {
81 | "display_name": "openai",
82 | "language": "python",
83 | "name": "python3"
84 | },
85 | "language_info": {
86 | "codemirror_mode": {
87 | "name": "ipython",
88 | "version": 3
89 | },
90 | "file_extension": ".py",
91 | "mimetype": "text/x-python",
92 | "name": "python",
93 | "nbconvert_exporter": "python",
94 | "pygments_lexer": "ipython3",
95 | "version": "3.9.9"
96 | },
97 | "orig_nbformat": 4
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 2
101 | }
102 |
--------------------------------------------------------------------------------
/neuron-viewer/src/panes/datasetList.jsx:
--------------------------------------------------------------------------------
1 | import HeatmapGrid from "../heatmapGrid"
2 | import React, { useEffect, useState } from "react"
3 | import { normalizeTokenActs } from "../types"
4 |
5 | import {get_neuron_record} from "../interpAPI"
6 |
// Convert parallel {tokens, activations} arrays into per-token objects,
// producing one inner array of {token, activation} pairs per sequence.
function zip_sequences(sequences) {
  return sequences.map(({ activations, tokens }) =>
    tokens.map((token, position) => ({
      token,
      activation: activations[position],
    }))
  )
}
15 |
// Pane listing dataset example sequences for the active neuron, grouped by
// activation quantile, with a per-group "show more" toggle.
// NOTE(review): the JSX markup in this copy appears mangled (tags stripped by
// extraction); comments are added only at plain-JS positions.
export default ({ activeNeuron }) => {
  const [data, setData] = useState(null)
  const [showingMore, setShowingMore] = useState({})
  const [isLoading, setIsLoading] = useState(true)

  useEffect(() => {
    async function fetchData() {
      // NOTE(review): once `data` is set this early-return prevents any refetch,
      // even though the effect re-runs when `activeNeuron` changes — confirm
      // whether stale data on neuron change is intended.
      if (data) {
        return
      }
      const result = await get_neuron_record(activeNeuron)
      console.log(result)
      // Build the display groups: top activations, then sampled quantile
      // ranges (highest to lowest), then a fully random sample.
      const all_sequences = []
      all_sequences.push({
        // label: '[0.999, 1] (Top quantile, sorted. 50 of 50000)',
        label: 'Top',
        sequences: zip_sequences(result.most_positive_activation_records),
        default_show: 4,
      })
      all_sequences.push({
        label: 'Quantile range [0.99, 0.999] sample',
        sequences: zip_sequences(result.random_sample_by_quantile[3]),
        default_show: 1,
      })
      all_sequences.push({
        label: 'Quantile range [0.9, 0.99] sample',
        sequences: zip_sequences(result.random_sample_by_quantile[2]),
        default_show: 1,
      })
      all_sequences.push({
        label: 'Quantile range [0.5, 0.9] sample',
        sequences: zip_sequences(result.random_sample_by_quantile[1]),
        default_show: 1,
      })
      all_sequences.push({
        label: 'Quantile range [0, 0.5] sample',
        sequences: zip_sequences(result.random_sample_by_quantile[0]),
        default_show: 1,
      })
      all_sequences.push({
        // label: '[0, 1] (Random)',
        label: 'Random sample',
        sequences: zip_sequences(result.random_sample),
        default_show: 2,
      })
      // for reference
      // intervals = [(0, 1), (0, 0.5), (0.5, 0.9), (0.9, 0.99), (0.99, 0.999), (0.999, 1)]
      // saved_activations_by_interval = [neuron_record.random_sample] + neuron_record.random_sample_by_decile[:-1] + [neuron_record.top_activations]
      setData(all_sequences)
      setIsLoading(false)
    }
    fetchData()
  }, [activeNeuron])

  if (isLoading) {
    return (



loading top dataset examples

    )
  }

  // const activations = data.top_activations;
  // Normalize all groups together so colors are comparable across groups.
  const all_normalized_sequences = normalizeTokenActs(...data.map(({sequences}) => sequences))

  return (


Activations
      {
        data.map(({label, default_show}, idx) => {
          // Either the group's default count or the full list when expanded.
          const n_show = showingMore[label] ? all_normalized_sequences[idx].length : default_show;
          return (


              {label}




          )
        })
      }

  )
}
105 |
--------------------------------------------------------------------------------
/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "%load_ext autoreload\n",
10 | "%autoreload 2"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": null,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import os\n",
20 | "\n",
21 | "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n",
22 | "\n",
23 | "from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\n",
24 | "from neuron_explainer.activations.token_connections import load_token_lookup_table_connections_of_neuron\n",
25 | "from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\n",
26 | "from neuron_explainer.explanations.explainer import TokenSpaceRepresentationExplainer\n",
27 | "from neuron_explainer.explanations.prompt_builder import PromptFormat\n",
28 | "from neuron_explainer.explanations.scoring import simulate_and_score\n",
29 | "from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\n",
30 | "\n",
31 | "EXPLAINER_MODEL_NAME = \"gpt-4\"\n",
32 | "SIMULATOR_MODEL_NAME = \"text-davinci-003\"\n",
33 | "\n",
34 | "\n",
35 | "# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n",
36 | "# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n",
37 | "\n",
38 | "layer_index = 9\n",
39 | "neuron_index = 6236\n",
40 | "\n",
41 | "# Load a token lookup table.\n",
42 | "token_lookup_table = load_token_lookup_table_connections_of_neuron(layer_index, neuron_index)\n",
43 | "\n",
44 | "# Load a neuron record.\n",
45 | "neuron_record = load_neuron(layer_index, neuron_index)\n",
46 | "\n",
47 | "# Grab the activation records we'll need.\n",
48 | "slice_params = ActivationRecordSliceParams(n_examples_per_split=5)\n",
49 | "valid_activation_records = neuron_record.valid_activation_records(\n",
50 | " activation_record_slice_params=slice_params\n",
51 | ")\n",
52 | "\n",
53 | "# Generate an explanation for the neuron.\n",
54 | "explainer = TokenSpaceRepresentationExplainer(\n",
55 | " model_name=EXPLAINER_MODEL_NAME,\n",
56 | " prompt_format=PromptFormat.HARMONY_V4,\n",
57 | " max_concurrent=1,\n",
58 | ")\n",
59 | "explanations = await explainer.generate_explanations(\n",
60 | " tokens=token_lookup_table.tokens,\n",
61 | " num_samples=1,\n",
62 | ")\n",
63 | "assert len(explanations) == 1\n",
64 | "explanation = explanations[0]\n",
65 | "print(f\"{explanation=}\")\n",
66 | "\n",
67 | "# Simulate and score the explanation.\n",
68 | "simulator = UncalibratedNeuronSimulator(\n",
69 | " ExplanationNeuronSimulator(\n",
70 | " SIMULATOR_MODEL_NAME,\n",
71 | " explanation,\n",
72 | " max_concurrent=1,\n",
73 | " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n",
74 | " )\n",
75 | ")\n",
76 | "scored_simulation = await simulate_and_score(simulator, valid_activation_records)\n",
77 | "print(f\"score={scored_simulation.get_preferred_score():.2f}\")\n"
78 | ]
79 | }
80 | ],
81 | "metadata": {
82 | "kernelspec": {
83 | "display_name": "Python 3",
84 | "language": "python",
85 | "name": "python3"
86 | },
87 | "language_info": {
88 | "codemirror_mode": {
89 | "name": "ipython",
90 | "version": 3
91 | },
92 | "file_extension": ".py",
93 | "mimetype": "text/x-python",
94 | "name": "python",
95 | "nbconvert_exporter": "python",
96 | "pygments_lexer": "ipython3",
97 | "version": "3.9.8"
98 | },
99 | "vscode": {
100 | "interpreter": {
101 | "hash": "fd71fb58b1ad02dde67c8ac595a52586dd87d3465221a699fc288aa2c48d5565"
102 | }
103 | }
104 | },
105 | "nbformat": 4,
106 | "nbformat_minor": 2
107 | }
108 |
--------------------------------------------------------------------------------
/neuron-viewer/src/panes/similarNeurons.jsx:
--------------------------------------------------------------------------------
1 | import React, { useEffect, useState } from "react"
2 | import _ from "lodash"
3 | import { Link } from "react-router-dom"
4 |
5 | import { get_explanations, get_top_neuron_connections } from "../interpAPI"
6 |
// Card showing one connected neuron: a link, its connection strength, and its
// scored explanations (fetched lazily on mount).
// NOTE(review): the JSX markup in this copy appears mangled (tags stripped by
// extraction); comments are added only at plain-JS positions.
function NeuronInfo({ neuron, strength }) {
  const [info, setInfo] = useState(null)

  useEffect(() => {
    async function fetchInfo() {
      const result = (await get_explanations({
        layer: neuron.layer,
        neuron: neuron.neuron,
      }))
      setInfo(result)
    }

    if (!info) {
      fetchInfo()
    }
    // NOTE(review): empty dependency array — a changed `neuron` prop will not
    // trigger a refetch; confirm callers always remount this component.
  }, [])

  if (!info) {
    return (

        Loading neuron {neuron.layer}:{neuron.neuron}...


    )
  }

  return (


      
        Neuron {neuron.layer}:{neuron.neuron}
      

      
        Connection strength: {strength.toFixed(2)}
      
      
        {info.scored_explanations.map((explanation, i) => (
          
            
              {explanation.explanation}
            
            
              score: {explanation.scored_simulation.ev_correlation_score.toFixed(2)}
            
          
        ))}
      


  )
}
63 |
// Pane listing the most strongly weight-connected neurons, split into
// upstream ("input") and downstream ("output") groups of up to `n_show` each.
// NOTE(review): the JSX markup in this copy appears mangled (tags stripped by
// extraction); comments are added only at plain-JS positions.
export default function SimilarNeurons({ activeNeuron: neuron }) {
  const [similarNeurons, setSimilarNeurons] = useState([])
  const [isLoading, setIsLoading] = useState(true)

  useEffect(() => {
    async function fetchSimilarNeurons() {
      const result = await get_top_neuron_connections(neuron)
      setSimilarNeurons(result)
      setIsLoading(false)
    }

    fetchSimilarNeurons()
  }, [neuron])

  if (isLoading) {
    return (

    )
  }

  // How many connected neurons to show per direction.
  const n_show = 3;
  return (


Related neurons


      {
        similarNeurons.input ?


Upstream

          {similarNeurons.input.slice(0, n_show).map(([layer, neuron, strength]) => (

          ))}


          : null
      }
      {
        similarNeurons.output ?


Downstream

          {similarNeurons.output.slice(0, n_show).map(([layer, neuron, strength]) => (

          ))}


          : null
      }



  )
}
119 |
--------------------------------------------------------------------------------
/neuron-viewer/src/simulationHeatmap.tsx:
--------------------------------------------------------------------------------
1 | import React, { useState } from 'react';
2 |
3 | import { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types'
4 |
// Props for SimulationSequences: real and simulated token/activation
// sequences (parallel arrays), plus optional colour-scale overrides.
type Props = {
  sequences: TokenAndActivation[][],
  simulated_sequences: TokenAndActivation[][],
  overlay_activations: boolean,
  colors?: Color[],
  boundaries?: number[],
}
// Renders real vs. simulated activations, either overlaid per token or as two
// separate rows per sequence.
// NOTE(review): the JSX markup in this copy appears mangled (tags stripped by
// extraction); comments are added only at plain-JS positions.
export default function SimulationSequences({ sequences, simulated_sequences, overlay_activations, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) {
  return <>
    {
      sequences.map((tokens, i) => {
        // Simulated sequence is expected to align token-for-token with the real one.
        let simulated_tokens = simulated_sequences[i];
        if (overlay_activations) {
          return (

              {tokens.map(({ token, activation, normalized_activation }, j) => {
                const { token: simulated_token, activation: simulated_activation, normalized_activation: simulated_normalized_activation } = simulated_tokens[j];
                if (simulated_token !== token) {
                  throw new Error('simulated tokens not matching')
                }
                // Prefer normalized activations for colouring when available.
                const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
                const simcolor = getInterpolatedColor(colors, boundaries, simulated_normalized_activation || simulated_activation);

                return

                    {token}
                    {token}


              })}

        )
      } else {
          return (


            Real activations:
            {tokens.map(({ token, activation, normalized_activation }, j) => {
              const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
              return {token}
            })}



            Simulated activations:
            {simulated_tokens.map(({ token, activation, normalized_activation }, j) => {
              const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
              return {token}
            })}


          )
        }
      })
    }
  >
}
96 |
--------------------------------------------------------------------------------
/neuron-viewer/src/panes/topTokens.jsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from "react"
2 | import { get_top_tokens } from "../interpAPI"
3 |
4 |
// Pane showing tokens related to the active neuron: activation-based (mean
// activation per token) and weight-based (input/output, positive/negative).
// NOTE(review): the JSX markup in this copy appears mangled (tags stripped by
// extraction); comments are added only at plain-JS positions.
const TokenDisplay = ({ activeNeuron }) => {
  const [isLoading, setIsLoading] = useState(true)
  const [data, setData] = useState(null)

  // Fetch both token summaries for the neuron and cache them in state.
  const loadTokens = async () => {
    setIsLoading(true)
    const weightStrengths = await get_top_tokens(activeNeuron, 'weight')
    const activationStrengths = await get_top_tokens(activeNeuron, 'activation')

    const data = {
      activeNeuron,
      weightStrengths,
      activationStrengths,
    }

    setData(data)
    setIsLoading(false)
  }

  useEffect(() => {
    if (!data) {
      loadTokens()
    }
    // NOTE(review): empty dependency array — a changed `activeNeuron` prop will
    // not trigger a reload; confirm callers remount this component per neuron.
  }, [])


  return (


Related tokens
      {isLoading ? (


      ) : (
        <>
Mean-activation-based

        {data.activationStrengths.tokens.map((token, idx) => {
          return (
            data.activationStrengths.average_activations[idx] === null ? null :

              {token}

          )
        })}

Weight-based


Input tokens:
        {data.weightStrengths.input_positive.tokens.slice(0, 20).map((token, idx) => {
          return (
            data.weightStrengths.input_positive.strengths[idx] === null ? null :

              {token}

          )
        })}

        {


Input tokens negative:
        {data.weightStrengths.input_negative.tokens.slice(0, 20).map((token, idx) => {
          return (
            data.weightStrengths.input_negative.strengths[idx] === null ? null :

              {token}

          )
        })}

        }


Output tokens:
        {data.weightStrengths.output_positive.tokens.slice(0, 20).map((token, idx) => {
          return (
            data.weightStrengths.output_positive.strengths[idx] === null ? null :

              {token}

          )
        })}

        {


Output tokens negative:
        {data.weightStrengths.output_negative.tokens.slice(0, 20).map((token, idx) => {
          return (

              {token}

          )
        })}

        }
        >
      )}

  )
}
export default TokenDisplay
125 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from enum import Enum
4 | from typing import TypedDict, Union
5 |
6 | import tiktoken
7 |
# A single chat message in the role/content shape accepted by the
# /chat/completions endpoint ("Harmony" format). Functional TypedDict syntax
# is used because "HarmonyMessage" instances are built with keyword calls below.
HarmonyMessage = TypedDict(
    "HarmonyMessage",
    {
        "role": str,
        "content": str,
    },
)
15 |
16 |
class PromptFormat(str, Enum):
    """
    Different ways of formatting the components of a prompt into the format accepted by the relevant
    API server endpoint.
    """

    NONE = "none"
    """Suitable for use with models that don't use special tokens for instructions."""
    INSTRUCTION_FOLLOWING = "instruction_following"
    """Suitable for IF models that use <|endofprompt|>."""
    HARMONY_V4 = "harmony_v4"
    """
    Suitable for Harmony models that use a structured turn-taking role+content format. Generates a
    list of HarmonyMessage dicts that can be sent to the /chat/completions endpoint.
    """

    @classmethod
    def from_string(cls, s: str) -> PromptFormat:
        """Return the member whose value equals `s`, raising ValueError if none matches."""
        matched = next((member for member in cls if member.value == s), None)
        if matched is None:
            raise ValueError(f"{s} is not a valid PromptFormat")
        return matched
39 |
40 |
class Role(str, Enum):
    """See https://platform.openai.com/docs/guides/chat"""

    # Values match the role strings accepted by the /chat/completions endpoint.
    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"
47 |
48 |
class PromptBuilder:
    """Collects prompt components and renders them into the requested output format."""

    def __init__(self) -> None:
        self._messages: list[HarmonyMessage] = []

    def add_message(self, role: Role, message: str) -> None:
        """Append one message with the given role to the prompt under construction."""
        self._messages.append(HarmonyMessage(role=role, content=message))

    def prompt_length_in_tokens(self, prompt_format: PromptFormat) -> int:
        """Return the (approximate, for Harmony) token length of the current prompt."""
        # TODO(sbills): Make the model/encoding configurable. This implementation assumes GPT-4.
        encoding = tiktoken.get_encoding("cl100k_base")
        if prompt_format == PromptFormat.HARMONY_V4:
            # Approximately-correct accounting adapted from this documentation:
            # https://platform.openai.com/docs/guides/chat/introduction
            # Every message follows <|im_start|>{role/name}\n{content}<|im_end|>\n (4 tokens of
            # overhead), and every reply is primed with <|im_start|>assistant (2 tokens).
            overhead_per_message = 4
            content_tokens = sum(
                overhead_per_message
                + len(encoding.encode(message["content"], allowed_special="all"))
                for message in self._messages
            )
            return content_tokens + 2
        else:
            prompt_str = self.build(prompt_format)
            assert isinstance(prompt_str, str)
            return len(encoding.encode(prompt_str, allowed_special="all"))

    def build(
        self, prompt_format: PromptFormat, *, allow_extra_system_messages: bool = False
    ) -> Union[str, list[HarmonyMessage]]:
        """
        Validates the messages added so far (reasonable alternation of assistant vs. user, etc.)
        and returns either a regular string (maybe with <|endofprompt|> tokens) or a list of
        HarmonyMessages suitable for use with the /chat/completions endpoint.

        The `allow_extra_system_messages` parameter allows the caller to specify that the prompt
        should be allowed to contain system messages after the very first one.
        """
        # Work on copies so neither the caller nor this method can mutate internal state.
        messages = [dict(message) for message in self._messages]

        # Enforce alternation: a system message first, then user/assistant taking turns.
        next_role_after = {
            Role.SYSTEM: Role.USER,
            Role.USER: Role.ASSISTANT,
            Role.ASSISTANT: Role.USER,
        }
        expected_next_role = Role.SYSTEM
        for message in messages:
            role = message["role"]
            assert role == expected_next_role or (
                allow_extra_system_messages and role == Role.SYSTEM
            ), f"Expected message from {expected_next_role} but got message from {role}"
            expected_next_role = next_role_after.get(role, expected_next_role)

        if prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
            # IF models mark the end of the prompt by appending <|endofprompt|> to the
            # final user message.
            last_user_message = next(
                (message for message in reversed(messages) if message["role"] == Role.USER),
                None,
            )
            assert last_user_message is not None
            last_user_message["content"] += "<|endofprompt|>"

        if prompt_format == PromptFormat.HARMONY_V4:
            return messages
        elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
            return "".join(message["content"] for message in messages)
        else:
            raise ValueError(f"Unknown prompt format: {prompt_format}")
119 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/activation_records.py:
--------------------------------------------------------------------------------
1 | """Utilities for formatting activation records into prompts."""
2 |
3 | import math
4 | from typing import Optional, Sequence
5 |
6 | from neuron_explainer.activations.activations import ActivationRecord
7 |
8 | UNKNOWN_ACTIVATION_STRING = "unknown"
9 |
10 |
def relu(x: float) -> float:
    """Rectified linear unit: clamp negative values to zero."""
    return x if x > 0.0 else 0.0
13 |
14 |
def calculate_max_activation(activation_records: Sequence[ActivationRecord]) -> float:
    """Return the maximum activation value of the neuron across all the activation records."""
    # Values below zero are treated as the neuron's resting state (clamped to 0),
    # a simplifying assumption that works with relu/gelu.
    per_record_peaks = (
        max(value if value > 0.0 else 0.0 for value in record.activations)
        for record in activation_records
    )
    return max(per_record_peaks)
24 |
25 |
def normalize_activations(activation_record: list[float], max_activation: float) -> list[int]:
    """Convert raw neuron activations to integers on the range [0, 10]."""
    if max_activation <= 0:
        # Nothing to scale against: every activation maps to the minimum bucket.
        return [0] * len(activation_record)
    scaled = []
    for value in activation_record:
        # Values below zero are treated as the neuron's resting state (clamped
        # to 0), a simplifying assumption that works with relu/gelu.
        resting_clamped = value if value > 0.0 else 0.0
        scaled.append(min(10, math.floor(10 * resting_clamped / max_activation)))
    return scaled
33 |
34 |
def _format_activation_record(
    activation_record: ActivationRecord,
    max_activation: float,
    omit_zeros: bool,
    hide_activations: bool = False,
    start_index: int = 0,
) -> str:
    """Format neuron activations into a string, suitable for use in prompts.

    Each output line is "<token>\\t<activation 0-10>"; activations are replaced
    with UNKNOWN_ACTIVATION_STRING when hidden or before `start_index`.
    """
    tokens = activation_record.tokens
    normalized_activations = normalize_activations(activation_record.activations, max_activation)
    if omit_zeros:
        # Dropping zero-activation tokens is incompatible with hiding activations
        # or starting partway through the record.
        assert (not hide_activations) and start_index == 0, "Can't hide activations and omit zeros"
        kept_pairs = [
            (token, level)
            for token, level in zip(tokens, normalized_activations)
            if level > 0
        ]
        tokens = [token for token, _ in kept_pairs]
        normalized_activations = [level for _, level in kept_pairs]

    assert len(tokens) == len(normalized_activations)
    lines = []
    for position, (token, level) in enumerate(zip(tokens, normalized_activations)):
        if hide_activations or position < start_index:
            shown = UNKNOWN_ACTIVATION_STRING
        else:
            shown = str(int(level))
        lines.append(f"{token}\t{shown}")
    return "\n".join(lines)
60 |
61 |
def format_activation_records(
    activation_records: Sequence[ActivationRecord],
    max_activation: float,
    *,
    omit_zeros: bool = False,
    start_indices: Optional[list[int]] = None,
    hide_activations: bool = False,
) -> str:
    """Format a list of activation records into a string.

    Each record is wrapped in <start>/<end> delimiter lines so individual
    excerpts can be told apart in the prompt.
    """
    # NOTE(review): the delimiter string literals in this copy of the file were
    # garbled (the "<start>"/"<end>" tokens were stripped, leaving an
    # unterminated literal). Restored here; confirm against the canonical
    # prompt format used by the simulator.
    return (
        "\n<start>\n"
        + "\n<end>\n<start>\n".join(
            [
                _format_activation_record(
                    activation_record,
                    max_activation,
                    omit_zeros=omit_zeros,
                    hide_activations=hide_activations,
                    # Without explicit start indices, show activations from the beginning.
                    start_index=0 if start_indices is None else start_indices[i],
                )
                for i, activation_record in enumerate(activation_records)
            ]
        )
        + "\n<end>\n"
    )
87 |
88 |
def _format_tokens_for_simulation(tokens: Sequence[str]) -> str:
    """
    Format tokens into a string with each token marked as having an "unknown" activation, suitable
    for use in prompts.
    """
    # One "<token>\t<unknown>" line per token.
    return "\n".join(f"{token}\t{UNKNOWN_ACTIVATION_STRING}" for token in tokens)
98 |
99 |
def format_sequences_for_simulation(
    all_tokens: Sequence[Sequence[str]],
) -> str:
    """
    Format a list of lists of tokens into a string with each token marked as having an "unknown"
    activation, suitable for use in prompts.

    Each sequence is wrapped in <start>/<end> delimiter lines, matching the
    format produced by format_activation_records.
    """
    # NOTE(review): the "<start>"/"<end>" delimiter tokens appear to have been
    # stripped from the string literals in this copy (leaving bare "\n\n"
    # strings); restored so simulation prompts match the few-shot format.
    return (
        "\n<start>\n"
        + "\n<end>\n<start>\n".join(
            [_format_tokens_for_simulation(tokens) for tokens in all_tokens]
        )
        + "\n<end>\n"
    )
114 |
115 |
def non_zero_activation_proportion(
    activation_records: Sequence[ActivationRecord], max_activation: float
) -> float:
    """Return the proportion of activation values that aren't zero."""
    # Counted after normalization, so small positive raw values that floor to 0
    # are treated as zero.
    total_count = 0
    non_zero_count = 0
    for record in activation_records:
        normalized = normalize_activations(record.activations, max_activation)
        total_count += len(normalized)
        non_zero_count += sum(1 for level in normalized if level != 0)
    return non_zero_count / total_count
131 |
--------------------------------------------------------------------------------
/neuron-viewer/src/interpAPI.ts:
--------------------------------------------------------------------------------
1 | import {Neuron} from './types';
2 | import {memoizeAsync} from "./utils"
3 |
// POST the given blob path to the local dev server's /load_az endpoint
// (same host, port 8000) and return the parsed JSON response.
export const load_file_no_cache = async(path: string) => {
  const data = {
    path: path
  }
  const url = new URL("/load_az", window.location.href)
  // The Python dev server (neuron-viewer/python/server.py) listens on 8000.
  url.port = '8000';
  return await (
    await fetch(url, {
      method: "POST", // or 'PUT'
      headers: {
        "Content-Type": "application/json",
      },
      body: JSON.stringify(data),
    })
  ).json()

}
21 |
// GET a JSON file directly from public blob storage via CORS.
// On a non-2xx response, logs the error and returns undefined (best-effort).
export const load_file_az = async(path: string) => {
  const res = (
    await fetch(path, {
      method: "GET",
      mode: "cors",
      headers: {
        "Content-Type": "application/json",
      },
    })
  )
  if (!res.ok) {
    console.error(`HTTP error: ${res.status} - ${res.statusText}`);
    return;
  }
  return await res.json()
}
38 |
39 |
40 | // export const load_file = memoizeAsync('load_file', load_file_no_cache)
41 | export const load_file = window.location.host.indexOf('localhost:') === -1 ? load_file_az : load_file_no_cache;
42 |
43 |
44 | // # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation)
45 | // const NEURON_RECORDS_PATH = "az://oaisbills/rcall/oss/migrated_make_crow_datasets/gpt2_xl_n_50000_64_token/neurons"
46 | const NEURON_RECORDS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations"
47 |
48 | // # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation/neurons/explanations/canonical-run-v1)
49 | // const EXPLANATIONS_PATH = "az://oaisbills/rcall/oss/migrated_explanation_datasets/canonical_gpt2_xl_all_neurons"
50 | const EXPLANATIONS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/explanations"
51 |
52 | // weight-based
53 | // const WHOLE_LAYER_WEIGHT_TOKENS_PATH = "az://oaidan/rcall/data/interpretability/connections/gpt2-xl/mlp/unnorm_token_representations_uncommon_vanilla"
54 | // const WEIGHT_TOKENS_PATH = "az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/weight-based"
55 | const WEIGHT_TOKENS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based"
56 | // lookup table
57 | // const WHOLE_LAYER_ACTIVATION_TOKENS_PATH = "az://oaidan/rcall/data/interpretability/connections/gpt2_xl/mlp/unnorm_token_representations_vanilla_and_common_in_colangv2_unigram"
58 | // const ACTIVATION_TOKENS_PATH = "az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/lookup-table"
59 | const ACTIVATION_TOKENS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based"
60 |
61 | // const CONNECTIONS_PATH = "az://oaialignment/datasets/interp/connections/gpt2/neuron_space/incl_attn_False"
62 | const CONNECTIONS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-neurons/weight-based"
63 |
64 |
// Fetch the scored explanations JSONL blob for a neuron.
export const get_explanations = async (activeNeuron: Neuron) => {
  const result = await load_file(`${EXPLANATIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.jsonl`)
  return result
}
69 |
70 | export const get_top_tokens = async (activeNeuron: Neuron, weightType: string) => {
71 | let TOKENS_PATH;
72 | if (weightType === 'weight') {
73 | TOKENS_PATH = WEIGHT_TOKENS_PATH;
74 | } else if (weightType === 'activation') {
75 | TOKENS_PATH = ACTIVATION_TOKENS_PATH;
76 | } else {
77 | throw new Error(`Invalid weightType: ${weightType}`)
78 | }
79 | const result = await load_file(`${TOKENS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)
80 | return result
81 | // const result = await load_file_no_cache(`${ORIG_TOKENS_PATH}/${activeNeuron.layer}.json`)
82 | // return result.neuron_summaries[activeNeuron.neuron]
83 | }
84 |
85 | export const get_top_neuron_connections = async (activeNeuron: Neuron) => {
86 | const result = await load_file(`${CONNECTIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)
87 |
88 | const res: {[key: string]: [number, number]} = {};
89 | ["input", "output"].forEach((direction) => {
90 | const sign = "positive" // "negative"
91 | const weight_name: string = {output: "c_proj", input: "c_fc"}[direction] as string;
92 | const res_for_dir = result[weight_name];
93 | if (res_for_dir === null) {
94 | return
95 | }
96 | // let key = 'top_negative_neurons'
97 | const top_neuron_strs = res_for_dir[`top_${sign}_neurons`] // {layer}_{neuron} strings for each top-connected neuron
98 | const top_weights = res_for_dir[`top_${sign}_weights`]
99 | const top_layer_neuron_tuples = top_neuron_strs.map((neuron_str: string, i: number) => {
100 | const [layer, neuron] = neuron_str.split("_").map((x: string) => parseInt(x))
101 | return [layer, neuron, top_weights[i]] as [number, number, number]
102 | })
103 | res[direction] = top_layer_neuron_tuples.slice(0, 10)
104 | })
105 |
106 | return res
107 | }
108 |
// Fetch the collated activation record JSON blob for a neuron.
export const get_neuron_record = async(activeNeuron: Neuron) => {
  const result = await load_file(`${NEURON_RECORDS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)
  return result
}
113 |
114 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from enum import Enum
3 | from typing import List
4 |
5 | from neuron_explainer.fast_dataclasses import FastDataclass
6 |
7 |
@dataclass
class Example(FastDataclass):
    """
    An example list of tokens as strings corresponding to top token space inputs of a neuron, with a
    string explanation of the neuron's behavior on these tokens.
    """

    # Top token-space input strings for the neuron.
    tokens: List[str]
    # Explanation of what the tokens have in common.
    explanation: str
17 |
18 |
class TokenSpaceFewShotExampleSet(Enum):
    """Determines which few-shot examples to use when sampling explanations."""

    ORIGINAL = "original"
    TEST = "test"

    def get_examples(self) -> list[Example]:
        """Returns regular examples for use in a few-shot prompt."""
        # Looked up lazily inside the method because the example lists are
        # defined below this class in the module.
        examples_by_set = {
            TokenSpaceFewShotExampleSet.ORIGINAL: ORIGINAL_EXAMPLES,
            TokenSpaceFewShotExampleSet.TEST: TEST_EXAMPLES,
        }
        if self not in examples_by_set:
            raise ValueError(f"Unhandled example set: {self}")
        return examples_by_set[self]
33 |
34 |
# Hand-curated few-shot examples for explaining token-space neuron representations.
# Each Example pairs a list of tokens (presumably the tokens with the highest average
# activation for some neuron -- confirm against the dataset builder) with a reference
# explanation written in the style the model should imitate. The leading-space tokens
# are intentional: they reflect the tokenizer's whitespace handling.
ORIGINAL_EXAMPLES = [
    # Example 1: "realness"/physicality adjectives.
    Example(
        tokens=[
            "actual",
            " literal",
            " actual",
            " hyper",
            " real",
            " EX",
            " Real",
            "^",
            "Full",
            " full",
            " optical",
            " style",
            "any",
            "ALL",
            "extreme",
            " miniature",
            " Optical",
            " faint",
            "~",
            " Physical",
            " REAL",
            "*",
            "virtual",
            "TYPE",
            " technical",
            "otally",
            " physic",
            "Type",
            "<",
            "images",
            "atic",
            " sheer",
            " Style",
            " partial",
            " natural",
            "Hyper",
            " Any",
            " theoretical",
            "|",
            " ultimate",
            "oing",
            " constant",
            "ANY",
            "antically",
            "ishly",
            " ex",
            " visual",
            "special",
            "omorphic",
            "visual",
        ],
        explanation=" adjectives related to being real, or to physical properties and evidence",
    ),
    # Example 2: medical-condition word fragments.
    Example(
        tokens=[
            "cephal",
            "aeus",
            " coma",
            "bered",
            "abetes",
            "inflamm",
            "rugged",
            "alysed",
            "azine",
            "hered",
            "cells",
            "aneously",
            "fml",
            "igm",
            "culosis",
            "iani",
            "CTV",
            "disabled",
            "heric",
            "ulo",
            "geoning",
            "awi",
            "translation",
            "iral",
            "govtrack",
            "mson",
            "cloth",
            "nesota",
            " Dise",
            " Lyme",
            " dementia",
            "agn",
            " reversible",
            " susceptibility",
            "esthesia",
            "orf",
            " inflamm",
            " Obesity",
            " tox",
            " Disorders",
            "uberty",
            "blind",
            "ALTH",
            "avier",
            " Immunity",
            " Hurt",
            "ulet",
            "ueless",
            " sluggish",
            "rosis",
        ],
        explanation=" words related to physical medical conditions",
    ),
    # Example 3: dates, weekdays, and months.
    Example(
        tokens=[
            " January",
            "terday",
            "cember",
            " April",
            " July",
            "September",
            "December",
            "Thursday",
            "quished",
            "November",
            "Tuesday",
            "uesday",
            " Sept",
            "ruary",
            " March",
            ";;;;;;;;;;;;",
            " Monday",
            "Wednesday",
            " Saturday",
            " Wednesday",
            "Reloaded",
            "aturday",
            " August",
            "Feb",
            "Sunday",
            "Reviewed",
            "uggest",
            " Dhabi",
            "ACTED",
            "tten",
            "Year",
            "August",
            "alogue",
            "MX",
            " Janeiro",
            "yss",
            " Leilan",
            " Fiscal",
            " referen",
            "semb",
            "eele",
            "wcs",
            "detail",
            "ertation",
            " Reborn",
            " Sunday",
            "itially",
            "aturdays",
            " Dise",
            "essage",
        ],
        explanation=" nouns related to time and dates",
    ),
]
202 |
# A minimal example set used only by unit tests; keeps test prompts short and deterministic.
TEST_EXAMPLES = [
    Example(
        tokens=[
            "these",
            " are",
            " tokens",
        ],
        explanation=" this is a test explanation",
    ),
]
213 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Automated interpretability
2 |
3 | ## Code and tools
4 |
5 | This repository contains code and tools associated with the [Language models can explain neurons in
6 | language models](https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html) paper, specifically:
7 |
8 | * Code for automatically generating, simulating, and scoring explanations of neuron behavior using
9 | the methodology described in the paper. See the
10 | [neuron-explainer README](neuron-explainer/README.md) for more information.
11 |
Note: if you run into errors of the form "Error: Could not find any credentials that grant access to storage account: 'openaipublic' and container: 'neuron-explainer'", you might be able to fix this by signing up for an Azure account and specifying the credentials as described in the error message.
13 |
14 | * A tool for viewing neuron activations and explanations, accessible
15 | [here](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html). See
16 | the [neuron-viewer README](neuron-viewer/README.md) for more information.
17 |
18 | ## Public datasets
19 |
20 | Together with this code, we're also releasing public datasets of GPT-2 XL neurons and explanations.
21 | Here's an overview of those datasets.
22 |
23 | * Neuron activations: `az://openaipublic/neuron-explainer/data/collated-activations/{layer_index}/{neuron_index}.json`
    - Tokenized text sequences and their activations for the neuron. We
      provide multiple sets of tokens and activations: top-activating sequences, random
      samples from several activation quantiles, and a completely random sample. We also provide
      some basic statistics for the activations.
28 | - Each file contains a JSON-formatted
29 | [`NeuronRecord`](neuron-explainer/neuron_explainer/activations/activations.py#L89) dataclass.
30 | * Neuron explanations: `az://openaipublic/neuron-explainer/data/explanations/{layer_index}/{neuron_index}.jsonl`
31 | - Scored model-generated explanations of the behavior of the neuron, including simulation results.
32 | - Each file contains a JSON-formatted
33 | [`NeuronSimulationResults`](neuron-explainer/neuron_explainer/explanations/explanations.py#L146)
34 | dataclass.
35 | * Related neurons: `az://openaipublic/neuron-explainer/data/related-neurons/weight-based/{layer_index}/{neuron_index}.json`
36 | - Lists of the upstream and downstream neurons with the most positive and negative connections (see below for definition).
37 | - Each file contains a JSON-formatted dataclass whose definition is not included in this repo.
38 | * Tokens with high average activations:
39 | `az://openaipublic/neuron-explainer/data/related-tokens/activation-based/{layer_index}/{neuron_index}.json`
40 | - Lists of tokens with the highest average activations for individual neurons, and their average activations.
41 | - Each file contains a JSON-formatted [`TokenLookupTableSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L36)
42 | dataclass.
43 | * Tokens with large inbound and outbound weights:
44 | `az://openaipublic/neuron-explainer/data/related-tokens/weight-based/{layer_index}/{neuron_index}.json`
45 | - List of the most-positive and most-negative input and output tokens for individual neurons,
46 | as well as the associated weight (see below for definition).
47 | - Each file contains a JSON-formatted [`WeightBasedSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L17)
48 | dataclass.
49 |
50 | Update (July 5, 2023):
51 | We also released a set of explanations for GPT-2 Small. The methodology is slightly different from the methodology used for GPT-2 XL so the results aren't directly comparable.
52 | * Neuron activations: `az://openaipublic/neuron-explainer/gpt2_small_data/collated-activations/{layer_index}/{neuron_index}.json`
53 | * Neuron explanations: `az://openaipublic/neuron-explainer/gpt2_small_data/explanations/{layer_index}/{neuron_index}.jsonl`
54 |
55 | Update (August 30, 2023): We recently discovered a bug in how we performed inference on the GPT-2 series models used for the paper and for these datasets. Specifically, we used an optimized GELU implementation rather than the original GELU implementation associated with GPT-2. While the model’s behavior is very similar across these two configurations, the post-MLP activation values we used to generate and simulate explanations differ from the correct values by the following amounts for GPT-2 small:
56 |
57 | - Median: 0.0090
58 | - 90th percentile: 0.0252
59 | - 99th percentile: 0.0839
60 | - 99.9th percentile: 0.1736
61 |
62 | ### Definition of connection weights
63 |
64 | Refer to [GPT-2 model code](https://github.com/openai/gpt-2/blob/master/src/model.py) for
65 | understanding of model weight conventions.
66 |
67 | *Neuron-neuron*: For two neurons `(l1, n1)` and `(l2, n2)` with `l1 < l2`, the connection strength is defined as
68 | `h{l1}.mlp.c_proj.w[:, n1, :] @ diag(h{l2}.ln_2.g) @ h{l2}.mlp.c_fc.w[:, :, n2]`.
69 |
70 | *Neuron-token*: For token `t` and neuron `(l, n)`, the input weight is computed as
71 | `wte[t, :] @ diag(h{l}.ln_2.g) @ h{l}.mlp.c_fc.w[:, :, n]`
72 | and the output weight is computed as
73 | `h{l}.mlp.c_proj.w[:, n, :] @ diag(ln_f.g) @ wte[t, :]`.
74 |
75 | ### Misc Lists of Interesting Neurons
76 | Lists of neurons we thought were interesting according to different criteria, with some preliminary descriptions.
77 | * [Interesting Neurons (external)](https://docs.google.com/spreadsheets/d/1p7fYs31NU8sJoeKyUx4Mn2laGx8xXfHg_KcIvYiKPpg/edit#gid=0)
78 | * [Neurons that score high on random, possibly monosemantic? (external)](https://docs.google.com/spreadsheets/d/1TqKFcz-84jyIHLU7VRoTc8BoFBMpbgac-iNBnxVurQ8/edit?usp=sharing)
79 | * [Clusters of neurons well explained by activation explanation but not by tokens](https://docs.google.com/document/d/1lWhKowpKDdwTMALD_K541cdwgGoQx8DFUSuEe1U2AGE/edit?usp=sharing)
80 | * [Neurons sensitive to truncation](https://docs.google.com/document/d/1x89TWBvuHcyC2t01EDbJZJ5LQYHozlcS-VUmr5shf_A/edit?usp=sharing)
81 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/api_client.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import contextlib
3 | import os
4 | import random
5 | import traceback
6 | from asyncio import Semaphore
7 | from functools import wraps
8 | from typing import Any, Callable, Optional
9 |
10 | import httpx
11 | import orjson
12 |
13 |
def is_api_error(err: Exception) -> bool:
    """Decide whether an exception raised during an API call warrants a retry.

    Returns False only for invalid requests (4xx statuses that are not idempotency
    errors); every other failure -- server errors, connection problems, timeouts,
    read errors, and even unexpected exception types -- is treated as retryable.
    """
    if isinstance(err, httpx.HTTPStatusError):
        response = err.response
        error_data = response.json().get("error", {})
        error_message = error_data.get("message")
        if response.status_code in [400, 404, 415]:
            if error_data.get("type") != "idempotency_error":
                # Invalid request
                return False
            print(f"Retrying after idempotency error: {error_message} ({response.url})")
            return True
        print(f"Retrying after API error: {error_message} ({response.url})")
        return True

    if isinstance(err, httpx.ConnectError):
        print(f"Retrying after connection error... ({err.request.url})")
        return True

    if isinstance(err, httpx.TimeoutException):
        print(f"Retrying after a timeout error... ({err.request.url})")
        return True

    if isinstance(err, httpx.ReadError):
        print(f"Retrying after a read error... ({err.request.url})")
        return True

    # Unknown failure mode: log the traceback for debugging, but retry anyway.
    print(f"Retrying after an unexpected error: {repr(err)}")
    traceback.print_tb(err.__traceback__)
    return True
45 |
46 |
def exponential_backoff(
    retry_on: Callable[[Exception], bool] = lambda err: True
) -> Callable[[Callable], Callable]:
    """
    Returns a decorator which retries the wrapped function as long as the specified retry_on
    function returns True for the exception, applying exponential backoff with jitter after
    failures, up to a retry limit.
    """
    init_delay_s = 1.0
    max_delay_s = 10.0
    # Roughly 30 minutes before we give up.
    max_tries = 200
    backoff_multiplier = 2.0
    jitter = 0.2

    def decorate(f: Callable) -> Callable:
        # Only coroutine functions are supported: the backoff sleep must not block the
        # event loop.
        assert asyncio.iscoroutinefunction(f)

        @wraps(f)
        async def f_retry(*args: Any, **kwargs: Any) -> Any:
            # Fixed: this wrapper propagates f's return value, so it is annotated
            # `-> Any` rather than the previous (incorrect) `-> None`.
            delay_s = init_delay_s
            for i in range(max_tries):
                try:
                    return await f(*args, **kwargs)
                except Exception as err:
                    # Give up immediately on non-retryable errors and on the last attempt.
                    if not retry_on(err) or i == max_tries - 1:
                        raise
                    # Jitter spreads out retries from concurrent callers.
                    jittered_delay = random.uniform(delay_s * (1 - jitter), delay_s * (1 + jitter))
                    await asyncio.sleep(jittered_delay)
                    delay_s = min(delay_s * backoff_multiplier, max_delay_s)

        return f_retry

    return decorate
81 |
82 |
# Read the API key once at import time so misconfiguration fails fast.
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    # Fixed: a real exception instead of `assert`, which is silently stripped
    # when Python runs with optimizations (-O), letting a missing key through.
    raise RuntimeError("Please set the OPENAI_API_KEY environment variable")
# Headers shared by every request issued by this module.
API_HTTP_HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + API_KEY,
}
# All requests target the public OpenAI REST API.
BASE_API_URL = "https://api.openai.com/v1"
90 |
91 |
class ApiClient:
    """Performs inference using the OpenAI API. Supports response caching and concurrency limits."""

    def __init__(
        self,
        model_name: str,
        # If set, no more than this number of HTTP requests will be made concurrently.
        max_concurrent: Optional[int] = None,
        # Whether to cache request/response pairs in memory to avoid duplicating requests.
        cache: bool = False,
    ):
        self.model_name = model_name

        # Concurrency is limited by a semaphore shared across all requests from this client.
        if max_concurrent is not None:
            self._concurrency_check: Optional[Semaphore] = Semaphore(max_concurrent)
        else:
            self._concurrency_check = None

        # Fixed annotation: cache keys are the orjson-serialized request kwargs,
        # which are bytes (orjson.dumps returns bytes), not str.
        if cache:
            self._cache: Optional[dict[bytes, Any]] = {}
        else:
            self._cache = None

    @exponential_backoff(retry_on=is_api_error)
    async def make_request(
        self, timeout_seconds: Optional[int] = None, **kwargs: Any
    ) -> dict[str, Any]:
        """Send one request to the API and return the parsed JSON response.

        Chat requests (kwargs containing "messages") go to /chat/completions;
        all others go to /completions. Responses are cached when caching is on.
        """
        if self._cache is not None:
            key = orjson.dumps(kwargs)
            if key in self._cache:
                return self._cache[key]
        async with contextlib.AsyncExitStack() as stack:
            if self._concurrency_check is not None:
                await stack.enter_async_context(self._concurrency_check)
            http_client = await stack.enter_async_context(
                httpx.AsyncClient(timeout=timeout_seconds)
            )
            # If the request has a "messages" key, it should be sent to the /chat/completions
            # endpoint. Otherwise, it should be sent to the /completions endpoint.
            url = BASE_API_URL + ("/chat/completions" if "messages" in kwargs else "/completions")
            kwargs["model"] = self.model_name
            response = await http_client.post(url, headers=API_HTTP_HEADERS, json=kwargs)
            # The response json has useful information but the exception doesn't include it, so print it
            # out then reraise.
            try:
                response.raise_for_status()
            except Exception as e:
                print(response.json())
                raise e
            # Parse the body once instead of re-parsing for the cache write and the return.
            response_json = response.json()
            if self._cache is not None:
                self._cache[key] = response_json
            return response_json
144 |
145 |
if __name__ == "__main__":

    # Minimal smoke test: issue a single completion request and print the raw response.
    async def main() -> None:
        client = ApiClient(model_name="gpt-3.5-turbo", max_concurrent=1)
        # Passing `prompt` (not `messages`) routes this to the /completions endpoint.
        # NOTE(review): "gpt-3.5-turbo" is a chat model; confirm /completions accepts it,
        # or switch this demo to `messages=`.
        print(await client.make_request(prompt="Why did the chicken cross the road?", max_tokens=9))

    asyncio.run(main())
153 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/scoring.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import asyncio
4 | import logging
5 | from typing import Any, Callable, Coroutine, Sequence
6 |
7 | import numpy as np
8 | from neuron_explainer.activations.activations import ActivationRecord
9 | from neuron_explainer.explanations.calibrated_simulator import (
10 | CalibratedNeuronSimulator,
11 | LinearCalibratedNeuronSimulator,
12 | )
13 | from neuron_explainer.explanations.explanations import (
14 | ScoredSequenceSimulation,
15 | ScoredSimulation,
16 | SequenceSimulation,
17 | )
18 | from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator, NeuronSimulator
19 |
20 |
def flatten_list(list_of_lists: Sequence[Sequence[Any]]) -> list[Any]:
    """Concatenate the sub-sequences of ``list_of_lists`` into a single flat list."""
    flattened: list[Any] = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened
23 |
24 |
def correlation_score(
    real_activations: Sequence[float] | np.ndarray,
    predicted_activations: Sequence[float] | np.ndarray,
) -> float:
    """Return the Pearson correlation between real and predicted activations."""
    # np.corrcoef returns the 2x2 correlation matrix of the two inputs; the
    # off-diagonal entry is their correlation. Cast to a plain float so the
    # return value matches the annotation (corrcoef yields np.float64).
    return float(np.corrcoef(real_activations, predicted_activations)[0, 1])
30 |
31 |
def score_from_simulation(
    real_activations: ActivationRecord,
    simulation: SequenceSimulation,
    score_function: Callable[[Sequence[float] | np.ndarray, Sequence[float] | np.ndarray], float],
) -> float:
    """Apply ``score_function`` to the true activations and the simulated expectations."""
    true_values = real_activations.activations
    simulated_values = simulation.expected_activations
    return score_function(true_values, simulated_values)
38 |
39 |
def rsquared_score_from_sequences(
    real_activations: Sequence[float] | np.ndarray,
    predicted_activations: Sequence[float] | np.ndarray,
) -> float:
    """Return one minus the ratio of mean squared error to mean squared true activation."""
    actual = np.array(real_activations)
    predicted = np.array(predicted_activations)
    residual_power = np.mean(np.square(actual - predicted))
    signal_power = np.mean(np.square(actual))
    return float(1 - residual_power / signal_power)
49 |
50 |
def absolute_dev_explained_score_from_sequences(
    real_activations: Sequence[float] | np.ndarray,
    predicted_activations: Sequence[float] | np.ndarray,
) -> float:
    """Return one minus the ratio of mean absolute error to mean absolute true activation."""
    actual = np.array(real_activations)
    predicted = np.array(predicted_activations)
    mean_abs_error = np.mean(np.abs(actual - predicted))
    mean_abs_signal = np.mean(np.abs(actual))
    return float(1 - mean_abs_error / mean_abs_signal)
60 |
61 |
async def make_explanation_simulator(
    explanation: str,
    calibration_activation_records: Sequence[ActivationRecord],
    model_name: str,
    calibrated_simulator_class: type[CalibratedNeuronSimulator] = LinearCalibratedNeuronSimulator,
) -> CalibratedNeuronSimulator:
    """
    Build a simulator that predicts activations from an explanation, then calibrate it
    against the supplied activation records before returning it.
    """
    base_simulator = ExplanationNeuronSimulator(model_name, explanation)
    calibrated = calibrated_simulator_class(base_simulator)
    await calibrated.calibrate(calibration_activation_records)
    return calibrated
76 |
77 |
async def _simulate_and_score_sequence(
    simulator: NeuronSimulator, activations: ActivationRecord
) -> ScoredSequenceSimulation:
    """Score an explanation of a neuron by how well it predicts activations on a sentence."""
    simulation = await simulator.simulate(activations.tokens)
    logging.debug(simulation)
    # Each scoring metric compares the true activations to the simulated expectations.
    return ScoredSequenceSimulation(
        simulation=simulation,
        true_activations=activations.activations,
        ev_correlation_score=score_from_simulation(activations, simulation, correlation_score),
        rsquared_score=score_from_simulation(
            activations, simulation, rsquared_score_from_sequences
        ),
        absolute_dev_explained_score=score_from_simulation(
            activations, simulation, absolute_dev_explained_score_from_sequences
        ),
    )
96 |
97 |
def aggregate_scored_sequence_simulations(
    scored_sequence_simulations: list[ScoredSequenceSimulation],
) -> ScoredSimulation:
    """
    Aggregate a list of scored sequence simulations. The logic for doing this is non-trivial
    for EV scores: the correlation is computed over the concatenation of all activations from
    all sequences at once, which is not the same as averaging per-sequence correlations.
    """
    all_true: list[float] = []
    all_expected: list[float] = []
    for seq_sim in scored_sequence_simulations:
        # true_activations may be falsy/absent on some records -- treated as empty here.
        all_true.extend(seq_sim.true_activations or [])
        all_expected.extend(seq_sim.simulation.expected_activations)

    # Correlation is undefined on empty input, so report None in that case.
    ev_correlation_score = (
        correlation_score(all_true, all_expected) if len(all_true) > 0 else None
    )

    return ScoredSimulation(
        scored_sequence_simulations=scored_sequence_simulations,
        ev_correlation_score=ev_correlation_score,
        rsquared_score=rsquared_score_from_sequences(all_true, all_expected),
        absolute_dev_explained_score=absolute_dev_explained_score_from_sequences(
            all_true, all_expected
        ),
    )
127 |
128 |
async def simulate_and_score(
    simulator: NeuronSimulator,
    activation_records: Sequence[ActivationRecord],
) -> ScoredSimulation:
    """
    Score an explanation of a neuron by how well it predicts activations on the given text
    sequences. All sequences are simulated concurrently.
    """
    per_sequence_coroutines = [
        _simulate_and_score_sequence(simulator, record) for record in activation_records
    ]
    scored_sequence_simulations = await asyncio.gather(*per_sequence_coroutines)
    return aggregate_scored_sequence_simulations(scored_sequence_simulations)
147 |
148 |
async def make_simulator_and_score(
    make_simulator: Coroutine[None, None, NeuronSimulator],
    activation_records: Sequence[ActivationRecord],
) -> ScoredSimulation:
    """Chain together creating the simulator and using it to score activation records."""
    built_simulator = await make_simulator
    scored = await simulate_and_score(built_simulator, activation_records)
    return scored
156 |
--------------------------------------------------------------------------------
/neuron-viewer/src/panes/explanation.jsx:
--------------------------------------------------------------------------------
1 | import React, { useState, useEffect } from "react"
2 | import { get_explanations } from "../interpAPI"
3 | // import HeatmapGrid from "../heatmapGrid"
4 | import SimulationHeatmap from "../simulationHeatmap"
5 | import { normalizeTokenActs } from "../types"
6 |
7 |
8 | function zip_simulated_sequences(sequences) {
9 | return sequences.map(({ simulation }) => {
10 | return simulation.tokens.map((token, idx) => ({
11 | token,
12 | activation: simulation.expected_activations[idx],
13 | }))
14 | })
15 | }
16 |
17 | function zip_real_sequences(sequences) {
18 | return sequences.map(({ simulation, true_activations }) => {
19 | return simulation.tokens.map((token, idx) => ({
20 | token,
21 | activation: true_activations[idx],
22 | }))
23 | })
24 | }
25 |
// Pane that fetches the scored explanations for the active neuron and renders the
// top-scored one, its correlation score, a "suggest better explanation" form link,
// and (when expanded) simulated-vs-real activation heatmaps.
// NOTE(review): the JSX return body below was garbled by an automated export --
// markup fragments are missing. Restore it from version control before editing.
const ExplanationDisplay = ({ activeNeuron }) => {
  const [isLoading, setIsLoading] = useState(true)
  const [data, setData] = useState(null)
  const [showingScoringDetails, setShowingScoringDetails] = useState(false)
  const [toggle, setToggle] = useState(false);

  // Fetch all explanations for the neuron and keep only the first (top-scored) one.
  const loadExplanation = async () => {
    const result = await get_explanations(activeNeuron);
    setData(result.scored_explanations[0])
    setIsLoading(false)
  }

  useEffect(() => {
    if (!data) {
      loadExplanation()
    }
  }, [])

  const handleToggleChange = () => {
    setToggle(!toggle);
  };

  // Simulated activations, normalized for display; empty until data arrives.
  let sim_sequences;
  if (data) {
    sim_sequences = zip_simulated_sequences(data.scored_simulation.scored_sequence_simulations);
    [sim_sequences] = normalizeTokenActs(sim_sequences)
  } else {
    sim_sequences = []
  }

  // True activations, normalized the same way.
  let real_sequences;
  if (data) {
    real_sequences = zip_real_sequences(data.scored_simulation.scored_sequence_simulations);
    [real_sequences] = normalizeTokenActs(real_sequences)
  } else {
    real_sequences = []
  }

  // Pre-filled Google Form link for suggesting a better explanation for this neuron.
  const suggest_explanation_link = "https://docs.google.com/forms/d/e/1FAIpQLSckMyDQedGhdISIqaqn0YGUtd2xqEWgPu7ehoPUTT2pTge_-g/viewform?"
    + `usp=pp_url&entry.541490611=${activeNeuron.layer}`
    + `&entry.1688855196=${activeNeuron.neuron}`
    + `&entry.495312202=https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html%23/layers/${activeNeuron.layer}/neurons/${activeNeuron.neuron}`;

  return (
    <>


Explanation
      {isLoading ? (

      ) : (
        <>


            {data.explanation}


            score: {data.scored_simulation.ev_correlation_score.toFixed(2)}


              Suggest Better Explanation



          {
            showingScoringDetails ?
              <>






              {toggle ? 'Activations overlaid (top = real, bottom = simulated)' : 'Activations not overlaid'}


Top

Random

              > : null
          }
          >
      )}

  >
  )
}
166 |
167 | export default ExplanationDisplay
168 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/test_explainer.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from typing import Any
3 |
4 | from neuron_explainer.explanations.explainer import (
5 | TokenActivationPairExplainer,
6 | TokenSpaceRepresentationExplainer,
7 | )
8 | from neuron_explainer.explanations.few_shot_examples import TEST_EXAMPLES, FewShotExampleSet
9 | from neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role
10 | from neuron_explainer.explanations.token_space_few_shot_examples import (
11 | TokenSpaceFewShotExampleSet,
12 | )
13 |
14 |
def setup_module(unused_module: Any) -> None:
    """Install a fresh event loop before this module's tests run."""
    # Make sure we have an event loop, since the attempt to create the Semaphore in
    # ResearchApiClient will fail without it.
    fresh_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(fresh_loop)
20 |
21 |
def test_if_formatting() -> None:
    """Golden test: the INSTRUCTION_FOLLOWING prompt built from the TEST example set
    must match this string exactly; any prompt-construction change must update it."""
    # NOTE(review): whitespace inside this literal (e.g. between token and activation)
    # appears to have been mangled by an export -- verify against version control.
    expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.

The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.

Neuron 1
Activations:

a 10
b 0
c 0


d 0
e 10
f 0


Explanation of neuron 1 behavior: the main thing this neuron does is find vowels.

Neuron 2
Activations:

a 10
b 0
c 0


d 0
e 10
f 0


Explanation of neuron 2 behavior:<|endofprompt|> the main thing this neuron does is find"""

    explainer = TokenActivationPairExplainer(
        model_name="text-davinci-003",
        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
        few_shot_example_set=FewShotExampleSet.TEST,
    )
    prompt = explainer.make_explanation_prompt(
        all_activation_records=TEST_EXAMPLES[0].activation_records,
        max_activation=1.0,
        max_tokens_for_completion=20,
    )

    assert prompt == expected_prompt
69 |
70 |
def test_harmony_format() -> None:
    """Golden test: the HARMONY_V4 (chat-message) prompt built from the TEST example
    set must match this message list exactly, role by role and content by content."""
    expected_prompt = [
        HarmonyMessage(
            role=Role.SYSTEM,
            content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.

The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.""",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""

Neuron 1
Activations:

a 10
b 0
c 0


d 0
e 10
f 0


Explanation of neuron 1 behavior: the main thing this neuron does is find""",
        ),
        HarmonyMessage(
            role=Role.ASSISTANT,
            content=" vowels.",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""

Neuron 2
Activations:

a 10
b 0
c 0


d 0
e 10
f 0


Explanation of neuron 2 behavior: the main thing this neuron does is find""",
        ),
    ]

    explainer = TokenActivationPairExplainer(
        model_name="gpt-4",
        prompt_format=PromptFormat.HARMONY_V4,
        few_shot_example_set=FewShotExampleSet.TEST,
    )
    prompt = explainer.make_explanation_prompt(
        all_activation_records=TEST_EXAMPLES[0].activation_records,
        max_activation=1.0,
        max_tokens_for_completion=20,
    )

    assert isinstance(prompt, list)
    assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
    # Compare message-by-message first so a failure pinpoints the differing message.
    for actual_message, expected_message in zip(prompt, expected_prompt):
        assert actual_message["role"] == expected_message["role"]
        assert actual_message["content"] == expected_message["content"]
    assert prompt == expected_prompt
140 |
141 |
def test_token_space_explainer_if_formatting() -> None:
    """Golden test: the INSTRUCTION_FOLLOWING prompt for the token-space explainer,
    built from TokenSpaceFewShotExampleSet.TEST, must match this string exactly."""
    expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.



Tokens:
'these', ' are', ' tokens'

Explanation:
This neuron is looking for this is a test explanation.



Tokens:
'foo', 'bar', 'baz'

Explanation:
<|endofprompt|>This neuron is looking for"""

    explainer = TokenSpaceRepresentationExplainer(
        model_name="text-davinci-002",
        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
        use_few_shot=True,
        few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,
    )
    prompt = explainer.make_explanation_prompt(
        tokens=["foo", "bar", "baz"],
        max_tokens_for_completion=20,
    )

    assert prompt == expected_prompt
173 |
174 |
def test_token_space_explainer_harmony_formatting() -> None:
    """Golden test: the HARMONY_V4 (chat-message) prompt for the token-space explainer,
    built from TokenSpaceFewShotExampleSet.TEST, must match this message list exactly."""
    expected_prompt = [
        HarmonyMessage(
            role=Role.SYSTEM,
            content="We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""



Tokens:
'these', ' are', ' tokens'

Explanation:
This neuron is looking for""",
        ),
        HarmonyMessage(
            role=Role.ASSISTANT,
            content=" this is a test explanation.",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""



Tokens:
'foo', 'bar', 'baz'

Explanation:
This neuron is looking for""",
        ),
    ]

    explainer = TokenSpaceRepresentationExplainer(
        model_name="gpt-4",
        prompt_format=PromptFormat.HARMONY_V4,
        use_few_shot=True,
        few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,
    )
    prompt = explainer.make_explanation_prompt(
        tokens=["foo", "bar", "baz"],
        max_tokens_for_completion=20,
    )

    assert isinstance(prompt, list)
    assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
    # Compare message-by-message first so a failure pinpoints the differing message.
    for actual_message, expected_message in zip(prompt, expected_prompt):
        assert actual_message["role"] == expected_message["role"]
        assert actual_message["content"] == expected_message["content"]
    assert prompt == expected_prompt
228 |
--------------------------------------------------------------------------------
/neuron-viewer/src/App.css:
--------------------------------------------------------------------------------
1 | @tailwind base;
2 | @tailwind components;
3 | @tailwind utilities;
4 |
5 |
6 | :root {
7 | --secondary-color: #0d978b;
8 | --accent-color: #ff4d00;
9 | }
10 |
11 | .full-width{
12 | width: 100vw;
13 | position: relative;
14 | margin-left: -50vw;
15 | left: 50%;
16 | }
17 |
18 | .App {
19 | text-align: center;
20 | }
21 |
22 | .App-logo {
23 | height: 40vmin;
24 | pointer-events: none;
25 | }
26 |
27 | @media (prefers-reduced-motion: no-preference) {
28 | .App-logo {
29 | animation: App-logo-spin infinite 20s linear;
30 | }
31 | }
32 |
33 | .App h1 {
34 | font-size: 1.75rem;
35 | }
36 |
37 | .App-article {
38 | background-color: #282c34;
39 | min-height: 100vh;
40 | display: flex;
41 | flex-direction: column;
42 | align-items: center;
43 | justify-content: center;
44 | font-size: calc(10px + 2vmin);
45 | color: white;
46 | }
47 |
48 | .App-link {
49 | color: #61dafb;
50 | }
51 |
52 | @keyframes App-logo-spin {
53 | from {
54 | transform: rotate(0deg);
55 | }
56 | to {
57 | transform: rotate(360deg);
58 | }
59 | }
60 |
61 |
62 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
63 | /* Structure
64 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
65 |
66 | body {
67 | margin: 0;
68 | padding: 0 1em;
69 | font-size: 12pt;
70 | }
71 |
72 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
73 | /* Typography
74 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
75 |
76 | h1 {
77 | font-size: 24pt;
78 | font-weight: 500;
79 | padding: 1em 0 0;
80 | display: block;
81 | color: #000;
82 | }
83 | h3 { padding: 0 0; }
84 | h2 { padding: 1em 0 0.5em 0; }
85 | h4, h5 {
86 | text-transform: uppercase;
87 | margin: 1em 0;
88 | justify-tracks: space-between;
89 | font-family: var(--sans-serif);
90 | font-size: 12pt;
91 | font-weight: 600;
92 | }
93 | h2, h3 { font-weight: 500; font-style: italic; }
94 | subtitle {
95 | color: #555;
96 | font-size: 18pt;
97 | font-style: italic;
98 | padding: 0;
99 | display: block;
100 | margin-bottom: 1em
101 | }
102 |
103 | a {
104 | transition: all .05s ease-in-out;
105 | color: #5c60c3 !important;
106 | font-style: normal;
107 | }
108 | a:hover { color: var(--accent-color)!important; }
109 | code, pre { color: var(--inline-code-color);
110 | background-color: #eee; border-radius: 3px; }
111 | pre { padding: 1em; margin: 2em 0; }
112 | code { padding: 0.3em; }
113 | .text-secondary, h3, h5 { color: var(--secondary-color); }
114 | .text-primary, h2,h4 { color: var(--primary-color); }
115 |
116 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
117 | /* Images
118 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
119 |
120 | img#logo {
121 | width: 50%;
122 | margin: 3em 0 0
123 | }
124 |
125 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
126 | /* Alerts */
127 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
128 |
129 | .alert {
130 | font-weight: 600;
131 | font-style: italic;
132 | display: block;
133 | background-color: #fff7f7;
134 | padding: 1em;
135 | margin: 0;
136 | border-radius: 5px;
137 | color: #f25555
138 | }
139 | .alert.cool {
140 | background-color: #f3f0fc;
141 | color: #7155cf;
142 | }
143 | .flash-alert {
144 | display: inline-block;
145 | transition: ease-in-out 1s;
146 | font-size: 14pt;
147 | margin: 1em 0;
148 | padding-top: 0.5em;
149 | }
150 | .flash-alert.success {
151 | color: #000;
152 | }
153 | .flash-alert.failure {
154 | color: red;
155 | }
156 | .flash-alert.hidden {
157 | display: none;
158 | }
159 |
160 |
161 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
162 | /* Sidenotes & Superscripts */
163 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
164 |
165 | body { counter-reset: count; }
166 | p { whitespace: nowrap; }
167 | sup {
168 | font-weight: 300;
169 | padding-right: .2em;
170 | counter-increment: count;
171 | }
172 | sidenote::before,
173 | sup::before {
174 | content: counter(count, lower-roman);
175 | display: inline-block;
176 | font-size: 10pt;
177 | font-weight: bold;
178 | color: var(--accent-color);
179 | }
180 | sidenote::before {
181 | margin-right: .5em;
182 | font-weight: 700
183 | }
184 |
185 | /* Different behavior if the screen is too
186 | narrow to show a sidenote on the side. */
187 |
188 | @media (min-width:860px) {
189 | sidenote {
190 | clear: right;
191 | font-size: 10pt;
192 | position: fixed;
193 | float: right;
194 | white-space: normal;
195 | right: 20px;
196 | width: 200px;
197 | display: block;
198 | max-width: 30%
199 | }
200 | }
201 |
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Print & Narrow-screen Overrides */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
205 |
206 | @media print {
207 | a.btn, button {
208 | display: none!important
209 | }
210 | }
211 |
212 | @media (max-width:860px) {
213 | sidenote {
214 | display: block;
215 | font-size: 11pt;
216 | margin: 2em 3em 2em 2em
217 | }
218 | }
219 |
220 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
221 | /* Buttons */
222 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
223 |
/* Button styling is screen-only; the @media print block above hides buttons entirely. */
@media screen {
  button:hover { box-shadow: 0.5em 0.5em var(--accent-color); }
  a.btn, button {
    border-radius: 3px;
    color: #000 !important;
    text-decoration: none !important;
    font-size: 11pt;
    border: 1px solid #000;
    padding: 0.5em 1em;
    font-family: -apple-system,
      BlinkMacSystemFont,
      "avenir next",
      avenir,
      helvetica,
      "helvetica neue",
      ubuntu,
      roboto,
      noto,
      "segoe ui",
      arial,
      sans-serif !important;
    background: #fff;
    margin: 1.5em 0;
    font-weight: 500;
    /* FIX: "box-shadow-color" is not a CSS property, so the second transition entry
       was invalid; transition box-shadow itself at the faster timing instead. */
    transition: all .05s ease-in-out, box-shadow .025s ease-in-out;
    box-shadow: 0.5em 0.5em #eee;
    display: inline-block;
  }

  a.btn:hover, button:hover {
    /* FIX: missing semicolon after "cursor: pointer" made the parser swallow the
       following box-shadow declaration, invalidating both. */
    cursor: pointer;
    box-shadow: 0.5em 0.5em var(--accent-color);
  }
  a.btn:active, button.active, button:active {
    border: 1px solid;
    margin: 2em 0 1em 1em;
    box-shadow: 0 0 #000 !important
  }
  a.btn.small, button.small {
    box-shadow: .5em .5em 0 #eee;
    border: 1px solid #000;
    padding: .6em 1em;
    font-weight: 500
  }
  a.btn.small:hover, button.small:hover {
    box-shadow: 0.5em 0.5em var(--accent-color);
  }
  a.btn.small:active, button.small:active {
    margin: 2em 0 1em 1em;
    box-shadow: 0 0 #000
  }
}
276 |
277 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
278 | /* Blockquotes & Epigraphs
279 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
280 |
281 | blockquote {
282 | margin: 1em;
283 | }
284 | div>blockquote>p {
285 | font-size: 13pt;
286 | color: #555;
287 | font-style: normal!important;
288 | margin: 0;
289 | padding: 1em 0 1.5em
290 | }
291 | blockquote > blockquote {
292 | padding: 0.5em 2em 1em 1.5em !important;
293 | }
294 |
295 | blockquote > blockquote,
296 | blockquote > blockquote > p {
297 | font-size: 14pt;
298 | padding: 0;
299 | margin: 0;
300 | text-align: center;
301 | font-style: italic;
302 | color: var(--epigraph-color);
303 | }
304 | blockquote footer {
305 | font-size: 12pt;
306 | text-align: inherit;
307 | display: block;
308 | font-style: normal;
309 | margin: 1em;
310 | color: #aaa;
311 | }
312 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:
--------------------------------------------------------------------------------
1 | """
2 | Code for calibrating simulations of neuron behavior. Calibration refers to a process of mapping from
3 | a space of predicted activation values (e.g. [0, 10]) to the real activation distribution for a
4 | neuron.
5 |
6 | See http://go/neuron_explanation_methodology for description of calibration step. Necessary for
7 | simulating neurons in the context of ablate-to-simulation, but can be skipped when using correlation
8 | scoring. (Calibration may still improve quality for scoring, at least for non-linear calibration
9 | methods.)
10 | """
11 |
12 | from __future__ import annotations
13 |
14 | import asyncio
15 | from abc import abstractmethod
16 | from typing import Optional, Sequence
17 |
18 | import numpy as np
19 | from neuron_explainer.activations.activations import ActivationRecord
20 | from neuron_explainer.explanations.explanations import ActivationScale
21 | from neuron_explainer.explanations.simulator import NeuronSimulator, SequenceSimulation
22 | from sklearn import linear_model
23 |
24 |
class CalibratedNeuronSimulator(NeuronSimulator):
    """
    Wrap a NeuronSimulator and calibrate it to map from the predicted activation space to the
    actual neuron activation space.

    Subclasses supply the two abstract pieces: `_calibrate_from_flattened_activations` (fit the
    mapping from a calibration set) and `apply_calibration` (apply it to simulated values).
    """

    def __init__(self, uncalibrated_simulator: NeuronSimulator):
        # The wrapped simulator whose raw outputs will be calibrated.
        self.uncalibrated_simulator = uncalibrated_simulator

    @classmethod
    async def create(
        cls,
        uncalibrated_simulator: NeuronSimulator,
        calibration_activation_records: Sequence[ActivationRecord],
    ) -> CalibratedNeuronSimulator:
        """
        Create and calibrate a calibrated simulator (so initialization and calibration can be done
        in one call).
        """
        calibrated_simulator = cls(uncalibrated_simulator)
        await calibrated_simulator.calibrate(calibration_activation_records)
        return calibrated_simulator

    async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
        """
        Determine parameters to map from the predicted activation space to the real neuron
        activation space, based on a calibration set.

        Use when simulated sequences haven't already been produced on the calibration set.
        """
        # Simulate every calibration sequence concurrently, then fit from the results.
        simulations = await asyncio.gather(
            *[
                self.uncalibrated_simulator.simulate(activations.tokens)
                for activations in calibration_activation_records
            ]
        )
        self.calibrate_from_simulations(calibration_activation_records, simulations)

    def calibrate_from_simulations(
        self,
        calibration_activation_records: Sequence[ActivationRecord],
        simulations: Sequence[SequenceSimulation],
    ) -> None:
        """
        Determine parameters to map from the predicted activation space to the real neuron
        activation space, based on a calibration set.

        Use when simulated sequences have already been produced on the calibration set.
        """
        # Concatenate per-token (true, simulated) activation pairs across all sequences so the
        # subclass fits a single mapping over the whole calibration set.
        flattened_activations: list[float] = []
        flattened_simulated_activations: list[float] = []
        for activations, simulation in zip(calibration_activation_records, simulations):
            flattened_activations.extend(activations.activations)
            flattened_simulated_activations.extend(simulation.expected_activations)
        self._calibrate_from_flattened_activations(
            np.array(flattened_activations), np.array(flattened_simulated_activations)
        )

    @abstractmethod
    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        """
        Determine parameters to map from the predicted activation space to the real neuron
        activation space, based on a calibration set.

        Take numpy arrays of all true activations and all uncalibrated activations on the
        calibration set over all sequences.
        """

    @abstractmethod
    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        """Apply the learned calibration to a sequence of values."""

    async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:
        """Simulate with the wrapped simulator, then map all simulated values through the
        learned calibration."""
        uncalibrated_seq_simulation = await self.uncalibrated_simulator.simulate(tokens)
        calibrated_activations = self.apply_calibration(
            uncalibrated_seq_simulation.expected_activations
        )
        # Calibrate each per-token discrete distribution's values too; the probabilities
        # attached to those values are unchanged by calibration.
        calibrated_distribution_values = [
            self.apply_calibration(dv) for dv in uncalibrated_seq_simulation.distribution_values
        ]
        return SequenceSimulation(
            tokens=uncalibrated_seq_simulation.tokens,
            expected_activations=calibrated_activations,
            # Calibrated values live in the neuron's real activation units.
            activation_scale=ActivationScale.NEURON_ACTIVATIONS,
            distribution_values=calibrated_distribution_values,
            distribution_probabilities=uncalibrated_seq_simulation.distribution_probabilities,
            # Keep the raw simulation around for debugging/comparison.
            uncalibrated_simulation=uncalibrated_seq_simulation,
        )
117 |
118 |
class UncalibratedNeuronSimulator(CalibratedNeuronSimulator):
    """Pass through the activations without trying to calibrate."""

    # NOTE: the redundant __init__ that only forwarded to super() with an identical
    # signature has been removed; the inherited constructor is equivalent.

    async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
        """No-op override: skips simulating the calibration set entirely, since there is
        nothing to fit."""

    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        """No-op: no calibration parameters are learned."""

    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        """Identity mapping; coerce to list only so the return type matches the contract."""
        return values if isinstance(values, list) else list(values)
137 |
138 |
class LinearCalibratedNeuronSimulator(CalibratedNeuronSimulator):
    """Find a linear mapping from uncalibrated activations to true activations.

    Should not change ev_correlation_score because it is invariant to linear transformations.
    """

    def __init__(self, uncalibrated_simulator: NeuronSimulator):
        super().__init__(uncalibrated_simulator)
        # Fitted by _calibrate_from_flattened_activations; None until calibration runs.
        self._regression: Optional[linear_model.LinearRegression] = None

    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        # Fit true = a * uncalibrated + b; sklearn expects a 2-D feature matrix,
        # hence the reshape to a single-feature column.
        model = linear_model.LinearRegression()
        features = uncalibrated_activations.reshape(-1, 1)
        model.fit(features, true_activations)
        self._regression = model

    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        if self._regression is None:
            raise ValueError("Must call calibrate() before apply_calibration")
        if len(values) == 0:
            return []
        column = np.asarray(values).reshape(-1, 1)
        return self._regression.predict(column).tolist()
163 |
164 |
class PercentileMatchingCalibratedNeuronSimulator(CalibratedNeuronSimulator):
    """
    Quantile-matching calibration: the nth percentile of the uncalibrated activations is mapped
    onto the nth percentile of the true activations, for every n.

    This will match the distribution of true activations on the calibration set, but will be
    overconfident outside of the calibration set.
    """

    def __init__(self, uncalibrated_simulator: NeuronSimulator):
        super().__init__(uncalibrated_simulator)
        # Sorted calibration samples; populated by _calibrate_from_flattened_activations.
        self._uncalibrated_activations: Optional[np.ndarray] = None
        self._true_activations: Optional[np.ndarray] = None

    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        # Sorting both arrays pairs up equal quantiles by index.
        self._uncalibrated_activations = np.sort(uncalibrated_activations)
        self._true_activations = np.sort(true_activations)

    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        if self._true_activations is None or self._uncalibrated_activations is None:
            raise ValueError("Must call calibrate() before apply_calibration")
        if len(values) == 0:
            return []
        # Piecewise-linear interpolation through the matched quantile pairs.
        calibrated = np.interp(
            np.array(values), self._uncalibrated_activations, self._true_activations
        )
        return calibrated.tolist()
195 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/test_simulator.py:
--------------------------------------------------------------------------------
1 | from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
2 | from neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role
3 | from neuron_explainer.explanations.simulator import (
4 | ExplanationNeuronSimulator,
5 | ExplanationTokenByTokenSimulator,
6 | )
7 |
8 |
def test_make_explanation_simulation_prompt_if_format() -> None:
    """Golden test: the instruction-following (single-string) simulation prompt built by
    ExplanationNeuronSimulator must match this expected text exactly, byte for byte."""
    expected_prompt = """We're studying neurons in a neural network.
Each neuron looks for some particular thing in a short document.
Look at summary of what the neuron does, and try to predict how it will fire on each token.

The activation format is tokenactivation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0.


Neuron 1
Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
Activations:

a 10
b 0
c 0


d unknown
e 10
f 0




Neuron 2
Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION<|endofprompt|>
Activations:

0 unknown
1 unknown
2 unknown

"""
    # Build the prompt from the tiny TEST few-shot set with tokens ["0", "1", "2"].
    prompt = ExplanationNeuronSimulator(
        model_name="text-davinci-003",
        explanation="EXPLANATION",
        few_shot_example_set=FewShotExampleSet.TEST,
        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
    ).make_simulation_prompt(
        tokens=[str(x) for x in range(3)],
    )
    assert prompt == expected_prompt
51 |
52 |
def test_make_explanation_simulation_prompt_harmony_format() -> None:
    """Golden test: the Harmony (chat-message list) simulation prompt built by
    ExplanationNeuronSimulator must match this expected message list exactly."""
    expected_prompt = [
        HarmonyMessage(
            role=Role.SYSTEM,
            content="""We're studying neurons in a neural network.
Each neuron looks for some particular thing in a short document.
Look at summary of what the neuron does, and try to predict how it will fire on each token.

The activation format is tokenactivation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0.
""",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""

Neuron 1
Explanation of neuron 1 behavior: the main thing this neuron does is find vowels""",
        ),
        HarmonyMessage(
            role=Role.ASSISTANT,
            content="""
Activations:

a 10
b 0
c 0


d unknown
e 10
f 0


""",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""

Neuron 2
Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION""",
        ),
        HarmonyMessage(
            role=Role.ASSISTANT,
            content="""
Activations:

0 unknown
1 unknown
2 unknown

""",
        ),
    ]
    prompt = ExplanationNeuronSimulator(
        model_name="gpt-4",
        explanation="EXPLANATION",
        few_shot_example_set=FewShotExampleSet.TEST,
        prompt_format=PromptFormat.HARMONY_V4,
    ).make_simulation_prompt(
        tokens=[str(x) for x in range(3)],
    )

    assert isinstance(prompt, list)
    assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
    # Compare message-by-message first so a failure pinpoints the offending role/content.
    for actual_message, expected_message in zip(prompt, expected_prompt):
        assert actual_message["role"] == expected_message["role"]
        assert actual_message["content"] == expected_message["content"]
    assert prompt == expected_prompt
122 |
123 |
def test_make_token_by_token_simulation_prompt_if_format() -> None:
    """Golden test: the instruction-following single-token simulation prompt built by
    ExplanationTokenByTokenSimulator must match this expected text exactly."""
    expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.

The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.

Neuron 1
Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
Activations:

a 10
b 0
c 0


d 0
e 10
f 0



Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.
Neuron 2
Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else
Text:
ghi

Last token in the text:
i

Last token activation, considering the token in the context in which it appeared in the text:
10


Neuron 3
Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else
Text:
01

Last token in the text:
1

Last token activation, considering the token in the context in which it appeared in the text:
<|endofprompt|>"""
    # Score token index 1 ("1") of tokens ["0", "1", "2"] using the TEST few-shot set.
    prompt = ExplanationTokenByTokenSimulator(
        model_name="text-davinci-003",
        explanation="EXPLANATION",
        few_shot_example_set=FewShotExampleSet.TEST,
        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
    ).make_single_token_simulation_prompt(
        tokens=[str(x) for x in range(3)],
        explanation="numbers and nothing else",
        token_index_to_score=1,
    )
    assert prompt == expected_prompt
178 |
179 |
def test_make_token_by_token_simulation_prompt_harmony_format() -> None:
    """Golden test: the Harmony (chat-message list) single-token simulation prompt built by
    ExplanationTokenByTokenSimulator must match this expected message list exactly."""
    expected_prompt = [
        HarmonyMessage(
            role=Role.SYSTEM,
            content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token.

The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0.

""",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""Neuron 1
Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
""",
        ),
        HarmonyMessage(
            role=Role.ASSISTANT,
            content="""Activations:

a 10
b 0
c 0


d 0
e 10
f 0



""",
        ),
        HarmonyMessage(
            role=Role.SYSTEM,
            content="Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""
Neuron 2
Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else
Text:
ghi

Last token in the text:
i

Last token activation, considering the token in the context in which it appeared in the text:
""",
        ),
        HarmonyMessage(
            role=Role.ASSISTANT,
            content="""10

""",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""
Neuron 3
Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else
Text:
01

Last token in the text:
1

Last token activation, considering the token in the context in which it appeared in the text:
""",
        ),
    ]

    prompt = ExplanationTokenByTokenSimulator(
        model_name="gpt-4",
        explanation="EXPLANATION",
        few_shot_example_set=FewShotExampleSet.TEST,
        prompt_format=PromptFormat.HARMONY_V4,
    ).make_single_token_simulation_prompt(
        tokens=[str(x) for x in range(3)],
        explanation="numbers and nothing else",
        token_index_to_score=1,
    )

    assert isinstance(prompt, list)
    assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
    # Compare message-by-message first so a failure pinpoints the offending role/content.
    for actual_message, expected_message in zip(prompt, expected_prompt):
        assert actual_message["role"] == expected_message["role"]
        assert actual_message["content"] == expected_message["content"]
    assert prompt == expected_prompt
270 |
--------------------------------------------------------------------------------
/neuron-viewer/src/welcome.tsx:
--------------------------------------------------------------------------------
1 | import { useState, FormEvent } from "react"
2 | import { useNavigate } from "react-router-dom"
3 |
4 | function NeuronForm() {
5 | const [input_layer, setLayer] = useState(0)
6 | const [input_neuron, setNeuron] = useState(0)
7 | const navigate = useNavigate()
8 |
9 | const knownGoodNeurons = [
10 | /**************
11 | /* well explained + interesting
12 | ***************/
13 | {heading: 'Somewhat well explained by GPT-4', layer: 0, neuron: 0, label: ''},
14 | {layer: 5, neuron: 131, label: "citations", description: "citations, especially biblical and legal"},
15 | {layer: 12, neuron: 847, label: "numbers in fractions", description: "numbers in fractions"}, //
16 | {layer: 12, neuron: 5820, label: "short flags", description: "single letter command line flags"}, //
17 | {layer: 14, neuron: 417, label: "doing things right", description: "words and phrases related to performing actions correctly or properly"}, // score 0.42
18 | {layer: 15, neuron: 4538, label: "leading transitions", description: "transition words at the start of documents"},
19 | {layer: 17, neuron: 3218, label: "success", description: "expressions of completion or success"}, // score 0.38
20 | {layer: 18, neuron: 5302, label: "X *by*", description: "the word 'by' in phrases indicating side by side or sequential events."}, // score 0.48
21 | {layer: 19, neuron: 1377, label: "similes", description: "comparisons and analogies, often using the word 'like'"}, // score 0.42
22 | {layer: 21, neuron: 2932, label: "Canada", description: "references to Canadian people, places, and entities"}, // score 0.78
23 | {layer: 25, neuron: 2602, label: "similes", description: "descriptive comparisons, especially similes"}, // score 0.40
24 | {layer: 25, neuron: 4870, label: "certainty", description: "phrases related to certainty and confidence."}, // score 0.37
25 | {layer: 30, neuron: 28, label: "times", description: "specific times (with hours and minutes)"},
26 | // https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/5/neurons/2326
27 | {heading: 'Partially explained by GPT-4', layer: 0, neuron: 0, label: ''},
28 | {layer: 0, neuron: 816, label: "Marvel comics vibes", description: "language and context related to Marvel comics, movies, and characters, as well as other superhero-themed content"}, // score 0.44
29 | {layer: 0, neuron: 742, label: "Second token 'and'", description: "'and', 'in', and punctuation at the second token"},
30 | {layer: 4, neuron: 4342, label: "token counter", description: "counting repeated occurrences of a token"},
31 | {layer: 5, neuron: 2326, label: "rhymes with 'at'", description: "syllables rhyming with 'at', sometimes 'it', 'et', 'ot'"},
32 | {layer: 5, neuron: 4492, label: "leading 'an'", description: "sentences that start with 'an'"}, // score 0.77
33 | {layer: 6, neuron: 3251, label: "not all", description: "not all"},
34 | {layer: 10, neuron: 2851, label: "leading acronyms", description: "acronyms after punctuation or newlines"},
35 | {layer: 12, neuron: 2884, label: "hypothetical had", description: "had in hypothetical contexts"}, //
36 | {layer: 14, neuron: 3539, label: "long sequences", description: "long sequences of stuff"},
37 | {layer: 14, neuron: 3822, label: "X by/after *X*", description: "noun repetitions separated by 'by' or 'after'"},
38 | {layer: 21, neuron: 3982, label: "any *and* all", description: "any/anything *and/&* all/everything"},
39 | {layer: 26, neuron: 20, label: "truth, skin, or sun", description: "truth, skin, or sun"},
40 | // layer=18&neuron=5302
41 | /**************
42 | /* boring
43 | ***************/
44 | /**************
45 | /* poorly explained + interesting
46 | ***************/
47 | {heading: 'Poorly explained by GPT-4', layer: 0, neuron: 0, label: ''},
48 | // Actually activates for negated version “not so much … as” even when not so much is fairly far apart
49 | // another "not all": 13&neuron=1352
50 | // {layer: 0, neuron: 2823, label: "Hillary email leak vibes", description: "contexts related to Hillary Clinton leaked emails"}, // score ??
51 | // {layer: 12, neuron: 3718, label: "comparative phrases and negations", description: "comparative phrases and negations"}, // score 0.12
52 | {layer: 13, neuron: 410, label: "N and N+1", description: "a number following its predecessor"}, // score ??
53 | {layer: 13, neuron: 979, label: "subtle plurals", description: "subtle/nonobvious plurals"}, // score ??
54 | // slash after number 12&neuron=847
55 | // numbers predicting slash: 14&neuron=92
56 | // 0&neuron=2823
57 | {layer: 14, neuron: 1251, label: "subjunctive verbs", description: "verbs in subjunctive mood"}, // score ??
58 | {layer: 16, neuron: 518, label: "pattern breaks", description: "tokens that break an established pattern in an ongoing list"}, // score 0.2 with totally wrong explanation
59 | {layer: 17, neuron: 821, label: "idioms", description: "idioms"},
60 | {layer: 18, neuron: 3481, label: "post-typo", description: "first token following a typo"}, // score ??
61 | {layer: 18, neuron: 3552, label: "repeated text", description: "repeated text"}, // score ??
62 | // another shared last names: https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/20/neurons/3164
63 | {layer: 19, neuron: 1763, label: "shared last names", description: "last names when two different people sharing last name are mentioned"}, // score 0.36
64 | {layer: 20, neuron: 4334, label: "previous break", description: "tokens that previously preceded a linebreak"}, // score ??
65 | {layer: 27, neuron: 116, label: "MTG vibes", description: "Magic the Gathering contexts"}, // score ??
66 | {layer: 35, neuron: 1523, label: "NBA name predictor", description: "NBA person/player name predictor"}, // score ??
67 | // {layer: 36, neuron: 2275, label: "she predictor", description: "prediction of the token 'she'"}, // score ??
68 | // {layer: 36, neuron: 5107, label: "Mormon vibes", description: "Mormon related context"}, // score ??
69 | // ] predictor 40&neuron=4505
70 | {layer: 46, neuron: 2181, label: "C predictor", description: "prediction of the token 'C'"}, // score ??
71 | ]
72 |
  // Form submit handler: navigate to the neuron page for the layer/neuron entered in the form.
  // preventDefault + return false suppress the browser's default form POST/reload.
  const handleSubmit = (e: FormEvent) => {
    e.preventDefault()
    navigate(`/layers/${input_layer}/neurons/${input_neuron}`)
    return false
  }
78 |
  // Navigate to the page for a specific (layer, neuron) pair, used by the curated-neuron list.
  const handleNeuronClick = (layer: number, neuron: number) => {
    navigate(`/layers/${layer}/neurons/${neuron}`)
  }
82 |
83 | const feelingLuckySubmit = () => {
84 | const layer = Math.floor(Math.random() * 48);
85 | const neuron = Math.floor(Math.random() * 6400);
86 | navigate(`/layers/${layer}/neurons/${neuron}`)
87 | return false
88 | }
89 |
90 |
91 | return (
92 |
93 |
Welcome! Pick a neuron:
94 |
126 |
132 |
133 |
Interesting neurons:
134 |
135 |
138 | {knownGoodNeurons.map(({ heading, layer, neuron, label, description }, j) => (
139 | heading ?
140 | {heading}
141 |
:
150 | ))}
151 |
152 |
153 |
154 |
155 | )
156 | }
157 |
158 | export default NeuronForm
159 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/explanations.py:
--------------------------------------------------------------------------------
1 | # Dataclasses and enums for storing neuron explanations, their scores, and related data. Also,
2 | # related helper functions.
3 |
4 | from __future__ import annotations
5 |
6 | import json
7 | from dataclasses import dataclass
8 | from enum import Enum
9 | from typing import List, Optional, Union
10 |
11 | import blobfile as bf
12 | import boostedblob as bbb
13 | from neuron_explainer.activations.activations import NeuronId
14 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
15 |
16 |
class ActivationScale(str, Enum):
    """Which "units" are stored in the expected_activations/distribution_values fields of a
    SequenceSimulation.

    This enum identifies whether the values represent real activations of the neuron or something
    else. Different scales are not necessarily related by a linear transformation.

    Subclassing str means each member compares equal to, and serializes as, its plain string
    value.
    """

    NEURON_ACTIVATIONS = "neuron_activations"
    """Values represent real activations of the neuron."""
    SIMULATED_NORMALIZED_ACTIVATIONS = "simulated_normalized_activations"
    """
    Values represent simulated activations of the neuron, normalized to the range [0, 10]. This
    scale is arbitrary and should not be interpreted as a neuron activation.
    """
31 | """
32 |
33 |
@register_dataclass
@dataclass
class SequenceSimulation(FastDataclass):
    """The result of a simulation of neuron activations on one text sequence.

    Registered as a FastDataclass so it round-trips through the project's fast
    serialization helpers.
    """

    tokens: list[str]
    """The sequence of tokens that was simulated."""
    expected_activations: list[float]
    """Expected value of the possibly-normalized activation for each token in the sequence."""
    activation_scale: ActivationScale
    """What scale is used for values in the expected_activations field."""
    distribution_values: list[list[float]]
    """
    For each token in the sequence, a list of values from the discrete distribution of activations
    produced from simulation. Tokens will be included here if and only if they are in the top K=15
    tokens predicted by the simulator, and excluded otherwise.

    May be transformed to another unit by calibration. When we simulate a neuron, we produce a
    discrete distribution with values in the arbitrary discretized space of the neuron, e.g. 10%
    chance of 0, 70% chance of 1, 20% chance of 2. Which we store as distribution_values =
    [0, 1, 2], distribution_probabilities = [0.1, 0.7, 0.2]. When we transform the distribution to
    the real activation units, we can correspondingly transform the values of this distribution
    to get a distribution in the units of the neuron. e.g. if the mapping from the discretized space
    to the real activation unit of the neuron is f(x) = x/2, then the distribution becomes 10%
    chance of 0, 70% chance of 0.5, 20% chance of 1. Which we store as distribution_values =
    [0, 0.5, 1], distribution_probabilities = [0.1, 0.7, 0.2].
    """
    distribution_probabilities: list[list[float]]
    """
    For each token in the sequence, the probability of the corresponding value in
    distribution_values.
    """

    uncalibrated_simulation: Optional["SequenceSimulation"] = None
    """The result of the simulation before calibration. None if no calibration was applied."""
69 |
70 |
@register_dataclass
@dataclass
class ScoredSequenceSimulation(FastDataclass):
    """
    SequenceSimulation result with a score (for that sequence only) and ground truth activations.

    See ScoredSimulation for the corresponding aggregate scores computed across multiple
    sequences.
    """

    simulation: SequenceSimulation
    """The result of a simulation of neuron activations."""
    true_activations: List[float]
    """Ground truth activations on the sequence (not normalized)"""
    ev_correlation_score: float
    """
    Correlation coefficient between the expected values of the normalized activations from the
    simulation and the unnormalized true activations of the neuron on the text sequence.
    """
    rsquared_score: Optional[float] = None
    """R^2 of the simulated activations."""
    absolute_dev_explained_score: Optional[float] = None
    """
    Score based on absolute difference between real and simulated activations.
    absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real))
    """
94 |
95 |
@register_dataclass
@dataclass
class ScoredSimulation(FastDataclass):
    """Result of scoring a neuron simulation on multiple sequences."""

    scored_sequence_simulations: List[ScoredSequenceSimulation]
    """ScoredSequenceSimulation for each sequence"""
    ev_correlation_score: Optional[float] = None
    """
    Correlation coefficient between the expected values of the normalized activations from the
    simulation and the unnormalized true activations on a dataset created from all score_results.
    (Note that this is not equivalent to averaging across sequences.)
    """
    rsquared_score: Optional[float] = None
    """R^2 of the simulated activations."""
    absolute_dev_explained_score: Optional[float] = None
    """
    Score based on absolute difference between real and simulated activations.
    absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real)).
    """

    def get_preferred_score(self) -> Optional[float]:
        """
        Return ev_correlation_score, the preferred summary score for a simulation.

        This method may return None in cases where the score is undefined, for example if the
        normalized activations were all zero, yielding a correlation coefficient of NaN.
        """
        return self.ev_correlation_score
123 |
124 |
@register_dataclass
@dataclass
class ScoredExplanation(FastDataclass):
    """Simulator parameters and the results of scoring it on multiple sequences"""

    explanation: str
    """The explanation used for simulation."""

    scored_simulation: ScoredSimulation
    """Result of scoring the neuron simulator on multiple sequences."""

    def get_preferred_score(self) -> Optional[float]:
        """
        Return the preferred score, delegating to the underlying ScoredSimulation.

        This method may return None in cases where the score is undefined, for example if the
        normalized activations were all zero, yielding a correlation coefficient of NaN.
        """
        return self.scored_simulation.get_preferred_score()
142 |
143 |
@register_dataclass
@dataclass
class NeuronSimulationResults(FastDataclass):
    """Simulation results and scores for a neuron."""

    # Identifier (layer index + neuron index) of the neuron these results describe.
    neuron_id: NeuronId
    # One entry per candidate explanation that was scored for this neuron.
    scored_explanations: list[ScoredExplanation]
151 |
152 |
def load_neuron_explanations(
    explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> Optional[NeuronSimulationResults]:
    """Load scored explanations for the specified neuron.

    Returns None when the per-neuron .jsonl file does not exist or is empty; otherwise
    deserializes and returns the file's first line.
    """
    path = bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")
    if not bf.exists(path):
        return None
    with bf.BlobFile(path) as handle:
        first_line = next(iter(handle), None)
    return loads(first_line) if first_line is not None else None
164 |
165 |
@bbb.ensure_session
async def load_neuron_explanations_async(
    explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> Optional[NeuronSimulationResults]:
    """Load scored explanations for the specified neuron, asynchronously."""
    filename = bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")
    return await read_explanation_file(filename)
174 |
175 |
@bbb.ensure_session
async def read_file(filename: str) -> Optional[str]:
    """Read the contents of the given file as a string, asynchronously.

    Returns None if the file does not exist. Asserts that the file contains exactly one
    non-empty line and returns that line.
    """
    try:
        raw_contents = await bbb.read.read_single(filename)
    except FileNotFoundError:
        # Name the missing file so the failure is actionable. (The previous message was a
        # static placeholder that gave no indication of which file could not be read.)
        print(f"Could not read {filename}")
        return None
    lines = [line for line in raw_contents.decode("utf-8").split("\n") if len(line) > 0]
    assert len(lines) == 1, filename
    return lines[0]
190 |
191 |
@bbb.ensure_session
async def read_explanation_file(explanation_filename: str) -> Optional[NeuronSimulationResults]:
    """Load scored explanations from the given filename, asynchronously."""
    line = await read_file(explanation_filename)
    if line is None:
        return None
    return loads(line)
197 |
198 |
@bbb.ensure_session
async def read_json_file(filename: str) -> Optional[dict]:
    """Read the contents of the given file as a JSON object, asynchronously."""
    contents = await read_file(filename)
    if contents is None:
        return None
    return json.loads(contents)
204 |
205 |
def get_numerical_subdirs(dataset_path: str) -> list[str]:
    """Return the names of all numbered subdirectories in the specified directory.

    Used to get all layer directories in an explanation directory.
    """
    # Sort numerically (not lexicographically), then convert back to strings.
    numeric_names = [
        int(name)
        for name in bf.listdir(dataset_path)
        if bf.isdir(bf.join(dataset_path, name)) and name.isnumeric()
    ]
    numeric_names.sort()
    return [str(name) for name in numeric_names]
221 |
222 |
def get_sorted_neuron_indices_from_explanations(
    explanations_path: str, layer: Union[str, int]
) -> list[int]:
    """Return the indices of all neurons in this layer, in ascending order."""
    layer_dir = bf.join(explanations_path, str(layer))
    indices = []
    for filename in bf.listdir(layer_dir):
        stem = filename.split(".")[0]
        if stem.isnumeric():
            indices.append(int(stem))
    return sorted(indices)
231 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/activations.py:
--------------------------------------------------------------------------------
1 | # Dataclasses and enums for storing neuron-indexed information about activations. Also, related
2 | # helper functions.
3 |
4 | import math
5 | from dataclasses import dataclass, field
6 | from typing import List, Optional, Union
7 |
8 | import urllib.request
9 | import blobfile as bf
10 | import boostedblob as bbb
11 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
12 | from neuron_explainer.azure import standardize_azure_url
13 |
14 |
@register_dataclass
@dataclass
class ActivationRecord(FastDataclass):
    """
    Collated lists of tokens and their activations for a single neuron.

    tokens and activations are parallel lists: activations[i] is the neuron's activation
    on tokens[i].
    """

    tokens: List[str]
    """Tokens in the text sequence, represented as strings."""
    activations: List[float]
    """Raw activation values for the neuron on each token in the text sequence."""
24 |
25 |
@register_dataclass
@dataclass
class NeuronId(FastDataclass):
    """Identifier for a neuron in an artificial neural network."""

    layer_index: int
    """The index of the layer the neuron is in. The first layer used during inference has index 0."""
    neuron_index: int
    """The neuron's index within its layer. Indices start from 0 in each layer."""
35 |
36 |
def _check_slices(
    slices_by_split: dict[str, slice],
    expected_num_values: int,
) -> None:
    """Assert that the slices are disjoint and fully cover the intended range."""
    n_splits = len(slices_by_split)
    full_range = range(expected_num_values)
    subranges = [full_range[s] for s in slices_by_split.values()]
    # First check: the slice lengths must add up to exactly the expected total.
    sum_of_slice_lengths = sum(len(subrange) for subrange in subranges)
    assert (
        sum_of_slice_lengths == expected_num_values
    ), f"{sum_of_slice_lengths=} != {expected_num_values=}"
    # Second check: together the slices must hit exactly the interleaved index pattern
    # (every n_splits-th index starting from each offset in [0, n_splits)).
    indices = set()
    for subrange in subranges:
        indices.update(subrange)
    expected_indices = {
        i
        for start_index in range(n_splits)
        for i in range(start_index, expected_num_values, n_splits)
    }
    assert indices == expected_indices, f"{indices=} != {expected_indices=}"
57 |
58 |
def get_slices_for_splits(
    splits: list[str],
    num_activation_records_per_split: int,
) -> dict[str, slice]:
    """
    Get equal-sized interleaved subsets for each of a list of splits, given the number of elements
    to include in each split.
    """
    # Split i takes elements i, i + stride, i + 2 * stride, ... so the splits interleave.
    stride = len(splits)
    total_num_records = num_activation_records_per_split * stride
    slices_by_split = {}
    for split_index, split in enumerate(splits):
        slices_by_split[split] = slice(split_index, total_num_records, stride)
    _check_slices(
        slices_by_split=slices_by_split,
        expected_num_values=total_num_records,
    )
    return slices_by_split
79 |
80 |
@dataclass
class ActivationRecordSliceParams:
    """How to select splits (train, valid, etc.) of activation records."""

    # When None, the number of examples per split is derived from the available records
    # (see NeuronRecord._get_top_activation_slices / _get_random_activation_slices).
    n_examples_per_split: Optional[int]
    """The number of examples to include in each split."""
87 |
88 |
@register_dataclass
@dataclass
class NeuronRecord(FastDataclass):
    """Neuron-indexed activation data, including summary stats and notable activation records."""

    neuron_id: NeuronId
    """Identifier for the neuron."""

    random_sample: list[ActivationRecord] = field(default_factory=list)
    """
    Random activation records for this neuron. The random sample is independent from those used for
    other neurons.
    """
    random_sample_by_quantile: Optional[list[list[ActivationRecord]]] = None
    """
    Random samples of activation records in each of the specified quantiles. None if quantile
    tracking is disabled.
    """
    quantile_boundaries: Optional[list[float]] = None
    """Boundaries of the quantiles used to generate the random_sample_by_quantile field."""

    # Moments of activations
    mean: Optional[float] = math.nan
    variance: Optional[float] = math.nan
    skewness: Optional[float] = math.nan
    kurtosis: Optional[float] = math.nan

    most_positive_activation_records: list[ActivationRecord] = field(default_factory=list)
    """
    Activation records with the most positive figure of merit value for this neuron over all dataset
    examples.
    """

    @property
    def max_activation(self) -> float:
        """Return the maximum activation value over all top-activating activation records.

        Note: raises ValueError (max of an empty sequence) if there are no top-activating
        activation records.
        """
        return max([max(ar.activations) for ar in self.most_positive_activation_records])

    def _get_top_activation_slices(
        self, activation_record_slice_params: ActivationRecordSliceParams
    ) -> dict[str, slice]:
        """Interleaved slices of most_positive_activation_records for the four splits."""
        splits = ["train", "calibration", "valid", "test"]
        n_examples_per_split = activation_record_slice_params.n_examples_per_split
        # When unspecified, use as many examples per split as the records allow.
        if n_examples_per_split is None:
            n_examples_per_split = len(self.most_positive_activation_records) // len(splits)
        assert len(self.most_positive_activation_records) >= n_examples_per_split * len(splits)
        return get_slices_for_splits(splits, n_examples_per_split)

    def _get_random_activation_slices(
        self, activation_record_slice_params: ActivationRecordSliceParams
    ) -> dict[str, slice]:
        """Interleaved slices of random_sample for the three non-train splits."""
        splits = ["calibration", "valid", "test"]
        n_examples_per_split = activation_record_slice_params.n_examples_per_split
        # When unspecified, use as many examples per split as the records allow.
        if n_examples_per_split is None:
            n_examples_per_split = len(self.random_sample) // len(splits)
        # NOTE: this assert could trigger on some old datasets with only 10 random samples, in which case you may have to remove "test" from the set of splits
        assert len(self.random_sample) >= n_examples_per_split * len(splits)
        return get_slices_for_splits(splits, n_examples_per_split)

    def train_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Train split, typically used for generating explanations. Consists exclusively of
        top-activating records since context window limitations make it difficult to include
        random records.
        """
        return self.most_positive_activation_records[
            self._get_top_activation_slices(activation_record_slice_params)["train"]
        ]

    def calibration_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Calibration split, typically used for calibrating neuron simulations. See
        http://go/neuron_explanation_methodology for an explanation of calibration. Consists of
        top-activating records and random records in a 1:1 ratio.
        """
        return (
            self.most_positive_activation_records[
                self._get_top_activation_slices(activation_record_slice_params)["calibration"]
            ]
            + self.random_sample[
                self._get_random_activation_slices(activation_record_slice_params)["calibration"]
            ]
        )

    def valid_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Validation split, typically used for evaluating explanations, either automatically with
        simulation + correlation coefficient scoring, or manually by humans. Consists of
        top-activating records and random records in a 1:1 ratio.
        """
        return (
            self.most_positive_activation_records[
                self._get_top_activation_slices(activation_record_slice_params)["valid"]
            ]
            + self.random_sample[
                self._get_random_activation_slices(activation_record_slice_params)["valid"]
            ]
        )

    def test_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Test split, typically used for explanation evaluations that can't use the validation split.
        Consists of top-activating records and random records in a 1:1 ratio.
        """
        return (
            self.most_positive_activation_records[
                self._get_top_activation_slices(activation_record_slice_params)["test"]
            ]
            + self.random_sample[
                self._get_random_activation_slices(activation_record_slice_params)["test"]
            ]
        )
213 |
214 |
def neuron_exists(
    dataset_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> bool:
    """Return whether the specified neuron exists."""
    neuron_file = bf.join(dataset_path, "neurons", str(layer_index), f"{neuron_index}.json")
    return bf.exists(neuron_file)
221 |
222 |
def load_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """Load the NeuronRecord for the specified neuron."""
    url = standardize_azure_url("/".join([dataset_path, str(layer_index), f"{neuron_index}.json"]))
    with urllib.request.urlopen(url) as response:
        neuron_record = loads(response.read())
    if not isinstance(neuron_record, NeuronRecord):
        raise ValueError(
            "Stored data incompatible with current version of NeuronRecord dataclass."
        )
    return neuron_record
238 |
239 |
@bbb.ensure_session
async def load_neuron_async(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "az://openaipublic/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """Async version of load_neuron."""
    neuron_file = bf.join(dataset_path, str(layer_index), f"{neuron_index}.json")
    return await read_neuron_file(neuron_file)
249 |
250 |
@bbb.ensure_session
async def read_neuron_file(neuron_filename: str) -> NeuronRecord:
    """Like load_neuron_async, but takes a raw neuron filename."""
    raw_contents = await bbb.read.read_single(neuron_filename)
    neuron_record = loads(raw_contents.decode("utf-8"))
    if isinstance(neuron_record, NeuronRecord):
        return neuron_record
    raise ValueError("Stored data incompatible with current version of NeuronRecord dataclass.")
261 |
262 |
def get_sorted_neuron_indices(dataset_path: str, layer_index: Union[str, int]) -> List[int]:
    """Returns the indices of all neurons in this layer, in ascending order."""
    layer_dir = bf.join(dataset_path, "neurons", str(layer_index))
    indices = [
        int(stem)
        for stem in (name.split(".")[0] for name in bf.listdir(layer_dir))
        if stem.isnumeric()
    ]
    indices.sort()
    return indices
269 |
270 |
def get_sorted_layers(dataset_path: str) -> List[str]:
    """
    Return the indices of all layers in this dataset, in ascending numerical order, as strings.
    """
    # Sort numerically so that e.g. "10" comes after "2", then render back to strings.
    layer_indices = sorted(
        int(name) for name in bf.listdir(bf.join(dataset_path, "neurons")) if name.isnumeric()
    )
    return [str(index) for index in layer_indices]
281 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/explainer.py:
--------------------------------------------------------------------------------
1 | """Uses API calls to generate explanations of neuron behavior."""
2 |
3 | from __future__ import annotations
4 |
5 | import logging
6 | import re
7 | from abc import ABC, abstractmethod
8 | from enum import Enum
9 | from typing import Any, Optional, Sequence, Union
10 |
11 | from neuron_explainer.activations.activation_records import (
12 | calculate_max_activation,
13 | format_activation_records,
14 | non_zero_activation_proportion,
15 | )
16 | from neuron_explainer.activations.activations import ActivationRecord
17 | from neuron_explainer.api_client import ApiClient
18 | from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
19 | from neuron_explainer.explanations.prompt_builder import (
20 | HarmonyMessage,
21 | PromptBuilder,
22 | PromptFormat,
23 | Role,
24 | )
25 | from neuron_explainer.explanations.token_space_few_shot_examples import (
26 | TokenSpaceFewShotExampleSet,
27 | )
28 |
29 | logger = logging.getLogger(__name__)
30 |
31 |
32 | # TODO(williamrs): This prefix may not work well for some things, like predicting the next token.
33 | # Try other options like "this neuron activates for".
34 | EXPLANATION_PREFIX = "the main thing this neuron does is find"
35 |
36 |
def _split_numbered_list(text: str) -> list[str]:
    """Split a numbered list into a list of strings."""
    # Entries are separated by a newline followed by "<number>.", e.g. "\n2.".
    segments = re.split(r"\n\d+\.", text)
    # Strip the leading whitespace from each entry.
    return [segment.lstrip() for segment in segments]
42 |
43 |
def _remove_final_period(text: str) -> str:
    """Strip a final period or period-space from a string."""
    for suffix in (".", ". "):
        if text.endswith(suffix):
            return text[: -len(suffix)]
    return text
51 |
52 |
class ContextSize(int, Enum):
    # Values are the context length in tokens (one more than the nominal 2k/4k size).
    TWO_K = 2049
    FOUR_K = 4097

    @classmethod
    def from_int(cls, i: int) -> "ContextSize":
        """Return the member whose value equals i, raising ValueError if there is none."""
        matches = [member for member in cls if member.value == i]
        if matches:
            return matches[0]
        raise ValueError(f"{i} is not a valid ContextSize")
63 |
64 |
65 | HARMONY_V4_MODELS = ["gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"]
66 |
67 |
class NeuronExplainer(ABC):
    """
    Abstract base class for Explainer classes that generate explanations from subclass-specific
    input data.

    Subclasses implement make_explanation_prompt; generate_explanations handles the API
    round-trip and response parsing for every supported prompt format.
    """

    def __init__(
        self,
        model_name: str,
        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
        # This parameter lets us adjust the length of the prompt when we're generating explanations
        # using older models with shorter context windows. In the future we can use it to experiment
        # with longer context windows.
        context_size: ContextSize = ContextSize.FOUR_K,
        max_concurrent: Optional[int] = 10,
        cache: bool = False,
    ):
        # Chat-format (Harmony v4) prompts are only valid for chat models, and vice versa.
        if prompt_format == PromptFormat.HARMONY_V4:
            assert model_name in HARMONY_V4_MODELS
        elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
            assert model_name not in HARMONY_V4_MODELS
        else:
            raise ValueError(f"Unhandled prompt format {prompt_format}")

        self.model_name = model_name
        self.prompt_format = prompt_format
        self.context_size = context_size
        self.client = ApiClient(model_name=model_name, max_concurrent=max_concurrent, cache=cache)

    async def generate_explanations(
        self,
        *,
        num_samples: int = 5,
        max_tokens: int = 60,
        temperature: float = 1.0,
        top_p: float = 1.0,
        **prompt_kwargs: Any,
    ) -> list[Any]:
        """Generate explanations based on subclass-specific input data.

        num_samples is forwarded as the API's "n" parameter, so a single request yields
        num_samples completions. Remaining keyword arguments are passed through to
        make_explanation_prompt (and later to postprocess_explanations).
        """
        prompt = self.make_explanation_prompt(max_tokens_for_completion=max_tokens, **prompt_kwargs)

        generate_kwargs: dict[str, Any] = {
            "n": num_samples,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
        }

        # Chat models take a list of messages; completion models take a single prompt string.
        if self.prompt_format == PromptFormat.HARMONY_V4:
            assert isinstance(prompt, list)
            assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
            generate_kwargs["messages"] = prompt
        else:
            assert isinstance(prompt, str)
            generate_kwargs["prompt"] = prompt

        response = await self.client.make_request(**generate_kwargs)
        logger.debug("response in generate_explanations is %s", response)

        # The response shape differs by prompt format: chat completions nest the text under
        # "message"/"content", plain completions expose it as "text".
        if self.prompt_format == PromptFormat.HARMONY_V4:
            explanations = [x["message"]["content"] for x in response["choices"]]
        elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
            explanations = [x["text"] for x in response["choices"]]
        else:
            raise ValueError(f"Unhandled prompt format {self.prompt_format}")

        return self.postprocess_explanations(explanations, prompt_kwargs)

    @abstractmethod
    def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
        """
        Create a prompt to send to the API to generate one or more explanations.

        A prompt can be a simple string, or a list of HarmonyMessages, depending on the PromptFormat
        used by this instance.
        """
        ...

    def postprocess_explanations(
        self, completions: list[str], prompt_kwargs: dict[str, Any]
    ) -> list[Any]:
        """Postprocess the completions returned by the API into a list of explanations."""
        return completions  # no-op by default

    def _prompt_is_too_long(
        self, prompt_builder: PromptBuilder, max_tokens_for_completion: int
    ) -> bool:
        """Return True if prompt + completion budget would exceed the model's context size."""
        # We'll get a context size error if the prompt itself plus the maximum number of tokens for
        # the completion is longer than the context size.
        prompt_length = prompt_builder.prompt_length_in_tokens(self.prompt_format)
        if prompt_length + max_tokens_for_completion > self.context_size.value:
            print(
                f"Prompt is too long: {prompt_length} + {max_tokens_for_completion} > "
                f"{self.context_size.value}"
            )
            return True
        return False
165 |
166 |
class TokenActivationPairExplainer(NeuronExplainer):
    """
    Generate explanations of neuron behavior using a prompt with lists of token/activation pairs.
    """

    def __init__(
        self,
        model_name: str,
        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
        # This parameter lets us adjust the length of the prompt when we're generating explanations
        # using older models with shorter context windows. In the future we can use it to experiment
        # with 8k+ context windows.
        context_size: ContextSize = ContextSize.FOUR_K,
        few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,
        repeat_non_zero_activations: bool = True,
        max_concurrent: Optional[int] = 10,
        cache: bool = False,
    ):
        super().__init__(
            model_name=model_name,
            prompt_format=prompt_format,
            max_concurrent=max_concurrent,
            cache=cache,
        )
        self.context_size = context_size
        self.few_shot_example_set = few_shot_example_set
        self.repeat_non_zero_activations = repeat_non_zero_activations

    def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
        """Build the few-shot prompt, shrinking it recursively if it exceeds the context size."""
        original_kwargs = kwargs.copy()
        all_activation_records: Sequence[ActivationRecord] = kwargs.pop("all_activation_records")
        max_activation: float = kwargs.pop("max_activation")
        kwargs.setdefault("numbered_list_of_n_explanations", None)
        numbered_list_of_n_explanations: Optional[int] = kwargs.pop(
            "numbered_list_of_n_explanations"
        )
        if numbered_list_of_n_explanations is not None:
            assert numbered_list_of_n_explanations > 0, numbered_list_of_n_explanations
        # This parameter lets us dynamically shrink the prompt if our initial attempt to create it
        # results in something that's too long. It's only implemented for the 4k context size.
        kwargs.setdefault("omit_n_activation_records", 0)
        omit_n_activation_records: int = kwargs.pop("omit_n_activation_records")
        max_tokens_for_completion: int = kwargs.pop("max_tokens_for_completion")
        assert not kwargs, f"Unexpected kwargs: {kwargs}"

        prompt_builder = PromptBuilder()
        prompt_builder.add_message(
            Role.SYSTEM,
            "We're studying neurons in a neural network. Each neuron looks for some particular "
            "thing in a short document. Look at the parts of the document the neuron activates for "
            "and summarize in a single sentence what the neuron is looking for. Don't list "
            "examples of words.\n\nThe activation format is tokenactivation. Activation "
            "values range from 0 to 10. A neuron finding what it's looking for is represented by a "
            "non-zero activation value. The higher the activation value, the stronger the match.",
        )
        few_shot_examples = self.few_shot_example_set.get_examples()
        num_omitted_activation_records = 0
        for i, few_shot_example in enumerate(few_shot_examples):
            few_shot_activation_records = few_shot_example.activation_records
            if self.context_size == ContextSize.TWO_K:
                # If we're using a 2k context window, we only have room for one activation record
                # per few-shot example. (Two few-shot examples with one activation record each seems
                # to work better than one few-shot example with two activation records, in local
                # testing.)
                few_shot_activation_records = few_shot_activation_records[:1]
            elif (
                self.context_size == ContextSize.FOUR_K
                and num_omitted_activation_records < omit_n_activation_records
            ):
                # Drop the last activation record for this few-shot example to save tokens, assuming
                # there are at least two activation records.
                if len(few_shot_activation_records) > 1:
                    print(f"Warning: omitting activation record from few-shot example {i}")
                    few_shot_activation_records = few_shot_activation_records[:-1]
                    num_omitted_activation_records += 1
            self._add_per_neuron_explanation_prompt(
                prompt_builder,
                few_shot_activation_records,
                i,
                calculate_max_activation(few_shot_example.activation_records),
                numbered_list_of_n_explanations=numbered_list_of_n_explanations,
                explanation=few_shot_example.explanation,
            )
        self._add_per_neuron_explanation_prompt(
            prompt_builder,
            # If we're using a 2k context window, we only have room for two of the activation
            # records.
            all_activation_records[:2]
            if self.context_size == ContextSize.TWO_K
            else all_activation_records,
            len(few_shot_examples),
            max_activation,
            numbered_list_of_n_explanations=numbered_list_of_n_explanations,
            explanation=None,
        )
        # If the prompt is too long *and* we omitted the specified number of activation records, try
        # again, omitting one more. (If we didn't make the specified number of omissions, we're out
        # of opportunities to omit records, so we just return the prompt as-is.)
        if (
            self._prompt_is_too_long(prompt_builder, max_tokens_for_completion)
            and num_omitted_activation_records == omit_n_activation_records
        ):
            original_kwargs["omit_n_activation_records"] = omit_n_activation_records + 1
            return self.make_explanation_prompt(**original_kwargs)
        return prompt_builder.build(self.prompt_format)

    def _add_per_neuron_explanation_prompt(
        self,
        prompt_builder: PromptBuilder,
        activation_records: Sequence[ActivationRecord],
        index: int,
        max_activation: float,
        # When set, this indicates that the prompt should solicit a numbered list of the given
        # number of explanations, rather than a single explanation.
        numbered_list_of_n_explanations: Optional[int],
        explanation: Optional[str],  # None means this is the end of the full prompt.
    ) -> None:
        """Append one neuron's activations (and optionally its explanation) to the prompt."""
        # Use the max_activation supplied by the caller, which is computed over the *full* set of
        # activation records for the neuron. (The previous version recomputed it here over the
        # possibly-truncated `activation_records`, silently discarding the caller's value and
        # changing the normalization whenever records were omitted to fit the context window.)
        user_message = f"""

Neuron {index + 1}
Activations:{format_activation_records(activation_records, max_activation, omit_zeros=False)}"""
        # We repeat the non-zero activations only if it was requested and if the proportion of
        # non-zero activations isn't too high.
        if (
            self.repeat_non_zero_activations
            and non_zero_activation_proportion(activation_records, max_activation) < 0.2
        ):
            user_message += (
                f"\nSame activations, but with all zeros filtered out:"
                f"{format_activation_records(activation_records, max_activation, omit_zeros=True)}"
            )

        if numbered_list_of_n_explanations is None:
            user_message += f"\nExplanation of neuron {index + 1} behavior:"
            assistant_message = ""
            # For the IF format, we want <|endofprompt|> to come before the explanation prefix.
            if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
                assistant_message += f" {EXPLANATION_PREFIX}"
            else:
                user_message += f" {EXPLANATION_PREFIX}"
            prompt_builder.add_message(Role.USER, user_message)

            if explanation is not None:
                assistant_message += f" {explanation}."
            if assistant_message:
                prompt_builder.add_message(Role.ASSISTANT, assistant_message)
        else:
            if explanation is None:
                # For the final neuron, we solicit a numbered list of explanations.
                prompt_builder.add_message(
                    Role.USER,
                    f"""\nHere are {numbered_list_of_n_explanations} possible explanations for neuron {index + 1} behavior, each beginning with "{EXPLANATION_PREFIX}":\n1. {EXPLANATION_PREFIX}""",
                )
            else:
                # For the few-shot examples, we only present one explanation, but we present it as a
                # numbered list.
                prompt_builder.add_message(
                    Role.USER,
                    f"""\nHere is 1 possible explanation for neuron {index + 1} behavior, beginning with "{EXPLANATION_PREFIX}":\n1. {EXPLANATION_PREFIX}""",
                )
                prompt_builder.add_message(Role.ASSISTANT, f" {explanation}.")

    def postprocess_explanations(
        self, completions: list[str], prompt_kwargs: dict[str, Any]
    ) -> list[Any]:
        """Postprocess the explanations returned by the API"""
        numbered_list_of_n_explanations = prompt_kwargs.get("numbered_list_of_n_explanations")
        if numbered_list_of_n_explanations is None:
            return completions
        else:
            # Each completion is itself a numbered list; flatten all entries into one list,
            # stripping the explanation prefix where present.
            all_explanations = []
            for completion in completions:
                for explanation in _split_numbered_list(completion):
                    if explanation.startswith(EXPLANATION_PREFIX):
                        explanation = explanation[len(EXPLANATION_PREFIX) :]
                    all_explanations.append(explanation.strip())
            return all_explanations
345 |
346 |
class TokenSpaceRepresentationExplainer(NeuronExplainer):
    """
    Generate explanations of arbitrary lists of tokens which disproportionately activate a
    particular neuron. These lists of tokens can be generated in various ways. As an example, in one
    set of experiments, we compute the average activation for each neuron conditional on each token
    that appears in an internet text corpus. We then sort the tokens by their average activation,
    and show 50 of the top 100 tokens. Other techniques that could be used include taking the top
    tokens in the logit lens or tuned lens representations of a neuron.
    """

    def __init__(
        self,
        model_name: str,
        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
        context_size: ContextSize = ContextSize.FOUR_K,
        few_shot_example_set: TokenSpaceFewShotExampleSet = TokenSpaceFewShotExampleSet.ORIGINAL,
        use_few_shot: bool = False,
        output_numbered_list: bool = False,
        max_concurrent: Optional[int] = 10,
        cache: bool = False,
    ) -> None:
        super().__init__(
            model_name=model_name,
            prompt_format=prompt_format,
            context_size=context_size,
            max_concurrent=max_concurrent,
            cache=cache,
        )
        self.use_few_shot = use_few_shot
        self.output_numbered_list = output_numbered_list
        if self.use_few_shot:
            # few_shot_example_set is only consulted when few-shot prompting is enabled;
            # otherwise it is ignored and self.few_shot_examples stays None.
            assert few_shot_example_set is not None
            self.few_shot_examples: Optional[TokenSpaceFewShotExampleSet] = few_shot_example_set
        else:
            self.few_shot_examples = None
        # System-message preamble prepended to every prompt built by this explainer.
        self.prompt_prefix = (
            "We're studying neurons in a neural network. Each neuron looks for some particular "
            "kind of token (which can be a word, or part of a word). Look at the tokens the neuron "
            "activates for (listed below) and summarize in a single sentence what the neuron is "
            "looking for. Don't list examples of words."
        )

    def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
        """
        Build the explanation prompt for a list of top tokens.

        Expects exactly two kwargs: `tokens` (the token strings to explain) and
        `max_tokens_for_completion` (the completion budget used for the length check).
        Raises ValueError if the resulting prompt would be too long.
        """
        tokens: list[str] = kwargs.pop("tokens")
        max_tokens_for_completion = kwargs.pop("max_tokens_for_completion")
        assert not kwargs, f"Unexpected kwargs: {kwargs}"
        # Note that stringifying and later re-tokenizing does not preserve the precise tokens:
        # e.g. a token without a leading space, once quoted and joined here, may be re-tokenized
        # with its leading space attached differently.
        # TODO(dan): Try out other variants, including "\n".join(...) and ",".join(...)
        stringified_tokens = ", ".join([f"'{t}'" for t in tokens])

        prompt_builder = PromptBuilder()
        prompt_builder.add_message(Role.SYSTEM, self.prompt_prefix)
        if self.use_few_shot:
            self._add_few_shot_examples(prompt_builder)
        # The real neuron's tokens go last, with no explanation, for the model to complete.
        self._add_neuron_specific_prompt(prompt_builder, stringified_tokens, explanation=None)

        if self._prompt_is_too_long(prompt_builder, max_tokens_for_completion):
            raise ValueError(f"Prompt too long: {prompt_builder.build(self.prompt_format)}")
        else:
            return prompt_builder.build(self.prompt_format)

    def _add_few_shot_examples(self, prompt_builder: PromptBuilder) -> None:
        """
        Append few-shot examples to the prompt. Each one consists of a comma-delimited list of
        tokens and corresponding explanations, as saved in
        neuron_explainer/explanations/token_space_few_shot_examples.py.
        """
        assert self.few_shot_examples is not None
        few_shot_example_list = self.few_shot_examples.get_examples()
        if self.output_numbered_list:
            raise NotImplementedError("Numbered list output not supported for few-shot examples")
        else:
            for few_shot_example in few_shot_example_list:
                self._add_neuron_specific_prompt(
                    prompt_builder,
                    ", ".join([f"'{t}'" for t in few_shot_example.tokens]),
                    explanation=few_shot_example.explanation,
                )

    def _add_neuron_specific_prompt(
        self,
        prompt_builder: PromptBuilder,
        stringified_tokens: str,
        explanation: Optional[str],
    ) -> None:
        """
        Append a neuron-specific prompt to the prompt builder. The prompt consists of a list of
        tokens followed by either an explanation (if one is passed, for few shot examples) or by
        the beginning of a completion, to be completed by the model with an explanation.
        """
        user_message = f"\n\n\n\nTokens:\n{stringified_tokens}\n\nExplanation:\n"
        assistant_message = ""
        looking_for = "This neuron is looking for"
        if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
            # We want <|endofprompt|> to come before "This neuron is looking for" in the IF format.
            assistant_message += looking_for
        else:
            user_message += looking_for
        if self.output_numbered_list:
            start_of_list = "\n1."
            if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
                assistant_message += start_of_list
            else:
                user_message += start_of_list
        if explanation is not None:
            assistant_message += f"{explanation}."
        prompt_builder.add_message(Role.USER, user_message)
        # Only emit an assistant turn when there is something for the assistant to "say"
        # (a few-shot explanation and/or the IF-format completion prefix).
        if assistant_message:
            prompt_builder.add_message(Role.ASSISTANT, assistant_message)

    def postprocess_explanations(
        self, completions: list[str], prompt_kwargs: dict[str, Any]
    ) -> list[str]:
        """Convert raw API completions into a flat list of explanation strings."""
        if self.output_numbered_list:
            # Each completion may contain several numbered explanations; split every completion
            # into its items, strip the explanation prefix where present, and return all items
            # from all completions as one flat list of strings.
            all_explanations = []
            for completion in completions:
                for explanation in _split_numbered_list(completion):
                    if explanation.startswith(EXPLANATION_PREFIX):
                        explanation = explanation[len(EXPLANATION_PREFIX) :]
                    all_explanations.append(explanation.strip())
            return all_explanations
        else:
            # One explanation string per completion; just drop any trailing period.
            return [_remove_final_period(explanation) for explanation in completions]
473 |
--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/few_shot_examples.py:
--------------------------------------------------------------------------------
1 | # Few-shot examples for generating and simulating neuron explanations.
2 |
3 | from __future__ import annotations
4 |
5 | from dataclasses import dataclass
6 | from enum import Enum
7 | from typing import List, Optional
8 |
9 | from neuron_explainer.activations.activations import ActivationRecord
10 | from neuron_explainer.fast_dataclasses import FastDataclass
11 |
12 |
@dataclass
class Example(FastDataclass):
    """
    A single few-shot example: one or more activation records for a neuron, together with a
    reference explanation of the neuron's behavior and metadata controlling how the example
    is rendered in prompts.
    """

    activation_records: List[ActivationRecord]
    explanation: str
    first_revealed_activation_indices: List[int]
    """
    For each activation record, the index of the first token for which the activation value in the
    prompt should be an actual number rather than "unknown".

    Examples all start with the activations rendered as "unknown", then transition to revealing
    specific normalized activation values. The goal is to lead the model to predict that activation
    sequences will eventually transition to predicting specific activation values instead of just
    "unknown". This lets us cheat and get predictions of activation values for every token in a
    single round of inference by having the activations in the sequence we're predicting always be
    "unknown" in the prompt: the model will always think that maybe the next token will be a real
    activation.
    """
    token_index_to_score: Optional[int] = None
    """
    If the prompt is used as an example for one-token-at-a-time scoring, this is the index of the
    token to score.
    """
35 |
36 |
class FewShotExampleSet(Enum):
    """Determines which few-shot examples to use when sampling explanations."""

    ORIGINAL = "original"
    NEWER = "newer"
    TEST = "test"

    @classmethod
    def from_string(cls, string: str) -> FewShotExampleSet:
        """Look up an example set by its string value, raising ValueError on no match."""
        for candidate in cls:
            if candidate.value == string:
                return candidate
        raise ValueError(f"Unrecognized example set: {string}")

    def get_examples(self) -> list[Example]:
        """Returns regular examples for use in a few-shot prompt."""
        examples_by_set = {
            FewShotExampleSet.ORIGINAL: ORIGINAL_EXAMPLES,
            FewShotExampleSet.NEWER: NEWER_EXAMPLES,
            FewShotExampleSet.TEST: TEST_EXAMPLES,
        }
        if self not in examples_by_set:
            raise ValueError(f"Unhandled example set: {self}")
        return examples_by_set[self]

    def get_single_token_prediction_example(self) -> Example:
        """
        Returns an example suitable for use in a subprompt for predicting a single token's
        normalized activation, for use with the "one token at a time" scoring approach.
        """
        if self is FewShotExampleSet.NEWER:
            return NEWER_SINGLE_TOKEN_EXAMPLE
        if self is FewShotExampleSet.TEST:
            return TEST_SINGLE_TOKEN_EXAMPLE
        # No single-token example exists for the ORIGINAL set.
        raise ValueError(f"Unhandled example set: {self}")
73 |
74 |
# Tiny synthetic fixtures returned for FewShotExampleSet.TEST.
TEST_EXAMPLES = [
    Example(
        activation_records=[
            ActivationRecord(
                tokens=["a", "b", "c"],
                activations=[1.0, 0.0, 0.0],
            ),
            ActivationRecord(
                tokens=["d", "e", "f"],
                activations=[0.0, 1.0, 0.0],
            ),
        ],
        explanation="vowels",
        first_revealed_activation_indices=[0, 1],
    ),
]
91 |
# Tiny synthetic fixture returned by
# FewShotExampleSet.TEST.get_single_token_prediction_example().
TEST_SINGLE_TOKEN_EXAMPLE = Example(
    activation_records=[
        ActivationRecord(
            activations=[0.0, 0.0, 1.0],
            tokens=["g", "h", "i"],
        ),
    ],
    first_revealed_activation_indices=[],
    token_index_to_score=2,
    explanation="test explanation",
)
103 |
104 |
# Hand-curated few-shot examples returned for FewShotExampleSet.ORIGINAL. Each Example pairs
# two activation records (raw values, which may be negative) with a reference explanation.
ORIGINAL_EXAMPLES = [
    Example(
        activation_records=[
            ActivationRecord(
                tokens=[
                    "t",
                    "urt",
                    "ur",
                    "ro",
                    " is",
                    " fab",
                    "ulously",
                    " funny",
                    " and",
                    " over",
                    " the",
                    " top",
                    " as",
                    " a",
                    " '",
                    "very",
                    " sneaky",
                    "'",
                    " but",
                    "ler",
                    " who",
                    " excel",
                    "s",
                    " in",
                    " the",
                    " art",
                    " of",
                    " impossible",
                    " disappearing",
                    "/",
                    "re",
                    "app",
                    "earing",
                    " acts",
                ],
                activations=[
                    -0.71,
                    -1.85,
                    -2.39,
                    -2.58,
                    -1.34,
                    -1.92,
                    -1.69,
                    -0.84,
                    -1.25,
                    -1.75,
                    -1.42,
                    -1.47,
                    -1.51,
                    -0.8,
                    -1.89,
                    -1.56,
                    -1.63,
                    0.44,
                    -1.87,
                    -2.55,
                    -2.09,
                    -1.76,
                    -1.33,
                    -0.88,
                    -1.63,
                    -2.39,
                    -2.63,
                    -0.99,
                    2.83,
                    -1.11,
                    -1.19,
                    -1.33,
                    4.24,
                    -1.51,
                ],
            ),
            ActivationRecord(
                tokens=[
                    "esc",
                    "aping",
                    " the",
                    " studio",
                    " ,",
                    " pic",
                    "col",
                    "i",
                    " is",
                    " warm",
                    "ly",
                    " affecting",
                    " and",
                    " so",
                    " is",
                    " this",
                    " ad",
                    "roit",
                    "ly",
                    " minimalist",
                    " movie",
                    " .",
                ],
                activations=[
                    -0.69,
                    4.12,
                    1.83,
                    -2.28,
                    -0.28,
                    -0.79,
                    -2.2,
                    -2.03,
                    -1.77,
                    -1.71,
                    -2.44,
                    1.6,
                    -1,
                    -0.38,
                    -1.93,
                    -2.09,
                    -1.63,
                    -1.94,
                    -1.82,
                    -1.64,
                    -1.32,
                    -1.92,
                ],
            ),
        ],
        first_revealed_activation_indices=[10, 3],
        explanation="present tense verbs ending in 'ing'",
    ),
    Example(
        activation_records=[
            ActivationRecord(
                tokens=[
                    "as",
                    " sac",
                    "char",
                    "ine",
                    " movies",
                    " go",
                    " ,",
                    " this",
                    " is",
                    " likely",
                    " to",
                    " cause",
                    " massive",
                    " cardiac",
                    " arrest",
                    " if",
                    " taken",
                    " in",
                    " large",
                    " doses",
                    " .",
                ],
                activations=[
                    -0.14,
                    -1.37,
                    -0.68,
                    -2.27,
                    -1.46,
                    -1.11,
                    -0.9,
                    -2.48,
                    -2.07,
                    -3.49,
                    -2.16,
                    -1.79,
                    -0.23,
                    -0.04,
                    4.46,
                    -1.02,
                    -2.26,
                    -2.95,
                    -1.49,
                    -1.46,
                    -0.6,
                ],
            ),
            ActivationRecord(
                tokens=[
                    "shot",
                    " perhaps",
                    " '",
                    "art",
                    "istically",
                    "'",
                    " with",
                    " handheld",
                    " cameras",
                    " and",
                    " apparently",
                    " no",
                    " movie",
                    " lights",
                    " by",
                    " jo",
                    "aquin",
                    " b",
                    "aca",
                    "-",
                    "as",
                    "ay",
                    " ,",
                    " the",
                    " low",
                    "-",
                    "budget",
                    " production",
                    " swings",
                    " annoy",
                    "ingly",
                    " between",
                    " vert",
                    "igo",
                    " and",
                    " opacity",
                    " .",
                ],
                activations=[
                    -0.09,
                    -3.53,
                    -0.72,
                    -2.36,
                    -1.05,
                    -1.12,
                    -2.49,
                    -2.14,
                    -1.98,
                    -1.59,
                    -2.62,
                    -2,
                    -2.73,
                    -2.87,
                    -3.23,
                    -1.11,
                    -2.23,
                    -0.97,
                    -2.28,
                    -2.37,
                    -1.5,
                    -2.81,
                    -1.73,
                    -3.14,
                    -2.61,
                    -1.7,
                    -3.08,
                    -4,
                    -0.71,
                    -2.48,
                    -1.39,
                    -1.96,
                    -1.09,
                    4.37,
                    -0.74,
                    -0.5,
                    -0.62,
                ],
            ),
        ],
        first_revealed_activation_indices=[5, 20],
        explanation="words related to physical medical conditions",
    ),
    Example(
        activation_records=[
            ActivationRecord(
                tokens=[
                    "the",
                    " sense",
                    " of",
                    " together",
                    "ness",
                    " in",
                    " our",
                    " town",
                    " is",
                    " strong",
                    " .",
                ],
                activations=[
                    0,
                    0,
                    0,
                    1,
                    2,
                    0,
                    0.23,
                    0.5,
                    0,
                    0,
                    0,
                ],
            ),
            ActivationRecord(
                tokens=[
                    "a",
                    " buoy",
                    "ant",
                    " romantic",
                    " comedy",
                    " about",
                    " friendship",
                    " ,",
                    " love",
                    " ,",
                    " and",
                    " the",
                    " truth",
                    " that",
                    " we",
                    "'re",
                    " all",
                    " in",
                    " this",
                    " together",
                    " .",
                ],
                activations=[
                    -0.15,
                    -2.33,
                    -1.4,
                    -2.17,
                    -2.53,
                    -0.85,
                    0.23,
                    -1.89,
                    0.09,
                    -0.47,
                    -0.5,
                    -0.58,
                    -0.87,
                    0.22,
                    0.58,
                    1.34,
                    0.98,
                    2.21,
                    2.84,
                    1.7,
                    -0.89,
                ],
            ),
        ],
        first_revealed_activation_indices=[0, 10],
        explanation="phrases related to community",
    ),
]
453 |
454 |
# Few-shot examples returned for FewShotExampleSet.NEWER. Activations here are mostly zeros
# (some rendered as -0) with a few sharp positive spikes. One activation record is kept only
# as commented-out text because including it can exceed the model's context window.
NEWER_EXAMPLES = [
    Example(
        activation_records=[
            ActivationRecord(
                tokens=[
                    "The",
                    " editors",
                    " of",
                    " Bi",
                    "opol",
                    "ym",
                    "ers",
                    " are",
                    " delighted",
                    " to",
                    " present",
                    " the",
                    " ",
                    "201",
                    "8",
                    " Murray",
                    " Goodman",
                    " Memorial",
                    " Prize",
                    " to",
                    " Professor",
                    " David",
                    " N",
                    ".",
                    " Ber",
                    "atan",
                    " in",
                    " recognition",
                    " of",
                    " his",
                    " seminal",
                    " contributions",
                    " to",
                    " bi",
                    "oph",
                    "ysics",
                    " and",
                    " their",
                    " impact",
                    " on",
                    " our",
                    " understanding",
                    " of",
                    " charge",
                    " transport",
                    " in",
                    " biom",
                    "olecules",
                    ".\n\n",
                    "In",
                    "aug",
                    "ur",
                    "ated",
                    " in",
                    " ",
                    "200",
                    "7",
                    " in",
                    " honor",
                    " of",
                    " the",
                    " Bi",
                    "opol",
                    "ym",
                    "ers",
                    " Found",
                    "ing",
                    " Editor",
                    ",",
                    " the",
                    " prize",
                    " is",
                    " awarded",
                    " for",
                    " outstanding",
                    " accomplishments",
                ],
                activations=[
                    0,
                    0.01,
                    0.01,
                    0,
                    0,
                    0,
                    -0.01,
                    0,
                    -0.01,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0.04,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    3.39,
                    0.12,
                    0,
                    -0.01,
                    0,
                    0,
                    0,
                    0,
                    -0,
                    0,
                    -0,
                    0,
                    0,
                    -0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    -0,
                    0,
                    0,
                    -0.01,
                    0,
                    0.41,
                    0,
                    0,
                    0,
                    -0.01,
                    0,
                    0,
                    0,
                    0,
                    0,
                ],
            ),
            # We sometimes exceed the max context size when this is included :(
            # ActivationRecord(
            #     tokens=[
            #         " We",
            #         " are",
            #         " proud",
            #         " of",
            #         " our",
            #         " national",
            #         " achievements",
            #         " in",
            #         " mastering",
            #         " all",
            #         " aspects",
            #         " of",
            #         " the",
            #         " fuel",
            #         " cycle",
            #         ".",
            #         " The",
            #         " current",
            #         " international",
            #         " interest",
            #         " in",
            #         " closing",
            #         " the",
            #         " fuel",
            #         " cycle",
            #         " is",
            #         " a",
            #         " vind",
            #         "ication",
            #         " of",
            #         " Dr",
            #         ".",
            #         " B",
            #         "hab",
            #         "ha",
            #         "’s",
            #         " pioneering",
            #         " vision",
            #         " and",
            #         " genius",
            #     ],
            #     activations=[
            #         -0,
            #         -0,
            #         0,
            #         -0,
            #         -0,
            #         0,
            #         0,
            #         0,
            #         -0,
            #         0,
            #         0,
            #         -0,
            #         0,
            #         -0.01,
            #         0,
            #         0,
            #         -0,
            #         -0,
            #         0,
            #         0,
            #         0,
            #         -0,
            #         -0,
            #         -0.01,
            #         0,
            #         0,
            #         -0,
            #         0,
            #         0,
            #         0,
            #         0,
            #         0,
            #         -0,
            #         0,
            #         0,
            #         0,
            #         2.15,
            #         0,
            #         0,
            #         0.03,
            #     ],
            # ),
        ],
        first_revealed_activation_indices=[7],  # , 19],
        explanation="language related to something being groundbreaking",
    ),
    Example(
        activation_records=[
            ActivationRecord(
                tokens=[
                    '{"',
                    "widget",
                    "Class",
                    '":"',
                    "Variant",
                    "Matrix",
                    "Widget",
                    '","',
                    "back",
                    "order",
                    "Message",
                    '":"',
                    "Back",
                    "ordered",
                    '","',
                    "back",
                    "order",
                    "Message",
                    "Single",
                    "Variant",
                    '":"',
                    "This",
                    " item",
                    " is",
                    " back",
                    "ordered",
                    '.","',
                    "ordered",
                    "Selection",
                    '":',
                    "true",
                    ',"',
                    "product",
                    "Variant",
                    "Id",
                    '":',
                    "0",
                    ',"',
                    "variant",
                    "Id",
                    "Field",
                    '":"',
                    "product",
                    "196",
                    "39",
                    "_V",
                    "ariant",
                    "Id",
                    '","',
                    "back",
                    "order",
                    "To",
                    "Message",
                    "Single",
                    "Variant",
                    '":"',
                    "This",
                    " item",
                    " is",
                    " back",
                    "ordered",
                    " and",
                    " is",
                    " expected",
                    " by",
                    " {",
                    "0",
                    "}.",
                    '","',
                    "low",
                    "Price",
                    '":',
                    "999",
                    "9",
                    ".",
                    "0",
                    ',"',
                    "attribute",
                    "Indexes",
                    '":[',
                    '],"',
                    "productId",
                    '":',
                    "196",
                    "39",
                    ',"',
                    "price",
                    "V",
                    "ariance",
                    '":',
                    "true",
                    ',"',
                ],
                activations=[
                    0,
                    0,
                    0,
                    0,
                    4.2,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    3.6,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    3.7,
                    0,
                    0,
                    0,
                    0,
                    4.02,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    3.5,
                    3.7,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    2.9,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    2.3,
                    2.24,
                    0,
                    0,
                    0,
                ],
            ),
            ActivationRecord(
                tokens=[
                    "A",
                    " regular",
                    " look",
                    " at",
                    " the",
                    " ups",
                    " and",
                    " downs",
                    " of",
                    " variant",
                    " covers",
                    " in",
                    " the",
                    " comics",
                    " industry",
                    "…\n\n",
                    "Here",
                    " are",
                    " the",
                    " Lego",
                    " variant",
                    " sketch",
                    " covers",
                    " by",
                    " Leon",
                    "el",
                    " Cast",
                    "ell",
                    "ani",
                    " for",
                    " a",
                    " variety",
                    " of",
                    " Marvel",
                    " titles",
                    ",",
                ],
                activations=[
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    6.52,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    1.62,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    0,
                    3.23,
                    0,
                    0,
                    0,
                    0,
                ],
            ),
        ],
        first_revealed_activation_indices=[2, 8],
        explanation="the word “variant” and other words with the same ”vari” root",
    ),
]
982 |
983 |
# Example returned by FewShotExampleSet.NEWER.get_single_token_prediction_example(); the final
# token/activation pair is synthetic (see the inline comments below).
NEWER_SINGLE_TOKEN_EXAMPLE = Example(
    activation_records=[
        ActivationRecord(
            tokens=[
                "B",
                "10",
                " ",
                "111",
                " MON",
                "DAY",
                ",",
                " F",
                "EB",
                "RU",
                "ARY",
                " ",
                "11",
                ",",
                " ",
                "201",
                "9",
                " DON",
                "ATE",
                "fake higher scoring token",  # See below.
            ],
            activations=[
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0,
                0.37,
                # This fake activation makes the previous token's activation normalize to 8, which
                # might help address overconfidence in "10" activations for the one-token-at-a-time
                # scoring prompt. This value and the associated token don't actually appear anywhere
                # in the prompt.
                0.45,
            ],
        ),
    ],
    first_revealed_activation_indices=[],
    token_index_to_score=18,
    explanation="instances of the token 'ate' as part of another word",
)
1041 |
--------------------------------------------------------------------------------