├── .gitignore ├── neuron-explainer ├── neuron_explainer │ ├── __init__.py │ ├── activations │ │ ├── __init__.py │ │ ├── token_connections.py │ │ ├── activation_records.py │ │ └── activations.py │ ├── explanations │ │ ├── __init__.py │ │ ├── puzzles.py │ │ ├── prompt_builder.py │ │ ├── token_space_few_shot_examples.py │ │ ├── scoring.py │ │ ├── test_explainer.py │ │ ├── calibrated_simulator.py │ │ ├── test_simulator.py │ │ ├── explanations.py │ │ ├── explainer.py │ │ └── few_shot_examples.py │ ├── fast_dataclasses │ │ ├── __init__.py │ │ ├── test_fast_dataclasses.py │ │ └── fast_dataclasses.py │ ├── azure.py │ └── api_client.py ├── .gitignore ├── setup.py ├── README.md └── demos │ ├── explain_puzzles.ipynb │ ├── generate_and_score_explanation.ipynb │ └── generate_and_score_token_look_up_table_explanation.ipynb ├── neuron-viewer ├── public │ ├── robots.txt │ └── favicon.ico ├── tailwind.config.js ├── .parcelrc ├── src │ ├── panes │ │ ├── index.js │ │ ├── datasetList.jsx │ │ ├── similarNeurons.jsx │ │ ├── topTokens.jsx │ │ └── explanation.jsx │ ├── index.css │ ├── reportWebVitals.js │ ├── App.jsx │ ├── heatmapGrid.tsx │ ├── index.jsx │ ├── utils.ts │ ├── tokenHeatmap.tsx │ ├── feed.jsx │ ├── index.html │ ├── types.ts │ ├── simulationHeatmap.tsx │ ├── interpAPI.ts │ ├── App.css │ └── welcome.tsx ├── tsconfig.json ├── README.md ├── .gitignore ├── package.json └── python │ └── server.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/activations/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/explanations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /neuron-explainer/.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info/ 2 | __pycache__/ 3 | -------------------------------------------------------------------------------- /neuron-viewer/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /neuron-viewer/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/automated-interpretability/HEAD/neuron-viewer/public/favicon.ico -------------------------------------------------------------------------------- /neuron-viewer/tailwind.config.js: -------------------------------------------------------------------------------- 1 | /** @type {import('tailwindcss').Config} */ 2 | module.exports = { 3 | content: ["./src/**/*.{html,js,jsx}"], 4 | theme: { 5 | extend: {}, 6 | }, 7 | plugins: [], 8 | } 9 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py: -------------------------------------------------------------------------------- 1 | from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass 2 | 3 | __all__ = ["FastDataclass", "dumps", "loads", "register_dataclass"] 4 | -------------------------------------------------------------------------------- /neuron-viewer/.parcelrc: -------------------------------------------------------------------------------- 1 | { 2 | 
"extends": "@parcel/config-default", 3 | "transformers": { 4 | "*.{ts,tsx}": ["@parcel/transformer-typescript-tsc"] 5 | }, 6 | "validators": { 7 | "*.{ts,tsx}": ["@parcel/validator-typescript"] 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /neuron-viewer/src/panes/index.js: -------------------------------------------------------------------------------- 1 | export { default as TopTokens } from "./topTokens" 2 | export { default as Explanation } from "./explanation" 3 | export { default as DatasetList } from "./datasetList" 4 | export { default as SimilarNeurons } from "./similarNeurons" 5 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/azure.py: -------------------------------------------------------------------------------- 1 | def standardize_azure_url(url): 2 | """Make sure url is converted to url format, not an azure path""" 3 | if url.startswith("az://openaipublic/"): 4 | url = url.replace("az://openaipublic/", "https://openaipublic.blob.core.windows.net/") 5 | return url 6 | -------------------------------------------------------------------------------- /neuron-viewer/src/index.css: -------------------------------------------------------------------------------- 1 | body { 2 | margin: 0; 3 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 4 | 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', 5 | sans-serif; 6 | -webkit-font-smoothing: antialiased; 7 | -moz-osx-font-smoothing: grayscale; 8 | } 9 | 10 | code { 11 | font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New', 12 | monospace; 13 | } 14 | -------------------------------------------------------------------------------- /neuron-viewer/src/reportWebVitals.js: -------------------------------------------------------------------------------- 1 | const reportWebVitals = onPerfEntry => { 2 | if (onPerfEntry && onPerfEntry 
instanceof Function) { 3 | import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => { 4 | getCLS(onPerfEntry); 5 | getFID(onPerfEntry); 6 | getFCP(onPerfEntry); 7 | getLCP(onPerfEntry); 8 | getTTFB(onPerfEntry); 9 | }); 10 | } 11 | }; 12 | 13 | export default reportWebVitals; -------------------------------------------------------------------------------- /neuron-viewer/src/App.jsx: -------------------------------------------------------------------------------- 1 | import "./App.css" 2 | import Feed from "./feed" 3 | import React from "react" 4 | import { Routes, Route, HashRouter } from "react-router-dom" 5 | 6 | function App() { 7 | return ( 8 | 9 | 10 | } /> 11 | } /> 12 | 13 | 14 | ) 15 | } 16 | 17 | export default App 18 | -------------------------------------------------------------------------------- /neuron-viewer/src/heatmapGrid.tsx: -------------------------------------------------------------------------------- 1 | import { TokenAndActivation } from "./types" 2 | import TokenHeatmap from "./tokenHeatmap"; 3 | 4 | export default ({ allTokens }: { allTokens: TokenAndActivation[][]}) => { 5 | return ( 6 |
7 | {allTokens.map((tokens, i) => ( 8 |
9 | 10 |
11 | ))} 12 |
13 | ); 14 | }; 15 | -------------------------------------------------------------------------------- /neuron-explainer/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="neuron_explainer", 5 | packages=find_packages(), 6 | version="0.0.1", 7 | author="OpenAI", 8 | install_requires=[ 9 | "httpx>=0.22", 10 | "scikit-learn", 11 | "boostedblob>=0.13.0", 12 | "tiktoken", 13 | "blobfile", 14 | "numpy", 15 | "pytest", 16 | "orjson", 17 | ], 18 | url="", 19 | description="", 20 | python_requires='>=3.9', 21 | ) 22 | -------------------------------------------------------------------------------- /neuron-viewer/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2021", 4 | "module": "commonjs", 5 | "lib": ["dom", "dom.iterable", "esnext"], 6 | "allowJs": true, 7 | "skipLibCheck": true, 8 | "esModuleInterop": true, 9 | "allowSyntheticDefaultImports": true, 10 | "strict": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "moduleResolution": "node", 13 | "resolveJsonModule": true, 14 | "isolatedModules": true, 15 | "noEmit": true, 16 | "jsx": "react-jsx" 17 | }, 18 | "include": ["src"] 19 | } 20 | -------------------------------------------------------------------------------- /neuron-viewer/README.md: -------------------------------------------------------------------------------- 1 | # Neuron viewer 2 | 3 | The easiest way to view neurons and explanations is using the 4 | [public website](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html). 5 | This directory contains the implementation of that website as well as lightweight servers that make 6 | it possible to run an alternative version of the website locally. 
7 | 8 | ## Local development 9 | 10 | Install: 11 | 12 | ```npm install``` 13 | 14 | Run the backend: 15 | 16 | ```npm run startpy``` 17 | 18 | Run the frontend: 19 | 20 | ```npm start``` 21 | -------------------------------------------------------------------------------- /neuron-viewer/.gitignore: -------------------------------------------------------------------------------- 1 | **/*.trace 2 | **/*.zip 3 | **/*.tar.gz 4 | **/*.tgz 5 | **/*.log 6 | .parcel-cache 7 | 8 | package-lock.json 9 | **/*.bun 10 | 11 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 12 | 13 | # dependencies 14 | /node_modules 15 | /.pnp 16 | .pnp.js 17 | 18 | # testing 19 | /coverage 20 | 21 | # production 22 | /build 23 | 24 | # misc 25 | .DS_Store 26 | .env.local 27 | .env.development.local 28 | .env.test.local 29 | .env.production.local 30 | 31 | npm-debug.log* 32 | yarn-debug.log* 33 | yarn-error.log* 34 | 35 | *.pyc 36 | dist/ 37 | 38 | .vscode 39 | -------------------------------------------------------------------------------- /neuron-viewer/src/index.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import ReactDOM from 'react-dom/client'; 3 | import './index.css'; 4 | import App from './App'; 5 | import reportWebVitals from './reportWebVitals'; 6 | 7 | const root = ReactDOM.createRoot(document.getElementById('root')); 8 | root.render( 9 | 10 | 11 | 12 | ); 13 | 14 | // If you want to start measuring performance in your app, pass a function 15 | // to log results (for example: reportWebVitals(console.log)) 16 | // or send to an analytics endpoint. 
/**
 * Wrap an async function so that its results are cached in `localStorage`.
 *
 * The cache key is built from `fnname` and the JSON form of every argument, so
 * `fnname` must be unique per wrapped function. Caching is best-effort: when an
 * entry cannot be read or written the wrapped function still returns the
 * freshly computed value.
 *
 * @param fnname label used in the cache key and in log messages
 * @param fn     async function to wrap
 * @returns an async function with the same arguments as `fn`
 */
export const memoizeAsync = (fnname: string, fn: any) => {
  return async (...args: any) => {
    // Key format is kept unchanged so entries written by earlier versions stay valid.
    const key = `memoized:${fnname}:${args.map((x: any) => JSON.stringify(x)).join("-")}`
    const cached = localStorage.getItem(key)
    if (cached !== null) {
      try {
        return JSON.parse(cached)
      } catch (err) {
        // An unreadable entry (e.g. the text "undefined") used to make every
        // later call throw; drop it and recompute instead.
        console.warn(`discarding unreadable cache entry ${key}`, err)
        localStorage.removeItem(key)
      }
    }

    const value = await fn(...args)
    // JSON.stringify returns undefined for undefined/functions; storing that
    // would write the string "undefined", which JSON.parse cannot read back.
    const serialized = JSON.stringify(value)
    if (serialized !== undefined) {
      try {
        localStorage.setItem(key, serialized)
        console.log(`memoized ${fnname}(${args.map((x: any) => JSON.stringify(x)).join(", ")})`, value)
      } catch (err) {
        // setItem throws when the storage quota is exceeded; the computed value
        // is still good, so only the caching step is skipped.
        console.warn(`could not memoize ${fnname}`, err)
      }
    }
    return value
  }
}


/**
 * Return the query-string parameters of the current page as a plain object.
 * When a key appears more than once, the last value wins.
 */
export const getQueryParams = () => {
  const urlParams = new URLSearchParams(window.location.search)
  const params: {[key: string]: any} = {}
  for (const [key, value] of urlParams.entries()) {
    params[key] = value
  }
  return params
}
-------------------------------------------------------------------------------- /neuron-viewer/src/tokenHeatmap.tsx: -------------------------------------------------------------------------------- 1 | import React from "react" 2 | import { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types' 3 | 4 | 5 | type Props = { 6 | tokens: TokenAndActivation[], 7 | loading?: boolean, 8 | colors?: Color[], 9 | boundaries?: number[] 10 | } 11 | export default function TokenHeatmap({ tokens, loading, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) { 12 | //
13 | return ( 14 |
15 | {tokens.map(({ token, activation, normalized_activation }, i) => { 16 | const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation); 17 | return 27 | {token} 28 | 29 | })} 30 |
def load_az_json(url):
    """Fetch ``url`` and return its body parsed as JSON.

    The URL is supplied by the HTTP client of the ``/load_az`` route, so only
    ``http`` and ``https`` URLs are accepted. ``urllib.request.urlopen`` also
    opens ``file://``, ``ftp://`` and ``data:`` URLs, which would let any caller
    read files from the machine running this server.

    Args:
        url: absolute http(s) URL of a JSON document.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: if ``url`` is not a string or is not an http(s) URL (also
            raised, as ``json.JSONDecodeError``, when the body is not JSON).
    """
    # Function-scope import: the module itself only imports ``urllib.request``.
    from urllib.parse import urlsplit

    if not isinstance(url, str) or urlsplit(url).scheme not in ("http", "https"):
        raise ValueError(f"Unsupported URL (only http/https is allowed): {url!r}")
    # NOTE(review): any http(s) host can still be reached through this proxy;
    # consider restricting the host as well if the server is ever exposed.
    with urllib.request.urlopen(url) as f:
        return json.load(f)


def start(
    dev: bool = False,
    host_name: str = "0.0.0.0",
    port: int = 80,
):
    """Create the Flask app, register the ``/load_az`` route and serve it.

    Args:
        dev: passed to ``app.run`` as ``debug``.
        host_name: interface to bind to.
        port: TCP port to listen on.
    """
    app = Flask("interpretability chat")
    app.logger.setLevel(logging.INFO)
    CORS(app)

    @app.after_request
    def after_request(response):
        # Attach permissive CORS headers to every response.
        response.headers.add("Access-Control-Allow-Origin", "*")
        response.headers.add(
            "Access-Control-Allow-Headers", "Content-Type,Authorization"
        )
        response.headers.add(
            "Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS"
        )
        return response

    # A plain (sync) view: nothing here is awaited, and ``async def`` views
    # require the optional ``flask[async]`` extra to be installed.
    @app.route("/load_az", methods=["GET", "POST"])
    def load_az():
        args = request.get_json(silent=True)
        if not isinstance(args, dict) or "path" not in args:
            # Previously an unhandled KeyError/TypeError (HTTP 500).
            return {"error": 'expected a JSON body of the form {"path": "<url>"}'}, 400
        try:
            result = load_az_json(args["path"])
        except json.JSONDecodeError:
            # The remote document is not JSON: a genuine server-side failure.
            raise
        except ValueError as err:
            # Rejected URL: the client's mistake, not ours.
            return {"error": str(err)}, 400
        return result

    # Flask's own reloader stays disabled, as in the original call.
    app.run(debug=dev, host=host_name, port=port, use_reloader=False)


def main(dev: bool = True, host_name: str = "0.0.0.0", port: int = 8000):
    """Run the development server with local-development defaults."""
    start(dev=dev, host_name=host_name, port=port)


if __name__ == "__main__":
    main()
{children}
23 | ) 24 | 25 | return ( 26 |
27 |
28 |

29 | Neuron Viewer 30 |

31 | {activeNeuron && ( 32 |

33 | Neuron {activeNeuron.layer}:{activeNeuron.neuron} 34 |

35 | )} 36 |
37 | 38 |
41 |
    42 | {activeNeuron ? 43 | <> 44 | 45 | {React.createElement(Panes["Explanation"], { activeNeuron })} 46 | 47 | 48 | {React.createElement(Panes["DatasetList"], { activeNeuron })} 49 | 50 | 51 | {React.createElement(Panes["TopTokens"], { activeNeuron })} 52 | 53 | 54 | {React.createElement(Panes["SimilarNeurons"], { activeNeuron })} 55 | 56 | : 57 | 58 | } 59 | 60 |
61 |
62 |
@dataclass(frozen=True)
class Puzzle:
    """A puzzle is a ground truth explanation, a collection of sentences (stored as ActivationRecords) with activations
    according to that explanation, and a collection of false explanations"""

    # Key of the puzzle (read from the "name" entry of its JSON dictionary).
    name: str
    # The ground truth explanation.
    explanation: str
    # One record per sentence: its tokens and the activation of each token.
    activation_records: list[ActivationRecord]
    # Explanations that do not describe the activations.
    false_explanations: list[str]


def convert_puzzle_to_tokenized_sentences(puzzle: Puzzle) -> list[list[str]]:
    """Converts a puzzle to a list of tokenized sentences."""
    return [record.tokens for record in puzzle.activation_records]


def convert_puzzle_dict_to_puzzle(puzzle_dict: dict) -> Puzzle:
    """Converts a json dictionary representation of a puzzle to the Puzzle class.

    Reads the keys "name", "explanation", "sentences" and "false_explanations".
    Each sentence is a list whose entries are either a token string (activation
    0.0) or a ``[token, activation]`` list.

    Raises:
        KeyError: if one of the keys above is missing.
        AssertionError: if a token is not a string.
    """
    puzzle_activation_records = []
    for sentence in puzzle_dict["sentences"]:
        # Token-activation pairs are listed as either a string or a list of a string and a float. If it is a list, the float is the activation.
        # If it is only a string, the activation is assumed to be 0. This is useful for readability and reducing redundancy in the data.
        tokens = [t[0] if isinstance(t, list) else t for t in sentence]
        assert all(isinstance(t, str) for t in tokens), "All tokens must be strings"
        # Every value is produced by float(...) or the literal 0.0, so the old
        # "all activations are floats" assertion could never fail and is gone.
        activations = [float(t[1]) if isinstance(t, list) else 0.0 for t in sentence]

        puzzle_activation_records.append(ActivationRecord(tokens=tokens, activations=activations))

    return Puzzle(
        name=puzzle_dict["name"],
        explanation=puzzle_dict["explanation"],
        activation_records=puzzle_activation_records,
        false_explanations=puzzle_dict["false_explanations"],
    )


# All puzzles from the puzzles.json file next to this module, keyed by name.
PUZZLES_BY_NAME: dict[str, Puzzle] = dict()
script_dir = os.path.dirname(os.path.abspath(__file__))
# Explicit UTF-8: without it the tokens are decoded with the platform's locale
# encoding, which is not UTF-8 everywhere (e.g. on Windows).
with open(os.path.join(script_dir, "puzzles.json"), "r", encoding="utf-8") as f:
    puzzle_dicts = json.load(f)
    for name in puzzle_dicts:
        PUZZLES_BY_NAME[name] = convert_puzzle_dict_to_puzzle(puzzle_dicts[name])
neuron_index: Union[str, int], 29 | dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based", 30 | ) -> WeightBasedSummaryOfNeuron: 31 | """Load the TokenLookupTableSummaryOfNeuron for the specified neuron.""" 32 | url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"]) 33 | url = standardize_azure_url(url) 34 | with urllib.request.urlopen(url) as f: 35 | return loads(f.read(), backwards_compatible=False) 36 | 37 | 38 | @register_dataclass 39 | @dataclass 40 | class TokenLookupTableSummaryOfNeuron(FastDataclass): 41 | """List of tokens and the average activations of a given neuron in response to each 42 | respective token. These are selected from among the tokens in the vocabulary with the 43 | highest average activations across an internet text dataset, with the highest activations 44 | first.""" 45 | 46 | tokens: List[str] 47 | average_activations: List[float] 48 | 49 | 50 | def load_token_lookup_table_connections_of_neuron( 51 | layer_index: Union[str, int], 52 | neuron_index: Union[str, int], 53 | dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based", 54 | ) -> TokenLookupTableSummaryOfNeuron: 55 | """Load the TokenLookupTableSummaryOfNeuron for the specified neuron.""" 56 | url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"]) 57 | url = standardize_azure_url(url) 58 | with urllib.request.urlopen(url) as f: 59 | return loads(f.read(), backwards_compatible=False) 60 | -------------------------------------------------------------------------------- /neuron-viewer/src/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 11 | 15 | 24 | 25 | 26 | Neuron viewer 27 | 28 | 29 | 30 | 31 | 42 | 43 | 44 | 45 |
46 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /neuron-explainer/demos/explain_puzzles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import os\n", 20 | "\n", 21 | "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n", 22 | "\n", 23 | "from neuron_explainer.activations.activation_records import calculate_max_activation\n", 24 | "from neuron_explainer.explanations.explainer import TokenActivationPairExplainer\n", 25 | "from neuron_explainer.explanations.prompt_builder import PromptFormat\n", 26 | "from neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME\n", 27 | "\n", 28 | "\n", 29 | "EXPLAINER_MODEL_NAME = \"gpt-4\"\n", 30 | "\n", 31 | "explainer = TokenActivationPairExplainer(\n", 32 | " model_name=EXPLAINER_MODEL_NAME,\n", 33 | " prompt_format=PromptFormat.HARMONY_V4,\n", 34 | " max_concurrent=1,\n", 35 | ")\n", 36 | "\n", 37 | "for puzzle_name, puzzle in PUZZLES_BY_NAME.items():\n", 38 | " print(f\"{puzzle_name=}\")\n", 39 | " puzzle_answer = puzzle.explanation\n", 40 | " # Generate an explanation for the puzzle.\n", 41 | " explanations = await explainer.generate_explanations(\n", 42 | " all_activation_records=puzzle.activation_records,\n", 43 | " max_activation=calculate_max_activation(puzzle.activation_records),\n", 44 | " num_samples=1,\n", 45 | " )\n", 46 | " assert len(explanations) == 1\n", 47 | " model_generated_explanation = explanations[0]\n", 48 | " print(f\"{model_generated_explanation=}\")\n", 49 | " print(f\"{puzzle_answer=}\\n\")\n", 50 | "\n" 51 | ] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | 
"display_name": "openai", 57 | "language": "python", 58 | "name": "openai" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": { 62 | "name": "ipython", 63 | "version": 3 64 | }, 65 | "file_extension": ".py", 66 | "mimetype": "text/x-python", 67 | "name": "python", 68 | "nbconvert_exporter": "python", 69 | "pygments_lexer": "ipython3", 70 | "version": "3.9.9" 71 | }, 72 | "orig_nbformat": 4 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /neuron-viewer/src/types.ts: -------------------------------------------------------------------------------- 1 | import { scaleLinear } from "d3-scale" 2 | import { min, max, flatten } from "lodash" 3 | 4 | export type Neuron = { 5 | layer: number; 6 | neuron: number; 7 | } 8 | 9 | export type TokenAndActivation = { 10 | token: string, 11 | activation: number 12 | normalized_activation?: number 13 | } 14 | 15 | export type TokenSequence = TokenAndActivation[] 16 | 17 | export const normalizeTokenActs = (...sequences: TokenSequence[][]) => { 18 | // console.log('sequences', sequences) 19 | let flattened: TokenAndActivation[] = flatten(flatten(sequences)) 20 | // Replace all activations less than 0 in data.tokens with 0. This matches the format in the 21 | // top + random activation records displayed in the main grid. 22 | flattened = flattened.map(({token, activation}) => { 23 | return { 24 | token, 25 | activation: Math.max(activation, 0) 26 | } 27 | }) 28 | const maxActivation = max(flattened.map((ta) => ta.activation)) || 0; 29 | const neuronScale = scaleLinear() 30 | // Even though we're only displaying positive activations, we still need to scale in a way that 31 | // accounts for the existence of negative activations, since our color scale includes them. 
32 | .domain([0, maxActivation]) 33 | .range([0, 1]) 34 | 35 | return sequences.map((seq) => seq.map((tas) => tas.map(({ token, activation }) => ({ 36 | token, 37 | activation, 38 | normalized_activation: neuronScale(activation), 39 | })))) 40 | } 41 | 42 | export type Color = {r: number, g: number, b: number}; 43 | export function interpolateColor(color_l: Color, color_r: Color, value: number) { 44 | const color = { 45 | r: Math.round(color_l.r + (color_r.r - color_l.r) * value), 46 | g: Math.round(color_l.g + (color_r.g - color_l.g) * value), 47 | b: Math.round(color_l.b + (color_r.b - color_l.b) * value), 48 | } 49 | return color 50 | } 51 | 52 | export function getInterpolatedColor(colors: Color[], boundaries: number[], value: number) { 53 | const index = boundaries.findIndex((boundary) => boundary >= value) 54 | const colorIndex = Math.max(0, index - 1) 55 | const color_left = colors[colorIndex] 56 | const color_right = colors[colorIndex + 1] 57 | const boundary_left = boundaries[colorIndex] 58 | const boundary_right = boundaries[colorIndex + 1] 59 | const ratio = (value - boundary_left) / (boundary_right - boundary_left) 60 | const color = interpolateColor(color_left, color_right, ratio) 61 | return color 62 | } 63 | 64 | export const DEFAULT_COLORS = [ 65 | // { r: 255, g: 0, b: 105 }, 66 | { r: 255, g: 255, b: 255 }, 67 | { r: 0, g: 255, b: 0 }, 68 | ] 69 | export const DEFAULT_BOUNDARIES = [ 70 | // 0, 0.5, 1 71 | 0, 1 72 | ] 73 | 74 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | import pytest 4 | 5 | from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass 6 | 7 | 8 | # Inheritance is a bit tricky with our setup. 
dataclass_name must be set for instances of these 9 | # classes to serialize and deserialize correctly, but if it's given a default value, then subclasses 10 | # can't have any fields that don't have default values, because of how constructors are generated 11 | # for dataclasses (fields with no default value can't follow those with default values). To work 12 | # around this, we set dataclass_name in __post_init__ on the base class, which is called after the 13 | # constructor. The implementation does the right thing for both the base class and the subclass. 14 | @register_dataclass 15 | @dataclass 16 | class DataclassC(FastDataclass): 17 | ints: list[int] 18 | 19 | 20 | @register_dataclass 21 | @dataclass 22 | class DataclassC_ext(DataclassC): 23 | s: str 24 | 25 | 26 | @register_dataclass 27 | @dataclass 28 | class DataclassB(FastDataclass): 29 | str_to_c: dict[str, DataclassC] 30 | cs: list[DataclassC] 31 | 32 | 33 | @register_dataclass 34 | @dataclass 35 | class DataclassA(FastDataclass): 36 | floats: list[float] 37 | strings: list[str] 38 | bs: list[DataclassB] 39 | 40 | 41 | @register_dataclass 42 | @dataclass 43 | class DataclassD(FastDataclass): 44 | s1: str 45 | s2: str = "default" 46 | 47 | 48 | def test_dataclasses() -> None: 49 | a = DataclassA( 50 | floats=[1.0, 2.0], 51 | strings=["a", "b"], 52 | bs=[ 53 | DataclassB( 54 | str_to_c={"a": DataclassC(ints=[1, 2]), "b": DataclassC(ints=[3, 4])}, 55 | cs=[DataclassC(ints=[5, 6]), DataclassC_ext(ints=[7, 8], s="s")], 56 | ), 57 | DataclassB( 58 | str_to_c={"c": DataclassC_ext(ints=[9, 10], s="t"), "d": DataclassC(ints=[11, 12])}, 59 | cs=[DataclassC(ints=[13, 14]), DataclassC(ints=[15, 16])], 60 | ), 61 | ], 62 | ) 63 | assert loads(dumps(a)) == a 64 | 65 | 66 | def test_c_and_c_ext() -> None: 67 | c_ext = DataclassC_ext(ints=[3, 4], s="s") 68 | assert loads(dumps(c_ext)) == c_ext 69 | 70 | c = DataclassC(ints=[1, 2]) 71 | assert loads(dumps(c)) == c 72 | 73 | 74 | def test_bad_serialized_data() -> None: 
# Utilities for dataclasses that are very fast to serialize and deserialize, with limited data
# validation. Fields must not be tuples, since they get serialized and then deserialized as lists.
#
# The unit tests for this library show how to use it.

# Registries filled in by `register_dataclass`:
#   dataclasses_by_name:       class name -> class
#   dataclasses_by_fieldnames: frozenset of field names (minus "dataclass_name") -> class
dataclasses_by_name = {}
dataclasses_by_fieldnames = {}


@dataclass
class FastDataclass:
    """Base class for dataclasses that round-trip through `dumps` and `loads`.

    `dataclass_name` is excluded from the generated constructor and is filled in with the
    concrete class name after construction, so subclasses may declare required fields.
    """

    dataclass_name: str = field(init=False)

    def __post_init__(self) -> None:
        self.dataclass_name = self.__class__.__name__


def register_dataclass(cls):  # type: ignore
    """Class decorator that makes `cls` resolvable by `loads`.

    Registers `cls` under its class name and under the frozenset of its field names (excluding
    "dataclass_name"), then returns `cls` unchanged. Registering another class with the same
    name or the same set of field names replaces the earlier entry.

    Raises:
        AssertionError: if `cls` is not a dataclass.
    """
    # Raised explicitly instead of via `assert` so the check is not stripped by `python -O`;
    # AssertionError is kept so existing callers see the same exception type.
    if not is_dataclass(cls):
        raise AssertionError("Only dataclasses can be registered.")
    dataclasses_by_name[cls.__name__] = cls
    name_set = frozenset(f.name for f in fields(cls) if f.name != "dataclass_name")
    dataclasses_by_fieldnames[name_set] = cls
    return cls


def dumps(obj: Any) -> bytes:
    """Serialize `obj` to JSON bytes using orjson with numpy serialization enabled."""
    return orjson.dumps(obj, option=orjson.OPT_SERIALIZE_NUMPY)


def _object_hook(d: Any, backwards_compatible: bool = True) -> Any:
    """Convert a decoded JSON value into registered dataclass instances where possible.

    Lists are converted element by element and non-dict values are returned unchanged. For a
    dict, the target class is chosen as follows:

    * If the dict has a "dataclass_name" key, the class registered under that name is used. An
      unregistered name is an error unless `backwards_compatible` is true, in which case the
      dict (minus "dataclass_name") is returned.
    * Otherwise, only when `backwards_compatible` is true, the class whose field names equal
      the dict's keys is used; failing that, the first registered class whose field names are
      a superset of the keys.

    Raises:
        AssertionError: if "dataclass_name" is unknown and `backwards_compatible` is false.
        TypeError: from the dataclass constructor when the keys do not match its fields.
    """
    if isinstance(d, list):
        return [_object_hook(x, backwards_compatible=backwards_compatible) for x in d]
    if not isinstance(d, dict):
        return d

    cls = None
    if "dataclass_name" in d:
        name = d["dataclass_name"]
        cls = dataclasses_by_name.get(name)
        # Explicit raise rather than `assert`: under `python -O` an assert would vanish and an
        # unknown dataclass would silently come back as a plain dict.
        if cls is None and not backwards_compatible:
            raise AssertionError(
                f"Dataclass {name} not found, set backwards_compatible=True if you "
                f"are okay with that."
            )
    elif backwards_compatible:
        # Load objects created without dataclass_name set: try our best to find a dataclass.
        d_fields = frozenset(d)
        cls = dataclasses_by_fieldnames.get(d_fields)
        if cls is None and d_fields:
            # Check if the fields are a subset of a dataclass (if the dataclass had extra fields
            # added since the data was created). Note that this will fail if fields were removed
            # from the dataclass. The first registered superset wins.
            cls = next(
                (
                    possible_cls
                    for key, possible_cls in dataclasses_by_fieldnames.items()
                    if d_fields.issubset(key)
                ),
                None,
            )
            if cls is None:
                print(f"Could not find dataclass for {d_fields} {cls}")

    new_d = {
        k: _object_hook(v, backwards_compatible=backwards_compatible)
        for k, v in d.items()
        if k != "dataclass_name"
    }
    return new_d if cls is None else cls(**new_d)


def loads(s: Union[str, bytes], backwards_compatible: bool = True) -> Any:
    """Parse JSON text or bytes, rebuilding registered dataclasses via `_object_hook`.

    With `backwards_compatible` true (the default), payloads lacking "dataclass_name" are
    matched to a registered dataclass by field names, and unknown names are tolerated.
    """
    hook = partial(_object_hook, backwards_compatible=backwards_compatible)
    return json.loads(s, object_hook=hook)
"SIMULATOR_MODEL_NAME = \"text-davinci-003\"\n", 33 | "\n", 34 | "\n", 35 | "# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n", 36 | "# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n", 37 | "\n", 38 | "# Load a neuron record.\n", 39 | "neuron_record = load_neuron(9, 6236)\n", 40 | "\n", 41 | "# Grab the activation records we'll need.\n", 42 | "slice_params = ActivationRecordSliceParams(n_examples_per_split=5)\n", 43 | "train_activation_records = neuron_record.train_activation_records(\n", 44 | " activation_record_slice_params=slice_params\n", 45 | ")\n", 46 | "valid_activation_records = neuron_record.valid_activation_records(\n", 47 | " activation_record_slice_params=slice_params\n", 48 | ")\n", 49 | "\n", 50 | "# Generate an explanation for the neuron.\n", 51 | "explainer = TokenActivationPairExplainer(\n", 52 | " model_name=EXPLAINER_MODEL_NAME,\n", 53 | " prompt_format=PromptFormat.HARMONY_V4,\n", 54 | " max_concurrent=1,\n", 55 | ")\n", 56 | "explanations = await explainer.generate_explanations(\n", 57 | " all_activation_records=train_activation_records,\n", 58 | " max_activation=calculate_max_activation(train_activation_records),\n", 59 | " num_samples=1,\n", 60 | ")\n", 61 | "assert len(explanations) == 1\n", 62 | "explanation = explanations[0]\n", 63 | "print(f\"{explanation=}\")\n", 64 | "\n", 65 | "# Simulate and score the explanation.\n", 66 | "simulator = UncalibratedNeuronSimulator(\n", 67 | " ExplanationNeuronSimulator(\n", 68 | " SIMULATOR_MODEL_NAME,\n", 69 | " explanation,\n", 70 | " max_concurrent=1,\n", 71 | " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n", 72 | " )\n", 73 | ")\n", 74 | "scored_simulation = await simulate_and_score(simulator, valid_activation_records)\n", 75 | "print(f\"score={scored_simulation.get_preferred_score():.2f}\")\n" 76 | ] 77 | } 78 | ], 79 | "metadata": { 80 | "kernelspec": { 81 | "display_name": "openai", 82 | "language": "python", 83 | "name": 
"python3" 84 | }, 85 | "language_info": { 86 | "codemirror_mode": { 87 | "name": "ipython", 88 | "version": 3 89 | }, 90 | "file_extension": ".py", 91 | "mimetype": "text/x-python", 92 | "name": "python", 93 | "nbconvert_exporter": "python", 94 | "pygments_lexer": "ipython3", 95 | "version": "3.9.9" 96 | }, 97 | "orig_nbformat": 4 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /neuron-viewer/src/panes/datasetList.jsx: -------------------------------------------------------------------------------- 1 | import HeatmapGrid from "../heatmapGrid" 2 | import React, { useEffect, useState } from "react" 3 | import { normalizeTokenActs } from "../types" 4 | 5 | import {get_neuron_record} from "../interpAPI" 6 | 7 | function zip_sequences(sequences) { 8 | return sequences.map(({ activations, tokens }) => { 9 | return tokens.map((token, idx) => ({ 10 | token, 11 | activation: activations[idx], 12 | })) 13 | }) 14 | } 15 | 16 | export default ({ activeNeuron }) => { 17 | const [data, setData] = useState(null) 18 | const [showingMore, setShowingMore] = useState({}) 19 | const [isLoading, setIsLoading] = useState(true) 20 | 21 | useEffect(() => { 22 | async function fetchData() { 23 | if (data) { 24 | return 25 | } 26 | const result = await get_neuron_record(activeNeuron) 27 | console.log(result) 28 | const all_sequences = [] 29 | all_sequences.push({ 30 | // label: '[0.999, 1] (Top quantile, sorted. 
50 of 50000)', 31 | label: 'Top', 32 | sequences: zip_sequences(result.most_positive_activation_records), 33 | default_show: 4, 34 | }) 35 | all_sequences.push({ 36 | label: 'Quantile range [0.99, 0.999] sample', 37 | sequences: zip_sequences(result.random_sample_by_quantile[3]), 38 | default_show: 1, 39 | }) 40 | all_sequences.push({ 41 | label: 'Quantile range [0.9, 0.99] sample', 42 | sequences: zip_sequences(result.random_sample_by_quantile[2]), 43 | default_show: 1, 44 | }) 45 | all_sequences.push({ 46 | label: 'Quantile range [0.5, 0.9] sample', 47 | sequences: zip_sequences(result.random_sample_by_quantile[1]), 48 | default_show: 1, 49 | }) 50 | all_sequences.push({ 51 | label: 'Quantile range [0, 0.5] sample', 52 | sequences: zip_sequences(result.random_sample_by_quantile[0]), 53 | default_show: 1, 54 | }) 55 | all_sequences.push({ 56 | // label: '[0, 1] (Random)', 57 | label: 'Random sample', 58 | sequences: zip_sequences(result.random_sample), 59 | default_show: 2, 60 | }) 61 | // for reference 62 | // intervals = [(0, 1), (0, 0.5), (0.5, 0.9), (0.9, 0.99), (0.99, 0.999), (0.999, 1)] 63 | // saved_activations_by_interval = [neuron_record.random_sample] + neuron_record.random_sample_by_decile[:-1] + [neuron_record.top_activations] 64 | setData(all_sequences) 65 | setIsLoading(false) 66 | } 67 | fetchData() 68 | }, [activeNeuron]) 69 | 70 | if (isLoading) { 71 | return ( 72 |
73 |
74 |
loading top dataset examples
75 |
76 | ) 77 | } 78 | 79 | // const activations = data.top_activations; 80 | const all_normalized_sequences = normalizeTokenActs(...data.map(({sequences}) => sequences)) 81 | 82 | return ( 83 |
84 |

Activations

85 | { 86 | data.map(({label, default_show}, idx) => { 87 | const n_show = showingMore[label] ? all_normalized_sequences[idx].length : default_show; 88 | return ( 89 | 90 |

91 | {label} 92 | 96 |

97 | 98 |
99 | ) 100 | }) 101 | } 102 |
103 | ) 104 | } 105 | -------------------------------------------------------------------------------- /neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import os\n", 20 | "\n", 21 | "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n", 22 | "\n", 23 | "from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\n", 24 | "from neuron_explainer.activations.token_connections import load_token_lookup_table_connections_of_neuron\n", 25 | "from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\n", 26 | "from neuron_explainer.explanations.explainer import TokenSpaceRepresentationExplainer\n", 27 | "from neuron_explainer.explanations.prompt_builder import PromptFormat\n", 28 | "from neuron_explainer.explanations.scoring import simulate_and_score\n", 29 | "from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\n", 30 | "\n", 31 | "EXPLAINER_MODEL_NAME = \"gpt-4\"\n", 32 | "SIMULATOR_MODEL_NAME = \"text-davinci-003\"\n", 33 | "\n", 34 | "\n", 35 | "# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n", 36 | "# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n", 37 | "\n", 38 | "layer_index = 9\n", 39 | "neuron_index = 6236\n", 40 | "\n", 41 | "# Load a token lookup table.\n", 42 | "token_lookup_table = load_token_lookup_table_connections_of_neuron(layer_index, neuron_index)\n", 43 | "\n", 44 | "# Load a neuron record.\n", 45 | "neuron_record = load_neuron(layer_index, neuron_index)\n", 46 | 
"\n", 47 | "# Grab the activation records we'll need.\n", 48 | "slice_params = ActivationRecordSliceParams(n_examples_per_split=5)\n", 49 | "valid_activation_records = neuron_record.valid_activation_records(\n", 50 | " activation_record_slice_params=slice_params\n", 51 | ")\n", 52 | "\n", 53 | "# Generate an explanation for the neuron.\n", 54 | "explainer = TokenSpaceRepresentationExplainer(\n", 55 | " model_name=EXPLAINER_MODEL_NAME,\n", 56 | " prompt_format=PromptFormat.HARMONY_V4,\n", 57 | " max_concurrent=1,\n", 58 | ")\n", 59 | "explanations = await explainer.generate_explanations(\n", 60 | " tokens=token_lookup_table.tokens,\n", 61 | " num_samples=1,\n", 62 | ")\n", 63 | "assert len(explanations) == 1\n", 64 | "explanation = explanations[0]\n", 65 | "print(f\"{explanation=}\")\n", 66 | "\n", 67 | "# Simulate and score the explanation.\n", 68 | "simulator = UncalibratedNeuronSimulator(\n", 69 | " ExplanationNeuronSimulator(\n", 70 | " SIMULATOR_MODEL_NAME,\n", 71 | " explanation,\n", 72 | " max_concurrent=1,\n", 73 | " prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n", 74 | " )\n", 75 | ")\n", 76 | "scored_simulation = await simulate_and_score(simulator, valid_activation_records)\n", 77 | "print(f\"score={scored_simulation.get_preferred_score():.2f}\")\n" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.9.8" 98 | }, 99 | "vscode": { 100 | "interpreter": { 101 | "hash": "fd71fb58b1ad02dde67c8ac595a52586dd87d3465221a699fc288aa2c48d5565" 102 | } 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 2 107 | } 108 | 
-------------------------------------------------------------------------------- /neuron-viewer/src/panes/similarNeurons.jsx: -------------------------------------------------------------------------------- 1 | import React, { useEffect, useState } from "react" 2 | import _ from "lodash" 3 | import { Link } from "react-router-dom" 4 | 5 | import { get_explanations, get_top_neuron_connections } from "../interpAPI" 6 | 7 | function NeuronInfo({ neuron, strength }) { 8 | const [info, setInfo] = useState(null) 9 | 10 | useEffect(() => { 11 | async function fetchInfo() { 12 | const result = (await get_explanations({ 13 | layer: neuron.layer, 14 | neuron: neuron.neuron, 15 | })) 16 | setInfo(result) 17 | } 18 | 19 | if (!info) { 20 | fetchInfo() 21 | } 22 | }, []) 23 | 24 | if (!info) { 25 | return ( 26 |
27 |

28 | Loading neuron {neuron.layer}:{neuron.neuron}... 29 |

30 |
31 |
32 | ) 33 | } 34 | 35 | return ( 36 |
37 |
38 |

40 | 41 | Neuron {neuron.layer}:{neuron.neuron} 42 | 43 |

44 |
45 | Connection strength: {strength.toFixed(2)} 46 |
47 |
48 | {info.scored_explanations.map((explanation, i) => ( 49 | 50 |

51 | {explanation.explanation} 52 |

53 |

54 | score: {explanation.scored_simulation.ev_correlation_score.toFixed(2)} 55 |

56 |
57 | ))} 58 |
59 |
60 |
61 | ) 62 | } 63 | 64 | export default function SimilarNeurons({ activeNeuron: neuron }) { 65 | const [similarNeurons, setSimilarNeurons] = useState([]) 66 | const [isLoading, setIsLoading] = useState(true) 67 | 68 | useEffect(() => { 69 | async function fetchSimilarNeurons() { 70 | const result = await get_top_neuron_connections(neuron) 71 | setSimilarNeurons(result) 72 | setIsLoading(false) 73 | } 74 | 75 | fetchSimilarNeurons() 76 | }, [neuron]) 77 | 78 | if (isLoading) { 79 | return ( 80 |
81 |
82 |
83 | ) 84 | } 85 | 86 | const n_show = 3; 87 | return ( 88 |
89 |

Related neurons

90 |
91 |
92 | { 93 | similarNeurons.input ? 94 |
95 |
Upstream
96 |
97 | {similarNeurons.input.slice(0, n_show).map(([layer, neuron, strength]) => ( 98 | 99 | ))} 100 |
101 |
: null 102 | } 103 | { 104 | similarNeurons.output ? 105 |
106 |
Downstream
107 |
108 | {similarNeurons.output.slice(0, n_show).map(([layer, neuron, strength]) => ( 109 | 110 | ))} 111 |
112 |
: null 113 | } 114 |
115 |
116 |
117 | ) 118 | } 119 | -------------------------------------------------------------------------------- /neuron-viewer/src/simulationHeatmap.tsx: -------------------------------------------------------------------------------- 1 | import React, { useState } from 'react'; 2 | 3 | import { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types' 4 | 5 | type Props = { 6 | sequences: TokenAndActivation[][], 7 | simulated_sequences: TokenAndActivation[][], 8 | overlay_activations: boolean, 9 | colors?: Color[], 10 | boundaries?: number[], 11 | } 12 | export default function SimulationSequences({ sequences, simulated_sequences, overlay_activations, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) { 13 | return <> 14 | { 15 | sequences.map((tokens, i) => { 16 | let simulated_tokens = simulated_sequences[i]; 17 | if (overlay_activations) { 18 | return ( 19 |
20 | {tokens.map(({ token, activation, normalized_activation }, j) => { 21 | const { token: simulated_token, activation: simulated_activation, normalized_activation: simulated_normalized_activation } = simulated_tokens[j]; 22 | if (simulated_token !== token) { 23 | throw new Error('simulated tokens not matching') 24 | } 25 | const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation); 26 | const simcolor = getInterpolatedColor(colors, boundaries, simulated_normalized_activation || simulated_activation); 27 | 28 | return
29 |
30 | {token} 37 | {token} 44 |
45 |
46 | })} 47 |
48 | ) 49 | } else { 50 | return ( 51 |
52 |
53 | Real activations:
59 | {tokens.map(({ token, activation, normalized_activation }, j) => { 60 | const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation); 61 | return {token} 68 | })} 69 |
70 |
71 |
72 | Simulated activations:
78 | {simulated_tokens.map(({ token, activation, normalized_activation }, j) => { 79 | const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation); 80 | return {token} 87 | })} 88 |
89 |
90 | ) 91 | } 92 | }) 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /neuron-viewer/src/panes/topTokens.jsx: -------------------------------------------------------------------------------- 1 | import React, { useState, useEffect } from "react" 2 | import { get_top_tokens } from "../interpAPI" 3 | 4 | 5 | const TokenDisplay = ({ activeNeuron }) => { 6 | const [isLoading, setIsLoading] = useState(true) 7 | const [data, setData] = useState(null) 8 | 9 | const loadTokens = async () => { 10 | setIsLoading(true) 11 | const weightStrengths = await get_top_tokens(activeNeuron, 'weight') 12 | const activationStrengths = await get_top_tokens(activeNeuron, 'activation') 13 | 14 | const data = { 15 | activeNeuron, 16 | weightStrengths, 17 | activationStrengths, 18 | } 19 | 20 | setData(data) 21 | setIsLoading(false) 22 | } 23 | 24 | useEffect(() => { 25 | if (!data) { 26 | loadTokens() 27 | } 28 | }, []) 29 | 30 | 31 | return ( 32 |
33 |

Related tokens

34 | {isLoading ? ( 35 |
36 |
loading tokens
37 |
38 | ) : ( 39 | <> 40 |

Mean-activation-based

41 |
42 | {data.activationStrengths.tokens.map((token, idx) => { 43 | return ( 44 | data.activationStrengths.average_activations[idx] === null ? null : 45 | 50 | {token} 51 | 52 | ) 53 | })} 54 |
55 |

Weight-based

56 |
57 |

Input tokens:

58 | {data.weightStrengths.input_positive.tokens.slice(0, 20).map((token, idx) => { 59 | return ( 60 | data.weightStrengths.input_positive.strengths[idx] === null ? null : 61 | 66 | {token} 67 | 68 | ) 69 | })} 70 |
71 | { 72 |
73 |

Input tokens negative:

74 | {data.weightStrengths.input_negative.tokens.slice(0, 20).map((token, idx) => { 75 | return ( 76 | data.weightStrengths.input_negative.strengths[idx] === null ? null : 77 | 82 | {token} 83 | 84 | ) 85 | })} 86 |
87 | } 88 |
89 |

Output tokens:

90 | {data.weightStrengths.output_positive.tokens.slice(0, 20).map((token, idx) => { 91 | return ( 92 | data.weightStrengths.output_positive.strengths[idx] === null ? null : 93 | 98 | {token} 99 | 100 | ) 101 | })} 102 |
103 | { 104 |
105 |

Output tokens negative:

106 | {data.weightStrengths.output_negative.tokens.slice(0, 20).map((token, idx) => { 107 | return ( 108 | 113 | {token} 114 | 115 | ) 116 | })} 117 |
118 | } 119 | 120 | )} 121 |
122 | ) 123 | } 124 | export default TokenDisplay 125 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/explanations/prompt_builder.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from enum import Enum 4 | from typing import TypedDict, Union 5 | 6 | import tiktoken 7 | 8 | HarmonyMessage = TypedDict( 9 | "HarmonyMessage", 10 | { 11 | "role": str, 12 | "content": str, 13 | }, 14 | ) 15 | 16 | 17 | class PromptFormat(str, Enum): 18 | """ 19 | Different ways of formatting the components of a prompt into the format accepted by the relevant 20 | API server endpoint. 21 | """ 22 | 23 | NONE = "none" 24 | """Suitable for use with models that don't use special tokens for instructions.""" 25 | INSTRUCTION_FOLLOWING = "instruction_following" 26 | """Suitable for IF models that use <|endofprompt|>.""" 27 | HARMONY_V4 = "harmony_v4" 28 | """ 29 | Suitable for Harmony models that use a structured turn-taking role+content format. Generates a 30 | list of HarmonyMessage dicts that can be sent to the /chat/completions endpoint. 
31 | """ 32 | 33 | @classmethod 34 | def from_string(cls, s: str) -> PromptFormat: 35 | for prompt_format in cls: 36 | if prompt_format.value == s: 37 | return prompt_format 38 | raise ValueError(f"{s} is not a valid PromptFormat") 39 | 40 | 41 | class Role(str, Enum): 42 | """See https://platform.openai.com/docs/guides/chat""" 43 | 44 | SYSTEM = "system" 45 | USER = "user" 46 | ASSISTANT = "assistant" 47 | 48 | 49 | class PromptBuilder: 50 | """Class for accumulating components of a prompt and then formatting them into an output.""" 51 | 52 | def __init__(self) -> None: 53 | self._messages: list[HarmonyMessage] = [] 54 | 55 | def add_message(self, role: Role, message: str) -> None: 56 | self._messages.append(HarmonyMessage(role=role, content=message)) 57 | 58 | def prompt_length_in_tokens(self, prompt_format: PromptFormat) -> int: 59 | # TODO(sbills): Make the model/encoding configurable. This implementation assumes GPT-4. 60 | encoding = tiktoken.get_encoding("cl100k_base") 61 | if prompt_format == PromptFormat.HARMONY_V4: 62 | # Approximately-correct implementation adapted from this documentation: 63 | # https://platform.openai.com/docs/guides/chat/introduction 64 | num_tokens = 0 65 | for message in self._messages: 66 | num_tokens += ( 67 | 4 # every message follows <|im_start|>{role/name}\n{content}<|im_end|>\n 68 | ) 69 | num_tokens += len(encoding.encode(message["content"], allowed_special="all")) 70 | num_tokens += 2 # every reply is primed with <|im_start|>assistant 71 | return num_tokens 72 | else: 73 | prompt_str = self.build(prompt_format) 74 | assert isinstance(prompt_str, str) 75 | return len(encoding.encode(prompt_str, allowed_special="all")) 76 | 77 | def build( 78 | self, prompt_format: PromptFormat, *, allow_extra_system_messages: bool = False 79 | ) -> Union[str, list[HarmonyMessage]]: 80 | """ 81 | Validates the messages added so far (reasonable alternation of assistant vs. user, etc.) 
82 | and returns either a regular string (maybe with <|endofprompt|> tokens) or a list of 83 | HarmonyMessages suitable for use with the /chat/completions endpoint. 84 | 85 | The `allow_extra_system_messages` parameter allows the caller to specify that the prompt 86 | should be allowed to contain system messages after the very first one. 87 | """ 88 | # Create a deep copy of the messages so we can modify it and so that the caller can't 89 | # modify the internal state of this object. 90 | messages = [message.copy() for message in self._messages] 91 | 92 | expected_next_role = Role.SYSTEM 93 | for message in messages: 94 | role = message["role"] 95 | assert role == expected_next_role or ( 96 | allow_extra_system_messages and role == Role.SYSTEM 97 | ), f"Expected message from {expected_next_role} but got message from {role}" 98 | if role == Role.SYSTEM: 99 | expected_next_role = Role.USER 100 | elif role == Role.USER: 101 | expected_next_role = Role.ASSISTANT 102 | elif role == Role.ASSISTANT: 103 | expected_next_role = Role.USER 104 | 105 | if prompt_format == PromptFormat.INSTRUCTION_FOLLOWING: 106 | last_user_message = None 107 | for message in messages: 108 | if message["role"] == Role.USER: 109 | last_user_message = message 110 | assert last_user_message is not None 111 | last_user_message["content"] += "<|endofprompt|>" 112 | 113 | if prompt_format == PromptFormat.HARMONY_V4: 114 | return messages 115 | elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]: 116 | return "".join(message["content"] for message in messages) 117 | else: 118 | raise ValueError(f"Unknown prompt format: {prompt_format}") 119 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/activations/activation_records.py: -------------------------------------------------------------------------------- 1 | """Utilities for formatting activation records into prompts.""" 2 | 3 | import math 4 | from typing 
UNKNOWN_ACTIVATION_STRING = "unknown"


def relu(x: float) -> float:
    """Return `x` if it is positive, otherwise 0.0."""
    return max(0.0, x)


def calculate_max_activation(activation_records: Sequence[ActivationRecord]) -> float:
    """
    Return the maximum activation value of the neuron across all the activation records.

    Returns 0.0 when there are no activation values at all (no records, or only empty records)
    instead of raising; `normalize_activations` already treats a non-positive maximum as
    "everything is zero".
    """
    # Relu is used to assume any values less than 0 are indicating the neuron is in the resting
    # state. This is a simplifying assumption that works with relu/gelu.
    return max(
        (relu(x) for record in activation_records for x in record.activations),
        default=0.0,
    )


def normalize_activations(activation_record: list[float], max_activation: float) -> list[int]:
    """Convert raw neuron activations to integers on the range [0, 10]."""
    if max_activation <= 0:
        return [0 for _ in activation_record]
    # Relu is used to assume any values less than 0 are indicating the neuron is in the resting
    # state. This is a simplifying assumption that works with relu/gelu.
    return [min(10, math.floor(10 * relu(x) / max_activation)) for x in activation_record]


def _format_activation_record(
    activation_record: ActivationRecord,
    max_activation: float,
    omit_zeros: bool,
    hide_activations: bool = False,
    start_index: int = 0,
) -> str:
    """
    Format neuron activations into a string, suitable for use in prompts.

    Each token becomes one "token<TAB>activation" line. Activations are normalized to [0, 10];
    they are replaced by the "unknown" marker for every token when `hide_activations` is set, and
    for tokens before `start_index` otherwise. With `omit_zeros`, tokens whose normalized
    activation is 0 are dropped; that mode cannot be combined with hiding or a start index.
    """
    tokens = activation_record.tokens
    normalized_activations = normalize_activations(activation_record.activations, max_activation)
    if omit_zeros:
        assert (not hide_activations) and start_index == 0, "Can't hide activations and omit zeros"
        tokens = [
            token for token, activation in zip(tokens, normalized_activations) if activation > 0
        ]
        normalized_activations = [x for x in normalized_activations if x > 0]

    assert len(tokens) == len(normalized_activations)
    entries = []
    for index, (token, activation) in enumerate(zip(tokens, normalized_activations)):
        if hide_activations or index < start_index:
            activation_string = UNKNOWN_ACTIVATION_STRING
        else:
            activation_string = str(int(activation))
        entries.append(f"{token}\t{activation_string}")
    return "\n".join(entries)


def format_activation_records(
    activation_records: Sequence[ActivationRecord],
    max_activation: float,
    *,
    omit_zeros: bool = False,
    start_indices: Optional[list[int]] = None,
    hide_activations: bool = False,
) -> str:
    """
    Format a list of activation records into a string.

    Records are separated by two blank lines and the whole block is wrapped in blank lines.
    `start_indices`, when given, supplies the per-record start index (one entry per record).
    """
    formatted_records = [
        _format_activation_record(
            activation_record,
            max_activation,
            omit_zeros=omit_zeros,
            hide_activations=hide_activations,
            start_index=0 if start_indices is None else start_indices[i],
        )
        for i, activation_record in enumerate(activation_records)
    ]
    return "\n\n" + "\n\n\n".join(formatted_records) + "\n\n"


def _format_tokens_for_simulation(tokens: Sequence[str]) -> str:
    """
    Format tokens into a string with each token marked as having an "unknown" activation, suitable
    for use in prompts.
    """
    return "\n".join(f"{token}\t{UNKNOWN_ACTIVATION_STRING}" for token in tokens)


def format_sequences_for_simulation(
    all_tokens: Sequence[Sequence[str]],
) -> str:
    """
    Format a list of lists of tokens into a string with each token marked as having an "unknown"
    activation, suitable for use in prompts.
    """
    formatted_sequences = [_format_tokens_for_simulation(tokens) for tokens in all_tokens]
    return "\n\n" + "\n\n\n".join(formatted_sequences) + "\n\n"


def non_zero_activation_proportion(
    activation_records: Sequence[ActivationRecord], max_activation: float
) -> float:
    """
    Return the proportion of activation values that aren't zero after normalization.

    Returns 0.0 when there are no activation values at all, rather than dividing by zero.
    """
    total_activations_count = sum(len(record.activations) for record in activation_records)
    if total_activations_count == 0:
        return 0.0
    non_zero_activations_count = sum(
        1
        for record in activation_records
        for value in normalize_activations(record.activations, max_activation)
        if value != 0
    )
    return non_zero_activations_count / total_activations_count
// Load a JSON file through the local development server (POST /load_az on port 8000).
// Like load_file_az, logs and resolves to undefined when the server answers with an HTTP error.
export const load_file_no_cache = async(path: string) => {
  const data = {
    path: path
  }
  const url = new URL("/load_az", window.location.href)
  url.port = '8000';
  const res = await fetch(url, {
    method: "POST", // or 'PUT'
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(data),
  })
  if (!res.ok) {
    console.error(`HTTP error: ${res.status} - ${res.statusText}`);
    return;
  }
  return await res.json()
}

// Load a JSON file directly from its public URL.
// Logs and resolves to undefined when the server answers with an HTTP error.
export const load_file_az = async(path: string) => {
  const res = (
    await fetch(path, {
      method: "GET",
      mode: "cors",
      headers: {
        "Content-Type": "application/json",
      },
    })
  )
  if (!res.ok) {
    console.error(`HTTP error: ${res.status} - ${res.statusText}`);
    return;
  }
  return await res.json()
}


// export const load_file = memoizeAsync('load_file', load_file_no_cache)
// Use the local development server only when the page is served from localhost.
export const load_file = window.location.host.indexOf('localhost:') === -1 ? load_file_az : load_file_no_cache;


// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation)
// const NEURON_RECORDS_PATH = "az://oaisbills/rcall/oss/migrated_make_crow_datasets/gpt2_xl_n_50000_64_token/neurons"
const NEURON_RECORDS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations"

// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation/neurons/explanations/canonical-run-v1)
// const EXPLANATIONS_PATH = "az://oaisbills/rcall/oss/migrated_explanation_datasets/canonical_gpt2_xl_all_neurons"
const EXPLANATIONS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/explanations"

// weight-based
// const WHOLE_LAYER_WEIGHT_TOKENS_PATH = "az://oaidan/rcall/data/interpretability/connections/gpt2-xl/mlp/unnorm_token_representations_uncommon_vanilla"
// const WEIGHT_TOKENS_PATH = "az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/weight-based"
const WEIGHT_TOKENS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based"
// lookup table
// const WHOLE_LAYER_ACTIVATION_TOKENS_PATH = "az://oaidan/rcall/data/interpretability/connections/gpt2_xl/mlp/unnorm_token_representations_vanilla_and_common_in_colangv2_unigram"
// const ACTIVATION_TOKENS_PATH = "az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/lookup-table"
const ACTIVATION_TOKENS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based"

// const CONNECTIONS_PATH = "az://oaialignment/datasets/interp/connections/gpt2/neuron_space/incl_attn_False"
const CONNECTIONS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-neurons/weight-based"


// Fetch the explanations file for one neuron.
export const get_explanations = async (activeNeuron: Neuron) => {
  const result = await load_file(`${EXPLANATIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.jsonl`)
  return result
}

// Fetch the related-token summary for one neuron.
// weightType selects the dataset: 'weight' or 'activation'; anything else throws.
export const get_top_tokens = async (activeNeuron: Neuron, weightType: string) => {
  let TOKENS_PATH;
  if (weightType === 'weight') {
    TOKENS_PATH = WEIGHT_TOKENS_PATH;
  } else if (weightType === 'activation') {
    TOKENS_PATH = ACTIVATION_TOKENS_PATH;
  } else {
    throw new Error(`Invalid weightType: ${weightType}`)
  }
  const result = await load_file(`${TOKENS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)
  return result
  // const result = await load_file_no_cache(`${ORIG_TOKENS_PATH}/${activeNeuron.layer}.json`)
  // return result.neuron_summaries[activeNeuron.neuron]
}

// Fetch the most strongly connected neurons for one neuron.
// Resolves to a map from direction ("input" / "output") to at most ten
// [layer, neuron, weight] triples. A direction is omitted when the file has no data for it,
// and the map is empty when the file could not be loaded.
export const get_top_neuron_connections = async (activeNeuron: Neuron) => {
  const result = await load_file(`${CONNECTIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)

  const res: {[key: string]: [number, number, number][]} = {};
  if (result == null) {
    // load_file already logged the HTTP error; there is nothing to summarize.
    return res
  }

  const sign = "positive"; // "negative"
  const weight_names: {[key: string]: string} = {output: "c_proj", input: "c_fc"};
  for (const direction of ["input", "output"]) {
    const res_for_dir = result[weight_names[direction]];
    // `== null` also skips a key that is missing entirely (undefined), not just an explicit null.
    if (res_for_dir == null) {
      continue
    }
    const top_neuron_strs = res_for_dir[`top_${sign}_neurons`] // {layer}_{neuron} strings for each top-connected neuron
    const top_weights = res_for_dir[`top_${sign}_weights`]
    const top_layer_neuron_tuples = top_neuron_strs.map((neuron_str: string, i: number) => {
      const [layer, neuron] = neuron_str.split("_").map((x: string) => parseInt(x, 10))
      return [layer, neuron, top_weights[i]] as [number, number, number]
    })
    res[direction] = top_layer_neuron_tuples.slice(0, 10)
  }

  return res
}

// Fetch the collated activation record for one neuron.
export const get_neuron_record = async(activeNeuron: Neuron) => {
  const result = await load_file(`${NEURON_RECORDS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)
  return result
}
13 | """ 14 | 15 | tokens: List[str] 16 | explanation: str 17 | 18 | 19 | class TokenSpaceFewShotExampleSet(Enum): 20 | """Determines which few-shot examples to use when sampling explanations.""" 21 | 22 | ORIGINAL = "original" 23 | TEST = "test" 24 | 25 | def get_examples(self) -> list[Example]: 26 | """Returns regular examples for use in a few-shot prompt.""" 27 | if self is TokenSpaceFewShotExampleSet.ORIGINAL: 28 | return ORIGINAL_EXAMPLES 29 | elif self is TokenSpaceFewShotExampleSet.TEST: 30 | return TEST_EXAMPLES 31 | else: 32 | raise ValueError(f"Unhandled example set: {self}") 33 | 34 | 35 | ORIGINAL_EXAMPLES = [ 36 | Example( 37 | tokens=[ 38 | "actual", 39 | " literal", 40 | " actual", 41 | " hyper", 42 | " real", 43 | " EX", 44 | " Real", 45 | "^", 46 | "Full", 47 | " full", 48 | " optical", 49 | " style", 50 | "any", 51 | "ALL", 52 | "extreme", 53 | " miniature", 54 | " Optical", 55 | " faint", 56 | "~", 57 | " Physical", 58 | " REAL", 59 | "*", 60 | "virtual", 61 | "TYPE", 62 | " technical", 63 | "otally", 64 | " physic", 65 | "Type", 66 | "<", 67 | "images", 68 | "atic", 69 | " sheer", 70 | " Style", 71 | " partial", 72 | " natural", 73 | "Hyper", 74 | " Any", 75 | " theoretical", 76 | "|", 77 | " ultimate", 78 | "oing", 79 | " constant", 80 | "ANY", 81 | "antically", 82 | "ishly", 83 | " ex", 84 | " visual", 85 | "special", 86 | "omorphic", 87 | "visual", 88 | ], 89 | explanation=" adjectives related to being real, or to physical properties and evidence", 90 | ), 91 | Example( 92 | tokens=[ 93 | "cephal", 94 | "aeus", 95 | " coma", 96 | "bered", 97 | "abetes", 98 | "inflamm", 99 | "rugged", 100 | "alysed", 101 | "azine", 102 | "hered", 103 | "cells", 104 | "aneously", 105 | "fml", 106 | "igm", 107 | "culosis", 108 | "iani", 109 | "CTV", 110 | "disabled", 111 | "heric", 112 | "ulo", 113 | "geoning", 114 | "awi", 115 | "translation", 116 | "iral", 117 | "govtrack", 118 | "mson", 119 | "cloth", 120 | "nesota", 121 | " Dise", 122 | " Lyme", 123 | " 
dementia", 124 | "agn", 125 | " reversible", 126 | " susceptibility", 127 | "esthesia", 128 | "orf", 129 | " inflamm", 130 | " Obesity", 131 | " tox", 132 | " Disorders", 133 | "uberty", 134 | "blind", 135 | "ALTH", 136 | "avier", 137 | " Immunity", 138 | " Hurt", 139 | "ulet", 140 | "ueless", 141 | " sluggish", 142 | "rosis", 143 | ], 144 | explanation=" words related to physical medical conditions", 145 | ), 146 | Example( 147 | tokens=[ 148 | " January", 149 | "terday", 150 | "cember", 151 | " April", 152 | " July", 153 | "September", 154 | "December", 155 | "Thursday", 156 | "quished", 157 | "November", 158 | "Tuesday", 159 | "uesday", 160 | " Sept", 161 | "ruary", 162 | " March", 163 | ";;;;;;;;;;;;", 164 | " Monday", 165 | "Wednesday", 166 | " Saturday", 167 | " Wednesday", 168 | "Reloaded", 169 | "aturday", 170 | " August", 171 | "Feb", 172 | "Sunday", 173 | "Reviewed", 174 | "uggest", 175 | " Dhabi", 176 | "ACTED", 177 | "tten", 178 | "Year", 179 | "August", 180 | "alogue", 181 | "MX", 182 | " Janeiro", 183 | "yss", 184 | " Leilan", 185 | " Fiscal", 186 | " referen", 187 | "semb", 188 | "eele", 189 | "wcs", 190 | "detail", 191 | "ertation", 192 | " Reborn", 193 | " Sunday", 194 | "itially", 195 | "aturdays", 196 | " Dise", 197 | "essage", 198 | ], 199 | explanation=" nouns related to time and dates", 200 | ), 201 | ] 202 | 203 | TEST_EXAMPLES = [ 204 | Example( 205 | tokens=[ 206 | "these", 207 | " are", 208 | " tokens", 209 | ], 210 | explanation=" this is a test explanation", 211 | ), 212 | ] 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automated interpretability 2 | 3 | ## Code and tools 4 | 5 | This repository contains code and tools associated with the [Language models can explain neurons in 6 | language models](https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html) paper, specifically: 7 | 
8 | * Code for automatically generating, simulating, and scoring explanations of neuron behavior using 9 | the methodology described in the paper. See the 10 | [neuron-explainer README](neuron-explainer/README.md) for more information. 11 | 12 | Note: if you run into errors of the form "Error: Could not find any credentials that grant access to storage account: 'openaipublic' and container: 'neuron-explainer'.", you might be able to fix this by signing up for an Azure account and specifying the credentials as described in the error message. 13 | 14 | * A tool for viewing neuron activations and explanations, accessible 15 | [here](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html). See 16 | the [neuron-viewer README](neuron-viewer/README.md) for more information. 17 | 18 | ## Public datasets 19 | 20 | Together with this code, we're also releasing public datasets of GPT-2 XL neurons and explanations. 21 | Here's an overview of those datasets. 22 | 23 | * Neuron activations: `az://openaipublic/neuron-explainer/data/collated-activations/{layer_index}/{neuron_index}.json` 24 | - Tokenized text sequences and their activations for the neuron. We 25 | provide multiple sets of tokens and activations: top-activating ones, random 26 | samples from several quantiles, and a completely random sample. We also provide 27 | some basic statistics for the activations. 28 | - Each file contains a JSON-formatted 29 | [`NeuronRecord`](neuron-explainer/neuron_explainer/activations/activations.py#L89) dataclass. 30 | * Neuron explanations: `az://openaipublic/neuron-explainer/data/explanations/{layer_index}/{neuron_index}.jsonl` 31 | - Scored model-generated explanations of the behavior of the neuron, including simulation results. 32 | - Each file contains a JSON-formatted 33 | [`NeuronSimulationResults`](neuron-explainer/neuron_explainer/explanations/explanations.py#L146) 34 | dataclass.
35 | * Related neurons: `az://openaipublic/neuron-explainer/data/related-neurons/weight-based/{layer_index}/{neuron_index}.json` 36 | - Lists of the upstream and downstream neurons with the most positive and negative connections (see below for definition). 37 | - Each file contains a JSON-formatted dataclass whose definition is not included in this repo. 38 | * Tokens with high average activations: 39 | `az://openaipublic/neuron-explainer/data/related-tokens/activation-based/{layer_index}/{neuron_index}.json` 40 | - Lists of tokens with the highest average activations for individual neurons, and their average activations. 41 | - Each file contains a JSON-formatted [`TokenLookupTableSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L36) 42 | dataclass. 43 | * Tokens with large inbound and outbound weights: 44 | `az://openaipublic/neuron-explainer/data/related-tokens/weight-based/{layer_index}/{neuron_index}.json` 45 | - List of the most-positive and most-negative input and output tokens for individual neurons, 46 | as well as the associated weight (see below for definition). 47 | - Each file contains a JSON-formatted [`WeightBasedSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L17) 48 | dataclass. 49 | 50 | Update (July 5, 2023): 51 | We also released a set of explanations for GPT-2 Small. The methodology is slightly different from the methodology used for GPT-2 XL so the results aren't directly comparable. 52 | * Neuron activations: `az://openaipublic/neuron-explainer/gpt2_small_data/collated-activations/{layer_index}/{neuron_index}.json` 53 | * Neuron explanations: `az://openaipublic/neuron-explainer/gpt2_small_data/explanations/{layer_index}/{neuron_index}.jsonl` 54 | 55 | Update (August 30, 2023): We recently discovered a bug in how we performed inference on the GPT-2 series models used for the paper and for these datasets. 
Specifically, we used an optimized GELU implementation rather than the original GELU implementation associated with GPT-2. While the model’s behavior is very similar across these two configurations, the post-MLP activation values we used to generate and simulate explanations differ from the correct values by the following amounts for GPT-2 Small: 56 | 57 | - Median: 0.0090 58 | - 90th percentile: 0.0252 59 | - 99th percentile: 0.0839 60 | - 99.9th percentile: 0.1736 61 | 62 | ### Definition of connection weights 63 | 64 | Refer to the [GPT-2 model code](https://github.com/openai/gpt-2/blob/master/src/model.py) to 65 | understand the model weight conventions. 66 | 67 | *Neuron-neuron*: For two neurons `(l1, n1)` and `(l2, n2)` with `l1 < l2`, the connection strength is defined as 68 | `h{l1}.mlp.c_proj.w[:, n1, :] @ diag(h{l2}.ln_2.g) @ h{l2}.mlp.c_fc.w[:, :, n2]`. 69 | 70 | *Neuron-token*: For token `t` and neuron `(l, n)`, the input weight is computed as 71 | `wte[t, :] @ diag(h{l}.ln_2.g) @ h{l}.mlp.c_fc.w[:, :, n]` 72 | and the output weight is computed as 73 | `h{l}.mlp.c_proj.w[:, n, :] @ diag(ln_f.g) @ wte[t, :]`. 74 | 75 | ### Misc Lists of Interesting Neurons 76 | Lists of neurons we thought were interesting according to different criteria, with some preliminary descriptions. 77 | * [Interesting Neurons (external)](https://docs.google.com/spreadsheets/d/1p7fYs31NU8sJoeKyUx4Mn2laGx8xXfHg_KcIvYiKPpg/edit#gid=0) 78 | * [Neurons that score high on random, possibly monosemantic?
def is_api_error(err: Exception) -> bool:
    """
    Decide whether a failed API call should be retried, printing the reason.

    Returns False only for HTTP 400/404/415 responses that are not idempotency errors (the
    request itself is invalid, so retrying cannot help). Every other exception, including
    unexpected ones, is reported and retried.
    """
    if isinstance(err, httpx.HTTPStatusError):
        response = err.response
        # This predicate runs inside the retry loop's exception handler, so it must not raise
        # itself: an error body is not guaranteed to be JSON, nor to have the expected shape.
        try:
            payload = response.json()
        except ValueError:
            payload = None
        error_data = payload.get("error") if isinstance(payload, dict) else None
        if not isinstance(error_data, dict):
            error_data = {}
        error_message = error_data.get("message")
        if response.status_code in [400, 404, 415]:
            if error_data.get("type") == "idempotency_error":
                print(f"Retrying after idempotency error: {error_message} ({response.url})")
                return True
            # Invalid request
            return False
        print(f"Retrying after API error: {error_message} ({response.url})")
        return True

    if isinstance(err, httpx.ConnectError):
        print(f"Retrying after connection error... ({err.request.url})")
        return True

    if isinstance(err, httpx.TimeoutException):
        print(f"Retrying after a timeout error... ({err.request.url})")
        return True

    if isinstance(err, httpx.ReadError):
        print(f"Retrying after a read error... ({err.request.url})")
        return True

    print(f"Retrying after an unexpected error: {repr(err)}")
    traceback.print_tb(err.__traceback__)
    return True


def exponential_backoff(
    retry_on: Callable[[Exception], bool] = lambda err: True
) -> Callable[[Callable], Callable]:
    """
    Returns a decorator which retries the wrapped function as long as the specified retry_on
    function returns True for the exception, applying exponential backoff with jitter after
    failures, up to a retry limit.

    The wrapped function must be a coroutine function. Once the retry limit is reached, or
    retry_on returns False, the last exception is re-raised unchanged.
    """
    init_delay_s = 1.0
    max_delay_s = 10.0
    # Roughly 30 minutes before we give up.
    max_tries = 200
    backoff_multiplier = 2.0
    jitter = 0.2

    def decorate(f: Callable) -> Callable:
        assert asyncio.iscoroutinefunction(f)

        @wraps(f)
        async def f_retry(*args: Any, **kwargs: Any) -> Any:
            delay_s = init_delay_s
            for attempt in range(max_tries):
                try:
                    return await f(*args, **kwargs)
                except Exception as err:
                    # Check the attempt budget first so that retry_on (which may log
                    # "Retrying...") is not consulted when no retry will follow.
                    if attempt == max_tries - 1 or not retry_on(err):
                        raise
                    jittered_delay = random.uniform(delay_s * (1 - jitter), delay_s * (1 + jitter))
                    await asyncio.sleep(jittered_delay)
                    delay_s = min(delay_s * backoff_multiplier, max_delay_s)

        return f_retry

    return decorate
API_KEY = os.getenv("OPENAI_API_KEY")
assert API_KEY, "Please set the OPENAI_API_KEY environment variable"
API_HTTP_HEADERS = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + API_KEY,
}
BASE_API_URL = "https://api.openai.com/v1"


class ApiClient:
    """Performs inference using the OpenAI API. Supports response caching and concurrency limits."""

    def __init__(
        self,
        model_name: str,
        # If set, no more than this number of HTTP requests will be made concurrently.
        max_concurrent: Optional[int] = None,
        # Whether to cache request/response pairs in memory to avoid duplicating requests.
        cache: bool = False,
    ):
        self.model_name = model_name

        if max_concurrent is not None:
            self._concurrency_check: Optional[Semaphore] = Semaphore(max_concurrent)
        else:
            self._concurrency_check = None

        # Keys are the orjson-serialized request kwargs, i.e. bytes.
        if cache:
            self._cache: Optional[dict[bytes, Any]] = {}
        else:
            self._cache = None

    @exponential_backoff(retry_on=is_api_error)
    async def make_request(
        self, timeout_seconds: Optional[int] = None, **kwargs: Any
    ) -> dict[str, Any]:
        """
        POST `kwargs` as the JSON body of a completion request and return the parsed response.

        Requests containing a "messages" key go to /chat/completions, all others to /completions;
        the client's model name is always added to the request. When caching is enabled, a
        response is stored under the serialized kwargs and returned again for an identical later
        request (the cached object itself is returned, not a copy). HTTP errors are re-raised
        after printing the response body; the retry decorator decides whether to try again.
        """
        key: Optional[bytes] = None
        if self._cache is not None:
            key = orjson.dumps(kwargs)
            if key in self._cache:
                return self._cache[key]
        async with contextlib.AsyncExitStack() as stack:
            if self._concurrency_check is not None:
                await stack.enter_async_context(self._concurrency_check)
            http_client = await stack.enter_async_context(
                httpx.AsyncClient(timeout=timeout_seconds)
            )
            # If the request has a "messages" key, it should be sent to the /chat/completions
            # endpoint. Otherwise, it should be sent to the /completions endpoint.
            url = BASE_API_URL + ("/chat/completions" if "messages" in kwargs else "/completions")
            kwargs["model"] = self.model_name
            response = await http_client.post(url, headers=API_HTTP_HEADERS, json=kwargs)
        # The response body has useful information but the exception doesn't include it, so print
        # it out then reraise.
        try:
            response.raise_for_status()
        except Exception:
            # An error body is not always JSON; fall back to the raw text so that a parse failure
            # here cannot replace the HTTP error being reported.
            try:
                print(response.json())
            except ValueError:
                print(response.text)
            raise
        payload = response.json()
        if self._cache is not None and key is not None:
            self._cache[key] = payload
        return payload
def flatten_list(list_of_lists: Sequence[Sequence[Any]]) -> list[Any]:
    """Concatenate the items of every inner sequence into one flat list, preserving order."""
    flattened: list[Any] = []
    for sublist in list_of_lists:
        flattened.extend(sublist)
    return flattened


def correlation_score(
    real_activations: Sequence[float] | np.ndarray,
    predicted_activations: Sequence[float] | np.ndarray,
) -> float:
    """Return the correlation coefficient between real and predicted activations."""
    correlation_matrix = np.corrcoef(real_activations, predicted_activations)
    # The off-diagonal entry is the correlation between the two inputs.
    return correlation_matrix[0, 1]


def score_from_simulation(
    real_activations: ActivationRecord,
    simulation: SequenceSimulation,
    score_function: Callable[[Sequence[float] | np.ndarray, Sequence[float] | np.ndarray], float],
) -> float:
    """Apply `score_function` to a record's real activations and a simulation's expected ones."""
    actual = real_activations.activations
    expected = simulation.expected_activations
    return score_function(actual, expected)


def rsquared_score_from_sequences(
    real_activations: Sequence[float] | np.ndarray,
    predicted_activations: Sequence[float] | np.ndarray,
) -> float:
    """
    Return 1 - mean((real - predicted)^2) / mean(real^2).

    Note that the denominator is the mean square of the real activations themselves; no mean is
    subtracted from them.
    """
    real = np.array(real_activations)
    predicted = np.array(predicted_activations)
    mean_squared_error = np.mean(np.square(real - predicted))
    mean_squared_real = np.mean(np.square(real))
    return float(1 - mean_squared_error / mean_squared_real)


def absolute_dev_explained_score_from_sequences(
    real_activations: Sequence[float] | np.ndarray,
    predicted_activations: Sequence[float] | np.ndarray,
) -> float:
    """Return 1 - mean(|real - predicted|) / mean(|real|)."""
    real = np.array(real_activations)
    predicted = np.array(predicted_activations)
    mean_absolute_error = np.mean(np.abs(real - predicted))
    mean_absolute_real = np.mean(np.abs(real))
    return float(1 - mean_absolute_error / mean_absolute_real)
async def make_explanation_simulator(
    explanation: str,
    calibration_activation_records: Sequence[ActivationRecord],
    model_name: str,
    calibrated_simulator_class: type[CalibratedNeuronSimulator] = LinearCalibratedNeuronSimulator,
) -> CalibratedNeuronSimulator:
    """
    Build an explanation-driven simulator, wrap it in `calibrated_simulator_class`, and calibrate
    the wrapper on the given activation records before returning it.
    """
    base_simulator = ExplanationNeuronSimulator(model_name, explanation)
    calibrated = calibrated_simulator_class(base_simulator)
    await calibrated.calibrate(calibration_activation_records)
    return calibrated


async def _simulate_and_score_sequence(
    simulator: NeuronSimulator, activations: ActivationRecord
) -> ScoredSequenceSimulation:
    """Simulate one token sequence and score the simulation against its real activations."""
    simulation = await simulator.simulate(activations.tokens)
    logging.debug(simulation)
    rsquared = score_from_simulation(activations, simulation, rsquared_score_from_sequences)
    absolute_dev_explained = score_from_simulation(
        activations, simulation, absolute_dev_explained_score_from_sequences
    )
    ev_correlation = score_from_simulation(activations, simulation, correlation_score)
    return ScoredSequenceSimulation(
        simulation=simulation,
        true_activations=activations.activations,
        ev_correlation_score=ev_correlation,
        rsquared_score=rsquared,
        absolute_dev_explained_score=absolute_dev_explained,
    )


def aggregate_scored_sequence_simulations(
    scored_sequence_simulations: list[ScoredSequenceSimulation],
) -> ScoredSimulation:
    """
    Combine per-sequence results into a single scored simulation.

    All scores are computed over the activations of every sequence pooled together rather than by
    averaging the per-sequence scores; for the EV correlation in particular the two are not
    equivalent.
    """
    all_true_activations: list[float] = [
        value
        for scored in scored_sequence_simulations
        for value in (scored.true_activations or [])
    ]
    all_expected_values: list[float] = [
        value
        for scored in scored_sequence_simulations
        for value in scored.simulation.expected_activations
    ]

    # Only the correlation is skipped when there are no true activations.
    # NOTE(review): the two scores below are still computed in that case, and a sequence without
    # true activations still contributes expected values; confirm callers never hit that.
    ev_correlation_score = None
    if all_true_activations:
        ev_correlation_score = correlation_score(all_true_activations, all_expected_values)
    rsquared_score = rsquared_score_from_sequences(all_true_activations, all_expected_values)
    absolute_dev_explained_score = absolute_dev_explained_score_from_sequences(
        all_true_activations, all_expected_values
    )

    return ScoredSimulation(
        scored_sequence_simulations=scored_sequence_simulations,
        ev_correlation_score=ev_correlation_score,
        rsquared_score=rsquared_score,
        absolute_dev_explained_score=absolute_dev_explained_score,
    )


async def simulate_and_score(
    simulator: NeuronSimulator,
    activation_records: Sequence[ActivationRecord],
) -> ScoredSimulation:
    """
    Simulate every activation record concurrently with `simulator`, score each one, and return
    the aggregated result.
    """
    pending = [
        _simulate_and_score_sequence(simulator, activation_record)
        for activation_record in activation_records
    ]
    scored_sequence_simulations = await asyncio.gather(*pending)
    return aggregate_scored_sequence_simulations(scored_sequence_simulations)


async def make_simulator_and_score(
    make_simulator: Coroutine[None, None, NeuronSimulator],
    activation_records: Sequence[ActivationRecord],
) -> ScoredSimulation:
    """Await the simulator-producing coroutine, then score the records with that simulator."""
    simulator = await make_simulator
    scored_simulation = await simulate_and_score(simulator, activation_records)
    return scored_simulation
async () => { 33 | const result = await get_explanations(activeNeuron); 34 | setData(result.scored_explanations[0]) 35 | setIsLoading(false) 36 | } 37 | 38 | useEffect(() => { 39 | if (!data) { 40 | loadExplanation() 41 | } 42 | }, []) 43 | 44 | const handleToggleChange = () => { 45 | setToggle(!toggle); 46 | }; 47 | 48 | let sim_sequences; 49 | if (data) { 50 | sim_sequences = zip_simulated_sequences(data.scored_simulation.scored_sequence_simulations); 51 | [sim_sequences] = normalizeTokenActs(sim_sequences) 52 | } else { 53 | sim_sequences = [] 54 | } 55 | 56 | let real_sequences; 57 | if (data) { 58 | real_sequences = zip_real_sequences(data.scored_simulation.scored_sequence_simulations); 59 | [real_sequences] = normalizeTokenActs(real_sequences) 60 | } else { 61 | real_sequences = [] 62 | } 63 | 64 | const suggest_explanation_link = "https://docs.google.com/forms/d/e/1FAIpQLSckMyDQedGhdISIqaqn0YGUtd2xqEWgPu7ehoPUTT2pTge_-g/viewform?" 65 | + `usp=pp_url&entry.541490611=${activeNeuron.layer}` 66 | + `&entry.1688855196=${activeNeuron.neuron}` 67 | + `&entry.495312202=https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html%23/layers/${activeNeuron.layer}/neurons/${activeNeuron.neuron}`; 68 | 69 | return ( 70 | <> 71 |
72 |

Explanation

73 | {isLoading ? ( 74 |
75 |
Loading...
76 |
77 | ) : ( 78 | <> 79 |
80 |

81 | {data.explanation} 82 |

83 |

84 | score: {data.scored_simulation.ev_correlation_score.toFixed(2)} 85 |

86 |

87 | Suggest Better Explanation 88 |

89 |
90 | 93 | { 94 | showingScoringDetails ? 95 | <> 96 |
101 |
115 | 128 | 142 |
143 |
144 | {toggle ? 'Activations overlaid (top = real, bottom = simulated)' : 'Activations not overlaid'} 145 |
146 |

Top

147 | 152 |

Random

153 | 158 | : null 159 | } 160 | 161 | )} 162 |
163 | 164 | ) 165 | } 166 | 167 | export default ExplanationDisplay 168 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/explanations/test_explainer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | 4 | from neuron_explainer.explanations.explainer import ( 5 | TokenActivationPairExplainer, 6 | TokenSpaceRepresentationExplainer, 7 | ) 8 | from neuron_explainer.explanations.few_shot_examples import TEST_EXAMPLES, FewShotExampleSet 9 | from neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role 10 | from neuron_explainer.explanations.token_space_few_shot_examples import ( 11 | TokenSpaceFewShotExampleSet, 12 | ) 13 | 14 | 15 | def setup_module(unused_module: Any) -> None: 16 | # Make sure we have an event loop, since the attempt to create the Semaphore in 17 | # ResearchApiClient will fail without it. 18 | loop = asyncio.new_event_loop() 19 | asyncio.set_event_loop(loop) 20 | 21 | 22 | def test_if_formatting() -> None: 23 | expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words. 24 | 25 | The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match. 26 | 27 | Neuron 1 28 | Activations: 29 | 30 | a 10 31 | b 0 32 | c 0 33 | 34 | 35 | d 0 36 | e 10 37 | f 0 38 | 39 | 40 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels. 
41 | 42 | Neuron 2 43 | Activations: 44 | 45 | a 10 46 | b 0 47 | c 0 48 | 49 | 50 | d 0 51 | e 10 52 | f 0 53 | 54 | 55 | Explanation of neuron 2 behavior:<|endofprompt|> the main thing this neuron does is find""" 56 | 57 | explainer = TokenActivationPairExplainer( 58 | model_name="text-davinci-003", 59 | prompt_format=PromptFormat.INSTRUCTION_FOLLOWING, 60 | few_shot_example_set=FewShotExampleSet.TEST, 61 | ) 62 | prompt = explainer.make_explanation_prompt( 63 | all_activation_records=TEST_EXAMPLES[0].activation_records, 64 | max_activation=1.0, 65 | max_tokens_for_completion=20, 66 | ) 67 | 68 | assert prompt == expected_prompt 69 | 70 | 71 | def test_harmony_format() -> None: 72 | expected_prompt = [ 73 | HarmonyMessage( 74 | role=Role.SYSTEM, 75 | content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words. 76 | 77 | The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. 
The higher the activation value, the stronger the match.""", 78 | ), 79 | HarmonyMessage( 80 | role=Role.USER, 81 | content=""" 82 | 83 | Neuron 1 84 | Activations: 85 | 86 | a 10 87 | b 0 88 | c 0 89 | 90 | 91 | d 0 92 | e 10 93 | f 0 94 | 95 | 96 | Explanation of neuron 1 behavior: the main thing this neuron does is find""", 97 | ), 98 | HarmonyMessage( 99 | role=Role.ASSISTANT, 100 | content=" vowels.", 101 | ), 102 | HarmonyMessage( 103 | role=Role.USER, 104 | content=""" 105 | 106 | Neuron 2 107 | Activations: 108 | 109 | a 10 110 | b 0 111 | c 0 112 | 113 | 114 | d 0 115 | e 10 116 | f 0 117 | 118 | 119 | Explanation of neuron 2 behavior: the main thing this neuron does is find""", 120 | ), 121 | ] 122 | 123 | explainer = TokenActivationPairExplainer( 124 | model_name="gpt-4", 125 | prompt_format=PromptFormat.HARMONY_V4, 126 | few_shot_example_set=FewShotExampleSet.TEST, 127 | ) 128 | prompt = explainer.make_explanation_prompt( 129 | all_activation_records=TEST_EXAMPLES[0].activation_records, 130 | max_activation=1.0, 131 | max_tokens_for_completion=20, 132 | ) 133 | 134 | assert isinstance(prompt, list) 135 | assert isinstance(prompt[0], dict) # Really a HarmonyMessage 136 | for actual_message, expected_message in zip(prompt, expected_prompt): 137 | assert actual_message["role"] == expected_message["role"] 138 | assert actual_message["content"] == expected_message["content"] 139 | assert prompt == expected_prompt 140 | 141 | 142 | def test_token_space_explainer_if_formatting() -> None: 143 | expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words. 144 | 145 | 146 | 147 | Tokens: 148 | 'these', ' are', ' tokens' 149 | 150 | Explanation: 151 | This neuron is looking for this is a test explanation. 
152 | 153 | 154 | 155 | Tokens: 156 | 'foo', 'bar', 'baz' 157 | 158 | Explanation: 159 | <|endofprompt|>This neuron is looking for""" 160 | 161 | explainer = TokenSpaceRepresentationExplainer( 162 | model_name="text-davinci-002", 163 | prompt_format=PromptFormat.INSTRUCTION_FOLLOWING, 164 | use_few_shot=True, 165 | few_shot_example_set=TokenSpaceFewShotExampleSet.TEST, 166 | ) 167 | prompt = explainer.make_explanation_prompt( 168 | tokens=["foo", "bar", "baz"], 169 | max_tokens_for_completion=20, 170 | ) 171 | 172 | assert prompt == expected_prompt 173 | 174 | 175 | def test_token_space_explainer_harmony_formatting() -> None: 176 | expected_prompt = [ 177 | HarmonyMessage( 178 | role=Role.SYSTEM, 179 | content="We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.", 180 | ), 181 | HarmonyMessage( 182 | role=Role.USER, 183 | content=""" 184 | 185 | 186 | 187 | Tokens: 188 | 'these', ' are', ' tokens' 189 | 190 | Explanation: 191 | This neuron is looking for""", 192 | ), 193 | HarmonyMessage( 194 | role=Role.ASSISTANT, 195 | content=" this is a test explanation.", 196 | ), 197 | HarmonyMessage( 198 | role=Role.USER, 199 | content=""" 200 | 201 | 202 | 203 | Tokens: 204 | 'foo', 'bar', 'baz' 205 | 206 | Explanation: 207 | This neuron is looking for""", 208 | ), 209 | ] 210 | 211 | explainer = TokenSpaceRepresentationExplainer( 212 | model_name="gpt-4", 213 | prompt_format=PromptFormat.HARMONY_V4, 214 | use_few_shot=True, 215 | few_shot_example_set=TokenSpaceFewShotExampleSet.TEST, 216 | ) 217 | prompt = explainer.make_explanation_prompt( 218 | tokens=["foo", "bar", "baz"], 219 | max_tokens_for_completion=20, 220 | ) 221 | 222 | assert isinstance(prompt, list) 223 | assert isinstance(prompt[0], dict) # Really a HarmonyMessage 224 | 
/*
 * Global stylesheet for the neuron viewer: Tailwind layers first, then the
 * hand-written base styles (structure, typography, alerts, sidenotes,
 * buttons, blockquotes).
 */
@tailwind base;
@tailwind components;
@tailwind utilities;


:root {
  --secondary-color: #0d978b;
  --accent-color: #ff4d00;
}

/* Break out of a centered container and span the whole viewport width. */
.full-width{
  width: 100vw;
  position: relative;
  margin-left: -50vw;
  left: 50%;
}

.App {
  text-align: center;
}

.App-logo {
  height: 40vmin;
  pointer-events: none;
}

/* Only spin the logo for users who have not asked for reduced motion. */
@media (prefers-reduced-motion: no-preference) {
  .App-logo {
    animation: App-logo-spin infinite 20s linear;
  }
}

.App h1 {
  font-size: 1.75rem;
}

.App-article {
  background-color: #282c34;
  min-height: 100vh;
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  font-size: calc(10px + 2vmin);
  color: white;
}

.App-link {
  color: #61dafb;
}

@keyframes App-logo-spin {
  from {
    transform: rotate(0deg);
  }
  to {
    transform: rotate(360deg);
  }
}


/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Structure                              */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

body {
  margin: 0;
  padding: 0 1em;
  font-size: 12pt;
}

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Typography                             */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

h1 {
  font-size: 24pt;
  font-weight: 500;
  padding: 1em 0 0;
  display: block;
  color: #000;
}
h3 { padding: 0 0; }
h2 { padding: 1em 0 0.5em 0; }
h4, h5 {
  text-transform: uppercase;
  margin: 1em 0;
  justify-tracks: space-between;
  font-family: var(--sans-serif);
  font-size: 12pt;
  font-weight: 600;
}
h2, h3 { font-weight: 500; font-style: italic; }
subtitle {
  color: #555;
  font-size: 18pt;
  font-style: italic;
  padding: 0;
  display: block;
  margin-bottom: 1em
}

a {
  transition: all .05s ease-in-out;
  color: #5c60c3 !important;
  font-style: normal;
}
a:hover { color: var(--accent-color)!important; }
code, pre { color: var(--inline-code-color);
  background-color: #eee; border-radius: 3px; }
pre { padding: 1em; margin: 2em 0; }
code { padding: 0.3em; }
.text-secondary, h3, h5 { color: var(--secondary-color); }
.text-primary, h2,h4 { color: var(--primary-color); }

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Images                                 */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

img#logo {
  width: 50%;
  margin: 3em 0 0
}

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Alerts                                 */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

.alert {
  font-weight: 600;
  font-style: italic;
  display: block;
  background-color: #fff7f7;
  padding: 1em;
  margin: 0;
  border-radius: 5px;
  color: #f25555
}
.alert.cool {
  background-color: #f3f0fc;
  color: #7155cf;
}
.flash-alert {
  display: inline-block;
  transition: ease-in-out 1s;
  font-size: 14pt;
  margin: 1em 0;
  padding-top: 0.5em;
}
.flash-alert.success {
  color: #000;
}
.flash-alert.failure {
  color: red;
}
.flash-alert.hidden {
  display: none;
}


/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Sidenotes & Superscripts               */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

/* One shared counter: every <sup> increments it, and both the <sup> and
   its sidenote print the current value as a lower-roman marker. */
body { counter-reset: count; }
/* NOTE(review): `whitespace` is not a CSS property (the real one is
   `white-space`), so browsers ignore this declaration. Left untouched on
   purpose: correcting the name would stop paragraphs from wrapping. */
p { whitespace: nowrap; }
sup {
  font-weight: 300;
  padding-right: .2em;
  counter-increment: count;
}
sidenote::before,
sup::before {
  content: counter(count, lower-roman);
  display: inline-block;
  font-size: 10pt;
  font-weight: bold;
  color: var(--accent-color);
}
sidenote::before {
  margin-right: .5em;
  font-weight: 700
}

/* Different behavior if the screen is too
   narrow to show a sidenote on the side. */

@media (min-width:860px) {
  sidenote {
    clear: right;
    font-size: 10pt;
    position: fixed;
    float: right;
    white-space: normal;
    right: 20px;
    width: 200px;
    display: block;
    max-width: 30%
  }
}

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Print & narrow screens                 */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

@media print {
  a.btn, button {
    display: none!important
  }
}

@media (max-width:860px) {
  sidenote {
    display: block;
    font-size: 11pt;
    margin: 2em 3em 2em 2em
  }
}

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Buttons                                */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

@media screen {
  button:hover { box-shadow: 0.5em 0.5em var(--accent-color); }
  a.btn, button {
    border-radius: 3px;
    color: #000 !important;
    text-decoration: none !important;
    font-size: 11pt;
    border: 1px solid #000;
    padding: 0.5em 1em;
    font-family: -apple-system,
        BlinkMacSystemFont,
        "avenir next",
        avenir,
        helvetica,
        "helvetica neue",
        ubuntu,
        roboto,
        noto,
        "segoe ui",
        arial,
        sans-serif !important;
    background: #fff;
    margin: 1.5em 0;
    font-weight: 500;
    transition: all .05s ease-in-out,box-shadow-color .025s ease-in-out;
    box-shadow: 0.5em 0.5em #eee;
    display: inline-block;
  }

  a.btn:hover, button:hover {
    /* The `;` after `pointer` was missing, which merged the two lines into
       one invalid declaration and made the browser drop both of them. */
    cursor: pointer;
    box-shadow: 0.5em 0.5em var(--accent-color);
  }
  a.btn:active, button.active, button:active {
    border: 1px solid;
    margin: 2em 0 1em 1em;
    box-shadow: 0 0 #000 !important
  }
  a.btn.small,button.small {
    box-shadow: .5em .5em 0 #eee;
    border: 1px solid #000;
    padding: .6em 1em;
    font-weight: 500
  }
  a.btn.small:hover,button.small:hover {
    box-shadow: 0.5em 0.5em var(--accent-color);
  }
  a.btn.small:active,button.small:active {
    margin: 2em 0 1em 1em;
    box-shadow: 0 0 #000
  }
}

/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
/* Blockquotes & Epigraphs                */
/* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */

blockquote {
  margin: 1em;
}
div>blockquote>p {
  font-size: 13pt;
  color: #555;
  font-style: normal!important;
  margin: 0;
  padding: 1em 0 1.5em
}
blockquote > blockquote {
  padding: 0.5em 2em 1em 1.5em !important;
}

blockquote > blockquote,
blockquote > blockquote > p {
  font-size: 14pt;
  padding: 0;
  margin: 0;
  text-align: center;
  font-style: italic;
  color: var(--epigraph-color);
}
blockquote footer {
  font-size: 12pt;
  text-align: inherit;
  display: block;
  font-style: normal;
  margin: 1em;
  color: #aaa;
}
# Code for calibrating simulations of neuron behavior. Calibration refers to a process of mapping
# from a space of predicted activation values (e.g. [0, 10]) to the real activation distribution
# for a neuron.
#
# Calibration is necessary for simulating neurons in the context of ablate-to-simulation, but can
# be skipped when using correlation scoring. (Calibration may still improve quality for scoring,
# at least for non-linear calibration methods.)

from __future__ import annotations

import asyncio
from abc import abstractmethod
from typing import Optional, Sequence

import numpy as np
from neuron_explainer.activations.activations import ActivationRecord
from neuron_explainer.explanations.explanations import ActivationScale
from neuron_explainer.explanations.simulator import NeuronSimulator, SequenceSimulation
from sklearn import linear_model


class CalibratedNeuronSimulator(NeuronSimulator):
    """
    Wrap a NeuronSimulator and calibrate it to map from the predicted activation space to the
    actual neuron activation space.

    Subclasses define the mapping by implementing `_calibrate_from_flattened_activations` (fit the
    mapping) and `apply_calibration` (apply it to a sequence of values).
    """

    def __init__(self, uncalibrated_simulator: NeuronSimulator):
        self.uncalibrated_simulator = uncalibrated_simulator

    @classmethod
    async def create(
        cls,
        uncalibrated_simulator: NeuronSimulator,
        calibration_activation_records: Sequence[ActivationRecord],
    ) -> CalibratedNeuronSimulator:
        """
        Create and calibrate a calibrated simulator (so initialization and calibration can be done
        in one call).
        """
        calibrated_simulator = cls(uncalibrated_simulator)
        await calibrated_simulator.calibrate(calibration_activation_records)
        return calibrated_simulator

    async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
        """
        Determine parameters to map from the predicted activation space to the real neuron
        activation space, based on a calibration set.

        Use when simulated sequences haven't already been produced on the calibration set: the
        wrapped simulator is run concurrently on the tokens of every calibration record first.
        """
        simulations = await asyncio.gather(
            *[
                self.uncalibrated_simulator.simulate(activations.tokens)
                for activations in calibration_activation_records
            ]
        )
        self.calibrate_from_simulations(calibration_activation_records, simulations)

    def calibrate_from_simulations(
        self,
        calibration_activation_records: Sequence[ActivationRecord],
        simulations: Sequence[SequenceSimulation],
    ) -> None:
        """
        Determine parameters to map from the predicted activation space to the real neuron
        activation space, based on a calibration set.

        Use when simulated sequences have already been produced on the calibration set.

        Raises:
            ValueError: if the number of simulations differs from the number of activation
                records.
        """
        if len(calibration_activation_records) != len(simulations):
            # zip() below would silently drop the unmatched tail and fit the calibration on only
            # part of the data, so refuse mismatched inputs explicitly.
            raise ValueError(
                "calibration_activation_records and simulations must have the same length "
                f"(got {len(calibration_activation_records)} and {len(simulations)})"
            )
        flattened_activations = []
        flattened_simulated_activations: list[float] = []
        for activations, simulation in zip(calibration_activation_records, simulations):
            flattened_activations.extend(activations.activations)
            flattened_simulated_activations.extend(simulation.expected_activations)
        self._calibrate_from_flattened_activations(
            np.array(flattened_activations), np.array(flattened_simulated_activations)
        )

    @abstractmethod
    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        """
        Determine parameters to map from the predicted activation space to the real neuron
        activation space, based on a calibration set.

        Take numpy arrays of all true activations and all uncalibrated activations on the
        calibration set over all sequences.
        """

    @abstractmethod
    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        """Apply the learned calibration to a sequence of values."""

    async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:
        """
        Simulate `tokens` with the wrapped simulator and return the calibrated result.

        The expected activations and every list of distribution values are passed through
        `apply_calibration`; the distribution probabilities are carried over unchanged, and the
        raw simulation is kept on the result as `uncalibrated_simulation`.
        """
        uncalibrated_seq_simulation = await self.uncalibrated_simulator.simulate(tokens)
        calibrated_activations = self.apply_calibration(
            uncalibrated_seq_simulation.expected_activations
        )
        calibrated_distribution_values = [
            self.apply_calibration(dv) for dv in uncalibrated_seq_simulation.distribution_values
        ]
        return SequenceSimulation(
            tokens=uncalibrated_seq_simulation.tokens,
            expected_activations=calibrated_activations,
            activation_scale=ActivationScale.NEURON_ACTIVATIONS,
            distribution_values=calibrated_distribution_values,
            distribution_probabilities=uncalibrated_seq_simulation.distribution_probabilities,
            uncalibrated_simulation=uncalibrated_seq_simulation,
        )


class UncalibratedNeuronSimulator(CalibratedNeuronSimulator):
    """Pass through the activations without trying to calibrate."""

    def __init__(self, uncalibrated_simulator: NeuronSimulator):
        super().__init__(uncalibrated_simulator)

    async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
        """Do nothing: there are no parameters to fit, so the wrapped simulator is not run."""

    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        """Do nothing: the pass-through mapping has no parameters."""

    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        """Return the values unchanged, as a new list."""
        # Always copy. Handing back the caller's own list would make the calibrated simulation
        # and its `uncalibrated_simulation` share one mutable list.
        return list(values)


class LinearCalibratedNeuronSimulator(CalibratedNeuronSimulator):
    """Find a linear mapping from uncalibrated activations to true activations.

    Should not change ev_correlation_score because it is invariant to linear transformations.
    """

    def __init__(self, uncalibrated_simulator: NeuronSimulator):
        super().__init__(uncalibrated_simulator)
        # Fitted regression; stays None until calibration has run.
        self._regression: Optional[linear_model.LinearRegression] = None

    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        """Fit a linear regression from uncalibrated activations to true activations."""
        self._regression = linear_model.LinearRegression()
        # The regression takes a 2-D feature matrix, hence the single-column reshape.
        self._regression.fit(uncalibrated_activations.reshape(-1, 1), true_activations)

    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        """
        Map `values` through the fitted regression.

        Raises:
            ValueError: if no calibration has been fitted yet.
        """
        if self._regression is None:
            raise ValueError("Must call calibrate() before apply_calibration")
        if len(values) == 0:
            # Nothing to transform; skip the model call.
            return []
        return self._regression.predict(np.reshape(np.array(values), (-1, 1))).tolist()


class PercentileMatchingCalibratedNeuronSimulator(CalibratedNeuronSimulator):
    """
    Map the nth percentile of the uncalibrated activations to the nth percentile of the true
    activations for all n.

    This will match the distribution of true activations on the calibration set, but will be
    overconfident outside of the calibration set.
    """

    def __init__(self, uncalibrated_simulator: NeuronSimulator):
        super().__init__(uncalibrated_simulator)
        # Sorted calibration samples; both stay None until calibration has run.
        self._uncalibrated_activations: Optional[np.ndarray] = None
        self._true_activations: Optional[np.ndarray] = None

    def _calibrate_from_flattened_activations(
        self,
        true_activations: np.ndarray,
        uncalibrated_activations: np.ndarray,
    ) -> None:
        """Store both samples sorted, so equal ranks pair up as interpolation points."""
        self._uncalibrated_activations = np.sort(uncalibrated_activations)
        self._true_activations = np.sort(true_activations)

    def apply_calibration(self, values: Sequence[float]) -> list[float]:
        """
        Map `values` by piecewise-linear interpolation between the sorted calibration samples.

        Raises:
            ValueError: if no calibration has been fitted yet.
        """
        if self._true_activations is None or self._uncalibrated_activations is None:
            raise ValueError("Must call calibrate() before apply_calibration")
        if len(values) == 0:
            # Nothing to transform; skip the interpolation call.
            return []
        return np.interp(
            np.array(values), self._uncalibrated_activations, self._true_activations
        ).tolist()
15 | 16 | 17 | Neuron 1 18 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels 19 | Activations: 20 | 21 | a 10 22 | b 0 23 | c 0 24 | 25 | 26 | d unknown 27 | e 10 28 | f 0 29 | 30 | 31 | 32 | 33 | Neuron 2 34 | Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION<|endofprompt|> 35 | Activations: 36 | 37 | 0 unknown 38 | 1 unknown 39 | 2 unknown 40 | 41 | """ 42 | prompt = ExplanationNeuronSimulator( 43 | model_name="text-davinci-003", 44 | explanation="EXPLANATION", 45 | few_shot_example_set=FewShotExampleSet.TEST, 46 | prompt_format=PromptFormat.INSTRUCTION_FOLLOWING, 47 | ).make_simulation_prompt( 48 | tokens=[str(x) for x in range(3)], 49 | ) 50 | assert prompt == expected_prompt 51 | 52 | 53 | def test_make_explanation_simulation_prompt_harmony_format() -> None: 54 | expected_prompt = [ 55 | HarmonyMessage( 56 | role=Role.SYSTEM, 57 | content="""We're studying neurons in a neural network. 58 | Each neuron looks for some particular thing in a short document. 59 | Look at summary of what the neuron does, and try to predict how it will fire on each token. 60 | 61 | The activation format is tokenactivation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0. 
62 | """, 63 | ), 64 | HarmonyMessage( 65 | role=Role.USER, 66 | content=""" 67 | 68 | Neuron 1 69 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels""", 70 | ), 71 | HarmonyMessage( 72 | role=Role.ASSISTANT, 73 | content=""" 74 | Activations: 75 | 76 | a 10 77 | b 0 78 | c 0 79 | 80 | 81 | d unknown 82 | e 10 83 | f 0 84 | 85 | 86 | """, 87 | ), 88 | HarmonyMessage( 89 | role=Role.USER, 90 | content=""" 91 | 92 | Neuron 2 93 | Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION""", 94 | ), 95 | HarmonyMessage( 96 | role=Role.ASSISTANT, 97 | content=""" 98 | Activations: 99 | 100 | 0 unknown 101 | 1 unknown 102 | 2 unknown 103 | 104 | """, 105 | ), 106 | ] 107 | prompt = ExplanationNeuronSimulator( 108 | model_name="gpt-4", 109 | explanation="EXPLANATION", 110 | few_shot_example_set=FewShotExampleSet.TEST, 111 | prompt_format=PromptFormat.HARMONY_V4, 112 | ).make_simulation_prompt( 113 | tokens=[str(x) for x in range(3)], 114 | ) 115 | 116 | assert isinstance(prompt, list) 117 | assert isinstance(prompt[0], dict) # Really a HarmonyMessage 118 | for actual_message, expected_message in zip(prompt, expected_prompt): 119 | assert actual_message["role"] == expected_message["role"] 120 | assert actual_message["content"] == expected_message["content"] 121 | assert prompt == expected_prompt 122 | 123 | 124 | def test_make_token_by_token_simulation_prompt_if_format() -> None: 125 | expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token. 126 | 127 | The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0. 
128 | 129 | Neuron 1 130 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels 131 | Activations: 132 | 133 | a 10 134 | b 0 135 | c 0 136 | 137 | 138 | d 0 139 | e 10 140 | f 0 141 | 142 | 143 | 144 | Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10. 145 | Neuron 2 146 | Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else 147 | Text: 148 | ghi 149 | 150 | Last token in the text: 151 | i 152 | 153 | Last token activation, considering the token in the context in which it appeared in the text: 154 | 10 155 | 156 | 157 | Neuron 3 158 | Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else 159 | Text: 160 | 01 161 | 162 | Last token in the text: 163 | 1 164 | 165 | Last token activation, considering the token in the context in which it appeared in the text: 166 | <|endofprompt|>""" 167 | prompt = ExplanationTokenByTokenSimulator( 168 | model_name="text-davinci-003", 169 | explanation="EXPLANATION", 170 | few_shot_example_set=FewShotExampleSet.TEST, 171 | prompt_format=PromptFormat.INSTRUCTION_FOLLOWING, 172 | ).make_single_token_simulation_prompt( 173 | tokens=[str(x) for x in range(3)], 174 | explanation="numbers and nothing else", 175 | token_index_to_score=1, 176 | ) 177 | assert prompt == expected_prompt 178 | 179 | 180 | def test_make_token_by_token_simulation_prompt_harmony_format() -> None: 181 | expected_prompt = [ 182 | HarmonyMessage( 183 | role=Role.SYSTEM, 184 | content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at an explanation of what the neuron does, and try to predict its activations on a particular token. 185 | 186 | The activation format is tokenactivation, and activations range from 0 to 10. Most activations will be 0. 
187 | 188 | """, 189 | ), 190 | HarmonyMessage( 191 | role=Role.USER, 192 | content="""Neuron 1 193 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels 194 | """, 195 | ), 196 | HarmonyMessage( 197 | role=Role.ASSISTANT, 198 | content="""Activations: 199 | 200 | a 10 201 | b 0 202 | c 0 203 | 204 | 205 | d 0 206 | e 10 207 | f 0 208 | 209 | 210 | 211 | """, 212 | ), 213 | HarmonyMessage( 214 | role=Role.SYSTEM, 215 | content="Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.", 216 | ), 217 | HarmonyMessage( 218 | role=Role.USER, 219 | content=""" 220 | Neuron 2 221 | Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else 222 | Text: 223 | ghi 224 | 225 | Last token in the text: 226 | i 227 | 228 | Last token activation, considering the token in the context in which it appeared in the text: 229 | """, 230 | ), 231 | HarmonyMessage( 232 | role=Role.ASSISTANT, 233 | content="""10 234 | 235 | """, 236 | ), 237 | HarmonyMessage( 238 | role=Role.USER, 239 | content=""" 240 | Neuron 3 241 | Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else 242 | Text: 243 | 01 244 | 245 | Last token in the text: 246 | 1 247 | 248 | Last token activation, considering the token in the context in which it appeared in the text: 249 | """, 250 | ), 251 | ] 252 | 253 | prompt = ExplanationTokenByTokenSimulator( 254 | model_name="gpt-4", 255 | explanation="EXPLANATION", 256 | few_shot_example_set=FewShotExampleSet.TEST, 257 | prompt_format=PromptFormat.HARMONY_V4, 258 | ).make_single_token_simulation_prompt( 259 | tokens=[str(x) for x in range(3)], 260 | explanation="numbers and nothing else", 261 | token_index_to_score=1, 262 | ) 263 | 264 | assert isinstance(prompt, list) 265 | assert isinstance(prompt[0], dict) # Really a HarmonyMessage 266 | for actual_message, 
expected_message in zip(prompt, expected_prompt): 267 | assert actual_message["role"] == expected_message["role"] 268 | assert actual_message["content"] == expected_message["content"] 269 | assert prompt == expected_prompt 270 | -------------------------------------------------------------------------------- /neuron-viewer/src/welcome.tsx: -------------------------------------------------------------------------------- 1 | import { useState, FormEvent } from "react" 2 | import { useNavigate } from "react-router-dom" 3 | 4 | function NeuronForm() { 5 | const [input_layer, setLayer] = useState(0) 6 | const [input_neuron, setNeuron] = useState(0) 7 | const navigate = useNavigate() 8 | 9 | const knownGoodNeurons = [ 10 | /************** 11 | /* well explained + interesting 12 | ***************/ 13 | {heading: 'Somewhat well explained by GPT-4', layer: 0, neuron: 0, label: ''}, 14 | {layer: 5, neuron: 131, label: "citations", description: "citations, especially biblical and legal"}, 15 | {layer: 12, neuron: 847, label: "numbers in fractions", description: "numbers in fractions"}, // 16 | {layer: 12, neuron: 5820, label: "short flags", description: "single letter command line flags"}, // 17 | {layer: 14, neuron: 417, label: "doing things right", description: "words and phrases related to performing actions correctly or properly"}, // score 0.42 18 | {layer: 15, neuron: 4538, label: "leading transitions", description: "transition words at the start of documents"}, 19 | {layer: 17, neuron: 3218, label: "success", description: "expressions of completion or success"}, // score 0.38 20 | {layer: 18, neuron: 5302, label: "X *by*", description: "the word 'by' in phrases indicating side by side or sequential events."}, // score 0.48 21 | {layer: 19, neuron: 1377, label: "similes", description: "comparisons and analogies, often using the word 'like'"}, // score 0.42 22 | {layer: 21, neuron: 2932, label: "Canada", description: "references to Canadian people, places, and 
entities"}, // score 0.78 23 | {layer: 25, neuron: 2602, label: "similes", description: "descriptive comparisons, especially similes"}, // score 0.40 24 | {layer: 25, neuron: 4870, label: "certainty", description: "phrases related to certainty and confidence."}, // score 0.37 25 | {layer: 30, neuron: 28, label: "times", description: "specific times (with hours and minutes)"}, 26 | // https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/5/neurons/2326 27 | {heading: 'Partially explained by GPT-4', layer: 0, neuron: 0, label: ''}, 28 | {layer: 0, neuron: 816, label: "Marvel comics vibes", description: "language and context related to Marvel comics, movies, and characters, as well as other superhero-themed content"}, // score 0.44 29 | {layer: 0, neuron: 742, label: "Second token 'and'", description: "'and', 'in', and punctuation at the second token"}, 30 | {layer: 4, neuron: 4342, label: "token counter", description: "counting repeated occurrences of a token"}, 31 | {layer: 5, neuron: 2326, label: "rhymes with 'at'", description: "syllables rhyming with 'at', sometimes 'it', 'et', 'ot'"}, 32 | {layer: 5, neuron: 4492, label: "leading 'an'", description: "sentences that start with 'an'"}, // score 0.77 33 | {layer: 6, neuron: 3251, label: "not all", description: "not all"}, 34 | {layer: 10, neuron: 2851, label: "leading acronyms", description: "acronyms after punctuation or newlines"}, 35 | {layer: 12, neuron: 2884, label: "hypothetical had", description: "had in hypothetical contexts"}, // 36 | {layer: 14, neuron: 3539, label: "long sequences", description: "long sequences of stuff"}, 37 | {layer: 14, neuron: 3822, label: "X by/after *X*", description: "noun repetitions separated by 'by' or 'after'"}, 38 | {layer: 21, neuron: 3982, label: "any *and* all", description: "any/anything *and/&* all/everything"}, 39 | {layer: 26, neuron: 20, label: "truth, skin, or sun", description: "truth, skin, or sun"}, 40 | // 
layer=18&neuron=5302 41 | /************** 42 | /* boring 43 | ***************/ 44 | /************** 45 | /* poorly explained + interesting 46 | ***************/ 47 | {heading: 'Poorly explained by GPT-4', layer: 0, neuron: 0, label: ''}, 48 | // Actually activates for negated version “not so much … as” even when not so much is fairly far apart 49 | // another "not all": 13&neuron=1352 50 | // {layer: 0, neuron: 2823, label: "Hillary email leak vibes", description: "contexts related to Hillary Clinton leaked emails"}, // score ?? 51 | // {layer: 12, neuron: 3718, label: "comparative phrases and negations", description: "comparative phrases and negations"}, // score 0.12 52 | {layer: 13, neuron: 410, label: "N and N+1", description: "a number following its predecessor"}, // score ?? 53 | {layer: 13, neuron: 979, label: "subtle plurals", description: "subtle/nonobvious plurals"}, // score ?? 54 | // slash after number 12&neuron=847 55 | // numbers predicting slash: 14&neuron=92 56 | // 0&neuron=2823 57 | {layer: 14, neuron: 1251, label: "subjunctive verbs", description: "verbs in subjunctive mood"}, // score ?? 58 | {layer: 16, neuron: 518, label: "pattern breaks", description: "tokens that break an established pattern in an ongoing list"}, // score 0.2 with totally wrong explanation 59 | {layer: 17, neuron: 821, label: "idioms", description: "idioms"}, 60 | {layer: 18, neuron: 3481, label: "post-typo", description: "first token following a typo"}, // score ?? 61 | {layer: 18, neuron: 3552, label: "repeated text", description: "repeated text"}, // score ?? 
62 | // another shared last names: https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/20/neurons/3164 63 | {layer: 19, neuron: 1763, label: "shared last names", description: "last names when two different people sharing last name are mentioned"}, // score 0.36 64 | {layer: 20, neuron: 4334, label: "previous break", description: "tokens that previously preceded a linebreak"}, // score ?? 65 | {layer: 27, neuron: 116, label: "MTG vibes", description: "Magic the Gathering contexts"}, // score ?? 66 | {layer: 35, neuron: 1523, label: "NBA name predictor", description: "NBA person/player name predictor"}, // score ?? 67 | // {layer: 36, neuron: 2275, label: "she predictor", description: "prediction of the token 'she'"}, // score ?? 68 | // {layer: 36, neuron: 5107, label: "Mormon vibes", description: "Mormon related context"}, // score ?? 69 | // ] predictor 40&neuron=4505 70 | {layer: 46, neuron: 2181, label: "C predictor", description: "prediction of the token 'C'"}, // score ?? 71 | ] 72 | 73 | const handleSubmit = (e: FormEvent) => { 74 | e.preventDefault() 75 | navigate(`/layers/${input_layer}/neurons/${input_neuron}`) 76 | return false 77 | } 78 | 79 | const handleNeuronClick = (layer: number, neuron: number) => { 80 | navigate(`/layers/${layer}/neurons/${neuron}`) 81 | } 82 | 83 | const feelingLuckySubmit = () => { 84 | const layer = Math.floor(Math.random() * 48); 85 | const neuron = Math.floor(Math.random() * 6400); 86 | navigate(`/layers/${layer}/neurons/${neuron}`) 87 | return false 88 | } 89 | 90 | 91 | return ( 92 |
93 |

Welcome! Pick a neuron:

94 |
99 | Layer setLayer(parseInt(e.target.value))} 107 | className="border border-gray-300 rounded-md p-2" 108 | /> 109 | Index setNeuron(parseInt(e.target.value))} 117 | className="border border-gray-300 rounded-md p-2" 118 | /> 119 | 125 |
126 | 132 |
133 |

Interesting neurons:

134 |
135 |
138 | {knownGoodNeurons.map(({ heading, layer, neuron, label, description }, j) => ( 139 | heading ?

140 | {heading} 141 |

: 150 | ))} 151 |
152 |
153 |
154 |
155 | ) 156 | } 157 | 158 | export default NeuronForm 159 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/explanations/explanations.py: -------------------------------------------------------------------------------- 1 | # Dataclasses and enums for storing neuron explanations, their scores, and related data. Also, 2 | # related helper functions. 3 | 4 | from __future__ import annotations 5 | 6 | import json 7 | from dataclasses import dataclass 8 | from enum import Enum 9 | from typing import List, Optional, Union 10 | 11 | import blobfile as bf 12 | import boostedblob as bbb 13 | from neuron_explainer.activations.activations import NeuronId 14 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass 15 | 16 | 17 | class ActivationScale(str, Enum): 18 | """Which "units" are stored in the expected_activations/distribution_values fields of a 19 | SequenceSimulation. 20 | 21 | This enum identifies whether the values represent real activations of the neuron or something 22 | else. Different scales are not necessarily related by a linear transformation. 23 | """ 24 | 25 | NEURON_ACTIVATIONS = "neuron_activations" 26 | """Values represent real activations of the neuron.""" 27 | SIMULATED_NORMALIZED_ACTIVATIONS = "simulated_normalized_activations" 28 | """ 29 | Values represent simulated activations of the neuron, normalized to the range [0, 10]. This 30 | scale is arbitrary and should not be interpreted as a neuron activation. 
31 | """ 32 | 33 | 34 | @register_dataclass 35 | @dataclass 36 | class SequenceSimulation(FastDataclass): 37 | """The result of a simulation of neuron activations on one text sequence.""" 38 | 39 | tokens: list[str] 40 | """The sequence of tokens that was simulated.""" 41 | expected_activations: list[float] 42 | """Expected value of the possibly-normalized activation for each token in the sequence.""" 43 | activation_scale: ActivationScale 44 | """What scale is used for values in the expected_activations field.""" 45 | distribution_values: list[list[float]] 46 | """ 47 | For each token in the sequence, a list of values from the discrete distribution of activations 48 | produced from simulation. Tokens will be included here if and only if they are in the top K=15 49 | tokens predicted by the simulator, and excluded otherwise. 50 | 51 | May be transformed to another unit by calibration. When we simulate a neuron, we produce a 52 | discrete distribution with values in the arbitrary discretized space of the neuron, e.g. 10% 53 | chance of 0, 70% chance of 1, 20% chance of 2. Which we store as distribution_values = 54 | [0, 1, 2], distribution_probabilities = [0.1, 0.7, 0.2]. When we transform the distribution to 55 | the real activation units, we can correspondingly transform the values of this distribution 56 | to get a distribution in the units of the neuron. e.g. if the mapping from the discretized space 57 | to the real activation unit of the neuron is f(x) = x/2, then the distribution becomes 10% 58 | chance of 0, 70% chance of 0.5, 20% chance of 1. Which we store as distribution_values = 59 | [0, 0.5, 1], distribution_probabilities = [0.1, 0.7, 0.2]. 60 | """ 61 | distribution_probabilities: list[list[float]] 62 | """ 63 | For each token in the sequence, the probability of the corresponding value in 64 | distribution_values. 
65 | """ 66 | 67 | uncalibrated_simulation: Optional["SequenceSimulation"] = None 68 | """The result of the simulation before calibration.""" 69 | 70 | 71 | @register_dataclass 72 | @dataclass 73 | class ScoredSequenceSimulation(FastDataclass): 74 | """ 75 | SequenceSimulation result with a score (for that sequence only) and ground truth activations. 76 | """ 77 | 78 | simulation: SequenceSimulation 79 | """The result of a simulation of neuron activations.""" 80 | true_activations: List[float] 81 | """Ground truth activations on the sequence (not normalized)""" 82 | ev_correlation_score: float 83 | """ 84 | Correlation coefficient between the expected values of the normalized activations from the 85 | simulation and the unnormalized true activations of the neuron on the text sequence. 86 | """ 87 | rsquared_score: Optional[float] = None 88 | """R^2 of the simulated activations.""" 89 | absolute_dev_explained_score: Optional[float] = None 90 | """ 91 | Score based on absolute difference between real and simulated activations. 92 | absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real)) 93 | """ 94 | 95 | 96 | @register_dataclass 97 | @dataclass 98 | class ScoredSimulation(FastDataclass): 99 | """Result of scoring a neuron simulation on multiple sequences.""" 100 | 101 | scored_sequence_simulations: List[ScoredSequenceSimulation] 102 | """ScoredSequenceSimulation for each sequence""" 103 | ev_correlation_score: Optional[float] = None 104 | """ 105 | Correlation coefficient between the expected values of the normalized activations from the 106 | simulation and the unnormalized true activations on a dataset created from all score_results. 107 | (Note that this is not equivalent to averaging across sequences.) 
108 | """ 109 | rsquared_score: Optional[float] = None 110 | """R^2 of the simulated activations.""" 111 | absolute_dev_explained_score: Optional[float] = None 112 | """ 113 | Score based on absolute difference between real and simulated activations. 114 | absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real)). 115 | """ 116 | 117 | def get_preferred_score(self) -> Optional[float]: 118 | """ 119 | This method may return None in cases where the score is undefined, for example if the 120 | normalized activations were all zero, yielding a correlation coefficient of NaN. 121 | """ 122 | return self.ev_correlation_score 123 | 124 | 125 | @register_dataclass 126 | @dataclass 127 | class ScoredExplanation(FastDataclass): 128 | """Simulator parameters and the results of scoring it on multiple sequences""" 129 | 130 | explanation: str 131 | """The explanation used for simulation.""" 132 | 133 | scored_simulation: ScoredSimulation 134 | """Result of scoring the neuron simulator on multiple sequences.""" 135 | 136 | def get_preferred_score(self) -> Optional[float]: 137 | """ 138 | This method may return None in cases where the score is undefined, for example if the 139 | normalized activations were all zero, yielding a correlation coefficient of NaN. 
@register_dataclass
@dataclass
class NeuronSimulationResults(FastDataclass):
    """Scored explanations for a single neuron, stored alongside the neuron's identifier."""

    neuron_id: NeuronId
    scored_explanations: list[ScoredExplanation]


def load_neuron_explanations(
    explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> Optional[NeuronSimulationResults]:
    """Load scored explanations for the specified neuron.

    Reads ``<explanations_path>/<layer_index>/<neuron_index>.jsonl`` and deserializes its first
    line only. Returns None when the file does not exist or has no lines.
    """
    path = bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")
    if bf.exists(path):
        with bf.BlobFile(path) as handle:
            # Only the first line is ever consumed; any further lines are ignored.
            first_line = next(iter(handle), None)
            if first_line is not None:
                return loads(first_line)
    return None


@bbb.ensure_session
async def load_neuron_explanations_async(
    explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> Optional[NeuronSimulationResults]:
    """Load scored explanations for the specified neuron, asynchronously."""
    explanation_file = bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")
    return await read_explanation_file(explanation_file)


@bbb.ensure_session
async def read_file(filename: str) -> Optional[str]:
    """Read a file expected to hold exactly one non-empty line and return that line.

    Returns None, after printing a message, when the file is not found. Asserts that exactly
    one non-empty line is present once the contents are decoded as UTF-8.
    """
    try:
        payload = await bbb.read.read_single(filename)
    except FileNotFoundError:
        print(f"Could not read {filename}")
        return None
    non_empty_lines = [chunk for chunk in payload.decode("utf-8").split("\n") if chunk]
    assert len(non_empty_lines) == 1, filename
    return non_empty_lines[0]


@bbb.ensure_session
async def read_explanation_file(explanation_filename: str) -> Optional[NeuronSimulationResults]:
    """Load scored explanations from the given filename, asynchronously.

    Returns None when read_file returns None for the file.
    """
    contents = await read_file(explanation_filename)
    if contents is None:
        return None
    return loads(contents)


@bbb.ensure_session
async def read_json_file(filename: str) -> Optional[dict]:
    """Read the contents of the given file as a JSON object, asynchronously.

    Returns None when read_file returns None for the file.
    """
    contents = await read_file(filename)
    if contents is None:
        return None
    return json.loads(contents)


def get_numerical_subdirs(dataset_path: str) -> list[str]:
    """Return the names of all numbered subdirectories of dataset_path, in numeric order.

    Used to get all layer directories in an explanation directory.
    """
    numeric_names = []
    for entry in bf.listdir(dataset_path):
        if bf.isdir(bf.join(dataset_path, entry)) and entry.isnumeric():
            numeric_names.append(int(entry))
    # Sort as integers so that e.g. "10" comes after "9", then hand back strings.
    numeric_names.sort()
    return [str(value) for value in numeric_names]


def get_sorted_neuron_indices_from_explanations(
    explanations_path: str, layer: Union[str, int]
) -> list[int]:
    """Return the indices of all neurons in this layer, in ascending order.

    An index is taken from the part of each file name before the first "."; entries whose
    stem is not numeric are skipped.
    """
    layer_dir = bf.join(explanations_path, str(layer))
    neuron_indices = []
    for filename in bf.listdir(layer_dir):
        stem = filename.split(".")[0]
        if stem.isnumeric():
            neuron_indices.append(int(stem))
    return sorted(neuron_indices)
3 | 4 | import math 5 | from dataclasses import dataclass, field 6 | from typing import List, Optional, Union 7 | 8 | import urllib.request 9 | import blobfile as bf 10 | import boostedblob as bbb 11 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass 12 | from neuron_explainer.azure import standardize_azure_url 13 | 14 | 15 | @register_dataclass 16 | @dataclass 17 | class ActivationRecord(FastDataclass): 18 | """Collated lists of tokens and their activations for a single neuron.""" 19 | 20 | tokens: List[str] 21 | """Tokens in the text sequence, represented as strings.""" 22 | activations: List[float] 23 | """Raw activation values for the neuron on each token in the text sequence.""" 24 | 25 | 26 | @register_dataclass 27 | @dataclass 28 | class NeuronId(FastDataclass): 29 | """Identifier for a neuron in an artificial neural network.""" 30 | 31 | layer_index: int 32 | """The index of layer the neuron is in. The first layer used during inference has index 0.""" 33 | neuron_index: int 34 | """The neuron's index within in its layer. 
def _check_slices(
    slices_by_split: dict[str, slice],
    expected_num_values: int,
) -> None:
    """Assert that the slices are disjoint and fully cover the intended range.

    Args:
        slices_by_split: Mapping from split name to the slice selecting that split's elements.
        expected_num_values: Length of the sequence the slices are meant to partition.

    Raises:
        AssertionError: If the slice lengths do not add up to expected_num_values, or if the
            union of the selected indices is not exactly range(expected_num_values).
    """
    indices: set[int] = set()
    sum_of_slice_lengths = 0
    for s in slices_by_split.values():
        # Applying the slice to a range yields exactly the indices the slice would select.
        subrange = range(expected_num_values)[s]
        sum_of_slice_lengths += len(subrange)
        indices.update(subrange)
    assert (
        sum_of_slice_lengths == expected_num_values
    ), f"{sum_of_slice_lengths=} != {expected_num_values=}"
    # Full coverage means every index appears. Together with the length check above this also
    # proves the slices are disjoint. Building the set directly stays well-defined when there
    # are no splits, where set.union() called with zero arguments would raise TypeError.
    expected_indices = set(range(expected_num_values))
    assert indices == expected_indices, f"{indices=} != {expected_indices=}"


def get_slices_for_splits(
    splits: list[str],
    num_activation_records_per_split: int,
) -> dict[str, slice]:
    """
    Get equal-sized interleaved subsets for each of a list of splits, given the number of elements
    to include in each split.

    With k splits, the split at position i receives indices i, i + k, i + 2k, ... below
    k * num_activation_records_per_split. An empty list of splits yields an empty dict.

    Raises:
        AssertionError: If the resulting slices do not partition the index range, for example
            when a split name is repeated.
    """
    stride = len(splits)
    num_activation_records_for_even_splits = num_activation_records_per_split * stride
    slices_by_split = {
        split: slice(split_index, num_activation_records_for_even_splits, stride)
        for split_index, split in enumerate(splits)
    }
    _check_slices(
        slices_by_split=slices_by_split,
        expected_num_values=num_activation_records_for_even_splits,
    )
    return slices_by_split
@register_dataclass
@dataclass
class NeuronRecord(FastDataclass):
    """Neuron-indexed activation data, including summary stats and notable activation records."""

    neuron_id: NeuronId
    """Identifier for the neuron."""

    random_sample: list[ActivationRecord] = field(default_factory=list)
    """
    Random activation records for this neuron. The random sample is independent from those used for
    other neurons.
    """
    random_sample_by_quantile: Optional[list[list[ActivationRecord]]] = None
    """
    Random samples of activation records in each of the specified quantiles. None if quantile
    tracking is disabled.
    """
    quantile_boundaries: Optional[list[float]] = None
    """Boundaries of the quantiles used to generate the random_sample_by_quantile field."""

    # Moments of activations
    mean: Optional[float] = math.nan
    variance: Optional[float] = math.nan
    skewness: Optional[float] = math.nan
    kurtosis: Optional[float] = math.nan

    most_positive_activation_records: list[ActivationRecord] = field(default_factory=list)
    """
    Activation records with the most positive figure of merit value for this neuron over all dataset
    examples.
    """

    @property
    def max_activation(self) -> float:
        """Return the maximum activation value over all top-activating activation records.

        Raises ValueError (from max) when there are no top-activating records.
        """
        return max(max(ar.activations) for ar in self.most_positive_activation_records)

    def _get_top_activation_slices(
        self, activation_record_slice_params: ActivationRecordSliceParams
    ) -> dict[str, slice]:
        """Return one interleaved slice per split over most_positive_activation_records."""
        splits = ["train", "calibration", "valid", "test"]
        n_examples_per_split = activation_record_slice_params.n_examples_per_split
        if n_examples_per_split is None:
            # No explicit size requested: use as many records per split as divide evenly.
            n_examples_per_split = len(self.most_positive_activation_records) // len(splits)
        assert len(self.most_positive_activation_records) >= n_examples_per_split * len(splits)
        return get_slices_for_splits(splits, n_examples_per_split)

    def _get_random_activation_slices(
        self, activation_record_slice_params: ActivationRecordSliceParams
    ) -> dict[str, slice]:
        """Return one interleaved slice per split over random_sample (there is no train split)."""
        splits = ["calibration", "valid", "test"]
        n_examples_per_split = activation_record_slice_params.n_examples_per_split
        if n_examples_per_split is None:
            n_examples_per_split = len(self.random_sample) // len(splits)
        # NOTE: this assert could trigger on some old datasets with only 10 random samples, in which case you may have to remove "test" from the set of splits
        assert len(self.random_sample) >= n_examples_per_split * len(splits)
        return get_slices_for_splits(splits, n_examples_per_split)

    def _top_and_random_records(
        self,
        split: str,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """Return the top-activating records for `split` followed by its random records."""
        top_slice = self._get_top_activation_slices(activation_record_slice_params)[split]
        random_slice = self._get_random_activation_slices(activation_record_slice_params)[split]
        return self.most_positive_activation_records[top_slice] + self.random_sample[random_slice]

    def train_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Train split, typically used for generating explanations. Consists exclusively of
        top-activating records since context window limitations make it difficult to include
        random records.
        """
        return self.most_positive_activation_records[
            self._get_top_activation_slices(activation_record_slice_params)["train"]
        ]

    def calibration_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Calibration split, typically used for calibrating neuron simulations. See
        http://go/neuron_explanation_methodology for an explanation of calibration. Consists of
        top-activating records and random records in a 1:1 ratio.
        """
        return self._top_and_random_records("calibration", activation_record_slice_params)

    def valid_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Validation split, typically used for evaluating explanations, either automatically with
        simulation + correlation coefficient scoring, or manually by humans. Consists of
        top-activating records and random records in a 1:1 ratio.
        """
        return self._top_and_random_records("valid", activation_record_slice_params)

    def test_activation_records(
        self,
        activation_record_slice_params: ActivationRecordSliceParams,
    ) -> list[ActivationRecord]:
        """
        Test split, typically used for explanation evaluations that can't use the validation split.
        Consists of top-activating records and random records in a 1:1 ratio.
        """
        return self._top_and_random_records("test", activation_record_slice_params)


def _require_neuron_record(candidate: object) -> NeuronRecord:
    """Return `candidate` if it is a NeuronRecord; otherwise raise ValueError."""
    if not isinstance(candidate, NeuronRecord):
        raise ValueError(
            "Stored data incompatible with current version of NeuronRecord dataclass."
        )
    return candidate


def neuron_exists(
    dataset_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> bool:
    """Return whether the specified neuron exists.

    Checks for ``<dataset_path>/neurons/<layer_index>/<neuron_index>.json``.
    """
    # NOTE(review): this looks under a "neurons" subdirectory, whereas load_neuron reads
    # <dataset_path>/<layer_index>/<neuron_index>.json directly -- confirm both layouts exist.
    file = bf.join(dataset_path, "neurons", str(layer_index), f"{neuron_index}.json")
    return bf.exists(file)


def load_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """Load the NeuronRecord for the specified neuron.

    Fetches ``<dataset_path>/<layer_index>/<neuron_index>.json`` with urllib after passing the
    URL through standardize_azure_url.

    Raises:
        ValueError: If the stored data does not deserialize to a NeuronRecord.
    """
    url = "/".join([dataset_path, str(layer_index), f"{neuron_index}.json"])
    url = standardize_azure_url(url)
    with urllib.request.urlopen(url) as f:
        neuron_record = loads(f.read())
    return _require_neuron_record(neuron_record)


@bbb.ensure_session
async def load_neuron_async(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "az://openaipublic/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """Async version of load_neuron."""
    file = bf.join(dataset_path, str(layer_index), f"{neuron_index}.json")
    return await read_neuron_file(file)


@bbb.ensure_session
async def read_neuron_file(neuron_filename: str) -> NeuronRecord:
    """Like load_neuron_async, but takes a raw neuron filename.

    Raises:
        ValueError: If the stored data does not deserialize to a NeuronRecord.
    """
    raw_contents = await bbb.read.read_single(neuron_filename)
    return _require_neuron_record(loads(raw_contents.decode("utf-8")))


def get_sorted_neuron_indices(dataset_path: str, layer_index: Union[str, int]) -> List[int]:
    """Returns the indices of all neurons in this layer, in ascending order.

    Indices come from the part of each file name before the first "." under
    ``<dataset_path>/neurons/<layer_index>``; entries with a non-numeric stem are skipped.
    """
    layer_dir = bf.join(dataset_path, "neurons", str(layer_index))
    stems = (f.split(".")[0] for f in bf.listdir(layer_dir))
    return sorted(int(stem) for stem in stems if stem.isnumeric())
274 | """ 275 | return [ 276 | str(x) 277 | for x in sorted( 278 | [int(x) for x in bf.listdir(bf.join(dataset_path, "neurons")) if x.isnumeric()] 279 | ) 280 | ] 281 | -------------------------------------------------------------------------------- /neuron-explainer/neuron_explainer/explanations/explainer.py: -------------------------------------------------------------------------------- 1 | """Uses API calls to generate explanations of neuron behavior.""" 2 | 3 | from __future__ import annotations 4 | 5 | import logging 6 | import re 7 | from abc import ABC, abstractmethod 8 | from enum import Enum 9 | from typing import Any, Optional, Sequence, Union 10 | 11 | from neuron_explainer.activations.activation_records import ( 12 | calculate_max_activation, 13 | format_activation_records, 14 | non_zero_activation_proportion, 15 | ) 16 | from neuron_explainer.activations.activations import ActivationRecord 17 | from neuron_explainer.api_client import ApiClient 18 | from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet 19 | from neuron_explainer.explanations.prompt_builder import ( 20 | HarmonyMessage, 21 | PromptBuilder, 22 | PromptFormat, 23 | Role, 24 | ) 25 | from neuron_explainer.explanations.token_space_few_shot_examples import ( 26 | TokenSpaceFewShotExampleSet, 27 | ) 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | 32 | # TODO(williamrs): This prefix may not work well for some things, like predicting the next token. 33 | # Try other options like "this neuron activates for". 34 | EXPLANATION_PREFIX = "the main thing this neuron does is find" 35 | 36 | 37 | def _split_numbered_list(text: str) -> list[str]: 38 | """Split a numbered list into a list of strings.""" 39 | lines = re.split(r"\n\d+\.", text) 40 | # Strip the leading whitespace from each line. 
def _remove_final_period(text: str) -> str:
    """Strip a final period or period-space from a string.

    At most one such suffix is removed; any other text is returned unchanged.
    """
    for suffix in (".", ". "):
        if text.endswith(suffix):
            return text[: -len(suffix)]
    return text


class ContextSize(int, Enum):
    """Context sizes that prompts can be budgeted against (the values are token counts)."""

    TWO_K = 2049
    FOUR_K = 4097

    @classmethod
    def from_int(cls, i: int) -> "ContextSize":
        """Return the member whose value equals `i`.

        Raises:
            ValueError: If no member has that value.
        """
        try:
            # Enum's by-value lookup replaces the hand-written scan over the members.
            return cls(i)
        except ValueError:
            # Re-raise with the message this method has always produced.
            raise ValueError(f"{i} is not a valid ContextSize") from None


# Model names that are accepted together with PromptFormat.HARMONY_V4.
HARMONY_V4_MODELS = ["gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"]
81 | context_size: ContextSize = ContextSize.FOUR_K, 82 | max_concurrent: Optional[int] = 10, 83 | cache: bool = False, 84 | ): 85 | if prompt_format == PromptFormat.HARMONY_V4: 86 | assert model_name in HARMONY_V4_MODELS 87 | elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]: 88 | assert model_name not in HARMONY_V4_MODELS 89 | else: 90 | raise ValueError(f"Unhandled prompt format {prompt_format}") 91 | 92 | self.model_name = model_name 93 | self.prompt_format = prompt_format 94 | self.context_size = context_size 95 | self.client = ApiClient(model_name=model_name, max_concurrent=max_concurrent, cache=cache) 96 | 97 | async def generate_explanations( 98 | self, 99 | *, 100 | num_samples: int = 5, 101 | max_tokens: int = 60, 102 | temperature: float = 1.0, 103 | top_p: float = 1.0, 104 | **prompt_kwargs: Any, 105 | ) -> list[Any]: 106 | """Generate explanations based on subclass-specific input data.""" 107 | prompt = self.make_explanation_prompt(max_tokens_for_completion=max_tokens, **prompt_kwargs) 108 | 109 | generate_kwargs: dict[str, Any] = { 110 | "n": num_samples, 111 | "max_tokens": max_tokens, 112 | "temperature": temperature, 113 | "top_p": top_p, 114 | } 115 | 116 | if self.prompt_format == PromptFormat.HARMONY_V4: 117 | assert isinstance(prompt, list) 118 | assert isinstance(prompt[0], dict) # Really a HarmonyMessage 119 | generate_kwargs["messages"] = prompt 120 | else: 121 | assert isinstance(prompt, str) 122 | generate_kwargs["prompt"] = prompt 123 | 124 | response = await self.client.make_request(**generate_kwargs) 125 | logger.debug("response in generate_explanations is %s", response) 126 | 127 | if self.prompt_format == PromptFormat.HARMONY_V4: 128 | explanations = [x["message"]["content"] for x in response["choices"]] 129 | elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]: 130 | explanations = [x["text"] for x in response["choices"]] 131 | else: 132 | raise ValueError(f"Unhandled prompt 
@abstractmethod
def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
    """
    Create a prompt to send to the API to generate one or more explanations.

    A prompt can be a simple string, or a list of HarmonyMessages, depending on the PromptFormat
    used by this instance. Subclasses define which keyword arguments they accept;
    ``generate_explanations`` always supplies ``max_tokens_for_completion``.
    """
    ...

def postprocess_explanations(
    self, completions: list[str], prompt_kwargs: dict[str, Any]
) -> list[Any]:
    """
    Postprocess the completions returned by the API into a list of explanations.

    ``prompt_kwargs`` are the keyword arguments the prompt was built from, so that subclasses
    can adapt their parsing to the prompt variant. The base implementation returns
    ``completions`` unchanged.
    """
    return completions  # no-op by default

def _prompt_is_too_long(
    self, prompt_builder: PromptBuilder, max_tokens_for_completion: int
) -> bool:
    """
    Return True if the prompt plus the completion budget exceeds the context size.

    The prompt length is measured in tokens for this instance's prompt format. A warning is
    logged through the module logger (rather than printed) when the budget is exceeded, so
    that library users control where the diagnostic goes.
    """
    # We'll get a context size error if the prompt itself plus the maximum number of tokens for
    # the completion is longer than the context size.
    prompt_length = prompt_builder.prompt_length_in_tokens(self.prompt_format)
    context_limit = self.context_size.value
    if prompt_length + max_tokens_for_completion <= context_limit:
        return False
    logger.warning(
        "Prompt is too long: %s + %s > %s",
        prompt_length,
        max_tokens_for_completion,
        context_limit,
    )
    return True
class TokenActivationPairExplainer(NeuronExplainer):
    """
    Generate explanations of neuron behavior using a prompt with lists of token/activation pairs.
    """

    def __init__(
        self,
        model_name: str,
        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
        # This parameter lets us adjust the length of the prompt when we're generating explanations
        # using older models with shorter context windows. In the future we can use it to experiment
        # with 8k+ context windows.
        context_size: ContextSize = ContextSize.FOUR_K,
        few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,
        repeat_non_zero_activations: bool = True,
        max_concurrent: Optional[int] = 10,
        cache: bool = False,
    ):
        """
        Configure the explainer.

        ``few_shot_example_set`` selects the few-shot examples placed in the prompt, and
        ``repeat_non_zero_activations`` controls whether sparse activation lists are repeated
        with the zeros filtered out. The remaining arguments are handed to the base class.
        """
        # Forward context_size to the base class (which stores it) instead of assigning it
        # afterwards, matching how TokenSpaceRepresentationExplainer initializes itself.
        super().__init__(
            model_name=model_name,
            prompt_format=prompt_format,
            context_size=context_size,
            max_concurrent=max_concurrent,
            cache=cache,
        )
        self.few_shot_example_set = few_shot_example_set
        self.repeat_non_zero_activations = repeat_non_zero_activations

    def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
        """
        Build the explanation prompt for one neuron.

        Accepted keyword arguments:
          all_activation_records: activation records of the neuron to explain (required).
          max_activation: passed through to the per-neuron prompt section (required).
          max_tokens_for_completion: completion budget used for the length check (required).
          numbered_list_of_n_explanations: when set, solicit a numbered list with that many
            explanations; must be positive. Defaults to None.
          omit_n_activation_records: number of few-shot activation records to drop to shorten
            the prompt. Defaults to 0; increased automatically when the prompt is too long.

        Any other keyword argument trips an assertion.
        """
        original_kwargs = kwargs.copy()
        all_activation_records: Sequence[ActivationRecord] = kwargs.pop("all_activation_records")
        max_activation: float = kwargs.pop("max_activation")
        numbered_list_of_n_explanations: Optional[int] = kwargs.pop(
            "numbered_list_of_n_explanations", None
        )
        if numbered_list_of_n_explanations is not None:
            assert numbered_list_of_n_explanations > 0, numbered_list_of_n_explanations
        # This parameter lets us dynamically shrink the prompt if our initial attempt to create it
        # results in something that's too long. It's only implemented for the 4k context size.
        omit_n_activation_records: int = kwargs.pop("omit_n_activation_records", 0)
        max_tokens_for_completion: int = kwargs.pop("max_tokens_for_completion")
        assert not kwargs, f"Unexpected kwargs: {kwargs}"

        prompt_builder = PromptBuilder()
        # NOTE(review): "tokenactivation" below reads as if a separator between "token" and
        # "activation" was lost; confirm against the output of format_activation_records.
        prompt_builder.add_message(
            Role.SYSTEM,
            "We're studying neurons in a neural network. Each neuron looks for some particular "
            "thing in a short document. Look at the parts of the document the neuron activates for "
            "and summarize in a single sentence what the neuron is looking for. Don't list "
            "examples of words.\n\nThe activation format is tokenactivation. Activation "
            "values range from 0 to 10. A neuron finding what it's looking for is represented by a "
            "non-zero activation value. The higher the activation value, the stronger the match.",
        )
        few_shot_examples = self.few_shot_example_set.get_examples()
        num_omitted_activation_records = 0
        for i, few_shot_example in enumerate(few_shot_examples):
            few_shot_activation_records = few_shot_example.activation_records
            if self.context_size == ContextSize.TWO_K:
                # If we're using a 2k context window, we only have room for one activation record
                # per few-shot example. (Two few-shot examples with one activation record each seems
                # to work better than one few-shot example with two activation records, in local
                # testing.)
                few_shot_activation_records = few_shot_activation_records[:1]
            elif (
                self.context_size == ContextSize.FOUR_K
                and num_omitted_activation_records < omit_n_activation_records
            ):
                # Drop the last activation record for this few-shot example to save tokens, assuming
                # there are at least two activation records.
                if len(few_shot_activation_records) > 1:
                    logger.warning("Omitting activation record from few-shot example %d", i)
                    few_shot_activation_records = few_shot_activation_records[:-1]
                    num_omitted_activation_records += 1
            self._add_per_neuron_explanation_prompt(
                prompt_builder,
                few_shot_activation_records,
                i,
                calculate_max_activation(few_shot_example.activation_records),
                numbered_list_of_n_explanations=numbered_list_of_n_explanations,
                explanation=few_shot_example.explanation,
            )

        # If we're using a 2k context window, we only have room for two of the activation records.
        if self.context_size == ContextSize.TWO_K:
            records_to_explain = all_activation_records[:2]
        else:
            records_to_explain = all_activation_records
        self._add_per_neuron_explanation_prompt(
            prompt_builder,
            records_to_explain,
            len(few_shot_examples),
            max_activation,
            numbered_list_of_n_explanations=numbered_list_of_n_explanations,
            explanation=None,
        )
        # If the prompt is too long *and* we omitted the specified number of activation records, try
        # again, omitting one more. (If we didn't make the specified number of omissions, we're out
        # of opportunities to omit records, so we just return the prompt as-is.)
        if (
            self._prompt_is_too_long(prompt_builder, max_tokens_for_completion)
            and num_omitted_activation_records == omit_n_activation_records
        ):
            original_kwargs["omit_n_activation_records"] = omit_n_activation_records + 1
            return self.make_explanation_prompt(**original_kwargs)
        return prompt_builder.build(self.prompt_format)

    def _add_per_neuron_explanation_prompt(
        self,
        prompt_builder: PromptBuilder,
        activation_records: Sequence[ActivationRecord],
        index: int,
        max_activation: float,
        # When set, this indicates that the prompt should solicit a numbered list of the given
        # number of explanations, rather than a single explanation.
        numbered_list_of_n_explanations: Optional[int],
        explanation: Optional[str],  # None means this is the end of the full prompt.
    ) -> None:
        """
        Append the prompt section for one neuron (a few-shot example or the neuron to explain).

        The section lists the neuron's activations, optionally repeats the non-zero ones, and
        ends with either a single-explanation request or a numbered-list request. When
        ``explanation`` is given it is added as the assistant's answer.
        """
        # NOTE(review): this overwrites the caller-supplied max_activation with the maximum of
        # the records actually shown, so the argument is effectively ignored. Kept as-is because
        # changing it would alter the generated prompts; confirm which normalization is intended.
        max_activation = calculate_max_activation(activation_records)
        all_activations = format_activation_records(
            activation_records, max_activation, omit_zeros=False
        )
        user_message = f"\n\nNeuron {index + 1}\nActivations:{all_activations}"
        # We repeat the non-zero activations only if it was requested and if the proportion of
        # non-zero activations isn't too high.
        if (
            self.repeat_non_zero_activations
            and non_zero_activation_proportion(activation_records, max_activation) < 0.2
        ):
            non_zero_activations = format_activation_records(
                activation_records, max_activation, omit_zeros=True
            )
            user_message += (
                f"\nSame activations, but with all zeros filtered out:{non_zero_activations}"
            )

        if numbered_list_of_n_explanations is None:
            user_message += f"\nExplanation of neuron {index + 1} behavior:"
            assistant_message = ""
            # For the IF format, we want <|endofprompt|> to come before the explanation prefix.
            if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
                assistant_message += f" {EXPLANATION_PREFIX}"
            else:
                user_message += f" {EXPLANATION_PREFIX}"
            prompt_builder.add_message(Role.USER, user_message)

            if explanation is not None:
                assistant_message += f" {explanation}."
            if assistant_message:
                prompt_builder.add_message(Role.ASSISTANT, assistant_message)
            return

        # Numbered-list mode. The request is appended to the activations message; previously the
        # activations message was built but never added, so the model was asked to explain a
        # neuron without seeing any of its activations.
        if explanation is None:
            # For the final neuron, we solicit a numbered list of explanations.
            user_message += f"""\nHere are {numbered_list_of_n_explanations} possible explanations for neuron {index + 1} behavior, each beginning with "{EXPLANATION_PREFIX}":\n1. {EXPLANATION_PREFIX}"""
            prompt_builder.add_message(Role.USER, user_message)
        else:
            # For the few-shot examples, we only present one explanation, but we present it as a
            # numbered list.
            user_message += f"""\nHere is 1 possible explanation for neuron {index + 1} behavior, beginning with "{EXPLANATION_PREFIX}":\n1. {EXPLANATION_PREFIX}"""
            prompt_builder.add_message(Role.USER, user_message)
            prompt_builder.add_message(Role.ASSISTANT, f" {explanation}.")

    def postprocess_explanations(
        self, completions: list[str], prompt_kwargs: dict[str, Any]
    ) -> list[Any]:
        """
        Postprocess the explanations returned by the API.

        Without a numbered-list request the completions are returned unchanged. Otherwise every
        completion is split into its list items, and each item is stripped of the explanation
        prefix and of surrounding whitespace; all items are returned as one flat list.
        """
        if prompt_kwargs.get("numbered_list_of_n_explanations") is None:
            return completions

        all_explanations = []
        for completion in completions:
            for explanation in _split_numbered_list(completion):
                if explanation.startswith(EXPLANATION_PREFIX):
                    explanation = explanation[len(EXPLANATION_PREFIX) :]
                all_explanations.append(explanation.strip())
        return all_explanations
class TokenSpaceRepresentationExplainer(NeuronExplainer):
    """
    Generate explanations of arbitrary lists of tokens which disproportionately activate a
    particular neuron. These lists of tokens can be generated in various ways. As an example, in one
    set of experiments, we compute the average activation for each neuron conditional on each token
    that appears in an internet text corpus. We then sort the tokens by their average activation,
    and show 50 of the top 100 tokens. Other techniques that could be used include taking the top
    tokens in the logit lens or tuned lens representations of a neuron.
    """

    def __init__(
        self,
        model_name: str,
        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
        context_size: ContextSize = ContextSize.FOUR_K,
        few_shot_example_set: TokenSpaceFewShotExampleSet = TokenSpaceFewShotExampleSet.ORIGINAL,
        use_few_shot: bool = False,
        output_numbered_list: bool = False,
        max_concurrent: Optional[int] = 10,
        cache: bool = False,
    ):
        """
        Configure the explainer.

        ``few_shot_example_set`` is only retained when ``use_few_shot`` is set, and
        ``output_numbered_list`` switches the prompt and the postprocessing to numbered-list
        mode. The remaining arguments are handed to the base class.
        """
        super().__init__(
            model_name=model_name,
            prompt_format=prompt_format,
            context_size=context_size,
            max_concurrent=max_concurrent,
            cache=cache,
        )
        self.use_few_shot = use_few_shot
        self.output_numbered_list = output_numbered_list
        # The example set is kept only when few-shot prompting was requested.
        self.few_shot_examples: Optional[TokenSpaceFewShotExampleSet] = None
        if self.use_few_shot:
            assert few_shot_example_set is not None
            self.few_shot_examples = few_shot_example_set
        self.prompt_prefix = (
            "We're studying neurons in a neural network. Each neuron looks for some particular "
            "kind of token (which can be a word, or part of a word). Look at the tokens the neuron "
            "activates for (listed below) and summarize in a single sentence what the neuron is "
            "looking for. Don't list examples of words."
        )

    @staticmethod
    def _stringify_tokens(tokens: Sequence[str]) -> str:
        """Render tokens as a comma-separated list of single-quoted strings."""
        return ", ".join([f"'{t}'" for t in tokens])

    def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
        """
        Build the explanation prompt for a list of tokens.

        Accepts exactly two keyword arguments: ``tokens`` (the tokens to explain) and
        ``max_tokens_for_completion`` (the completion budget used for the length check).

        Raises ValueError if the finished prompt does not fit in the context window.
        """
        tokens: list[str] = kwargs.pop("tokens")
        max_tokens_for_completion = kwargs.pop("max_tokens_for_completion")
        assert not kwargs, f"Unexpected kwargs: {kwargs}"
        # Note that this does not preserve the precise tokens, as e.g.
        # f" {token_with_no_leading_space}" may be tokenized as "f{token_with_leading_space}".
        # TODO(dan): Try out other variants, including "\n".join(...) and ",".join(...)
        stringified_tokens = self._stringify_tokens(tokens)

        prompt_builder = PromptBuilder()
        prompt_builder.add_message(Role.SYSTEM, self.prompt_prefix)
        if self.use_few_shot:
            self._add_few_shot_examples(prompt_builder)
        self._add_neuron_specific_prompt(prompt_builder, stringified_tokens, explanation=None)

        if self._prompt_is_too_long(prompt_builder, max_tokens_for_completion):
            raise ValueError(f"Prompt too long: {prompt_builder.build(self.prompt_format)}")
        return prompt_builder.build(self.prompt_format)

    def _add_few_shot_examples(self, prompt_builder: PromptBuilder) -> None:
        """
        Append few-shot examples to the prompt. Each one consists of a comma-delimited list of
        tokens and the corresponding explanation, taken from the configured
        TokenSpaceFewShotExampleSet.

        Raises NotImplementedError in numbered-list mode, which few-shot examples do not support.
        """
        assert self.few_shot_examples is not None
        examples = self.few_shot_examples.get_examples()
        if self.output_numbered_list:
            raise NotImplementedError("Numbered list output not supported for few-shot examples")
        for example in examples:
            self._add_neuron_specific_prompt(
                prompt_builder,
                self._stringify_tokens(example.tokens),
                explanation=example.explanation,
            )

    def _add_neuron_specific_prompt(
        self,
        prompt_builder: PromptBuilder,
        stringified_tokens: str,
        explanation: Optional[str],
    ) -> None:
        """
        Append a neuron-specific prompt to the prompt builder. The prompt consists of a list of
        tokens followed by either an explanation (if one is passed, for few shot examples) or by
        the beginning of a completion, to be completed by the model with an explanation.
        """
        # Text that opens the explanation; in numbered-list mode it also opens the first item.
        lead_in = "This neuron is looking for"
        if self.output_numbered_list:
            lead_in += "\n1."

        user_message = f"\n\n\n\nTokens:\n{stringified_tokens}\n\nExplanation:\n"
        assistant_message = ""
        if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
            # We want <|endofprompt|> to come before "This neuron is looking for" in the IF format.
            assistant_message = lead_in
        else:
            user_message += lead_in
        if explanation is not None:
            assistant_message += f"{explanation}."

        prompt_builder.add_message(Role.USER, user_message)
        if assistant_message:
            prompt_builder.add_message(Role.ASSISTANT, assistant_message)

    def postprocess_explanations(
        self, completions: list[str], prompt_kwargs: dict[str, Any]
    ) -> list[str]:
        """
        Turn raw completions into explanation strings.

        In numbered-list mode every completion is split into its list items, each stripped of
        the explanation prefix and surrounding whitespace, and all items are returned as one
        flat list. Otherwise each completion is one explanation with its final period removed.
        """
        if not self.output_numbered_list:
            # Each completion is a single explanation.
            return [_remove_final_period(completion) for completion in completions]

        # Each completion may hold several explanations.
        cleaned_explanations = []
        for completion in completions:
            for item in _split_numbered_list(completion):
                if item.startswith(EXPLANATION_PREFIX):
                    item = item[len(EXPLANATION_PREFIX) :]
                cleaned_explanations.append(item.strip())
        return cleaned_explanations
@dataclass
class Example(FastDataclass):
    """A few-shot example: activation records paired with an explanation."""

    # Token/activation sequences shown for this example.
    activation_records: List[ActivationRecord]

    # Explanation text presented alongside the activation records.
    explanation: str

    # For each activation record, the index of the first token for which the activation value in
    # the prompt should be an actual number rather than "unknown".
    #
    # Examples all start with the activations rendered as "unknown", then transition to revealing
    # specific normalized activation values. The goal is to lead the model to predict that
    # activation sequences will eventually transition to predicting specific activation values
    # instead of just "unknown". This lets us cheat and get predictions of activation values for
    # every token in a single round of inference by having the activations in the sequence we're
    # predicting always be "unknown" in the prompt: the model will always think that maybe the
    # next token will be a real activation.
    first_revealed_activation_indices: List[int]

    # If the prompt is used as an example for one-token-at-a-time scoring, this is the index of
    # the token to score.
    token_index_to_score: Optional[int] = None
class FewShotExampleSet(Enum):
    """Determines which few-shot examples to use when sampling explanations."""

    ORIGINAL = "original"
    NEWER = "newer"
    TEST = "test"

    @classmethod
    def from_string(cls, string: str) -> FewShotExampleSet:
        """Return the member whose value equals ``string``; raise ValueError if there is none."""
        found = next(
            (member for member in FewShotExampleSet if member.value == string),
            None,
        )
        if found is None:
            raise ValueError(f"Unrecognized example set: {string}")
        return found

    def get_examples(self) -> list[Example]:
        """Returns regular examples for use in a few-shot prompt."""
        if self is FewShotExampleSet.ORIGINAL:
            return ORIGINAL_EXAMPLES
        if self is FewShotExampleSet.NEWER:
            return NEWER_EXAMPLES
        if self is FewShotExampleSet.TEST:
            return TEST_EXAMPLES
        raise ValueError(f"Unhandled example set: {self}")

    def get_single_token_prediction_example(self) -> Example:
        """
        Returns an example suitable for use in a subprompt for predicting a single token's
        normalized activation, for use with the "one token at a time" scoring approach.

        Only the NEWER and TEST sets provide such an example; any other member raises ValueError.
        """
        if self is FewShotExampleSet.NEWER:
            return NEWER_SINGLE_TOKEN_EXAMPLE
        if self is FewShotExampleSet.TEST:
            return TEST_SINGLE_TOKEN_EXAMPLE
        raise ValueError(f"Unhandled example set: {self}")
# Minimal example set returned by FewShotExampleSet.TEST.get_examples().
TEST_EXAMPLES = [
    Example(
        activation_records=[
            ActivationRecord(
                tokens=["a", "b", "c"],
                activations=[1.0, 0.0, 0.0],
            ),
            ActivationRecord(
                tokens=["d", "e", "f"],
                activations=[0.0, 1.0, 0.0],
            ),
        ],
        explanation="vowels",
        first_revealed_activation_indices=[0, 1],
    ),
]

# Single-record example returned by
# FewShotExampleSet.TEST.get_single_token_prediction_example(); the token to score is the last
# one (index 2).
TEST_SINGLE_TOKEN_EXAMPLE = Example(
    activation_records=[
        ActivationRecord(
            activations=[0.0, 0.0, 1.0],
            tokens=["g", "h", "i"],
        ),
    ],
    first_revealed_activation_indices=[],
    token_index_to_score=2,
    explanation="test explanation",
)
| tokens=[ 184 | "esc", 185 | "aping", 186 | " the", 187 | " studio", 188 | " ,", 189 | " pic", 190 | "col", 191 | "i", 192 | " is", 193 | " warm", 194 | "ly", 195 | " affecting", 196 | " and", 197 | " so", 198 | " is", 199 | " this", 200 | " ad", 201 | "roit", 202 | "ly", 203 | " minimalist", 204 | " movie", 205 | " .", 206 | ], 207 | activations=[ 208 | -0.69, 209 | 4.12, 210 | 1.83, 211 | -2.28, 212 | -0.28, 213 | -0.79, 214 | -2.2, 215 | -2.03, 216 | -1.77, 217 | -1.71, 218 | -2.44, 219 | 1.6, 220 | -1, 221 | -0.38, 222 | -1.93, 223 | -2.09, 224 | -1.63, 225 | -1.94, 226 | -1.82, 227 | -1.64, 228 | -1.32, 229 | -1.92, 230 | ], 231 | ), 232 | ], 233 | first_revealed_activation_indices=[10, 3], 234 | explanation="present tense verbs ending in 'ing'", 235 | ), 236 | Example( 237 | activation_records=[ 238 | ActivationRecord( 239 | tokens=[ 240 | "as", 241 | " sac", 242 | "char", 243 | "ine", 244 | " movies", 245 | " go", 246 | " ,", 247 | " this", 248 | " is", 249 | " likely", 250 | " to", 251 | " cause", 252 | " massive", 253 | " cardiac", 254 | " arrest", 255 | " if", 256 | " taken", 257 | " in", 258 | " large", 259 | " doses", 260 | " .", 261 | ], 262 | activations=[ 263 | -0.14, 264 | -1.37, 265 | -0.68, 266 | -2.27, 267 | -1.46, 268 | -1.11, 269 | -0.9, 270 | -2.48, 271 | -2.07, 272 | -3.49, 273 | -2.16, 274 | -1.79, 275 | -0.23, 276 | -0.04, 277 | 4.46, 278 | -1.02, 279 | -2.26, 280 | -2.95, 281 | -1.49, 282 | -1.46, 283 | -0.6, 284 | ], 285 | ), 286 | ActivationRecord( 287 | tokens=[ 288 | "shot", 289 | " perhaps", 290 | " '", 291 | "art", 292 | "istically", 293 | "'", 294 | " with", 295 | " handheld", 296 | " cameras", 297 | " and", 298 | " apparently", 299 | " no", 300 | " movie", 301 | " lights", 302 | " by", 303 | " jo", 304 | "aquin", 305 | " b", 306 | "aca", 307 | "-", 308 | "as", 309 | "ay", 310 | " ,", 311 | " the", 312 | " low", 313 | "-", 314 | "budget", 315 | " production", 316 | " swings", 317 | " annoy", 318 | "ingly", 319 | " between", 320 | " 
vert", 321 | "igo", 322 | " and", 323 | " opacity", 324 | " .", 325 | ], 326 | activations=[ 327 | -0.09, 328 | -3.53, 329 | -0.72, 330 | -2.36, 331 | -1.05, 332 | -1.12, 333 | -2.49, 334 | -2.14, 335 | -1.98, 336 | -1.59, 337 | -2.62, 338 | -2, 339 | -2.73, 340 | -2.87, 341 | -3.23, 342 | -1.11, 343 | -2.23, 344 | -0.97, 345 | -2.28, 346 | -2.37, 347 | -1.5, 348 | -2.81, 349 | -1.73, 350 | -3.14, 351 | -2.61, 352 | -1.7, 353 | -3.08, 354 | -4, 355 | -0.71, 356 | -2.48, 357 | -1.39, 358 | -1.96, 359 | -1.09, 360 | 4.37, 361 | -0.74, 362 | -0.5, 363 | -0.62, 364 | ], 365 | ), 366 | ], 367 | first_revealed_activation_indices=[5, 20], 368 | explanation="words related to physical medical conditions", 369 | ), 370 | Example( 371 | activation_records=[ 372 | ActivationRecord( 373 | tokens=[ 374 | "the", 375 | " sense", 376 | " of", 377 | " together", 378 | "ness", 379 | " in", 380 | " our", 381 | " town", 382 | " is", 383 | " strong", 384 | " .", 385 | ], 386 | activations=[ 387 | 0, 388 | 0, 389 | 0, 390 | 1, 391 | 2, 392 | 0, 393 | 0.23, 394 | 0.5, 395 | 0, 396 | 0, 397 | 0, 398 | ], 399 | ), 400 | ActivationRecord( 401 | tokens=[ 402 | "a", 403 | " buoy", 404 | "ant", 405 | " romantic", 406 | " comedy", 407 | " about", 408 | " friendship", 409 | " ,", 410 | " love", 411 | " ,", 412 | " and", 413 | " the", 414 | " truth", 415 | " that", 416 | " we", 417 | "'re", 418 | " all", 419 | " in", 420 | " this", 421 | " together", 422 | " .", 423 | ], 424 | activations=[ 425 | -0.15, 426 | -2.33, 427 | -1.4, 428 | -2.17, 429 | -2.53, 430 | -0.85, 431 | 0.23, 432 | -1.89, 433 | 0.09, 434 | -0.47, 435 | -0.5, 436 | -0.58, 437 | -0.87, 438 | 0.22, 439 | 0.58, 440 | 1.34, 441 | 0.98, 442 | 2.21, 443 | 2.84, 444 | 1.7, 445 | -0.89, 446 | ], 447 | ), 448 | ], 449 | first_revealed_activation_indices=[0, 10], 450 | explanation="phrases related to community", 451 | ), 452 | ] 453 | 454 | 455 | NEWER_EXAMPLES = [ 456 | Example( 457 | activation_records=[ 458 | ActivationRecord( 459 | 
tokens=[ 460 | "The", 461 | " editors", 462 | " of", 463 | " Bi", 464 | "opol", 465 | "ym", 466 | "ers", 467 | " are", 468 | " delighted", 469 | " to", 470 | " present", 471 | " the", 472 | " ", 473 | "201", 474 | "8", 475 | " Murray", 476 | " Goodman", 477 | " Memorial", 478 | " Prize", 479 | " to", 480 | " Professor", 481 | " David", 482 | " N", 483 | ".", 484 | " Ber", 485 | "atan", 486 | " in", 487 | " recognition", 488 | " of", 489 | " his", 490 | " seminal", 491 | " contributions", 492 | " to", 493 | " bi", 494 | "oph", 495 | "ysics", 496 | " and", 497 | " their", 498 | " impact", 499 | " on", 500 | " our", 501 | " understanding", 502 | " of", 503 | " charge", 504 | " transport", 505 | " in", 506 | " biom", 507 | "olecules", 508 | ".\n\n", 509 | "In", 510 | "aug", 511 | "ur", 512 | "ated", 513 | " in", 514 | " ", 515 | "200", 516 | "7", 517 | " in", 518 | " honor", 519 | " of", 520 | " the", 521 | " Bi", 522 | "opol", 523 | "ym", 524 | "ers", 525 | " Found", 526 | "ing", 527 | " Editor", 528 | ",", 529 | " the", 530 | " prize", 531 | " is", 532 | " awarded", 533 | " for", 534 | " outstanding", 535 | " accomplishments", 536 | ], 537 | activations=[ 538 | 0, 539 | 0.01, 540 | 0.01, 541 | 0, 542 | 0, 543 | 0, 544 | -0.01, 545 | 0, 546 | -0.01, 547 | 0, 548 | 0, 549 | 0, 550 | 0, 551 | 0, 552 | 0.04, 553 | 0, 554 | 0, 555 | 0, 556 | 0, 557 | 0, 558 | 0, 559 | 0, 560 | 0, 561 | 0, 562 | 0, 563 | 0, 564 | 0, 565 | 0, 566 | 0, 567 | 0, 568 | 3.39, 569 | 0.12, 570 | 0, 571 | -0.01, 572 | 0, 573 | 0, 574 | 0, 575 | 0, 576 | -0, 577 | 0, 578 | -0, 579 | 0, 580 | 0, 581 | -0, 582 | 0, 583 | 0, 584 | 0, 585 | 0, 586 | 0, 587 | 0, 588 | 0, 589 | 0, 590 | 0, 591 | 0, 592 | 0, 593 | 0, 594 | 0, 595 | 0, 596 | 0, 597 | 0, 598 | 0, 599 | -0, 600 | 0, 601 | 0, 602 | -0.01, 603 | 0, 604 | 0.41, 605 | 0, 606 | 0, 607 | 0, 608 | -0.01, 609 | 0, 610 | 0, 611 | 0, 612 | 0, 613 | 0, 614 | ], 615 | ), 616 | # We sometimes exceed the max context size when this is included :( 617 | # 
ActivationRecord( 618 | # tokens=[ 619 | # " We", 620 | # " are", 621 | # " proud", 622 | # " of", 623 | # " our", 624 | # " national", 625 | # " achievements", 626 | # " in", 627 | # " mastering", 628 | # " all", 629 | # " aspects", 630 | # " of", 631 | # " the", 632 | # " fuel", 633 | # " cycle", 634 | # ".", 635 | # " The", 636 | # " current", 637 | # " international", 638 | # " interest", 639 | # " in", 640 | # " closing", 641 | # " the", 642 | # " fuel", 643 | # " cycle", 644 | # " is", 645 | # " a", 646 | # " vind", 647 | # "ication", 648 | # " of", 649 | # " Dr", 650 | # ".", 651 | # " B", 652 | # "hab", 653 | # "ha", 654 | # "’s", 655 | # " pioneering", 656 | # " vision", 657 | # " and", 658 | # " genius", 659 | # ], 660 | # activations=[ 661 | # -0, 662 | # -0, 663 | # 0, 664 | # -0, 665 | # -0, 666 | # 0, 667 | # 0, 668 | # 0, 669 | # -0, 670 | # 0, 671 | # 0, 672 | # -0, 673 | # 0, 674 | # -0.01, 675 | # 0, 676 | # 0, 677 | # -0, 678 | # -0, 679 | # 0, 680 | # 0, 681 | # 0, 682 | # -0, 683 | # -0, 684 | # -0.01, 685 | # 0, 686 | # 0, 687 | # -0, 688 | # 0, 689 | # 0, 690 | # 0, 691 | # 0, 692 | # 0, 693 | # -0, 694 | # 0, 695 | # 0, 696 | # 0, 697 | # 2.15, 698 | # 0, 699 | # 0, 700 | # 0.03, 701 | # ], 702 | # ), 703 | ], 704 | first_revealed_activation_indices=[7], # , 19], 705 | explanation="language related to something being groundbreaking", 706 | ), 707 | Example( 708 | activation_records=[ 709 | ActivationRecord( 710 | tokens=[ 711 | '{"', 712 | "widget", 713 | "Class", 714 | '":"', 715 | "Variant", 716 | "Matrix", 717 | "Widget", 718 | '","', 719 | "back", 720 | "order", 721 | "Message", 722 | '":"', 723 | "Back", 724 | "ordered", 725 | '","', 726 | "back", 727 | "order", 728 | "Message", 729 | "Single", 730 | "Variant", 731 | '":"', 732 | "This", 733 | " item", 734 | " is", 735 | " back", 736 | "ordered", 737 | '.","', 738 | "ordered", 739 | "Selection", 740 | '":', 741 | "true", 742 | ',"', 743 | "product", 744 | "Variant", 745 | "Id", 746 | 
'":', 747 | "0", 748 | ',"', 749 | "variant", 750 | "Id", 751 | "Field", 752 | '":"', 753 | "product", 754 | "196", 755 | "39", 756 | "_V", 757 | "ariant", 758 | "Id", 759 | '","', 760 | "back", 761 | "order", 762 | "To", 763 | "Message", 764 | "Single", 765 | "Variant", 766 | '":"', 767 | "This", 768 | " item", 769 | " is", 770 | " back", 771 | "ordered", 772 | " and", 773 | " is", 774 | " expected", 775 | " by", 776 | " {", 777 | "0", 778 | "}.", 779 | '","', 780 | "low", 781 | "Price", 782 | '":', 783 | "999", 784 | "9", 785 | ".", 786 | "0", 787 | ',"', 788 | "attribute", 789 | "Indexes", 790 | '":[', 791 | '],"', 792 | "productId", 793 | '":', 794 | "196", 795 | "39", 796 | ',"', 797 | "price", 798 | "V", 799 | "ariance", 800 | '":', 801 | "true", 802 | ',"', 803 | ], 804 | activations=[ 805 | 0, 806 | 0, 807 | 0, 808 | 0, 809 | 4.2, 810 | 0, 811 | 0, 812 | 0, 813 | 0, 814 | 0, 815 | 0, 816 | 0, 817 | 0, 818 | 0, 819 | 0, 820 | 0, 821 | 0, 822 | 0, 823 | 0, 824 | 3.6, 825 | 0, 826 | 0, 827 | 0, 828 | 0, 829 | 0, 830 | 0, 831 | 0, 832 | 0, 833 | 0, 834 | 0, 835 | 0, 836 | 0, 837 | 0, 838 | 3.7, 839 | 0, 840 | 0, 841 | 0, 842 | 0, 843 | 4.02, 844 | 0, 845 | 0, 846 | 0, 847 | 0, 848 | 0, 849 | 0, 850 | 3.5, 851 | 3.7, 852 | 0, 853 | 0, 854 | 0, 855 | 0, 856 | 0, 857 | 0, 858 | 0, 859 | 2.9, 860 | 0, 861 | 0, 862 | 0, 863 | 0, 864 | 0, 865 | 0, 866 | 0, 867 | 0, 868 | 0, 869 | 0, 870 | 0, 871 | 0, 872 | 0, 873 | 0, 874 | 0, 875 | 0, 876 | 0, 877 | 0, 878 | 0, 879 | 0, 880 | 0, 881 | 0, 882 | 0, 883 | 0, 884 | 0, 885 | 0, 886 | 0, 887 | 0, 888 | 0, 889 | 0, 890 | 0, 891 | 0, 892 | 2.3, 893 | 2.24, 894 | 0, 895 | 0, 896 | 0, 897 | ], 898 | ), 899 | ActivationRecord( 900 | tokens=[ 901 | "A", 902 | " regular", 903 | " look", 904 | " at", 905 | " the", 906 | " ups", 907 | " and", 908 | " downs", 909 | " of", 910 | " variant", 911 | " covers", 912 | " in", 913 | " the", 914 | " comics", 915 | " industry", 916 | "…\n\n", 917 | "Here", 918 | " are", 919 | " the", 920 | " 
Lego", 921 | " variant", 922 | " sketch", 923 | " covers", 924 | " by", 925 | " Leon", 926 | "el", 927 | " Cast", 928 | "ell", 929 | "ani", 930 | " for", 931 | " a", 932 | " variety", 933 | " of", 934 | " Marvel", 935 | " titles", 936 | ",", 937 | ], 938 | activations=[ 939 | 0, 940 | 0, 941 | 0, 942 | 0, 943 | 0, 944 | 0, 945 | 0, 946 | 0, 947 | 0, 948 | 6.52, 949 | 0, 950 | 0, 951 | 0, 952 | 0, 953 | 0, 954 | 0, 955 | 0, 956 | 0, 957 | 0, 958 | 0, 959 | 1.62, 960 | 0, 961 | 0, 962 | 0, 963 | 0, 964 | 0, 965 | 0, 966 | 0, 967 | 0, 968 | 0, 969 | 0, 970 | 3.23, 971 | 0, 972 | 0, 973 | 0, 974 | 0, 975 | ], 976 | ), 977 | ], 978 | first_revealed_activation_indices=[2, 8], 979 | explanation="the word “variant” and other words with the same ”vari” root", 980 | ), 981 | ] 982 | 983 | 984 | NEWER_SINGLE_TOKEN_EXAMPLE = Example( 985 | activation_records=[ 986 | ActivationRecord( 987 | tokens=[ 988 | "B", 989 | "10", 990 | " ", 991 | "111", 992 | " MON", 993 | "DAY", 994 | ",", 995 | " F", 996 | "EB", 997 | "RU", 998 | "ARY", 999 | " ", 1000 | "11", 1001 | ",", 1002 | " ", 1003 | "201", 1004 | "9", 1005 | " DON", 1006 | "ATE", 1007 | "fake higher scoring token", # See below. 1008 | ], 1009 | activations=[ 1010 | 0, 1011 | 0, 1012 | 0, 1013 | 0, 1014 | 0, 1015 | 0, 1016 | 0, 1017 | 0, 1018 | 0, 1019 | 0, 1020 | 0, 1021 | 0, 1022 | 0, 1023 | 0, 1024 | 0, 1025 | 0, 1026 | 0, 1027 | 0, 1028 | 0.37, 1029 | # This fake activation makes the previous token's activation normalize to 8, which 1030 | # might help address overconfidence in "10" activations for the one-token-at-a-time 1031 | # scoring prompt. This value and the associated token don't actually appear anywhere 1032 | # in the prompt. 1033 | 0.45, 1034 | ], 1035 | ), 1036 | ], 1037 | first_revealed_activation_indices=[], 1038 | token_index_to_score=18, 1039 | explanation="instances of the token 'ate' as part of another word", 1040 | ) 1041 | --------------------------------------------------------------------------------