├── .gitignore
├── neuron-explainer
    ├── neuron_explainer
    │   ├── __init__.py
    │   ├── activations
    │   │   ├── __init__.py
    │   │   ├── token_connections.py
    │   │   ├── activation_records.py
    │   │   └── activations.py
    │   ├── explanations
    │   │   ├── __init__.py
    │   │   ├── puzzles.py
    │   │   ├── prompt_builder.py
    │   │   ├── token_space_few_shot_examples.py
    │   │   ├── scoring.py
    │   │   ├── test_explainer.py
    │   │   ├── calibrated_simulator.py
    │   │   ├── test_simulator.py
    │   │   ├── explanations.py
    │   │   ├── explainer.py
    │   │   └── few_shot_examples.py
    │   ├── fast_dataclasses
    │   │   ├── __init__.py
    │   │   ├── test_fast_dataclasses.py
    │   │   └── fast_dataclasses.py
    │   ├── azure.py
    │   └── api_client.py
    ├── .gitignore
    ├── setup.py
    ├── README.md
    └── demos
    │   ├── explain_puzzles.ipynb
    │   ├── generate_and_score_explanation.ipynb
    │   └── generate_and_score_token_look_up_table_explanation.ipynb
├── neuron-viewer
    ├── public
    │   ├── robots.txt
    │   └── favicon.ico
    ├── tailwind.config.js
    ├── .parcelrc
    ├── src
    │   ├── panes
    │   │   ├── index.js
    │   │   ├── datasetList.jsx
    │   │   ├── similarNeurons.jsx
    │   │   ├── topTokens.jsx
    │   │   └── explanation.jsx
    │   ├── index.css
    │   ├── reportWebVitals.js
    │   ├── App.jsx
    │   ├── heatmapGrid.tsx
    │   ├── index.jsx
    │   ├── utils.ts
    │   ├── tokenHeatmap.tsx
    │   ├── feed.jsx
    │   ├── index.html
    │   ├── types.ts
    │   ├── simulationHeatmap.tsx
    │   ├── interpAPI.ts
    │   ├── App.css
    │   └── welcome.tsx
    ├── tsconfig.json
    ├── README.md
    ├── .gitignore
    ├── package.json
    └── python
    │   └── server.py
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/neuron-explainer/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info/
2 | __pycache__/
3 | 


--------------------------------------------------------------------------------
/neuron-viewer/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 | 


--------------------------------------------------------------------------------
/neuron-viewer/public/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openai/automated-interpretability/HEAD/neuron-viewer/public/favicon.ico


--------------------------------------------------------------------------------
/neuron-viewer/tailwind.config.js:
--------------------------------------------------------------------------------
// Tailwind CSS configuration for the neuron viewer.
/** @type {import('tailwindcss').Config} */
module.exports = {
  // Files Tailwind scans for class names.
  // NOTE(review): the glob lists html/js/jsx only, while components under src
  // such as heatmapGrid.tsx and tokenHeatmap.tsx also use className — confirm
  // whether ts/tsx should be included.
  content: ["./src/**/*.{html,js,jsx}"],
  theme: {
    // No theme overrides are defined here.
    extend: {},
  },
  plugins: [],
}
9 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/fast_dataclasses/__init__.py:
--------------------------------------------------------------------------------
1 | from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass
2 | 
3 | __all__ = ["FastDataclass", "dumps", "loads", "register_dataclass"]
4 | 


--------------------------------------------------------------------------------
/neuron-viewer/.parcelrc:
--------------------------------------------------------------------------------
 1 | {
 2 |   "extends": "@parcel/config-default",
 3 |   "transformers": {
 4 |     "*.{ts,tsx}": ["@parcel/transformer-typescript-tsc"]
 5 |   },
 6 |   "validators": {
 7 |     "*.{ts,tsx}": ["@parcel/validator-typescript"]
 8 |   }
 9 | }
10 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/panes/index.js:
--------------------------------------------------------------------------------
1 | export { default as TopTokens } from "./topTokens"
2 | export { default as Explanation } from "./explanation"
3 | export { default as DatasetList } from "./datasetList"
4 | export { default as SimilarNeurons } from "./similarNeurons"
5 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/azure.py:
--------------------------------------------------------------------------------
def standardize_azure_url(url: str) -> str:
    """Convert an ``az://openaipublic/`` path into its public HTTPS URL.

    Only a leading ``az://openaipublic/`` prefix is rewritten; any later
    occurrence of that text inside the path or query string is left untouched.
    URLs that do not start with the prefix are returned unchanged.

    Args:
        url: An Azure-style path or an ordinary URL.

    Returns:
        The equivalent ``https://openaipublic.blob.core.windows.net/`` URL, or
        ``url`` itself when no conversion applies.
    """
    az_prefix = "az://openaipublic/"
    https_prefix = "https://openaipublic.blob.core.windows.net/"
    if url.startswith(az_prefix):
        # Slice rather than str.replace so that only the prefix is rewritten.
        return https_prefix + url[len(az_prefix):]
    return url
6 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/index.css:
--------------------------------------------------------------------------------
 1 | body {
 2 |   margin: 0;
 3 |   font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen',
 4 |     'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue',
 5 |     sans-serif;
 6 |   -webkit-font-smoothing: antialiased;
 7 |   -moz-osx-font-smoothing: grayscale;
 8 | }
 9 | 
10 | code {
11 |   font-family: source-code-pro, Menlo, Monaco, Consolas, 'Courier New',
12 |     monospace;
13 | }
14 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/reportWebVitals.js:
--------------------------------------------------------------------------------
 1 | const reportWebVitals = onPerfEntry => {
 2 |     if (onPerfEntry && onPerfEntry instanceof Function) {
 3 |       import('web-vitals').then(({ getCLS, getFID, getFCP, getLCP, getTTFB }) => {
 4 |         getCLS(onPerfEntry);
 5 |         getFID(onPerfEntry);
 6 |         getFCP(onPerfEntry);
 7 |         getLCP(onPerfEntry);
 8 |         getTTFB(onPerfEntry);
 9 |       });
10 |     }
11 |   };
12 |   
13 |   export default reportWebVitals;


--------------------------------------------------------------------------------
/neuron-viewer/src/App.jsx:
--------------------------------------------------------------------------------
 1 | import "./App.css"
 2 | import Feed from "./feed"
 3 | import React from "react"
 4 | import { Routes, Route, HashRouter } from "react-router-dom"
 5 | 
 6 | function App() {
 7 |   return (
 8 |     <HashRouter>
 9 |       <Routes>
10 |         <Route path="/" element={<Feed />} />
11 |         <Route path="/layers/:layer/neurons/:neuron" element={<Feed />} />
12 |       </Routes>
13 |     </HashRouter>
14 |   )
15 | }
16 | 
17 | export default App
18 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/heatmapGrid.tsx:
--------------------------------------------------------------------------------
 1 | import { TokenAndActivation } from "./types"
 2 | import TokenHeatmap from "./tokenHeatmap";
 3 | 
 4 | export default ({ allTokens }: { allTokens: TokenAndActivation[][]}) => {
 5 |   return (
 6 |     <div className="">
 7 |       {allTokens.map((tokens, i) => (
 8 |         <div className="block my-3 border p-3 m-2 rounded-md" style={{ }} key={i}>
 9 |           <TokenHeatmap tokens={tokens} />
10 |         </div>
11 |       ))}
12 |     </div>
13 |   );
14 | };
15 | 


--------------------------------------------------------------------------------
/neuron-explainer/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name="neuron_explainer",
 5 |     packages=find_packages(),
 6 |     version="0.0.1",
 7 |     author="OpenAI",
 8 |     install_requires=[
 9 |         "httpx>=0.22",
10 |         "scikit-learn",
11 |         "boostedblob>=0.13.0",
12 |         "tiktoken",
13 |         "blobfile",
14 |         "numpy",
15 |         "pytest",
16 |         "orjson",
17 |     ],
18 |     url="",
19 |     description="",
20 |     python_requires='>=3.9',
21 | )
22 | 


--------------------------------------------------------------------------------
/neuron-viewer/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     "target": "es2021",
 4 |     "module": "commonjs",
 5 |     "lib": ["dom", "dom.iterable", "esnext"],
 6 |     "allowJs": true,
 7 |     "skipLibCheck": true,
 8 |     "esModuleInterop": true,
 9 |     "allowSyntheticDefaultImports": true,
10 |     "strict": true,
11 |     "forceConsistentCasingInFileNames": true,
12 |     "moduleResolution": "node",
13 |     "resolveJsonModule": true,
14 |     "isolatedModules": true,
15 |     "noEmit": true,
16 |     "jsx": "react-jsx"
17 |   },
18 |   "include": ["src"]
19 | }
20 | 


--------------------------------------------------------------------------------
/neuron-viewer/README.md:
--------------------------------------------------------------------------------
 1 | # Neuron viewer
 2 | 
 3 | The easiest way to view neurons and explanations is using the
 4 | [public website](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html).
 5 | This directory contains the implementation of that website as well as lightweight servers that make
 6 | it possible to run an alternative version of the website locally.
 7 | 
 8 | ## Local development
 9 | 
10 | Install:
11 | 
12 | ```npm install```
13 | 
14 | Run the backend:
15 | 
16 | ```npm run startpy```
17 | 
18 | Run the frontend:
19 | 
20 | ```npm start```
21 | 


--------------------------------------------------------------------------------
/neuron-viewer/.gitignore:
--------------------------------------------------------------------------------
 1 | **/*.trace
 2 | **/*.zip
 3 | **/*.tar.gz
 4 | **/*.tgz
 5 | **/*.log
 6 | .parcel-cache
 7 | 
 8 | package-lock.json
 9 | **/*.bun
10 | 
11 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
12 | 
13 | # dependencies
14 | /node_modules
15 | /.pnp
16 | .pnp.js
17 | 
18 | # testing
19 | /coverage
20 | 
21 | # production
22 | /build
23 | 
24 | # misc
25 | .DS_Store
26 | .env.local
27 | .env.development.local
28 | .env.test.local
29 | .env.production.local
30 | 
31 | npm-debug.log*
32 | yarn-debug.log*
33 | yarn-error.log*
34 | 
35 | *.pyc
36 | dist/
37 | 
38 | .vscode
39 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/index.jsx:
--------------------------------------------------------------------------------
 1 | import React from 'react';
 2 | import ReactDOM from 'react-dom/client';
 3 | import './index.css';
 4 | import App from './App';
 5 | import reportWebVitals from './reportWebVitals';
 6 | 
 7 | const root = ReactDOM.createRoot(document.getElementById('root'));
 8 | root.render(
 9 |   <React.StrictMode>
10 |     <App />
11 |   </React.StrictMode>
12 | );
13 | 
14 | // If you want to start measuring performance in your app, pass a function
15 | // to log results (for example: reportWebVitals(console.log))
16 | // or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
17 | reportWebVitals();
18 | 


--------------------------------------------------------------------------------
/neuron-explainer/README.md:
--------------------------------------------------------------------------------
 1 | # Neuron explainer
 2 | 
 3 | This directory contains a version of our code for generating, simulating and scoring explanations of
 4 | neuron behavior.
 5 | 
 6 | # Setup
 7 | 
 8 | ```
 9 | pip install -e .
10 | ```
11 | 
12 | # Usage
13 | 
14 | For example usage, see the `demos` folder:
15 | 
16 | * [Generating and scoring activation-based explanations](demos/generate_and_score_explanation.ipynb)
17 | * [Generating and scoring explanations based on tokens with high average activations](demos/generate_and_score_token_look_up_table_explanation.ipynb)
18 | * [Generating explanations for human-written neuron puzzles](demos/explain_puzzles.ipynb)
19 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/utils.ts:
--------------------------------------------------------------------------------
 1 | export const memoizeAsync = (fnname: string, fn: any) => {
 2 |   return async (...args: any) => {
 3 |     const key = `memoized:${fnname}:${args.map((x: any) => JSON.stringify(x)).join("-")}`
 4 |     const val = localStorage.getItem(key);
 5 |     if (val === null) {
 6 |       const value = await fn(...args)
 7 |       localStorage.setItem(key, JSON.stringify(value))
 8 |       console.log(`memoized ${fnname}(${args.map((x: any) => JSON.stringify(x)).join(", ")})`, value)
 9 |       return value
10 |     } else {
11 |       // console.log(`parsing`, val)
12 |       return JSON.parse(val)
13 |     }
14 |   }
15 | }
16 | 
17 | 
// Collect the current page's query-string parameters into a plain object.
// When a key appears more than once, the last occurrence wins.
export const getQueryParams = () => {
  const params: {[key: string]: any} = {}
  new URLSearchParams(window.location.search).forEach((value, key) => {
    params[key] = value
  })
  return params
}
26 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/tokenHeatmap.tsx:
--------------------------------------------------------------------------------
 1 | import React from "react"
 2 | import { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types'
 3 | 
 4 | 
 5 | type Props = {
 6 |   tokens: TokenAndActivation[], 
 7 |   loading?: boolean, 
 8 |   colors?: Color[], 
 9 |   boundaries?: number[]
10 | }
11 | export default function TokenHeatmap({ tokens, loading, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) {
12 |     // <div className="block" style={{width:'100%', whiteSpace: 'pre', overflowX: 'scroll' }}>
13 |   return (
14 |     <div className="block" style={{width:'100%', whiteSpace: 'pre-wrap'}}>
15 |       {tokens.map(({ token, activation, normalized_activation }, i) => {
16 |         const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
17 |         return <span key={i}
18 |           title={loading ? '' : `Activation: ${activation.toFixed(2)}`}
19 |           className={`${loading ? "animate-pulse" : ""}`}
20 |           style={{
21 |             transition: "500ms ease-in all",
22 |             background: loading
23 |               ? `rgba(0, 0, 0, 0.03)`
24 |               : `rgba(${color.r}, ${color.g}, ${color.b}, 0.5)`,
25 |           }}
26 |         >
27 |           {token}
28 |         </span>
29 |       })}
30 |     </div>
31 |   )
32 | }
33 | 


--------------------------------------------------------------------------------
/neuron-viewer/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "neuron-viewer",
 3 |   "version": "0.1.67",
 4 |   "homepage": "https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer",
 5 |   "dependencies": {
 6 |     "@headlessui/react": "^1.7.8",
 7 |     "@headlessui/tailwindcss": "^0.1.2",
 8 |     "@types/d3-scale": "^4.0.3",
 9 |     "@types/lodash": "^4.14.194",
10 |     "@types/react": "^18.0.37",
11 |     "@types/react-dom": "^18.0.11",
12 |     "d3-scale": "^4.0.2",
13 |     "lodash": "^4.17.21",
14 |     "react": "^18.2.0",
15 |     "react-dom": "^18.2.0",
16 |     "react-router-dom": "^6.10.0",
17 |     "web-vitals": "^3.0.3"
18 |   },
19 |   "scripts": {
20 |     "startpy": "nodemon python/server.py",
21 |     "start": "parcel src/index.html",
22 |     "build": "parcel build src/index.html",
23 |     "serve": "parcel serve src/index.html",
24 |     "typecheck": "tsc -p ."
25 |   },
26 |   "eslintConfig": {
27 |     "extends": [
28 |       "react-app"
29 |     ]
30 |   },
31 |   "alias": {
32 |     "preact/jsx-dev-runtime": "preact/jsx-runtime"
33 |   },
34 |   "devDependencies": {
35 |     "@observablehq/plot": "^0.6.5",
36 |     "@parcel/transformer-typescript-tsc": "^2.8.3",
37 |     "@parcel/validator-typescript": "^2.8.3",
38 |     "nodemon": "^2.0.22",
39 |     "parcel": "^2.8.3",
40 |     "preact": "^10.13.2",
41 |     "process": "^0.11.10",
42 |     "react-refresh": "0.10.0",
43 |     "tailwindcss": "^3.2.4",
44 |     "typescript": "^5.0.4"
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/neuron-viewer/python/server.py:
--------------------------------------------------------------------------------
 1 | # %%
 2 | import logging
 3 | 
 4 | from flask import Flask, request
 5 | from flask_cors import CORS
 6 | 
 7 | import json
 8 | 
 9 | import urllib.request
10 | 
def load_az_json(url):
    """Fetch ``url`` over HTTP(S) and return the decoded JSON document.

    The ``/load_az`` route passes a client-supplied path straight to this
    function, so the scheme is checked first: ``urlopen`` would otherwise also
    accept schemes such as ``file://`` and read from the local filesystem.

    Args:
        url: Absolute ``http://`` or ``https://`` URL of a JSON resource.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: If ``url`` does not use the http or https scheme, or the
            response body is not valid JSON (``json.JSONDecodeError``).
        urllib.error.URLError: If the request itself fails.
    """
    from urllib.parse import urlsplit

    scheme = urlsplit(url).scheme.lower()
    if scheme not in ("http", "https"):
        raise ValueError(f"unsupported URL scheme {scheme!r}; expected http or https")
    with urllib.request.urlopen(url) as f:
        return json.load(f)
14 | 
def start(
    dev: bool = False,
    host_name: str = "0.0.0.0",
    port: int = 80,
):
    """Create the Flask app, register its single route, and serve it (blocking).

    Args:
        dev: Passed to Flask as ``debug``.
        host_name: Interface to bind to.
        port: TCP port to listen on.

    The app exposes ``/load_az`` (GET or POST), which reads a ``path`` value
    from the JSON request body, or from the query string when no JSON body is
    present, and returns the JSON document fetched from that path.
    """
    app = Flask("interpretability chat")
    app.logger.setLevel(logging.INFO)
    CORS(app)

    @app.after_request
    def after_request(response):
        # Allow any origin, plus the headers and methods listed below.
        response.headers.add("Access-Control-Allow-Origin", "*")
        response.headers.add(
            "Access-Control-Allow-Headers", "Content-Type,Authorization"
        )
        response.headers.add(
            "Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS"
        )
        return response

    @app.route("/load_az", methods=["GET", "POST"])
    def load_az():
        # A plain synchronous view: nothing here awaits, and async views need
        # Flask's optional "async" extra to be installed.
        # silent=True returns None instead of raising when the request has no
        # JSON body (e.g. a GET), so the query string can serve as a fallback.
        args = request.get_json(silent=True) or request.args
        path = args["path"]
        result = load_az_json(path)
        return result

    # use_reloader=False: Flask's own reloader stays off.
    app.run(debug=dev, host=host_name, port=port, use_reloader=False)
44 | 
45 | 
def main(dev: bool = True, host_name: str = "0.0.0.0", port: int = 8000):
    """Script entry point: run the server with development-oriented defaults.

    All three arguments are forwarded unchanged to ``start``.
    """
    server_options = {"dev": dev, "host_name": host_name, "port": port}
    start(**server_options)
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()
52 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/feed.jsx:
--------------------------------------------------------------------------------
 1 | import * as Panes from "./panes"
 2 | import React, { useEffect } from "react"
 3 | import Welcome from "./welcome"
 4 | import { useState } from "react"
 5 | import { useParams, Link } from "react-router-dom"
 6 | 
 7 | export default function Feed() {
 8 |   const params = useParams()
 9 |   // If params is missing either index, there's no neuron selected.
10 |   let activeNeuron;
11 |   if (params.layer === undefined || params.neuron === undefined) {
12 |     activeNeuron = null
13 |   } else {
14 |     // Grab the layer and neuron indices from the params, casting them to ints.
15 |     activeNeuron = {
16 |       "layer": parseInt(params.layer),
17 |       "neuron": parseInt(params.neuron),
18 |     }
19 |   }
20 | 
21 |   const Pane = ({ children }) => (
22 |     <div className="flex flex-col h-full">{children}</div>
23 |   )
24 | 
25 |   return (
26 |     <div>
27 |       <div>
28 |         <h2 className="flex flex-row">
29 |           <Link to="/">Neuron Viewer</Link>
30 |         </h2>
31 |         {activeNeuron && (
32 |           <h3 className="flex flex-row">
33 |             Neuron {activeNeuron.layer}:{activeNeuron.neuron}
34 |           </h3>
35 |         )}
36 |       </div>
37 | 
38 |       <div
39 |         style={{ width: '100%', padding: '0px 80px', margin: "auto", overflow: "visible" }}
40 |       >
41 |         <ul role="list" className="mb-8 mt-10">
42 |           {activeNeuron ?
43 |             <>
44 |               <Pane>
45 |                 {React.createElement(Panes["Explanation"], { activeNeuron })}
46 |               </Pane>
47 |               <Pane>
48 |                 {React.createElement(Panes["DatasetList"], { activeNeuron })}
49 |               </Pane>
50 |               <Pane>
51 |                 {React.createElement(Panes["TopTokens"], { activeNeuron })}
52 |               </Pane>
53 |               <Pane>
54 |                 {React.createElement(Panes["SimilarNeurons"], { activeNeuron })}
55 |               </Pane>
56 |             </> :
57 |             <Welcome/>
58 |           }
59 | 
60 |         </ul>
61 |       </div>
62 |     </div>
63 |   )
64 | }
65 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/puzzles.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from dataclasses import dataclass
 4 | 
 5 | from neuron_explainer.activations.activations import ActivationRecord
 6 | 
 7 | 
@dataclass(frozen=True)
class Puzzle:
    """A neuron puzzle with a known answer.

    A puzzle is a ground truth explanation, a collection of sentences (stored as ActivationRecords) with activations
    according to that explanation, and a collection of false explanations. Instances are immutable (frozen).
    """

    # Name of the puzzle; also the key under which it is stored in PUZZLES_BY_NAME.
    name: str
    # The ground truth explanation.
    explanation: str
    # One ActivationRecord per sentence of the puzzle.
    activation_records: list[ActivationRecord]
    # Explanations that are known to be wrong for this puzzle.
    false_explanations: list[str]
17 | 
18 | 
def convert_puzzle_to_tokenized_sentences(puzzle: Puzzle) -> list[list[str]]:
    """Return the token list of each activation record in ``puzzle``, in order."""
    tokenized_sentences = []
    for record in puzzle.activation_records:
        tokenized_sentences.append(record.tokens)
    return tokenized_sentences
22 | 
23 | 
def convert_puzzle_dict_to_puzzle(puzzle_dict: dict) -> Puzzle:
    """Build a Puzzle from its JSON dictionary representation.

    ``puzzle_dict["sentences"]`` holds one list per sentence. Each item of a sentence is either a
    bare token string, whose activation is taken to be 0.0, or a ``[token, activation]`` list.
    The compact string form keeps the data readable and avoids spelling out zero activations.
    """
    records = []
    for sentence in puzzle_dict["sentences"]:
        # First pass: extract the token text from whichever form the item has.
        tokens = []
        for item in sentence:
            tokens.append(item[0] if type(item) is list else item)
        assert all([type(token) is str for token in tokens]), "All tokens must be strings"

        # Second pass: extract the activation, defaulting to 0.0 for bare tokens.
        activations = []
        for item in sentence:
            activations.append(float(item[1]) if type(item) is list else 0.0)
        assert all(
            [type(activation) is float for activation in activations]
        ), "All activations must be floats"

        records.append(ActivationRecord(tokens=tokens, activations=activations))

    name = puzzle_dict["name"]
    explanation = puzzle_dict["explanation"]
    false_explanations = puzzle_dict["false_explanations"]
    return Puzzle(
        name=name,
        explanation=explanation,
        activation_records=records,
        false_explanations=false_explanations,
    )
43 | 
44 | 
# All puzzles from puzzles.json (located next to this module), keyed by puzzle name.
# Loaded once, at import time.
PUZZLES_BY_NAME: dict[str, Puzzle] = dict()
script_dir = os.path.dirname(os.path.abspath(__file__))
# Read as UTF-8 explicitly: without an encoding argument, open() uses the platform's
# locale encoding, which can mis-decode non-ASCII tokens on some systems.
with open(os.path.join(script_dir, "puzzles.json"), "r", encoding="utf-8") as f:
    puzzle_dicts = json.load(f)
# The file is already closed here; converting does not need it to stay open.
for name, puzzle_dict in puzzle_dicts.items():
    PUZZLES_BY_NAME[name] = convert_puzzle_dict_to_puzzle(puzzle_dict)
51 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/token_connections.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import List, Union
 3 | 
 4 | import blobfile as bf
 5 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
 6 | from neuron_explainer.azure import standardize_azure_url
 7 | import urllib.request
 8 | 
 9 | 
@register_dataclass
@dataclass
class TokensAndWeights(FastDataclass):
    """A list of tokens together with a list of strengths.

    NOTE(review): presumably ``strengths[i]`` belongs to ``tokens[i]`` — confirm against the data.
    """

    tokens: List[str]
    strengths: List[float]
15 | 
16 | 
@register_dataclass
@dataclass
class WeightBasedSummaryOfNeuron(FastDataclass):
    """Four TokensAndWeights groups for one neuron: input/output crossed with positive/negative.

    This is the declared return type of load_token_weight_connections_of_neuron.
    NOTE(review): presumably "input"/"output" refer to the neuron's input and output weights
    and "positive"/"negative" to the sign of the connection — confirm.
    """

    input_positive: TokensAndWeights
    input_negative: TokensAndWeights
    output_positive: TokensAndWeights
    output_negative: TokensAndWeights
24 | 
25 | 
def load_token_weight_connections_of_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based",
) -> WeightBasedSummaryOfNeuron:
    """Load the WeightBasedSummaryOfNeuron for the specified neuron.

    The record is fetched from ``<dataset_path>/<layer_index>/<neuron_index>.json``; an
    ``az://openaipublic/`` dataset path is first converted to its HTTPS form.
    """
    location = dataset_path + f"/{layer_index}/{neuron_index}.json"
    location = standardize_azure_url(location)
    with urllib.request.urlopen(location) as response:
        payload = response.read()
        return loads(payload, backwards_compatible=False)
36 | 
37 | 
@register_dataclass
@dataclass
class TokenLookupTableSummaryOfNeuron(FastDataclass):
    """List of tokens and the average activations of a given neuron in response to each
    respective token. These are selected from among the tokens in the vocabulary with the
    highest average activations across an internet text dataset, with the highest activations
    first."""

    # Tokens, highest average activation first (per the class description above).
    tokens: List[str]
    # Average activation for each respective token.
    average_activations: List[float]
48 | 
49 | 
def load_token_lookup_table_connections_of_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based",
) -> TokenLookupTableSummaryOfNeuron:
    """Fetch and deserialize the TokenLookupTableSummaryOfNeuron for one neuron.

    The record is read from ``<dataset_path>/<layer_index>/<neuron_index>.json``; an
    ``az://openaipublic/`` dataset path is first converted to its HTTPS form.
    """
    location = dataset_path + f"/{layer_index}/{neuron_index}.json"
    location = standardize_azure_url(location)
    with urllib.request.urlopen(location) as response:
        payload = response.read()
        return loads(payload, backwards_compatible=False)
60 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <meta charset="utf-8" />
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1" />
 6 |     <meta name="theme-color" content="#000000" />
 7 |     <meta
 8 |       name="description"
 9 |       content="Web site created using create-react-app"
10 |     />
11 |     <!--
12 |       manifest.json provides metadata used when your web app is installed on a
13 |       user's mobile device or desktop. See https://developers.google.com/web/fundamentals/web-app-manifest/
14 |     -->
15 |     <!--
16 |       Notice the use of %PUBLIC_URL% in the tags above.
17 |       It will be replaced with the URL of the `public` folder during the build.
18 |       Only files inside the `public` folder can be referenced from the HTML.
19 | 
20 |       Unlike "/favicon.ico" or "favicon.ico", "%PUBLIC_URL%/favicon.ico" will
21 |       work correctly both with client-side routing and a non-root public URL.
22 |       Learn how to configure a non-root public URL by running `npm run build`.
23 |     -->
24 |     <link rel="icon" type="image/x-icon" href="../public/favicon.ico">
25 | 
26 |     <title>Neuron viewer</title>
27 |     <!--script src="https://cdn.tailwindcss.com?plugins=forms,typography,aspect-ratio,line-clamp"></script-->
28 |     <!--curl 'https://cdn.tailwindcss.com/3.3.1?plugins=forms@0.5.3,typography@0.5.9,aspect-ratio@0.4.2,line-clamp@0.4.4' -o public/tailwind.js-->
29 |     <script src="../public/tailwind.js"></script>
30 | 
31 |     <script>
32 |       tailwind.config = {
33 |         theme: {
34 |           extend: {
35 |             colors: {
36 |               clifford: '#da373d',
37 |             }
38 |           }
39 |         }
40 |       }
41 |     </script>
42 |   </head>
43 |   <body>
44 |     <noscript>You need to enable JavaScript to run this app.</noscript>
45 |     <div id="root"></div>
46 |     <!--
47 |       This HTML file is a template.
48 |       If you open it directly in the browser, you will see an empty page.
49 | 
50 |       You can add webfonts, meta tags, or analytics to this file.
51 |       The build step will place the bundled scripts into the <body> tag.
52 | 
53 |       To begin the development, run `npm start` or `yarn start`.
54 |       To create a production bundle, use `npm run build` or `yarn build`.
55 |     -->
56 |     <script src="./index.jsx" async type="module"></script>
57 |     <link href="App.css" rel="stylesheet"></link>
58 |   </body>
59 | </html>
60 | 


--------------------------------------------------------------------------------
/neuron-explainer/demos/explain_puzzles.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "%load_ext autoreload\n",
10 |     "%autoreload 2"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": null,
16 |    "metadata": {},
17 |    "outputs": [],
18 |    "source": [
19 |     "import os\n",
20 |     "\n",
21 |     "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n",
22 |     "\n",
23 |     "from neuron_explainer.activations.activation_records import calculate_max_activation\n",
24 |     "from neuron_explainer.explanations.explainer import TokenActivationPairExplainer\n",
25 |     "from neuron_explainer.explanations.prompt_builder import PromptFormat\n",
26 |     "from neuron_explainer.explanations.puzzles import PUZZLES_BY_NAME\n",
27 |     "\n",
28 |     "\n",
29 |     "EXPLAINER_MODEL_NAME = \"gpt-4\"\n",
30 |     "\n",
31 |     "explainer = TokenActivationPairExplainer(\n",
32 |     "    model_name=EXPLAINER_MODEL_NAME,\n",
33 |     "    prompt_format=PromptFormat.HARMONY_V4,\n",
34 |     "    max_concurrent=1,\n",
35 |     ")\n",
36 |     "\n",
37 |     "for puzzle_name, puzzle in PUZZLES_BY_NAME.items():\n",
38 |     "    print(f\"{puzzle_name=}\")\n",
39 |     "    puzzle_answer = puzzle.explanation\n",
40 |     "    # Generate an explanation for the puzzle.\n",
41 |     "    explanations = await explainer.generate_explanations(\n",
42 |     "        all_activation_records=puzzle.activation_records,\n",
43 |     "        max_activation=calculate_max_activation(puzzle.activation_records),\n",
44 |     "        num_samples=1,\n",
45 |     "    )\n",
46 |     "    assert len(explanations) == 1\n",
47 |     "    model_generated_explanation = explanations[0]\n",
48 |     "    print(f\"{model_generated_explanation=}\")\n",
49 |     "    print(f\"{puzzle_answer=}\\n\")\n",
50 |     "\n"
51 |    ]
52 |   }
53 |  ],
54 |  "metadata": {
55 |   "kernelspec": {
56 |    "display_name": "openai",
57 |    "language": "python",
58 |    "name": "openai"
59 |   },
60 |   "language_info": {
61 |    "codemirror_mode": {
62 |     "name": "ipython",
63 |     "version": 3
64 |    },
65 |    "file_extension": ".py",
66 |    "mimetype": "text/x-python",
67 |    "name": "python",
68 |    "nbconvert_exporter": "python",
69 |    "pygments_lexer": "ipython3",
70 |    "version": "3.9.9"
71 |   },
72 |   "orig_nbformat": 4
73 |  },
74 |  "nbformat": 4,
75 |  "nbformat_minor": 2
76 | }
77 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/types.ts:
--------------------------------------------------------------------------------
 1 | import { scaleLinear } from "d3-scale"
 2 | import { min, max, flatten } from "lodash"
 3 | 
// Pair of numeric indices naming one neuron. Elsewhere in the viewer these two values are
// used to build routes of the form /layers/<layer>/neurons/<neuron>.
export type Neuron = {
  layer: number;
  neuron: number;
}
 8 | 
// One token together with its activation value. normalized_activation is optional; it is
// filled in by normalizeTokenActs.
export type TokenAndActivation = {
  token: string,
  activation: number
  normalized_activation?: number
}
14 | 
// An ordered list of token/activation pairs.
export type TokenSequence = TokenAndActivation[]
16 | 
// Adds a normalized_activation to every token in every group of sequences.
//
// Each argument is one group (an array of token sequences). All groups share a single linear
// scale that maps [0, maxActivation] onto [0, 1], where maxActivation is the largest
// activation across all groups after treating negative activations as 0. The returned
// structure mirrors the input; the raw `activation` values are passed through unchanged.
export const normalizeTokenActs = (...sequences: TokenSequence[][]) => {
  const flattened: TokenAndActivation[] = flatten(flatten(sequences))
  // Negative activations count as 0 when looking for the maximum. This matches the format in
  // the top + random activation records displayed in the main grid.
  const maxActivation = max(flattened.map(({ activation }) => Math.max(activation, 0))) || 0;
  const neuronScale = scaleLinear()
    .domain([0, maxActivation])
    .range([0, 1])
  // When no activation is positive the domain collapses to [0, 0], and d3 then maps every
  // input to the middle of the range (0.5), which would tint every token. Report 0 instead.
  const normalize = (activation: number) => (maxActivation > 0 ? neuronScale(activation) : 0)

  return sequences.map((seq) => seq.map((tas) => tas.map(({ token, activation }) => ({
      token,
      activation,
      normalized_activation: normalize(activation),
  }))))
}
41 | 
// An RGB color; the defaults in this file use channel values in the 0..255 range.
export type Color = {r: number, g: number, b: number};

// Linearly blends each channel from color_l (value = 0) to color_r (value = 1) and rounds the
// result to the nearest integer. `value` is not clamped here.
export function interpolateColor(color_l: Color, color_r: Color, value: number) {
  const color = {
    r: Math.round(color_l.r + (color_r.r - color_l.r) * value),
    g: Math.round(color_l.g + (color_r.g - color_l.g) * value),
    b: Math.round(color_l.b + (color_r.b - color_l.b) * value),
  }
  return color
}

// Picks a color for `value` on a piecewise-linear gradient.
//
// colors[i] is the color at boundaries[i]; boundaries are expected in ascending order and both
// arrays need at least two entries. Values below the first boundary get the first color and
// values above the last boundary get the last color.
export function getInterpolatedColor(colors: Color[], boundaries: number[], value: number) {
  const index = boundaries.findIndex((boundary) => boundary >= value)
  // findIndex yields -1 when value is beyond every boundary; that belongs to the LAST segment,
  // not the first one.
  const upperIndex = index === -1 ? boundaries.length - 1 : index
  const colorIndex = Math.max(0, upperIndex - 1)
  const color_left = colors[colorIndex]
  const color_right = colors[colorIndex + 1]
  const boundary_left = boundaries[colorIndex]
  const boundary_right = boundaries[colorIndex + 1]
  const span = boundary_right - boundary_left
  // A zero-width segment would divide by zero; treat it as the left end of the segment.
  const rawRatio = span === 0 ? 0 : (value - boundary_left) / span
  // Clamp so out-of-range values cannot push channels outside the two endpoint colors.
  const ratio = Math.min(1, Math.max(0, rawRatio))
  const color = interpolateColor(color_left, color_right, ratio)
  return color
}
63 | 
// Default gradient stops: white at the low end, pure green at the high end. The commented-out
// entry is a former third stop that is no longer used.
export const DEFAULT_COLORS = [
  // { r: 255, g: 0, b: 105 },
  { r: 255, g: 255, b: 255 },
  { r: 0, g: 255, b: 0 },
]
// Default gradient boundaries, one per entry of DEFAULT_COLORS: the first color applies at 0
// and the second at 1.
export const DEFAULT_BOUNDARIES = [
  // 0, 0.5, 1
  0, 1
]
73 | 
74 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/fast_dataclasses/test_fast_dataclasses.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | 
 3 | import pytest
 4 | 
 5 | from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass
 6 | 
 7 | 
 8 | # Inheritance is a bit tricky with our setup. dataclass_name must be set for instances of these
 9 | # classes to serialize and deserialize correctly, but if it's given a default value, then subclasses
10 | # can't have any fields that don't have default values, because of how constructors are generated
11 | # for dataclasses (fields with no default value can't follow those with default values). To work
12 | # around this, we set dataclass_name in __post_init__ on the base class, which is called after the
13 | # constructor. The implementation does the right thing for both the base class and the subclass.
@register_dataclass
@dataclass
class DataclassC(FastDataclass):
    # Simplest fixture: a single list-of-ints field. Also the base class of DataclassC_ext.
    ints: list[int]
18 | 
19 | 
@register_dataclass
@dataclass
class DataclassC_ext(DataclassC):
    # Subclass fixture: adds a required field on top of the inherited `ints`.
    s: str
24 | 
25 | 
@register_dataclass
@dataclass
class DataclassB(FastDataclass):
    # Fixture that nests DataclassC instances inside both a dict and a list.
    str_to_c: dict[str, DataclassC]
    cs: list[DataclassC]
31 | 
32 | 
@register_dataclass
@dataclass
class DataclassA(FastDataclass):
    # Top-level fixture: lists of primitives plus a list of nested DataclassB instances.
    floats: list[float]
    strings: list[str]
    bs: list[DataclassB]
39 | 
40 | 
@register_dataclass
@dataclass
class DataclassD(FastDataclass):
    # Fixture with one required field and one defaulted field; test_bad_serialized_data loads
    # a payload that contains only "s1" into this class.
    s1: str
    s2: str = "default"
46 | 
47 | 
def test_dataclasses() -> None:
    """Round-trip a nested DataclassA through dumps() and loads() and compare for equality."""
    first_b = DataclassB(
        str_to_c={"a": DataclassC(ints=[1, 2]), "b": DataclassC(ints=[3, 4])},
        cs=[DataclassC(ints=[5, 6]), DataclassC_ext(ints=[7, 8], s="s")],
    )
    second_b = DataclassB(
        str_to_c={"c": DataclassC_ext(ints=[9, 10], s="t"), "d": DataclassC(ints=[11, 12])},
        cs=[DataclassC(ints=[13, 14]), DataclassC(ints=[15, 16])],
    )
    original = DataclassA(floats=[1.0, 2.0], strings=["a", "b"], bs=[first_b, second_b])

    round_tripped = loads(dumps(original))

    assert round_tripped == original
64 | 
65 | 
def test_c_and_c_ext() -> None:
    """Round-trip the subclass first, then its base class, checking equality each time."""
    for instance in (DataclassC_ext(ints=[3, 4], s="s"), DataclassC(ints=[1, 2])):
        assert loads(dumps(instance)) == instance
72 | 
73 | 
def test_bad_serialized_data() -> None:
    """Check what loads() returns or raises for payloads with missing, extra or absent fields."""
    # A payload produced by dumps() comes back as the original class.
    serialized_c = dumps(DataclassC(ints=[3, 4]))
    assert type(loads(serialized_c)) is DataclassC

    # Without dataclass_name and without backwards compatibility the result stays a dict.
    assert type(loads('{"ints": [3, 4]}', backwards_compatible=False)) is dict
    assert type(loads('{"ints": [3, 4], "dataclass_name": "DataclassC"}')) is DataclassC

    # Field sets that do not match the named class's constructor raise TypeError.
    malformed_payloads = (
        '{"ints": [3, 4], "bogus_extra_field": "foo", "dataclass_name": "DataclassC"}',
        '{"ints_field_is_missing": [3, 4], "dataclass_name": "DataclassC"}',
    )
    for payload in malformed_payloads:
        with pytest.raises(TypeError):
            loads(payload)

    # A partial field set is matched to a dataclass only in backwards-compatible mode.
    partial_payload = '{"s1": "test"}'
    assert type(loads(partial_payload, backwards_compatible=False)) is dict
    assert type(loads(partial_payload, backwards_compatible=True)) is DataclassD
84 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/fast_dataclasses/fast_dataclasses.py:
--------------------------------------------------------------------------------
 1 | # Utilities for dataclasses that are very fast to serialize and deserialize, with limited data
 2 | # validation. Fields must not be tuples, since they get serialized and then deserialized as lists.
 3 | #
 4 | # The unit tests for this library show how to use it.
 5 | 
 6 | import json
 7 | from dataclasses import dataclass, field, fields, is_dataclass
 8 | from functools import partial
 9 | from typing import Any, Union
10 | 
11 | import orjson
12 | 
# Registries filled in by register_dataclass and consulted by _object_hook.
# Maps a class name to the registered class.
dataclasses_by_name = {}
# Maps a frozenset of field names (excluding "dataclass_name") to the registered class.
dataclasses_by_fieldnames = {}
15 | 
16 | 
@dataclass
class FastDataclass:
    """Base class whose instances carry the name of their concrete class.

    dataclass_name is left out of the generated __init__ (init=False) and is assigned in
    __post_init__ instead, so subclasses may declare fields without default values.
    """

    dataclass_name: str = field(init=False)

    def __post_init__(self) -> None:
        # Use the runtime class so that subclasses record their own name.
        concrete_cls = type(self)
        self.dataclass_name = concrete_cls.__name__
23 | 
24 | 
def register_dataclass(cls):  # type: ignore
    """Class decorator that records cls in both module-level registries.

    cls is stored in dataclasses_by_name under its class name, and in
    dataclasses_by_fieldnames under the frozenset of its field names with
    "dataclass_name" left out. Returns cls unchanged.
    """
    assert is_dataclass(cls), "Only dataclasses can be registered."
    dataclasses_by_name[cls.__name__] = cls
    field_names = frozenset(
        spec.name for spec in fields(cls) if spec.name != "dataclass_name"
    )
    dataclasses_by_fieldnames[field_names] = cls
    return cls
31 | 
32 | 
def dumps(obj: Any) -> bytes:
    """Serialize obj to JSON bytes with orjson, with the OPT_SERIALIZE_NUMPY option enabled."""
    return orjson.dumps(obj, option=orjson.OPT_SERIALIZE_NUMPY)
35 | 
36 | 
37 | def _object_hook(d: Any, backwards_compatible: bool = True) -> Any:
38 |     # If d is a list, recurse.
39 |     if isinstance(d, list):
40 |         return [_object_hook(x, backwards_compatible=backwards_compatible) for x in d]
41 |     # If d is not a dict, return it as is.
42 |     if not isinstance(d, dict):
43 |         return d
44 |     cls = None
45 |     if "dataclass_name" in d:
46 |         if d["dataclass_name"] in dataclasses_by_name:
47 |             cls = dataclasses_by_name[d["dataclass_name"]]
48 |         else:
49 |             assert backwards_compatible, (
50 |                 f"Dataclass {d['dataclass_name']} not found, set backwards_compatible=True if you "
51 |                 f"are okay with that."
52 |             )
53 |     # Load objects created without dataclass_name set.
54 |     else:
55 |         # Try our best to find a dataclass if backwards_compatible is True.
56 |         if backwards_compatible:
57 |             d_fields = frozenset(d.keys())
58 |             if d_fields in dataclasses_by_fieldnames:
59 |                 cls = dataclasses_by_fieldnames[d_fields]
60 |             elif len(d_fields) > 0:
61 |                 # Check if the fields are a subset of a dataclass (if the dataclass had extra fields
62 |                 # added since the data was created). Note that this will fail if fields were removed
63 |                 # from the dataclass.
64 |                 for key, possible_cls in dataclasses_by_fieldnames.items():
65 |                     if d_fields.issubset(key):
66 |                         cls = possible_cls
67 |                         break
68 |                 else:
69 |                     print(f"Could not find dataclass for {d_fields} {cls}")
70 |     new_d = {
71 |         k: _object_hook(v, backwards_compatible=backwards_compatible)
72 |         for k, v in d.items()
73 |         if k != "dataclass_name"
74 |     }
75 |     if cls is not None:
76 |         return cls(**new_d)
77 |     else:
78 |         return new_d
79 | 
80 | 
def loads(s: Union[str, bytes], backwards_compatible: bool = True) -> Any:
    """Parse JSON text or bytes, passing every decoded object through _object_hook.

    backwards_compatible is forwarded unchanged to _object_hook.
    """
    hook = partial(_object_hook, backwards_compatible=backwards_compatible)
    return json.loads(s, object_hook=hook)
86 | 


--------------------------------------------------------------------------------
/neuron-explainer/demos/generate_and_score_explanation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%load_ext autoreload\n",
 10 |     "%autoreload 2"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "import os\n",
 20 |     "\n",
 21 |     "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n",
 22 |     "\n",
 23 |     "from neuron_explainer.activations.activation_records import calculate_max_activation\n",
 24 |     "from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\n",
 25 |     "from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\n",
 26 |     "from neuron_explainer.explanations.explainer import TokenActivationPairExplainer\n",
 27 |     "from neuron_explainer.explanations.prompt_builder import PromptFormat\n",
 28 |     "from neuron_explainer.explanations.scoring import simulate_and_score\n",
 29 |     "from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\n",
 30 |     "\n",
 31 |     "EXPLAINER_MODEL_NAME = \"gpt-4\"\n",
 32 |     "SIMULATOR_MODEL_NAME = \"text-davinci-003\"\n",
 33 |     "\n",
 34 |     "\n",
 35 |     "# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n",
 36 |     "# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n",
 37 |     "\n",
 38 |     "# Load a neuron record.\n",
 39 |     "neuron_record = load_neuron(9, 6236)\n",
 40 |     "\n",
 41 |     "# Grab the activation records we'll need.\n",
 42 |     "slice_params = ActivationRecordSliceParams(n_examples_per_split=5)\n",
 43 |     "train_activation_records = neuron_record.train_activation_records(\n",
 44 |     "    activation_record_slice_params=slice_params\n",
 45 |     ")\n",
 46 |     "valid_activation_records = neuron_record.valid_activation_records(\n",
 47 |     "    activation_record_slice_params=slice_params\n",
 48 |     ")\n",
 49 |     "\n",
 50 |     "# Generate an explanation for the neuron.\n",
 51 |     "explainer = TokenActivationPairExplainer(\n",
 52 |     "    model_name=EXPLAINER_MODEL_NAME,\n",
 53 |     "    prompt_format=PromptFormat.HARMONY_V4,\n",
 54 |     "    max_concurrent=1,\n",
 55 |     ")\n",
 56 |     "explanations = await explainer.generate_explanations(\n",
 57 |     "    all_activation_records=train_activation_records,\n",
 58 |     "    max_activation=calculate_max_activation(train_activation_records),\n",
 59 |     "    num_samples=1,\n",
 60 |     ")\n",
 61 |     "assert len(explanations) == 1\n",
 62 |     "explanation = explanations[0]\n",
 63 |     "print(f\"{explanation=}\")\n",
 64 |     "\n",
 65 |     "# Simulate and score the explanation.\n",
 66 |     "simulator = UncalibratedNeuronSimulator(\n",
 67 |     "    ExplanationNeuronSimulator(\n",
 68 |     "        SIMULATOR_MODEL_NAME,\n",
 69 |     "        explanation,\n",
 70 |     "        max_concurrent=1,\n",
 71 |     "        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n",
 72 |     "    )\n",
 73 |     ")\n",
 74 |     "scored_simulation = await simulate_and_score(simulator, valid_activation_records)\n",
 75 |     "print(f\"score={scored_simulation.get_preferred_score():.2f}\")\n"
 76 |    ]
 77 |   }
 78 |  ],
 79 |  "metadata": {
 80 |   "kernelspec": {
 81 |    "display_name": "openai",
 82 |    "language": "python",
 83 |    "name": "python3"
 84 |   },
 85 |   "language_info": {
 86 |    "codemirror_mode": {
 87 |     "name": "ipython",
 88 |     "version": 3
 89 |    },
 90 |    "file_extension": ".py",
 91 |    "mimetype": "text/x-python",
 92 |    "name": "python",
 93 |    "nbconvert_exporter": "python",
 94 |    "pygments_lexer": "ipython3",
 95 |    "version": "3.9.9"
 96 |   },
 97 |   "orig_nbformat": 4
 98 |  },
 99 |  "nbformat": 4,
100 |  "nbformat_minor": 2
101 | }
102 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/panes/datasetList.jsx:
--------------------------------------------------------------------------------
  1 | import HeatmapGrid from "../heatmapGrid"
  2 | import React, { useEffect, useState } from "react"
  3 | import { normalizeTokenActs } from "../types"
  4 | 
  5 | import {get_neuron_record} from "../interpAPI"
  6 | 
  7 | function zip_sequences(sequences) {
  8 |   return sequences.map(({ activations, tokens }) => {
  9 |     return tokens.map((token, idx) => ({
 10 |       token,
 11 |       activation: activations[idx],
 12 |     }))
 13 |   })
 14 | }
 15 | 
 16 | export default ({ activeNeuron }) => {
 17 |   const [data, setData] = useState(null)
 18 |   const [showingMore, setShowingMore] = useState({})
 19 |   const [isLoading, setIsLoading] = useState(true)
 20 | 
 21 |   useEffect(() => {
 22 |     async function fetchData() {
 23 |       if (data) {
 24 |         return
 25 |       }
 26 |       const result = await get_neuron_record(activeNeuron)
 27 |     console.log(result)
 28 |       const all_sequences = []
 29 |       all_sequences.push({
 30 |         // label: '[0.999, 1] (Top quantile, sorted.  50 of 50000)',
 31 |         label: 'Top',
 32 |         sequences: zip_sequences(result.most_positive_activation_records),
 33 |         default_show: 4,
 34 |       })
 35 |       all_sequences.push({
 36 |         label: 'Quantile range [0.99, 0.999] sample',
 37 |         sequences: zip_sequences(result.random_sample_by_quantile[3]),
 38 |         default_show: 1,
 39 |       })
 40 |       all_sequences.push({
 41 |         label: 'Quantile range [0.9, 0.99] sample',
 42 |         sequences: zip_sequences(result.random_sample_by_quantile[2]),
 43 |         default_show: 1,
 44 |       })
 45 |       all_sequences.push({
 46 |         label: 'Quantile range [0.5, 0.9] sample',
 47 |         sequences: zip_sequences(result.random_sample_by_quantile[1]),
 48 |         default_show: 1,
 49 |       })
 50 |       all_sequences.push({
 51 |         label: 'Quantile range [0, 0.5] sample',
 52 |         sequences: zip_sequences(result.random_sample_by_quantile[0]),
 53 |         default_show: 1,
 54 |       })
 55 |       all_sequences.push({
 56 |         // label: '[0, 1] (Random)',
 57 |         label: 'Random sample',
 58 |         sequences: zip_sequences(result.random_sample),
 59 |         default_show: 2,
 60 |       })
 61 |       // for reference
 62 |       // intervals = [(0, 1), (0, 0.5), (0.5, 0.9), (0.9, 0.99), (0.99, 0.999), (0.999, 1)]
 63 |       // saved_activations_by_interval = [neuron_record.random_sample] + neuron_record.random_sample_by_decile[:-1] + [neuron_record.top_activations]
 64 |       setData(all_sequences)
 65 |       setIsLoading(false)
 66 |     }
 67 |     fetchData()
 68 |   }, [activeNeuron])
 69 | 
 70 |   if (isLoading) {
 71 |     return (
 72 |       <div className="flex justify-center items-center h-64">
 73 |         <div className="w-8 h-8 border-4 border-gray-300 rounded-full animate-spin"></div>
 74 |         <div>loading top dataset examples</div>
 75 |       </div>
 76 |     )
 77 |   }
 78 | 
 79 |   // const activations = data.top_activations;
 80 |   const all_normalized_sequences = normalizeTokenActs(...data.map(({sequences}) => sequences))
 81 | 
 82 |   return (
 83 |     <div>
 84 |       <h2 className="text-2xl font-bold mb-4">Activations</h2>
 85 |       {
 86 |         data.map(({label, default_show}, idx) => {
 87 |           const n_show = showingMore[label] ? all_normalized_sequences[idx].length : default_show;
 88 |           return (
 89 |           <React.Fragment key={idx}>
 90 |           <h3 className="text-md font-bold">
 91 |             {label}
 92 |             <button className="ml-2 text-sm text-gray-500"
 93 |               onClick={() => setShowingMore({...showingMore, [label]: !showingMore[label]})}>
 94 |               {showingMore[label] ? 'show less' : 'show more'}
 95 |             </button>
 96 |           </h3>
 97 |           <HeatmapGrid allTokens={all_normalized_sequences[idx].slice(0, n_show)} />
 98 |           </React.Fragment>
 99 |           )
100 |         })
101 |       }
102 |     </div>
103 |   )
104 | }
105 | 


--------------------------------------------------------------------------------
/neuron-explainer/demos/generate_and_score_token_look_up_table_explanation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "%load_ext autoreload\n",
 10 |     "%autoreload 2"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "import os\n",
 20 |     "\n",
 21 |     "os.environ[\"OPENAI_API_KEY\"] = \"put-key-here\"\n",
 22 |     "\n",
 23 |     "from neuron_explainer.activations.activations import ActivationRecordSliceParams, load_neuron\n",
 24 |     "from neuron_explainer.activations.token_connections import load_token_lookup_table_connections_of_neuron\n",
 25 |     "from neuron_explainer.explanations.calibrated_simulator import UncalibratedNeuronSimulator\n",
 26 |     "from neuron_explainer.explanations.explainer import TokenSpaceRepresentationExplainer\n",
 27 |     "from neuron_explainer.explanations.prompt_builder import PromptFormat\n",
 28 |     "from neuron_explainer.explanations.scoring import simulate_and_score\n",
 29 |     "from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator\n",
 30 |     "\n",
 31 |     "EXPLAINER_MODEL_NAME = \"gpt-4\"\n",
 32 |     "SIMULATOR_MODEL_NAME = \"text-davinci-003\"\n",
 33 |     "\n",
 34 |     "\n",
 35 |     "# test_response = await client.make_request(prompt=\"test 123<|endofprompt|>\", max_tokens=2)\n",
 36 |     "# print(\"Response:\", test_response[\"choices\"][0][\"text\"])\n",
 37 |     "\n",
 38 |     "layer_index = 9\n",
 39 |     "neuron_index = 6236\n",
 40 |     "\n",
 41 |     "# Load a token lookup table.\n",
 42 |     "token_lookup_table = load_token_lookup_table_connections_of_neuron(layer_index, neuron_index)\n",
 43 |     "\n",
 44 |     "# Load a neuron record.\n",
 45 |     "neuron_record = load_neuron(layer_index, neuron_index)\n",
 46 |     "\n",
 47 |     "# Grab the activation records we'll need.\n",
 48 |     "slice_params = ActivationRecordSliceParams(n_examples_per_split=5)\n",
 49 |     "valid_activation_records = neuron_record.valid_activation_records(\n",
 50 |     "    activation_record_slice_params=slice_params\n",
 51 |     ")\n",
 52 |     "\n",
 53 |     "# Generate an explanation for the neuron.\n",
 54 |     "explainer = TokenSpaceRepresentationExplainer(\n",
 55 |     "    model_name=EXPLAINER_MODEL_NAME,\n",
 56 |     "    prompt_format=PromptFormat.HARMONY_V4,\n",
 57 |     "    max_concurrent=1,\n",
 58 |     ")\n",
 59 |     "explanations = await explainer.generate_explanations(\n",
 60 |     "    tokens=token_lookup_table.tokens,\n",
 61 |     "    num_samples=1,\n",
 62 |     ")\n",
 63 |     "assert len(explanations) == 1\n",
 64 |     "explanation = explanations[0]\n",
 65 |     "print(f\"{explanation=}\")\n",
 66 |     "\n",
 67 |     "# Simulate and score the explanation.\n",
 68 |     "simulator = UncalibratedNeuronSimulator(\n",
 69 |     "    ExplanationNeuronSimulator(\n",
 70 |     "        SIMULATOR_MODEL_NAME,\n",
 71 |     "        explanation,\n",
 72 |     "        max_concurrent=1,\n",
 73 |     "        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,\n",
 74 |     "    )\n",
 75 |     ")\n",
 76 |     "scored_simulation = await simulate_and_score(simulator, valid_activation_records)\n",
 77 |     "print(f\"score={scored_simulation.get_preferred_score():.2f}\")\n"
 78 |    ]
 79 |   }
 80 |  ],
 81 |  "metadata": {
 82 |   "kernelspec": {
 83 |    "display_name": "Python 3",
 84 |    "language": "python",
 85 |    "name": "python3"
 86 |   },
 87 |   "language_info": {
 88 |    "codemirror_mode": {
 89 |     "name": "ipython",
 90 |     "version": 3
 91 |    },
 92 |    "file_extension": ".py",
 93 |    "mimetype": "text/x-python",
 94 |    "name": "python",
 95 |    "nbconvert_exporter": "python",
 96 |    "pygments_lexer": "ipython3",
 97 |    "version": "3.9.8"
 98 |   },
 99 |   "vscode": {
100 |    "interpreter": {
101 |     "hash": "fd71fb58b1ad02dde67c8ac595a52586dd87d3465221a699fc288aa2c48d5565"
102 |    }
103 |   }
104 |  },
105 |  "nbformat": 4,
106 |  "nbformat_minor": 2
107 | }
108 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/panes/similarNeurons.jsx:
--------------------------------------------------------------------------------
  1 | import React, { useEffect, useState } from "react"
  2 | import _ from "lodash"
  3 | import { Link } from "react-router-dom"
  4 | 
  5 | import { get_explanations, get_top_neuron_connections } from "../interpAPI"
  6 | 
  7 | function NeuronInfo({ neuron, strength }) {
  8 |   const [info, setInfo] = useState(null)
  9 | 
 10 |   useEffect(() => {
 11 |     async function fetchInfo() {
 12 |       const result = (await get_explanations({
 13 |         layer: neuron.layer,
 14 |         neuron: neuron.neuron,
 15 |       }))
 16 |       setInfo(result)
 17 |     }
 18 | 
 19 |     if (!info) {
 20 |       fetchInfo()
 21 |     }
 22 |   }, [])
 23 | 
 24 |   if (!info) {
 25 |     return (
 26 |       <div className="m-4 flex justify-center items-center h-32">
 27 |         <p className="text-gray-500 mb-2">
 28 |           Loading neuron {neuron.layer}:{neuron.neuron}...
 29 |         </p>
 30 |         <div className="w-8 h-8 border-4 border-gray-300 rounded-full animate-spin"></div>
 31 |       </div>
 32 |     )
 33 |   }
 34 | 
 35 |   return (
 36 |     <div>
 37 |       <div className="overflow-hidden mb-4 border rounded-lg bg-white shadow">
 38 |         <h3
 39 |           className="px-4 text-lg pb-0 mb-0 font-bold">
 40 |           <Link to={`/layers/${neuron.layer}/neurons/${neuron.neuron}`}>
 41 |           Neuron {neuron.layer}:{neuron.neuron}
 42 |           </Link>
 43 |         </h3>
 44 |         <div className="text-sm px-4 py-2">
 45 |         Connection strength: {strength.toFixed(2)}
 46 |         </div>
 47 |         <blockquote className="p-1 px-4 mx-1 my-0">
 48 |           {info.scored_explanations.map((explanation, i) => (
 49 |             <React.Fragment key={i}>
 50 |             <p className="py-1">
 51 |               <em>{explanation.explanation}</em>
 52 |             </p>
 53 |             <p className="py-1">
 54 |               <em>score: {explanation.scored_simulation.ev_correlation_score.toFixed(2)}</em>
 55 |             </p>
 56 |             </React.Fragment>
 57 |           ))}
 58 |         </blockquote>
 59 |       </div>
 60 |     </div>
 61 |   )
 62 | }
 63 | 
 64 | export default function SimilarNeurons({ activeNeuron: neuron }) {
 65 |   const [similarNeurons, setSimilarNeurons] = useState([])
 66 |   const [isLoading, setIsLoading] = useState(true)
 67 | 
 68 |   useEffect(() => {
 69 |     async function fetchSimilarNeurons() {
 70 |       const result = await get_top_neuron_connections(neuron)
 71 |       setSimilarNeurons(result)
 72 |       setIsLoading(false)
 73 |     }
 74 | 
 75 |     fetchSimilarNeurons()
 76 |   }, [neuron])
 77 | 
 78 |   if (isLoading) {
 79 |     return (
 80 |       <div className="flex justify-center items-center h-64">
 81 |         <div className="w-8 h-8 border-4 border-gray-300 rounded-full animate-spin"></div>
 82 |       </div>
 83 |     )
 84 |   }
 85 | 
 86 |   const n_show = 3;
 87 |   return (
 88 |     <div className="min-w-0 flex-1">
 89 |       <h2 className="text-2xl font-bold mb-4">Related neurons</h2>
 90 |       <div className="full-width mt-6">
 91 |         <div className="flex flow-row justify-center align-self-center">
 92 |           {
 93 |           similarNeurons.input ?
 94 |           <div style={{ width: 450 }} className="mx-2">
 95 |             <h5>Upstream</h5>
 96 |             <div className="flex flex-row flex-wrap">
 97 |               {similarNeurons.input.slice(0, n_show).map(([layer, neuron, strength]) => (
 98 |                 <NeuronInfo key={layer + neuron} neuron={{ layer, neuron }} strength={strength} />
 99 |               ))}
100 |             </div>
101 |           </div> : null
102 |           }
103 |           {
104 |           similarNeurons.output ?
105 |           <div style={{ width: 450 }} className="mx-2">
106 |             <h5>Downstream</h5>
107 |             <div className="flex flex-row flex-wrap">
108 |               {similarNeurons.output.slice(0, n_show).map(([layer, neuron, strength]) => (
109 |                 <NeuronInfo key={layer + neuron} neuron={{ layer, neuron }} strength={strength}/>
110 |               ))}
111 |             </div>
112 |           </div> : null
113 |           }
114 |         </div>
115 |       </div>
116 |     </div>
117 |   )
118 | }
119 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/simulationHeatmap.tsx:
--------------------------------------------------------------------------------
 1 | import React, { useState } from 'react';
 2 | 
 3 | import { interpolateColor, Color, getInterpolatedColor, DEFAULT_COLORS, DEFAULT_BOUNDARIES, TokenAndActivation } from './types'
 4 | 
 5 | type Props = {
 6 |   sequences: TokenAndActivation[][], 
 7 |   simulated_sequences: TokenAndActivation[][], 
 8 |   overlay_activations: boolean,
 9 |   colors?: Color[], 
10 |   boundaries?: number[],
11 | }
12 | export default function SimulationSequences({ sequences, simulated_sequences, overlay_activations, colors = DEFAULT_COLORS, boundaries = DEFAULT_BOUNDARIES }: Props) {
13 |   return <>
14 |     {
15 |       sequences.map((tokens, i) => {
16 |         let simulated_tokens = simulated_sequences[i];
17 |         if (overlay_activations) {
18 |           return (
19 |             <div className="block my-3 border p-3 m-2 rounded-md" style={{ width: '100%' /*,whiteSpace: 'nowrap', overflowX: 'auto' */ }} key={i}>
20 |               {tokens.map(({ token, activation, normalized_activation }, j) => {
21 |                 const { token: simulated_token, activation: simulated_activation, normalized_activation: simulated_normalized_activation } = simulated_tokens[j];
22 |                 if (simulated_token !== token) {
23 |                   throw new Error('simulated tokens not matching')
24 |                 }
25 |                 const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
26 |                 const simcolor = getInterpolatedColor(colors, boundaries, simulated_normalized_activation || simulated_activation);
27 | 
28 |                 return <div style={{ display: 'inline-block', whiteSpace: 'pre' }} key={j}>
29 |                   <div style={{ display: 'flex', flexDirection: 'column' }}>
30 |                     <span
31 |                       title={`Activation: ${activation.toFixed(2)}`}
32 |                       style={{
33 |                         transition: "500ms ease-in all",
34 |                         background: `rgba(${color.r}, ${color.g}, ${color.b}, 0.5)`,
35 |                       }}
36 |                     >{token}</span>
37 |                     <span
38 |                       title={`Simulation: ${simulated_activation.toFixed(2)}`}
39 |                       style={{
40 |                         transition: "500ms ease-in all",
41 |                         background: `rgba(${simcolor.r}, ${simcolor.g}, ${simcolor.b}, 0.5)`,
42 |                       }}
43 |                     >{token}</span>
44 |                   </div>
45 |                 </div>
46 |               })}
47 |             </div>
48 |           )
49 |         } else {
50 |           return (
51 |             <div className="block my-3 border p-3 m-2 rounded-md" style={{ width: '100%' /*,whiteSpace: 'nowrap', overflowX: 'auto' */ }} key={i}>
52 |               <div>
53 |                 <span
54 |                   style={{
55 |                     fontSize: '0.7em',
56 |                     fontWeight: 'bold',
57 |                   }}
58 |                 >Real activations:</span><br />
59 |                 {tokens.map(({ token, activation, normalized_activation }, j) => {
60 |                   const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
61 |                   return <span key={j}
62 |                     title={`Activation: ${activation.toFixed(2)}`}
63 |                     style={{
64 |                       transition: "500ms ease-in all",
65 |                       background: `rgba(${color.r}, ${color.g}, ${color.b}, 0.5)`,
66 |                     }}
67 |                   >{token}</span>
68 |                 })}
69 |               </div>
70 |               <hr style={{ margin: '5px 0 5px 0' }} />
71 |               <div>
72 |                 <span
73 |                   style={{
74 |                     fontSize: '0.7em',
75 |                     fontWeight: 'bold',
76 |                   }}
77 |                 >Simulated activations:</span><br />
78 |                 {simulated_tokens.map(({ token, activation, normalized_activation }, j) => {
79 |                   const color = getInterpolatedColor(colors, boundaries, normalized_activation || activation);
80 |                   return <span key={j}
81 |                     title={`Activation: ${activation.toFixed(2)}`}
82 |                     style={{
83 |                       transition: "500ms ease-in all",
84 |                       background: `rgba(${color.r}, ${color.g}, ${color.b}, 0.5)`,
85 |                     }}
86 |                   >{token}</span>
87 |                 })}
88 |               </div>
89 |             </div>
90 |           )
91 |         }
92 |       })
93 |     }
94 |     </>
95 | }
96 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/panes/topTokens.jsx:
--------------------------------------------------------------------------------
  1 | import React, { useState, useEffect } from "react"
  2 | import { get_top_tokens } from "../interpAPI"
  3 | 
  4 | 
  5 | const TokenDisplay = ({ activeNeuron }) => {
  6 |   const [isLoading, setIsLoading] = useState(true)
  7 |   const [data, setData] = useState(null)
  8 | 
  9 |   const loadTokens = async () => {
 10 |     setIsLoading(true)
 11 |     const weightStrengths = await get_top_tokens(activeNeuron, 'weight')
 12 |     const activationStrengths = await get_top_tokens(activeNeuron, 'activation')
 13 | 
 14 |     const data = {
 15 |       activeNeuron,
 16 |       weightStrengths,
 17 |       activationStrengths,
 18 |     }
 19 | 
 20 |     setData(data)
 21 |     setIsLoading(false)
 22 |   }
 23 | 
 24 |   useEffect(() => {
 25 |     if (!data) {
 26 |       loadTokens()
 27 |     }
 28 |   }, [])
 29 | 
 30 | 
 31 |   return (
 32 |     <div className="min-w-0 flex-1">
 33 |       <h2 className="text-2xl font-bold mb-4">Related tokens</h2>
 34 |       {isLoading ? (
 35 |         <div className="flex justify-center items-center">
 36 |           <div className="loader">loading tokens</div>
 37 |         </div>
 38 |       ) : (
 39 |         <>
 40 |           <h3 className="text-md font-bold mb-4">Mean-activation-based</h3>
 41 |           <div className="mt-2 text-sm text-gray-700">
 42 |             {data.activationStrengths.tokens.map((token, idx) => {
 43 |               return (
 44 |                 data.activationStrengths.average_activations[idx] === null ? null :
 45 |                 <span
 46 |                   key={idx}
 47 |                   title={`Strength: ${data.activationStrengths.average_activations[idx].toFixed(2)}`}
 48 |                   className="inline-flex m-1 items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-800"
 49 |                 >
 50 |                   {token}
 51 |                 </span>
 52 |               )
 53 |             })}
 54 |           </div>
 55 |           <h3 className="text-md font-bold mb-4">Weight-based</h3>
 56 |           <div className="mt-2 text-sm text-gray-700">
 57 |             <p>Input tokens:</p>
 58 |             {data.weightStrengths.input_positive.tokens.slice(0, 20).map((token, idx) => {
 59 |               return (
 60 |                 data.weightStrengths.input_positive.strengths[idx] === null ? null :
 61 |                 <span
 62 |                   key={idx}
 63 |                   title={`Strength: ${data.weightStrengths.input_positive.strengths[idx].toFixed(2)}`}
 64 |                   className="inline-flex m-1 items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-800"
 65 |                 >
 66 |                   {token}
 67 |                 </span>
 68 |               )
 69 |             })}
 70 |           </div>
 71 |           {
 72 |           <div className="mt-2 text-sm text-gray-700">
 73 |             <p>Input tokens negative:</p>
 74 |             {data.weightStrengths.input_negative.tokens.slice(0, 20).map((token, idx) => {
 75 |               return (
 76 |                 data.weightStrengths.input_negative.strengths[idx] === null ? null :
 77 |                 <span
 78 |                   key={idx}
 79 |                   title={`Strength: ${data.weightStrengths.input_negative.strengths[idx].toFixed(2)}`}
 80 |                   className="inline-flex m-1 items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-800 text-red-800"
 81 |                 >
 82 |                   {token}
 83 |                 </span>
 84 |               )
 85 |             })}
 86 |           </div>
 87 |           }
 88 |           <div className="mt-2 text-sm text-gray-700">
 89 |             <p>Output tokens:</p>
 90 |             {data.weightStrengths.output_positive.tokens.slice(0, 20).map((token, idx) => {
 91 |               return (
 92 |                 data.weightStrengths.output_positive.strengths[idx] === null ? null :
 93 |                 <span
 94 |                   key={idx}
 95 |                   title={`Strength: ${data.weightStrengths.output_positive.strengths[idx].toFixed(2)}`}
 96 |                   className="inline-flex m-1 items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-800"
 97 |                 >
 98 |                   {token}
 99 |                 </span>
100 |               )
101 |             })}
102 |           </div>
103 |           {
104 |           <div className="mt-2 text-sm text-gray-700">
105 |             <p>Output tokens negative:</p>
106 |             {data.weightStrengths.output_negative.tokens.slice(0, 20).map((token, idx) => {
107 |               return (
108 |                 <span
109 |                   key={idx}
110 |                   title={`Strength: ${data.weightStrengths.output_negative.strengths[idx].toFixed(2)}`}
111 |                   className="inline-flex m-1 items-center px-2.5 py-0.5 rounded-full text-xs font-medium bg-gray-100 text-gray-800 text-red-800"
112 |                 >
113 |                   {token}
114 |                 </span>
115 |               )
116 |             })}
117 |           </div>
118 |           }
119 |         </>
120 |       )}
121 |     </div>
122 |   )
123 | }
124 | export default TokenDisplay
125 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/prompt_builder.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | from enum import Enum
  4 | from typing import TypedDict, Union
  5 | 
  6 | import tiktoken
  7 | 
  8 | HarmonyMessage = TypedDict(
  9 |     "HarmonyMessage",
 10 |     {
 11 |         "role": str,
 12 |         "content": str,
 13 |     },
 14 | )
 15 | 
 16 | 
 17 | class PromptFormat(str, Enum):
 18 |     """
 19 |     Different ways of formatting the components of a prompt into the format accepted by the relevant
 20 |     API server endpoint.
 21 |     """
 22 | 
 23 |     NONE = "none"
 24 |     """Suitable for use with models that don't use special tokens for instructions."""
 25 |     INSTRUCTION_FOLLOWING = "instruction_following"
 26 |     """Suitable for IF models that use <|endofprompt|>."""
 27 |     HARMONY_V4 = "harmony_v4"
 28 |     """
 29 |     Suitable for Harmony models that use a structured turn-taking role+content format. Generates a
 30 |     list of HarmonyMessage dicts that can be sent to the /chat/completions endpoint.
 31 |     """
 32 | 
 33 |     @classmethod
 34 |     def from_string(cls, s: str) -> PromptFormat:
 35 |         for prompt_format in cls:
 36 |             if prompt_format.value == s:
 37 |                 return prompt_format
 38 |         raise ValueError(f"{s} is not a valid PromptFormat")
 39 | 
 40 | 
class Role(str, Enum):
    """
    Speaker roles that a prompt message can carry.

    See https://platform.openai.com/docs/guides/chat
    """

    SYSTEM = "system"
    USER = "user"
    ASSISTANT = "assistant"
 47 | 
 48 | 
 49 | class PromptBuilder:
 50 |     """Class for accumulating components of a prompt and then formatting them into an output."""
 51 | 
 52 |     def __init__(self) -> None:
 53 |         self._messages: list[HarmonyMessage] = []
 54 | 
 55 |     def add_message(self, role: Role, message: str) -> None:
 56 |         self._messages.append(HarmonyMessage(role=role, content=message))
 57 | 
 58 |     def prompt_length_in_tokens(self, prompt_format: PromptFormat) -> int:
 59 |         # TODO(sbills): Make the model/encoding configurable. This implementation assumes GPT-4.
 60 |         encoding = tiktoken.get_encoding("cl100k_base")
 61 |         if prompt_format == PromptFormat.HARMONY_V4:
 62 |             # Approximately-correct implementation adapted from this documentation:
 63 |             # https://platform.openai.com/docs/guides/chat/introduction
 64 |             num_tokens = 0
 65 |             for message in self._messages:
 66 |                 num_tokens += (
 67 |                     4  # every message follows <|im_start|>{role/name}\n{content}<|im_end|>\n
 68 |                 )
 69 |                 num_tokens += len(encoding.encode(message["content"], allowed_special="all"))
 70 |             num_tokens += 2  # every reply is primed with <|im_start|>assistant
 71 |             return num_tokens
 72 |         else:
 73 |             prompt_str = self.build(prompt_format)
 74 |             assert isinstance(prompt_str, str)
 75 |             return len(encoding.encode(prompt_str, allowed_special="all"))
 76 | 
 77 |     def build(
 78 |         self, prompt_format: PromptFormat, *, allow_extra_system_messages: bool = False
 79 |     ) -> Union[str, list[HarmonyMessage]]:
 80 |         """
 81 |         Validates the messages added so far (reasonable alternation of assistant vs. user, etc.)
 82 |         and returns either a regular string (maybe with <|endofprompt|> tokens) or a list of
 83 |         HarmonyMessages suitable for use with the /chat/completions endpoint.
 84 | 
 85 |         The `allow_extra_system_messages` parameter allows the caller to specify that the prompt
 86 |         should be allowed to contain system messages after the very first one.
 87 |         """
 88 |         # Create a deep copy of the messages so we can modify it and so that the caller can't
 89 |         # modify the internal state of this object.
 90 |         messages = [message.copy() for message in self._messages]
 91 | 
 92 |         expected_next_role = Role.SYSTEM
 93 |         for message in messages:
 94 |             role = message["role"]
 95 |             assert role == expected_next_role or (
 96 |                 allow_extra_system_messages and role == Role.SYSTEM
 97 |             ), f"Expected message from {expected_next_role} but got message from {role}"
 98 |             if role == Role.SYSTEM:
 99 |                 expected_next_role = Role.USER
100 |             elif role == Role.USER:
101 |                 expected_next_role = Role.ASSISTANT
102 |             elif role == Role.ASSISTANT:
103 |                 expected_next_role = Role.USER
104 | 
105 |         if prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
106 |             last_user_message = None
107 |             for message in messages:
108 |                 if message["role"] == Role.USER:
109 |                     last_user_message = message
110 |             assert last_user_message is not None
111 |             last_user_message["content"] += "<|endofprompt|>"
112 | 
113 |         if prompt_format == PromptFormat.HARMONY_V4:
114 |             return messages
115 |         elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
116 |             return "".join(message["content"] for message in messages)
117 |         else:
118 |             raise ValueError(f"Unknown prompt format: {prompt_format}")
119 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/activation_records.py:
--------------------------------------------------------------------------------
  1 | """Utilities for formatting activation records into prompts."""
  2 | 
  3 | import math
  4 | from typing import Optional, Sequence
  5 | 
  6 | from neuron_explainer.activations.activations import ActivationRecord
  7 | 
  8 | UNKNOWN_ACTIVATION_STRING = "unknown"
  9 | 
 10 | 
 11 | def relu(x: float) -> float:
 12 |     return max(0.0, x)
 13 | 
 14 | 
 15 | def calculate_max_activation(activation_records: Sequence[ActivationRecord]) -> float:
 16 |     """Return the maximum activation value of the neuron across all the activation records."""
 17 |     flattened = [
 18 |         # Relu is used to assume any values less than 0 are indicating the neuron is in the resting
 19 |         # state. This is a simplifying assumption that works with relu/gelu.
 20 |         max(relu(x) for x in activation_record.activations)
 21 |         for activation_record in activation_records
 22 |     ]
 23 |     return max(flattened)
 24 | 
 25 | 
 26 | def normalize_activations(activation_record: list[float], max_activation: float) -> list[int]:
 27 |     """Convert raw neuron activations to integers on the range [0, 10]."""
 28 |     if max_activation <= 0:
 29 |         return [0 for x in activation_record]
 30 |     # Relu is used to assume any values less than 0 are indicating the neuron is in the resting
 31 |     # state. This is a simplifying assumption that works with relu/gelu.
 32 |     return [min(10, math.floor(10 * relu(x) / max_activation)) for x in activation_record]
 33 | 
 34 | 
 35 | def _format_activation_record(
 36 |     activation_record: ActivationRecord,
 37 |     max_activation: float,
 38 |     omit_zeros: bool,
 39 |     hide_activations: bool = False,
 40 |     start_index: int = 0,
 41 | ) -> str:
 42 |     """Format neuron activations into a string, suitable for use in prompts."""
 43 |     tokens = activation_record.tokens
 44 |     normalized_activations = normalize_activations(activation_record.activations, max_activation)
 45 |     if omit_zeros:
 46 |         assert (not hide_activations) and start_index == 0, "Can't hide activations and omit zeros"
 47 |         tokens = [
 48 |             token for token, activation in zip(tokens, normalized_activations) if activation > 0
 49 |         ]
 50 |         normalized_activations = [x for x in normalized_activations if x > 0]
 51 | 
 52 |     entries = []
 53 |     assert len(tokens) == len(normalized_activations)
 54 |     for index, token, activation in zip(range(len(tokens)), tokens, normalized_activations):
 55 |         activation_string = str(int(activation))
 56 |         if hide_activations or index < start_index:
 57 |             activation_string = UNKNOWN_ACTIVATION_STRING
 58 |         entries.append(f"{token}\t{activation_string}")
 59 |     return "\n".join(entries)
 60 | 
 61 | 
 62 | def format_activation_records(
 63 |     activation_records: Sequence[ActivationRecord],
 64 |     max_activation: float,
 65 |     *,
 66 |     omit_zeros: bool = False,
 67 |     start_indices: Optional[list[int]] = None,
 68 |     hide_activations: bool = False,
 69 | ) -> str:
 70 |     """Format a list of activation records into a string."""
 71 |     return (
 72 |         "\n<start>\n"
 73 |         + "\n<end>\n<start>\n".join(
 74 |             [
 75 |                 _format_activation_record(
 76 |                     activation_record,
 77 |                     max_activation,
 78 |                     omit_zeros=omit_zeros,
 79 |                     hide_activations=hide_activations,
 80 |                     start_index=0 if start_indices is None else start_indices[i],
 81 |                 )
 82 |                 for i, activation_record in enumerate(activation_records)
 83 |             ]
 84 |         )
 85 |         + "\n<end>\n"
 86 |     )
 87 | 
 88 | 
 89 | def _format_tokens_for_simulation(tokens: Sequence[str]) -> str:
 90 |     """
 91 |     Format tokens into a string with each token marked as having an "unknown" activation, suitable
 92 |     for use in prompts.
 93 |     """
 94 |     entries = []
 95 |     for token in tokens:
 96 |         entries.append(f"{token}\t{UNKNOWN_ACTIVATION_STRING}")
 97 |     return "\n".join(entries)
 98 | 
 99 | 
def format_sequences_for_simulation(
    all_tokens: Sequence[Sequence[str]],
) -> str:
    """
    Render a list of token lists as one string, each list wrapped in <start>/<end> lines and
    every token marked as having an "unknown" activation, suitable for use in prompts.
    """
    rendered_sequences = [_format_tokens_for_simulation(tokens) for tokens in all_tokens]
    joined = "\n<end>\n<start>\n".join(rendered_sequences)
    return "\n<start>\n" + joined + "\n<end>\n"
114 | 
115 | 
def non_zero_activation_proportion(
    activation_records: Sequence[ActivationRecord], max_activation: float
) -> float:
    """Return the fraction of activations whose normalized value is not zero."""
    total_activations_count = sum(len(record.activations) for record in activation_records)

    non_zero_activations_count = 0
    for record in activation_records:
        normalized = normalize_activations(record.activations, max_activation)
        non_zero_activations_count += sum(1 for value in normalized if value != 0)

    return non_zero_activations_count / total_activations_count
131 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/interpAPI.ts:
--------------------------------------------------------------------------------
  1 | import {Neuron} from './types';
  2 | import {memoizeAsync} from "./utils"
  3 | 
  4 | export const load_file_no_cache = async(path: string) => {
  5 |   const data = {
  6 |     path: path
  7 |   }
  8 |   const url = new URL("/load_az", window.location.href)
  9 |   url.port = '8000';
 10 |   return await (
 11 |     await fetch(url, {
 12 |       method: "POST", // or 'PUT'
 13 |       headers: {
 14 |         "Content-Type": "application/json",
 15 |       },
 16 |       body: JSON.stringify(data),
 17 |     })
 18 |   ).json()
 19 |   
 20 | }
 21 | 
 22 | export  const load_file_az = async(path: string) => {
 23 |   const res = (
 24 |     await fetch(path, {
 25 |       method: "GET",
 26 |       mode: "cors",
 27 |       headers: {
 28 |         "Content-Type": "application/json",
 29 |       },
 30 |     })
 31 |   )
 32 |   if (!res.ok) {
 33 |     console.error(`HTTP error: ${res.status} - ${res.statusText}`);
 34 |     return;
 35 |   }
 36 |   return await res.json()
 37 | }
 38 | 
 39 | 
 40 | // export const load_file = memoizeAsync('load_file', load_file_no_cache)
 41 | export  const load_file = window.location.host.indexOf('localhost:') === -1 ? load_file_az : load_file_no_cache;
 42 | 
 43 | 
// Base URLs of the public datasets. Each loader below appends
// `/{layer}/{neuron}` plus a file extension to one of these.

// Base URL for per-neuron records (see get_neuron_record).
// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation)
// const NEURON_RECORDS_PATH = "az://oaisbills/rcall/oss/migrated_make_crow_datasets/gpt2_xl_n_50000_64_token/neurons"
const NEURON_RECORDS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations"

// Base URL for per-neuron explanations (see get_explanations).
// # (derived from az://oaialignment/datasets/interp/gpt2_xl/v1/webtext1/len_nomax/n_50000/mlp_post_act/ranked_by_max_activation/neurons/explanations/canonical-run-v1)
// const EXPLANATIONS_PATH = "az://oaisbills/rcall/oss/migrated_explanation_datasets/canonical_gpt2_xl_all_neurons"
const EXPLANATIONS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/explanations"

// Base URL used by get_top_tokens for the 'weight' type.
// weight-based
// const WHOLE_LAYER_WEIGHT_TOKENS_PATH = "az://oaidan/rcall/data/interpretability/connections/gpt2-xl/mlp/unnorm_token_representations_uncommon_vanilla"
// const WEIGHT_TOKENS_PATH = "az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/weight-based"
const WEIGHT_TOKENS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/weight-based"
// Base URL used by get_top_tokens for the 'activation' type.
// lookup table
// const WHOLE_LAYER_ACTIVATION_TOKENS_PATH = "az://oaidan/rcall/data/interpretability/connections/gpt2_xl/mlp/unnorm_token_representations_vanilla_and_common_in_colangv2_unigram"
// const ACTIVATION_TOKENS_PATH = "az://oaijeffwu/jeffwu-data/interpretability/neuron-connections/gpt2-xl/lookup-table"
const ACTIVATION_TOKENS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-tokens/activation-based"

// Base URL for neuron-to-neuron connections (see get_top_neuron_connections).
// const CONNECTIONS_PATH = "az://oaialignment/datasets/interp/connections/gpt2/neuron_space/incl_attn_False"
const CONNECTIONS_PATH = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/related-neurons/weight-based"
 63 | 
 64 | 
 65 | export const get_explanations = async (activeNeuron: Neuron) => {
 66 |   const result = await load_file(`${EXPLANATIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.jsonl`)
 67 |   return result
 68 | }
 69 | 
 70 | export const get_top_tokens = async (activeNeuron: Neuron, weightType: string) => {
 71 |   let TOKENS_PATH;
 72 |   if (weightType === 'weight') {
 73 |     TOKENS_PATH = WEIGHT_TOKENS_PATH;
 74 |   } else if (weightType === 'activation') {
 75 |     TOKENS_PATH = ACTIVATION_TOKENS_PATH;
 76 |   } else {
 77 |     throw new Error(`Invalid weightType: ${weightType}`)
 78 |   }
 79 |   const result = await load_file(`${TOKENS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)
 80 |   return result
 81 |   // const result = await load_file_no_cache(`${ORIG_TOKENS_PATH}/${activeNeuron.layer}.json`)
 82 |   // return result.neuron_summaries[activeNeuron.neuron]
 83 | }
 84 | 
 85 | export const get_top_neuron_connections = async (activeNeuron: Neuron) => {
 86 |     const result = await load_file(`${CONNECTIONS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`)
 87 | 
 88 |     const res: {[key: string]: [number, number]} = {};
 89 |     ["input", "output"].forEach((direction) => {
 90 |         const sign = "positive"  // "negative"
 91 |         const weight_name: string = {output: "c_proj", input: "c_fc"}[direction] as string;
 92 |         const res_for_dir = result[weight_name];
 93 |         if (res_for_dir === null) {
 94 |           return
 95 |         }
 96 |         // let key = 'top_negative_neurons'
 97 |         const top_neuron_strs = res_for_dir[`top_${sign}_neurons`]  // {layer}_{neuron} strings for each top-connected neuron
 98 |         const top_weights = res_for_dir[`top_${sign}_weights`]
 99 |         const top_layer_neuron_tuples = top_neuron_strs.map((neuron_str: string, i: number) => {
100 |             const [layer, neuron] = neuron_str.split("_").map((x: string) => parseInt(x))
101 |             return [layer, neuron, top_weights[i]] as [number, number, number]
102 |         })
103 |         res[direction] = top_layer_neuron_tuples.slice(0, 10)
104 |     })
105 | 
106 |     return res
107 | }
108 | 
// Loads the neuron record file (`{layer}/{neuron}.json`) for the given neuron.
export const get_neuron_record = async(activeNeuron: Neuron) => {
  const recordUrl = `${NEURON_RECORDS_PATH}/${activeNeuron.layer}/${activeNeuron.neuron}.json`
  return await load_file(recordUrl)
}
113 | 
114 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/token_space_few_shot_examples.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from enum import Enum
  3 | from typing import List
  4 | 
  5 | from neuron_explainer.fast_dataclasses import FastDataclass
  6 | 
  7 | 
@dataclass
class Example(FastDataclass):
    """
    A few-shot example: a list of token strings corresponding to the top token space inputs of a
    neuron, together with a string explanation of the neuron's behavior on these tokens.
    """

    # Token strings, kept exactly as given (entries in this file include leading spaces).
    tokens: List[str]
    # Explanation text for the tokens above.
    explanation: str
 17 | 
 18 | 
 19 | class TokenSpaceFewShotExampleSet(Enum):
 20 |     """Determines which few-shot examples to use when sampling explanations."""
 21 | 
 22 |     ORIGINAL = "original"
 23 |     TEST = "test"
 24 | 
 25 |     def get_examples(self) -> list[Example]:
 26 |         """Returns regular examples for use in a few-shot prompt."""
 27 |         if self is TokenSpaceFewShotExampleSet.ORIGINAL:
 28 |             return ORIGINAL_EXAMPLES
 29 |         elif self is TokenSpaceFewShotExampleSet.TEST:
 30 |             return TEST_EXAMPLES
 31 |         else:
 32 |             raise ValueError(f"Unhandled example set: {self}")
 33 | 
 34 | 
# Few-shot examples returned by TokenSpaceFewShotExampleSet.ORIGINAL.
# Each entry pairs a list of token strings with an explanation; every explanation
# string here begins with a single space.
ORIGINAL_EXAMPLES = [
    # Explained as: adjectives about realness / physical properties.
    Example(
        tokens=[
            "actual",
            " literal",
            " actual",
            " hyper",
            " real",
            " EX",
            " Real",
            "^",
            "Full",
            " full",
            " optical",
            " style",
            "any",
            "ALL",
            "extreme",
            " miniature",
            " Optical",
            " faint",
            "~",
            " Physical",
            " REAL",
            "*",
            "virtual",
            "TYPE",
            " technical",
            "otally",
            " physic",
            "Type",
            "<",
            "images",
            "atic",
            " sheer",
            " Style",
            " partial",
            " natural",
            "Hyper",
            " Any",
            " theoretical",
            "|",
            " ultimate",
            "oing",
            " constant",
            "ANY",
            "antically",
            "ishly",
            " ex",
            " visual",
            "special",
            "omorphic",
            "visual",
        ],
        explanation=" adjectives related to being real, or to physical properties and evidence",
    ),
    # Explained as: physical medical conditions.
    Example(
        tokens=[
            "cephal",
            "aeus",
            " coma",
            "bered",
            "abetes",
            "inflamm",
            "rugged",
            "alysed",
            "azine",
            "hered",
            "cells",
            "aneously",
            "fml",
            "igm",
            "culosis",
            "iani",
            "CTV",
            "disabled",
            "heric",
            "ulo",
            "geoning",
            "awi",
            "translation",
            "iral",
            "govtrack",
            "mson",
            "cloth",
            "nesota",
            " Dise",
            " Lyme",
            " dementia",
            "agn",
            " reversible",
            " susceptibility",
            "esthesia",
            "orf",
            " inflamm",
            " Obesity",
            " tox",
            " Disorders",
            "uberty",
            "blind",
            "ALTH",
            "avier",
            " Immunity",
            " Hurt",
            "ulet",
            "ueless",
            " sluggish",
            "rosis",
        ],
        explanation=" words related to physical medical conditions",
    ),
    # Explained as: time and dates.
    Example(
        tokens=[
            " January",
            "terday",
            "cember",
            " April",
            " July",
            "September",
            "December",
            "Thursday",
            "quished",
            "November",
            "Tuesday",
            "uesday",
            " Sept",
            "ruary",
            " March",
            ";;;;;;;;;;;;",
            " Monday",
            "Wednesday",
            " Saturday",
            " Wednesday",
            "Reloaded",
            "aturday",
            " August",
            "Feb",
            "Sunday",
            "Reviewed",
            "uggest",
            " Dhabi",
            "ACTED",
            "tten",
            "Year",
            "August",
            "alogue",
            "MX",
            " Janeiro",
            "yss",
            " Leilan",
            " Fiscal",
            " referen",
            "semb",
            "eele",
            "wcs",
            "detail",
            "ertation",
            " Reborn",
            " Sunday",
            "itially",
            "aturdays",
            " Dise",
            "essage",
        ],
        explanation=" nouns related to time and dates",
    ),
]
202 | 
# Minimal placeholder example set returned by TokenSpaceFewShotExampleSet.TEST.
TEST_EXAMPLES = [
    Example(
        tokens=[
            "these",
            " are",
            " tokens",
        ],
        explanation=" this is a test explanation",
    ),
]
213 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Automated interpretability
 2 | 
 3 | ## Code and tools
 4 | 
 5 | This repository contains code and tools associated with the [Language models can explain neurons in
 6 | language models](https://openaipublic.blob.core.windows.net/neuron-explainer/paper/index.html) paper, specifically:
 7 | 
 8 | * Code for automatically generating, simulating, and scoring explanations of neuron behavior using
 9 | the methodology described in the paper. See the
10 | [neuron-explainer README](neuron-explainer/README.md) for more information.
11 | 
Note: if you run into errors of the form "Error: Could not find any credentials that grant access to storage account: 'openaipublic' and container: 'neuron-explainer'", you might be able to fix this by signing up for an Azure account and specifying the credentials as described in the error message.
13 | 
14 | * A tool for viewing neuron activations and explanations, accessible
15 | [here](https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html). See
16 | the [neuron-viewer README](neuron-viewer/README.md) for more information.
17 | 
18 | ## Public datasets
19 | 
20 | Together with this code, we're also releasing public datasets of GPT-2 XL neurons and explanations.
21 | Here's an overview of those datasets.  
22 | 
23 | * Neuron activations: `az://openaipublic/neuron-explainer/data/collated-activations/{layer_index}/{neuron_index}.json`
    - Tokenized text sequences and their activations for the neuron. We
    provide multiple sets of tokens and activations: top-activating ones, random
    samples from several quantiles, and a completely random sample. We also provide
    some basic statistics for the activations.
28 |     - Each file contains a JSON-formatted
29 |     [`NeuronRecord`](neuron-explainer/neuron_explainer/activations/activations.py#L89) dataclass.
30 | * Neuron explanations: `az://openaipublic/neuron-explainer/data/explanations/{layer_index}/{neuron_index}.jsonl`
31 |     - Scored model-generated explanations of the behavior of the neuron, including simulation results.
32 |     - Each file contains a JSON-formatted
33 |     [`NeuronSimulationResults`](neuron-explainer/neuron_explainer/explanations/explanations.py#L146)
34 |     dataclass.
35 | * Related neurons: `az://openaipublic/neuron-explainer/data/related-neurons/weight-based/{layer_index}/{neuron_index}.json`
36 |     - Lists of the upstream and downstream neurons with the most positive and negative connections (see below for definition).
37 |     - Each file contains a JSON-formatted dataclass whose definition is not included in this repo.
38 | * Tokens with high average activations:
39 | `az://openaipublic/neuron-explainer/data/related-tokens/activation-based/{layer_index}/{neuron_index}.json`
40 |     - Lists of tokens with the highest average activations for individual neurons, and their average activations.
41 |     - Each file contains a JSON-formatted [`TokenLookupTableSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L36)
42 |     dataclass.
43 | * Tokens with large inbound and outbound weights:
44 | `az://openaipublic/neuron-explainer/data/related-tokens/weight-based/{layer_index}/{neuron_index}.json`
45 |     - List of the most-positive and most-negative input and output tokens for individual neurons,
46 |     as well as the associated weight (see below for definition). 
47 |     - Each file contains a JSON-formatted [`WeightBasedSummaryOfNeuron`](neuron-explainer/neuron_explainer/activations/token_connections.py#L17)
48 |     dataclass.
49 | 
50 | Update (July 5, 2023):
51 | We also released a set of explanations for GPT-2 Small. The methodology is slightly different from the methodology used for GPT-2 XL so the results aren't directly comparable.
52 | * Neuron activations: `az://openaipublic/neuron-explainer/gpt2_small_data/collated-activations/{layer_index}/{neuron_index}.json`
53 | * Neuron explanations: `az://openaipublic/neuron-explainer/gpt2_small_data/explanations/{layer_index}/{neuron_index}.jsonl`
54 | 
 55 | Update (August 30, 2023): We recently discovered a bug in how we performed inference on the GPT-2 series models used for the paper and for these datasets. Specifically, we used an optimized GELU implementation rather than the original GELU implementation associated with GPT-2. While the model’s behavior is very similar across these two configurations, the post-MLP activation values we used to generate and simulate explanations differ from the correct values by the following amounts for GPT-2 Small:
56 | 
57 | - Median: 0.0090
58 | - 90th percentile: 0.0252
59 | - 99th percentile: 0.0839
60 | - 99.9th percentile: 0.1736
61 | 
62 | ### Definition of connection weights
63 | 
 64 | Refer to the [GPT-2 model code](https://github.com/openai/gpt-2/blob/master/src/model.py) to
 65 | understand the model weight conventions.
66 | 
67 | *Neuron-neuron*: For two neurons `(l1, n1)` and `(l2, n2)` with `l1 < l2`, the connection strength is defined as
68 | `h{l1}.mlp.c_proj.w[:, n1, :] @ diag(h{l2}.ln_2.g) @ h{l2}.mlp.c_fc.w[:, :, n2]`.
69 | 
70 | *Neuron-token*: For token `t` and neuron `(l, n)`, the input weight is computed as
71 | `wte[t, :] @ diag(h{l}.ln_2.g) @ h{l}.mlp.c_fc.w[:, :, n]`
72 | and the output weight is computed as
73 | `h{l}.mlp.c_proj.w[:, n, :] @ diag(ln_f.g) @ wte[t, :]`.
74 | 
75 | ### Misc Lists of Interesting Neurons
76 | Lists of neurons we thought were interesting according to different criteria, with some preliminary descriptions.
77 | * [Interesting Neurons (external)](https://docs.google.com/spreadsheets/d/1p7fYs31NU8sJoeKyUx4Mn2laGx8xXfHg_KcIvYiKPpg/edit#gid=0)
78 | * [Neurons that score high on random, possibly monosemantic? (external)](https://docs.google.com/spreadsheets/d/1TqKFcz-84jyIHLU7VRoTc8BoFBMpbgac-iNBnxVurQ8/edit?usp=sharing)
79 | * [Clusters of neurons well explained by activation explanation but not by tokens](https://docs.google.com/document/d/1lWhKowpKDdwTMALD_K541cdwgGoQx8DFUSuEe1U2AGE/edit?usp=sharing)
80 | * [Neurons sensitive to truncation](https://docs.google.com/document/d/1x89TWBvuHcyC2t01EDbJZJ5LQYHozlcS-VUmr5shf_A/edit?usp=sharing)
81 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/api_client.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import contextlib
  3 | import os
  4 | import random
  5 | import traceback
  6 | from asyncio import Semaphore
  7 | from functools import wraps
  8 | from typing import Any, Callable, Optional
  9 | 
 10 | import httpx
 11 | import orjson
 12 | 
 13 | 
 14 | def is_api_error(err: Exception) -> bool:
 15 |     if isinstance(err, httpx.HTTPStatusError):
 16 |         response = err.response
 17 |         error_data = response.json().get("error", {})
 18 |         error_message = error_data.get("message")
 19 |         if response.status_code in [400, 404, 415]:
 20 |             if error_data.get("type") == "idempotency_error":
 21 |                 print(f"Retrying after idempotency error: {error_message} ({response.url})")
 22 |                 return True
 23 |             else:
 24 |                 # Invalid request
 25 |                 return False
 26 |         else:
 27 |             print(f"Retrying after API error: {error_message} ({response.url})")
 28 |             return True
 29 | 
 30 |     elif isinstance(err, httpx.ConnectError):
 31 |         print(f"Retrying after connection error... ({err.request.url})")
 32 |         return True
 33 | 
 34 |     elif isinstance(err, httpx.TimeoutException):
 35 |         print(f"Retrying after a timeout error... ({err.request.url})")
 36 |         return True
 37 | 
 38 |     elif isinstance(err, httpx.ReadError):
 39 |         print(f"Retrying after a read error... ({err.request.url})")
 40 |         return True
 41 | 
 42 |     print(f"Retrying after an unexpected error: {repr(err)}")
 43 |     traceback.print_tb(err.__traceback__)
 44 |     return True
 45 | 
 46 | 
 47 | def exponential_backoff(
 48 |     retry_on: Callable[[Exception], bool] = lambda err: True
 49 | ) -> Callable[[Callable], Callable]:
 50 |     """
 51 |     Returns a decorator which retries the wrapped function as long as the specified retry_on
 52 |     function returns True for the exception, applying exponential backoff with jitter after
 53 |     failures, up to a retry limit.
 54 |     """
 55 |     init_delay_s = 1.0
 56 |     max_delay_s = 10.0
 57 |     # Roughly 30 minutes before we give up.
 58 |     max_tries = 200
 59 |     backoff_multiplier = 2.0
 60 |     jitter = 0.2
 61 | 
 62 |     def decorate(f: Callable) -> Callable:
 63 |         assert asyncio.iscoroutinefunction(f)
 64 | 
 65 |         @wraps(f)
 66 |         async def f_retry(*args: Any, **kwargs: Any) -> None:
 67 |             delay_s = init_delay_s
 68 |             for i in range(max_tries):
 69 |                 try:
 70 |                     return await f(*args, **kwargs)
 71 |                 except Exception as err:
 72 |                     if not retry_on(err) or i == max_tries - 1:
 73 |                         raise
 74 |                     jittered_delay = random.uniform(delay_s * (1 - jitter), delay_s * (1 + jitter))
 75 |                     await asyncio.sleep(jittered_delay)
 76 |                     delay_s = min(delay_s * backoff_multiplier, max_delay_s)
 77 | 
 78 |         return f_retry
 79 | 
 80 |     return decorate
 81 | 
 82 | 
 83 | API_KEY = os.getenv("OPENAI_API_KEY")
 84 | assert API_KEY, "Please set the OPENAI_API_KEY environment variable"
 85 | API_HTTP_HEADERS = {
 86 |     "Content-Type": "application/json",
 87 |     "Authorization": "Bearer " + API_KEY,
 88 | }
 89 | BASE_API_URL = "https://api.openai.com/v1"
 90 | 
 91 | 
 92 | class ApiClient:
 93 |     """Performs inference using the OpenAI API. Supports response caching and concurrency limits."""
 94 | 
 95 |     def __init__(
 96 |         self,
 97 |         model_name: str,
 98 |         # If set, no more than this number of HTTP requests will be made concurrently.
 99 |         max_concurrent: Optional[int] = None,
100 |         # Whether to cache request/response pairs in memory to avoid duplicating requests.
101 |         cache: bool = False,
102 |     ):
103 |         self.model_name = model_name
104 | 
105 |         if max_concurrent is not None:
106 |             self._concurrency_check: Optional[Semaphore] = Semaphore(max_concurrent)
107 |         else:
108 |             self._concurrency_check = None
109 | 
110 |         if cache:
111 |             self._cache: Optional[dict[str, Any]] = {}
112 |         else:
113 |             self._cache = None
114 | 
115 |     @exponential_backoff(retry_on=is_api_error)
116 |     async def make_request(
117 |         self, timeout_seconds: Optional[int] = None, **kwargs: Any
118 |     ) -> dict[str, Any]:
119 |         if self._cache is not None:
120 |             key = orjson.dumps(kwargs)
121 |             if key in self._cache:
122 |                 return self._cache[key]
123 |         async with contextlib.AsyncExitStack() as stack:
124 |             if self._concurrency_check is not None:
125 |                 await stack.enter_async_context(self._concurrency_check)
126 |             http_client = await stack.enter_async_context(
127 |                 httpx.AsyncClient(timeout=timeout_seconds)
128 |             )
129 |             # If the request has a "messages" key, it should be sent to the /chat/completions
130 |             # endpoint. Otherwise, it should be sent to the /completions endpoint.
131 |             url = BASE_API_URL + ("/chat/completions" if "messages" in kwargs else "/completions")
132 |             kwargs["model"] = self.model_name
133 |             response = await http_client.post(url, headers=API_HTTP_HEADERS, json=kwargs)
134 |         # The response json has useful information but the exception doesn't include it, so print it
135 |         # out then reraise.
136 |         try:
137 |             response.raise_for_status()
138 |         except Exception as e:
139 |             print(response.json())
140 |             raise e
141 |         if self._cache is not None:
142 |             self._cache[key] = response.json()
143 |         return response.json()
144 | 
145 | 
if __name__ == "__main__":

    async def main() -> None:
        """Manual smoke test: send a single request through ApiClient and print the response."""
        api_client = ApiClient(model_name="gpt-3.5-turbo", max_concurrent=1)
        # No "messages" key is passed, so make_request posts this to the /completions endpoint.
        response = await api_client.make_request(
            prompt="Why did the chicken cross the road?", max_tokens=9
        )
        print(response)

    asyncio.run(main())
153 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/scoring.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | 
  3 | import asyncio
  4 | import logging
  5 | from typing import Any, Callable, Coroutine, Sequence
  6 | 
  7 | import numpy as np
  8 | from neuron_explainer.activations.activations import ActivationRecord
  9 | from neuron_explainer.explanations.calibrated_simulator import (
 10 |     CalibratedNeuronSimulator,
 11 |     LinearCalibratedNeuronSimulator,
 12 | )
 13 | from neuron_explainer.explanations.explanations import (
 14 |     ScoredSequenceSimulation,
 15 |     ScoredSimulation,
 16 |     SequenceSimulation,
 17 | )
 18 | from neuron_explainer.explanations.simulator import ExplanationNeuronSimulator, NeuronSimulator
 19 | 
 20 | 
 21 | def flatten_list(list_of_lists: Sequence[Sequence[Any]]) -> list[Any]:
 22 |     return [item for sublist in list_of_lists for item in sublist]
 23 | 
 24 | 
 25 | def correlation_score(
 26 |     real_activations: Sequence[float] | np.ndarray,
 27 |     predicted_activations: Sequence[float] | np.ndarray,
 28 | ) -> float:
 29 |     return np.corrcoef(real_activations, predicted_activations)[0, 1]
 30 | 
 31 | 
 32 | def score_from_simulation(
 33 |     real_activations: ActivationRecord,
 34 |     simulation: SequenceSimulation,
 35 |     score_function: Callable[[Sequence[float] | np.ndarray, Sequence[float] | np.ndarray], float],
 36 | ) -> float:
 37 |     return score_function(real_activations.activations, simulation.expected_activations)
 38 | 
 39 | 
 40 | def rsquared_score_from_sequences(
 41 |     real_activations: Sequence[float] | np.ndarray,
 42 |     predicted_activations: Sequence[float] | np.ndarray,
 43 | ) -> float:
 44 |     return float(
 45 |         1
 46 |         - np.mean(np.square(np.array(real_activations) - np.array(predicted_activations)))
 47 |         / np.mean(np.square(np.array(real_activations)))
 48 |     )
 49 | 
 50 | 
 51 | def absolute_dev_explained_score_from_sequences(
 52 |     real_activations: Sequence[float] | np.ndarray,
 53 |     predicted_activations: Sequence[float] | np.ndarray,
 54 | ) -> float:
 55 |     return float(
 56 |         1
 57 |         - np.mean(np.abs(np.array(real_activations) - np.array(predicted_activations)))
 58 |         / np.mean(np.abs(np.array(real_activations)))
 59 |     )
 60 | 
 61 | 
 62 | async def make_explanation_simulator(
 63 |     explanation: str,
 64 |     calibration_activation_records: Sequence[ActivationRecord],
 65 |     model_name: str,
 66 |     calibrated_simulator_class: type[CalibratedNeuronSimulator] = LinearCalibratedNeuronSimulator,
 67 | ) -> CalibratedNeuronSimulator:
 68 |     """
 69 |     Make a simulator that uses an explanation to predict activations and calibrates it on the given
 70 |     activation records.
 71 |     """
 72 |     simulator = ExplanationNeuronSimulator(model_name, explanation)
 73 |     calibrated_simulator = calibrated_simulator_class(simulator)
 74 |     await calibrated_simulator.calibrate(calibration_activation_records)
 75 |     return calibrated_simulator
 76 | 
 77 | 
 78 | async def _simulate_and_score_sequence(
 79 |     simulator: NeuronSimulator, activations: ActivationRecord
 80 | ) -> ScoredSequenceSimulation:
 81 |     """Score an explanation of a neuron by how well it predicts activations on a sentence."""
 82 |     simulation = await simulator.simulate(activations.tokens)
 83 |     logging.debug(simulation)
 84 |     rsquared_score = score_from_simulation(activations, simulation, rsquared_score_from_sequences)
 85 |     absolute_dev_explained_score = score_from_simulation(
 86 |         activations, simulation, absolute_dev_explained_score_from_sequences
 87 |     )
 88 |     scored_sequence_simulation = ScoredSequenceSimulation(
 89 |         simulation=simulation,
 90 |         true_activations=activations.activations,
 91 |         ev_correlation_score=score_from_simulation(activations, simulation, correlation_score),
 92 |         rsquared_score=rsquared_score,
 93 |         absolute_dev_explained_score=absolute_dev_explained_score,
 94 |     )
 95 |     return scored_sequence_simulation
 96 | 
 97 | 
 98 | def aggregate_scored_sequence_simulations(
 99 |     scored_sequence_simulations: list[ScoredSequenceSimulation],
100 | ) -> ScoredSimulation:
101 |     """
102 |     Aggregate a list of scored sequence simulations. The logic for doing this is non-trivial for EV
103 |     scores, since we want to calculate the correlation over all activations from all sequences at
104 |     once rather than simply averaging per-sequence correlations.
105 |     """
106 |     all_true_activations: list[float] = []
107 |     all_expected_values: list[float] = []
108 |     for scored_sequence_simulation in scored_sequence_simulations:
109 |         all_true_activations.extend(scored_sequence_simulation.true_activations or [])
110 |         all_expected_values.extend(scored_sequence_simulation.simulation.expected_activations)
111 |     ev_correlation_score = (
112 |         correlation_score(all_true_activations, all_expected_values)
113 |         if len(all_true_activations) > 0
114 |         else None
115 |     )
116 |     rsquared_score = rsquared_score_from_sequences(all_true_activations, all_expected_values)
117 |     absolute_dev_explained_score = absolute_dev_explained_score_from_sequences(
118 |         all_true_activations, all_expected_values
119 |     )
120 | 
121 |     return ScoredSimulation(
122 |         scored_sequence_simulations=scored_sequence_simulations,
123 |         ev_correlation_score=ev_correlation_score,
124 |         rsquared_score=rsquared_score,
125 |         absolute_dev_explained_score=absolute_dev_explained_score,
126 |     )
127 | 
128 | 
async def simulate_and_score(
    simulator: NeuronSimulator,
    activation_records: Sequence[ActivationRecord],
) -> ScoredSimulation:
    """
    Simulate and score every activation record concurrently, then aggregate the per-sequence
    results into one ScoredSimulation.
    """
    pending = [_simulate_and_score_sequence(simulator, record) for record in activation_records]
    per_sequence_results = await asyncio.gather(*pending)
    return aggregate_scored_sequence_simulations(per_sequence_results)
147 | 
148 | 
async def make_simulator_and_score(
    make_simulator: Coroutine[None, None, NeuronSimulator],
    activation_records: Sequence[ActivationRecord],
) -> ScoredSimulation:
    """Await the simulator-producing coroutine, then score the activation records with it."""
    return await simulate_and_score(await make_simulator, activation_records)
156 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/panes/explanation.jsx:
--------------------------------------------------------------------------------
  1 | import React, { useState, useEffect } from "react"
  2 | import { get_explanations } from "../interpAPI"
  3 | // import HeatmapGrid from "../heatmapGrid"
  4 | import SimulationHeatmap from "../simulationHeatmap"
  5 | import { normalizeTokenActs } from "../types"
  6 | 
  7 | 
  8 | function zip_simulated_sequences(sequences) {
  9 |   return sequences.map(({ simulation }) => {
 10 |     return simulation.tokens.map((token, idx) => ({
 11 |       token,
 12 |       activation: simulation.expected_activations[idx],
 13 |     }))
 14 |   })
 15 | }
 16 | 
 17 | function zip_real_sequences(sequences) {
 18 |   return sequences.map(({ simulation, true_activations }) => {
 19 |     return simulation.tokens.map((token, idx) => ({
 20 |       token,
 21 |       activation: true_activations[idx],
 22 |     }))
 23 |   })
 24 | }
 25 | 
 26 | const ExplanationDisplay = ({ activeNeuron }) => {
 27 |   const [isLoading, setIsLoading] = useState(true)
 28 |   const [data, setData] = useState(null)
 29 |   const [showingScoringDetails, setShowingScoringDetails] = useState(false)
 30 |   const [toggle, setToggle] = useState(false);
 31 | 
 32 |   const loadExplanation = async () => {
 33 |     const result = await get_explanations(activeNeuron);
 34 |     setData(result.scored_explanations[0])
 35 |     setIsLoading(false)
 36 |   }
 37 | 
 38 |   useEffect(() => {
 39 |     if (!data) {
 40 |       loadExplanation()
 41 |     }
 42 |   }, [])
 43 | 
 44 |   const handleToggleChange = () => {
 45 |     setToggle(!toggle);
 46 |   };
 47 | 
 48 |   let sim_sequences;
 49 |   if (data) {
 50 |     sim_sequences = zip_simulated_sequences(data.scored_simulation.scored_sequence_simulations);
 51 |     [sim_sequences] = normalizeTokenActs(sim_sequences)
 52 |   } else {
 53 |     sim_sequences = []
 54 |   }
 55 | 
 56 |   let real_sequences;
 57 |   if (data) {
 58 |     real_sequences = zip_real_sequences(data.scored_simulation.scored_sequence_simulations);
 59 |     [real_sequences] = normalizeTokenActs(real_sequences)
 60 |   } else {
 61 |     real_sequences = []
 62 |   }
 63 | 
 64 |   const suggest_explanation_link = "https://docs.google.com/forms/d/e/1FAIpQLSckMyDQedGhdISIqaqn0YGUtd2xqEWgPu7ehoPUTT2pTge_-g/viewform?"
 65 |     + `usp=pp_url&entry.541490611=${activeNeuron.layer}`
 66 |     + `&entry.1688855196=${activeNeuron.neuron}`
 67 |     + `&entry.495312202=https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html%23/layers/${activeNeuron.layer}/neurons/${activeNeuron.neuron}`;
 68 | 
 69 |   return (
 70 |     <>
 71 |       <div className="min-w-0 flex-1">
 72 |         <h2 className="text-2xl font-bold mb-4">Explanation</h2>
 73 |         {isLoading ? (
 74 |           <div className="flex justify-center items-center">
 75 |             <div className="loader">Loading...</div>
 76 |           </div>
 77 |         ) : (
 78 |           <>
 79 |             <blockquote className="p-1 px-4 mx-1 my-0">
 80 |               <p className="py-1">
 81 |                 <em>{data.explanation}</em>
 82 |               </p>
 83 |               <p className="py-1">
 84 |                 <em>score: {data.scored_simulation.ev_correlation_score.toFixed(2)}</em>
 85 |               </p>
 86 |               <p className="py-1">
 87 |                 <a href={suggest_explanation_link}>Suggest Better Explanation</a>
 88 |               </p>
 89 |             </blockquote>
 90 |             <button onClick={() => { setShowingScoringDetails(!showingScoringDetails) }}>
 91 |               {showingScoringDetails ? 'Hide' : 'Show'} scoring details
 92 |             </button>
 93 |             {
 94 |               showingScoringDetails ?
 95 |                 <>
 96 |                   <div
 97 |                     style={{
 98 |                       textAlign: 'right',
 99 |                     }}
100 |                   >
101 |                     <div
102 |                       style={{
103 |                         display: 'inline-block',
104 |                         position: 'relative',
105 |                         width: '60px',
106 |                         height: '34px',
107 |                         marginLeft: '10px',
108 |                         marginBottom: '5px',
109 |                         borderRadius: '34px',
110 |                         backgroundColor: toggle ? '#0A978B' : '#CCC',
111 |                         cursor: 'pointer',
112 |                         transition: 'background-color 0.2s',
113 |                       }}
114 |                     >
115 |                       <input
116 |                         type="checkbox"
117 |                         id="toggle"
118 |                         checked={toggle}
119 |                         onChange={handleToggleChange}
120 |                         style={{
121 |                           width: '100%',
122 |                           height: '100%',
123 |                           margin: '0',
124 |                           opacity: '0',
125 |                           cursor: 'pointer',
126 |                         }}
127 |                       />
128 |                       <span
129 |                         onClick={handleToggleChange}
130 |                         style={{
131 |                           position: 'absolute',
132 |                           top: '2px',
133 |                           left: toggle ? '29px' : '2px',
134 |                           width: '30px',
135 |                           height: '30px',
136 |                           backgroundColor: 'white',
137 |                           borderRadius: '50%',
138 |                           boxShadow: '0 2px 5px rgba(0, 0, 0, 0.3)',
139 |                           transition: 'left 0.2s',
140 |                         }}
141 |                       ></span>
142 |                     </div>
143 |                     <br />
144 |                     {toggle ? 'Activations overlaid (top = real, bottom = simulated)' : 'Activations not overlaid'}
145 |                   </div>
146 |                   <h3 className="text-md font-bold">Top</h3>
147 |                   <SimulationHeatmap
148 |                     sequences={real_sequences.slice(0, 5)}
149 |                     simulated_sequences={sim_sequences.slice(0, 5)}
150 |                     overlay_activations={toggle}
151 |                   />
152 |                   <h3 className="text-md font-bold">Random</h3>
153 |                   <SimulationHeatmap
154 |                     sequences={real_sequences.slice(5)}
155 |                     simulated_sequences={sim_sequences.slice(5)}
156 |                     overlay_activations={toggle}
157 |                   />
158 |                 </> : null
159 |             }
160 |           </>
161 |         )}
162 |       </div>
163 |     </>
164 |   )
165 | }
166 | 
167 | export default ExplanationDisplay
168 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/test_explainer.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | from typing import Any
  3 | 
  4 | from neuron_explainer.explanations.explainer import (
  5 |     TokenActivationPairExplainer,
  6 |     TokenSpaceRepresentationExplainer,
  7 | )
  8 | from neuron_explainer.explanations.few_shot_examples import TEST_EXAMPLES, FewShotExampleSet
  9 | from neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role
 10 | from neuron_explainer.explanations.token_space_few_shot_examples import (
 11 |     TokenSpaceFewShotExampleSet,
 12 | )
 13 | 
 14 | 
 15 | def setup_module(unused_module: Any) -> None:
 16 |     # Make sure we have an event loop, since the attempt to create the Semaphore in
 17 |     # ResearchApiClient will fail without it.
 18 |     loop = asyncio.new_event_loop()
 19 |     asyncio.set_event_loop(loop)
 20 | 
 21 | 
 22 | def test_if_formatting() -> None:
 23 |     expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.
 24 | 
 25 | The activation format is token<tab>activation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.
 26 | 
 27 | Neuron 1
 28 | Activations:
 29 | <start>
 30 | a	10
 31 | b	0
 32 | c	0
 33 | <end>
 34 | <start>
 35 | d	0
 36 | e	10
 37 | f	0
 38 | <end>
 39 | 
 40 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels.
 41 | 
 42 | Neuron 2
 43 | Activations:
 44 | <start>
 45 | a	10
 46 | b	0
 47 | c	0
 48 | <end>
 49 | <start>
 50 | d	0
 51 | e	10
 52 | f	0
 53 | <end>
 54 | 
 55 | Explanation of neuron 2 behavior:<|endofprompt|> the main thing this neuron does is find"""
 56 | 
 57 |     explainer = TokenActivationPairExplainer(
 58 |         model_name="text-davinci-003",
 59 |         prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
 60 |         few_shot_example_set=FewShotExampleSet.TEST,
 61 |     )
 62 |     prompt = explainer.make_explanation_prompt(
 63 |         all_activation_records=TEST_EXAMPLES[0].activation_records,
 64 |         max_activation=1.0,
 65 |         max_tokens_for_completion=20,
 66 |     )
 67 | 
 68 |     assert prompt == expected_prompt
 69 | 
 70 | 
 71 | def test_harmony_format() -> None:
 72 |     expected_prompt = [
 73 |         HarmonyMessage(
 74 |             role=Role.SYSTEM,
 75 |             content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words.
 76 | 
 77 | The activation format is token<tab>activation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.""",
 78 |         ),
 79 |         HarmonyMessage(
 80 |             role=Role.USER,
 81 |             content="""
 82 | 
 83 | Neuron 1
 84 | Activations:
 85 | <start>
 86 | a	10
 87 | b	0
 88 | c	0
 89 | <end>
 90 | <start>
 91 | d	0
 92 | e	10
 93 | f	0
 94 | <end>
 95 | 
 96 | Explanation of neuron 1 behavior: the main thing this neuron does is find""",
 97 |         ),
 98 |         HarmonyMessage(
 99 |             role=Role.ASSISTANT,
100 |             content=" vowels.",
101 |         ),
102 |         HarmonyMessage(
103 |             role=Role.USER,
104 |             content="""
105 | 
106 | Neuron 2
107 | Activations:
108 | <start>
109 | a	10
110 | b	0
111 | c	0
112 | <end>
113 | <start>
114 | d	0
115 | e	10
116 | f	0
117 | <end>
118 | 
119 | Explanation of neuron 2 behavior: the main thing this neuron does is find""",
120 |         ),
121 |     ]
122 | 
123 |     explainer = TokenActivationPairExplainer(
124 |         model_name="gpt-4",
125 |         prompt_format=PromptFormat.HARMONY_V4,
126 |         few_shot_example_set=FewShotExampleSet.TEST,
127 |     )
128 |     prompt = explainer.make_explanation_prompt(
129 |         all_activation_records=TEST_EXAMPLES[0].activation_records,
130 |         max_activation=1.0,
131 |         max_tokens_for_completion=20,
132 |     )
133 | 
134 |     assert isinstance(prompt, list)
135 |     assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
136 |     for actual_message, expected_message in zip(prompt, expected_prompt):
137 |         assert actual_message["role"] == expected_message["role"]
138 |         assert actual_message["content"] == expected_message["content"]
139 |     assert prompt == expected_prompt
140 | 
141 | 
def test_token_space_explainer_if_formatting() -> None:
    """The instruction-following prompt built from a token list matches the expected text."""
    explainer = TokenSpaceRepresentationExplainer(
        model_name="text-davinci-002",
        prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
        use_few_shot=True,
        few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,
    )
    actual = explainer.make_explanation_prompt(
        tokens=["foo", "bar", "baz"],
        max_tokens_for_completion=20,
    )

    # Instructions, one worked few-shot example, then the query for the supplied tokens.
    expected = """We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.



Tokens:
'these', ' are', ' tokens'

Explanation:
This neuron is looking for this is a test explanation.



Tokens:
'foo', 'bar', 'baz'

Explanation:
<|endofprompt|>This neuron is looking for"""

    assert actual == expected
173 | 
174 | 
def test_token_space_explainer_harmony_formatting() -> None:
    """The HARMONY_V4 prompt built from a token list matches the expected message list."""
    explainer = TokenSpaceRepresentationExplainer(
        model_name="gpt-4",
        prompt_format=PromptFormat.HARMONY_V4,
        use_few_shot=True,
        few_shot_example_set=TokenSpaceFewShotExampleSet.TEST,
    )
    actual_messages = explainer.make_explanation_prompt(
        tokens=["foo", "bar", "baz"],
        max_tokens_for_completion=20,
    )

    # A system message with the instructions, one user/assistant few-shot exchange, and a final
    # user message holding the query.
    expected_messages = [
        HarmonyMessage(
            role=Role.SYSTEM,
            content="We're studying neurons in a neural network. Each neuron looks for some particular kind of token (which can be a word, or part of a word). Look at the tokens the neuron activates for (listed below) and summarize in a single sentence what the neuron is looking for. Don't list examples of words.",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""



Tokens:
'these', ' are', ' tokens'

Explanation:
This neuron is looking for""",
        ),
        HarmonyMessage(
            role=Role.ASSISTANT,
            content=" this is a test explanation.",
        ),
        HarmonyMessage(
            role=Role.USER,
            content="""



Tokens:
'foo', 'bar', 'baz'

Explanation:
This neuron is looking for""",
        ),
    ]

    assert isinstance(actual_messages, list)
    assert isinstance(actual_messages[0], dict)  # Really a HarmonyMessage
    # Compare field by field first so a mismatch points at the offending message.
    for actual, expected in zip(actual_messages, expected_messages):
        assert actual["role"] == expected["role"]
        assert actual["content"] == expected["content"]
    assert actual_messages == expected_messages
228 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/App.css:
--------------------------------------------------------------------------------
  1 | @tailwind base;
  2 | @tailwind components;
  3 | @tailwind utilities;
  4 | 
  5 | 
  6 | :root {
  7 |   --secondary-color: #0d978b;
  8 |   --accent-color: #ff4d00;
  9 | }
 10 | 
 11 | .full-width{
 12 |   width: 100vw;
 13 |   position: relative;
 14 |   margin-left: -50vw;
 15 |   left: 50%;
 16 |  }
 17 | 
 18 | .App {
 19 |   text-align: center;
 20 | }
 21 | 
 22 | .App-logo {
 23 |   height: 40vmin;
 24 |   pointer-events: none;
 25 | }
 26 | 
 27 | @media (prefers-reduced-motion: no-preference) {
 28 |   .App-logo {
 29 |     animation: App-logo-spin infinite 20s linear;
 30 |   }
 31 | }
 32 | 
 33 | .App h1 {
 34 | 	font-size: 1.75rem;
 35 | }
 36 | 
 37 | .App-article {
 38 |   background-color: #282c34;
 39 |   min-height: 100vh;
 40 |   display: flex;
 41 |   flex-direction: column;
 42 |   align-items: center;
 43 |   justify-content: center;
 44 |   font-size: calc(10px + 2vmin);
 45 |   color: white;
 46 | }
 47 | 
 48 | .App-link {
 49 |   color: #61dafb;
 50 | }
 51 | 
 52 | @keyframes App-logo-spin {
 53 |   from {
 54 |     transform: rotate(0deg);
 55 |   }
 56 |   to {
 57 |     transform: rotate(360deg);
 58 |   }
 59 | }
 60 | 
 61 | 
 62 |   /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
 63 |   /*  Structure
 64 |   /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
 65 | 
 66 |   body {
 67 |     margin: 0;
 68 |     padding: 0 1em;
 69 |     font-size: 12pt;
 70 | }
 71 | 
 72 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
 73 | /*  Typography
 74 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
 75 |   
 76 | h1 {
 77 |   font-size: 24pt;
 78 |   font-weight: 500;
 79 |   padding: 1em 0 0;
 80 |   display: block;
 81 |   color: #000;
 82 | }
 83 | h3 { padding: 0 0; }
 84 | h2 { padding: 1em 0 0.5em 0; }
 85 | h4, h5 {
 86 |     text-transform: uppercase;
 87 |     margin: 1em 0;
 88 |     justify-tracks: space-between;
 89 |     font-family: var(--sans-serif);
 90 |     font-size: 12pt;
 91 |     font-weight: 600;
 92 | }
 93 | h2, h3 { font-weight: 500; font-style: italic; }
 94 | subtitle {
 95 |     color: #555;
 96 |     font-size: 18pt;
 97 |     font-style: italic;
 98 |     padding: 0;
 99 |     display: block;
100 |     margin-bottom: 1em
101 | }
102 | 
103 | a {
104 |     transition: all .05s ease-in-out;
105 |     color: #5c60c3 !important;
106 |     font-style: normal;
107 | }
108 | a:hover { color: var(--accent-color)!important; }
109 | code, pre { color: var(--inline-code-color);
110 | background-color: #eee; border-radius: 3px; }
111 | pre {  padding: 1em; margin: 2em 0; }
112 | code { padding: 0.3em; }
113 | .text-secondary, h3, h5 { color: var(--secondary-color); }
114 | .text-primary, h2,h4 { color: var(--primary-color); }
115 | 
116 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
117 | /*  Images
118 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
119 |   
120 | img#logo {
121 |     width: 50%;
122 |     margin: 3em 0 0
123 | }
124 | 
125 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
126 | /*  Alerts                                */
127 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
128 |   
129 | .alert {
130 |     font-weight: 600;
131 |     font-style: italic;
132 |     display: block;
133 |     background-color: #fff7f7;
134 |     padding: 1em;
135 |     margin: 0;
136 |     border-radius: 5px;
137 |     color: #f25555
138 | }
139 | .alert.cool {
140 |   background-color: #f3f0fc;
141 |   color: #7155cf;
142 | }
143 | .flash-alert {
144 |   display: inline-block;
145 |   transition: ease-in-out 1s;
146 |   font-size: 14pt;
147 |   margin: 1em 0;
148 |   padding-top: 0.5em;
149 | }
150 | .flash-alert.success {
151 |   color: #000;
152 | }
153 | .flash-alert.failure {
154 |   color: red;
155 | }
156 | .flash-alert.hidden {
157 |   display: none;
158 | }
159 | 
160 |   
161 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
162 | /*  Sidenotes & Superscripts              */
163 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
164 | 
165 | body { counter-reset: count; }
166 | p { whitespace: nowrap; } /* NOTE(review): "whitespace" is not a CSS property (the real one is "white-space"), so this declaration is presumably ignored; confirm the intent before correcting it */
167 | sup { 
168 |   font-weight: 300;
169 |   padding-right: .2em;
170 |   counter-increment: count;
171 | }
172 | sidenote::before, 
173 | sup::before {
174 |     content: counter(count, lower-roman);
175 |     display: inline-block;
176 |     font-size: 10pt;
177 |     font-weight: bold;
178 |     color: var(--accent-color);
179 | }
180 | sidenote::before {
181 |     margin-right: .5em;
182 |     font-weight: 700
183 | }
184 | 
185 | /* Different behavior if the screen is too 
186 |    narrow to show a sidenote on the side. */
187 | 
188 | @media (min-width:860px) {
189 |     sidenote {
190 |         clear: right;
191 |         font-size: 10pt;
192 |         position: fixed;
193 |         float: right;
194 |         white-space: normal;
195 |         right: 20px;
196 |         width: 200px;
197 |         display: block;
198 |         max-width: 30%
199 |     }
200 | }
201 | 
202 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
203 | /*  Sidenotes & Superscripts              */
204 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
205 |   
206 | @media print {
207 |     a.btn, button {
208 |         display: none!important
209 |     }
210 | }
211 | 
212 | @media (max-width:860px) {
213 |     sidenote {
214 |         display: block;
215 |         font-size: 11pt;
216 |         margin: 2em 3em 2em 2em
217 |     }
218 | }
219 |   
220 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
221 | /*  Buttons                               */
222 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
223 |   
224 | @media screen {
225 |     button:hover { box-shadow: 0.5em 0.5em var(--accent-color); }
226 |     a.btn, button {
227 |         border-radius: 3px;
228 |         color: #000 !important;
229 |         text-decoration: none !important;
230 |         font-size: 11pt;
231 |         border: 1px solid #000;
232 |         padding: 0.5em 1em;
233 |         font-family: -apple-system, 
234 |           BlinkMacSystemFont, 
235 |           "avenir next", 
236 |           avenir,
237 |           helvetica, 
238 |           "helvetica neue", 
239 |           ubuntu, 
240 |           roboto, 
241 |           noto, 
242 |           "segoe ui", 
243 |           arial,
244 |           sans-serif !important;
245 |         background: #fff;
246 |         margin: 1.5em 0;
247 |         font-weight: 500;
248 |         transition: all .05s ease-in-out,box-shadow-color .025s ease-in-out;
249 |         box-shadow: 0.5em 0.5em #eee;
250 |         display: inline-block;
251 | }
252 | 
253 |     a.btn:hover, button:hover { /* hover affordance: pointer cursor plus accent-colored shadow */
254 |         cursor: pointer;
255 |         box-shadow: 0.5em 0.5em var(--accent-color);
256 |     }
257 |     a.btn:active, button.active, button:active {
258 |         border: 1px solid;
259 |         margin: 2em 0 1em 1em;
260 |         box-shadow: 0 0 #000 !important
261 |     }
262 |     a.btn.small,button.small {
263 |         box-shadow: .5em .5em 0 #eee;
264 |         border: 1px solid #000;
265 |         padding: .6em 1em;
266 |         font-weight: 500
267 |     }
268 |     a.btn.small:hover,button.small:hover {
269 |         box-shadow: 0.5em 0.5em var(--accent-color);
270 |     }
271 |     a.btn.small:active,button.small:active {
272 |         margin: 2em 0 1em 1em;
273 |         box-shadow: 0 0 #000
274 |     }
275 | }
276 | 
277 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
278 | /*  Blockquotes & Epigraphs
279 | /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
280 | 
281 | blockquote {
282 |   margin: 1em;
283 | }
284 | div>blockquote>p {
285 |     font-size: 13pt;
286 |     color: #555;
287 |     font-style: normal!important;
288 |     margin: 0;
289 |     padding: 1em 0 1.5em
290 | }
291 | blockquote > blockquote {
292 |   padding: 0.5em 2em 1em 1.5em !important;
293 | }
294 | 
295 | blockquote > blockquote,
296 | blockquote > blockquote > p {
297 |   font-size: 14pt;
298 |   padding: 0;
299 |   margin: 0;
300 |   text-align: center;
301 |   font-style: italic;
302 |   color: var(--epigraph-color);
303 | }
304 | blockquote footer {
305 |   font-size: 12pt;
306 |   text-align: inherit;
307 |   display: block;
308 |   font-style: normal;
309 |   margin: 1em;
310 |   color: #aaa;
311 | }
312 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/calibrated_simulator.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Code for calibrating simulations of neuron behavior. Calibration refers to a process of mapping from
  3 | a space of predicted activation values (e.g. [0, 10]) to the real activation distribution for a
  4 | neuron.
  5 | 
  6 | See http://go/neuron_explanation_methodology for description of calibration step. Necessary for
  7 | simulating neurons in the context of ablate-to-simulation, but can be skipped when using correlation
  8 | scoring. (Calibration may still improve quality for scoring, at least for non-linear calibration
  9 | methods.)
 10 | """
 11 | 
 12 | from __future__ import annotations
 13 | 
 14 | import asyncio
 15 | from abc import abstractmethod
 16 | from typing import Optional, Sequence
 17 | 
 18 | import numpy as np
 19 | from neuron_explainer.activations.activations import ActivationRecord
 20 | from neuron_explainer.explanations.explanations import ActivationScale
 21 | from neuron_explainer.explanations.simulator import NeuronSimulator, SequenceSimulation
 22 | from sklearn import linear_model
 23 | 
 24 | 
 25 | class CalibratedNeuronSimulator(NeuronSimulator):
 26 |     """
 27 |     Wrap a NeuronSimulator and calibrate it to map from the predicted activation space to the
 28 |     actual neuron activation space.
 29 |     """
 30 | 
 31 |     def __init__(self, uncalibrated_simulator: NeuronSimulator):
 32 |         self.uncalibrated_simulator = uncalibrated_simulator
 33 | 
 34 |     @classmethod
 35 |     async def create(
 36 |         cls,
 37 |         uncalibrated_simulator: NeuronSimulator,
 38 |         calibration_activation_records: Sequence[ActivationRecord],
 39 |     ) -> CalibratedNeuronSimulator:
 40 |         """
 41 |         Create and calibrate a calibrated simulator (so initialization and calibration can be done
 42 |         in one call).
 43 |         """
 44 |         calibrated_simulator = cls(uncalibrated_simulator)
 45 |         await calibrated_simulator.calibrate(calibration_activation_records)
 46 |         return calibrated_simulator
 47 | 
 48 |     async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
 49 |         """
 50 |         Determine parameters to map from the predicted activation space to the real neuron
 51 |         activation space, based on a calibration set.
 52 | 
 53 |         Use when simulated sequences haven't already been produced on the calibration set.
 54 |         """
 55 |         simulations = await asyncio.gather(
 56 |             *[
 57 |                 self.uncalibrated_simulator.simulate(activations.tokens)
 58 |                 for activations in calibration_activation_records
 59 |             ]
 60 |         )
 61 |         self.calibrate_from_simulations(calibration_activation_records, simulations)
 62 | 
 63 |     def calibrate_from_simulations(
 64 |         self,
 65 |         calibration_activation_records: Sequence[ActivationRecord],
 66 |         simulations: Sequence[SequenceSimulation],
 67 |     ) -> None:
 68 |         """
 69 |         Determine parameters to map from the predicted activation space to the real neuron
 70 |         activation space, based on a calibration set.
 71 | 
 72 |         Use when simulated sequences have already been produced on the calibration set.
 73 |         """
 74 |         flattened_activations = []
 75 |         flattened_simulated_activations: list[float] = []
 76 |         for activations, simulation in zip(calibration_activation_records, simulations):
 77 |             flattened_activations.extend(activations.activations)
 78 |             flattened_simulated_activations.extend(simulation.expected_activations)
 79 |         self._calibrate_from_flattened_activations(
 80 |             np.array(flattened_activations), np.array(flattened_simulated_activations)
 81 |         )
 82 | 
 83 |     @abstractmethod
 84 |     def _calibrate_from_flattened_activations(
 85 |         self,
 86 |         true_activations: np.ndarray,
 87 |         uncalibrated_activations: np.ndarray,
 88 |     ) -> None:
 89 |         """
 90 |         Determine parameters to map from the predicted activation space to the real neuron
 91 |         activation space, based on a calibration set.
 92 | 
 93 |         Take numpy arrays of all true activations and all uncalibrated activations on the
 94 |         calibration set over all sequences.
 95 |         """
 96 | 
 97 |     @abstractmethod
 98 |     def apply_calibration(self, values: Sequence[float]) -> list[float]:
 99 |         """Apply the learned calibration to a sequence of values."""
100 | 
101 |     async def simulate(self, tokens: Sequence[str]) -> SequenceSimulation:
102 |         uncalibrated_seq_simulation = await self.uncalibrated_simulator.simulate(tokens)
103 |         calibrated_activations = self.apply_calibration(
104 |             uncalibrated_seq_simulation.expected_activations
105 |         )
106 |         calibrated_distribution_values = [
107 |             self.apply_calibration(dv) for dv in uncalibrated_seq_simulation.distribution_values
108 |         ]
109 |         return SequenceSimulation(
110 |             tokens=uncalibrated_seq_simulation.tokens,
111 |             expected_activations=calibrated_activations,
112 |             activation_scale=ActivationScale.NEURON_ACTIVATIONS,
113 |             distribution_values=calibrated_distribution_values,
114 |             distribution_probabilities=uncalibrated_seq_simulation.distribution_probabilities,
115 |             uncalibrated_simulation=uncalibrated_seq_simulation,
116 |         )
117 | 
118 | 
119 | class UncalibratedNeuronSimulator(CalibratedNeuronSimulator):
120 |     """Pass through the activations without trying to calibrate."""
121 | 
122 |     def __init__(self, uncalibrated_simulator: NeuronSimulator):
123 |         super().__init__(uncalibrated_simulator)
124 | 
125 |     async def calibrate(self, calibration_activation_records: Sequence[ActivationRecord]) -> None:
126 |         pass
127 | 
128 |     def _calibrate_from_flattened_activations(
129 |         self,
130 |         true_activations: np.ndarray,
131 |         uncalibrated_activations: np.ndarray,
132 |     ) -> None:
133 |         pass
134 | 
135 |     def apply_calibration(self, values: Sequence[float]) -> list[float]:
136 |         return values if isinstance(values, list) else list(values)
137 | 
138 | 
139 | class LinearCalibratedNeuronSimulator(CalibratedNeuronSimulator):
140 |     """Find a linear mapping from uncalibrated activations to true activations.
141 | 
142 |     Should not change ev_correlation_score because it is invariant to linear transformations.
143 |     """
144 | 
145 |     def __init__(self, uncalibrated_simulator: NeuronSimulator):
146 |         super().__init__(uncalibrated_simulator)
147 |         self._regression: Optional[linear_model.LinearRegression] = None
148 | 
149 |     def _calibrate_from_flattened_activations(
150 |         self,
151 |         true_activations: np.ndarray,
152 |         uncalibrated_activations: np.ndarray,
153 |     ) -> None:
154 |         self._regression = linear_model.LinearRegression()
155 |         self._regression.fit(uncalibrated_activations.reshape(-1, 1), true_activations)
156 | 
157 |     def apply_calibration(self, values: Sequence[float]) -> list[float]:
158 |         if self._regression is None:
159 |             raise ValueError("Must call calibrate() before apply_calibration")
160 |         if len(values) == 0:
161 |             return []
162 |         return self._regression.predict(np.reshape(np.array(values), (-1, 1))).tolist()
163 | 
164 | 
165 | class PercentileMatchingCalibratedNeuronSimulator(CalibratedNeuronSimulator):
166 |     """
167 |     Map the nth percentile of the uncalibrated activations to the nth percentile of the true
168 |     activations for all n.
169 | 
170 |     This will match the distribution of true activations on the calibration set, but will be
171 |     overconfident outside of the calibration set.
172 |     """
173 | 
174 |     def __init__(self, uncalibrated_simulator: NeuronSimulator):
175 |         super().__init__(uncalibrated_simulator)
176 |         self._uncalibrated_activations: Optional[np.ndarray] = None
177 |         self._true_activations: Optional[np.ndarray] = None
178 | 
179 |     def _calibrate_from_flattened_activations(
180 |         self,
181 |         true_activations: np.ndarray,
182 |         uncalibrated_activations: np.ndarray,
183 |     ) -> None:
184 |         self._uncalibrated_activations = np.sort(uncalibrated_activations)
185 |         self._true_activations = np.sort(true_activations)
186 | 
187 |     def apply_calibration(self, values: Sequence[float]) -> list[float]:
188 |         if self._true_activations is None or self._uncalibrated_activations is None:
189 |             raise ValueError("Must call calibrate() before apply_calibration")
190 |         if len(values) == 0:
191 |             return []
192 |         return np.interp(
193 |             np.array(values), self._uncalibrated_activations, self._true_activations
194 |         ).tolist()
195 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/test_simulator.py:
--------------------------------------------------------------------------------
  1 | from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
  2 | from neuron_explainer.explanations.prompt_builder import HarmonyMessage, PromptFormat, Role
  3 | from neuron_explainer.explanations.simulator import (
  4 |     ExplanationNeuronSimulator,
  5 |     ExplanationTokenByTokenSimulator,
  6 | )
  7 | 
  8 | 
  9 | def test_make_explanation_simulation_prompt_if_format() -> None:
 10 |     expected_prompt = """We're studying neurons in a neural network.
 11 | Each neuron looks for some particular thing in a short document.
 12 | Look at summary of what the neuron does, and try to predict how it will fire on each token.
 13 | 
 14 | The activation format is token<tab>activation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0.
 15 | 
 16 | 
 17 | Neuron 1
 18 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
 19 | Activations: 
 20 | <start>
 21 | a	10
 22 | b	0
 23 | c	0
 24 | <end>
 25 | <start>
 26 | d	unknown
 27 | e	10
 28 | f	0
 29 | <end>
 30 | 
 31 | 
 32 | 
 33 | Neuron 2
 34 | Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION<|endofprompt|>
 35 | Activations: 
 36 | <start>
 37 | 0	unknown
 38 | 1	unknown
 39 | 2	unknown
 40 | <end>
 41 | """
 42 |     prompt = ExplanationNeuronSimulator(
 43 |         model_name="text-davinci-003",
 44 |         explanation="EXPLANATION",
 45 |         few_shot_example_set=FewShotExampleSet.TEST,
 46 |         prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
 47 |     ).make_simulation_prompt(
 48 |         tokens=[str(x) for x in range(3)],
 49 |     )
 50 |     assert prompt == expected_prompt
 51 | 
 52 | 
 53 | def test_make_explanation_simulation_prompt_harmony_format() -> None:
 54 |     expected_prompt = [
 55 |         HarmonyMessage(
 56 |             role=Role.SYSTEM,
 57 |             content="""We're studying neurons in a neural network.
 58 | Each neuron looks for some particular thing in a short document.
 59 | Look at summary of what the neuron does, and try to predict how it will fire on each token.
 60 | 
 61 | The activation format is token<tab>activation, activations go from 0 to 10, "unknown" indicates an unknown activation. Most activations will be 0.
 62 | """,
 63 |         ),
 64 |         HarmonyMessage(
 65 |             role=Role.USER,
 66 |             content="""
 67 | 
 68 | Neuron 1
 69 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels""",
 70 |         ),
 71 |         HarmonyMessage(
 72 |             role=Role.ASSISTANT,
 73 |             content="""
 74 | Activations: 
 75 | <start>
 76 | a	10
 77 | b	0
 78 | c	0
 79 | <end>
 80 | <start>
 81 | d	unknown
 82 | e	10
 83 | f	0
 84 | <end>
 85 | 
 86 | """,
 87 |         ),
 88 |         HarmonyMessage(
 89 |             role=Role.USER,
 90 |             content="""
 91 | 
 92 | Neuron 2
 93 | Explanation of neuron 2 behavior: the main thing this neuron does is find EXPLANATION""",
 94 |         ),
 95 |         HarmonyMessage(
 96 |             role=Role.ASSISTANT,
 97 |             content="""
 98 | Activations: 
 99 | <start>
100 | 0	unknown
101 | 1	unknown
102 | 2	unknown
103 | <end>
104 | """,
105 |         ),
106 |     ]
107 |     prompt = ExplanationNeuronSimulator(
108 |         model_name="gpt-4",
109 |         explanation="EXPLANATION",
110 |         few_shot_example_set=FewShotExampleSet.TEST,
111 |         prompt_format=PromptFormat.HARMONY_V4,
112 |     ).make_simulation_prompt(
113 |         tokens=[str(x) for x in range(3)],
114 |     )
115 | 
116 |     assert isinstance(prompt, list)
117 |     assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
118 |     for actual_message, expected_message in zip(prompt, expected_prompt):
119 |         assert actual_message["role"] == expected_message["role"]
120 |         assert actual_message["content"] == expected_message["content"]
121 |     assert prompt == expected_prompt
122 | 
123 | 
124 | def test_make_token_by_token_simulation_prompt_if_format() -> None:
125 |     expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at  an explanation of what the neuron does, and try to predict its activations on a particular token.
126 | 
127 | The activation format is token<tab>activation, and activations range from 0 to 10. Most activations will be 0.
128 | 
129 | Neuron 1
130 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
131 | Activations: 
132 | <start>
133 | a	10
134 | b	0
135 | c	0
136 | <end>
137 | <start>
138 | d	0
139 | e	10
140 | f	0
141 | <end>
142 | 
143 | 
144 | Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.
145 | Neuron 2
146 | Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else
147 | Text:
148 | ghi
149 | 
150 | Last token in the text:
151 | i
152 | 
153 | Last token activation, considering the token in the context in which it appeared in the text:
154 | 10
155 | 
156 | 
157 | Neuron 3
158 | Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else
159 | Text:
160 | 01
161 | 
162 | Last token in the text:
163 | 1
164 | 
165 | Last token activation, considering the token in the context in which it appeared in the text:
166 | <|endofprompt|>"""
167 |     prompt = ExplanationTokenByTokenSimulator(
168 |         model_name="text-davinci-003",
169 |         explanation="EXPLANATION",
170 |         few_shot_example_set=FewShotExampleSet.TEST,
171 |         prompt_format=PromptFormat.INSTRUCTION_FOLLOWING,
172 |     ).make_single_token_simulation_prompt(
173 |         tokens=[str(x) for x in range(3)],
174 |         explanation="numbers and nothing else",
175 |         token_index_to_score=1,
176 |     )
177 |     assert prompt == expected_prompt
178 | 
179 | 
180 | def test_make_token_by_token_simulation_prompt_harmony_format() -> None:
181 |     expected_prompt = [
182 |         HarmonyMessage(
183 |             role=Role.SYSTEM,
184 |             content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at  an explanation of what the neuron does, and try to predict its activations on a particular token.
185 | 
186 | The activation format is token<tab>activation, and activations range from 0 to 10. Most activations will be 0.
187 | 
188 | """,
189 |         ),
190 |         HarmonyMessage(
191 |             role=Role.USER,
192 |             content="""Neuron 1
193 | Explanation of neuron 1 behavior: the main thing this neuron does is find vowels
194 | """,
195 |         ),
196 |         HarmonyMessage(
197 |             role=Role.ASSISTANT,
198 |             content="""Activations: 
199 | <start>
200 | a	10
201 | b	0
202 | c	0
203 | <end>
204 | <start>
205 | d	0
206 | e	10
207 | f	0
208 | <end>
209 | 
210 | 
211 | """,
212 |         ),
213 |         HarmonyMessage(
214 |             role=Role.SYSTEM,
215 |             content="Now, we're going predict the activation of a new neuron on a single token, following the same rules as the examples above. Activations still range from 0 to 10.",
216 |         ),
217 |         HarmonyMessage(
218 |             role=Role.USER,
219 |             content="""
220 | Neuron 2
221 | Explanation of neuron 2 behavior: the main thing this neuron does is find numbers and nothing else
222 | Text:
223 | ghi
224 | 
225 | Last token in the text:
226 | i
227 | 
228 | Last token activation, considering the token in the context in which it appeared in the text:
229 | """,
230 |         ),
231 |         HarmonyMessage(
232 |             role=Role.ASSISTANT,
233 |             content="""10
234 | 
235 | """,
236 |         ),
237 |         HarmonyMessage(
238 |             role=Role.USER,
239 |             content="""
240 | Neuron 3
241 | Explanation of neuron 3 behavior: the main thing this neuron does is find numbers and nothing else
242 | Text:
243 | 01
244 | 
245 | Last token in the text:
246 | 1
247 | 
248 | Last token activation, considering the token in the context in which it appeared in the text:
249 | """,
250 |         ),
251 |     ]
252 | 
253 |     prompt = ExplanationTokenByTokenSimulator(
254 |         model_name="gpt-4",
255 |         explanation="EXPLANATION",
256 |         few_shot_example_set=FewShotExampleSet.TEST,
257 |         prompt_format=PromptFormat.HARMONY_V4,
258 |     ).make_single_token_simulation_prompt(
259 |         tokens=[str(x) for x in range(3)],
260 |         explanation="numbers and nothing else",
261 |         token_index_to_score=1,
262 |     )
263 | 
264 |     assert isinstance(prompt, list)
265 |     assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
266 |     for actual_message, expected_message in zip(prompt, expected_prompt):
267 |         assert actual_message["role"] == expected_message["role"]
268 |         assert actual_message["content"] == expected_message["content"]
269 |     assert prompt == expected_prompt
270 | 


--------------------------------------------------------------------------------
/neuron-viewer/src/welcome.tsx:
--------------------------------------------------------------------------------
  1 | import { useState, FormEvent } from "react"
  2 | import { useNavigate } from "react-router-dom"
  3 | 
  4 | function NeuronForm() {
  5 |   const [input_layer, setLayer] = useState(0)
  6 |   const [input_neuron, setNeuron] = useState(0)
  7 |   const navigate = useNavigate()
  8 | 
  9 |   const knownGoodNeurons = [
 10 |     /**************
 11 |     /* well explained + interesting
 12 |     ***************/
 13 |     {heading: 'Somewhat well explained by GPT-4', layer: 0, neuron: 0, label: ''},
 14 |     {layer: 5, neuron: 131, label: "citations", description: "citations, especially biblical and legal"},
 15 |     {layer: 12, neuron: 847, label: "numbers in fractions", description: "numbers in fractions"}, // 
 16 |     {layer: 12, neuron: 5820, label: "short flags", description: "single letter command line flags"}, // 
 17 |     {layer: 14, neuron: 417, label: "doing things right", description: "words and phrases related to performing actions correctly or properly"}, // score 0.42
 18 |     {layer: 15, neuron: 4538, label: "leading transitions", description: "transition words at the start of documents"},
 19 |     {layer: 17, neuron: 3218, label: "success", description: "expressions of completion or success"}, // score 0.38
 20 |     {layer: 18, neuron: 5302, label: "X *by*", description: "the word 'by' in phrases indicating side by side or sequential events."}, // score 0.48
 21 |     {layer: 19, neuron: 1377, label: "similes", description: "comparisons and analogies, often using the word 'like'"}, // score 0.42
 22 |     {layer: 21, neuron: 2932, label: "Canada", description: "references to Canadian people, places, and entities"}, // score 0.78
 23 |     {layer: 25, neuron: 2602, label: "similes", description: "descriptive comparisons, especially similes"}, // score 0.40
 24 |     {layer: 25, neuron: 4870, label: "certainty", description: "phrases related to certainty and confidence."}, // score 0.37
 25 |     {layer: 30, neuron: 28, label: "times", description: "specific times (with hours and minutes)"}, 
 26 |     // https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/5/neurons/2326
 27 |     {heading: 'Partially explained by GPT-4', layer: 0, neuron: 0, label: ''},
 28 |     {layer: 0, neuron: 816, label: "Marvel comics vibes", description: "language and context related to Marvel comics, movies, and characters, as well as other superhero-themed content"}, // score 0.44
 29 |     {layer: 0, neuron: 742, label: "Second token 'and'", description: "'and', 'in', and punctuation at the second token"},
 30 |     {layer: 4, neuron: 4342, label: "token counter", description: "counting repeated occurrences of a token"},
 31 |     {layer: 5, neuron: 2326, label: "rhymes with 'at'", description: "syllables rhyming with 'at', sometimes 'it', 'et', 'ot'"},
 32 |     {layer: 5, neuron: 4492, label: "leading 'an'", description: "sentences that start with 'an'"}, // score 0.77
 33 |     {layer: 6, neuron: 3251, label: "not all", description: "not all"},
 34 |     {layer: 10, neuron: 2851, label: "leading acronyms", description: "acronyms after punctuation or newlines"},
 35 |     {layer: 12, neuron: 2884, label: "hypothetical had", description: "had in hypothetical contexts"}, // 
 36 |     {layer: 14, neuron: 3539, label: "long sequences", description: "long sequences of stuff"},
 37 |     {layer: 14, neuron: 3822, label: "X by/after *X*", description: "noun repetitions separated by 'by' or 'after'"},
 38 |     {layer: 21, neuron: 3982, label: "any *and* all", description: "any/anything *and/&* all/everything"},
 39 |     {layer: 26, neuron: 20, label: "truth, skin, or sun", description: "truth, skin, or sun"},
 40 |     // layer=18&neuron=5302
 41 |     /**************
 42 |     /* boring
 43 |     ***************/
 44 |     /**************
 45 |     /* poorly explained + interesting
 46 |     ***************/
 47 |     {heading: 'Poorly explained by GPT-4', layer: 0, neuron: 0, label: ''},
 48 |     // Actually activates for negated version “not so much … as” even when not so much is fairly far apart
 49 |     // another "not all":  13&neuron=1352
 50 |     // {layer: 0, neuron: 2823, label: "Hillary email leak vibes", description: "contexts related to Hillary Clinton leaked emails"}, // score ??
 51 |     // {layer: 12, neuron: 3718, label: "comparative phrases and negations", description: "comparative phrases and negations"}, // score 0.12
 52 |     {layer: 13, neuron: 410, label: "N and N+1", description: "a number following its predecessor"}, // score ??
 53 |     {layer: 13, neuron: 979, label: "subtle plurals", description: "subtle/nonobvious plurals"}, // score ??
 54 |     // slash after number 12&neuron=847
 55 |     // numbers predicting slash: 14&neuron=92
 56 |     // 0&neuron=2823
 57 |     {layer: 14, neuron: 1251, label: "subjunctive verbs", description: "verbs in subjunctive mood"}, // score ??
 58 |     {layer: 16, neuron: 518, label: "pattern breaks", description: "tokens that break an established pattern in an ongoing list"}, // score 0.2 with totally wrong explanation
 59 |     {layer: 17, neuron: 821, label: "idioms", description: "idioms"},
 60 |     {layer: 18, neuron: 3481, label: "post-typo", description: "first token following a typo"}, // score ??
 61 |     {layer: 18, neuron: 3552, label: "repeated text", description: "repeated text"}, // score ??
 62 |     // another shared last names: https://openaipublic.blob.core.windows.net/neuron-explainer/neuron-viewer/index.html#/layers/20/neurons/3164
 63 |     {layer: 19, neuron: 1763, label: "shared last names", description: "last names when two different people sharing last name are mentioned"}, // score 0.36
 64 |     {layer: 20, neuron: 4334, label: "previous break", description: "tokens that previously preceded a linebreak"}, // score ??
 65 |     {layer: 27, neuron: 116, label: "MTG vibes", description: "Magic the Gathering contexts"}, // score ??
 66 |     {layer: 35, neuron: 1523, label: "NBA name predictor", description: "NBA person/player name predictor"}, // score ??
 67 |     // {layer: 36, neuron: 2275, label: "she predictor", description: "prediction of the token 'she'"}, // score ??
 68 |     // {layer: 36, neuron: 5107, label: "Mormon vibes", description: "Mormon related context"}, // score ??
 69 |     // ] predictor 40&neuron=4505
 70 |     {layer: 46, neuron: 2181, label: "C predictor", description: "prediction of the token 'C'"}, // score ??
 71 |   ]
 72 | 
  // Shared handler for the form's onSubmit and the "Go to" button's onClick:
  // cancels the browser's default handling of the event, then routes to the
  // neuron page for the layer/neuron values currently held in component state.
  const handleSubmit = (e: FormEvent) => {
    e.preventDefault()
    navigate(`/layers/${input_layer}/neurons/${input_neuron}`)
    return false
  }
 78 | 
  // Click handler for the buttons rendered from knownGoodNeurons: routes to
  // the page of the given layer/neuron pair.
  const handleNeuronClick = (layer: number, neuron: number) => {
    navigate(`/layers/${layer}/neurons/${neuron}`)
  }
 82 | 
  // "I'm feeling lucky" handler: draws a random integer layer in [0, 48) and a
  // random integer neuron index in [0, 6400), then routes to that neuron.
  // These bounds match the max={47} / max={6399} limits on the number inputs
  // of the form in this component.
  const feelingLuckySubmit = () => {
    const layer = Math.floor(Math.random() * 48);
    const neuron = Math.floor(Math.random() * 6400);
    navigate(`/layers/${layer}/neurons/${neuron}`)
    return false
  }
 89 | 
 90 | 
 91 |   return (
 92 |     <div className="flex flex-col items-center justify-center">
 93 |       <h1 className="text-2xl font-bold mb-4">Welcome!  Pick a neuron:</h1>
 94 |       <form
 95 |         onSubmit={handleSubmit}
 96 |         className="flex flex-col items-center justify-center"
 97 |         style={{ flexFlow: 'row wrap' }}
 98 |       >
 99 |         Layer <input
100 |           type="number"
101 |           id="inputLayer"
102 |           value={input_layer}
103 |           min={0}
104 |           max={47}
105 |           style={{ width: 70, marginLeft: 10, marginRight: 10 }}
106 |           onChange={(e) => setLayer(parseInt(e.target.value))}
107 |           className="border border-gray-300 rounded-md p-2"
108 |         />
109 |         Index <input
110 |           type="number"
111 |           id="inputNeuron"
112 |           value={input_neuron}
113 |           min={0}
114 |           max={6399}
115 |           style={{ width: 70, marginLeft: 10, marginRight: 10 }}
116 |           onChange={(e) => setNeuron(parseInt(e.target.value))}
117 |           className="border border-gray-300 rounded-md p-2"
118 |         />
119 |         <button
120 |           onClick={handleSubmit}
121 |           className="border border-gray-300 rounded-md p-2 mb-4 mt-4"
122 |         >
123 |           Go to {input_layer}:{input_neuron}
124 |         </button>
125 |       </form>
126 |       <button
127 |         onClick={feelingLuckySubmit}
128 |         className="border border-gray-300 rounded-md p-2 mb-4 mt-4"
129 |       >
130 |         I'm feeling lucky
131 |       </button>
132 |       <div className="mt-4">
133 |         <h2 className="text-xl font-bold mb-2">Interesting neurons:</h2>
134 |         <div className="mb-10 flex-row">
135 |           <div
136 |             className="flex flex-flow flex-wrap"
137 |           >
138 |             {knownGoodNeurons.map(({ heading, layer, neuron, label, description }, j) => (
139 |               heading ? <div style={{width: '100%'}} key={j}><h4>
140 |               {heading}
141 |               </h4></div> : <button
142 |                 onClick={() => handleNeuronClick(layer, neuron)}
143 |                 key={`${layer}:${neuron}`}
144 |                 style={{ width: 200 }}
145 |                 className="m-2 text-blue-500 hover:text-blue-700"
146 |                 title={description}
147 |               >
148 |                 {label} ({layer}:{neuron})
149 |               </button>
150 |             ))}
151 |           </div>
152 |         </div>
153 |       </div>
154 |     </div>
155 |   )
156 | }
157 | 
158 | export default NeuronForm
159 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/explanations.py:
--------------------------------------------------------------------------------
  1 | # Dataclasses and enums for storing neuron explanations, their scores, and related data. Also,
  2 | # related helper functions.
  3 | 
  4 | from __future__ import annotations
  5 | 
  6 | import json
  7 | from dataclasses import dataclass
  8 | from enum import Enum
  9 | from typing import List, Optional, Union
 10 | 
 11 | import blobfile as bf
 12 | import boostedblob as bbb
 13 | from neuron_explainer.activations.activations import NeuronId
 14 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
 15 | 
 16 | 
class ActivationScale(str, Enum):
    """Which "units" are stored in the expected_activations/distribution_values fields of a
    SequenceSimulation.

    This enum identifies whether the values represent real activations of the neuron or something
    else. Different scales are not necessarily related by a linear transformation.

    Because the class mixes in ``str``, every member is also a plain string equal to its value.
    """

    NEURON_ACTIVATIONS = "neuron_activations"
    """Values represent real activations of the neuron."""
    SIMULATED_NORMALIZED_ACTIVATIONS = "simulated_normalized_activations"
    """
    Values represent simulated activations of the neuron, normalized to the range [0, 10]. This
    scale is arbitrary and should not be interpreted as a neuron activation.
    """
 32 | 
 33 | 
@register_dataclass
@dataclass
class SequenceSimulation(FastDataclass):
    """The result of a simulation of neuron activations on one text sequence.

    The per-token fields (expected_activations, distribution_values and
    distribution_probabilities) each hold one entry per token of ``tokens``. The scale of the
    stored values is recorded in ``activation_scale``.
    """

    tokens: list[str]
    """The sequence of tokens that was simulated."""
    expected_activations: list[float]
    """Expected value of the possibly-normalized activation for each token in the sequence."""
    activation_scale: ActivationScale
    """What scale is used for values in the expected_activations field."""
    distribution_values: list[list[float]]
    """
    For each token in the sequence, a list of values from the discrete distribution of activations
    produced from simulation. Tokens will be included here if and only if they are in the top K=15
    tokens predicted by the simulator, and excluded otherwise.
    
    May be transformed to another unit by calibration. When we simulate a neuron, we produce a
    discrete distribution with values in the arbitrary discretized space of the neuron, e.g. 10%
    chance of 0, 70% chance of 1, 20% chance of 2. Which we store as distribution_values =
    [0, 1, 2], distribution_probabilities = [0.1, 0.7, 0.2]. When we transform the distribution to
    the real activation units, we can correspondingly transform the values of this distribution
    to get a distribution in the units of the neuron. e.g. if the mapping from the discretized space
    to the real activation unit of the neuron is f(x) = x/2, then the distribution becomes 10%
    chance of 0, 70% chance of 0.5, 20% chance of 1. Which we store as distribution_values =
    [0, 0.5, 1], distribution_probabilities = [0.1, 0.7, 0.2].
    """
    distribution_probabilities: list[list[float]]
    """
    For each token in the sequence, the probability of the corresponding value in
    distribution_values.
    """

    # The only field with a default: None when no pre-calibration result is attached.
    uncalibrated_simulation: Optional["SequenceSimulation"] = None
    """The result of the simulation before calibration."""
 69 | 
 70 | 
@register_dataclass
@dataclass
class ScoredSequenceSimulation(FastDataclass):
    """
    SequenceSimulation result with a score (for that sequence only) and ground truth activations.

    Only ev_correlation_score is required; the R^2 and absolute-deviation scores default to None.
    """

    simulation: SequenceSimulation
    """The result of a simulation of neuron activations."""
    true_activations: List[float]
    """Ground truth activations on the sequence (not normalized)"""
    ev_correlation_score: float
    """
    Correlation coefficient between the expected values of the normalized activations from the
    simulation and the unnormalized true activations of the neuron on the text sequence.
    """
    # Optional metrics: left as None when they were not computed.
    rsquared_score: Optional[float] = None
    """R^2 of the simulated activations."""
    absolute_dev_explained_score: Optional[float] = None
    """
    Score based on absolute difference between real and simulated activations.
    absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real))
    """
 94 | 
 95 | 
@register_dataclass
@dataclass
class ScoredSimulation(FastDataclass):
    """Result of scoring a neuron simulation on multiple sequences.

    Holds the per-sequence results plus optional aggregate scores. All three aggregate scores
    default to None.
    """

    scored_sequence_simulations: List[ScoredSequenceSimulation]
    """ScoredSequenceSimulation for each sequence"""
    ev_correlation_score: Optional[float] = None
    """
    Correlation coefficient between the expected values of the normalized activations from the
    simulation and the unnormalized true activations on a dataset created from all score_results.
    (Note that this is not equivalent to averaging across sequences.)
    """
    rsquared_score: Optional[float] = None
    """R^2 of the simulated activations."""
    absolute_dev_explained_score: Optional[float] = None
    """
    Score based on absolute difference between real and simulated activations.
    absolute_dev_explained_score = 1 - mean(abs(real-predicted))/ mean(abs(real)).
    """

    def get_preferred_score(self) -> Optional[float]:
        """
        Return the score treated as the headline metric, which is ev_correlation_score.

        This method may return None in cases where the score is undefined, for example if the
        normalized activations were all zero, yielding a correlation coefficient of NaN.
        """
        return self.ev_correlation_score
123 | 
124 | 
@register_dataclass
@dataclass
class ScoredExplanation(FastDataclass):
    """Simulator parameters and the results of scoring it on multiple sequences

    Pairs one explanation string with the ScoredSimulation obtained by simulating it.
    """

    explanation: str
    """The explanation used for simulation."""

    scored_simulation: ScoredSimulation
    """Result of scoring the neuron simulator on multiple sequences."""

    def get_preferred_score(self) -> Optional[float]:
        """
        Return the preferred score of the wrapped scored_simulation (delegates to it).

        This method may return None in cases where the score is undefined, for example if the
        normalized activations were all zero, yielding a correlation coefficient of NaN.
        """
        return self.scored_simulation.get_preferred_score()
142 | 
143 | 
@register_dataclass
@dataclass
class NeuronSimulationResults(FastDataclass):
    """Simulation results and scores for a neuron.

    This is the type the explanation loaders in this module are annotated to return.
    """

    # Identifies the neuron (layer index + neuron index) these results belong to.
    neuron_id: NeuronId
    # Scored explanations recorded for the neuron, each carrying its own ScoredSimulation.
    scored_explanations: list[ScoredExplanation]
151 | 
152 | 
def load_neuron_explanations(
    explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> Optional[NeuronSimulationResults]:
    """Return the scored explanations stored for one neuron, or None if there are none.

    The record is deserialized from the first line of
    ``<explanations_path>/<layer_index>/<neuron_index>.jsonl``; any further lines are ignored.
    None is returned when the file does not exist or contains no lines.
    """
    jsonl_path = bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")
    if bf.exists(jsonl_path):
        with bf.BlobFile(jsonl_path) as handle:
            # Only the first line is ever consumed.
            first_line = next(iter(handle), None)
        if first_line is not None:
            return loads(first_line)
    return None
164 | 
165 | 
@bbb.ensure_session
async def load_neuron_explanations_async(
    explanations_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> Optional[NeuronSimulationResults]:
    """Asynchronously load the scored explanations for one neuron.

    Builds the path ``<explanations_path>/<layer_index>/<neuron_index>.jsonl`` and delegates the
    actual reading and parsing to read_explanation_file.
    """
    jsonl_path = bf.join(explanations_path, str(layer_index), f"{neuron_index}.jsonl")
    return await read_explanation_file(jsonl_path)
174 | 
175 | 
@bbb.ensure_session
async def read_file(filename: str) -> Optional[str]:
    """Asynchronously read a file that is expected to hold exactly one non-empty line.

    Returns that line as a string. If the file cannot be found, a message is printed and None is
    returned. The contents are decoded as UTF-8 and empty lines are discarded; an AssertionError
    (carrying the filename) is raised unless exactly one line remains.
    """
    try:
        payload = await bbb.read.read_single(filename)
    except FileNotFoundError:
        print(f"Could not read {filename}")
        return None
    non_empty_lines = [chunk for chunk in payload.decode("utf-8").split("\n") if chunk]
    assert len(non_empty_lines) == 1, filename
    return non_empty_lines[0]
190 | 
191 | 
@bbb.ensure_session
async def read_explanation_file(explanation_filename: str) -> Optional[NeuronSimulationResults]:
    """Asynchronously read and deserialize the scored explanations stored in the given file.

    Returns None when read_file yields no content (i.e. the file was not found).
    """
    serialized = await read_file(explanation_filename)
    if serialized is None:
        return None
    return loads(serialized)
197 | 
198 | 
@bbb.ensure_session
async def read_json_file(filename: str) -> Optional[dict]:
    """Asynchronously read the given single-line file and parse it as JSON.

    Returns None when read_file yields no content (i.e. the file was not found).
    """
    serialized = await read_file(filename)
    if serialized is None:
        return None
    return json.loads(serialized)
204 | 
205 | 
def get_numerical_subdirs(dataset_path: str) -> list[str]:
    """Return the names of all numbered subdirectories in the specified directory.

    Used to get all layer directories in an explanation directory.

    Args:
        dataset_path: Directory whose immediate children are inspected.

    Returns:
        The numeric subdirectory names as strings, sorted by integer value (so "10" comes
        after "9"). Names are round-tripped through int(), so e.g. "007" is returned as "7".
    """
    layer_numbers = sorted(
        int(name)
        for name in bf.listdir(dataset_path)
        # Check the name first: it is a local string test, whereas isdir() has to query the
        # filesystem (which may be a blob store), so non-numeric entries cost no extra lookup.
        if name.isnumeric() and bf.isdir(bf.join(dataset_path, name))
    )
    return [str(number) for number in layer_numbers]
221 | 
222 | 
def get_sorted_neuron_indices_from_explanations(
    explanations_path: str, layer: Union[str, int]
) -> list[int]:
    """List the neuron indices found in a layer's explanation directory, in ascending order.

    An entry counts as a neuron when the part of its name before the first "." is numeric.
    """
    layer_dir = bf.join(explanations_path, str(layer))
    stems = (entry.split(".")[0] for entry in bf.listdir(layer_dir))
    return sorted(int(stem) for stem in stems if stem.isnumeric())
231 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/activations/activations.py:
--------------------------------------------------------------------------------
  1 | # Dataclasses and enums for storing neuron-indexed information about activations. Also, related
  2 | # helper functions.
  3 | 
  4 | import math
  5 | from dataclasses import dataclass, field
  6 | from typing import List, Optional, Union
  7 | 
  8 | import urllib.request
  9 | import blobfile as bf
 10 | import boostedblob as bbb
 11 | from neuron_explainer.fast_dataclasses import FastDataclass, loads, register_dataclass
 12 | from neuron_explainer.azure import standardize_azure_url
 13 | 
 14 | 
@register_dataclass
@dataclass
class ActivationRecord(FastDataclass):
    """Collated lists of tokens and their activations for a single neuron.

    The two lists describe the same text sequence: ``activations`` holds the neuron's value on
    each token of ``tokens``.
    """

    tokens: List[str]
    """Tokens in the text sequence, represented as strings."""
    activations: List[float]
    """Raw activation values for the neuron on each token in the text sequence."""
 24 | 
 25 | 
@register_dataclass
@dataclass
class NeuronId(FastDataclass):
    """Identifier for a neuron in an artificial neural network.

    A neuron is addressed by the pair (layer_index, neuron_index); both are zero-based.
    """

    layer_index: int
    """The index of layer the neuron is in. The first layer used during inference has index 0."""
    neuron_index: int
    """The neuron's index within in its layer. Indices start from 0 in each layer."""
 35 | 
 36 | 
 37 | def _check_slices(
 38 |     slices_by_split: dict[str, slice],
 39 |     expected_num_values: int,
 40 | ) -> None:
 41 |     """Assert that the slices are disjoint and fully cover the intended range."""
 42 |     indices = set()
 43 |     sum_of_slice_lengths = 0
 44 |     n_splits = len(slices_by_split.keys())
 45 |     for s in slices_by_split.values():
 46 |         subrange = range(expected_num_values)[s]
 47 |         sum_of_slice_lengths += len(subrange)
 48 |         indices |= set(subrange)
 49 |     assert (
 50 |         sum_of_slice_lengths == expected_num_values
 51 |     ), f"{sum_of_slice_lengths=} != {expected_num_values=}"
 52 |     stride = n_splits
 53 |     expected_indices = set.union(
 54 |         *[set(range(start_index, expected_num_values, stride)) for start_index in range(n_splits)]
 55 |     )
 56 |     assert indices == expected_indices, f"{indices=} != {expected_indices=}"
 57 | 
 58 | 
 59 | def get_slices_for_splits(
 60 |     splits: list[str],
 61 |     num_activation_records_per_split: int,
 62 | ) -> dict[str, slice]:
 63 |     """
 64 |     Get equal-sized interleaved subsets for each of a list of splits, given the number of elements
 65 |     to include in each split.
 66 |     """
 67 | 
 68 |     stride = len(splits)
 69 |     num_activation_records_for_even_splits = num_activation_records_per_split * stride
 70 |     slices_by_split = {
 71 |         split: slice(split_index, num_activation_records_for_even_splits, stride)
 72 |         for split_index, split in enumerate(splits)
 73 |     }
 74 |     _check_slices(
 75 |         slices_by_split=slices_by_split,
 76 |         expected_num_values=num_activation_records_for_even_splits,
 77 |     )
 78 |     return slices_by_split
 79 | 
 80 | 
@dataclass
class ActivationRecordSliceParams:
    """How to select splits (train, valid, etc.) of activation records."""

    # When None, NeuronRecord derives the per-split count itself by integer-dividing the number
    # of records it holds by the number of splits.
    n_examples_per_split: Optional[int]
    """The number of examples to include in each split."""
 87 | 
 88 | 
 89 | @register_dataclass
 90 | @dataclass
 91 | class NeuronRecord(FastDataclass):
 92 |     """Neuron-indexed activation data, including summary stats and notable activation records."""
 93 | 
 94 |     neuron_id: NeuronId
 95 |     """Identifier for the neuron."""
 96 | 
 97 |     random_sample: list[ActivationRecord] = field(default_factory=list)
 98 |     """
 99 |     Random activation records for this neuron. The random sample is independent from those used for
100 |     other neurons.
101 |     """
102 |     random_sample_by_quantile: Optional[list[list[ActivationRecord]]] = None
103 |     """
104 |     Random samples of activation records in each of the specified quantiles. None if quantile
105 |     tracking is disabled.
106 |     """
107 |     quantile_boundaries: Optional[list[float]] = None
108 |     """Boundaries of the quantiles used to generate the random_sample_by_quantile field."""
109 | 
110 |     # Moments of activations
111 |     mean: Optional[float] = math.nan
112 |     variance: Optional[float] = math.nan
113 |     skewness: Optional[float] = math.nan
114 |     kurtosis: Optional[float] = math.nan
115 | 
116 |     most_positive_activation_records: list[ActivationRecord] = field(default_factory=list)
117 |     """
118 |     Activation records with the most positive figure of merit value for this neuron over all dataset
119 |     examples.
120 |     """
121 | 
122 |     @property
123 |     def max_activation(self) -> float:
124 |         """Return the maximum activation value over all top-activating activation records."""
125 |         return max([max(ar.activations) for ar in self.most_positive_activation_records])
126 | 
127 |     def _get_top_activation_slices(
128 |         self, activation_record_slice_params: ActivationRecordSliceParams
129 |     ) -> dict[str, slice]:
130 |         splits = ["train", "calibration", "valid", "test"]
131 |         n_examples_per_split = activation_record_slice_params.n_examples_per_split
132 |         if n_examples_per_split is None:
133 |             n_examples_per_split = len(self.most_positive_activation_records) // len(splits)
134 |         assert len(self.most_positive_activation_records) >= n_examples_per_split * len(splits)
135 |         return get_slices_for_splits(splits, n_examples_per_split)
136 | 
137 |     def _get_random_activation_slices(
138 |         self, activation_record_slice_params: ActivationRecordSliceParams
139 |     ) -> dict[str, slice]:
140 |         splits = ["calibration", "valid", "test"]
141 |         n_examples_per_split = activation_record_slice_params.n_examples_per_split
142 |         if n_examples_per_split is None:
143 |             n_examples_per_split = len(self.random_sample) // len(splits)
144 |         # NOTE: this assert could trigger on some old datasets with only 10 random samples, in which case you may have to remove "test" from the set of splits
145 |         assert len(self.random_sample) >= n_examples_per_split * len(splits)
146 |         return get_slices_for_splits(splits, n_examples_per_split)
147 | 
148 |     def train_activation_records(
149 |         self,
150 |         activation_record_slice_params: ActivationRecordSliceParams,
151 |     ) -> list[ActivationRecord]:
152 |         """
153 |         Train split, typically used for generating explanations. Consists exclusively of
154 |         top-activating records since context window limitations make it difficult to include
155 |         random records.
156 |         """
157 |         return self.most_positive_activation_records[
158 |             self._get_top_activation_slices(activation_record_slice_params)["train"]
159 |         ]
160 | 
161 |     def calibration_activation_records(
162 |         self,
163 |         activation_record_slice_params: ActivationRecordSliceParams,
164 |     ) -> list[ActivationRecord]:
165 |         """
166 |         Calibration split, typically used for calibrating neuron simulations. See
167 |         http://go/neuron_explanation_methodology for an explanation of calibration. Consists of
168 |         top-activating records and random records in a 1:1 ratio.
169 |         """
170 |         return (
171 |             self.most_positive_activation_records[
172 |                 self._get_top_activation_slices(activation_record_slice_params)["calibration"]
173 |             ]
174 |             + self.random_sample[
175 |                 self._get_random_activation_slices(activation_record_slice_params)["calibration"]
176 |             ]
177 |         )
178 | 
179 |     def valid_activation_records(
180 |         self,
181 |         activation_record_slice_params: ActivationRecordSliceParams,
182 |     ) -> list[ActivationRecord]:
183 |         """
184 |         Validation split, typically used for evaluating explanations, either automatically with
185 |         simulation + correlation coefficient scoring, or manually by humans. Consists of
186 |         top-activating records and random records in a 1:1 ratio.
187 |         """
188 |         return (
189 |             self.most_positive_activation_records[
190 |                 self._get_top_activation_slices(activation_record_slice_params)["valid"]
191 |             ]
192 |             + self.random_sample[
193 |                 self._get_random_activation_slices(activation_record_slice_params)["valid"]
194 |             ]
195 |         )
196 | 
197 |     def test_activation_records(
198 |         self,
199 |         activation_record_slice_params: ActivationRecordSliceParams,
200 |     ) -> list[ActivationRecord]:
201 |         """
202 |         Test split, typically used for explanation evaluations that can't use the validation split.
203 |         Consists of top-activating records and random records in a 1:1 ratio.
204 |         """
205 |         return (
206 |             self.most_positive_activation_records[
207 |                 self._get_top_activation_slices(activation_record_slice_params)["test"]
208 |             ]
209 |             + self.random_sample[
210 |                 self._get_random_activation_slices(activation_record_slice_params)["test"]
211 |             ]
212 |         )
213 | 
214 | 
def neuron_exists(
    dataset_path: str, layer_index: Union[str, int], neuron_index: Union[str, int]
) -> bool:
    """Report whether ``<dataset_path>/neurons/<layer_index>/<neuron_index>.json`` exists."""
    path_parts = (dataset_path, "neurons", str(layer_index), f"{neuron_index}.json")
    return bf.exists(bf.join(*path_parts))
221 | 
222 | 
def load_neuron(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """Fetch and deserialize the NeuronRecord of the specified neuron.

    The record is downloaded from ``<dataset_path>/<layer_index>/<neuron_index>.json`` after the
    URL has been passed through standardize_azure_url.

    Raises:
        ValueError: If the stored data does not deserialize to a NeuronRecord.
    """
    path_parts = [dataset_path, str(layer_index), f"{neuron_index}.json"]
    url = standardize_azure_url("/".join(path_parts))
    with urllib.request.urlopen(url) as response:
        payload = response.read()
    record = loads(payload)
    if isinstance(record, NeuronRecord):
        return record
    raise ValueError(
        "Stored data incompatible with current version of NeuronRecord dataclass."
    )
238 | 
239 | 
@bbb.ensure_session
async def load_neuron_async(
    layer_index: Union[str, int],
    neuron_index: Union[str, int],
    dataset_path: str = "az://openaipublic/neuron-explainer/data/collated-activations",
) -> NeuronRecord:
    """Asynchronous counterpart of load_neuron.

    Builds the path ``<dataset_path>/<layer_index>/<neuron_index>.json`` and hands it to
    read_neuron_file.
    """
    neuron_filename = bf.join(dataset_path, str(layer_index), f"{neuron_index}.json")
    return await read_neuron_file(neuron_filename)
249 | 
250 | 
@bbb.ensure_session
async def read_neuron_file(neuron_filename: str) -> NeuronRecord:
    """Asynchronously read a NeuronRecord from a raw neuron filename.

    The file contents are decoded as UTF-8 before deserialization.

    Raises:
        ValueError: If the stored data does not deserialize to a NeuronRecord.
    """
    payload = await bbb.read.read_single(neuron_filename)
    record = loads(payload.decode("utf-8"))
    if isinstance(record, NeuronRecord):
        return record
    raise ValueError(
        "Stored data incompatible with current version of NeuronRecord dataclass."
    )
261 | 
262 | 
def get_sorted_neuron_indices(dataset_path: str, layer_index: Union[str, int]) -> List[int]:
    """List the neuron indices stored under ``neurons/<layer_index>``, in ascending order.

    An entry counts as a neuron when the part of its name before the first "." is numeric.
    """
    layer_dir = bf.join(dataset_path, "neurons", str(layer_index))
    stems = (entry.split(".")[0] for entry in bf.listdir(layer_dir))
    return sorted(int(stem) for stem in stems if stem.isnumeric())
269 | 
270 | 
def get_sorted_layers(dataset_path: str) -> List[str]:
    """
    List the layer indices found under ``<dataset_path>/neurons`` as strings, ordered by their
    integer value rather than lexicographically.
    """
    neurons_dir = bf.join(dataset_path, "neurons")
    layer_numbers = sorted(int(name) for name in bf.listdir(neurons_dir) if name.isnumeric())
    return [str(number) for number in layer_numbers]
281 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/explainer.py:
--------------------------------------------------------------------------------
  1 | """Uses API calls to generate explanations of neuron behavior."""
  2 | 
  3 | from __future__ import annotations
  4 | 
  5 | import logging
  6 | import re
  7 | from abc import ABC, abstractmethod
  8 | from enum import Enum
  9 | from typing import Any, Optional, Sequence, Union
 10 | 
 11 | from neuron_explainer.activations.activation_records import (
 12 |     calculate_max_activation,
 13 |     format_activation_records,
 14 |     non_zero_activation_proportion,
 15 | )
 16 | from neuron_explainer.activations.activations import ActivationRecord
 17 | from neuron_explainer.api_client import ApiClient
 18 | from neuron_explainer.explanations.few_shot_examples import FewShotExampleSet
 19 | from neuron_explainer.explanations.prompt_builder import (
 20 |     HarmonyMessage,
 21 |     PromptBuilder,
 22 |     PromptFormat,
 23 |     Role,
 24 | )
 25 | from neuron_explainer.explanations.token_space_few_shot_examples import (
 26 |     TokenSpaceFewShotExampleSet,
 27 | )
 28 | 
 29 | logger = logging.getLogger(__name__)
 30 | 
 31 | 
# Fixed opening phrase for neuron explanations.
# NOTE(review): presumably used to seed/strip the start of generated explanations; its call
# sites are outside this excerpt, so confirm before relying on that.
# TODO(williamrs): This prefix may not work well for some things, like predicting the next token.
# Try other options like "this neuron activates for".
EXPLANATION_PREFIX = "the main thing this neuron does is find"
 35 | 
 36 | 
 37 | def _split_numbered_list(text: str) -> list[str]:
 38 |     """Split a numbered list into a list of strings."""
 39 |     lines = re.split(r"\n\d+\.", text)
 40 |     # Strip the leading whitespace from each line.
 41 |     return [line.lstrip() for line in lines]
 42 | 
 43 | 
 44 | def _remove_final_period(text: str) -> str:
 45 |     """Strip a final period or period-space from a string."""
 46 |     if text.endswith("."):
 47 |         return text[:-1]
 48 |     elif text.endswith(". "):
 49 |         return text[:-2]
 50 |     return text
 51 | 
 52 | 
 53 | class ContextSize(int, Enum):
 54 |     TWO_K = 2049
 55 |     FOUR_K = 4097
 56 | 
 57 |     @classmethod
 58 |     def from_int(cls, i: int) -> ContextSize:
 59 |         for context_size in cls:
 60 |             if context_size.value == i:
 61 |                 return context_size
 62 |         raise ValueError(f"{i} is not a valid ContextSize")
 63 | 
 64 | 
# Model names that NeuronExplainer.__init__ requires when prompt_format is
# PromptFormat.HARMONY_V4; for the NONE and INSTRUCTION_FOLLOWING formats it asserts that the
# model is NOT in this list.
HARMONY_V4_MODELS = ["gpt-3.5-turbo", "gpt-4", "gpt-4-1106-preview"]
 66 | 
 67 | 
 68 | class NeuronExplainer(ABC):
 69 |     """
 70 |     Abstract base class for Explainer classes that generate explanations from subclass-specific
 71 |     input data.
 72 |     """
 73 | 
 74 |     def __init__(
 75 |         self,
 76 |         model_name: str,
 77 |         prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
 78 |         # This parameter lets us adjust the length of the prompt when we're generating explanations
 79 |         # using older models with shorter context windows. In the future we can use it to experiment
 80 |         # with longer context windows.
 81 |         context_size: ContextSize = ContextSize.FOUR_K,
 82 |         max_concurrent: Optional[int] = 10,
 83 |         cache: bool = False,
 84 |     ):
 85 |         if prompt_format == PromptFormat.HARMONY_V4:
 86 |             assert model_name in HARMONY_V4_MODELS
 87 |         elif prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
 88 |             assert model_name not in HARMONY_V4_MODELS
 89 |         else:
 90 |             raise ValueError(f"Unhandled prompt format {prompt_format}")
 91 | 
 92 |         self.model_name = model_name
 93 |         self.prompt_format = prompt_format
 94 |         self.context_size = context_size
 95 |         self.client = ApiClient(model_name=model_name, max_concurrent=max_concurrent, cache=cache)
 96 | 
 97 |     async def generate_explanations(
 98 |         self,
 99 |         *,
100 |         num_samples: int = 5,
101 |         max_tokens: int = 60,
102 |         temperature: float = 1.0,
103 |         top_p: float = 1.0,
104 |         **prompt_kwargs: Any,
105 |     ) -> list[Any]:
106 |         """Generate explanations based on subclass-specific input data."""
107 |         prompt = self.make_explanation_prompt(max_tokens_for_completion=max_tokens, **prompt_kwargs)
108 | 
109 |         generate_kwargs: dict[str, Any] = {
110 |             "n": num_samples,
111 |             "max_tokens": max_tokens,
112 |             "temperature": temperature,
113 |             "top_p": top_p,
114 |         }
115 | 
116 |         if self.prompt_format == PromptFormat.HARMONY_V4:
117 |             assert isinstance(prompt, list)
118 |             assert isinstance(prompt[0], dict)  # Really a HarmonyMessage
119 |             generate_kwargs["messages"] = prompt
120 |         else:
121 |             assert isinstance(prompt, str)
122 |             generate_kwargs["prompt"] = prompt
123 | 
124 |         response = await self.client.make_request(**generate_kwargs)
125 |         logger.debug("response in generate_explanations is %s", response)
126 | 
127 |         if self.prompt_format == PromptFormat.HARMONY_V4:
128 |             explanations = [x["message"]["content"] for x in response["choices"]]
129 |         elif self.prompt_format in [PromptFormat.NONE, PromptFormat.INSTRUCTION_FOLLOWING]:
130 |             explanations = [x["text"] for x in response["choices"]]
131 |         else:
132 |             raise ValueError(f"Unhandled prompt format {self.prompt_format}")
133 | 
134 |         return self.postprocess_explanations(explanations, prompt_kwargs)
135 | 
136 |     @abstractmethod
137 |     def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
138 |         """
139 |         Create a prompt to send to the API to generate one or more explanations.
140 | 
141 |         A prompt can be a simple string, or a list of HarmonyMessages, depending on the PromptFormat
142 |         used by this instance.
143 |         """
144 |         ...
145 | 
146 |     def postprocess_explanations(
147 |         self, completions: list[str], prompt_kwargs: dict[str, Any]
148 |     ) -> list[Any]:
149 |         """Postprocess the completions returned by the API into a list of explanations."""
150 |         return completions  # no-op by default
151 | 
152 |     def _prompt_is_too_long(
153 |         self, prompt_builder: PromptBuilder, max_tokens_for_completion: int
154 |     ) -> bool:
155 |         # We'll get a context size error if the prompt itself plus the maximum number of tokens for
156 |         # the completion is longer than the context size.
157 |         prompt_length = prompt_builder.prompt_length_in_tokens(self.prompt_format)
158 |         if prompt_length + max_tokens_for_completion > self.context_size.value:
159 |             print(
160 |                 f"Prompt is too long: {prompt_length} + {max_tokens_for_completion} > "
161 |                 f"{self.context_size.value}"
162 |             )
163 |             return True
164 |         return False
165 | 
166 | 
class TokenActivationPairExplainer(NeuronExplainer):
    """
    Generate explanations of neuron behavior using a prompt with lists of token/activation pairs.

    The prompt is made of a system message describing the task, one section per few-shot example
    (the example's activations followed by its explanation) and a final section containing the
    activations of the neuron to explain.
    """

    def __init__(
        self,
        model_name: str,
        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
        # This parameter lets us adjust the length of the prompt when we're generating explanations
        # using older models with shorter context windows. In the future we can use it to experiment
        # with 8k+ context windows.
        context_size: ContextSize = ContextSize.FOUR_K,
        few_shot_example_set: FewShotExampleSet = FewShotExampleSet.ORIGINAL,
        repeat_non_zero_activations: bool = True,
        max_concurrent: Optional[int] = 10,
        cache: bool = False,
    ):
        """
        Args:
            model_name: Name of the model used to generate explanations.
            prompt_format: Format of the prompt sent to the model.
            context_size: Context window the prompt (plus completion) must fit in.
            few_shot_example_set: Which few-shot examples to include in the prompt.
            repeat_non_zero_activations: Whether to list the non-zero activations a second time
                when they make up a small share of all activations.
            max_concurrent: Forwarded to the API client.
            cache: Forwarded to the API client.
        """
        # The base class stores context_size, so hand it over instead of assigning it afterwards.
        super().__init__(
            model_name=model_name,
            prompt_format=prompt_format,
            context_size=context_size,
            max_concurrent=max_concurrent,
            cache=cache,
        )
        self.few_shot_example_set = few_shot_example_set
        self.repeat_non_zero_activations = repeat_non_zero_activations

    def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
        """
        Build the explanation prompt for one neuron.

        Keyword arguments:
            all_activation_records: Activation records of the neuron to explain (required).
            max_activation: Maximum activation used when formatting that neuron's records
                (required).
            max_tokens_for_completion: Completion budget used for the length check (required).
            numbered_list_of_n_explanations: When set, solicit a numbered list with this many
                explanations instead of a single explanation. Must be positive.
            omit_n_activation_records: Number of few-shot activation records to drop in order to
                shorten the prompt. Defaults to 0; callers normally leave it unset, since this
                method raises it itself when the prompt is too long.

        Returns:
            The built prompt, in the form required by this instance's prompt format.
        """
        original_kwargs = kwargs.copy()
        all_activation_records: Sequence[ActivationRecord] = kwargs.pop("all_activation_records")
        max_activation: float = kwargs.pop("max_activation")
        numbered_list_of_n_explanations: Optional[int] = kwargs.pop(
            "numbered_list_of_n_explanations", None
        )
        if numbered_list_of_n_explanations is not None:
            assert numbered_list_of_n_explanations > 0, numbered_list_of_n_explanations
        # This parameter lets us dynamically shrink the prompt if our initial attempt to create it
        # results in something that's too long. It's only implemented for the 4k context size.
        omit_n_activation_records: int = kwargs.pop("omit_n_activation_records", 0)
        max_tokens_for_completion: int = kwargs.pop("max_tokens_for_completion")
        assert not kwargs, f"Unexpected kwargs: {kwargs}"

        prompt_builder = PromptBuilder()
        prompt_builder.add_message(
            Role.SYSTEM,
            "We're studying neurons in a neural network. Each neuron looks for some particular "
            "thing in a short document. Look at the parts of the document the neuron activates for "
            "and summarize in a single sentence what the neuron is looking for. Don't list "
            "examples of words.\n\nThe activation format is token<tab>activation. Activation "
            "values range from 0 to 10. A neuron finding what it's looking for is represented by a "
            "non-zero activation value. The higher the activation value, the stronger the match.",
        )
        few_shot_examples = self.few_shot_example_set.get_examples()
        num_omitted_activation_records = 0
        for i, few_shot_example in enumerate(few_shot_examples):
            few_shot_activation_records = few_shot_example.activation_records
            if self.context_size == ContextSize.TWO_K:
                # If we're using a 2k context window, we only have room for one activation record
                # per few-shot example. (Two few-shot examples with one activation record each seems
                # to work better than one few-shot example with two activation records, in local
                # testing.)
                few_shot_activation_records = few_shot_activation_records[:1]
            elif (
                self.context_size == ContextSize.FOUR_K
                and num_omitted_activation_records < omit_n_activation_records
            ):
                # Drop the last activation record for this few-shot example to save tokens, assuming
                # there are at least two activation records.
                if len(few_shot_activation_records) > 1:
                    print(f"Warning: omitting activation record from few-shot example {i}")
                    few_shot_activation_records = few_shot_activation_records[:-1]
                    num_omitted_activation_records += 1
            self._add_per_neuron_explanation_prompt(
                prompt_builder,
                few_shot_activation_records,
                i,
                # The maximum is taken over all of the example's records, including any that were
                # dropped above.
                calculate_max_activation(few_shot_example.activation_records),
                numbered_list_of_n_explanations=numbered_list_of_n_explanations,
                explanation=few_shot_example.explanation,
            )
        self._add_per_neuron_explanation_prompt(
            prompt_builder,
            # If we're using a 2k context window, we only have room for two of the activation
            # records.
            all_activation_records[:2]
            if self.context_size == ContextSize.TWO_K
            else all_activation_records,
            len(few_shot_examples),
            max_activation,
            numbered_list_of_n_explanations=numbered_list_of_n_explanations,
            explanation=None,
        )
        # If the prompt is too long *and* we omitted the specified number of activation records, try
        # again, omitting one more. (If we didn't make the specified number of omissions, we're out
        # of opportunities to omit records, so we just return the prompt as-is.)
        if (
            self._prompt_is_too_long(prompt_builder, max_tokens_for_completion)
            and num_omitted_activation_records == omit_n_activation_records
        ):
            original_kwargs["omit_n_activation_records"] = omit_n_activation_records + 1
            return self.make_explanation_prompt(**original_kwargs)
        return prompt_builder.build(self.prompt_format)

    def _add_per_neuron_explanation_prompt(
        self,
        prompt_builder: PromptBuilder,
        activation_records: Sequence[ActivationRecord],
        index: int,
        max_activation: float,
        # When set, this indicates that the prompt should solicit a numbered list of the given
        # number of explanations, rather than a single explanation.
        numbered_list_of_n_explanations: Optional[int],
        explanation: Optional[str],  # None means this is the end of the full prompt.
    ) -> None:
        """
        Append the prompt section for one neuron to `prompt_builder`.

        The section lists the neuron's activations and ends with a request for an explanation.
        For few-shot examples (`explanation` is not None) the explanation is appended as the
        answer; for the final neuron the request is left open for the model to complete.

        `max_activation` is passed to `format_activation_records` and
        `non_zero_activation_proportion` as given. It was previously recomputed from
        `activation_records`, which silently discarded the caller's value.
        """
        formatted_records = format_activation_records(
            activation_records, max_activation, omit_zeros=False
        )
        user_message = f"\n\nNeuron {index + 1}\nActivations:{formatted_records}"
        # We repeat the non-zero activations only if it was requested and if the proportion of
        # non-zero activations isn't too high.
        if (
            self.repeat_non_zero_activations
            and non_zero_activation_proportion(activation_records, max_activation) < 0.2
        ):
            user_message += (
                f"\nSame activations, but with all zeros filtered out:"
                f"{format_activation_records(activation_records, max_activation, omit_zeros=True)}"
            )

        if numbered_list_of_n_explanations is None:
            user_message += f"\nExplanation of neuron {index + 1} behavior:"
            assistant_message = ""
            # For the IF format, we want <|endofprompt|> to come before the explanation prefix.
            if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
                assistant_message += f" {EXPLANATION_PREFIX}"
            else:
                user_message += f" {EXPLANATION_PREFIX}"
            prompt_builder.add_message(Role.USER, user_message)

            if explanation is not None:
                assistant_message += f" {explanation}."
            if assistant_message:
                prompt_builder.add_message(Role.ASSISTANT, assistant_message)
        else:
            # The activations must precede the request. Previously only the request was added in
            # this mode, so the model never saw the activations it was asked to explain.
            if explanation is None:
                # For the final neuron, we solicit a numbered list of explanations.
                user_message += (
                    f"\nHere are {numbered_list_of_n_explanations} possible explanations for "
                    f'neuron {index + 1} behavior, each beginning with "{EXPLANATION_PREFIX}":'
                    f"\n1. {EXPLANATION_PREFIX}"
                )
                prompt_builder.add_message(Role.USER, user_message)
            else:
                # For the few-shot examples, we only present one explanation, but we present it as a
                # numbered list.
                user_message += (
                    f"\nHere is 1 possible explanation for neuron {index + 1} behavior, "
                    f'beginning with "{EXPLANATION_PREFIX}":\n1. {EXPLANATION_PREFIX}'
                )
                prompt_builder.add_message(Role.USER, user_message)
                prompt_builder.add_message(Role.ASSISTANT, f" {explanation}.")

    def postprocess_explanations(
        self, completions: list[str], prompt_kwargs: dict[str, Any]
    ) -> list[Any]:
        """
        Postprocess the explanations returned by the API.

        Without `numbered_list_of_n_explanations` in `prompt_kwargs`, the completions are returned
        unchanged. Otherwise each completion is split into its numbered items, a leading
        EXPLANATION_PREFIX is removed from each item, surrounding whitespace is stripped, and the
        items of all completions are returned as one flat list.
        """
        if prompt_kwargs.get("numbered_list_of_n_explanations") is None:
            return completions

        all_explanations = []
        for completion in completions:
            for explanation in _split_numbered_list(completion):
                if explanation.startswith(EXPLANATION_PREFIX):
                    explanation = explanation[len(EXPLANATION_PREFIX) :]
                all_explanations.append(explanation.strip())
        return all_explanations
345 | 
346 | 
class TokenSpaceRepresentationExplainer(NeuronExplainer):
    """
    Generate explanations of arbitrary lists of tokens which disproportionately activate a
    particular neuron. These lists of tokens can be generated in various ways. As an example, in one
    set of experiments, we compute the average activation for each neuron conditional on each token
    that appears in an internet text corpus. We then sort the tokens by their average activation,
    and show 50 of the top 100 tokens. Other techniques that could be used include taking the top
    tokens in the logit lens or tuned lens representations of a neuron.
    """

    def __init__(
        self,
        model_name: str,
        prompt_format: PromptFormat = PromptFormat.HARMONY_V4,
        context_size: ContextSize = ContextSize.FOUR_K,
        few_shot_example_set: TokenSpaceFewShotExampleSet = TokenSpaceFewShotExampleSet.ORIGINAL,
        use_few_shot: bool = False,
        output_numbered_list: bool = False,
        max_concurrent: Optional[int] = 10,
        cache: bool = False,
    ):
        """
        Args:
            model_name: Name of the model used to generate explanations.
            prompt_format: Format of the prompt sent to the model.
            context_size: Context window the prompt (plus completion) must fit in.
            few_shot_example_set: Few-shot examples to use; only kept when `use_few_shot` is set.
            use_few_shot: Whether few-shot examples are added to the prompt.
            output_numbered_list: Whether the prompt starts a numbered list of explanations.
            max_concurrent: Forwarded to the API client.
            cache: Forwarded to the API client.
        """
        super().__init__(
            model_name=model_name,
            prompt_format=prompt_format,
            context_size=context_size,
            max_concurrent=max_concurrent,
            cache=cache,
        )
        self.use_few_shot = use_few_shot
        self.output_numbered_list = output_numbered_list
        if use_few_shot:
            assert few_shot_example_set is not None
        # The example set is only retained when few-shot prompting is enabled.
        self.few_shot_examples: Optional[TokenSpaceFewShotExampleSet] = (
            few_shot_example_set if use_few_shot else None
        )
        self.prompt_prefix = (
            "We're studying neurons in a neural network. Each neuron looks for some particular "
            "kind of token (which can be a word, or part of a word). Look at the tokens the neuron "
            "activates for (listed below) and summarize in a single sentence what the neuron is "
            "looking for. Don't list examples of words."
        )

    def make_explanation_prompt(self, **kwargs: Any) -> Union[str, list[HarmonyMessage]]:
        """
        Build the prompt for a list of tokens.

        Keyword arguments:
            tokens: The tokens to explain (required).
            max_tokens_for_completion: Completion budget used for the length check (required).

        Raises:
            ValueError: if the prompt plus the completion budget exceeds the context size.
        """
        tokens: list[str] = kwargs.pop("tokens")
        max_tokens_for_completion = kwargs.pop("max_tokens_for_completion")
        assert not kwargs, f"Unexpected kwargs: {kwargs}"
        # Note that this does not preserve the precise tokens, as e.g.
        # f" {token_with_no_leading_space}" may be tokenized as "f{token_with_leading_space}".
        # TODO(dan): Try out other variants, including "\n".join(...) and ",".join(...)
        quoted_tokens = ", ".join(f"'{t}'" for t in tokens)

        prompt_builder = PromptBuilder()
        prompt_builder.add_message(Role.SYSTEM, self.prompt_prefix)
        if self.use_few_shot:
            self._add_few_shot_examples(prompt_builder)
        self._add_neuron_specific_prompt(prompt_builder, quoted_tokens, explanation=None)

        if self._prompt_is_too_long(prompt_builder, max_tokens_for_completion):
            raise ValueError(f"Prompt too long: {prompt_builder.build(self.prompt_format)}")
        return prompt_builder.build(self.prompt_format)

    def _add_few_shot_examples(self, prompt_builder: PromptBuilder) -> None:
        """
        Append the few-shot examples to the prompt. Each example contributes a comma-delimited
        list of its tokens followed by its explanation. The examples come from the configured
        TokenSpaceFewShotExampleSet (see
        neuron_explainer/explanations/token_space_few_shot_examples.py).

        Raises:
            NotImplementedError: if numbered-list output is enabled.
        """
        assert self.few_shot_examples is not None
        examples = self.few_shot_examples.get_examples()
        if self.output_numbered_list:
            raise NotImplementedError("Numbered list output not supported for few-shot examples")
        for example in examples:
            quoted_tokens = ", ".join(f"'{t}'" for t in example.tokens)
            self._add_neuron_specific_prompt(
                prompt_builder,
                quoted_tokens,
                explanation=example.explanation,
            )

    def _add_neuron_specific_prompt(
        self,
        prompt_builder: PromptBuilder,
        stringified_tokens: str,
        explanation: Optional[str],
    ) -> None:
        """
        Append the section for one neuron to the prompt builder: the list of tokens, then either
        the given explanation (for few-shot examples) or the opening of a completion that the
        model is expected to finish with an explanation.
        """
        # Text that opens the explanation, optionally followed by the first list marker.
        lead_in = "This neuron is looking for"
        if self.output_numbered_list:
            lead_in += "\n1."

        user_message = f"\n\n\n\nTokens:\n{stringified_tokens}\n\nExplanation:\n"
        assistant_message = ""
        if self.prompt_format == PromptFormat.INSTRUCTION_FOLLOWING:
            # We want <|endofprompt|> to come before "This neuron is looking for" in the IF format.
            assistant_message = lead_in
        else:
            user_message += lead_in

        if explanation is not None:
            assistant_message += f"{explanation}."

        prompt_builder.add_message(Role.USER, user_message)
        if assistant_message:
            prompt_builder.add_message(Role.ASSISTANT, assistant_message)

    def postprocess_explanations(
        self, completions: list[str], prompt_kwargs: dict[str, Any]
    ) -> list[str]:
        """
        Turn raw completions into explanations.

        In numbered-list mode each completion is split into items, a leading EXPLANATION_PREFIX
        is removed from each item, whitespace is stripped, and all items are returned as one flat
        list. Otherwise each completion is one explanation with its final period removed.
        """
        if not self.output_numbered_list:
            # Each completion is a single explanation.
            return [_remove_final_period(completion) for completion in completions]

        # Each completion may hold several explanations; flatten them into one list.
        flattened = []
        for completion in completions:
            for item in _split_numbered_list(completion):
                if item.startswith(EXPLANATION_PREFIX):
                    item = item[len(EXPLANATION_PREFIX) :]
                flattened.append(item.strip())
        return flattened
473 | 


--------------------------------------------------------------------------------
/neuron-explainer/neuron_explainer/explanations/few_shot_examples.py:
--------------------------------------------------------------------------------
   1 | # Few-shot examples for generating and simulating neuron explanations.
   2 | 
   3 | from __future__ import annotations
   4 | 
   5 | from dataclasses import dataclass
   6 | from enum import Enum
   7 | from typing import List, Optional
   8 | 
   9 | from neuron_explainer.activations.activations import ActivationRecord
  10 | from neuron_explainer.fast_dataclasses import FastDataclass
  11 | 
  12 | 
@dataclass
class Example(FastDataclass):
    """
    A few-shot example: a list of activation records paired with the explanation shown for them.
    """

    activation_records: List[ActivationRecord]
    explanation: str
    first_revealed_activation_indices: List[int]
    """
    For each activation record, the index of the first token for which the activation value in the
    prompt should be an actual number rather than "unknown".

    Examples all start with the activations rendered as "unknown", then transition to revealing
    specific normalized activation values. The goal is to lead the model to predict that activation
    sequences will eventually transition to predicting specific activation values instead of just
    "unknown". This lets us cheat and get predictions of activation values for every token in a
    single round of inference by having the activations in the sequence we're predicting always be
    "unknown" in the prompt: the model will always think that maybe the next token will be a real
    activation.
    """
    token_index_to_score: Optional[int] = None
    """
    If the prompt is used as an example for one-token-at-a-time scoring, this is the index of the
    token to score.
    """
  35 | 
  36 | 
  37 | class FewShotExampleSet(Enum):
  38 |     """Determines which few-shot examples to use when sampling explanations."""
  39 | 
  40 |     ORIGINAL = "original"
  41 |     NEWER = "newer"
  42 |     TEST = "test"
  43 | 
  44 |     @classmethod
  45 |     def from_string(cls, string: str) -> FewShotExampleSet:
  46 |         for example_set in FewShotExampleSet:
  47 |             if example_set.value == string:
  48 |                 return example_set
  49 |         raise ValueError(f"Unrecognized example set: {string}")
  50 | 
  51 |     def get_examples(self) -> list[Example]:
  52 |         """Returns regular examples for use in a few-shot prompt."""
  53 |         if self is FewShotExampleSet.ORIGINAL:
  54 |             return ORIGINAL_EXAMPLES
  55 |         elif self is FewShotExampleSet.NEWER:
  56 |             return NEWER_EXAMPLES
  57 |         elif self is FewShotExampleSet.TEST:
  58 |             return TEST_EXAMPLES
  59 |         else:
  60 |             raise ValueError(f"Unhandled example set: {self}")
  61 | 
  62 |     def get_single_token_prediction_example(self) -> Example:
  63 |         """
  64 |         Returns an example suitable for use in a subprompt for predicting a single token's
  65 |         normalized activation, for use with the "one token at a time" scoring approach.
  66 |         """
  67 |         if self is FewShotExampleSet.NEWER:
  68 |             return NEWER_SINGLE_TOKEN_EXAMPLE
  69 |         elif self is FewShotExampleSet.TEST:
  70 |             return TEST_SINGLE_TOKEN_EXAMPLE
  71 |         else:
  72 |             raise ValueError(f"Unhandled example set: {self}")
  73 | 
  74 | 
# Examples returned by FewShotExampleSet.TEST.get_examples(): a single small example with two
# three-token activation records.
TEST_EXAMPLES = [
    Example(
        activation_records=[
            ActivationRecord(
                tokens=["a", "b", "c"],
                activations=[1.0, 0.0, 0.0],
            ),
            ActivationRecord(
                tokens=["d", "e", "f"],
                activations=[0.0, 1.0, 0.0],
            ),
        ],
        explanation="vowels",
        first_revealed_activation_indices=[0, 1],
    ),
]
  91 | 
# Example returned by FewShotExampleSet.TEST.get_single_token_prediction_example(); the token to
# score is the last of its three tokens (index 2).
TEST_SINGLE_TOKEN_EXAMPLE = Example(
    activation_records=[
        ActivationRecord(
            activations=[0.0, 0.0, 1.0],
            tokens=["g", "h", "i"],
        ),
    ],
    first_revealed_activation_indices=[],
    token_index_to_score=2,
    explanation="test explanation",
)
 103 | 
 104 | 
 105 | ORIGINAL_EXAMPLES = [
 106 |     Example(
 107 |         activation_records=[
 108 |             ActivationRecord(
 109 |                 tokens=[
 110 |                     "t",
 111 |                     "urt",
 112 |                     "ur",
 113 |                     "ro",
 114 |                     " is",
 115 |                     " fab",
 116 |                     "ulously",
 117 |                     " funny",
 118 |                     " and",
 119 |                     " over",
 120 |                     " the",
 121 |                     " top",
 122 |                     " as",
 123 |                     " a",
 124 |                     " '",
 125 |                     "very",
 126 |                     " sneaky",
 127 |                     "'",
 128 |                     " but",
 129 |                     "ler",
 130 |                     " who",
 131 |                     " excel",
 132 |                     "s",
 133 |                     " in",
 134 |                     " the",
 135 |                     " art",
 136 |                     " of",
 137 |                     " impossible",
 138 |                     " disappearing",
 139 |                     "/",
 140 |                     "re",
 141 |                     "app",
 142 |                     "earing",
 143 |                     " acts",
 144 |                 ],
 145 |                 activations=[
 146 |                     -0.71,
 147 |                     -1.85,
 148 |                     -2.39,
 149 |                     -2.58,
 150 |                     -1.34,
 151 |                     -1.92,
 152 |                     -1.69,
 153 |                     -0.84,
 154 |                     -1.25,
 155 |                     -1.75,
 156 |                     -1.42,
 157 |                     -1.47,
 158 |                     -1.51,
 159 |                     -0.8,
 160 |                     -1.89,
 161 |                     -1.56,
 162 |                     -1.63,
 163 |                     0.44,
 164 |                     -1.87,
 165 |                     -2.55,
 166 |                     -2.09,
 167 |                     -1.76,
 168 |                     -1.33,
 169 |                     -0.88,
 170 |                     -1.63,
 171 |                     -2.39,
 172 |                     -2.63,
 173 |                     -0.99,
 174 |                     2.83,
 175 |                     -1.11,
 176 |                     -1.19,
 177 |                     -1.33,
 178 |                     4.24,
 179 |                     -1.51,
 180 |                 ],
 181 |             ),
 182 |             ActivationRecord(
 183 |                 tokens=[
 184 |                     "esc",
 185 |                     "aping",
 186 |                     " the",
 187 |                     " studio",
 188 |                     " ,",
 189 |                     " pic",
 190 |                     "col",
 191 |                     "i",
 192 |                     " is",
 193 |                     " warm",
 194 |                     "ly",
 195 |                     " affecting",
 196 |                     " and",
 197 |                     " so",
 198 |                     " is",
 199 |                     " this",
 200 |                     " ad",
 201 |                     "roit",
 202 |                     "ly",
 203 |                     " minimalist",
 204 |                     " movie",
 205 |                     " .",
 206 |                 ],
 207 |                 activations=[
 208 |                     -0.69,
 209 |                     4.12,
 210 |                     1.83,
 211 |                     -2.28,
 212 |                     -0.28,
 213 |                     -0.79,
 214 |                     -2.2,
 215 |                     -2.03,
 216 |                     -1.77,
 217 |                     -1.71,
 218 |                     -2.44,
 219 |                     1.6,
 220 |                     -1,
 221 |                     -0.38,
 222 |                     -1.93,
 223 |                     -2.09,
 224 |                     -1.63,
 225 |                     -1.94,
 226 |                     -1.82,
 227 |                     -1.64,
 228 |                     -1.32,
 229 |                     -1.92,
 230 |                 ],
 231 |             ),
 232 |         ],
 233 |         first_revealed_activation_indices=[10, 3],
 234 |         explanation="present tense verbs ending in 'ing'",
 235 |     ),
 236 |     Example(
 237 |         activation_records=[
 238 |             ActivationRecord(
 239 |                 tokens=[
 240 |                     "as",
 241 |                     " sac",
 242 |                     "char",
 243 |                     "ine",
 244 |                     " movies",
 245 |                     " go",
 246 |                     " ,",
 247 |                     " this",
 248 |                     " is",
 249 |                     " likely",
 250 |                     " to",
 251 |                     " cause",
 252 |                     " massive",
 253 |                     " cardiac",
 254 |                     " arrest",
 255 |                     " if",
 256 |                     " taken",
 257 |                     " in",
 258 |                     " large",
 259 |                     " doses",
 260 |                     " .",
 261 |                 ],
 262 |                 activations=[
 263 |                     -0.14,
 264 |                     -1.37,
 265 |                     -0.68,
 266 |                     -2.27,
 267 |                     -1.46,
 268 |                     -1.11,
 269 |                     -0.9,
 270 |                     -2.48,
 271 |                     -2.07,
 272 |                     -3.49,
 273 |                     -2.16,
 274 |                     -1.79,
 275 |                     -0.23,
 276 |                     -0.04,
 277 |                     4.46,
 278 |                     -1.02,
 279 |                     -2.26,
 280 |                     -2.95,
 281 |                     -1.49,
 282 |                     -1.46,
 283 |                     -0.6,
 284 |                 ],
 285 |             ),
 286 |             ActivationRecord(
 287 |                 tokens=[
 288 |                     "shot",
 289 |                     " perhaps",
 290 |                     " '",
 291 |                     "art",
 292 |                     "istically",
 293 |                     "'",
 294 |                     " with",
 295 |                     " handheld",
 296 |                     " cameras",
 297 |                     " and",
 298 |                     " apparently",
 299 |                     " no",
 300 |                     " movie",
 301 |                     " lights",
 302 |                     " by",
 303 |                     " jo",
 304 |                     "aquin",
 305 |                     " b",
 306 |                     "aca",
 307 |                     "-",
 308 |                     "as",
 309 |                     "ay",
 310 |                     " ,",
 311 |                     " the",
 312 |                     " low",
 313 |                     "-",
 314 |                     "budget",
 315 |                     " production",
 316 |                     " swings",
 317 |                     " annoy",
 318 |                     "ingly",
 319 |                     " between",
 320 |                     " vert",
 321 |                     "igo",
 322 |                     " and",
 323 |                     " opacity",
 324 |                     " .",
 325 |                 ],
 326 |                 activations=[
 327 |                     -0.09,
 328 |                     -3.53,
 329 |                     -0.72,
 330 |                     -2.36,
 331 |                     -1.05,
 332 |                     -1.12,
 333 |                     -2.49,
 334 |                     -2.14,
 335 |                     -1.98,
 336 |                     -1.59,
 337 |                     -2.62,
 338 |                     -2,
 339 |                     -2.73,
 340 |                     -2.87,
 341 |                     -3.23,
 342 |                     -1.11,
 343 |                     -2.23,
 344 |                     -0.97,
 345 |                     -2.28,
 346 |                     -2.37,
 347 |                     -1.5,
 348 |                     -2.81,
 349 |                     -1.73,
 350 |                     -3.14,
 351 |                     -2.61,
 352 |                     -1.7,
 353 |                     -3.08,
 354 |                     -4,
 355 |                     -0.71,
 356 |                     -2.48,
 357 |                     -1.39,
 358 |                     -1.96,
 359 |                     -1.09,
 360 |                     4.37,
 361 |                     -0.74,
 362 |                     -0.5,
 363 |                     -0.62,
 364 |                 ],
 365 |             ),
 366 |         ],
 367 |         first_revealed_activation_indices=[5, 20],
 368 |         explanation="words related to physical medical conditions",
 369 |     ),
 370 |     Example(
 371 |         activation_records=[
 372 |             ActivationRecord(
 373 |                 tokens=[
 374 |                     "the",
 375 |                     " sense",
 376 |                     " of",
 377 |                     " together",
 378 |                     "ness",
 379 |                     " in",
 380 |                     " our",
 381 |                     " town",
 382 |                     " is",
 383 |                     " strong",
 384 |                     " .",
 385 |                 ],
 386 |                 activations=[
 387 |                     0,
 388 |                     0,
 389 |                     0,
 390 |                     1,
 391 |                     2,
 392 |                     0,
 393 |                     0.23,
 394 |                     0.5,
 395 |                     0,
 396 |                     0,
 397 |                     0,
 398 |                 ],
 399 |             ),
 400 |             ActivationRecord(
 401 |                 tokens=[
 402 |                     "a",
 403 |                     " buoy",
 404 |                     "ant",
 405 |                     " romantic",
 406 |                     " comedy",
 407 |                     " about",
 408 |                     " friendship",
 409 |                     " ,",
 410 |                     " love",
 411 |                     " ,",
 412 |                     " and",
 413 |                     " the",
 414 |                     " truth",
 415 |                     " that",
 416 |                     " we",
 417 |                     "'re",
 418 |                     " all",
 419 |                     " in",
 420 |                     " this",
 421 |                     " together",
 422 |                     " .",
 423 |                 ],
 424 |                 activations=[
 425 |                     -0.15,
 426 |                     -2.33,
 427 |                     -1.4,
 428 |                     -2.17,
 429 |                     -2.53,
 430 |                     -0.85,
 431 |                     0.23,
 432 |                     -1.89,
 433 |                     0.09,
 434 |                     -0.47,
 435 |                     -0.5,
 436 |                     -0.58,
 437 |                     -0.87,
 438 |                     0.22,
 439 |                     0.58,
 440 |                     1.34,
 441 |                     0.98,
 442 |                     2.21,
 443 |                     2.84,
 444 |                     1.7,
 445 |                     -0.89,
 446 |                 ],
 447 |             ),
 448 |         ],
 449 |         first_revealed_activation_indices=[0, 10],
 450 |         explanation="phrases related to community",
 451 |     ),
 452 | ]
 453 | 
 454 | 
 455 | NEWER_EXAMPLES = [  # List of Example entries; each bundles activation records with a written explanation.
 456 |     Example(  # Entry 1: one active record; its largest activation (3.39) sits on the " seminal" token.
 457 |         activation_records=[
 458 |             ActivationRecord(
 459 |                 tokens=[  # 76 token strings; `activations` below holds one value per token.
 460 |                     "The",
 461 |                     " editors",
 462 |                     " of",
 463 |                     " Bi",
 464 |                     "opol",
 465 |                     "ym",
 466 |                     "ers",
 467 |                     " are",
 468 |                     " delighted",
 469 |                     " to",
 470 |                     " present",
 471 |                     " the",
 472 |                     " ",
 473 |                     "201",
 474 |                     "8",
 475 |                     " Murray",
 476 |                     " Goodman",
 477 |                     " Memorial",
 478 |                     " Prize",
 479 |                     " to",
 480 |                     " Professor",
 481 |                     " David",
 482 |                     " N",
 483 |                     ".",
 484 |                     " Ber",
 485 |                     "atan",
 486 |                     " in",
 487 |                     " recognition",
 488 |                     " of",
 489 |                     " his",
 490 |                     " seminal",
 491 |                     " contributions",
 492 |                     " to",
 493 |                     " bi",
 494 |                     "oph",
 495 |                     "ysics",
 496 |                     " and",
 497 |                     " their",
 498 |                     " impact",
 499 |                     " on",
 500 |                     " our",
 501 |                     " understanding",
 502 |                     " of",
 503 |                     " charge",
 504 |                     " transport",
 505 |                     " in",
 506 |                     " biom",
 507 |                     "olecules",
 508 |                     ".\n\n",
 509 |                     "In",
 510 |                     "aug",
 511 |                     "ur",
 512 |                     "ated",
 513 |                     " in",
 514 |                     " ",
 515 |                     "200",
 516 |                     "7",
 517 |                     " in",
 518 |                     " honor",
 519 |                     " of",
 520 |                     " the",
 521 |                     " Bi",
 522 |                     "opol",
 523 |                     "ym",
 524 |                     "ers",
 525 |                     " Found",
 526 |                     "ing",
 527 |                     " Editor",
 528 |                     ",",
 529 |                     " the",
 530 |                     " prize",
 531 |                     " is",
 532 |                     " awarded",
 533 |                     " for",
 534 |                     " outstanding",
 535 |                     " accomplishments",
 536 |                 ],
 537 |                 activations=[  # 76 values, same length as `tokens` above.
 538 |                     0,
 539 |                     0.01,
 540 |                     0.01,
 541 |                     0,
 542 |                     0,
 543 |                     0,
 544 |                     -0.01,
 545 |                     0,
 546 |                     -0.01,
 547 |                     0,
 548 |                     0,
 549 |                     0,
 550 |                     0,
 551 |                     0,
 552 |                     0.04,
 553 |                     0,
 554 |                     0,
 555 |                     0,
 556 |                     0,
 557 |                     0,
 558 |                     0,
 559 |                     0,
 560 |                     0,
 561 |                     0,
 562 |                     0,
 563 |                     0,
 564 |                     0,
 565 |                     0,
 566 |                     0,
 567 |                     0,
 568 |                     3.39,
 569 |                     0.12,
 570 |                     0,
 571 |                     -0.01,
 572 |                     0,
 573 |                     0,
 574 |                     0,
 575 |                     0,
 576 |                     -0,
 577 |                     0,
 578 |                     -0,
 579 |                     0,
 580 |                     0,
 581 |                     -0,
 582 |                     0,
 583 |                     0,
 584 |                     0,
 585 |                     0,
 586 |                     0,
 587 |                     0,
 588 |                     0,
 589 |                     0,
 590 |                     0,
 591 |                     0,
 592 |                     0,
 593 |                     0,
 594 |                     0,
 595 |                     0,
 596 |                     0,
 597 |                     0,
 598 |                     0,
 599 |                     -0,
 600 |                     0,
 601 |                     0,
 602 |                     -0.01,
 603 |                     0,
 604 |                     0.41,
 605 |                     0,
 606 |                     0,
 607 |                     0,
 608 |                     -0.01,
 609 |                     0,
 610 |                     0,
 611 |                     0,
 612 |                     0,
 613 |                     0,
 614 |                 ],
 615 |             ),
 616 |             # The record below is disabled: we sometimes exceed the max context size when it is included :(
 617 |             # ActivationRecord(
 618 |             #     tokens=[
 619 |             #         " We",
 620 |             #         " are",
 621 |             #         " proud",
 622 |             #         " of",
 623 |             #         " our",
 624 |             #         " national",
 625 |             #         " achievements",
 626 |             #         " in",
 627 |             #         " mastering",
 628 |             #         " all",
 629 |             #         " aspects",
 630 |             #         " of",
 631 |             #         " the",
 632 |             #         " fuel",
 633 |             #         " cycle",
 634 |             #         ".",
 635 |             #         " The",
 636 |             #         " current",
 637 |             #         " international",
 638 |             #         " interest",
 639 |             #         " in",
 640 |             #         " closing",
 641 |             #         " the",
 642 |             #         " fuel",
 643 |             #         " cycle",
 644 |             #         " is",
 645 |             #         " a",
 646 |             #         " vind",
 647 |             #         "ication",
 648 |             #         " of",
 649 |             #         " Dr",
 650 |             #         ".",
 651 |             #         " B",
 652 |             #         "hab",
 653 |             #         "ha",
 654 |             #         "’s",
 655 |             #         " pioneering",
 656 |             #         " vision",
 657 |             #         " and",
 658 |             #         " genius",
 659 |             #     ],
 660 |             #     activations=[
 661 |             #         -0,
 662 |             #         -0,
 663 |             #         0,
 664 |             #         -0,
 665 |             #         -0,
 666 |             #         0,
 667 |             #         0,
 668 |             #         0,
 669 |             #         -0,
 670 |             #         0,
 671 |             #         0,
 672 |             #         -0,
 673 |             #         0,
 674 |             #         -0.01,
 675 |             #         0,
 676 |             #         0,
 677 |             #         -0,
 678 |             #         -0,
 679 |             #         0,
 680 |             #         0,
 681 |             #         0,
 682 |             #         -0,
 683 |             #         -0,
 684 |             #         -0.01,
 685 |             #         0,
 686 |             #         0,
 687 |             #         -0,
 688 |             #         0,
 689 |             #         0,
 690 |             #         0,
 691 |             #         0,
 692 |             #         0,
 693 |             #         -0,
 694 |             #         0,
 695 |             #         0,
 696 |             #         0,
 697 |             #         2.15,
 698 |             #         0,
 699 |             #         0,
 700 |             #         0.03,
 701 |             #     ],
 702 |             # ),
 703 |         ],
 704 |         first_revealed_activation_indices=[7],  # , 19],  <- presumably the entry for the disabled record above; restore it if that record is re-enabled (TODO confirm)
 705 |         explanation="language related to something being groundbreaking",
 706 |     ),
 707 |     Example(  # Entry 2: two records; nonzero activations fall on "Variant"/"variant" tokens and pieces such as "_V"+"ariant", "V"+"ariance", " variety".
 708 |         activation_records=[
 709 |             ActivationRecord(
 710 |                 tokens=[  # 92 token strings from a JSON-like snippet; `activations` below holds one value per token.
 711 |                     '{"',
 712 |                     "widget",
 713 |                     "Class",
 714 |                     '":"',
 715 |                     "Variant",
 716 |                     "Matrix",
 717 |                     "Widget",
 718 |                     '","',
 719 |                     "back",
 720 |                     "order",
 721 |                     "Message",
 722 |                     '":"',
 723 |                     "Back",
 724 |                     "ordered",
 725 |                     '","',
 726 |                     "back",
 727 |                     "order",
 728 |                     "Message",
 729 |                     "Single",
 730 |                     "Variant",
 731 |                     '":"',
 732 |                     "This",
 733 |                     " item",
 734 |                     " is",
 735 |                     " back",
 736 |                     "ordered",
 737 |                     '.","',
 738 |                     "ordered",
 739 |                     "Selection",
 740 |                     '":',
 741 |                     "true",
 742 |                     ',"',
 743 |                     "product",
 744 |                     "Variant",
 745 |                     "Id",
 746 |                     '":',
 747 |                     "0",
 748 |                     ',"',
 749 |                     "variant",
 750 |                     "Id",
 751 |                     "Field",
 752 |                     '":"',
 753 |                     "product",
 754 |                     "196",
 755 |                     "39",
 756 |                     "_V",
 757 |                     "ariant",
 758 |                     "Id",
 759 |                     '","',
 760 |                     "back",
 761 |                     "order",
 762 |                     "To",
 763 |                     "Message",
 764 |                     "Single",
 765 |                     "Variant",
 766 |                     '":"',
 767 |                     "This",
 768 |                     " item",
 769 |                     " is",
 770 |                     " back",
 771 |                     "ordered",
 772 |                     " and",
 773 |                     " is",
 774 |                     " expected",
 775 |                     " by",
 776 |                     " {",
 777 |                     "0",
 778 |                     "}.",
 779 |                     '","',
 780 |                     "low",
 781 |                     "Price",
 782 |                     '":',
 783 |                     "999",
 784 |                     "9",
 785 |                     ".",
 786 |                     "0",
 787 |                     ',"',
 788 |                     "attribute",
 789 |                     "Indexes",
 790 |                     '":[',
 791 |                     '],"',
 792 |                     "productId",
 793 |                     '":',
 794 |                     "196",
 795 |                     "39",
 796 |                     ',"',
 797 |                     "price",
 798 |                     "V",
 799 |                     "ariance",
 800 |                     '":',
 801 |                     "true",
 802 |                     ',"',
 803 |                 ],
 804 |                 activations=[  # 92 values, same length as `tokens` above.
 805 |                     0,
 806 |                     0,
 807 |                     0,
 808 |                     0,
 809 |                     4.2,
 810 |                     0,
 811 |                     0,
 812 |                     0,
 813 |                     0,
 814 |                     0,
 815 |                     0,
 816 |                     0,
 817 |                     0,
 818 |                     0,
 819 |                     0,
 820 |                     0,
 821 |                     0,
 822 |                     0,
 823 |                     0,
 824 |                     3.6,
 825 |                     0,
 826 |                     0,
 827 |                     0,
 828 |                     0,
 829 |                     0,
 830 |                     0,
 831 |                     0,
 832 |                     0,
 833 |                     0,
 834 |                     0,
 835 |                     0,
 836 |                     0,
 837 |                     0,
 838 |                     3.7,
 839 |                     0,
 840 |                     0,
 841 |                     0,
 842 |                     0,
 843 |                     4.02,
 844 |                     0,
 845 |                     0,
 846 |                     0,
 847 |                     0,
 848 |                     0,
 849 |                     0,
 850 |                     3.5,
 851 |                     3.7,
 852 |                     0,
 853 |                     0,
 854 |                     0,
 855 |                     0,
 856 |                     0,
 857 |                     0,
 858 |                     0,
 859 |                     2.9,
 860 |                     0,
 861 |                     0,
 862 |                     0,
 863 |                     0,
 864 |                     0,
 865 |                     0,
 866 |                     0,
 867 |                     0,
 868 |                     0,
 869 |                     0,
 870 |                     0,
 871 |                     0,
 872 |                     0,
 873 |                     0,
 874 |                     0,
 875 |                     0,
 876 |                     0,
 877 |                     0,
 878 |                     0,
 879 |                     0,
 880 |                     0,
 881 |                     0,
 882 |                     0,
 883 |                     0,
 884 |                     0,
 885 |                     0,
 886 |                     0,
 887 |                     0,
 888 |                     0,
 889 |                     0,
 890 |                     0,
 891 |                     0,
 892 |                     2.3,
 893 |                     2.24,
 894 |                     0,
 895 |                     0,
 896 |                     0,
 897 |                 ],
 898 |             ),
 899 |             ActivationRecord(
 900 |                 tokens=[  # 36 token strings of ordinary prose; `activations` below holds one value per token.
 901 |                     "A",
 902 |                     " regular",
 903 |                     " look",
 904 |                     " at",
 905 |                     " the",
 906 |                     " ups",
 907 |                     " and",
 908 |                     " downs",
 909 |                     " of",
 910 |                     " variant",
 911 |                     " covers",
 912 |                     " in",
 913 |                     " the",
 914 |                     " comics",
 915 |                     " industry",
 916 |                     "…\n\n",
 917 |                     "Here",
 918 |                     " are",
 919 |                     " the",
 920 |                     " Lego",
 921 |                     " variant",
 922 |                     " sketch",
 923 |                     " covers",
 924 |                     " by",
 925 |                     " Leon",
 926 |                     "el",
 927 |                     " Cast",
 928 |                     "ell",
 929 |                     "ani",
 930 |                     " for",
 931 |                     " a",
 932 |                     " variety",
 933 |                     " of",
 934 |                     " Marvel",
 935 |                     " titles",
 936 |                     ",",
 937 |                 ],
 938 |                 activations=[  # 36 values, same length as `tokens` above.
 939 |                     0,
 940 |                     0,
 941 |                     0,
 942 |                     0,
 943 |                     0,
 944 |                     0,
 945 |                     0,
 946 |                     0,
 947 |                     0,
 948 |                     6.52,
 949 |                     0,
 950 |                     0,
 951 |                     0,
 952 |                     0,
 953 |                     0,
 954 |                     0,
 955 |                     0,
 956 |                     0,
 957 |                     0,
 958 |                     0,
 959 |                     1.62,
 960 |                     0,
 961 |                     0,
 962 |                     0,
 963 |                     0,
 964 |                     0,
 965 |                     0,
 966 |                     0,
 967 |                     0,
 968 |                     0,
 969 |                     0,
 970 |                     3.23,
 971 |                     0,
 972 |                     0,
 973 |                     0,
 974 |                     0,
 975 |                 ],
 976 |             ),
 977 |         ],
 978 |         first_revealed_activation_indices=[2, 8],  # Two entries, matching the two records above.
 979 |         explanation="the word “variant” and other words with the same ”vari” root",  # NOTE(review): the mark opening vari is a closing quote (”), likely a typo; left unchanged because this is a runtime string.
 980 |     ),
 981 | ]
 982 | 
 983 | 
 984 | NEWER_SINGLE_TOKEN_EXAMPLE = Example(  # Single-record Example; `token_index_to_score` below picks out the "ATE" token.
 985 |     activation_records=[
 986 |         ActivationRecord(
 987 |             tokens=[  # 19 real tokens followed by one placeholder token (20 entries in total).
 988 |                 "B",
 989 |                 "10",
 990 |                 " ",
 991 |                 "111",
 992 |                 " MON",
 993 |                 "DAY",
 994 |                 ",",
 995 |                 " F",
 996 |                 "EB",
 997 |                 "RU",
 998 |                 "ARY",
 999 |                 " ",
1000 |                 "11",
1001 |                 ",",
1002 |                 " ",
1003 |                 "201",
1004 |                 "9",
1005 |                 " DON",
1006 |                 "ATE",
1007 |                 "fake higher scoring token",  # Placeholder; see the note on the last activation below.
1008 |             ],
1009 |             activations=[  # 20 values, same length as `tokens` above.
1010 |                 0,
1011 |                 0,
1012 |                 0,
1013 |                 0,
1014 |                 0,
1015 |                 0,
1016 |                 0,
1017 |                 0,
1018 |                 0,
1019 |                 0,
1020 |                 0,
1021 |                 0,
1022 |                 0,
1023 |                 0,
1024 |                 0,
1025 |                 0,
1026 |                 0,
1027 |                 0,
1028 |                 0.37,
1029 |                 # Fake activation paired with the placeholder token. It makes the previous token's
1030 |                 # activation (0.37) normalize to 8, which might help address overconfidence in "10"
1031 |                 # activations for the one-token-at-a-time scoring prompt. Neither this value nor the
1032 |                 # associated token actually appears anywhere in the prompt.
1033 |                 0.45,
1034 |             ],
1035 |         ),
1036 |     ],
1037 |     first_revealed_activation_indices=[],
1038 |     token_index_to_score=18,  # Index 18 is the "ATE" token, whose activation is 0.37.
1039 |     explanation="instances of the token 'ate' as part of another word",
1040 | )
1041 | 


--------------------------------------------------------------------------------