├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── datasets.md ├── mypy.ini ├── neuron_explainer ├── __init__.py ├── activation_server │ ├── README.md │ ├── derived_scalar_computation.py │ ├── dst_helpers.py │ ├── explainer_routes.py │ ├── explanation_datasets.py │ ├── inference_routes.py │ ├── interactive_model.py │ ├── load_neurons.py │ ├── main.py │ ├── neuron_datasets.py │ ├── read_routes.py │ ├── requests_and_responses.py │ └── tdb_conversions.py ├── activations │ ├── activation_records.py │ ├── activations.py │ ├── attention_utils.py │ ├── derived_scalars │ │ ├── README.md │ │ ├── __init__.py │ │ ├── activations_and_metadata.py │ │ ├── attention.py │ │ ├── autoencoder.py │ │ ├── config.py │ │ ├── derived_scalar_store.py │ │ ├── derived_scalar_types.py │ │ ├── direct_effects.py │ │ ├── edge_activation.py │ │ ├── edge_attribution.py │ │ ├── indexing.py │ │ ├── least_common_tokens.py │ │ ├── locations.py │ │ ├── logprobs.py │ │ ├── make_scalar_derivers.py │ │ ├── mlp.py │ │ ├── multi_group.py │ │ ├── multi_pass_scalar_deriver.py │ │ ├── node_write.py │ │ ├── postprocessing.py │ │ ├── raw_activations.py │ │ ├── reconstituted.py │ │ ├── reconstituter_class.py │ │ ├── residual.py │ │ ├── scalar_deriver.py │ │ ├── tests │ │ │ ├── test_attention.py │ │ │ ├── test_derived_scalar_store.py │ │ │ ├── test_derived_scalar_types.py │ │ │ └── utils.py │ │ ├── tokens.py │ │ ├── utils.py │ │ └── write_tensors.py │ ├── hook_graph.py │ └── test_attention_utils.py ├── api_client.py ├── explanations │ ├── __init__.py │ ├── attention_head_scoring.py │ ├── calibrated_simulator.py │ ├── explainer.py │ ├── explanations.py │ ├── few_shot_examples.py │ ├── prompt_builder.py │ ├── scoring.py │ ├── simulator.py │ ├── test_explainer.py │ └── test_simulator.py ├── fast_dataclasses │ ├── __init__.py │ ├── fast_dataclasses.py │ └── test_fast_dataclasses.py ├── file_utils.py ├── models │ ├── README.md │ ├── __init__.py │ ├── autoencoder.py │ ├── autoencoder_context.py │ ├── hooks.py │ ├── inference_engine_type_registry.py │ ├── model_component_registry.py │ ├── model_context.py │ ├── model_registry.py │ └── transformer.py ├── pydantic │ ├── __init__.py │ ├── camel_case_base_model.py │ ├── hashable_base_model.py │ └── immutable.py ├── scripts │ ├── create_hf_test_data.py │ └── download_from_hf.py └── tests │ ├── conftest.py │ ├── test_activation_reconstituter.py │ ├── test_against_data.py │ ├── test_all_dsts.py │ ├── test_emb_dsts.py │ ├── test_hooks.py │ ├── test_interactive_model.py │ ├── test_model_context_get_weight.py │ ├── test_offline_autoencoder_dsts.py │ ├── test_online_autoencoder_dsts.py │ ├── test_postprocessing.py │ ├── test_reconstituted_gradients.py │ ├── test_serialization_of_model_config_from_model_context.py │ ├── test_trace_through_v.py │ └── test_transformer.py ├── neuron_viewer ├── .gitignore ├── .parcelrc ├── .postcssrc ├── .prettierrc ├── README.md ├── package-lock.json ├── package.json ├── prepend_autogen_comments.sh ├── public │ ├── favicon.ico │ ├── logo192.png │ ├── logo512.png │ ├── manifest.json │ └── robots.txt ├── src │ ├── App.css │ ├── App.tsx │ ├── TransformerDebugger │ │ ├── TransformerDebugger.tsx │ │ ├── cards │ │ │ ├── BySequenceTokenDisplay.tsx │ │ │ ├── DisplayOptions.tsx │ │ │ ├── LayerDisplay.tsx │ │ │ ├── LogitsDisplay.tsx │ │ │ ├── SparsityMetricsDisplay.tsx │ │ │ ├── TokenTable.tsx │ │ │ ├── inference_params │ │ │ │ ├── AblateNodeSpecs.tsx │ │ │ │ ├── InferenceParamsDisplay.tsx │ │ │ │ ├── TokenLabel.tsx │ │ │ │ ├── 
TraceUpstreamNodeSpec.tsx │ │ │ │ └── inferenceParams.ts │ │ │ ├── node_table │ │ │ │ ├── NodeTable.tsx │ │ │ │ └── TopTokensDisplay.tsx │ │ │ └── prompt │ │ │ │ ├── MultiTokenInput.tsx │ │ │ │ ├── PromptAndTokensOfInterest.tsx │ │ │ │ └── swap.png │ │ ├── common │ │ │ ├── ExplanatoryTooltip.tsx │ │ │ └── JsonModal.tsx │ │ ├── requests │ │ │ ├── explanationFetcher.ts │ │ │ ├── inferenceDataFetcher.ts │ │ │ └── inferenceResponseUtils.tsx │ │ └── utils │ │ │ ├── explanations.ts │ │ │ ├── nodes.tsx │ │ │ ├── numbers.tsx │ │ │ └── urlParams.ts │ ├── client │ │ ├── core │ │ │ ├── ApiError.ts │ │ │ ├── ApiRequestOptions.ts │ │ │ ├── ApiResult.ts │ │ │ ├── CancelablePromise.ts │ │ │ ├── OpenAPI.ts │ │ │ └── request.ts │ │ ├── index.ts │ │ ├── models │ │ │ ├── AblationSpec.ts │ │ │ ├── ActivationLocationType.ts │ │ │ ├── AttentionHeadRecordResponse.ts │ │ │ ├── AttentionTraceType.ts │ │ │ ├── AttributedScoredExplanation.ts │ │ │ ├── BatchedRequest.ts │ │ │ ├── BatchedResponse.ts │ │ │ ├── BatchedTdbRequest.ts │ │ │ ├── ComponentTypeForAttention.ts │ │ │ ├── ComponentTypeForMlp.ts │ │ │ ├── DerivedAttentionScalarsRequest.ts │ │ │ ├── DerivedAttentionScalarsRequestSpec.ts │ │ │ ├── DerivedAttentionScalarsResponse.ts │ │ │ ├── DerivedAttentionScalarsResponseData.ts │ │ │ ├── DerivedScalarType.ts │ │ │ ├── DerivedScalarsRequest.ts │ │ │ ├── DerivedScalarsRequestSpec.ts │ │ │ ├── DerivedScalarsResponse.ts │ │ │ ├── DerivedScalarsResponseData.ts │ │ │ ├── Dimension.ts │ │ │ ├── ExistingExplanationsRequest.ts │ │ │ ├── ExplanationResult.ts │ │ │ ├── GroupId.ts │ │ │ ├── HTTPValidationError.ts │ │ │ ├── InferenceAndTokenData.ts │ │ │ ├── InferenceRequestSpec.ts │ │ │ ├── InferenceResponse.ts │ │ │ ├── InferenceResponseAndResponseDict.ts │ │ │ ├── InferenceSubRequest.ts │ │ │ ├── LossFnConfig.ts │ │ │ ├── LossFnName.ts │ │ │ ├── MirroredActivationIndex.ts │ │ │ ├── MirroredNodeIndex.ts │ │ │ ├── MirroredTraceConfig.ts │ │ │ ├── ModelInfoResponse.ts │ │ │ ├── MultipleTopKDerivedScalarsRequest.ts │ │ │ ├── MultipleTopKDerivedScalarsRequestSpec.ts │ │ │ ├── MultipleTopKDerivedScalarsResponse.ts │ │ │ ├── MultipleTopKDerivedScalarsResponseData.ts │ │ │ ├── NeuronDatasetMetadata.ts │ │ │ ├── NeuronRecordResponse.ts │ │ │ ├── NodeAblation.ts │ │ │ ├── NodeIdAndDatasets.ts │ │ │ ├── NodeToTrace.ts │ │ │ ├── NodeType.ts │ │ │ ├── PassType.ts │ │ │ ├── PreOrPostAct.ts │ │ │ ├── ProcessingResponseDataType.ts │ │ │ ├── ScoreRequest.ts │ │ │ ├── ScoreResult.ts │ │ │ ├── ScoredTokensRequestSpec.ts │ │ │ ├── ScoredTokensResponseData.ts │ │ │ ├── TdbRequestSpec.ts │ │ │ ├── Tensor0D.ts │ │ │ ├── Tensor1D.ts │ │ │ ├── Tensor2D.ts │ │ │ ├── Tensor3D.ts │ │ │ ├── TensorType.ts │ │ │ ├── TokenAndAttentionScalars.ts │ │ │ ├── TokenAndScalar.ts │ │ │ ├── TokenPairAttributionRequestSpec.ts │ │ │ ├── TokenPairAttributionResponseData.ts │ │ │ ├── TokenScoringType.ts │ │ │ ├── TopTokens.ts │ │ │ ├── TopTokensAttendedTo.ts │ │ │ └── ValidationError.ts │ │ └── services │ │ │ ├── ExplainerService.ts │ │ │ ├── HelloWorldService.ts │ │ │ ├── InferenceService.ts │ │ │ ├── MemoryService.ts │ │ │ └── ReadService.ts │ ├── colors.ts │ ├── commonUiComponents.tsx │ ├── heatmapGrid.tsx │ ├── heatmapGrid2d.tsx │ ├── images.d.ts │ ├── index.css │ ├── index.html │ ├── index.tsx │ ├── modelInteractions.tsx │ ├── navigation.tsx │ ├── nodePage.tsx │ ├── panes │ │ ├── activationsForPrompt.tsx │ │ ├── datasetExamples.tsx │ │ ├── explanation.tsx │ │ ├── fetchAndDisplayPane.tsx │ │ ├── index.ts │ │ ├── logitLens.tsx │ │ └── scoreExplanation.tsx │ ├── 
plots.tsx │ ├── requests │ │ ├── explainerRequests.ts │ │ ├── inferenceRequests.ts │ │ ├── paths.ts │ │ └── readRequests.ts │ ├── tokenHeatmap.tsx │ ├── tokenHeatmap2d.tsx │ ├── tokenRendering.tsx │ ├── types.ts │ └── welcome.tsx ├── tailwind.config.js └── tsconfig.json ├── pyproject.toml ├── pytest.ini ├── setup.py └── terminology.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # Cached user explanations
132 | cached_explanations/
133 |

--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------

1 | [settings]
2 | profile = black
3 |
4 | known_first_party =
5 |     neuron_explainer
6 |     neuron_viewer
7 |
8 | line_length = 100

--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------

1 | repos:
2 |   - repo: https://github.com/astral-sh/ruff-pre-commit
3 |     rev: v0.1.9
4 |     hooks:
5 |       - id: ruff
6 |         args: [--fix, --unsafe-fixes, --fix-only, --exit-non-zero-on-fix]
7 |         files: neuron_explainer
8 |
9 |   - repo: https://github.com/hauntsaninja/black-pre-commit-mirror
10 |     rev: 23.10.0
11 |     hooks:
12 |       - id: black
13 |         args: [--line-length=100, --exclude="", --workers=6]
14 |
15 |   - repo: https://github.com/pycqa/isort
16 |     rev: 5.12.0
17 |     hooks:
18 |       - id: isort
19 |         args: [--line-length=100, --profile=black, --settings-path=.isort.cfg]

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------

1 | MIT License
2 |
3 | Copyright (c) 2024 OpenAI
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

1 | # Transformer Debugger
2 |
3 | Transformer Debugger (TDB) is a tool developed by OpenAI's [Superalignment
4 | team](https://openai.com/blog/introducing-superalignment) with the goal of
5 | supporting investigations into specific behaviors of small language models.
The tool combines
6 | [automated interpretability](https://openai.com/research/language-models-can-explain-neurons-in-language-models)
7 | techniques with [sparse autoencoders](https://transformer-circuits.pub/2023/monosemantic-features).
8 |
9 | TDB enables rapid exploration before needing to write code, with the ability to intervene in the
10 | forward pass and see how it affects a particular behavior. It can be used to answer questions like,
11 | "Why does the model output token A instead of token B for this prompt?" or "Why does attention head
12 | H attend to token T for this prompt?" It does so by identifying specific components (neurons,
13 | attention heads, autoencoder latents) that contribute to the behavior, showing automatically
14 | generated explanations of what causes those components to activate most strongly, and tracing
15 | connections between components to help discover circuits.
16 |
17 | These videos give an overview of TDB and show how it can be used to investigate [indirect object
18 | identification in GPT-2 small](https://arxiv.org/abs/2211.00593):
19 |
20 | - [Introduction](https://www.loom.com/share/721244075f12439496db5d53439d2f84?sid=8445200e-c49e-4028-8b8e-3ea8d361dec0)
21 | - [Neuron viewer pages](https://www.loom.com/share/21b601b8494b40c49b8dc7bfd1dc6829?sid=ee23c00a-9ede-4249-b9d7-c2ba15993556)
22 | - [Example: Investigating name mover heads, part 1](https://www.loom.com/share/3478057cec484a1b85471585fef10811?sid=b9c3be4b-7117-405a-8d31-0f9e541dcfb6)
23 | - [Example: Investigating name mover heads, part 2](https://www.loom.com/share/6bd8c6bde84b42a98f9a26a969d4a3ad?sid=4a09ac29-58a2-433e-b55d-762414d9a7fa)
24 |
25 | ## What's in the release?
26 |
27 | - [Neuron viewer](neuron_viewer/README.md): A React app that hosts TDB as well as pages with information about individual model components (MLP neurons, attention heads and autoencoder latents for both).
28 | - [Activation server](neuron_explainer/activation_server/README.md): A backend server that performs inference on a subject model to provide data for TDB. It also reads and serves data from public Azure buckets.
29 | - [Models](neuron_explainer/models/README.md): A simple inference library for GPT-2 models and their autoencoders, with hooks to grab activations.
30 | - [Collated activation datasets](datasets.md): top-activating dataset examples for MLP neurons, attention heads and autoencoder latents.
31 |
32 | ## Setup
33 |
34 | Follow these steps to install the repo. You'll first need python/pip, as well as node/npm.
35 |
36 | Though optional, we recommend you use a virtual environment or equivalent:
37 |
38 | ```sh
39 | # If you're already in a venv, deactivate it.
40 | deactivate
41 | # Create a new venv.
42 | python -m venv ~/.virtualenvs/transformer-debugger
43 | # Activate the new venv.
44 | source ~/.virtualenvs/transformer-debugger/bin/activate
45 | ```
46 |
47 | Once your environment is set up, run the following:
48 | ```sh
49 | git clone git@github.com:openai/transformer-debugger.git
50 | cd transformer-debugger
51 |
52 | # Install neuron_explainer
53 | pip install -e .
54 |
55 | # Set up the pre-commit hooks.
56 | pre-commit install
57 |
58 | # Install neuron_viewer.
59 | cd neuron_viewer
60 | npm install
61 | cd ..
62 | ```
63 |
64 | To run the TDB app, you'll then need to follow the instructions to set up the [activation server backend](neuron_explainer/activation_server/README.md) and [neuron viewer frontend](neuron_viewer/README.md).
65 |
66 | ## Making changes
67 |
68 | To validate changes:
69 |
70 | - Run `pytest`
71 | - Run `mypy --config=mypy.ini .`
72 | - Run activation server and neuron viewer and confirm that basic functionality like TDB and neuron
73 |   viewer pages is still working
74 |
75 |
76 | ## Links
77 |
78 | - [Terminology](terminology.md)
79 |
80 | ## How to cite
81 |
82 | Please cite as:
83 |
84 | ```
85 | Mossing, et al., “Transformer Debugger”, GitHub, 2024.
86 | ```
87 |
88 | BibTex citation:
89 |
90 | ```
91 | @misc{mossing2024tdb,
92 |   title={Transformer Debugger},
93 |   author={Mossing, Dan and Bills, Steven and Tillman, Henk and Dupré la Tour, Tom and Cammarata, Nick and Gao, Leo and Achiam, Joshua and Yeh, Catherine and Leike, Jan and Wu, Jeff and Saunders, William},
94 |   year={2024},
95 |   publisher={GitHub},
96 |   howpublished={\url{https://github.com/openai/transformer-debugger}},
97 | }
98 | ```
99 |

--------------------------------------------------------------------------------
/datasets.md:
--------------------------------------------------------------------------------

1 | # Collated activation datasets
2 |
3 | This document lists the collated activation datasets that are compatible with the Transformer Debugger. These datasets contain some top-activating examples for each MLP neuron, attention head, and autoencoder latent, as well as the corresponding activations for each token (or token pair) in the example. They provide a way to visualize what each neuron, attention head, or autoencoder latent is selective for (obviously in an incomplete way). These activation datasets are used by the [neuron viewer](neuron_viewer/README.md) to display the top-activating examples for each component, and are also typically used for [automated interpretability](https://openai.com/research/language-models-can-explain-neurons-in-language-models).
4 |
5 | The activation datasets are located on Azure Blob Storage, for example accessible via the [`blobfile`](https://github.com/blobfile/blobfile) library.
6 |
7 | # GPT-2 small
8 |
9 | Collated activation datasets are available for both the MLP neurons and the attention heads. MLP neuron activations are recorded for each token, while attention head activations are recorded for each token pair.
10 |
11 | The datasets are located at the following paths:
12 | > - MLP neurons: `https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2_small_data/collated-activations/{layer_index}/{neuron_index}.json`
13 | > - Attention heads: `https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2_small/attn_write_norm/collated-activations-by-token-pair/{layer_index}/{head_index}.json`
14 |
15 | with the following parameters:
16 | - `layer_index` is in range(12)
17 | - `neuron_index` is in range(3072)
18 | - `head_index` is in range(12)
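As a quick illustration of how these path templates compose, here is a minimal sketch that downloads a single MLP neuron record using only the standard library. The layer and neuron indices are arbitrary picks within the ranges above, and the JSON schema is printed rather than assumed:

```python
import json
import urllib.request

BASE = "https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2_small_data/collated-activations"
layer_index, neuron_index = 5, 100  # illustrative values; any pair within the documented ranges works

with urllib.request.urlopen(f"{BASE}/{layer_index}/{neuron_index}.json") as response:
    record = json.load(response)

# Print the top-level structure rather than assuming a schema.
print(sorted(record.keys()) if isinstance(record, dict) else type(record))
```

Note that the `.pt` paths in the autoencoder sections below are `torch`-serialized rather than JSON, so they need `torch.load` on a file-like object instead of `json.load`.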
19 |
20 |
21 | ## GPT-2 small - MLP autoencoders
22 |
23 | MLP autoencoders were trained either on the MLP neurons (after the activation function), or on the MLP-layer output that is written to the residual stream. See [Autoencoders for GPT-2 small](neuron_explainer/models/README.md#sparse-autoencoder) for more details.
24 |
25 | The datasets are located at the following paths:
26 |
27 | > - MLP latents: `https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2-small/autoencoder_latent/{autoencoder_input}{version}/collated-activations/{layer_index}/{latent_index}.pt`
28 |
29 | with the following parameters:
30 | - `autoencoder_input` is in ["mlp_post_act", "resid_delta_mlp"]
31 | - `version` is in ["", "_v4"]. (The `_v4` versions use slightly different hyperparameters, and should be preferred.)
32 | - `layer_index` is in range(12)
33 | - `latent_index` is in range(32768)
34 |
35 | ## GPT-2 small - Attention autoencoders
36 |
37 | Attention autoencoders were trained on the attention-layer output that is written to the residual stream. See [Autoencoders for GPT-2 small](neuron_explainer/models/README.md#sparse-autoencoder) for more details. The `collated-activations` dataset contains autoencoder latent activations for each token, while the `collated-activations-by-token-pair` dataset contains autoencoder latent *attribution* for each token pair. To compute the attribution given an autoencoder latent `L` and a token pair `(T1, T2)`, we multiply the attention pattern `A(T1, T2)` by the gradient of `L` with respect to the attention pattern: `attribution_L(T1, T2) = A(T1, T2) * ∂L/∂A(T1, T2)`.
38 |
39 | The datasets are located at the following paths:
40 |
41 | > - Attention latents (by token): `https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2-small/autoencoder_latent/resid_delta_attn_v4/collated-activations/{layer_index}/{latent_index}.pt`
42 | > - Attention latents (by token pair): `https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2-small/autoencoder_latent/resid_delta_attn_v4/collated-activations-by-token-pair/{layer_index}/{latent_index}.pt`
43 |
44 | with the following parameters:
45 | - `layer_index` is in range(12)
46 | - `latent_index` is in range(10240)
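The attribution formula above falls out directly from autograd. Here is a minimal sketch with toy tensors; the shapes and the stand-in latent are illustrative placeholders, not the repo's actual pipeline, which computes latents from the attention-layer residual write:

```python
import torch

n_tokens = 4
# Toy lower-triangular attention pattern A(T1, T2).
attn_probs = torch.rand(n_tokens, n_tokens).tril().requires_grad_(True)

# Stand-in for an autoencoder latent L that depends on the attention pattern.
latent = (attn_probs * torch.randn(n_tokens, n_tokens)).sum()

(grad,) = torch.autograd.grad(latent, attn_probs)
attribution = attn_probs.detach() * grad  # attribution_L(T1, T2) = A(T1, T2) * ∂L/∂A(T1, T2)
```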
47 |
48 |
49 |
50 | # GPT-2 xl
51 |
52 | For GPT-2 xl, only the MLP neuron activations are available. The datasets are located at the following paths:
53 | > - MLP neurons: `https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations/{layer_index}/{neuron_index}.json`
54 |
55 | with the following parameters:
56 | - `layer_index` is in range(48)
57 | - `neuron_index` is in range(6400)
58 |

--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------

1 | [mypy]
2 |
3 | ; Not all dependencies have type annotations; ignore this.
4 | ignore_missing_imports=True
5 | namespace_packages=True
6 | explicit_package_bases = True
7 |
8 | ; Be strict about certain rules.
9 | strict_equality=True
10 | warn_unused_configs=True
11 | no_implicit_optional=True
12 | strict_optional=True
13 | warn_redundant_casts=True
14 | warn_unused_ignores=True
15 | check_untyped_defs=True
16 |
17 | [mypy-neuron_explainer.*]
18 | ignore_errors=False
19 | disallow_untyped_defs=True
20 |
21 | [mypy-neuron_explainer.api_client]
22 | ignore_errors=True
23 |
24 | [mypy-neuron_explainer.models.hooks]
25 | ignore_errors=True
26 |
27 | [mypy-neuron_explainer.models.transformer]
28 | ignore_errors=True
29 |
30 | [mypy-neuron_explainer.tests.test_hooks]
31 | ignore_errors=True
32 |
33 | [mypy-neuron_explainer.tests.test_transformer]
34 | ignore_errors=True

--------------------------------------------------------------------------------
/neuron_explainer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_explainer/__init__.py

--------------------------------------------------------------------------------
/neuron_explainer/activation_server/dst_helpers.py:
--------------------------------------------------------------------------------

1 | # Small helper functions for working with derived scalars in the context of activation server
2 | # request handling.
3 |
4 | import math
5 | from typing import Any, Callable, TypeVar
6 |
7 | import torch
8 |
9 | from neuron_explainer.activation_server.requests_and_responses import *
10 | from neuron_explainer.activations.derived_scalars.derived_scalar_store import DerivedScalarStore
11 | from neuron_explainer.activations.derived_scalars.derived_scalar_types import DerivedScalarType
12 | from neuron_explainer.activations.derived_scalars.indexing import (
13 |     DerivedScalarIndex,
14 |     MirroredNodeIndex,
15 | )
16 | from neuron_explainer.models.model_component_registry import Dimension
17 |
18 | T = TypeVar("T")
19 |
20 |
21 | def _float_tensor_to_list(x: torch.Tensor) -> list[float]:
22 |     return [x if math.isfinite(x) else -999 for x in x.tolist()]
23 |
24 |
25 | def _torch_to_tensor_nd(x: torch.Tensor) -> TensorND:
26 |     ndim = x.ndim
27 |     if ndim == 0:
28 |         return Tensor0D(value=x.item())
29 |     elif ndim == 1:
30 |         return Tensor1D(value=_float_tensor_to_list(x))
31 |     elif ndim == 2:
32 |         return Tensor2D(value=[_float_tensor_to_list(row) for row in x])
33 |     elif ndim == 3:
34 |         return Tensor3D(value=[[_float_tensor_to_list(row) for row in matrix] for matrix in x])
35 |     else:
36 |         raise NotImplementedError(f"Unknown ndim: {ndim}")
37 |
38 |
39 | def _get_dims_to_keep(
40 |     dst: DerivedScalarType, keep_dimension_fn: Callable[[Dimension], bool]
41 | ) -> list[Dimension]:
42 |     return [dim for dim in dst.shape_spec_per_token_sequence if keep_dimension_fn(dim)]
43 |
44 |
45 | def _sum_dst(
46 |     ds_store: DerivedScalarStore,
47 |     dst: DerivedScalarType,
48 |     keep_dimension_fn: Callable[[Dimension], bool],
49 |     abs_mode: bool,
50 | ) -> torch.Tensor:
51 |     dims_to_keep = _get_dims_to_keep(dst, keep_dimension_fn)
52 |     store_for_dst = ds_store.filter_dsts([dst])
53 |     activations_and_metadata = next(
54 |         iter(store_for_dst.activations_and_metadata_by_dst_and_pass_type.values())
55 |     )
56 |     ndim_before_sum = len(activations_and_metadata.shape)
57 |     if abs_mode:
58 |         sum_for_dst = store_for_dst.sum_abs(dims_to_keep=dims_to_keep)
59 |     else:
60 |         sum_for_dst = store_for_dst.sum(dims_to_keep=dims_to_keep)
61 |     assert len(sum_for_dst.shape) == len(
62 |         dims_to_keep
63 |     ), f"{sum_for_dst.shape=}, 
{ndim_before_sum=}, {dims_to_keep=}" 64 | return sum_for_dst 65 | 66 | 67 | def get_intermediate_sum_by_dst( 68 | ds_store: DerivedScalarStore, 69 | keep_dimension_fn: Callable[[Dimension], bool], 70 | abs_mode: bool = False, 71 | ) -> dict[DerivedScalarType, TensorND]: 72 | dict_of_torch_tensors = { 73 | dst: _sum_dst(ds_store, dst, keep_dimension_fn, abs_mode=abs_mode) for dst in ds_store.dsts 74 | } 75 | return {dst: _torch_to_tensor_nd(x) for dst, x in dict_of_torch_tensors.items()} 76 | 77 | 78 | def get_ds_index_from_node_index( 79 | node_index: MirroredNodeIndex, 80 | dsts: list[DerivedScalarType], 81 | ) -> DerivedScalarIndex: 82 | """ 83 | Converts from a MirroredNodeIndex (more general, e.g. defined by a NodeType such as MLP neurons) 84 | to a DerivedScalarIndex (more specific, e.g. defined by a DerivedScalarType such as MLP write 85 | norm) conditional on the given derived scalar types, which are assumed to be unique for each 86 | NodeType. 87 | """ 88 | dsts_matching_node_type = [dst for dst in dsts if dst.node_type == node_index.node_type] 89 | assert len(dsts_matching_node_type) == 1, ( 90 | f"Expected exactly one derived scalar type to have node type {node_index.node_type}, " 91 | f"but found {dsts_matching_node_type} in {dsts}" 92 | ) 93 | return DerivedScalarIndex.from_node_index( 94 | node_index=node_index, 95 | dst=dsts_matching_node_type[0], 96 | ) 97 | 98 | 99 | def assert_tensor(tensor: Any) -> torch.Tensor: 100 | # for mypy 101 | assert isinstance(tensor, torch.Tensor) 102 | return tensor 103 | -------------------------------------------------------------------------------- /neuron_explainer/activation_server/explanation_datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from neuron_explainer.activation_server.load_neurons import convert_dataset_path_to_short_name 4 | 5 | # Maps from neuron dataset path to explanation dataset path. 6 | AZURE_EXPLANATION_DATASET_REGISTRY = { 7 | "https://openaipublic.blob.core.windows.net/neuron-explainer/data/collated-activations/": "https://openaipublic.blob.core.windows.net/neuron-explainer/data/explanations/", 8 | "https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2_small_data/collated-activations/": "https://openaipublic.blob.core.windows.net/neuron-explainer/gpt2_small_data/explanations/", 9 | } 10 | 11 | 12 | def get_local_cached_explanation_directory(dataset_path: str) -> str: 13 | root_project_directory = os.path.dirname( 14 | os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 15 | ) 16 | dataset_short_name = convert_dataset_path_to_short_name(dataset_path) 17 | return f"{root_project_directory}/cached_explanations/{dataset_short_name}" 18 | 19 | 20 | async def get_all_explanation_datasets(neuron_dataset: str) -> list[str]: 21 | """ 22 | Get all explanation datasets for a given neuron dataset. Search the public azure bucket and also 23 | the local filesystem cache. Returns a list of paths to the explanation datasets. 24 | Path can be an azure path (beginning with `https://`) or a local path. 25 | """ 26 | datasets = [] 27 | if neuron_dataset in AZURE_EXPLANATION_DATASET_REGISTRY: 28 | datasets.append(AZURE_EXPLANATION_DATASET_REGISTRY[neuron_dataset]) 29 | local_cache_dir = get_local_cached_explanation_directory(neuron_dataset) 30 | # Iterate through folders to get a list of dirs. 
31 | # There will be different local cache directories if the user generates scored explanations for 32 | # the same neuron dataset using different neuron/attention explainer registry entries (i.e. so 33 | # that AttentionExplainAndScoreMethodId or NeuronExplainAndScoreMethodId differ). 34 | if os.path.exists(local_cache_dir) and os.path.isdir(local_cache_dir): 35 | for entry in os.listdir(local_cache_dir): 36 | candidate_path = os.path.join(local_cache_dir, entry) 37 | if os.path.isdir(candidate_path): 38 | datasets.append(candidate_path) 39 | return datasets 40 | -------------------------------------------------------------------------------- /neuron_explainer/activation_server/inference_routes.py: -------------------------------------------------------------------------------- 1 | """Routes / endpoints related to performing inference on the subject model.""" 2 | 3 | from fastapi import FastAPI, HTTPException 4 | 5 | from neuron_explainer.activation_server.interactive_model import InteractiveModel 6 | from neuron_explainer.activation_server.requests_and_responses import ( 7 | BatchedRequest, 8 | BatchedResponse, 9 | BatchedTdbRequest, 10 | DerivedAttentionScalarsRequest, 11 | DerivedAttentionScalarsResponse, 12 | DerivedScalarsRequest, 13 | DerivedScalarsResponse, 14 | ModelInfoResponse, 15 | MultipleTopKDerivedScalarsRequest, 16 | MultipleTopKDerivedScalarsResponse, 17 | ) 18 | 19 | 20 | def define_inference_routes( 21 | app: FastAPI, 22 | model: InteractiveModel | None, 23 | mlp_autoencoder_name: str | None, 24 | attn_autoencoder_name: str | None, 25 | ) -> None: 26 | def assert_model() -> None: 27 | if model is None: 28 | raise HTTPException( 29 | status_code=500, 30 | detail="Inference model not running. Restart the activation server with run_model=True to use inference endpoints.", 31 | ) 32 | 33 | @app.post("/derived_scalars", response_model=DerivedScalarsResponse, tags=["inference"]) 34 | async def derived_scalars(request: DerivedScalarsRequest) -> DerivedScalarsResponse: 35 | assert_model() 36 | assert model is not None # redundant; needed for mypy 37 | return await model.get_derived_scalars(request) 38 | 39 | @app.post( 40 | "/derived_attention_scalars", 41 | response_model=DerivedAttentionScalarsResponse, 42 | tags=["inference"], 43 | ) 44 | async def derived_attention_scalars( 45 | request: DerivedAttentionScalarsRequest, 46 | ) -> DerivedAttentionScalarsResponse: 47 | assert_model() 48 | assert model is not None # redundant; needed for mypy 49 | return await model.get_derived_attention_scalars(request) 50 | 51 | @app.post( 52 | "/multiple_top_k_derived_scalars", 53 | response_model=MultipleTopKDerivedScalarsResponse, 54 | tags=["inference"], 55 | ) 56 | async def multiple_top_k_derived_scalars( 57 | request: MultipleTopKDerivedScalarsRequest, 58 | ) -> MultipleTopKDerivedScalarsResponse: 59 | assert_model() 60 | assert model is not None # redundant; needed for mypy 61 | return await model.get_multiple_top_k_derived_scalars(request) 62 | 63 | @app.post("/batched", response_model=BatchedResponse, tags=["inference"]) 64 | async def batched(request: BatchedRequest) -> BatchedResponse: 65 | assert_model() 66 | assert model is not None # redundant; needed for mypy 67 | return await model.handle_batched_request(request) 68 | 69 | @app.post("/batched_tdb", response_model=BatchedResponse, tags=["inference"]) 70 | async def batched_tdb(request: BatchedTdbRequest) -> BatchedResponse: 71 | assert_model() 72 | assert model is not None # redundant; needed for mypy 73 | return await 
model.handle_batched_tdb_request(request) 74 | 75 | @app.post("/model_info", response_model=ModelInfoResponse, tags=["inference"]) 76 | def model_info() -> ModelInfoResponse: 77 | assert_model() 78 | assert model is not None # redundant; needed for mypy 79 | return model.get_model_info( 80 | mlp_autoencoder_name=mlp_autoencoder_name, attn_autoencoder_name=attn_autoencoder_name 81 | ) 82 | -------------------------------------------------------------------------------- /neuron_explainer/activation_server/load_neurons.py: -------------------------------------------------------------------------------- 1 | from fastapi import HTTPException 2 | 3 | from neuron_explainer.activation_server.neuron_datasets import ( 4 | NEURON_DATASET_METADATA_REGISTRY, 5 | get_neuron_dataset_metadata_by_short_name_and_dst, 6 | ) 7 | from neuron_explainer.activations.activations import NeuronRecord, load_neuron_async 8 | from neuron_explainer.activations.derived_scalars import DerivedScalarType 9 | from neuron_explainer.pydantic import CamelCaseBaseModel, immutable 10 | 11 | 12 | @immutable 13 | class NodeIdAndDatasets(CamelCaseBaseModel): 14 | dst: DerivedScalarType 15 | layer_index: int 16 | activation_index: int 17 | datasets: list[str] 18 | """A list of dataset paths or short names.""" 19 | 20 | 21 | def resolve_neuron_dataset(dataset: str, dst: DerivedScalarType) -> str: 22 | if dataset.startswith("https://"): 23 | return dataset 24 | else: 25 | # It's the short name for a dataset, like "gpt2-small". We have to look up the metadata. 26 | dataset_metadata = get_neuron_dataset_metadata_by_short_name_and_dst(dataset, dst) 27 | return dataset_metadata.neuron_dataset_path 28 | 29 | 30 | def convert_dataset_path_to_short_name(dataset_path: str) -> str: 31 | assert dataset_path.startswith("https://") 32 | short_name = None 33 | for metadata in NEURON_DATASET_METADATA_REGISTRY.values(): 34 | if metadata.neuron_dataset_path == dataset_path: 35 | short_name = metadata.short_name 36 | break 37 | assert ( 38 | short_name is not None 39 | ), f"Could not find short name for {dataset_path}. If you're trying to use a custom dataset, ensure that you have added it to neuron_datasets.py:NEURON_DATASET_METADATA_REGISTRY." 40 | return short_name 41 | 42 | 43 | async def load_neuron_from_datasets( 44 | node_id_and_datasets: NodeIdAndDatasets, 45 | ) -> tuple[str, NeuronRecord]: 46 | """ 47 | Load a neuron record of the specified dst (e.g. DerivedScalarType.MLP_POST_ACT) from a list of 48 | datasets, returning the data from the first dataset that has the neuron. 49 | 50 | Used to allow first trying a dataset that only covers a subset of neurons for a model, 51 | with a fallback to another dataset that covers all neurons. 
52 | """ 53 | dst = node_id_and_datasets.dst 54 | datasets = node_id_and_datasets.datasets 55 | dataset_paths = [resolve_neuron_dataset(dataset, dst) for dataset in datasets] 56 | layer_index = node_id_and_datasets.layer_index 57 | activation_index = node_id_and_datasets.activation_index 58 | for dataset_path in dataset_paths: 59 | try: 60 | return dataset_path, await load_neuron_async( 61 | dataset_path, layer_index, activation_index 62 | ) 63 | except FileNotFoundError: 64 | pass 65 | raise HTTPException( 66 | status_code=404, 67 | detail=f"Could not find {dst} {layer_index}:{activation_index} in {dataset_paths}", 68 | ) 69 | -------------------------------------------------------------------------------- /neuron_explainer/activations/derived_scalars/__init__.py: -------------------------------------------------------------------------------- 1 | from .derived_scalar_types import DerivedScalarType 2 | -------------------------------------------------------------------------------- /neuron_explainer/activations/derived_scalars/edge_activation.py: -------------------------------------------------------------------------------- 1 | """This file defines ScalarDerivers for efficiently computing the direct effect of a single upstream node 2 | on many downstream nodes.""" 3 | 4 | from typing import Callable 5 | 6 | from neuron_explainer.activations.derived_scalars.derived_scalar_types import DerivedScalarType 7 | from neuron_explainer.activations.derived_scalars.node_write import make_node_write_scalar_source 8 | from neuron_explainer.activations.derived_scalars.reconstituter_class import ActivationReconstituter 9 | from neuron_explainer.activations.derived_scalars.scalar_deriver import ( 10 | DstConfig, 11 | ScalarDeriver, 12 | ScalarSource, 13 | ) 14 | from neuron_explainer.models.model_component_registry import ActivationLocationType 15 | from neuron_explainer.models.model_context import StandardModelContext 16 | 17 | 18 | def convert_node_write_scalar_deriver_to_in_edge_activation( 19 | node_write_scalar_source: ScalarSource, 20 | output_dst: DerivedScalarType, 21 | dst_config: DstConfig, 22 | downstream_activation_location_type: ActivationLocationType, 23 | downstream_q_or_k: ActivationLocationType | None, 24 | ) -> ScalarDeriver: 25 | """Converts a scalar deriver for a write vector from some upstream node type to a scalar deriver for 26 | in edge activation for downstream nodes of some type (MLP, autoencoder, or attention head). 
In the 27 | case of attention heads, this is split up by subnode (Q or K).""" 28 | 29 | model_context = dst_config.get_model_context() 30 | autoencoder_context = dst_config.get_autoencoder_context() 31 | assert isinstance(model_context, StandardModelContext) 32 | transformer = model_context.get_or_create_model() 33 | reconstituter = ActivationReconstituter.from_activation_location_type( 34 | transformer=transformer, 35 | autoencoder_context=autoencoder_context, 36 | activation_location_type=downstream_activation_location_type, 37 | q_or_k=downstream_q_or_k, 38 | ) 39 | return reconstituter.make_jvp_scalar_deriver( 40 | write_scalar_source=node_write_scalar_source, 41 | dst_config=dst_config, 42 | output_dst=output_dst, 43 | ) 44 | 45 | 46 | def make_in_edge_activation_scalar_deriver_factory( 47 | activation_location_type: ActivationLocationType, 48 | q_or_k: ActivationLocationType | None = None, 49 | ) -> Callable[[DstConfig], ScalarDeriver]: 50 | """Returns a function that creates a scalar deriver for the edge attribution from arbitrary node 51 | to the specified downstream activation location type / sub activation location type (MLP post act, 52 | autoencoder latent, attention head Q or K). 53 | """ 54 | 55 | sub_node_type_to_output_dst = { 56 | (ActivationLocationType.MLP_POST_ACT, None): DerivedScalarType.MLP_IN_EDGE_ACTIVATION, 57 | ( 58 | ActivationLocationType.ONLINE_AUTOENCODER_LATENT, 59 | None, 60 | ): DerivedScalarType.ONLINE_AUTOENCODER_IN_EDGE_ACTIVATION, 61 | ( 62 | ActivationLocationType.ATTN_QK_PROBS, 63 | ActivationLocationType.ATTN_QUERY, 64 | ): DerivedScalarType.ATTN_QUERY_IN_EDGE_ACTIVATION, 65 | ( 66 | ActivationLocationType.ATTN_QK_PROBS, 67 | ActivationLocationType.ATTN_KEY, 68 | ): DerivedScalarType.ATTN_KEY_IN_EDGE_ACTIVATION, 69 | } 70 | 71 | output_dst = sub_node_type_to_output_dst[(activation_location_type, q_or_k)] 72 | 73 | def make_in_edge_activation_scalar_deriver(dst_config: DstConfig) -> ScalarDeriver: 74 | node_write_scalar_source = make_node_write_scalar_source(dst_config) 75 | return convert_node_write_scalar_deriver_to_in_edge_activation( 76 | node_write_scalar_source=node_write_scalar_source, 77 | output_dst=output_dst, 78 | dst_config=dst_config, 79 | downstream_activation_location_type=activation_location_type, 80 | downstream_q_or_k=q_or_k, 81 | ) 82 | 83 | return make_in_edge_activation_scalar_deriver 84 | -------------------------------------------------------------------------------- /neuron_explainer/activations/derived_scalars/tests/test_attention.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from neuron_explainer.activations.derived_scalars.attention import ( 5 | flatten_lower_triangle, 6 | unflatten_lower_triangle, 7 | unflatten_lower_triangle_and_sum_columns, 8 | ) 9 | 10 | 11 | @pytest.mark.parametrize("extra_dim", [[], [2], [2, 3]]) 12 | @pytest.mark.parametrize("N", [63, 64, 65]) 13 | def test_unflatten_lower_triangle(extra_dim: list[int], N: int) -> None: 14 | """Test that unflatten_lower_triangle is the inverse of flatten_lower_triangle.""" 15 | # Create a random tensor of shape ... 
x M x N 16 | M = 64 17 | original_tensor = torch.rand(extra_dim + [M, N]) 18 | 19 | # Set all elements above the lower triangular to 0 20 | lower_triangular_mask = torch.tril(torch.ones(M, N)).bool() 21 | original_tensor[..., ~lower_triangular_mask] = 0 22 | 23 | # Apply flatten_lower_triangle to the original tensor 24 | flattened = flatten_lower_triangle(original_tensor) 25 | assert flattened.shape == tuple(extra_dim + [lower_triangular_mask.sum()]) 26 | 27 | # Apply unflatten_lower_triangle to the flattened tensor 28 | reconstructed_tensor = unflatten_lower_triangle(flattened, M, N) 29 | assert torch.allclose(original_tensor, reconstructed_tensor) 30 | 31 | 32 | @pytest.mark.parametrize("extra_dim", [[], [2], [2, 3]]) 33 | @pytest.mark.parametrize("N", [63, 64, 65]) 34 | def test_unflatten_lower_triangle_and_sum_columns(extra_dim: list[int], N: int) -> None: 35 | """Test unflatten_lower_triangle_and_sum_columns(...) is equal to unflatten_lower_triangle(...).sum(-1).""" 36 | # Create a random flattened tensor 37 | M = 64 38 | num_elements = int(torch.tril(torch.ones(M, N)).bool().sum().item()) 39 | flattened = torch.rand(extra_dim + [num_elements]) 40 | 41 | # apply unflatten_lower_triangle_and_sum_columns 42 | result = unflatten_lower_triangle_and_sum_columns(flattened, M, N) 43 | 44 | # apply unflatten_lower_triangle and sum(-1) 45 | reconstructed = unflatten_lower_triangle(flattened, M, N) 46 | reference = reconstructed.sum(dim=-1) 47 | assert torch.allclose(result, reference) 48 | -------------------------------------------------------------------------------- /neuron_explainer/activations/derived_scalars/tests/utils.py: -------------------------------------------------------------------------------- 1 | from neuron_explainer.activations.derived_scalars.derived_scalar_types import DerivedScalarType 2 | from neuron_explainer.models.model_component_registry import Dimension 3 | from neuron_explainer.models.model_context import ModelContext, get_default_device 4 | 5 | get_testing_device = get_default_device # keep for compatibility 6 | 7 | 8 | def get_autoencoder_test_path( 9 | dst: DerivedScalarType, 10 | ) -> str: 11 | """Return the path to a test autoencoder.""" 12 | 13 | name = f"{dst.value}.pt" 14 | return f"https://openaipublic.blob.core.windows.net/neuron-explainer/test-data/autoencoder_test_state_dicts/{name}" 15 | 16 | 17 | def get_activation_shape( 18 | dst: DerivedScalarType, 19 | model_context: ModelContext, 20 | n_tokens: int = 10, 21 | n_latents: int | None = None, 22 | ) -> tuple[int, ...]: 23 | """Return the shape of activations""" 24 | activation_shape = [] 25 | assert dst.shape_spec_per_token_sequence[0].is_sequence_token_dimension 26 | if dst in [ 27 | DerivedScalarType.ATTN_WRITE_NORM, 28 | DerivedScalarType.FLATTENED_ATTN_POST_SOFTMAX, 29 | DerivedScalarType.ATTN_ACT_TIMES_GRAD, 30 | DerivedScalarType.ATTN_WRITE_TO_FINAL_RESIDUAL_GRAD, 31 | ]: 32 | # first dimension is token pairs 33 | activation_shape.append(n_tokens * (n_tokens + 1) // 2) 34 | else: 35 | activation_shape.append(n_tokens) 36 | for dimension in dst.shape_spec_per_token_sequence[1:]: 37 | if dimension == Dimension.SINGLETON: 38 | activation_shape.append(1) 39 | elif dimension.is_model_intrinsic: 40 | activation_shape.append(model_context.get_dim_size(dimension)) 41 | elif dimension.is_sequence_token_dimension: 42 | activation_shape.append(n_tokens) 43 | elif dimension.is_parameterized_dimension: 44 | assert n_latents is not None 45 | activation_shape.append(n_latents) 46 | else: 47 | raise 
ValueError(f"Unsupported dimension: {dimension}") 48 | 49 | print(f"{dst}: {activation_shape}") 50 | return tuple(activation_shape) 51 | -------------------------------------------------------------------------------- /neuron_explainer/activations/derived_scalars/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def detach_and_clone(x: torch.Tensor, requires_grad: bool) -> torch.Tensor: 5 | """In some cases, a derived scalar may be computed by applying a function to 6 | some activations, and running .backward() on the output, with some tensors 7 | desired to be backprop'ed through and some not. This function is for that: 8 | it detaches and clones the input tensor such that it doesn't interfere with 9 | other places those activations are used, and so that the gradient information 10 | is cleared. It then sets requires_grad to the desired value based on whether this 11 | activation should be backprop'ed through.""" 12 | return x.detach().clone().requires_grad_(requires_grad) 13 | -------------------------------------------------------------------------------- /neuron_explainer/activations/derived_scalars/write_tensors.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from neuron_explainer.activations.derived_scalars.derived_scalar_types import DerivedScalarType 4 | from neuron_explainer.models.autoencoder_context import ( 5 | AutoencoderContext, 6 | get_autoencoder_output_weight_by_layer_index, 7 | ) 8 | from neuron_explainer.models.model_component_registry import ( 9 | LayerIndex, 10 | NodeType, 11 | WeightLocationType, 12 | ) 13 | from neuron_explainer.models.model_context import ModelContext 14 | 15 | 16 | def get_attn_write_tensor_by_layer_index( 17 | model_context: ModelContext, 18 | layer_indices: list[int] | None, 19 | ) -> dict[LayerIndex, torch.Tensor]: 20 | """Returns a dictionary mapping layer index to the write weight matrix for that layer.""" 21 | if layer_indices is None: 22 | layer_indices = list(range(model_context.n_layers)) 23 | W_out_by_layer_index: dict[LayerIndex, torch.Tensor] = { 24 | layer_index: model_context.get_weight( 25 | location_type=WeightLocationType.ATTN_TO_RESIDUAL, 26 | layer=layer_index, 27 | device=model_context.device, 28 | ) # shape (n_heads, d_head, d_model) 29 | for layer_index in layer_indices 30 | } 31 | return W_out_by_layer_index 32 | 33 | 34 | def get_mlp_write_tensor_by_layer_index( 35 | model_context: ModelContext, layer_indices: list[int] | None 36 | ) -> dict[LayerIndex, torch.Tensor]: 37 | if layer_indices is None: 38 | layer_indices = list(range(model_context.n_layers)) 39 | W_out_location_type = WeightLocationType.MLP_TO_RESIDUAL 40 | W_out_by_layer_index: dict[LayerIndex, torch.Tensor] = { 41 | layer_index: model_context.get_weight( 42 | location_type=W_out_location_type, 43 | layer=layer_index, 44 | device=model_context.device, 45 | ) # shape (d_ff, d_model) 46 | for layer_index in layer_indices 47 | } 48 | return W_out_by_layer_index 49 | 50 | 51 | def _assert_non_none(x: LayerIndex) -> int: 52 | assert x is not None 53 | return x 54 | 55 | 56 | def get_autoencoder_write_tensor_by_layer_index( 57 | autoencoder_context: AutoencoderContext, 58 | model_context: ModelContext, 59 | ) -> dict[LayerIndex, torch.Tensor]: 60 | if autoencoder_context.dst == DerivedScalarType.MLP_POST_ACT: 61 | autoencoder_output_weight_by_layer_index = get_autoencoder_output_weight_by_layer_index( 62 | autoencoder_context 63 | ) 64 | 
W_out_by_layer_index = get_mlp_write_tensor_by_layer_index_with_autoencoder_context( 65 | autoencoder_context, model_context 66 | ) 67 | return { 68 | _assert_non_none(layer_index): torch.einsum( 69 | "an,nd->ad", 70 | autoencoder_output_weight_by_layer_index[layer_index], 71 | W_out_by_layer_index[_assert_non_none(layer_index)], 72 | ) 73 | for layer_index in autoencoder_context.layer_indices 74 | } 75 | else: 76 | assert ( 77 | autoencoder_context.dst.node_type == NodeType.RESIDUAL_STREAM_CHANNEL 78 | ), autoencoder_context.dst 79 | return get_autoencoder_output_weight_by_layer_index(autoencoder_context) 80 | 81 | 82 | def get_mlp_write_tensor_by_layer_index_with_autoencoder_context( 83 | autoencoder_context: AutoencoderContext, 84 | model_context: ModelContext, 85 | ) -> dict[int, torch.Tensor]: 86 | assert all(layer_index is not None for layer_index in autoencoder_context.layer_indices) 87 | layer_indices: list[int] = list(autoencoder_context.layer_indices) # type: ignore 88 | write_tensor_by_layer_index = get_mlp_write_tensor_by_layer_index( 89 | model_context=model_context, layer_indices=layer_indices 90 | ) 91 | return { 92 | _assert_non_none(layer_index): write_tensor_by_layer_index[layer_index] 93 | for layer_index in autoencoder_context.layer_indices 94 | } 95 | -------------------------------------------------------------------------------- /neuron_explainer/activations/test_attention_utils.py: -------------------------------------------------------------------------------- 1 | from neuron_explainer.activations.attention_utils import ( 2 | _inverse_triangular_number, 3 | convert_flattened_index_to_unflattened_index, 4 | get_attended_to_sequence_length_per_sequence_token, 5 | get_max_num_attended_to_sequence_tokens, 6 | ) 7 | 8 | 9 | def _simulate_num_activations( 10 | num_sequence_tokens: int, max_num_attended_to_sequence_tokens: int 11 | ) -> int: 12 | num_activations_per_token = list(range(1, max_num_attended_to_sequence_tokens + 1)) + [ 13 | max_num_attended_to_sequence_tokens 14 | for _ in range(num_sequence_tokens - max_num_attended_to_sequence_tokens) 15 | ] 16 | num_activations = sum(num_activations_per_token) 17 | return num_activations 18 | 19 | 20 | def test_inverse_triangular_number() -> None: 21 | for m in range(5): 22 | n = m * (m + 1) // 2 23 | assert _inverse_triangular_number(n) == m 24 | 25 | 26 | def test_get_max_num_attended_to_sequence_tokens() -> None: 27 | num_sequence_tokens = 100 28 | for max_num_attended_to_sequence_tokens in [50, 100]: 29 | num_activations = _simulate_num_activations( 30 | num_sequence_tokens, max_num_attended_to_sequence_tokens 31 | ) 32 | assert ( 33 | get_max_num_attended_to_sequence_tokens(num_sequence_tokens, num_activations) 34 | == max_num_attended_to_sequence_tokens 35 | ) 36 | 37 | attended_to_sequence_lengths = get_attended_to_sequence_length_per_sequence_token( 38 | num_sequence_tokens, max_num_attended_to_sequence_tokens 39 | ) 40 | assert sum(attended_to_sequence_lengths) == num_activations, ( 41 | sum(attended_to_sequence_lengths), 42 | num_activations, 43 | ) 44 | 45 | 46 | def test_convert_flattened_index_to_unflattened_index() -> None: 47 | possible_max_num_attended_to_sequence_tokens = 9 48 | num_sequence_tokens = 17 49 | assert possible_max_num_attended_to_sequence_tokens < num_sequence_tokens 50 | for max_num_attended_to_sequence_tokens in [ 51 | possible_max_num_attended_to_sequence_tokens, 52 | num_sequence_tokens, 53 | ]: 54 | attended_to_sequence_lengths = get_attended_to_sequence_length_per_sequence_token( 55 | 
num_sequence_tokens, max_num_attended_to_sequence_tokens 56 | ) 57 | num_activations = sum(attended_to_sequence_lengths) 58 | 59 | flat_indices = list(range(num_activations)) 60 | flat_indices_split_by_sequence_token = [] 61 | for attended_to_sequence_length in attended_to_sequence_lengths: 62 | flat_indices_split_by_sequence_token.append(flat_indices[:attended_to_sequence_length]) 63 | flat_indices = flat_indices[attended_to_sequence_length:] 64 | 65 | for flat_index in list(range(num_activations)): 66 | if max_num_attended_to_sequence_tokens == num_sequence_tokens: 67 | unflattened_i, unflattened_j = convert_flattened_index_to_unflattened_index( 68 | flat_index 69 | ) 70 | else: 71 | unflattened_i, unflattened_j = convert_flattened_index_to_unflattened_index( 72 | flat_index, 73 | num_sequence_tokens=num_sequence_tokens, 74 | num_activations=num_activations, 75 | ) 76 | assert unflattened_i < num_sequence_tokens 77 | assert unflattened_j < len(flat_indices_split_by_sequence_token[unflattened_i]) 78 | assert ( 79 | flat_indices_split_by_sequence_token[unflattened_i][unflattened_j] == flat_index 80 | ), ( 81 | flat_indices_split_by_sequence_token[unflattened_i][unflattened_j], 82 | flat_index, 83 | ) 84 | -------------------------------------------------------------------------------- /neuron_explainer/explanations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_explainer/explanations/__init__.py -------------------------------------------------------------------------------- /neuron_explainer/explanations/test_explainer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | 4 | from neuron_explainer.explanations.explainer import TokenActivationPairExplainer 5 | from neuron_explainer.explanations.few_shot_examples import TEST_EXAMPLES, FewShotExampleSet 6 | from neuron_explainer.explanations.prompt_builder import ChatMessage, PromptFormat, Role 7 | 8 | 9 | def setup_module(unused_module: Any) -> None: 10 | # Make sure we have an event loop, since the attempt to create the Semaphore in 11 | # ApiClient will fail without it. 12 | loop = asyncio.new_event_loop() 13 | asyncio.set_event_loop(loop) 14 | 15 | 16 | def test_if_formatting() -> None: 17 | expected_prompt = """We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words. 18 | 19 | The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match. 20 | 21 | Neuron 1 22 | Activations: 23 | 24 | a 10 25 | b 0 26 | c 0 27 | 28 | 29 | d 0 30 | e 10 31 | f 0 32 | 33 | 34 | Explanation of neuron 1 behavior: this neuron activates for vowels. 
35 | 36 | Neuron 2 37 | Activations: 38 | 39 | a 10 40 | b 0 41 | c 0 42 | 43 | 44 | d 0 45 | e 10 46 | f 0 47 | 48 | 49 | Explanation of neuron 2 behavior:<|endofprompt|> this neuron activates for""" 50 | 51 | explainer = TokenActivationPairExplainer( 52 | model_name="gpt-4o", 53 | prompt_format=PromptFormat.INSTRUCTION_FOLLOWING, 54 | few_shot_example_set=FewShotExampleSet.TEST, 55 | ) 56 | prompt = explainer.make_explanation_prompt( 57 | all_activations=TEST_EXAMPLES[0].activation_records, 58 | max_activation=1.0, 59 | max_tokens_for_completion=20, 60 | ) 61 | 62 | assert prompt == expected_prompt 63 | 64 | 65 | def test_chat_format() -> None: 66 | expected_prompt = [ 67 | ChatMessage( 68 | role=Role.SYSTEM, 69 | content="""We're studying neurons in a neural network. Each neuron looks for some particular thing in a short document. Look at the parts of the document the neuron activates for and summarize in a single sentence what the neuron is looking for. Don't list examples of words. 70 | 71 | The activation format is tokenactivation. Activation values range from 0 to 10. A neuron finding what it's looking for is represented by a non-zero activation value. The higher the activation value, the stronger the match.""", 72 | ), 73 | ChatMessage( 74 | role=Role.USER, 75 | content=""" 76 | 77 | Neuron 1 78 | Activations: 79 | 80 | a 10 81 | b 0 82 | c 0 83 | 84 | 85 | d 0 86 | e 10 87 | f 0 88 | 89 | 90 | Explanation of neuron 1 behavior: this neuron activates for""", 91 | ), 92 | ChatMessage( 93 | role=Role.ASSISTANT, 94 | content=" vowels.", 95 | ), 96 | ChatMessage( 97 | role=Role.USER, 98 | content=""" 99 | 100 | Neuron 2 101 | Activations: 102 | 103 | a 10 104 | b 0 105 | c 0 106 | 107 | 108 | d 0 109 | e 10 110 | f 0 111 | 112 | 113 | Explanation of neuron 2 behavior: this neuron activates for""", 114 | ), 115 | ] 116 | 117 | explainer = TokenActivationPairExplainer( 118 | model_name="gpt-4o", 119 | prompt_format=PromptFormat.CHAT_MESSAGES, 120 | few_shot_example_set=FewShotExampleSet.TEST, 121 | ) 122 | prompt = explainer.make_explanation_prompt( 123 | all_activations=TEST_EXAMPLES[0].activation_records, 124 | max_activation=1.0, 125 | max_tokens_for_completion=20, 126 | ) 127 | 128 | assert isinstance(prompt, list) 129 | assert isinstance(prompt[0], dict) # Really a ChatMessage 130 | for actual_message, expected_message in zip(prompt, expected_prompt): 131 | assert actual_message["role"] == expected_message["role"] 132 | assert actual_message["content"] == expected_message["content"] 133 | assert prompt == expected_prompt 134 | -------------------------------------------------------------------------------- /neuron_explainer/fast_dataclasses/__init__.py: -------------------------------------------------------------------------------- 1 | from .fast_dataclasses import FastDataclass, dumps, loads, register_dataclass 2 | 3 | __all__ = ["FastDataclass", "dumps", "loads", "register_dataclass"] 4 | -------------------------------------------------------------------------------- /neuron_explainer/file_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import urllib.request 4 | from io import IOBase 5 | 6 | import aiohttp 7 | 8 | 9 | def file_exists(filepath: str) -> bool: 10 | if filepath.startswith("https://"): 11 | try: 12 | urllib.request.urlopen(filepath) 13 | return True 14 | except urllib.error.HTTPError: 15 | return False 16 | else: 17 | # It's a local file. 
18 | return os.path.exists(filepath) 19 | 20 | 21 | class CustomFileHandler: 22 | def __init__(self, filepath: str, mode: str) -> None: 23 | self.filepath = filepath 24 | self.mode = mode 25 | self.file = None 26 | 27 | def __enter__(self) -> IOBase: 28 | assert not self.filepath.startswith("az://"), "Azure blob storage is not supported" 29 | if self.filepath.startswith("https://"): 30 | assert self.mode in ["r", "rb"], "Only read mode is supported for remote files" 31 | remote_data = urllib.request.urlopen(self.filepath) 32 | if "b" in self.mode: 33 | # Read the content into a BytesIO object for binary mode 34 | self.file = io.BytesIO(remote_data.read()) 35 | else: 36 | # Decode the content and use StringIO for text mode (less common for torch.load) 37 | self.file = io.StringIO(remote_data.read().decode()) 38 | else: 39 | # Create the subdirectories if they don't exist 40 | directory = os.path.dirname(self.filepath) 41 | os.makedirs(directory, exist_ok=True) 42 | self.file = open(self.filepath, self.mode) 43 | if "b" in self.mode: 44 | # Ensure the file is seekable; if not, read into a BytesIO object 45 | try: 46 | self.file.seek(0) 47 | except io.UnsupportedOperation: 48 | self.file.close() 49 | with open(self.filepath, self.mode) as f: 50 | self.file = io.BytesIO(f.read()) 51 | return self.file 52 | 53 | def __exit__(self, exc_type, exc_val, exc_tb) -> bool: 54 | # Close the file if it's open 55 | if self.file is not None: 56 | self.file.close() 57 | # Propagate exceptions 58 | return False 59 | 60 | 61 | async def read_single_async(filepath: str) -> bytes: 62 | if filepath.startswith("https://"): 63 | async with aiohttp.ClientSession() as session: 64 | async with session.get(filepath) as response: 65 | return await response.read() 66 | else: 67 | with open(filepath, "rb") as f: 68 | return f.read() 69 | 70 | 71 | def copy_to_local_cache(src: str, dst: str) -> None: 72 | if not os.path.exists(os.path.dirname(dst)): 73 | os.makedirs(os.path.dirname(dst), exist_ok=True) 74 | if src.startswith("https://"): 75 | with urllib.request.urlopen(src) as response, open(dst, "wb") as out_file: 76 | data = response.read() # Consider chunked reading for large files 77 | out_file.write(data) 78 | else: 79 | with open(src, "rb") as in_file, open(dst, "wb") as out_file: 80 | data = in_file.read() 81 | out_file.write(data) 82 | -------------------------------------------------------------------------------- /neuron_explainer/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .autoencoder import Autoencoder 2 | from .hooks import Hooks, TransformerHooks 3 | from .transformer import Transformer, TransformerConfig 4 | -------------------------------------------------------------------------------- /neuron_explainer/pydantic/__init__.py: -------------------------------------------------------------------------------- 1 | from .camel_case_base_model import CamelCaseBaseModel 2 | from .hashable_base_model import HashableBaseModel 3 | from .immutable import immutable 4 | 5 | __all__ = ["CamelCaseBaseModel", "HashableBaseModel", "immutable"] 6 | -------------------------------------------------------------------------------- /neuron_explainer/pydantic/camel_case_base_model.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | def to_camel(string: str) -> str: 5 | return "".join(word.capitalize() if i > 0 else word for i, word in enumerate(string.split("_"))) 6 | 7 | 8 | class 
CamelCaseBaseModel(BaseModel): 9 | """ 10 | Base model that will automatically generate camelCase aliases for fields. Python code can use 11 | either snake_case or camelCase names. When Typescript code is generated, it will only use the 12 | camelCase names. 13 | """ 14 | 15 | class Config: 16 | alias_generator = to_camel 17 | allow_population_by_field_name = True 18 | -------------------------------------------------------------------------------- /neuron_explainer/pydantic/hashable_base_model.py: -------------------------------------------------------------------------------- 1 | from .camel_case_base_model import CamelCaseBaseModel 2 | 3 | 4 | class HashableBaseModel(CamelCaseBaseModel): 5 | def __hash__(self) -> int: 6 | values = tuple(getattr(self, name) for name in self.__annotations__.keys()) 7 | # Convert lists to tuples. 8 | values = tuple(value if not isinstance(value, list) else tuple(value) for value in values) 9 | return hash(values) 10 | -------------------------------------------------------------------------------- /neuron_explainer/pydantic/immutable.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar 2 | 3 | from pydantic import BaseConfig, BaseModel 4 | 5 | T = TypeVar("T", bound=BaseModel) 6 | 7 | 8 | def immutable(cls: type[T]) -> type[T]: 9 | """ 10 | Makes a Pydantic model immutable. 11 | 12 | Annotate a Pydantic class with `@immutable` to prevent the values of its fields from being 13 | changed after an instance is constructed. (This only guarantees shallow immutability of course: 14 | fields may have their internal state change.) 15 | """ 16 | 17 | class Config(BaseConfig): 18 | frozen: bool = True 19 | 20 | cls.Config = Config 21 | return cls 22 | -------------------------------------------------------------------------------- /neuron_explainer/scripts/create_hf_test_data.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import click 4 | import torch 5 | from transformers import GPT2Tokenizer 6 | 7 | from neuron_explainer.file_utils import copy_to_local_cache 8 | from neuron_explainer.scripts.download_from_hf import get_hf_model 9 | 10 | # ============================== 11 | # Reference models for testing 12 | # ============================== 13 | 14 | ALL_MODELS = [ 15 | "gpt2/small", 16 | "gpt2/medium", 17 | "gpt2/large", 18 | "gpt2/xl", 19 | ] 20 | 21 | # test prompts to sample at temperature zero from 22 | test_prompts = [ 23 | "this is a test", 24 | "I'm sorry Dave, I'm afraid", 25 | "We're not strangers to love. 
You know the rules, and", 26 | "in the beginning", 27 | "buy now!", 28 | "Why did the chicken cross the road?", 29 | ] 30 | 31 | 32 | # ======================================================= 33 | # Get the hf models and generate test data from those 34 | # ======================================================= 35 | 36 | 37 | def create_hf_test_data( 38 | models: list[str], 39 | test_prompts: list[str], 40 | num_examples: int, 41 | seq_len: int, 42 | sample_len: int, 43 | last_n: int, 44 | ) -> dict: 45 | # for GPT2 models, seq len maxes out at 1024 46 | seq_len = min(seq_len, 1024) 47 | 48 | tokenizer = GPT2Tokenizer.from_pretrained("gpt2") 49 | prompts = [tokenizer.encode(p, return_tensors="pt") for p in test_prompts] 50 | 51 | test_data = {} 52 | for model_name in models: 53 | print(f"Creating test data for {model_name}") 54 | model_data: dict[str, Any] = {} 55 | 56 | # prepare model 57 | model = get_hf_model(model_name) 58 | model.cuda() 59 | print(f"...loaded {model_name}...") 60 | 61 | # make test inputs and get logits 62 | with torch.no_grad(): 63 | X = torch.randint(0, 50257, (num_examples, seq_len)).cuda() 64 | Y = model(X) 65 | X = X.cpu() 66 | logits = Y.logits.cpu() 67 | logits_at_inputs = logits.gather(-1, X.unsqueeze(-1)).squeeze(-1) 68 | logits_slice = logits[:, -last_n:].clone() 69 | model_data["inputs"] = X 70 | model_data["logits_at_inputs"] = logits_at_inputs 71 | model_data["logits_slice"] = logits_slice 72 | model_data["slice_last_n"] = last_n 73 | 74 | # generate temperature-zero samples 75 | samples = [] 76 | for op, p in zip(test_prompts, prompts): 77 | p = p.cuda() 78 | tok1 = model.generate(p, max_length=sample_len + len(p[0]), temperature=0) 79 | tok2 = model.generate(p, max_length=sample_len + len(p[0]), temperature=0) 80 | 81 | str1 = tokenizer.decode(tok1[0]) 82 | str2 = tokenizer.decode(tok2[0]) 83 | assert ( 84 | str1 == str2 85 | ), "HuggingFace temperature-zero generate was unexpectedly nondeterministic" 86 | 87 | # get tokens out as a list, then chop off the ones from the prompt 88 | tok1 = tok1[0].tolist() 89 | tok1 = tok1[len(p[0]) :] 90 | 91 | samples.append({"prompt": op, "completion": tokenizer.decode(tok1), "tokens": tok1}) 92 | 93 | model_data["samples"] = samples 94 | test_data[model_name] = model_data 95 | 96 | # free up GPU memory 97 | model.cpu() 98 | del model 99 | 100 | return test_data 101 | 102 | 103 | @click.command() 104 | @click.option( 105 | "-dir", 106 | "--savedir", 107 | type=str, 108 | default="https://openaipublic.blob.core.windows.net/neuron-explainer/data/test-reference-data", 109 | ) 110 | @click.option("-n", "--num_examples", type=int, default=4) 111 | @click.option("-m", "--sample_len", type=int, default=50) 112 | @click.option("-s", "--seq_len", type=int, default=1024) 113 | @click.option("-l", "--last_n", type=int, default=100) 114 | def make_and_save_test_data( 115 | savedir: str, num_examples: int, seq_len: int, sample_len: int, last_n: int 116 | ) -> None: 117 | test_data = create_hf_test_data( 118 | models=ALL_MODELS, 119 | test_prompts=test_prompts, 120 | num_examples=num_examples, 121 | seq_len=seq_len, 122 | sample_len=sample_len, 123 | last_n=last_n, 124 | ) 125 | torch.save(test_data, "test_data.pt") 126 | copy_to_local_cache(src="test_data.pt", dst="/".join([savedir, "test_data.pt"])) 127 | 128 | 129 | if __name__ == "__main__": 130 | make_and_save_test_data() 131 | -------------------------------------------------------------------------------- /neuron_explainer/scripts/download_from_hf.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os.path as osp 3 | 4 | import click 5 | import torch 6 | from transformers import GPT2LMHeadModel 7 | 8 | from neuron_explainer.file_utils import CustomFileHandler 9 | from neuron_explainer.models.transformer import TransformerConfig 10 | 11 | EXCLUDES = [".attn.bias", ".attn.masked_bias"] 12 | 13 | ALL_MODELS = [ 14 | "gpt2/small", 15 | "gpt2/medium", 16 | "gpt2/large", 17 | "gpt2/xl", 18 | ] 19 | 20 | 21 | def get_hf_model(model_name: str) -> GPT2LMHeadModel: 22 | _, model_size = model_name.split("/") 23 | hf_name = "gpt2" if model_size == "small" else f"gpt2-{model_size}" 24 | model = GPT2LMHeadModel.from_pretrained(hf_name) 25 | return model 26 | 27 | 28 | # ==================================== 29 | # Conversion from HuggingFace format 30 | # ==================================== 31 | def convert(hf_sd: dict) -> dict: 32 | """convert state_dict from HuggingFace format to our format""" 33 | n_layers = max([int(k.split(".")[2]) for k in hf_sd.keys() if ".h." in k]) + 1 34 | 35 | hf_to_ours = dict() 36 | hf_to_ours["wte"] = "tok_embed" 37 | hf_to_ours["wpe"] = "pos_embed" 38 | hf_to_ours["ln_f"] = "final_ln" 39 | hf_to_ours["lm_head"] = "unembed" 40 | for i in range(n_layers): 41 | hf_to_ours[f"h.{i}"] = f"xf_layers.{i}" 42 | hf_to_ours["attn.c_attn"] = "attn.linear_qkv" 43 | hf_to_ours["attn.c_proj"] = "attn.out_proj" 44 | hf_to_ours["mlp.c_fc"] = "mlp.in_layer" 45 | hf_to_ours["mlp.c_proj"] = "mlp.out_layer" 46 | 47 | sd = dict() 48 | for k, v in hf_sd.items(): 49 | if any(x in k for x in EXCLUDES): 50 | continue 51 | if "weight" in k and ("attn" in k or "mlp" in k): 52 | v = v.T 53 | k = k.replace("transformer.", "") 54 | for hf_part, part in hf_to_ours.items(): 55 | k = k.replace(hf_part, part) 56 | if "attn.linear_qkv." 
in k: 57 | qproj, kproj, vproj = v.chunk(3, dim=0) 58 | sd[k.replace(".linear_qkv.", ".q_proj.")] = qproj 59 | sd[k.replace(".linear_qkv.", ".k_proj.")] = kproj 60 | sd[k.replace(".linear_qkv.", ".v_proj.")] = vproj 61 | else: 62 | sd[k] = v 63 | 64 | return sd 65 | 66 | 67 | def download(model_name: str, save_dir: str) -> None: 68 | assert model_name in ALL_MODELS, f"Must use valid model size, not {model_name=}" 69 | print(f"Downloading and converting model {model_name} to {save_dir}...") 70 | 71 | print(f"Getting HuggingFace model {model_name}...") 72 | model = get_hf_model(model_name) 73 | 74 | hf_config = model.config 75 | base_config = dict( 76 | enc="gpt2", 77 | ctx_window=1024, 78 | # attn 79 | m_attn=1, 80 | # mlp 81 | m_mlp=4, 82 | ) 83 | cfg = TransformerConfig( 84 | **base_config, # type: ignore 85 | d_model=hf_config.n_embd, 86 | n_layers=hf_config.n_layer, 87 | n_heads=hf_config.n_head, 88 | ) 89 | 90 | print("Converting state_dict...") 91 | sd = convert(model.state_dict()) 92 | 93 | print(f"Saving model to {save_dir}...") 94 | # save to file with config 95 | pieces_path = osp.join(save_dir, model_name, "model_pieces") 96 | for k, v in sd.items(): 97 | with CustomFileHandler(osp.join(pieces_path, f"{k}.pt"), "wb") as f: 98 | torch.save(v, f) 99 | 100 | fname_cfg = osp.join(save_dir, model_name, "config.json") 101 | with CustomFileHandler(fname_cfg, "w") as f: 102 | f.write(json.dumps(cfg.__dict__)) 103 | 104 | 105 | @click.command() 106 | @click.argument("save_dir", type=click.Path(exists=False, file_okay=False)) 107 | def download_all(save_dir: str) -> None: 108 | for model_size in ALL_MODELS: 109 | download(model_size, save_dir) 110 | 111 | 112 | if __name__ == "__main__": 113 | download_all() 114 | -------------------------------------------------------------------------------- /neuron_explainer/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # This file defines fixtures for model tests, with a focus on expensive objects that are used across 2 | # multiple test files. Fixtures are created once per session (i.e. `pytest` invocation), and are 3 | # available to and reused across all test cases in the session. Fixtures are evaluated lazily. 4 | # The filename uses the pytest convention. 
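#
# For example, a test can consume one of these fixtures just by declaring it as a
# parameter; pytest injects the session-scoped instance. (Illustrative sketch, not
# a test from this repo:)
#
#     def test_example(standard_model_context: StandardModelContext) -> None:
#         assert standard_model_context.n_layers > 0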
5 | 6 | import pytest 7 | 8 | from neuron_explainer.activations.derived_scalars import DerivedScalarType 9 | from neuron_explainer.activations.derived_scalars.tests.utils import get_autoencoder_test_path 10 | from neuron_explainer.models.autoencoder_context import AutoencoderConfig, AutoencoderContext 11 | from neuron_explainer.models.model_context import StandardModelContext, get_default_device 12 | 13 | AUTOENCODER_TEST_DST = DerivedScalarType.MLP_POST_ACT 14 | AUTOENCODER_TEST_PATH = get_autoencoder_test_path(AUTOENCODER_TEST_DST) 15 | 16 | 17 | @pytest.fixture(scope="session") 18 | def standard_model_context() -> StandardModelContext: 19 | standard_model_context = StandardModelContext.from_model_type( 20 | "gpt2-small", device=get_default_device() 21 | ) 22 | assert isinstance(standard_model_context, StandardModelContext) 23 | return standard_model_context 24 | 25 | 26 | @pytest.fixture(scope="session") 27 | def standard_autoencoder_context( 28 | standard_model_context: StandardModelContext, 29 | ) -> AutoencoderContext: 30 | autoencoder_config = AutoencoderConfig( 31 | dst=AUTOENCODER_TEST_DST, 32 | autoencoder_path_by_layer_index={ 33 | layer_index: AUTOENCODER_TEST_PATH 34 | for layer_index in range(standard_model_context.n_layers) 35 | }, 36 | ) 37 | return AutoencoderContext( 38 | autoencoder_config=autoencoder_config, 39 | device=standard_model_context.device, 40 | ) 41 | -------------------------------------------------------------------------------- /neuron_explainer/tests/test_model_context_get_weight.py: -------------------------------------------------------------------------------- 1 | import time 2 | from typing import Any, Callable 3 | 4 | import torch 5 | 6 | from neuron_explainer.models.inference_engine_type_registry import InferenceEngineType 7 | from neuron_explainer.models.model_component_registry import WeightLocationType 8 | from neuron_explainer.models.model_context import ModelContext 9 | 10 | 11 | def assert_all_eq( 12 | lst: list[Any], 13 | eq_fn: Callable[[Any, Any], bool] = lambda x, y: x == y, 14 | weight_location_type: WeightLocationType | None = None, 15 | ) -> Any: 16 | for i in range(1, len(lst)): 17 | assert eq_fn(lst[i], lst[0]), f"{lst[i]} != {lst[0]}; {weight_location_type=}; {i=}" 18 | return lst[0] 19 | 20 | 21 | def test_model_context_weights() -> None: 22 | for model_name in ["gpt2-small"]: 23 | contexts = [] 24 | standard_model_context = ModelContext.from_model_type( 25 | model_name, 26 | inference_engine_type=InferenceEngineType.STANDARD, 27 | device="cpu", 28 | ) 29 | contexts.append(("standard", standard_model_context)) 30 | 31 | standard_model_context_with_model = ModelContext.from_model_type( 32 | model_name, 33 | inference_engine_type=InferenceEngineType.STANDARD, 34 | device="cpu", 35 | ) 36 | standard_model_context_with_model.get_or_create_model(simplify=False) # type: ignore 37 | contexts.append(("standard_cached", standard_model_context_with_model)) 38 | 39 | for weight_location_type in WeightLocationType: 40 | if not weight_location_type.has_no_layers: 41 | # just test layer 0 for now 42 | layer_index: int | None = 0 43 | else: 44 | layer_index = None 45 | 46 | weights = [] 47 | for ctx_name, ctx in contexts: 48 | try: 49 | t = time.time() 50 | # Convert all weights to float32, since different contexts may use different 51 | # dtypes by default. torch.allclose requires dtypes to match. 
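# (Upcasting from float16/bfloat16 to float32 is exact, so weights that agree at
# their stored precision still compare equal under the tolerances used below.)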
52 | weight = ctx.get_weight(weight_location_type, layer_index).to(torch.float32) 53 | print(f"{ctx_name} {weight.shape=} loaded in {time.time() - t:.2f}s") 54 | weights.append(weight) 55 | except NotImplementedError: 56 | print(f"{weight_location_type} not implemented in {ctx_name} context") 57 | 58 | if len(weights): 59 | assert_all_eq( 60 | [weight.shape for weight in weights], lambda x, y: x == y, weight_location_type 61 | ) 62 | assert_all_eq( 63 | list(weights), 64 | lambda x, y: torch.allclose(x, y, atol=1e-5, rtol=1e-3), 65 | weight_location_type, 66 | ) 67 | else: 68 | print(f"no weights found for {weight_location_type}") 69 | -------------------------------------------------------------------------------- /neuron_explainer/tests/test_serialization_of_model_config_from_model_context.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from neuron_explainer.models.model_context import StandardModelContext 4 | 5 | 6 | def test_standard_model_context(standard_model_context: StandardModelContext) -> None: 7 | json.dumps(standard_model_context.get_model_config_as_dict()) 8 | -------------------------------------------------------------------------------- /neuron_explainer/tests/test_trace_through_v.py: -------------------------------------------------------------------------------- 1 | from neuron_explainer.activation_server.derived_scalar_computation import ( 2 | get_derived_scalars_for_prompt, 3 | maybe_construct_loss_fn_for_backward_pass, 4 | ) 5 | from neuron_explainer.activation_server.requests_and_responses import LossFnConfig, LossFnName 6 | from neuron_explainer.activations.derived_scalars import DerivedScalarType 7 | from neuron_explainer.activations.derived_scalars.derived_scalar_store import AttentionTraceType 8 | from neuron_explainer.activations.derived_scalars.indexing import ( 9 | NodeIndex, 10 | PreOrPostAct, 11 | TraceConfig, 12 | ) 13 | from neuron_explainer.activations.derived_scalars.scalar_deriver import DstConfig 14 | from neuron_explainer.models.autoencoder_context import AutoencoderContext 15 | from neuron_explainer.models.model_component_registry import NodeType, PassType 16 | from neuron_explainer.models.model_context import StandardModelContext 17 | 18 | DETACH_LAYER_NORM_SCALE_FOR_TEST = ( 19 | False # this sets whether to detach layer norm scale when computing these DSTs. 
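# Detaching treats the layer norm scale (the per-token normalization factor) as a
# constant during the backward pass, so gradients flow through the normalized
# activations but not through the normalization statistics themselves.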
20 | ) 21 | 22 | 23 | def test_trace_through_v( 24 | standard_model_context: StandardModelContext, 25 | standard_autoencoder_context: AutoencoderContext, 26 | ) -> None: 27 | prompt = "This is a test" 28 | loss_fn_for_backward_pass = maybe_construct_loss_fn_for_backward_pass( 29 | model_context=standard_model_context, 30 | config=LossFnConfig( 31 | name=LossFnName.LOGIT_DIFF, 32 | target_tokens=["."], 33 | distractor_tokens=["!"], 34 | ), 35 | ) 36 | 37 | for downstream_trace_config in [ 38 | None, 39 | TraceConfig( 40 | node_index=NodeIndex( 41 | node_type=NodeType.ATTENTION_HEAD, 42 | layer_index=5, 43 | pass_type=PassType.FORWARD, 44 | tensor_indices=(0, 0, 0), 45 | ), 46 | pre_or_post_act=PreOrPostAct.POST, 47 | detach_layer_norm_scale=DETACH_LAYER_NORM_SCALE_FOR_TEST, 48 | attention_trace_type=AttentionTraceType.K, 49 | ), 50 | ]: 51 | trace_config = TraceConfig( 52 | node_index=NodeIndex( 53 | node_type=NodeType.ATTENTION_HEAD, 54 | layer_index=3, 55 | pass_type=PassType.FORWARD, 56 | tensor_indices=(0, 0, 0), 57 | ), 58 | pre_or_post_act=PreOrPostAct.POST, 59 | detach_layer_norm_scale=DETACH_LAYER_NORM_SCALE_FOR_TEST, 60 | attention_trace_type=AttentionTraceType.V, 61 | downstream_trace_config=downstream_trace_config, 62 | ) 63 | dst_config = DstConfig( 64 | model_context=standard_model_context, 65 | autoencoder_context=standard_autoencoder_context, 66 | trace_config=trace_config, 67 | ) 68 | dst_list = [ 69 | DerivedScalarType.UNFLATTENED_ATTN_WRITE_TO_FINAL_RESIDUAL_GRAD, 70 | DerivedScalarType.ONLINE_AUTOENCODER_WRITE_TO_FINAL_RESIDUAL_GRAD, 71 | ] 72 | dst_and_config_list = [(dst, dst_config) for dst in dst_list] 73 | current_ds_store, _, raw_store = get_derived_scalars_for_prompt( 74 | model_context=standard_model_context, 75 | prompt=prompt, 76 | trace_config=trace_config, 77 | dst_and_config_list=dst_and_config_list, # type: ignore 78 | autoencoder_context=standard_autoencoder_context, 79 | loss_fn_for_backward_pass=loss_fn_for_backward_pass, 80 | ) 81 | -------------------------------------------------------------------------------- /neuron_viewer/.gitignore: -------------------------------------------------------------------------------- 1 | **/*.trace 2 | **/*.zip 3 | **/*.tar.gz 4 | **/*.tgz 5 | **/*.log 6 | .parcel-cache 7 | 8 | **/*.bun 9 | 10 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 
11 | 12 | # dependencies 13 | /node_modules 14 | /.pnp 15 | .pnp.js 16 | 17 | # testing 18 | /coverage 19 | 20 | # production 21 | /build 22 | 23 | # misc 24 | .DS_Store 25 | .env.local 26 | .env.development.local 27 | .env.test.local 28 | .env.production.local 29 | 30 | npm-debug.log* 31 | yarn-debug.log* 32 | yarn-error.log* 33 | 34 | *.pyc 35 | dist/ 36 | 37 | .vscode 38 | -------------------------------------------------------------------------------- /neuron_viewer/.parcelrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "@parcel/config-default", 3 | "transformers": { 4 | "*.{ts,tsx}": ["@parcel/transformer-typescript-tsc"] 5 | }, 6 | "validators": { 7 | "*.{ts,tsx}": ["@parcel/validator-typescript"] 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /neuron_viewer/.postcssrc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": { 3 | "tailwindcss": {} 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /neuron_viewer/.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 100, 3 | "useTabs": false, 4 | "semi": true, 5 | "tabWidth": 2, 6 | } 7 | -------------------------------------------------------------------------------- /neuron_viewer/README.md: -------------------------------------------------------------------------------- 1 | # Neuron viewer 2 | 3 | A React app that hosts TDB as well as pages with information about individual model components 4 | (MLP neurons, attention heads and autoencoder latents for both). 5 | 6 | 7 | ## Running the server locally 8 | 9 | First, install the app: 10 | 11 | ```sh 12 | npm install 13 | ``` 14 | 15 | Then run the frontend: 16 | 17 | ```sh 18 | npm start 19 | ``` 20 | 21 | - To open a Neuron Viewer page, navigate to `http://localhost:1234`. 22 | - To open TDB, navigate to `http://localhost:1234/gpt2-small/tdb_alpha`. 23 | - To open TDB with autoencoders, navigate to `http://localhost:1234/gpt2-small_ae-resid-delta-mlp-v4_ae-resid-delta-attn-v4/tdb_alpha` 24 | (where `ae-resid-delta-mlp-v4` and `ae-resid-delta-attn-v4` must match the autoencoder names that are used in the [activation server](../neuron_explainer/activation_server/README.md)). 25 | 26 | ## Formatting code 27 | 28 | To check whether the code is correctly formatted: 29 | 30 | ```sh 31 | npm run check-code-format 32 | ``` 33 | 34 | To format the code: 35 | 36 | ```sh 37 | npm run format-code 38 | ``` 39 | 40 | ## Code organization 41 | 42 | - [src/client](src/client/): Auto-generated code for interacting with the activation server (the neuron viewer's backend). Do not edit this code! Follow the instructions in [the activation server README](../neuron_explainer/activation_server/README.md) to regenerate this code if you make changes to the activation server. Use [src/requests](src/requests/) when calling the activation server. 43 | - [src/panes](src/panes/): UI elements that can be used as panes on a page: tokens+activations, similar neurons, etc. 44 | - [src/requests](src/requests/): Client libraries for making network requests to the activation server. 45 | - [src/TransformerDebugger](src/TransformerDebugger/): Code related to the Transformer Debugger. 46 | - [src](src/): Other code. 
47 | 48 | ## Using a remote activation server 49 | 50 | If you decide to run your activation server on a different host or port than the default, you can 51 | point neuron viewer at it by setting the `NEURON_VIEWER_ACTIVATION_SERVER_URL` environment variable: 52 | 53 | ```sh 54 | NEURON_VIEWER_ACTIVATION_SERVER_URL=https://some.url:port npm start 55 | ``` 56 | 57 | ## Making changes 58 | 59 | Be sure to run the following to validate any changes you make: 60 | 61 | ```sh 62 | npm run check-type-warnings && npm run check-code-format && npm run build 63 | ``` 64 | -------------------------------------------------------------------------------- /neuron_viewer/prepend_autogen_comments.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start_dir="./src/client" 4 | prepend_string="// Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it." 5 | 6 | # Find all files 7 | find "$start_dir" -type f | while read file; do 8 | # Create a temporary file 9 | temp_file=$(mktemp) 10 | 11 | # Write the string to the temporary file 12 | echo "$prepend_string\n" > "$temp_file" 13 | 14 | # Concatenate the original file to the temporary file 15 | cat "$file" >> "$temp_file" 16 | 17 | # Replace the original file with the temporary file 18 | mv "$temp_file" "$file" 19 | done 20 | -------------------------------------------------------------------------------- /neuron_viewer/public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_viewer/public/favicon.ico -------------------------------------------------------------------------------- /neuron_viewer/public/logo192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_viewer/public/logo192.png -------------------------------------------------------------------------------- /neuron_viewer/public/logo512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_viewer/public/logo512.png -------------------------------------------------------------------------------- /neuron_viewer/public/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "short_name": "React App", 3 | "name": "Create React App Sample", 4 | "icons": [ 5 | { 6 | "src": "favicon.ico", 7 | "sizes": "64x64 32x32 24x24 16x16", 8 | "type": "image/x-icon" 9 | }, 10 | { 11 | "src": "logo192.png", 12 | "type": "image/png", 13 | "sizes": "192x192" 14 | }, 15 | { 16 | "src": "logo512.png", 17 | "type": "image/png", 18 | "sizes": "512x512" 19 | } 20 | ], 21 | "start_url": ".", 22 | "display": "standalone", 23 | "theme_color": "#000000", 24 | "background_color": "#ffffff" 25 | } 26 | -------------------------------------------------------------------------------- /neuron_viewer/public/robots.txt: -------------------------------------------------------------------------------- 1 | # https://www.robotstxt.org/robotstxt.html 2 | User-agent: * 3 | Disallow: 4 | -------------------------------------------------------------------------------- /neuron_viewer/src/App.css: 
-------------------------------------------------------------------------------- 1 | @tailwind base; 2 | @tailwind components; 3 | @tailwind utilities; 4 | 5 | .ag-theme-alpine { 6 | --ag-grid-size: 1px; 7 | --ag-list-item-height: 20px; 8 | } 9 | -------------------------------------------------------------------------------- /neuron_viewer/src/App.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { useNavigate, Route, Routes, Link } from "react-router-dom"; 3 | import "./App.css"; 4 | import TransformerDebugger from "./TransformerDebugger/TransformerDebugger"; 5 | import { NextUIProvider } from "@nextui-org/react"; 6 | import Welcome from "./welcome"; 7 | import NodePage from "./nodePage"; 8 | 9 | const NotFoundPage: React.FC = () => { 10 | return ( 11 |
<div>
 12 | <div>
 13 | <h1>Page Not Found</h1>
 14 | <p>
 15 | Sorry, the page you are looking for does not exist.
 16 | </p>
 17 | <Link to="/">
 21 | Go back home
 22 | </Link>
 23 | </div>
 24 | </div>
25 | ); 26 | }; 27 | 28 | const App: React.FC = () => { 29 | const navigate = useNavigate(); 30 | 31 | return ( 32 | 33 | 34 | {/* Actual substantive pages */} 35 | } /> 36 | } /> 37 | } /> 38 | 39 | {/* Catch-all for bogus URLs */} 40 | } /> 41 | 42 | 43 | ); 44 | }; 45 | 46 | export default App; 47 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/DisplayOptions.tsx: -------------------------------------------------------------------------------- 1 | import { Checkbox } from "@nextui-org/react"; 2 | import { ExplanatoryTooltip } from "../common/ExplanatoryTooltip"; 3 | 4 | const displayKeys: { [key: string]: { label: string; explanation: string } } = { 5 | logits: { 6 | label: "Show logits display", 7 | explanation: 8 | "Whether to show a table listing top candidates for the next token with their logits.", 9 | }, 10 | bySequenceToken: { 11 | label: "Show token effect display", 12 | explanation: 13 | "Whether to show the prompt, with each token colored by the estimated total effect summed over all nodes of a same type (MLP neurons, attention heads, embeddings).", 14 | }, 15 | node: { 16 | label: "Show node table", 17 | explanation: 18 | "Whether to show a table of nodes (MLP neurons, attention heads, autoencoder latents, etc.) and their effect on the direction of interest.", 19 | }, 20 | }; 21 | 22 | const DisplayOptions = ({ 23 | displaySettings, 24 | toggleDisplay, 25 | }: { 26 | displaySettings: Map; 27 | toggleDisplay: (key: string) => void; 28 | }) => { 29 | return ( 30 | <> 31 | {Object.keys(displayKeys).map((key) => ( 32 | 33 |
34 | <Checkbox
 isSelected={displaySettings.get(key)}
 onChange={() => toggleDisplay(key)} 38 | > 39 | {displayKeys[key].label} 40 | </Checkbox> 41 | </div>
 42 | </ExplanatoryTooltip>
43 | ))} 44 | 45 | ); 46 | }; 47 | 48 | export default DisplayOptions; 49 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/LayerDisplay.tsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_viewer/src/TransformerDebugger/cards/LayerDisplay.tsx -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/SparsityMetricsDisplay.tsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_viewer/src/TransformerDebugger/cards/SparsityMetricsDisplay.tsx -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/TokenTable.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { formatToken } from "../../tokenRendering"; 3 | 4 | type TokenTableProps = { 5 | leftTokens: string[]; 6 | rightTokens?: string[]; 7 | }; 8 | 9 | const TokenTable: React.FC = ({ leftTokens, rightTokens }) => { 10 | const tableStyle: React.CSSProperties = { 11 | maxWidth: "800px", 12 | margin: "0 auto", 13 | borderCollapse: "collapse", 14 | borderColor: "#f0f0f0", 15 | }; 16 | 17 | const cellStyle: React.CSSProperties = { 18 | textAlign: "center", 19 | padding: "5px", 20 | border: "1px solid #f0f0f0", 21 | fontFamily: "monospace", 22 | }; 23 | 24 | const indexMismatchStyle: React.CSSProperties = { 25 | ...cellStyle, 26 | backgroundColor: "#ffcccc", // Light red background for mismatched indices 27 | }; 28 | 29 | const rowNameStyle: React.CSSProperties = { 30 | ...cellStyle, 31 | fontWeight: "bold", 32 | fontFamily: "sans-serif", 33 | }; 34 | 35 | const isMismatchAtIndex = (index: number) => { 36 | return rightTokens && leftTokens[index] !== rightTokens[index]; 37 | }; 38 | 39 | return ( 40 | 41 | 42 | 43 | 44 | {leftTokens.map((token, i) => ( 45 | 48 | ))} 49 | 50 | {rightTokens && ( 51 | 52 | 53 | {rightTokens.map((token, i) => ( 54 | 57 | ))} 58 | 59 | )} 60 | 61 | 62 | {leftTokens.map((_, i) => ( 63 | 66 | ))} 67 | 68 | 69 |
{rightTokens ? "Left token" : "Token"} 46 | {formatToken(token)} 47 |
Right token 55 | {formatToken(token)} 56 |
Index 64 | {i} 65 |
70 | ); 71 | }; 72 | 73 | export default TokenTable; 74 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/inference_params/AblateNodeSpecs.tsx: -------------------------------------------------------------------------------- 1 | import { 2 | Table, 3 | TableHeader, 4 | TableColumn, 5 | TableBody, 6 | TableRow, 7 | TableCell, 8 | Button, 9 | Divider, 10 | } from "@nextui-org/react"; 11 | import { makeNodeName } from "../../utils/nodes"; 12 | import { NodeType, InferenceAndTokenData } from "../../../client"; 13 | import { TokenLabel } from "./TokenLabel"; 14 | import { PromptInferenceParams } from "./inferenceParams"; 15 | 16 | type AblateNodeSpecsProps = { 17 | leftPromptInferenceParams: PromptInferenceParams; 18 | setLeftPromptInferenceParams: React.Dispatch>; 19 | inferenceAndTokenData: InferenceAndTokenData | null; 20 | twoPromptsMode: boolean; 21 | }; 22 | export const AblateNodeSpecs: React.FC = ({ 23 | leftPromptInferenceParams, 24 | setLeftPromptInferenceParams, 25 | inferenceAndTokenData, 26 | twoPromptsMode, 27 | }) => { 28 | const nodeAblations = leftPromptInferenceParams.nodeAblations; 29 | if (nodeAblations.length === 0) { 30 | return null; 31 | } 32 | return ( 33 |
34 |
35 | Active ablations 36 |
37 |
38 | {twoPromptsMode ? ( 39 | Ablations affect both prompts 40 | ) : ( 41 | Left pane shows the ablated version; right pane shows the baseline version. 42 | )} 43 |
44 | 45 | 46 | Name 47 | Pass type 48 | Token attended to 49 | Token attended from 50 | Ablated to value 51 | Remove 52 | 53 | 54 | {nodeAblations.map((spec, index) => ( 55 | 56 | {makeNodeName(spec.nodeIndex)} 57 | {spec.nodeIndex.passType} 58 | 59 | {spec.nodeIndex.nodeType === NodeType.ATTENTION_HEAD ? ( 60 | 64 | ) : ( 65 | "" 66 | )} 67 | 68 | 69 | 73 | 74 | {spec.value} 75 | 76 | 89 | 90 | 91 | ))} 92 | 93 |
94 | 95 |
96 | ); 97 | }; 98 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/inference_params/TokenLabel.tsx: -------------------------------------------------------------------------------- 1 | // Displays a token index and optional token string. 2 | import React from "react"; 3 | import { InferenceAndTokenData } from "../../../client"; 4 | import { renderToken } from "../../../tokenRendering"; 5 | 6 | export const TokenLabel: React.FC<{ 7 | index: number; 8 | tokenString?: string; 9 | inferenceAndTokenData: InferenceAndTokenData | null; 10 | }> = ({ index, tokenString, inferenceAndTokenData }) => { 11 | const currentTokenString = inferenceAndTokenData?.tokensAsStrings[index] || tokenString || ""; 12 | return ( 13 | <> 14 | {renderToken(currentTokenString)} ({index}) 15 | 16 | ); 17 | }; 18 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/inference_params/inferenceParams.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | ComponentTypeForMlp, 3 | ComponentTypeForAttention, 4 | NodeAblation, 5 | NodeToTrace, 6 | } from "../../../client"; 7 | 8 | // Prompt-specific parameters. If there are two prompts, all of these can vary between them. 9 | // (Note that we've temporarily forced ablations to match between prompts; they're stored 10 | // exclusively on the left prompt's params.) 11 | export type PromptInferenceParams = { 12 | prompt: string; 13 | targetTokens: string[]; 14 | distractorTokens: string[]; 15 | nodeAblations: NodeAblation[]; 16 | upstreamNodeToTrace: NodeToTrace | null; 17 | downstreamNodeToTrace: NodeToTrace | null; 18 | }; 19 | 20 | // Non-prompt-specific parameters. If there are two prompts, these are shared between them. 21 | export type CommonInferenceParams = { 22 | componentTypeForMlp: ComponentTypeForMlp; 23 | componentTypeForAttention: ComponentTypeForAttention; 24 | topAndBottomKForNodeTable: number; 25 | hideEarlyLayersWhenAblating: boolean; 26 | }; 27 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/node_table/TopTokensDisplay.tsx: -------------------------------------------------------------------------------- 1 | import { TopTokens, TokenAndScalar } from "../../../client"; 2 | import { Tooltip } from "@nextui-org/react"; 3 | import { ExplanatoryTooltip } from "../../common/ExplanatoryTooltip"; 4 | import { renderTokenOnGray } from "../../../tokenRendering"; 5 | 6 | const renderTokenList = ( 7 | title: string, 8 | explanation: string, 9 | tokens: TokenAndScalar[], 10 | maxTokens?: number 11 | ) => ( 12 | 13 | 14 | 15 | 16 | {title}: 17 | 18 | 19 | 20 | {tokens.slice(0, maxTokens).map((token, idx) => { 21 | return ( 22 | 23 | {renderTokenOnGray(token.token, idx)} 24 | 25 | ); 26 | })} 27 | 28 | ); 29 | 30 | function whichSidesToDisplay( 31 | leftSideData: TopTokens | null, 32 | rightSideData: TopTokens | null, 33 | maxTokens?: number 34 | ): { displayLeftSide: boolean; displayRightSide: boolean } { 35 | // Display both sides unless all the tokens are the same. 
If they are the same, then show the side 36 | // with larger magnitude on the first token (to avoid showing a side with all 0s) 37 | let displayLeftSide = leftSideData !== null; 38 | let displayRightSide = rightSideData !== null; 39 | if (leftSideData && rightSideData) { 40 | const leftTopToken = leftSideData.top[0]; 41 | const rightTopToken = rightSideData.top[0]; 42 | if (Math.abs(leftTopToken.scalar) <= 0.01) { 43 | return { displayLeftSide: false, displayRightSide: true }; 44 | } 45 | if (Math.abs(rightTopToken.scalar) <= 0.01) { 46 | return { displayLeftSide: true, displayRightSide: false }; 47 | } 48 | const leftTopTokens = leftSideData.top.slice(0, maxTokens).map((token) => token.token); 49 | const rightTopTokens = rightSideData.top.slice(0, maxTokens).map((token) => token.token); 50 | const leftBottomTokens = leftSideData.bottom.slice(0, maxTokens).map((token) => token.token); 51 | const rightBottomTokens = rightSideData.bottom.slice(0, maxTokens).map((token) => token.token); 52 | const topTokensAreEqual = 53 | leftTopTokens.length === rightTopTokens.length && 54 | leftTopTokens.every((token, index) => token === rightTopTokens[index]); 55 | const bottomTokensAreEqual = 56 | leftBottomTokens.length === rightBottomTokens.length && 57 | leftBottomTokens.every((token, index) => token === rightBottomTokens[index]); 58 | if (topTokensAreEqual && bottomTokensAreEqual) { 59 | displayLeftSide = Math.abs(leftTopToken.scalar) > Math.abs(rightTopToken.scalar); 60 | displayRightSide = !displayLeftSide; 61 | } 62 | return { displayLeftSide, displayRightSide }; 63 | } 64 | return { displayLeftSide, displayRightSide }; 65 | } 66 | 67 | export const TopTokensDisplay: React.FC<{ 68 | leftSideData: TopTokens | null; 69 | rightSideData: TopTokens | null; 70 | label: string; 71 | explanations: { increase: string; decrease: string }; 72 | }> = ({ leftSideData, rightSideData, label, explanations }) => { 73 | const { displayLeftSide, displayRightSide } = whichSidesToDisplay(leftSideData, rightSideData); 74 | const leftTopTokens = leftSideData?.top; 75 | const leftBottomTokens = leftSideData?.bottom; 76 | const rightTopTokens = rightSideData?.top; 77 | const rightBottomTokens = rightSideData?.bottom; 78 | const leftTitlePrefix = "Left "; 79 | const rightTitlePrefix = "Right "; 80 | return ( 81 |
<div>
 82 | {displayLeftSide && ( 83 | <div>
 84 | {leftTopTokens && 85 | renderTokenList( 86 | `${leftTitlePrefix}${label} top`, 87 | explanations.increase, 88 | leftTopTokens, 89 | 10 90 | )} 91 | {leftBottomTokens && 92 | renderTokenList("bottom", explanations.decrease, leftBottomTokens, 10)} 93 | </div>
 94 | )} 95 | {displayRightSide && ( 96 | <div>
 97 | {rightTopTokens && 98 | renderTokenList( 99 | `${rightTitlePrefix}${label} top`, 100 | explanations.increase, 101 | rightTopTokens, 102 | 10 103 | )} 104 | {rightBottomTokens && 105 | renderTokenList("bottom", explanations.decrease, rightBottomTokens, 10)} 106 | </div>
 107 | )} 108 | </div>
109 | ); 110 | }; 111 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/prompt/MultiTokenInput.tsx: -------------------------------------------------------------------------------- 1 | import { Button, Input } from "@nextui-org/react"; 2 | 3 | export const MultiTokenInput: React.FC<{ 4 | tokens: string[]; 5 | onChange: (tokens: string[]) => void; 6 | className?: string; 7 | allowLengthZero?: boolean; 8 | }> = ({ tokens, onChange, className, allowLengthZero }) => { 9 | // display a row of text inputs with one token per input, + button to add more tokens, - button to remove last token 10 | // when token is changed, call onChange with new tokens 11 | const allowRemovingTokens = tokens.length > 1 || (allowLengthZero && tokens.length === 1); 12 | 13 | return ( 14 |
<div className={className}>
 15 | {tokens.map((token, index) => ( 16 | <Input
 key={index}
 value={token}
 onValueChange={(value) => { 23 | const newTokens = [...tokens]; 24 | newTokens[index] = value; 25 | onChange(newTokens); 26 | }} 27 | /> 28 | ))} 29 | <Button onPress={() => onChange([...tokens, ""])}>+</Button> 37 | <Button isDisabled={!allowRemovingTokens} onPress={() => onChange(tokens.slice(0, -1))}>-</Button> 50 | </div>
51 | ); 52 | }; 53 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/cards/prompt/swap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/transformer-debugger/87e6db7b7e73ded5037eeeff05deb5e81548a10a/neuron_viewer/src/TransformerDebugger/cards/prompt/swap.png -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/common/ExplanatoryTooltip.tsx: -------------------------------------------------------------------------------- 1 | import React from "react"; 2 | import { Tooltip } from "@nextui-org/react"; 3 | 4 | // This component will result in its entire contents being tooltipped, such that hovering over any 5 | // part of it will show the explanation. 6 | // 7 | // Usage: 8 | // 9 | //

<ExplanatoryTooltip explanation="..."><div>Some contents</div>
 10 | // </ExplanatoryTooltip>
11 | export const ExplanatoryTooltip: React.FC<{ 12 | explanation: string; 13 | children: React.ReactNode; 14 | }> = ({ explanation, children }) => { 15 | return ( 16 | 17 | {children} 18 | 19 | ); 20 | }; 21 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/common/JsonModal.tsx: -------------------------------------------------------------------------------- 1 | import { 2 | Modal, 3 | ModalContent, 4 | ModalHeader, 5 | ModalBody, 6 | ModalFooter, 7 | Button, 8 | useDisclosure, 9 | } from "@nextui-org/react"; 10 | import ReactJson, { OnCopyProps } from "@microlink/react-json-view"; 11 | import { MagnifyingGlassIcon } from "@heroicons/react/24/solid"; 12 | import { ExplanatoryTooltip } from "./ExplanatoryTooltip"; 13 | 14 | type JsonModalProps = { 15 | jsonData: any; 16 | buttonLabel?: string | JSX.Element; 17 | collapsed?: number; 18 | }; 19 | 20 | const copyOrDownload = (copy: OnCopyProps) => { 21 | const jsonAsString = JSON.stringify(copy.src, null, 2); 22 | if (navigator.clipboard) { 23 | navigator.clipboard.writeText(jsonAsString).catch((err) => { 24 | console.error("Error in copying text: ", err); 25 | }); 26 | } else { 27 | const blob = new Blob([jsonAsString], { 28 | type: "application/json", 29 | }); 30 | const href = URL.createObjectURL(blob); 31 | const link = document.createElement("a"); 32 | link.href = href; 33 | link.download = "data.json"; 34 | document.body.appendChild(link); 35 | link.click(); 36 | document.body.removeChild(link); 37 | URL.revokeObjectURL(href); 38 | } 39 | }; 40 | 41 | export default function JsonModal({ 42 | jsonData, 43 | buttonLabel = , 44 | collapsed = 2, 45 | }: JsonModalProps) { 46 | const { isOpen, onOpen, onOpenChange } = useDisclosure(); 47 | 48 | return ( 49 | <> 50 | 51 | 52 | 53 | 54 | 55 | {(onClose) => ( 56 | <> 57 | JSON Data 58 | 59 |
60 | 66 |
67 |
68 | 69 | 72 | 73 | 74 | )} 75 |
76 |
77 | 78 | ); 79 | } 80 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/requests/inferenceResponseUtils.tsx: -------------------------------------------------------------------------------- 1 | import type { InferenceAndTokenData, InferenceResponseAndResponseDict } from "../../client"; 2 | 3 | export function getSubResponse( 4 | responseData: InferenceResponseAndResponseDict | null, 5 | requestSpecName: string 6 | ): T | null { 7 | if (!responseData) { 8 | return null; 9 | } 10 | return responseData.processingResponseDataByName![requestSpecName] as T; 11 | } 12 | 13 | export function getInferenceAndTokenData( 14 | responseData: InferenceResponseAndResponseDict | null 15 | ): InferenceAndTokenData | null { 16 | if (!responseData) { 17 | return null; 18 | } 19 | return responseData.inferenceResponse.inferenceAndTokenData; 20 | } 21 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/utils/explanations.ts: -------------------------------------------------------------------------------- 1 | // TODO: Make this explanation clearer. Does this only cover the direct effect as opposed to indirect effects? 2 | export const WRITE_MAGNITUDE_EXPLANATION = 3 | "Magnitude of the write vector to the direction of interest produced by the component."; 4 | 5 | export const ACTIVATION_EXPLANATION = 6 | "MLP post-activation, attention post-softmax, or autoencoder latent activation."; 7 | 8 | // TODO: Make this explanation clearer. Is this a magnitude? Does this only cover the direct effect as opposed to indirect effects? 9 | export const DIRECTION_WRITE_EXPLANATION = 10 | "Direction write: Value of the write to the direction of interest."; 11 | 12 | export const ACT_TIMES_GRAD_EXPLANATION = 13 | "Activation * gradient: Estimate of the total effect of the component on the activation of the direction of interest, including indirect effects through other components."; 14 | 15 | export const TOKEN_ATTENDED_TO_EXPLANATION = 16 | "Token attended-to, for attention heads only, where activations are specific to a token pair. This is the least recent token in the token pair."; 17 | 18 | export const TOKEN_ATTRIBUTED_TO_EXPLANATION = 19 | "Token attended-to, for attention-write autoencoder latents only. This is the token with the most positive attribution to the latent activation."; 20 | 21 | export const TOKEN_ATTENDED_FROM_EXPLANATION = 22 | "Current token, for all components. For MLP neurons and MLP latents, this is the token where the component activates. For attention heads, where activations are specific to a token pair, this is the most recent token in the token pair."; 23 | -------------------------------------------------------------------------------- /neuron_viewer/src/TransformerDebugger/utils/numbers.tsx: -------------------------------------------------------------------------------- 1 | import { IRowNode } from "ag-grid-community"; 2 | 3 | export const formatFloat = (value: any, numDecimalPlaces: number = 2) => { 4 | return value !== undefined ? parseFloat(value).toFixed(numDecimalPlaces) : ""; 5 | }; 6 | 7 | export const formatFloatWithZeroPoint = ( 8 | value: any, 9 | zeroPoint: number, 10 | numDecimalPlaces: number = 2 11 | ) => { 12 | return value !== undefined ? 
(parseFloat(value) - zeroPoint).toFixed(numDecimalPlaces) : ""; 13 | }; 14 | 15 | export const diffOptionalNumbers = (a: number | undefined, b: number | undefined) => { 16 | if (a === undefined) { 17 | a = 0; 18 | } 19 | if (b === undefined) { 20 | b = 0; 21 | } 22 | return a - b; 23 | }; 24 | 25 | export const compareWithUndefinedAsZero = ( 26 | a: number | undefined, 27 | b: number | undefined, 28 | unusedNodeA: IRowNode, 29 | unusedNodeB: IRowNode, 30 | // The grid itself handles inverting the order, so the comparator doesn't need to use it. 31 | unusedIsDescending: boolean 32 | ) => { 33 | if (a === undefined) { 34 | a = 0; 35 | } 36 | if (b === undefined) { 37 | b = 0; 38 | } 39 | return a - b; 40 | }; 41 | 42 | export const compareWithUndefinedLast = ( 43 | a: number | undefined, 44 | b: number | undefined, 45 | unusedNodeA: IRowNode, 46 | unusedNodeB: IRowNode, 47 | isDescending: boolean 48 | ) => { 49 | if (a === undefined) { 50 | a = isDescending ? -Infinity : Infinity; 51 | } 52 | if (b === undefined) { 53 | b = isDescending ? -Infinity : Infinity; 54 | } 55 | return a - b; 56 | }; 57 | -------------------------------------------------------------------------------- /neuron_viewer/src/client/core/ApiError.ts: -------------------------------------------------------------------------------- 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it. 2 | 3 | /* istanbul ignore file */ 4 | /* tslint:disable */ 5 | /* eslint-disable */ 6 | import type { ApiRequestOptions } from "./ApiRequestOptions"; 7 | import type { ApiResult } from "./ApiResult"; 8 | 9 | export class ApiError extends Error { 10 | public readonly url: string; 11 | public readonly status: number; 12 | public readonly statusText: string; 13 | public readonly body: any; 14 | public readonly request: ApiRequestOptions; 15 | 16 | constructor(request: ApiRequestOptions, response: ApiResult, message: string) { 17 | super(message); 18 | 19 | this.name = "ApiError"; 20 | this.url = response.url; 21 | this.status = response.status; 22 | this.statusText = response.statusText; 23 | this.body = response.body; 24 | this.request = request; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /neuron_viewer/src/client/core/ApiRequestOptions.ts: -------------------------------------------------------------------------------- 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it. 2 | 3 | /* istanbul ignore file */ 4 | /* tslint:disable */ 5 | /* eslint-disable */ 6 | export type ApiRequestOptions = { 7 | readonly method: "GET" | "PUT" | "POST" | "DELETE" | "OPTIONS" | "HEAD" | "PATCH"; 8 | readonly url: string; 9 | readonly path?: Record; 10 | readonly cookies?: Record; 11 | readonly headers?: Record; 12 | readonly query?: Record; 13 | readonly formData?: Record; 14 | readonly body?: any; 15 | readonly mediaType?: string; 16 | readonly responseHeader?: string; 17 | readonly errors?: Record; 18 | }; 19 | -------------------------------------------------------------------------------- /neuron_viewer/src/client/core/ApiResult.ts: -------------------------------------------------------------------------------- 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it. 
2 | 3 | /* istanbul ignore file */ 4 | /* tslint:disable */ 5 | /* eslint-disable */ 6 | export type ApiResult = { 7 | readonly url: string; 8 | readonly ok: boolean; 9 | readonly status: number; 10 | readonly statusText: string; 11 | readonly body: any; 12 | }; 13 | -------------------------------------------------------------------------------- /neuron_viewer/src/client/core/CancelablePromise.ts: -------------------------------------------------------------------------------- 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it. 2 | 3 | /* istanbul ignore file */ 4 | /* tslint:disable */ 5 | /* eslint-disable */ 6 | export class CancelError extends Error { 7 | constructor(message: string) { 8 | super(message); 9 | this.name = "CancelError"; 10 | } 11 | 12 | public get isCancelled(): boolean { 13 | return true; 14 | } 15 | } 16 | 17 | export interface OnCancel { 18 | readonly isResolved: boolean; 19 | readonly isRejected: boolean; 20 | readonly isCancelled: boolean; 21 | 22 | (cancelHandler: () => void): void; 23 | } 24 | 25 | export class CancelablePromise implements Promise { 26 | #isResolved: boolean; 27 | #isRejected: boolean; 28 | #isCancelled: boolean; 29 | readonly #cancelHandlers: (() => void)[]; 30 | readonly #promise: Promise; 31 | #resolve?: (value: T | PromiseLike) => void; 32 | #reject?: (reason?: any) => void; 33 | 34 | constructor( 35 | executor: ( 36 | resolve: (value: T | PromiseLike) => void, 37 | reject: (reason?: any) => void, 38 | onCancel: OnCancel 39 | ) => void 40 | ) { 41 | this.#isResolved = false; 42 | this.#isRejected = false; 43 | this.#isCancelled = false; 44 | this.#cancelHandlers = []; 45 | this.#promise = new Promise((resolve, reject) => { 46 | this.#resolve = resolve; 47 | this.#reject = reject; 48 | 49 | const onResolve = (value: T | PromiseLike): void => { 50 | if (this.#isResolved || this.#isRejected || this.#isCancelled) { 51 | return; 52 | } 53 | this.#isResolved = true; 54 | this.#resolve?.(value); 55 | }; 56 | 57 | const onReject = (reason?: any): void => { 58 | if (this.#isResolved || this.#isRejected || this.#isCancelled) { 59 | return; 60 | } 61 | this.#isRejected = true; 62 | this.#reject?.(reason); 63 | }; 64 | 65 | const onCancel = (cancelHandler: () => void): void => { 66 | if (this.#isResolved || this.#isRejected || this.#isCancelled) { 67 | return; 68 | } 69 | this.#cancelHandlers.push(cancelHandler); 70 | }; 71 | 72 | Object.defineProperty(onCancel, "isResolved", { 73 | get: (): boolean => this.#isResolved, 74 | }); 75 | 76 | Object.defineProperty(onCancel, "isRejected", { 77 | get: (): boolean => this.#isRejected, 78 | }); 79 | 80 | Object.defineProperty(onCancel, "isCancelled", { 81 | get: (): boolean => this.#isCancelled, 82 | }); 83 | 84 | return executor(onResolve, onReject, onCancel as OnCancel); 85 | }); 86 | } 87 | 88 | get [Symbol.toStringTag]() { 89 | return "Cancellable Promise"; 90 | } 91 | 92 | public then( 93 | onFulfilled?: ((value: T) => TResult1 | PromiseLike) | null, 94 | onRejected?: ((reason: any) => TResult2 | PromiseLike) | null 95 | ): Promise { 96 | return this.#promise.then(onFulfilled, onRejected); 97 | } 98 | 99 | public catch( 100 | onRejected?: ((reason: any) => TResult | PromiseLike) | null 101 | ): Promise { 102 | return this.#promise.catch(onRejected); 103 | } 104 | 105 | public finally(onFinally?: (() => void) | null): Promise { 106 | return this.#promise.finally(onFinally); 107 | } 108 | 109 | public cancel(): void { 110 | if 
(this.#isResolved || this.#isRejected || this.#isCancelled) { 111 | return; 112 | } 113 | this.#isCancelled = true; 114 | if (this.#cancelHandlers.length) { 115 | try { 116 | for (const cancelHandler of this.#cancelHandlers) { 117 | cancelHandler(); 118 | } 119 | } catch (error) { 120 | console.warn("Cancellation threw an error", error); 121 | return; 122 | } 123 | } 124 | this.#cancelHandlers.length = 0; 125 | this.#reject?.(new CancelError("Request aborted")); 126 | } 127 | 128 | public get isCancelled(): boolean { 129 | return this.#isCancelled; 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /neuron_viewer/src/client/core/OpenAPI.ts: -------------------------------------------------------------------------------- 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it. 2 | 3 | /* istanbul ignore file */ 4 | /* tslint:disable */ 5 | /* eslint-disable */ 6 | import type { ApiRequestOptions } from "./ApiRequestOptions"; 7 | 8 | type Resolver = (options: ApiRequestOptions) => Promise; 9 | type Headers = Record; 10 | 11 | export type OpenAPIConfig = { 12 | BASE: string; 13 | VERSION: string; 14 | WITH_CREDENTIALS: boolean; 15 | CREDENTIALS: "include" | "omit" | "same-origin"; 16 | TOKEN?: string | Resolver; 17 | USERNAME?: string | Resolver; 18 | PASSWORD?: string | Resolver; 19 | HEADERS?: Headers | Resolver; 20 | ENCODE_PATH?: (path: string) => string; 21 | }; 22 | 23 | export const OpenAPI: OpenAPIConfig = { 24 | BASE: "", 25 | VERSION: "0.1.0", 26 | WITH_CREDENTIALS: false, 27 | CREDENTIALS: "include", 28 | TOKEN: undefined, 29 | USERNAME: undefined, 30 | PASSWORD: undefined, 31 | HEADERS: undefined, 32 | ENCODE_PATH: undefined, 33 | }; 34 | -------------------------------------------------------------------------------- /neuron_viewer/src/client/models/AblationSpec.ts: -------------------------------------------------------------------------------- 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it. 2 | 3 | /* istanbul ignore file */ 4 | /* tslint:disable */ 5 | /* eslint-disable */ 6 | 7 | import type { MirroredActivationIndex } from "./MirroredActivationIndex"; 8 | 9 | /** 10 | * A specification for performing ablation on a model. 11 | */ 12 | export type AblationSpec = { 13 | index: MirroredActivationIndex; 14 | value: number; 15 | }; 16 | -------------------------------------------------------------------------------- /neuron_viewer/src/client/models/ActivationLocationType.ts: -------------------------------------------------------------------------------- 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it. 2 | 3 | /* istanbul ignore file */ 4 | /* tslint:disable */ 5 | /* eslint-disable */ 6 | 7 | /** 8 | * These are the names of activations expected to be instantiated during a forward pass. All activations are 9 | * pre-layer norm unless otherwise specified (RESID_POST_XYZ_LAYER_NORM). 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/AblationSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { MirroredActivationIndex } from "./MirroredActivationIndex";
8 | 
9 | /**
10 |  * A specification for performing ablation on a model.
11 |  */
12 | export type AblationSpec = {
13 |   index: MirroredActivationIndex;
14 |   value: number;
15 | };
16 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ActivationLocationType.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * These are the names of activations expected to be instantiated during a forward pass. All activations are
9 |  * pre-layer norm unless otherwise specified (RESID_POST_XYZ_LAYER_NORM).
10 |  */
11 | export enum ActivationLocationType {
12 |   RESID_POST_EMB = "resid.post_emb",
13 |   RESID_DELTA_ATTN = "resid.delta_attn",
14 |   RESID_POST_ATTN = "resid.post_attn",
15 |   RESID_DELTA_MLP = "resid.delta_mlp",
16 |   RESID_POST_MLP = "resid.post_mlp",
17 |   RESID_POST_MLP_LN = "resid.post_mlp_ln",
18 |   RESID_POST_ATTN_LN = "resid.post_attn_ln",
19 |   RESID_POST_LN_F = "resid.post_ln_f",
20 |   MLP_LN_SCALE = "mlp_ln.scale",
21 |   ATTN_LN_SCALE = "attn_ln.scale",
22 |   RESID_LN_F_SCALE = "resid.ln_f.scale",
23 |   ATTN_Q = "attn.q",
24 |   ATTN_K = "attn.k",
25 |   ATTN_V = "attn.v",
26 |   ATTN_QK_LOGITS = "attn.qk_logits",
27 |   ATTN_QK_PROBS = "attn.qk_probs",
28 |   ATTN_V_OUT = "attn.v_out",
29 |   MLP_PRE_ACT = "mlp.pre_act",
30 |   MLP_POST_ACT = "mlp.post_act",
31 |   LOGITS = "logits",
32 |   ONLINE_AUTOENCODER_LATENT = "online_autoencoder_latent",
33 |   ONLINE_MLP_AUTOENCODER_LATENT = "online_mlp_autoencoder_latent",
34 |   ONLINE_ATTENTION_AUTOENCODER_LATENT = "online_attention_autoencoder_latent",
35 |   ONLINE_MLP_AUTOENCODER_ERROR = "online_mlp_autoencoder_error",
36 |   ONLINE_RESIDUAL_MLP_AUTOENCODER_ERROR = "online_residual_mlp_autoencoder_error",
37 |   ONLINE_RESIDUAL_ATTENTION_AUTOENCODER_ERROR = "online_residual_attention_autoencoder_error",
38 | }
39 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/AttentionHeadRecordResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TokenAndAttentionScalars } from "./TokenAndAttentionScalars";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type AttentionHeadRecordResponse = {
15 |   dataset: string;
16 |   maxAttentionActivation: number;
17 |   mostPositiveTokenSequences: Array<Array<TokenAndAttentionScalars>>;
18 |   randomTokenSequences: Array<Array<TokenAndAttentionScalars>>;
19 | };
20 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/AttentionTraceType.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * An enumeration.
9 |  */
10 | export enum AttentionTraceType {
11 |   Q = "Q",
12 |   K = "K",
13 |   QK = "QK",
14 |   V = "V",
15 | }
16 | 
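Combining the models above: an AblationSpec pins a single activation to a fixed value during inference. A hypothetical sketch (the tensorIndices layout, layer, token, and neuron indices are made up for illustration):

import type { AblationSpec } from "./AblationSpec";
import { ActivationLocationType } from "./ActivationLocationType";
import { PassType } from "./PassType";

// Zero-ablate MLP neuron 123 at token position 4 in layer 7.
const ablation: AblationSpec = {
  index: {
    activationLocationType: ActivationLocationType.MLP_POST_ACT,
    tensorIndices: [4, 123], // hypothetical layout: [token index, neuron index]
    layerIndex: 7,
    passType: PassType.FORWARD,
  },
  value: 0,
};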
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/AttributedScoredExplanation.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type AttributedScoredExplanation = {
13 |   explanation: string;
14 |   score?: number;
15 |   datasetName: string;
16 | };
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/BatchedRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { InferenceSubRequest } from "./InferenceSubRequest";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type BatchedRequest = {
15 |   inferenceSubRequests: Array<InferenceSubRequest>;
16 | };
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/BatchedResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { InferenceResponseAndResponseDict } from "./InferenceResponseAndResponseDict";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type BatchedResponse = {
15 |   inferenceSubResponses: Array<InferenceResponseAndResponseDict>;
16 | };
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/BatchedTdbRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TdbRequestSpec } from "./TdbRequestSpec";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type BatchedTdbRequest = {
15 |   subRequests: Array<TdbRequestSpec>;
16 | };
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ComponentTypeForAttention.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * The type of component / fundamental unit to use for Attention layers.
9 |  *
10 |  * This determines which types of node appear in the node table to represent the Attention layers.
11 |  * Heads are the fundamental unit of Attention layers, but autoencoder latents are more interpretable.
12 |  */
13 | export enum ComponentTypeForAttention {
14 |   ATTENTION_HEAD = "attention_head",
15 |   AUTOENCODER_LATENT = "autoencoder_latent",
16 | }
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ComponentTypeForMlp.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * The type of component / fundamental unit to use for MLP layers.
9 |  *
10 |  * This determines which types of node appear in the node table to represent the MLP layers.
11 |  * Neurons are the fundamental unit of MLP layers, but autoencoder latents are more interpretable.
12 |  */
13 | export enum ComponentTypeForMlp {
14 |   NEURON = "neuron",
15 |   AUTOENCODER_LATENT = "autoencoder_latent",
16 | }
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedAttentionScalarsRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedAttentionScalarsRequestSpec } from "./DerivedAttentionScalarsRequestSpec";
8 | import type { InferenceRequestSpec } from "./InferenceRequestSpec";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type DerivedAttentionScalarsRequest = {
16 |   inferenceRequestSpec: InferenceRequestSpec;
17 |   derivedAttentionScalarsRequestSpec: DerivedAttentionScalarsRequestSpec;
18 | };
19 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedAttentionScalarsRequestSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarType } from "./DerivedScalarType";
8 | import type { NodeIdAndDatasets } from "./NodeIdAndDatasets";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type DerivedAttentionScalarsRequestSpec = {
16 |   specType?: DerivedAttentionScalarsRequestSpec.specType;
17 |   dst: DerivedScalarType;
18 |   layerIndex?: number;
19 |   activationIndex: number;
20 |   normalizeActivationsUsingNeuronRecord?: NodeIdAndDatasets;
21 | };
22 | 
23 | export namespace DerivedAttentionScalarsRequestSpec {
24 |   export enum specType {
25 |     DERIVED_ATTENTION_SCALARS_REQUEST_SPEC = "derived_attention_scalars_request_spec",
26 |   }
27 | }
28 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedAttentionScalarsResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedAttentionScalarsResponseData } from "./DerivedAttentionScalarsResponseData";
8 | import type { InferenceAndTokenData } from "./InferenceAndTokenData";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type DerivedAttentionScalarsResponse = {
16 |   inferenceAndTokenData: InferenceAndTokenData;
17 |   derivedAttentionScalarsResponseData: DerivedAttentionScalarsResponseData;
18 | };
19 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedAttentionScalarsResponseData.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { ProcessingResponseDataType } from "./ProcessingResponseDataType";
8 | import type { TokenAndAttentionScalars } from "./TokenAndAttentionScalars";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type DerivedAttentionScalarsResponseData = {
16 |   responseDataType?: ProcessingResponseDataType;
17 |   tokenAndAttentionScalarsList: Array<TokenAndAttentionScalars>;
18 | };
19 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedScalarsRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarsRequestSpec } from "./DerivedScalarsRequestSpec";
8 | import type { InferenceRequestSpec } from "./InferenceRequestSpec";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type DerivedScalarsRequest = {
16 |   inferenceRequestSpec: InferenceRequestSpec;
17 |   derivedScalarsRequestSpec: DerivedScalarsRequestSpec;
18 | };
19 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedScalarsRequestSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarType } from "./DerivedScalarType";
8 | import type { NodeIdAndDatasets } from "./NodeIdAndDatasets";
9 | import type { PassType } from "./PassType";
10 | 
11 | /**
12 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
13 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
14 |  * camelCase names.
15 |  */
16 | export type DerivedScalarsRequestSpec = {
17 |   specType?: DerivedScalarsRequestSpec.specType;
18 |   dst: DerivedScalarType;
19 |   layerIndex?: number;
20 |   activationIndex: number;
21 |   normalizeActivationsUsingNeuronRecord?: NodeIdAndDatasets;
22 |   passType?: PassType;
23 |   numTopTokens?: number;
24 | };
25 | 
26 | export namespace DerivedScalarsRequestSpec {
27 |   export enum specType {
28 |     DERIVED_SCALARS_REQUEST_SPEC = "derived_scalars_request_spec",
29 |   }
30 | }
31 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedScalarsResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarsResponseData } from "./DerivedScalarsResponseData";
8 | import type { InferenceAndTokenData } from "./InferenceAndTokenData";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type DerivedScalarsResponse = {
16 |   inferenceAndTokenData: InferenceAndTokenData;
17 |   derivedScalarsResponseData: DerivedScalarsResponseData;
18 | };
19 | 
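A sketch of how these request models compose. The DerivedScalarType member name is an assumption (its generated file lives elsewhere in this package), and all indices are illustrative:

import type { DerivedScalarsRequest } from "./DerivedScalarsRequest";
import { DerivedScalarType } from "./DerivedScalarType";

// Ask for post-activation MLP scalars for neuron 2342 in layer 9 on a short prompt.
const request: DerivedScalarsRequest = {
  inferenceRequestSpec: { prompt: "The capital of France is" },
  derivedScalarsRequestSpec: {
    dst: DerivedScalarType.MLP_POST_ACT, // assumed enum member
    layerIndex: 9,
    activationIndex: 2342,
  },
};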
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/DerivedScalarsResponseData.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { MirroredNodeIndex } from "./MirroredNodeIndex";
8 | import type { ProcessingResponseDataType } from "./ProcessingResponseDataType";
9 | import type { TopTokens } from "./TopTokens";
10 | 
11 | /**
12 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
13 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
14 |  * camelCase names.
15 |  */
16 | export type DerivedScalarsResponseData = {
17 |   responseDataType?: ProcessingResponseDataType;
18 |   activations: Array<number>;
19 |   normalizedActivations?: Array<number>;
20 |   nodeIndices: Array<MirroredNodeIndex>;
21 |   topTokens?: TopTokens;
22 | };
23 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/Dimension.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Dimensions correspond to the names of dimensions of activation tensors, and can depend on the input,
9 |  * the model, or e.g. parameters of added subgraphs such as autoencoders.
10 |  * The dimensions below are taken to be 'per layer' wherever applicable.
11 |  * Dimensions associated with attention heads (e.g. value channels) are taken to be 'per attention head'.
12 |  */
13 | export enum Dimension {
14 |   SEQUENCE_TOKENS = "sequence_tokens",
15 |   ATTENDED_TO_SEQUENCE_TOKENS = "attended_to_sequence_tokens",
16 |   MAX_CONTEXT_LENGTH = "max_context_length",
17 |   RESIDUAL_STREAM_CHANNELS = "residual_stream_channels",
18 |   VOCAB_SIZE = "vocab_size",
19 |   ATTN_HEADS = "attn_heads",
20 |   QUERY_AND_KEY_CHANNELS = "query_and_key_channels",
21 |   VALUE_CHANNELS = "value_channels",
22 |   MLP_ACTS = "mlp_acts",
23 |   LAYERS = "layers",
24 |   SINGLETON = "singleton",
25 |   AUTOENCODER_LATENTS = "autoencoder_latents",
26 |   AUTOENCODER_LATENTS_BY_TOKEN_PAIR = "autoencoder_latents_by_token_pair",
27 | }
28 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ExistingExplanationsRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarType } from "./DerivedScalarType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type ExistingExplanationsRequest = {
15 |   dst: DerivedScalarType;
16 |   layerIndex: number;
17 |   activationIndex: number;
18 |   explanationDatasets: Array<string>;
19 |   neuronDataset?: string;
20 | };
21 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ExplanationResult.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type ExplanationResult = {
13 |   explanations: Array<string>;
14 |   dataset: string;
15 | };
16 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/GroupId.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Identifiers for groups in multi-top-k requests.
9 |  */
10 | export enum GroupId {
11 |   ACT_TIMES_GRAD = "act_times_grad",
12 |   ACTIVATION = "activation",
13 |   DIRECT_WRITE_TO_GRAD = "direct_write_to_grad",
14 |   DIRECTION_WRITE = "direction_write",
15 |   LOGITS = "logits",
16 |   MLP_LAYER_WRITE = "mlp_layer_write",
17 |   SINGLETON = "singleton",
18 |   TOKEN_WRITE = "token_write",
19 |   TOKEN_READ = "token_read",
20 |   WRITE_NORM = "write_norm",
21 |   TOKEN_PAIR_ATTRIBUTION = "token_pair_attribution",
22 | }
23 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/HTTPValidationError.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { ValidationError } from "./ValidationError";
8 | 
9 | export type HTTPValidationError = {
10 |   detail?: Array<ValidationError>;
11 | };
12 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/InferenceAndTokenData.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type InferenceAndTokenData = {
13 |   inferenceTime: number;
14 |   memoryUsedBefore?: number;
15 |   memoryUsedAfter?: number;
16 |   log?: string;
17 |   loss?: number;
18 |   activationValueForBackwardPass?: number;
19 |   tokensAsInts: Array<number>;
20 |   tokensAsStrings: Array<string>;
21 | };
22 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/InferenceRequestSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { AblationSpec } from "./AblationSpec";
8 | import type { LossFnConfig } from "./LossFnConfig";
9 | import type { MirroredActivationIndex } from "./MirroredActivationIndex";
10 | import type { MirroredTraceConfig } from "./MirroredTraceConfig";
11 | 
12 | /**
13 |  * The minimum specification for performing a forward and/or backward pass on a model, with hooks at some set of layers.
14 |  */
15 | export type InferenceRequestSpec = {
16 |   prompt: string;
17 |   ablationSpecs?: Array<AblationSpec>;
18 |   lossFnConfig?: LossFnConfig;
19 |   traceConfig?: MirroredTraceConfig;
20 |   activationIndexForWithinLayerGrad?: MirroredActivationIndex;
21 | };
22 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/InferenceResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { InferenceAndTokenData } from "./InferenceAndTokenData";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type InferenceResponse = {
15 |   inferenceAndTokenData: InferenceAndTokenData;
16 | };
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/InferenceResponseAndResponseDict.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedAttentionScalarsResponseData } from "./DerivedAttentionScalarsResponseData";
8 | import type { DerivedScalarsResponseData } from "./DerivedScalarsResponseData";
9 | import type { InferenceResponse } from "./InferenceResponse";
10 | import type { MultipleTopKDerivedScalarsResponseData } from "./MultipleTopKDerivedScalarsResponseData";
11 | import type { ScoredTokensResponseData } from "./ScoredTokensResponseData";
12 | import type { TokenPairAttributionResponseData } from "./TokenPairAttributionResponseData";
13 | 
14 | /**
15 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
16 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
17 |  * camelCase names.
18 |  */
19 | export type InferenceResponseAndResponseDict = {
20 |   inferenceResponse: InferenceResponse;
21 |   processingResponseDataByName?: Record<
22 |     string,
23 |     | MultipleTopKDerivedScalarsResponseData
24 |     | DerivedScalarsResponseData
25 |     | DerivedAttentionScalarsResponseData
26 |     | ScoredTokensResponseData
27 |     | TokenPairAttributionResponseData
28 |   >;
29 | };
30 | 
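An InferenceRequestSpec needs only a prompt; the optional fields attach ablations and loss or trace configuration. A sketch with one hypothetical ablation (all indices are made up):

import type { InferenceRequestSpec } from "./InferenceRequestSpec";
import { ActivationLocationType } from "./ActivationLocationType";
import { PassType } from "./PassType";

// Run a forward pass with a single attention probability pinned to zero.
const spec: InferenceRequestSpec = {
  prompt: "When Mary and John went to the store, John gave a drink to",
  ablationSpecs: [
    {
      index: {
        activationLocationType: ActivationLocationType.ATTN_QK_PROBS,
        tensorIndices: [5, 14, 2], // hypothetical index layout
        layerIndex: 10,
        passType: PassType.FORWARD,
      },
      value: 0,
    },
  ],
};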
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/InferenceSubRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedAttentionScalarsRequestSpec } from "./DerivedAttentionScalarsRequestSpec";
8 | import type { DerivedScalarsRequestSpec } from "./DerivedScalarsRequestSpec";
9 | import type { InferenceRequestSpec } from "./InferenceRequestSpec";
10 | import type { MultipleTopKDerivedScalarsRequestSpec } from "./MultipleTopKDerivedScalarsRequestSpec";
11 | import type { ScoredTokensRequestSpec } from "./ScoredTokensRequestSpec";
12 | import type { TokenPairAttributionRequestSpec } from "./TokenPairAttributionRequestSpec";
13 | 
14 | /**
15 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
16 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
17 |  * camelCase names.
18 |  */
19 | export type InferenceSubRequest = {
20 |   inferenceRequestSpec: InferenceRequestSpec;
21 |   processingRequestSpecByName?: Record<
22 |     string,
23 |     | MultipleTopKDerivedScalarsRequestSpec
24 |     | DerivedScalarsRequestSpec
25 |     | DerivedAttentionScalarsRequestSpec
26 |     | ScoredTokensRequestSpec
27 |     | TokenPairAttributionRequestSpec
28 |   >;
29 | };
30 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/LossFnConfig.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { LossFnName } from "./LossFnName";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type LossFnConfig = {
15 |   name: LossFnName;
16 |   targetTokens?: Array<string>;
17 |   distractorTokens?: Array<string>;
18 | };
19 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/LossFnName.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * An enumeration.
9 |  */
10 | export enum LossFnName {
11 |   LOGIT_DIFF = "logit_diff",
12 |   LOGIT_MINUS_MEAN = "logit_minus_mean",
13 |   PROBS = "probs",
14 |   ZERO = "zero",
15 | }
16 | 
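A backward pass needs a scalar loss, which LossFnConfig selects by name. A sketch of the common logit-diff case (the token strings are illustrative):

import type { LossFnConfig } from "./LossFnConfig";
import { LossFnName } from "./LossFnName";

// Loss = logit(" Mary") - logit(" John"): positive when the model prefers the target.
const lossFnConfig: LossFnConfig = {
  name: LossFnName.LOGIT_DIFF,
  targetTokens: [" Mary"],
  distractorTokens: [" John"],
};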
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/MirroredActivationIndex.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { ActivationLocationType } from "./ActivationLocationType";
8 | import type { PassType } from "./PassType";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type MirroredActivationIndex = {
16 |   activationLocationType: ActivationLocationType;
17 |   tensorIndices: Array<number>;
18 |   layerIndex?: number;
19 |   passType: PassType;
20 | };
21 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/MirroredNodeIndex.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { NodeType } from "./NodeType";
8 | import type { PassType } from "./PassType";
9 | 
10 | /**
11 |  * This class mirrors the fields of NodeIndex without default values.
12 |  */
13 | export type MirroredNodeIndex = {
14 |   nodeType: NodeType;
15 |   tensorIndices: Array<number>;
16 |   layerIndex?: number;
17 |   passType: PassType;
18 | };
19 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/MirroredTraceConfig.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { AttentionTraceType } from "./AttentionTraceType";
8 | import type { MirroredNodeIndex } from "./MirroredNodeIndex";
9 | import type { PreOrPostAct } from "./PreOrPostAct";
10 | 
11 | /**
12 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
13 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
14 |  * camelCase names.
15 |  */
16 | export type MirroredTraceConfig = {
17 |   nodeIndex: MirroredNodeIndex;
18 |   preOrPostAct: PreOrPostAct;
19 |   detachLayerNormScale: boolean;
20 |   attentionTraceType?: AttentionTraceType;
21 |   downstreamTraceConfig?: MirroredTraceConfig;
22 | };
23 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ModelInfoResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type ModelInfoResponse = {
13 |   modelName?: string;
14 |   hasMlpAutoencoder: boolean;
15 |   mlpAutoencoderName?: string;
16 |   hasAttentionAutoencoder: boolean;
17 |   attentionAutoencoderName?: string;
18 |   nLayers: number;
19 | };
20 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/MultipleTopKDerivedScalarsRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { InferenceRequestSpec } from "./InferenceRequestSpec";
8 | import type { MultipleTopKDerivedScalarsRequestSpec } from "./MultipleTopKDerivedScalarsRequestSpec";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type MultipleTopKDerivedScalarsRequest = {
16 |   inferenceRequestSpec: InferenceRequestSpec;
17 |   multipleTopKDerivedScalarsRequestSpec: MultipleTopKDerivedScalarsRequestSpec;
18 | };
19 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/MultipleTopKDerivedScalarsRequestSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarType } from "./DerivedScalarType";
8 | import type { Dimension } from "./Dimension";
9 | import type { PassType } from "./PassType";
10 | 
11 | /**
12 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
13 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
14 |  * camelCase names.
15 |  */
16 | export type MultipleTopKDerivedScalarsRequestSpec = {
17 |   specType?: MultipleTopKDerivedScalarsRequestSpec.specType;
18 |   dstListByGroupId: Record<string, Array<DerivedScalarType>>;
19 |   tokenIndex?: number;
20 |   topAndBottomK?: number;
21 |   passType?: PassType;
22 |   dimensionsToKeepForIntermediateSum?: Array<Dimension>;
23 | };
24 | 
25 | export namespace MultipleTopKDerivedScalarsRequestSpec {
26 |   export enum specType {
27 |     MULTIPLE_TOP_K_DERIVED_SCALARS_REQUEST_SPEC = "multiple_top_k_derived_scalars_request_spec",
28 |   }
29 | }
30 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/MultipleTopKDerivedScalarsResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { InferenceAndTokenData } from "./InferenceAndTokenData";
8 | import type { MultipleTopKDerivedScalarsResponseData } from "./MultipleTopKDerivedScalarsResponseData";
9 | 
10 | /**
11 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
12 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
13 |  * camelCase names.
14 |  */
15 | export type MultipleTopKDerivedScalarsResponse = {
16 |   inferenceAndTokenData: InferenceAndTokenData;
17 |   multipleTopKDerivedScalarsResponseData: MultipleTopKDerivedScalarsResponseData;
18 | };
19 | 
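A sketch of a multi-top-k spec requesting two groups at once. The GroupId values are the convention for the record keys, and both DerivedScalarType member names here are assumptions:

import type { MultipleTopKDerivedScalarsRequestSpec } from "./MultipleTopKDerivedScalarsRequestSpec";
import { DerivedScalarType } from "./DerivedScalarType";
import { GroupId } from "./GroupId";

// Top and bottom 10 nodes by raw activation and by activation-times-gradient.
const spec: MultipleTopKDerivedScalarsRequestSpec = {
  dstListByGroupId: {
    [GroupId.ACTIVATION]: [DerivedScalarType.MLP_POST_ACT], // assumed member
    [GroupId.ACT_TIMES_GRAD]: [DerivedScalarType.MLP_ACT_TIMES_GRAD], // assumed member
  },
  topAndBottomK: 10,
};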
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/MultipleTopKDerivedScalarsResponseData.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { MirroredNodeIndex } from "./MirroredNodeIndex";
8 | import type { ProcessingResponseDataType } from "./ProcessingResponseDataType";
9 | import type { Tensor0D } from "./Tensor0D";
10 | import type { Tensor1D } from "./Tensor1D";
11 | import type { Tensor2D } from "./Tensor2D";
12 | import type { Tensor3D } from "./Tensor3D";
13 | 
14 | /**
15 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
16 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
17 |  * camelCase names.
18 |  */
19 | export type MultipleTopKDerivedScalarsResponseData = {
20 |   responseDataType?: ProcessingResponseDataType;
21 |   activationsByGroupId: Record<string, Array<number>>;
22 |   nodeIndices: Array<MirroredNodeIndex>;
23 |   vocabTokenStringsForIndices?: Array<string>;
24 |   intermediateSumActivationsByDstByGroupId: Record<
25 |     string,
26 |     Record<string, Tensor0D | Tensor1D | Tensor2D | Tensor3D>
27 |   >;
28 | };
29 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/NeuronDatasetMetadata.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type NeuronDatasetMetadata = {
13 |   shortName: string;
14 |   derivedScalarType: string;
15 |   userVisibleName: string;
16 |   neuronDatasetPath: string;
17 | };
18 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/NeuronRecordResponse.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TokenAndScalar } from "./TokenAndScalar";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type NeuronRecordResponse = {
15 |   dataset: string;
16 |   maxActivation: number;
17 |   topActivations: Array<Array<TokenAndScalar>>;
18 |   randomSample: Array<Array<TokenAndScalar>>;
19 | };
20 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/NodeAblation.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { MirroredNodeIndex } from "./MirroredNodeIndex";
8 | 
9 | /**
10 |  * A specification for ablating a node.
11 |  *
12 |  * This data structure is used by the client. The server converts it to an AblationSpec.
13 |  */
14 | export type NodeAblation = {
15 |   nodeIndex: MirroredNodeIndex;
16 |   value: number;
17 | };
18 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/NodeIdAndDatasets.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarType } from "./DerivedScalarType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type NodeIdAndDatasets = {
15 |   dst: DerivedScalarType;
16 |   layerIndex: number;
17 |   activationIndex: number;
18 |   datasets: Array<string>;
19 | };
20 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/NodeToTrace.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { AttentionTraceType } from "./AttentionTraceType";
8 | import type { MirroredNodeIndex } from "./MirroredNodeIndex";
9 | import type { MirroredTraceConfig } from "./MirroredTraceConfig";
10 | 
11 | /**
12 |  * A specification for tracing a node.
13 |  *
14 |  * This data structure is used by the client. The server converts it to an activation index and
15 |  * an ablation spec.
16 |  *
17 |  * In the case of tracing through attention value, there can be up to two NodeToTrace
18 |  * objects: one upstream and one downstream. First, a gradient is computed with respect to the
19 |  * downstream node. Then, the direct effect of the upstream (attention) node on that downstream
20 |  * node is computed. Then, the gradient is computed with respect to that direct effect, propagated
21 |  * through V.
22 |  */
23 | export type NodeToTrace = {
24 |   nodeIndex: MirroredNodeIndex;
25 |   attentionTraceType?: AttentionTraceType;
26 |   downstreamTraceConfig?: MirroredTraceConfig;
27 | };
28 | 
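Following the comment above, tracing through attention V starts from an upstream attention node. A minimal sketch of that upstream half (all indices, and the tensorIndices layout, are made up for illustration):

import type { NodeToTrace } from "./NodeToTrace";
import { AttentionTraceType } from "./AttentionTraceType";
import { NodeType } from "./NodeType";
import { PassType } from "./PassType";

// Trace attention head 3 in layer 5 through its value pathway.
const upstream: NodeToTrace = {
  nodeIndex: {
    nodeType: NodeType.ATTENTION_HEAD,
    tensorIndices: [10, 2, 3], // hypothetical layout: [query token, key token, head]
    layerIndex: 5,
    passType: PassType.FORWARD,
  },
  attentionTraceType: AttentionTraceType.V,
};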
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/NodeType.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * A "node" is defined as a model component associated with a scalar activation per
9 |  * token or per token pair. The canonical example is an MLP neuron. An activation
10 |  * for which the NodeType is defined has the node as the last dimension of the
11 |  * activation tensor.
12 |  */
13 | export enum NodeType {
14 |   ATTENTION_HEAD = "attention_head",
15 |   QK_CHANNEL = "qk_channel",
16 |   V_CHANNEL = "v_channel",
17 |   MLP_NEURON = "mlp_neuron",
18 |   AUTOENCODER_LATENT = "autoencoder_latent",
19 |   MLP_AUTOENCODER_LATENT = "mlp_autoencoder_latent",
20 |   ATTENTION_AUTOENCODER_LATENT = "attention_autoencoder_latent",
21 |   AUTOENCODER_LATENT_BY_TOKEN_PAIR = "autoencoder_latent_by_token_pair",
22 |   LAYER = "layer",
23 |   RESIDUAL_STREAM_CHANNEL = "residual_stream_channel",
24 |   VOCAB_TOKEN = "vocab_token",
25 | }
26 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/PassType.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * An enumeration.
9 |  */
10 | export enum PassType {
11 |   FORWARD = "forward",
12 |   BACKWARD = "backward",
13 | }
14 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/PreOrPostAct.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Specifies whether to trace from pre- or post-nonlinearity
9 |  */
10 | export enum PreOrPostAct {
11 |   PRE = "pre",
12 |   POST = "post",
13 | }
14 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ProcessingResponseDataType.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * An enumeration.
9 |  */
10 | export enum ProcessingResponseDataType {
11 |   DERIVED_SCALARS_RESPONSE_DATA = "derived_scalars_response_data",
12 |   DERIVED_ATTENTION_SCALARS_RESPONSE_DATA = "derived_attention_scalars_response_data",
13 |   MULTIPLE_TOP_K_DERIVED_SCALARS_RESPONSE_DATA = "multiple_top_k_derived_scalars_response_data",
14 |   SCORED_TOKENS_RESPONSE_DATA = "scored_tokens_response_data",
15 |   TOKEN_PAIR_ATTRIBUTION_RESPONSE_DATA = "token_pair_attribution_response_data",
16 | }
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ScoreRequest.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { DerivedScalarType } from "./DerivedScalarType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type ScoreRequest = {
15 |   dst: DerivedScalarType;
16 |   layerIndex: number;
17 |   activationIndex: number;
18 |   datasets: Array<string>;
19 |   explanation: string;
20 |   maxSequences?: number;
21 | };
22 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ScoreResult.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type ScoreResult = {
13 |   score: number;
14 |   datasetPath: string;
15 | };
16 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ScoredTokensRequestSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TokenScoringType } from "./TokenScoringType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type ScoredTokensRequestSpec = {
15 |   specType?: ScoredTokensRequestSpec.specType;
16 |   tokenScoringType: TokenScoringType;
17 |   numTokens: number;
18 |   dependsOnSpecName: string;
19 | };
20 | 
21 | export namespace ScoredTokensRequestSpec {
22 |   export enum specType {
23 |     SCORED_TOKENS_REQUEST_SPEC = "scored_tokens_request_spec",
24 |   }
25 | }
26 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ScoredTokensResponseData.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { MirroredNodeIndex } from "./MirroredNodeIndex";
8 | import type { ProcessingResponseDataType } from "./ProcessingResponseDataType";
9 | import type { TopTokens } from "./TopTokens";
10 | 
11 | /**
12 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
13 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
14 |  * camelCase names.
15 |  */
16 | export type ScoredTokensResponseData = {
17 |   responseDataType?: ProcessingResponseDataType;
18 |   nodeIndices: Array<MirroredNodeIndex>;
19 |   topTokensList: Array<TopTokens>;
20 | };
21 | 
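A ScoreRequest bundles what the scorer needs to evaluate one explanation against stored activations. A sketch (the DerivedScalarType member is an assumption; the dataset path is a placeholder):

import type { ScoreRequest } from "./ScoreRequest";
import { DerivedScalarType } from "./DerivedScalarType";

const scoreRequest: ScoreRequest = {
  dst: DerivedScalarType.MLP_POST_ACT, // assumed enum member
  layerIndex: 9,
  activationIndex: 2342,
  datasets: ["https://example.com/path/to/neuron/dataset"], // placeholder path
  explanation: "fires on words related to France",
  maxSequences: 5,
};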
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TdbRequestSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { ComponentTypeForAttention } from "./ComponentTypeForAttention";
8 | import type { ComponentTypeForMlp } from "./ComponentTypeForMlp";
9 | import type { NodeAblation } from "./NodeAblation";
10 | import type { NodeToTrace } from "./NodeToTrace";
11 | 
12 | /**
13 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
14 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
15 |  * camelCase names.
16 |  */
17 | export type TdbRequestSpec = {
18 |   specType?: TdbRequestSpec.specType;
19 |   prompt: string;
20 |   targetTokens: Array<string>;
21 |   distractorTokens: Array<string>;
22 |   componentTypeForMlp: ComponentTypeForMlp;
23 |   componentTypeForAttention: ComponentTypeForAttention;
24 |   topAndBottomKForNodeTable: number;
25 |   hideEarlyLayersWhenAblating: boolean;
26 |   nodeAblations?: Array<NodeAblation>;
27 |   upstreamNodeToTrace?: NodeToTrace;
28 |   downstreamNodeToTrace?: NodeToTrace;
29 | };
30 | 
31 | export namespace TdbRequestSpec {
32 |   export enum specType {
33 |     TDB_REQUEST_SPEC = "tdb_request_spec",
34 |   }
35 | }
36 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/Tensor0D.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TensorType } from "./TensorType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type Tensor0D = {
15 |   tensorType?: TensorType;
16 |   value: number;
17 | };
18 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/Tensor1D.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TensorType } from "./TensorType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type Tensor1D = {
15 |   tensorType?: TensorType;
16 |   value: Array<number>;
17 | };
18 | 
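TdbRequestSpec is the top-level payload a Transformer Debugger sub-request carries (see BatchedTdbRequest above). A minimal sketch with no ablations or tracing (the prompt and tokens are illustrative):

import type { TdbRequestSpec } from "./TdbRequestSpec";
import { ComponentTypeForAttention } from "./ComponentTypeForAttention";
import { ComponentTypeForMlp } from "./ComponentTypeForMlp";

const tdbRequest: TdbRequestSpec = {
  prompt: "When Mary and John went to the store, John gave a drink to",
  targetTokens: [" Mary"],
  distractorTokens: [" John"],
  componentTypeForMlp: ComponentTypeForMlp.NEURON,
  componentTypeForAttention: ComponentTypeForAttention.ATTENTION_HEAD,
  topAndBottomKForNodeTable: 10,
  hideEarlyLayersWhenAblating: false,
};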
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/Tensor2D.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TensorType } from "./TensorType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type Tensor2D = {
15 |   tensorType?: TensorType;
16 |   value: Array<Array<number>>;
17 | };
18 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/Tensor3D.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { TensorType } from "./TensorType";
8 | 
9 | /**
10 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
11 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
12 |  * camelCase names.
13 |  */
14 | export type Tensor3D = {
15 |   tensorType?: TensorType;
16 |   value: Array<Array<Array<number>>>;
17 | };
18 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TensorType.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * An enumeration.
9 |  */
10 | export enum TensorType {
11 |   TENSOR_0D = "tensor_0d",
12 |   TENSOR_1D = "tensor_1d",
13 |   TENSOR_2D = "tensor_2d",
14 |   TENSOR_3D = "tensor_3d",
15 | }
16 | 
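The four tensor models travel as a union discriminated by tensorType; because the tag is optional, handlers typically switch on it and cast. A sketch:

import type { Tensor0D } from "./Tensor0D";
import type { Tensor1D } from "./Tensor1D";
import type { Tensor2D } from "./Tensor2D";
import type { Tensor3D } from "./Tensor3D";
import { TensorType } from "./TensorType";

type AnyTensor = Tensor0D | Tensor1D | Tensor2D | Tensor3D;

// Sum every element regardless of rank.
function tensorSum(t: AnyTensor): number {
  switch (t.tensorType) {
    case TensorType.TENSOR_0D:
      return (t as Tensor0D).value;
    case TensorType.TENSOR_1D:
      return (t as Tensor1D).value.reduce((a, b) => a + b, 0);
    case TensorType.TENSOR_2D:
      return (t as Tensor2D).value.flat().reduce((a, b) => a + b, 0);
    case TensorType.TENSOR_3D:
      return (t as Tensor3D).value.flat(2).reduce((a, b) => a + b, 0);
    default:
      return 0; // tag absent: nothing to narrow on
  }
}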
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TokenAndAttentionScalars.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type TokenAndAttentionScalars = {
13 |   token: string;
14 |   scalars: Array<number>;
15 |   normalizedScalars: Array<number>;
16 |   totalScalarIn: number;
17 |   normalizedTotalScalarIn: number;
18 |   maxScalarIn: number;
19 |   normalizedMaxScalarIn: number;
20 |   totalScalarOut: number;
21 |   normalizedTotalScalarOut: number;
22 |   maxScalarOut: number;
23 |   normalizedMaxScalarOut: number;
24 | };
25 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TokenAndScalar.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type TokenAndScalar = {
13 |   token: string;
14 |   scalar: number;
15 |   normalizedScalar: number;
16 | };
17 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TokenPairAttributionRequestSpec.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type TokenPairAttributionRequestSpec = {
13 |   specType?: TokenPairAttributionRequestSpec.specType;
14 |   numTokensAttendedTo: number;
15 |   dependsOnSpecName: string;
16 | };
17 | 
18 | export namespace TokenPairAttributionRequestSpec {
19 |   export enum specType {
20 |     TOKEN_PAIR_ATTRIBUTION_REQUEST_SPEC = "token_pair_attribution_request_spec",
21 |   }
22 | }
23 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TokenPairAttributionResponseData.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | import type { MirroredNodeIndex } from "./MirroredNodeIndex";
8 | import type { ProcessingResponseDataType } from "./ProcessingResponseDataType";
9 | import type { TopTokensAttendedTo } from "./TopTokensAttendedTo";
10 | 
11 | /**
12 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
13 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
14 |  * camelCase names.
15 |  */
16 | export type TokenPairAttributionResponseData = {
17 |   responseDataType?: ProcessingResponseDataType;
18 |   nodeIndices: Array<MirroredNodeIndex>;
19 |   topTokensAttendedToList: Array<TopTokensAttendedTo>;
20 | };
21 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TokenScoringType.ts:
--------------------------------------------------------------------------------
1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
2 | 
3 | /* istanbul ignore file */
4 | /* tslint:disable */
5 | /* eslint-disable */
6 | 
7 | /**
8 |  * Methods by which vocab tokens may be scored.
9 |  */
10 | export enum TokenScoringType {
11 |   UPVOTED_OUTPUT_TOKENS = "upvoted_output_tokens",
12 |   INPUT_TOKENS_THAT_UPVOTE_MLP = "input_tokens_that_upvote_mlp",
13 |   INPUT_TOKENS_THAT_UPVOTE_ATTN_Q = "input_tokens_that_upvote_attn_q",
14 |   INPUT_TOKENS_THAT_UPVOTE_ATTN_K = "input_tokens_that_upvote_attn_k",
15 | }
16 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TokenScoringType.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | 
 7 | /**
 8 |  * Methods by which vocab tokens may be scored.
 9 |  */
10 | export enum TokenScoringType {
11 |   UPVOTED_OUTPUT_TOKENS = "upvoted_output_tokens",
12 |   INPUT_TOKENS_THAT_UPVOTE_MLP = "input_tokens_that_upvote_mlp",
13 |   INPUT_TOKENS_THAT_UPVOTE_ATTN_Q = "input_tokens_that_upvote_attn_q",
14 |   INPUT_TOKENS_THAT_UPVOTE_ATTN_K = "input_tokens_that_upvote_attn_k",
15 | }
16 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TopTokens.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | 
 7 | import type { TokenAndScalar } from "./TokenAndScalar";
 8 | 
 9 | /**
10 |  * Contains two lists of tokens and associated scalars: one for the highest-scoring tokens and one
11 |  * for the lowest-scoring tokens, according to some way of scoring tokens. For example, this could
12 |  * be used to represent the top upvoted and downvoted "logit lens" tokens. An instance of this
13 |  * class is scoped to a single node. The set of tokens eligible for scoring is typically just the
14 |  * model's entire vocabulary. Each list is sorted from largest to smallest absolute value for the
15 |  * associated scalar.
16 |  */
17 | export type TopTokens = {
18 |   top: Array<TokenAndScalar>;
19 |   bottom: Array<TokenAndScalar>;
20 | };
21 | 
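A made-up instance illustrating the sorting convention described in the comment above: each list is ordered by descending absolute value of the scalar:

```ts
import type { TopTokens } from "./TopTokens";

// Illustrative "logit lens"-style data for a single node.
const example: TopTokens = {
  top: [
    { token: " cat", scalar: 4.2, normalizedScalar: 1.0 },
    { token: " dog", scalar: 3.1, normalizedScalar: 0.74 },
  ],
  bottom: [
    { token: " the", scalar: -3.9, normalizedScalar: -0.93 },
    { token: " a", scalar: -2.0, normalizedScalar: -0.48 },
  ],
};
```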
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/TopTokensAttendedTo.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | 
 7 | /**
 8 |  * Base model that will automatically generate camelCase aliases for fields. Python code can use
 9 |  * either snake_case or camelCase names. When Typescript code is generated, it will only use the
10 |  * camelCase names.
11 |  */
12 | export type TopTokensAttendedTo = {
13 |   tokenIndices: Array<number>;
14 |   attributions: Array<number>;
15 | };
16 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/models/ValidationError.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | 
 7 | export type ValidationError = {
 8 |   loc: Array<string | number>;
 9 |   msg: string;
10 |   type: string;
11 | };
12 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/services/ExplainerService.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | import type { ExplanationResult } from "../models/ExplanationResult";
 7 | import type { NodeIdAndDatasets } from "../models/NodeIdAndDatasets";
 8 | import type { ScoreRequest } from "../models/ScoreRequest";
 9 | import type { ScoreResult } from "../models/ScoreResult";
10 | 
11 | import type { CancelablePromise } from "../core/CancelablePromise";
12 | import { OpenAPI } from "../core/OpenAPI";
13 | import { request as __request } from "../core/request";
14 | 
15 | export class ExplainerService {
16 |   /**
17 |    * Explain
18 |    * @param requestBody
19 |    * @returns ExplanationResult Successful Response
20 |    * @throws ApiError
21 |    */
22 |   public static explainerExplain(
23 |     requestBody: NodeIdAndDatasets
24 |   ): CancelablePromise<ExplanationResult> {
25 |     return __request(OpenAPI, {
26 |       method: "POST",
27 |       url: "/explain",
28 |       body: requestBody,
29 |       mediaType: "application/json",
30 |       errors: {
31 |         422: `Validation Error`,
32 |       },
33 |     });
34 |   }
35 | 
36 |   /**
37 |    * Score
38 |    * @param requestBody
39 |    * @returns ScoreResult Successful Response
40 |    * @throws ApiError
41 |    */
42 |   public static explainerScore(requestBody: ScoreRequest): CancelablePromise<ScoreResult> {
43 |     return __request(OpenAPI, {
44 |       method: "POST",
45 |       url: "/score",
46 |       body: requestBody,
47 |       mediaType: "application/json",
48 |       errors: {
49 |         422: `Validation Error`,
50 |       },
51 |     });
52 |   }
53 | }
54 | 
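A hedged usage sketch (not code from this repo): every generated service, including the HelloWorldService, InferenceService, MemoryService, and ReadService that follow, exposes the same static-method pattern. Assuming `OpenAPI.BASE` points at a running activation server (the address below is an assumption), a call looks like this; the request object itself is built elsewhere:

```ts
import type { NodeIdAndDatasets } from "../models/NodeIdAndDatasets";
import { OpenAPI } from "../core/OpenAPI";
import { ExplainerService } from "./ExplainerService";

// Assumed server address; adjust to wherever the activation server is running.
OpenAPI.BASE = "http://localhost:8000";

async function explainNode(request: NodeIdAndDatasets) {
  // Resolves to an ExplanationResult on success; rejects with an ApiError
  // (e.g. a 422 validation error) otherwise.
  return await ExplainerService.explainerExplain(request);
}
```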
--------------------------------------------------------------------------------
/neuron_viewer/src/client/services/HelloWorldService.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | import type { GroupId } from "../models/GroupId";
 7 | 
 8 | import type { CancelablePromise } from "../core/CancelablePromise";
 9 | import { OpenAPI } from "../core/OpenAPI";
10 | import { request as __request } from "../core/request";
11 | 
12 | export class HelloWorldService {
13 |   /**
14 |    * Read Root
15 |    * @returns string Successful Response
16 |    * @throws ApiError
17 |    */
18 |   public static helloWorldReadRoot(): CancelablePromise<Record<string, string>> {
19 |     return __request(OpenAPI, {
20 |       method: "GET",
21 |       url: "/",
22 |     });
23 |   }
24 | 
25 |   /**
26 |    * Force Client Code Generation
27 |    * @param groupId
28 |    * @returns any Successful Response
29 |    * @throws ApiError
30 |    */
31 |   public static helloWorldForceClientCodeGeneration(groupId: GroupId): CancelablePromise<any> {
32 |     return __request(OpenAPI, {
33 |       method: "GET",
34 |       url: "/force_client_code_generation",
35 |       query: {
36 |         group_id: groupId,
37 |       },
38 |       errors: {
39 |         422: `Validation Error`,
40 |       },
41 |     });
42 |   }
43 | }
44 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/services/InferenceService.ts:
--------------------------------------------------------------------------------
  1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
  2 | 
  3 | /* istanbul ignore file */
  4 | /* tslint:disable */
  5 | /* eslint-disable */
  6 | import type { BatchedRequest } from "../models/BatchedRequest";
  7 | import type { BatchedResponse } from "../models/BatchedResponse";
  8 | import type { BatchedTdbRequest } from "../models/BatchedTdbRequest";
  9 | import type { DerivedAttentionScalarsRequest } from "../models/DerivedAttentionScalarsRequest";
 10 | import type { DerivedAttentionScalarsResponse } from "../models/DerivedAttentionScalarsResponse";
 11 | import type { DerivedScalarsRequest } from "../models/DerivedScalarsRequest";
 12 | import type { DerivedScalarsResponse } from "../models/DerivedScalarsResponse";
 13 | import type { ModelInfoResponse } from "../models/ModelInfoResponse";
 14 | import type { MultipleTopKDerivedScalarsRequest } from "../models/MultipleTopKDerivedScalarsRequest";
 15 | import type { MultipleTopKDerivedScalarsResponse } from "../models/MultipleTopKDerivedScalarsResponse";
 16 | 
 17 | import type { CancelablePromise } from "../core/CancelablePromise";
 18 | import { OpenAPI } from "../core/OpenAPI";
 19 | import { request as __request } from "../core/request";
 20 | 
 21 | export class InferenceService {
 22 |   /**
 23 |    * Derived Scalars
 24 |    * @param requestBody
 25 |    * @returns DerivedScalarsResponse Successful Response
 26 |    * @throws ApiError
 27 |    */
 28 |   public static inferenceDerivedScalars(
 29 |     requestBody: DerivedScalarsRequest
 30 |   ): CancelablePromise<DerivedScalarsResponse> {
 31 |     return __request(OpenAPI, {
 32 |       method: "POST",
 33 |       url: "/derived_scalars",
 34 |       body: requestBody,
 35 |       mediaType: "application/json",
 36 |       errors: {
 37 |         422: `Validation Error`,
 38 |       },
 39 |     });
 40 |   }
 41 | 
 42 |   /**
 43 |    * Derived Attention Scalars
 44 |    * @param requestBody
 45 |    * @returns DerivedAttentionScalarsResponse Successful Response
 46 |    * @throws ApiError
 47 |    */
 48 |   public static inferenceDerivedAttentionScalars(
 49 |     requestBody: DerivedAttentionScalarsRequest
 50 |   ): CancelablePromise<DerivedAttentionScalarsResponse> {
 51 |     return __request(OpenAPI, {
 52 |       method: "POST",
 53 |       url: "/derived_attention_scalars",
 54 |       body: requestBody,
 55 |       mediaType: "application/json",
 56 |       errors: {
 57 |         422: `Validation Error`,
 58 |       },
 59 |     });
 60 |   }
 61 | 
 62 |   /**
 63 |    * Multiple Top K Derived Scalars
 64 |    * @param requestBody
 65 |    * @returns MultipleTopKDerivedScalarsResponse Successful Response
 66 |    * @throws ApiError
 67 |    */
 68 |   public static inferenceMultipleTopKDerivedScalars(
 69 |     requestBody: MultipleTopKDerivedScalarsRequest
 70 |   ): CancelablePromise<MultipleTopKDerivedScalarsResponse> {
 71 |     return __request(OpenAPI, {
 72 |       method: "POST",
 73 |       url: "/multiple_top_k_derived_scalars",
 74 |       body: requestBody,
 75 |       mediaType: "application/json",
 76 |       errors: {
 77 |         422: `Validation Error`,
 78 |       },
 79 |     });
 80 |   }
 81 | 
 82 |   /**
 83 |    * Batched
 84 |    * @param requestBody
 85 |    * @returns BatchedResponse Successful Response
 86 |    * @throws ApiError
 87 |    */
 88 |   public static inferenceBatched(requestBody: BatchedRequest): CancelablePromise<BatchedResponse> {
 89 |     return __request(OpenAPI, {
 90 |       method: "POST",
 91 |       url: "/batched",
 92 |       body: requestBody,
 93 |       mediaType: "application/json",
 94 |       errors: {
 95 |         422: `Validation Error`,
 96 |       },
 97 |     });
 98 |   }
 99 | 
100 |   /**
101 |    * Batched Tdb
102 |    * @param requestBody
103 |    * @returns BatchedResponse Successful Response
104 |    * @throws ApiError
105 |    */
106 |   public static inferenceBatchedTdb(
107 |     requestBody: BatchedTdbRequest
108 |   ): CancelablePromise<BatchedResponse> {
109 |     return __request(OpenAPI, {
110 |       method: "POST",
111 |       url: "/batched_tdb",
112 |       body: requestBody,
113 |       mediaType: "application/json",
114 |       errors: {
115 |         422: `Validation Error`,
116 |       },
117 |     });
118 |   }
119 | 
120 |   /**
121 |    * Model Info
122 |    * @returns ModelInfoResponse Successful Response
123 |    * @throws ApiError
124 |    */
125 |   public static inferenceModelInfo(): CancelablePromise<ModelInfoResponse> {
126 |     return __request(OpenAPI, {
127 |       method: "POST",
128 |       url: "/model_info",
129 |     });
130 |   }
131 | }
132 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/services/MemoryService.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | import type { CancelablePromise } from "../core/CancelablePromise";
 7 | import { OpenAPI } from "../core/OpenAPI";
 8 | import { request as __request } from "../core/request";
 9 | 
10 | export class MemoryService {
11 |   /**
12 |    * Dump Memory Snapshot
13 |    * @returns string Successful Response
14 |    * @throws ApiError
15 |    */
16 |   public static memoryDumpMemorySnapshot(): CancelablePromise<string> {
17 |     return __request(OpenAPI, {
18 |       method: "GET",
19 |       url: "/dump_memory_snapshot",
20 |     });
21 |   }
22 | }
23 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/client/services/ReadService.ts:
--------------------------------------------------------------------------------
 1 | // Auto-generated code. Do not edit! See neuron_explainer/activation_server/README.md to learn how to regenerate it.
 2 | 
 3 | /* istanbul ignore file */
 4 | /* tslint:disable */
 5 | /* eslint-disable */
 6 | import type { AttentionHeadRecordResponse } from "../models/AttentionHeadRecordResponse";
 7 | import type { AttributedScoredExplanation } from "../models/AttributedScoredExplanation";
 8 | import type { ExistingExplanationsRequest } from "../models/ExistingExplanationsRequest";
 9 | import type { NeuronDatasetMetadata } from "../models/NeuronDatasetMetadata";
10 | import type { NeuronRecordResponse } from "../models/NeuronRecordResponse";
11 | import type { NodeIdAndDatasets } from "../models/NodeIdAndDatasets";
12 | 
13 | import type { CancelablePromise } from "../core/CancelablePromise";
14 | import { OpenAPI } from "../core/OpenAPI";
15 | import { request as __request } from "../core/request";
16 | 
17 | export class ReadService {
18 |   /**
19 |    * Existing Explanations
20 |    * @param requestBody
21 |    * @returns AttributedScoredExplanation Successful Response
22 |    * @throws ApiError
23 |    */
24 |   public static readExistingExplanations(
25 |     requestBody: ExistingExplanationsRequest
26 |   ): CancelablePromise<Array<AttributedScoredExplanation>> {
27 |     return __request(OpenAPI, {
28 |       method: "POST",
29 |       url: "/existing_explanations",
30 |       body: requestBody,
31 |       mediaType: "application/json",
32 |       errors: {
33 |         422: `Validation Error`,
34 |       },
35 |     });
36 |   }
37 | 
38 |   /**
39 |    * Neuron Record
40 |    * @param requestBody
41 |    * @returns NeuronRecordResponse Successful Response
42 |    * @throws ApiError
43 |    */
44 |   public static readNeuronRecord(
45 |     requestBody: NodeIdAndDatasets
46 |   ): CancelablePromise<NeuronRecordResponse> {
47 |     return __request(OpenAPI, {
48 |       method: "POST",
49 |       url: "/neuron_record",
50 |       body: requestBody,
51 |       mediaType: "application/json",
52 |       errors: {
53 |         422: `Validation Error`,
54 |       },
55 |     });
56 |   }
57 | 
58 |   /**
59 |    * Attention Head Record
60 |    * @param requestBody
61 |    * @returns AttentionHeadRecordResponse Successful Response
62 |    * @throws ApiError
63 |    */
64 |   public static readAttentionHeadRecord(
65 |     requestBody: NodeIdAndDatasets
66 |   ): CancelablePromise<AttentionHeadRecordResponse> {
67 |     return __request(OpenAPI, {
68 |       method: "POST",
69 |       url: "/attention_head_record",
70 |       body: requestBody,
71 |       mediaType: "application/json",
72 |       errors: {
73 |         422: `Validation Error`,
74 |       },
75 |     });
76 |   }
77 | 
78 |   /**
79 |    * Neuron Datasets Metadata
80 |    * @returns NeuronDatasetMetadata Successful Response
81 |    * @throws ApiError
82 |    */
83 |   public static readNeuronDatasetsMetadata(): CancelablePromise<Array<NeuronDatasetMetadata>> {
84 |     return __request(OpenAPI, {
85 |       method: "POST",
86 |       url: "/neuron_datasets_metadata",
87 |     });
88 |   }
89 | }
90 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/colors.ts:
--------------------------------------------------------------------------------
 1 | export type Color = { r: number; g: number; b: number };
 2 | 
 3 | export function interpolateColor(colorLeft: Color, colorRight: Color, value: number): Color {
 4 |   const color = {
 5 |     r: Math.round(colorLeft.r + (colorRight.r - colorLeft.r) * value),
 6 |     g: Math.round(colorLeft.g + (colorRight.g - colorLeft.g) * value),
 7 |     b: Math.round(colorLeft.b + (colorRight.b - colorLeft.b) * value),
 8 |   };
 9 |   return color;
10 | }
11 | 
12 | export function getInterpolatedColor(colors: Color[], boundaries: number[], value: number): Color {
13 |   const index = boundaries.findIndex((boundary) => boundary >= value);
14 |   const colorIndex = Math.max(0, index - 1);
15 |   const colorLeft = colors[colorIndex];
16 |   const colorRight = colors[colorIndex + 1];
17 |   const boundaryLeft = boundaries[colorIndex];
18 |   const boundaryRight = boundaries[colorIndex + 1];
19 |   const ratio = (value - boundaryLeft) / (boundaryRight - boundaryLeft);
20 |   const color = interpolateColor(colorLeft, colorRight, ratio);
21 |   return color;
22 | }
23 | 
24 | export const BLANK_COLOR: Color = { r: 255, g: 255, b: 255 }; // white
25 | export const MAX_OUT_COLOR: Color = { r: 0, g: 255, b: 255 }; // cyan
26 | export const MAX_IN_COLOR: Color = { r: 255, g: 0, b: 255 }; // magenta
27 | 
28 | export function subtractiveMix(color1: Color, color2: Color) {
29 |   // Invert the colors
30 |   let inverted1 = { r: 255 - color1.r, g: 255 - color1.g, b: 255 - color1.b };
31 |   let inverted2 = { r: 255 - color2.r, g: 255 - color2.g, b: 255 - color2.b };
32 | 
33 |   // Mix them additively
34 |   let mixed = {
35 |     r: Math.min(inverted1.r + inverted2.r, 255),
36 |     g: Math.min(inverted1.g + inverted2.g, 255),
37 |     b: Math.min(inverted1.b + inverted2.b, 255),
38 |   };
39 | 
40 |   // Invert the result
41 |   return { r: 255 - mixed.r, g: 255 - mixed.g, b: 255 - mixed.b };
42 | }
43 | 
44 | export const DEFAULT_BOUNDARIES = [0, 1];
45 | 
46 | export const DEFAULT_COLORS: Color[] = [
47 |   { r: 255, g: 255, b: 255 },
48 |   { r: 0, g: 255, b: 0 },
49 | ];
50 | 
51 | export const POSITIVE_NEGATIVE_COLORS: Color[] = [
52 |   { r: 255, g: 0, b: 105 },
53 |   { r: 255, g: 255, b: 255 },
54 |   { r: 0, g: 255, b: 0 },
55 | ];
56 | 
57 | export const POSITIVE_NEGATIVE_BOUNDARIES = [0, 0.5, 1];
58 | 
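A small worked example of the interpolation helpers above: with `POSITIVE_NEGATIVE_COLORS` and boundaries `[0, 0.5, 1]`, the value 0.75 falls halfway through the white-to-green segment:

```ts
import {
  getInterpolatedColor,
  POSITIVE_NEGATIVE_BOUNDARIES,
  POSITIVE_NEGATIVE_COLORS,
} from "./colors";

// 0.75 is midway between the 0.5 and 1 boundaries, so the result is the
// midpoint of white and green: { r: 128, g: 255, b: 128 }.
const color = getInterpolatedColor(POSITIVE_NEGATIVE_COLORS, POSITIVE_NEGATIVE_BOUNDARIES, 0.75);
```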

--------------------------------------------------------------------------------
/neuron_viewer/src/commonUiComponents.tsx:
--------------------------------------------------------------------------------
 1 | import { ReactNode } from "react";
 2 | 
 3 | export const SectionTitle = ({ children }: { children: ReactNode }) => {
 4 |   return <div className="text-xl font-bold">{children}</div>;
 5 | };
 6 | 
 7 | export const defaultSmallButtonClasses =
 8 |   "text-black no-underline text-base border-black border font-sans bg-white font-small inline-block rounded " +
 9 |   "transition-all duration-200 ease-in-out hover:bg-gray-100 disabled:bg-gray-300 disabled:cursor-not-allowed px-1 py-0";
10 | 
11 | export const ShowAllOrFewerButton = ({
12 |   showAll,
13 |   setShowAll,
14 | }: {
15 |   showAll: boolean;
16 |   setShowAll: (showAll: boolean) => void;
17 | }) => {
18 |   return (
19 |     <button className={defaultSmallButtonClasses} onClick={() => setShowAll(!showAll)}>
20 |       {showAll ? "Show fewer" : "Show all"}
21 |     </button>
22 |   );
23 | };
24 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/heatmapGrid.tsx:
--------------------------------------------------------------------------------
 1 | import { TokenSequenceAndScalars } from "./types";
 2 | import TokenHeatmap from "./tokenHeatmap";
 3 | 
 4 | export type HeatmapGridProps = {
 5 |   tokenSequences: TokenSequenceAndScalars[] | null;
 6 |   expectedNumSequences: number;
 7 | };
 8 | 
 9 | const HeatmapGrid: React.FC<HeatmapGridProps> = ({ tokenSequences, expectedNumSequences }) => {
10 |   if (tokenSequences === null) {
11 |     // No tokens specified means that we're rendering a skeleton without any content in it. The
12 |     // width and minHeight specified below ensure that the skeleton is the same size as the actual
13 |     // heatmap grid. We specify an array of nulls here, which the TokenHeatmap component will
14 |     // handle gracefully.
15 |     tokenSequences = new Array(expectedNumSequences).fill(null);
16 |   }
17 |   return (
18 |     <div>
19 |       <div className="grid grid-cols-2 gap-2">
20 |         {tokenSequences.map((tokenSequence, i) => (
21 |           <div
22 |             key={i}
23 |             className="mb-4"
24 |             style={{ width: "450px", minHeight: "150px" }}
25 |           >
26 |             <TokenHeatmap tokenSequence={tokenSequence} />
27 |           </div>
28 |         ))}
29 |       </div>
30 |     </div>
31 |   );
32 | };
33 | 
34 | export default HeatmapGrid;
35 | 
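The null-skeleton convention used by HeatmapGrid (and by HeatmapGrid2d below) can be driven like this; the prop values here are placeholders:

```tsx
import HeatmapGrid from "./heatmapGrid";

// While data is still loading, pass null plus the expected count; the grid then
// renders equally sized placeholder cells so the layout doesn't shift on load.
export const LoadingState = () => (
  <HeatmapGrid tokenSequences={null} expectedNumSequences={6} />
);
```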
--------------------------------------------------------------------------------
/neuron_viewer/src/heatmapGrid2d.tsx:
--------------------------------------------------------------------------------
 1 | import TokenHeatmap2d from "./tokenHeatmap2d";
 2 | import { TokenSequenceAndAttentionScalars } from "./types";
 3 | 
 4 | export type HeatmapGrid2dProps = {
 5 |   tokenSequenceAndAttentionScalars: TokenSequenceAndAttentionScalars[] | null;
 6 |   expectedNumSequences: number;
 7 | };
 8 | 
 9 | const HeatmapGrid2d: React.FC<HeatmapGrid2dProps> = ({
10 |   tokenSequenceAndAttentionScalars,
11 |   expectedNumSequences,
12 | }) => {
13 |   console.log("in HeatmapGrid2d");
14 |   if (tokenSequenceAndAttentionScalars === null) {
15 |     // No tokens specified means that we're rendering a skeleton without any content in it. The
16 |     // width and minHeight specified below ensure that the skeleton is the same size as the actual
17 |     // heatmap grid. We specify an array of nulls here, which the TokenHeatmap component will
18 |     // handle gracefully.
19 |     tokenSequenceAndAttentionScalars = new Array(expectedNumSequences).fill(null);
20 |   }
21 |   return (
22 |     <div>
23 |       <div className="grid grid-cols-2 gap-2">
24 |         {tokenSequenceAndAttentionScalars.map((tokenSequenceAndAttentionScalars, i) => (
25 |           <div
26 |             key={i}
27 |             className="mb-4"
28 |             style={{ width: "450px", minHeight: "150px" }}
29 |           >
30 |             <TokenHeatmap2d tokenSequenceAndAttentionScalars={tokenSequenceAndAttentionScalars} />
31 |           </div>
32 |         ))}
33 |       </div>
34 |     </div>
35 |   );
36 | };
37 | 
38 | export default HeatmapGrid2d;
39 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/images.d.ts:
--------------------------------------------------------------------------------
1 | declare module "*.png" {
2 |   const value: any;
3 |   export = value;
4 | }
5 | 
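This ambient declaration is what lets the TypeScript components import PNG assets directly. A sketch using the swap.png that ships in the tree:

```ts
// Without images.d.ts this import would fail to type-check; with it, the
// imported value is the bundled asset reference (typed as `any`).
import swapIcon from "./TransformerDebugger/cards/prompt/swap.png";

console.log(swapIcon);
```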
--------------------------------------------------------------------------------
/neuron_viewer/src/index.css:
--------------------------------------------------------------------------------
 1 | body {
 2 |   margin: 0;
 3 |   font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Roboto", "Oxygen", "Ubuntu",
 4 |     "Cantarell", "Fira Sans", "Droid Sans", "Helvetica Neue", sans-serif;
 5 |   -webkit-font-smoothing: antialiased;
 6 |   -moz-osx-font-smoothing: grayscale;
 7 | }
 8 | 
 9 | code {
10 |   font-family: source-code-pro, Menlo, Monaco, Consolas, "Courier New", monospace;
11 | }
12 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 |   <head>
 4 |     <meta charset="utf-8" />
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1" />
 6 |     <link rel="icon" href="favicon.ico" />
 7 |     <meta name="theme-color" content="#000000" />
 8 |     <title>Neuron Viewer</title>
 9 |   </head>
10 |   <body>
11 | 
12 |     <div id="root"></div>
13 | 
23 |     <script type="module" src="index.tsx"></script>
24 |   </body>
25 | </html>
26 | 
27 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/index.tsx:
--------------------------------------------------------------------------------
 1 | import React from "react";
 2 | import ReactDOM from "react-dom/client";
 3 | import "./index.css";
 4 | import App from "./App";
 5 | import { BrowserRouter } from "react-router-dom";
 6 | const root = ReactDOM.createRoot(document.getElementById("root")!);
 7 | 
 8 | root.render(
 9 |   <React.StrictMode>
10 |     <BrowserRouter>
11 |       <App />
12 |     </BrowserRouter>
13 |   </React.StrictMode>
14 | );
15 | 
--------------------------------------------------------------------------------
/neuron_viewer/src/modelInteractions.tsx:
--------------------------------------------------------------------------------
 1 | // React component that handles interactions with a subject or explainer model. Two interactions are
 2 | // currently supported:
 3 | // 1) Getting activations for a particular prompt from the subject model.
 4 | // 2) Scoring explanations using an explainer model.
 5 | 
 6 | import React, { ChangeEvent, KeyboardEvent } from "react";
 7 | import { SectionTitle, defaultSmallButtonClasses } from "./commonUiComponents";
 8 | 
 9 | type ModelInteractionsProps = {
10 |   onGetActivationsForPrompt: (value: string) => void;
11 |   // Scoring explanations is currently only possible for neurons and autoencoder latents. We don't
12 |   // show this option for attention heads.
13 |   onScoreExplanation?: (value: string) => void;
14 | };
15 | 
16 | const GET_ACTIVATIONS_FOR_PROMPT = "Get activations for prompt";
17 | const SCORE_EXPLANATION = "Score explanation";
18 | 
19 | const ModelInteractions: React.FC<ModelInteractionsProps> = ({
20 |   onGetActivationsForPrompt,
21 |   onScoreExplanation,
22 | }) => {
23 |   const [textboxValue, setTextboxValue] = React.useState("");
24 |   const toolkit = [GET_ACTIVATIONS_FOR_PROMPT];
25 |   if (onScoreExplanation) {
26 |     toolkit.push(SCORE_EXPLANATION);
27 |   }
28 | 
29 |   const [activeTool, setActiveTool] = React.useState(
30 |     toolkit.length === 0 ? null : toolkit[0]
31 |   );
32 |   if (toolkit.length === 0) {
33 |     return null;
34 |   }
35 | 
36 |   return (
37 |     <>
38 |       <SectionTitle>Interact with the model</SectionTitle>
39 |       <div>
40 |         <div className="flex gap-1">
41 |           {toolkit.map((tool, i) => (
42 |             <div key={i}>
43 |               {toolkit.length > 1 ? (
44 |                 <button className={defaultSmallButtonClasses} onClick={() => setActiveTool(tool)}>
45 |                   {tool}
46 |                 </button>
47 |               ) : (
48 |                 <span>{tool}</span> // If there's only one tool, don't make it a button.
49 |               )}
50 |             </div>
51 |           ))}
52 |         </div>
53 |         <input
54 |           type="text"