├── .documentation ├── feature_steering_gradio.webp ├── sae_training_2024-10-28_dead-latents-ratio.png ├── sae_training_2024-10-28_train-aux-loss.png ├── sae_training_2024-10-28_train-total-loss.png ├── sae_training_2024-10-28_val-total-loss.png ├── sparse-autoencoder_light.webp └── testing_terminal.webp ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── capture_activations.py ├── capture_top_activating_sentences.py ├── captured_top_sentences ├── top_sentences_last.yaml └── top_sentences_mean.yaml ├── interpret_top_sentences_parse_responses.py ├── interpret_top_sentences_retrieve_batches.py ├── interpret_top_sentences_send_batches.py ├── latent_index_meaning ├── latent_idx_meaning_last-aggregated.yaml └── latent_idx_meaning_mean-aggregated.yaml ├── llama_3.1-8B_model └── original │ ├── params.json │ └── tokenizer.model ├── llama_3.2-3B_model └── original │ ├── params.json │ └── tokenizer.model ├── llama_3 ├── __init__.py ├── args.py ├── chat_format.py ├── datatypes.py ├── model_text_only.py ├── schema_utils.py ├── tokenizer.py └── tool_utils.py ├── llama_3_inference.py ├── llama_3_inference_chat_completion_test.py ├── llama_3_inference_text_completion_gradio.py ├── llama_3_inference_text_completion_test.py ├── openwebtext_sentences_dataset.py ├── poetry.lock ├── pyproject.toml ├── sae.py ├── sae_preprocessing.py ├── sae_training.py ├── top_sentences_last_responses ├── msgbatch_0129Z3NWFaqHsAottqGHCBy6.yaml ├── msgbatch_018kBogt5uV1QHCdpwixJvzk.yaml ├── msgbatch_01A1pLzDKsdLqZmhM8bVfnHq.yaml ├── msgbatch_01ERHQxTfdEruzRYpziT4jv5.yaml ├── msgbatch_01HuGhePrDX48fvvjb5HW9Qm.yaml ├── msgbatch_01NDSburAPr8pBc7AJBVMbdF.yaml ├── msgbatch_01QGsQemNF7in575ZVc2UVwK.yaml └── msgbatch_01SuQ7c59t9HVXVAnf8sXQ6d.yaml ├── top_sentences_mean_responses ├── msgbatch_012snVhQshnnN4XBp9FsQDb9.yaml ├── msgbatch_018nMq6YLoAFkqf5rAi6mKif.yaml ├── msgbatch_01DPMCyXaVhbG2aRhdsFv7HR.yaml ├── msgbatch_01HZdF3QqrA9EmHiVbxWBGRg.yaml ├── msgbatch_01HkcvAx6iN1F2YaZWPjTvQ2.yaml ├── msgbatch_01JnnzbM15ytnPLWtvitSAca.yaml ├── msgbatch_01KzafvapiTK8AuhrSRr5Cdv.yaml ├── msgbatch_01LFjpsiUTNo8xVArWZPXZdQ.yaml ├── msgbatch_01MCPCisp8wy3FMUEp256aZ4.yaml ├── msgbatch_01NNaFvR7WgjpkeP6QNPMxVf.yaml ├── msgbatch_01NtDSMsBT1bUN4f6XA2GpiW.yaml ├── msgbatch_01RWoU9gB8hxo8RYRcQz5QcY.yaml └── msgbatch_01TqmvZpNF8mL4DXkhxaHw8p.yaml └── utils ├── __init__.py ├── cuda_utils.py └── llama_3_model_download.py /.documentation/feature_steering_gradio.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.documentation/feature_steering_gradio.webp -------------------------------------------------------------------------------- /.documentation/sae_training_2024-10-28_dead-latents-ratio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.documentation/sae_training_2024-10-28_dead-latents-ratio.png -------------------------------------------------------------------------------- /.documentation/sae_training_2024-10-28_train-aux-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.documentation/sae_training_2024-10-28_train-aux-loss.png -------------------------------------------------------------------------------- /.documentation/sae_training_2024-10-28_train-total-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.documentation/sae_training_2024-10-28_train-total-loss.png -------------------------------------------------------------------------------- /.documentation/sae_training_2024-10-28_val-total-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.documentation/sae_training_2024-10-28_val-total-loss.png -------------------------------------------------------------------------------- /.documentation/sparse-autoencoder_light.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.documentation/sparse-autoencoder_light.webp -------------------------------------------------------------------------------- /.documentation/testing_terminal.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.documentation/testing_terminal.webp -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/README.md -------------------------------------------------------------------------------- /capture_activations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/capture_activations.py -------------------------------------------------------------------------------- /capture_top_activating_sentences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/capture_top_activating_sentences.py -------------------------------------------------------------------------------- /captured_top_sentences/top_sentences_last.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/captured_top_sentences/top_sentences_last.yaml -------------------------------------------------------------------------------- /captured_top_sentences/top_sentences_mean.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/captured_top_sentences/top_sentences_mean.yaml -------------------------------------------------------------------------------- /interpret_top_sentences_parse_responses.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/interpret_top_sentences_parse_responses.py -------------------------------------------------------------------------------- /interpret_top_sentences_retrieve_batches.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/interpret_top_sentences_retrieve_batches.py -------------------------------------------------------------------------------- /interpret_top_sentences_send_batches.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/interpret_top_sentences_send_batches.py -------------------------------------------------------------------------------- /latent_index_meaning/latent_idx_meaning_last-aggregated.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/latent_index_meaning/latent_idx_meaning_last-aggregated.yaml -------------------------------------------------------------------------------- /latent_index_meaning/latent_idx_meaning_mean-aggregated.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/latent_index_meaning/latent_idx_meaning_mean-aggregated.yaml -------------------------------------------------------------------------------- /llama_3.1-8B_model/original/params.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3.1-8B_model/original/params.json -------------------------------------------------------------------------------- /llama_3.1-8B_model/original/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3.1-8B_model/original/tokenizer.model -------------------------------------------------------------------------------- /llama_3.2-3B_model/original/params.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3.2-3B_model/original/params.json -------------------------------------------------------------------------------- /llama_3.2-3B_model/original/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3.2-3B_model/original/tokenizer.model -------------------------------------------------------------------------------- /llama_3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llama_3/args.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3/args.py -------------------------------------------------------------------------------- /llama_3/chat_format.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3/chat_format.py -------------------------------------------------------------------------------- /llama_3/datatypes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3/datatypes.py -------------------------------------------------------------------------------- /llama_3/model_text_only.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3/model_text_only.py -------------------------------------------------------------------------------- /llama_3/schema_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3/schema_utils.py -------------------------------------------------------------------------------- /llama_3/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3/tokenizer.py -------------------------------------------------------------------------------- /llama_3/tool_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3/tool_utils.py -------------------------------------------------------------------------------- /llama_3_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3_inference.py -------------------------------------------------------------------------------- /llama_3_inference_chat_completion_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3_inference_chat_completion_test.py -------------------------------------------------------------------------------- /llama_3_inference_text_completion_gradio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3_inference_text_completion_gradio.py -------------------------------------------------------------------------------- /llama_3_inference_text_completion_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/llama_3_inference_text_completion_test.py -------------------------------------------------------------------------------- /openwebtext_sentences_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/openwebtext_sentences_dataset.py -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/poetry.lock -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/pyproject.toml -------------------------------------------------------------------------------- /sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/sae.py -------------------------------------------------------------------------------- /sae_preprocessing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/sae_preprocessing.py -------------------------------------------------------------------------------- /sae_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/sae_training.py -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_0129Z3NWFaqHsAottqGHCBy6.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_0129Z3NWFaqHsAottqGHCBy6.yaml -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_018kBogt5uV1QHCdpwixJvzk.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_018kBogt5uV1QHCdpwixJvzk.yaml -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_01A1pLzDKsdLqZmhM8bVfnHq.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_01A1pLzDKsdLqZmhM8bVfnHq.yaml -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_01ERHQxTfdEruzRYpziT4jv5.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_01ERHQxTfdEruzRYpziT4jv5.yaml -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_01HuGhePrDX48fvvjb5HW9Qm.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_01HuGhePrDX48fvvjb5HW9Qm.yaml -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_01NDSburAPr8pBc7AJBVMbdF.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_01NDSburAPr8pBc7AJBVMbdF.yaml -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_01QGsQemNF7in575ZVc2UVwK.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_01QGsQemNF7in575ZVc2UVwK.yaml -------------------------------------------------------------------------------- /top_sentences_last_responses/msgbatch_01SuQ7c59t9HVXVAnf8sXQ6d.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_last_responses/msgbatch_01SuQ7c59t9HVXVAnf8sXQ6d.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_012snVhQshnnN4XBp9FsQDb9.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_012snVhQshnnN4XBp9FsQDb9.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_018nMq6YLoAFkqf5rAi6mKif.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_018nMq6YLoAFkqf5rAi6mKif.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01DPMCyXaVhbG2aRhdsFv7HR.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01DPMCyXaVhbG2aRhdsFv7HR.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01HZdF3QqrA9EmHiVbxWBGRg.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01HZdF3QqrA9EmHiVbxWBGRg.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01HkcvAx6iN1F2YaZWPjTvQ2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01HkcvAx6iN1F2YaZWPjTvQ2.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01JnnzbM15ytnPLWtvitSAca.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01JnnzbM15ytnPLWtvitSAca.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01KzafvapiTK8AuhrSRr5Cdv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01KzafvapiTK8AuhrSRr5Cdv.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01LFjpsiUTNo8xVArWZPXZdQ.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01LFjpsiUTNo8xVArWZPXZdQ.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01MCPCisp8wy3FMUEp256aZ4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01MCPCisp8wy3FMUEp256aZ4.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01NNaFvR7WgjpkeP6QNPMxVf.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01NNaFvR7WgjpkeP6QNPMxVf.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01NtDSMsBT1bUN4f6XA2GpiW.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01NtDSMsBT1bUN4f6XA2GpiW.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01RWoU9gB8hxo8RYRcQz5QcY.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01RWoU9gB8hxo8RYRcQz5QcY.yaml -------------------------------------------------------------------------------- /top_sentences_mean_responses/msgbatch_01TqmvZpNF8mL4DXkhxaHw8p.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/top_sentences_mean_responses/msgbatch_01TqmvZpNF8mL4DXkhxaHw8p.yaml -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/cuda_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/utils/cuda_utils.py -------------------------------------------------------------------------------- /utils/llama_3_model_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PaulPauls/llama3_interpretability_sae/HEAD/utils/llama_3_model_download.py --------------------------------------------------------------------------------