├── .gitignore ├── LICENSE ├── README.md ├── eleuther_sae ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── pyproject.toml ├── sae │ ├── __init__.py │ ├── __main__.py │ ├── config.py │ ├── data.py │ ├── kernels.py │ ├── sae.py │ ├── trainer.py │ └── utils.py └── tests │ ├── __init__.py │ └── test_decode.py ├── eval ├── SAE-Bench │ ├── .gitignore │ ├── README.md │ ├── custom_saes │ │ ├── README.md │ │ ├── custom_sae_config.py │ │ ├── identity_sae.py │ │ ├── jumprelu_sae.py │ │ ├── pca_sae.py │ │ ├── run_all_evals_custom_saes.py │ │ ├── topk_sae.py │ │ ├── utils.py │ │ └── vanilla_sae.py │ ├── evals │ │ ├── absorption │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── eval_config.py │ │ │ ├── eval_output.py │ │ │ ├── eval_output_schema_absorption_first_letter.json │ │ │ ├── feature_absorption.py │ │ │ ├── feature_absorption_calculator.py │ │ │ ├── k_sparse_probing.py │ │ │ ├── main.py │ │ │ ├── probing.py │ │ │ ├── prompting.py │ │ │ ├── util.py │ │ │ └── vocab.py │ │ ├── autointerp │ │ │ ├── README.md │ │ │ ├── demo.py │ │ │ ├── eval_config.py │ │ │ ├── eval_output.py │ │ │ ├── eval_output_schema_autointerp.json │ │ │ ├── logs_100.txt │ │ │ ├── logs_4.txt │ │ │ ├── main.py │ │ │ ├── mlp_neurons_vs_saes.py │ │ │ └── sae_encode.py │ │ ├── base_eval_output.py │ │ ├── core │ │ │ ├── convert_directory.py │ │ │ ├── eval_config.py │ │ │ ├── eval_output.py │ │ │ ├── eval_output_schema_core.json │ │ │ └── main.py │ │ ├── generate_json_schemas.py │ │ ├── mdl │ │ │ ├── README.md │ │ │ ├── eval_config.py │ │ │ ├── graphing_mdl.ipynb │ │ │ └── main.py │ │ ├── ravel │ │ │ ├── clean_prototype.ipynb │ │ │ ├── dataset_builder.py │ │ │ ├── feature_selector.py │ │ │ ├── ravel_dataset_builder.py │ │ │ └── utils │ │ │ │ ├── general.py │ │ │ │ ├── generate_ravel_instance.py │ │ │ │ └── generation_utils.py │ │ ├── scr_and_tpp │ │ │ ├── README.md │ │ │ ├── dataset_creation.py │ │ │ ├── eval_config.py │ │ │ ├── eval_output.py │ │ │ ├── eval_output_schema_scr.json │ │ │ ├── eval_output_schema_tpp.json │ │ │ └── main.py │ │ ├── sparse_probing │ │ │ ├── README.md │ │ │ ├── eval_config.py │ │ │ ├── eval_output.py │ │ │ ├── eval_output_schema_sparse_probing.json │ │ │ ├── main.py │ │ │ ├── probe_training.py │ │ │ └── testing_notebooks │ │ │ │ ├── dataset_testing.ipynb │ │ │ │ ├── gather_eval_results.ipynb │ │ │ │ ├── main_experiments.py │ │ │ │ ├── probe_training.ipynb │ │ │ │ └── testing_gemma.ipynb │ │ └── unlearning │ │ │ ├── README.md │ │ │ ├── eval_config.py │ │ │ ├── eval_output.py │ │ │ ├── eval_output_schema_unlearning.json │ │ │ ├── example.ipynb │ │ │ ├── main.py │ │ │ └── utils │ │ │ ├── eval.py │ │ │ ├── feature_activation.py │ │ │ ├── intervention.py │ │ │ ├── metrics.py │ │ │ └── var.py │ ├── get_peft_dict.py │ ├── pyproject.toml │ ├── run_eval.py │ ├── sae_bench_utils │ │ ├── __init__.py │ │ ├── activation_collection.py │ │ ├── dataset_info.py │ │ ├── dataset_utils.py │ │ ├── general_utils.py │ │ ├── graphing_utils.py │ │ ├── indexing_utils.py │ │ ├── misc_notebooks │ │ │ ├── blog_post_graphs.ipynb │ │ │ ├── eval_template.ipynb │ │ │ └── modify_old_results.ipynb │ │ ├── sae_selection_utils.py │ │ └── testing_utils.py │ ├── shell_scripts │ │ ├── README.md │ │ ├── run.sh │ │ ├── run_reduced_memory.sh │ │ └── run_reduced_memory_1m_width.py │ └── tests │ │ ├── conftest.py │ │ ├── evals │ │ └── absorption │ │ │ ├── test_common.py │ │ │ ├── test_feature_absorption.py │ │ │ ├── test_feature_absorption_calculator.py │ │ │ ├── test_k_sparse_probing.py │ │ │ ├── test_probing.py │ │ │ ├── test_prompting.py │ │ │ └── test_vocab.py │ │ ├── test_absorption.py │ │ ├── test_autointerp.py │ │ ├── test_core.py │ │ ├── test_data │ │ ├── absorption │ │ │ └── absorption_expected_results.json │ │ ├── autointerp │ │ │ └── autointerp_expected_results.json │ │ ├── core │ │ │ └── core_expected_results.json │ │ ├── mdl │ │ │ └── mdl_expected_results.json │ │ ├── scr_and_tpp │ │ │ ├── scr_expected_results.json │ │ │ └── tpp_expected_results.json │ │ ├── sparse_probing │ │ │ └── sparse_probing_expected_results.json │ │ └── unlearning │ │ │ └── unlearning_expected_results.json │ │ ├── test_eval_output.py │ │ ├── test_sae_selection_utils.py │ │ ├── test_scr_and_tpp.py │ │ ├── test_sparse_probing.py │ │ ├── test_unlearning.py │ │ └── test_utils.py ├── benchmarks │ └── run_benchmarks.py └── steering │ ├── datasets │ ├── arabic.json │ ├── biology.json │ ├── law.json │ ├── recipes.json │ └── shakespeare.json │ ├── make_positive_examples.py │ └── run_steering.py ├── interp_analysis ├── activation_distances.py ├── get_token_loss.py └── plot_token_loss.py ├── multiple_saes.py ├── plot.ipynb ├── plots └── LoRA-SAE_setup.jpg ├── scaling.py ├── train_lora.py ├── train_sae.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/README.md -------------------------------------------------------------------------------- /eleuther_sae/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/.gitignore -------------------------------------------------------------------------------- /eleuther_sae/.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/.pre-commit-config.yaml -------------------------------------------------------------------------------- /eleuther_sae/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/LICENSE -------------------------------------------------------------------------------- /eleuther_sae/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/README.md -------------------------------------------------------------------------------- /eleuther_sae/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/pyproject.toml -------------------------------------------------------------------------------- /eleuther_sae/sae/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/__init__.py -------------------------------------------------------------------------------- /eleuther_sae/sae/__main__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/__main__.py -------------------------------------------------------------------------------- /eleuther_sae/sae/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/config.py -------------------------------------------------------------------------------- /eleuther_sae/sae/data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/data.py -------------------------------------------------------------------------------- /eleuther_sae/sae/kernels.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/kernels.py -------------------------------------------------------------------------------- /eleuther_sae/sae/sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/sae.py -------------------------------------------------------------------------------- /eleuther_sae/sae/trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/trainer.py -------------------------------------------------------------------------------- /eleuther_sae/sae/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/sae/utils.py -------------------------------------------------------------------------------- /eleuther_sae/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eleuther_sae/tests/test_decode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eleuther_sae/tests/test_decode.py -------------------------------------------------------------------------------- /eval/SAE-Bench/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/.gitignore -------------------------------------------------------------------------------- /eval/SAE-Bench/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/custom_sae_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/custom_sae_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/identity_sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/identity_sae.py -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/jumprelu_sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/jumprelu_sae.py -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/pca_sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/pca_sae.py -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/run_all_evals_custom_saes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/run_all_evals_custom_saes.py -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/topk_sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/topk_sae.py -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/custom_saes/vanilla_sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/custom_saes/vanilla_sae.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/common.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/eval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/eval_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/eval_output_schema_absorption_first_letter.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/eval_output_schema_absorption_first_letter.json -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/feature_absorption.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/feature_absorption.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/feature_absorption_calculator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/feature_absorption_calculator.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/k_sparse_probing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/k_sparse_probing.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/main.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/probing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/probing.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/prompting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/prompting.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/util.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/absorption/vocab.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/absorption/vocab.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/demo.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/eval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/eval_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/eval_output_schema_autointerp.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/eval_output_schema_autointerp.json -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/logs_100.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/logs_100.txt -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/logs_4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/logs_4.txt -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/main.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/mlp_neurons_vs_saes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/mlp_neurons_vs_saes.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/autointerp/sae_encode.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/autointerp/sae_encode.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/base_eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/base_eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/core/convert_directory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/core/convert_directory.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/core/eval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/core/eval_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/core/eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/core/eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/core/eval_output_schema_core.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/core/eval_output_schema_core.json -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/core/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/core/main.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/generate_json_schemas.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/generate_json_schemas.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/mdl/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/mdl/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/mdl/eval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/mdl/eval_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/mdl/graphing_mdl.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/mdl/graphing_mdl.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/mdl/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/mdl/main.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/ravel/clean_prototype.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/ravel/clean_prototype.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/ravel/dataset_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/ravel/dataset_builder.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/ravel/feature_selector.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/ravel/ravel_dataset_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/ravel/ravel_dataset_builder.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/ravel/utils/general.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/ravel/utils/general.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/ravel/utils/generate_ravel_instance.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/ravel/utils/generate_ravel_instance.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/ravel/utils/generation_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/ravel/utils/generation_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/scr_and_tpp/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/scr_and_tpp/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/scr_and_tpp/dataset_creation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/scr_and_tpp/dataset_creation.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/scr_and_tpp/eval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/scr_and_tpp/eval_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/scr_and_tpp/eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/scr_and_tpp/eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/scr_and_tpp/eval_output_schema_scr.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/scr_and_tpp/eval_output_schema_scr.json -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/scr_and_tpp/eval_output_schema_tpp.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/scr_and_tpp/eval_output_schema_tpp.json -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/scr_and_tpp/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/scr_and_tpp/main.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/eval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/eval_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/eval_output_schema_sparse_probing.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/eval_output_schema_sparse_probing.json -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/main.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/probe_training.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/probe_training.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/testing_notebooks/dataset_testing.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/testing_notebooks/dataset_testing.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/testing_notebooks/gather_eval_results.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/testing_notebooks/gather_eval_results.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/testing_notebooks/main_experiments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/testing_notebooks/main_experiments.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/testing_notebooks/probe_training.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/testing_notebooks/probe_training.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/sparse_probing/testing_notebooks/testing_gemma.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/sparse_probing/testing_notebooks/testing_gemma.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/eval_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/eval_config.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/eval_output_schema_unlearning.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/eval_output_schema_unlearning.json -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/example.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/example.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/main.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/utils/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/utils/eval.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/utils/feature_activation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/utils/feature_activation.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/utils/intervention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/utils/intervention.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/utils/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/utils/metrics.py -------------------------------------------------------------------------------- /eval/SAE-Bench/evals/unlearning/utils/var.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/evals/unlearning/utils/var.py -------------------------------------------------------------------------------- /eval/SAE-Bench/get_peft_dict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/get_peft_dict.py -------------------------------------------------------------------------------- /eval/SAE-Bench/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/pyproject.toml -------------------------------------------------------------------------------- /eval/SAE-Bench/run_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/run_eval.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/__init__.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/activation_collection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/activation_collection.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/dataset_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/dataset_info.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/dataset_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/dataset_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/general_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/general_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/graphing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/graphing_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/indexing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/indexing_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/misc_notebooks/blog_post_graphs.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/misc_notebooks/blog_post_graphs.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/misc_notebooks/eval_template.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/misc_notebooks/eval_template.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/misc_notebooks/modify_old_results.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/misc_notebooks/modify_old_results.ipynb -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/sae_selection_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/sae_selection_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/sae_bench_utils/testing_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/sae_bench_utils/testing_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/shell_scripts/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/shell_scripts/README.md -------------------------------------------------------------------------------- /eval/SAE-Bench/shell_scripts/run.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/shell_scripts/run.sh -------------------------------------------------------------------------------- /eval/SAE-Bench/shell_scripts/run_reduced_memory.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/shell_scripts/run_reduced_memory.sh -------------------------------------------------------------------------------- /eval/SAE-Bench/shell_scripts/run_reduced_memory_1m_width.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/shell_scripts/run_reduced_memory_1m_width.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/conftest.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/evals/absorption/test_common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/evals/absorption/test_common.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/evals/absorption/test_feature_absorption.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/evals/absorption/test_feature_absorption.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/evals/absorption/test_feature_absorption_calculator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/evals/absorption/test_feature_absorption_calculator.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/evals/absorption/test_k_sparse_probing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/evals/absorption/test_k_sparse_probing.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/evals/absorption/test_probing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/evals/absorption/test_probing.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/evals/absorption/test_prompting.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/evals/absorption/test_prompting.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/evals/absorption/test_vocab.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/evals/absorption/test_vocab.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_absorption.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_absorption.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_autointerp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_autointerp.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_core.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_core.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/absorption/absorption_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/absorption/absorption_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/autointerp/autointerp_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/autointerp/autointerp_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/core/core_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/core/core_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/mdl/mdl_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/mdl/mdl_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/scr_and_tpp/scr_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/scr_and_tpp/scr_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/scr_and_tpp/tpp_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/scr_and_tpp/tpp_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/sparse_probing/sparse_probing_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/sparse_probing/sparse_probing_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_data/unlearning/unlearning_expected_results.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_data/unlearning/unlearning_expected_results.json -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_eval_output.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_eval_output.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_sae_selection_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_sae_selection_utils.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_scr_and_tpp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_scr_and_tpp.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_sparse_probing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_sparse_probing.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_unlearning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_unlearning.py -------------------------------------------------------------------------------- /eval/SAE-Bench/tests/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/SAE-Bench/tests/test_utils.py -------------------------------------------------------------------------------- /eval/benchmarks/run_benchmarks.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/benchmarks/run_benchmarks.py -------------------------------------------------------------------------------- /eval/steering/datasets/arabic.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/steering/datasets/arabic.json -------------------------------------------------------------------------------- /eval/steering/datasets/biology.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/steering/datasets/biology.json -------------------------------------------------------------------------------- /eval/steering/datasets/law.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/steering/datasets/law.json -------------------------------------------------------------------------------- /eval/steering/datasets/recipes.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/steering/datasets/recipes.json -------------------------------------------------------------------------------- /eval/steering/datasets/shakespeare.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/steering/datasets/shakespeare.json -------------------------------------------------------------------------------- /eval/steering/make_positive_examples.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/steering/make_positive_examples.py -------------------------------------------------------------------------------- /eval/steering/run_steering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/eval/steering/run_steering.py -------------------------------------------------------------------------------- /interp_analysis/activation_distances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/interp_analysis/activation_distances.py -------------------------------------------------------------------------------- /interp_analysis/get_token_loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/interp_analysis/get_token_loss.py -------------------------------------------------------------------------------- /interp_analysis/plot_token_loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/interp_analysis/plot_token_loss.py -------------------------------------------------------------------------------- /multiple_saes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/multiple_saes.py -------------------------------------------------------------------------------- /plot.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/plot.ipynb -------------------------------------------------------------------------------- /plots/LoRA-SAE_setup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/plots/LoRA-SAE_setup.jpg -------------------------------------------------------------------------------- /scaling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/scaling.py -------------------------------------------------------------------------------- /train_lora.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/train_lora.py -------------------------------------------------------------------------------- /train_sae.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/train_sae.py -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/matchten/LoRA-Models-for-SAEs/HEAD/utils.py --------------------------------------------------------------------------------