├── .gitattributes ├── .gitignore ├── .python-version ├── LICENSE ├── README.md ├── axbench ├── README.md ├── __init__.py ├── concept10 │ ├── prod_2b_l10_v1 │ │ ├── generate │ │ │ ├── generate_state.pkl │ │ │ ├── metadata.jsonl │ │ │ └── train_data.parquet │ │ └── inference │ │ │ ├── latent_eval_data.parquet │ │ │ └── latent_inference_state.pkl │ ├── prod_2b_l20_v1 │ │ ├── generate │ │ │ ├── generate_state.pkl │ │ │ ├── metadata.jsonl │ │ │ └── train_data.parquet │ │ └── inference │ │ │ └── latent_eval_data.parquet │ ├── prod_9b_l20_v1 │ │ ├── generate │ │ │ ├── generate_state.pkl │ │ │ ├── metadata.jsonl │ │ │ └── train_data.parquet │ │ └── inference │ │ │ ├── latent_eval_data.parquet │ │ │ └── latent_inference_state.pkl │ └── prod_9b_l31_v1 │ │ ├── generate │ │ ├── generate_state.pkl │ │ ├── metadata.jsonl │ │ └── train_data.parquet │ │ └── inference │ │ ├── latent_eval_data.parquet │ │ └── latent_inference_state.pkl ├── concept16k │ └── README.md ├── concept16k_v2 │ └── README.md ├── concept500 │ ├── prod_2b_l10_v1 │ │ ├── generate │ │ │ ├── generate_state.pkl │ │ │ ├── metadata.jsonl │ │ │ └── train_data.parquet │ │ └── inference │ │ │ ├── latent_eval_data.parquet │ │ │ └── latent_inference_state.pkl │ ├── prod_2b_l20_v1 │ │ ├── generate │ │ │ ├── generate_state.pkl │ │ │ ├── metadata.jsonl │ │ │ └── train_data.parquet │ │ └── inference │ │ │ ├── latent_eval_data.parquet │ │ │ └── latent_inference_state.pkl │ ├── prod_9b_l20_v1 │ │ ├── generate │ │ │ ├── generate_state.pkl │ │ │ ├── metadata.jsonl │ │ │ └── train_data.parquet │ │ └── inference │ │ │ ├── latent_eval_data.parquet │ │ │ └── latent_inference_state.pkl │ └── prod_9b_l31_v1 │ │ ├── generate │ │ ├── generate_state.pkl │ │ ├── metadata.jsonl │ │ └── train_data.parquet │ │ └── inference │ │ ├── latent_eval_data.parquet │ │ └── latent_inference_state.pkl ├── conceptFD │ ├── README.md │ ├── prod_2b_l10_v1 │ │ ├── generate │ │ │ ├── metadata.jsonl │ │ │ └── train_data.parquet │ │ └── inference │ │ │ └── latent_eval_data.parquet │ └── prod_2b_l20_v1 │ │ ├── generate │ │ ├── metadata.jsonl │ │ └── train_data.parquet │ │ └── inference │ │ └── latent_eval_data.parquet ├── data │ ├── download-2b.sh │ ├── download-9b.sh │ ├── download-alpaca.sh │ ├── download-others.sh │ ├── download-pls.sh │ ├── download-seed-sentences.py │ └── process-feature-description.py ├── demo │ ├── concepts.csv │ ├── demo.sh │ ├── hypersteer_demo.sh │ └── sweep │ │ ├── hypersteer_simple.yaml │ │ └── simple.yaml ├── evaluators │ ├── __init__.py │ ├── aucroc.py │ ├── evaluator.py │ ├── hard_negative.py │ ├── latent_stats.py │ ├── lm_judge.py │ ├── ppl.py │ ├── prompt_templates.py │ ├── rule_judge.py │ └── winrate.py ├── examples │ ├── 2b_l20_diffmean.json │ ├── 2b_l20_lsreft.json │ ├── 9b_l20_diffmean.json │ ├── 9b_l20_lsreft.json │ ├── basics.ipynb │ ├── lang>subspace.ipynb │ ├── platonic.ipynb │ ├── subspace_gazer.ipynb │ └── tutorial.ipynb ├── experiment_commands.txt ├── models │ ├── __init__.py │ ├── bow.py │ ├── concept_lora.py │ ├── concept_model.py │ ├── concept_reft.py │ ├── concept_vector.py │ ├── hypernet │ │ ├── __init__.py │ │ ├── configuration_hypernet.py │ │ ├── layers.py │ │ ├── modeling_hypernet.py │ │ └── utils.py │ ├── hypersteer.py │ ├── ig.py │ ├── interventions.py │ ├── language_models.py │ ├── lora.py │ ├── lsreft.py │ ├── mean.py │ ├── model.py │ ├── preference_lora.py │ ├── preference_model.py │ ├── preference_reft.py │ ├── preference_vector.py │ ├── probe.py │ ├── prompt.py │ ├── random.py │ ├── reft.py │ ├── sae.py │ ├── sft.py │ └── 
steering_vector.py ├── rm_demo.sh ├── scripts │ ├── __init__.py │ ├── analyse.ipynb │ ├── args │ │ ├── __init__.py │ │ ├── dataset_args.py │ │ ├── eval_args.py │ │ └── training_args.py │ ├── evaluate.py │ ├── generate.py │ ├── human_eval.py │ ├── inference.py │ ├── reps_analyse.ipynb │ └── train.py ├── sweep │ ├── aryaman │ │ ├── sae_selection_2b_l10.yaml │ │ ├── sae_selection_2b_l10_concept10.yaml │ │ ├── sae_selection_2b_l20.yaml │ │ ├── sae_selection_9b_l20.yaml │ │ ├── sae_selection_9b_l31.yaml │ │ └── simple.yaml │ └── wuzhengx │ │ ├── 2b │ │ ├── l10 │ │ │ ├── bow.yaml │ │ │ ├── gemmascope_axbench_max_act.yaml │ │ │ ├── gemmascope_clamp.yaml │ │ │ ├── gemmascope_fd.yaml │ │ │ ├── gemmascope_min_clamp.yaml │ │ │ ├── ig.yaml │ │ │ ├── lora.yaml │ │ │ ├── loreft.yaml │ │ │ ├── lsreft.yaml │ │ │ ├── lsreft_fd.yaml │ │ │ ├── lsreft_scaling_law.yaml │ │ │ ├── lsreft_synergy.yaml │ │ │ ├── no_grad.yaml │ │ │ ├── probe.yaml │ │ │ ├── prompt_detection.yaml │ │ │ ├── sft.yaml │ │ │ ├── simple_prompt_steering.yaml │ │ │ └── steering_vec.yaml │ │ └── l20 │ │ │ ├── 16k_diffmean.yaml │ │ │ ├── 16k_diffmean_crossfit.yaml │ │ │ ├── 16k_lsreft.yaml │ │ │ ├── 16k_lsreft_crossfit.yaml │ │ │ ├── bow.yaml │ │ │ ├── gemmascope_axbench_max_act.yaml │ │ │ ├── gemmascope_clamp.yaml │ │ │ ├── gemmascope_fd.yaml │ │ │ ├── gemmascope_min_clamp.yaml │ │ │ ├── ig.yaml │ │ │ ├── lora.yaml │ │ │ ├── loreft.yaml │ │ │ ├── lsreft.yaml │ │ │ ├── lsreft_fd.yaml │ │ │ ├── lsreft_scaling_law.yaml │ │ │ ├── lsreft_synergy.yaml │ │ │ ├── no_grad.yaml │ │ │ ├── probe.yaml │ │ │ ├── prompt_detection.yaml │ │ │ ├── sft.yaml │ │ │ ├── simple_prompt_steering.yaml │ │ │ └── steering_vec.yaml │ │ ├── 9b │ │ ├── l20 │ │ │ ├── 16k_diffmean.yaml │ │ │ ├── 16k_diffmean_crossfit.yaml │ │ │ ├── 16k_lsreft.yaml │ │ │ ├── 16k_lsreft_crossfit.yaml │ │ │ ├── bow.yaml │ │ │ ├── gemmascope_axbench_max_act.yaml │ │ │ ├── gemmascope_clamp.yaml │ │ │ ├── gemmascope_min_clamp.yaml │ │ │ ├── ig.yaml │ │ │ ├── lora.yaml │ │ │ ├── loreft.yaml │ │ │ ├── lsreft.yaml │ │ │ ├── lsreft_scaling_law.yaml │ │ │ ├── lsreft_synergy.yaml │ │ │ ├── no_grad.yaml │ │ │ ├── probe.yaml │ │ │ ├── prompt_detection.yaml │ │ │ ├── simple_prompt_steering.yaml │ │ │ └── steering_vec.yaml │ │ └── l31 │ │ │ ├── bow.yaml │ │ │ ├── gemmascope_axbench_max_act.yaml │ │ │ ├── gemmascope_clamp.yaml │ │ │ ├── gemmascope_min_clamp.yaml │ │ │ ├── ig.yaml │ │ │ ├── lora.yaml │ │ │ ├── loreft.yaml │ │ │ ├── lsreft.yaml │ │ │ ├── lsreft_scaling_law.yaml │ │ │ ├── lsreft_synergy.yaml │ │ │ ├── no_grad.yaml │ │ │ ├── probe.yaml │ │ │ ├── prompt_detection.yaml │ │ │ ├── simple_prompt_steering.yaml │ │ │ └── steering_vec.yaml │ │ ├── llama_8b │ │ └── l20 │ │ │ └── lsreft.yaml │ │ ├── others │ │ ├── prod_2b_l10_fd_v1.yaml │ │ ├── prod_2b_l10_v1.yaml │ │ ├── prod_2b_l20_fd_v1.yaml │ │ ├── prod_2b_l20_v1.yaml │ │ ├── prod_9b_l20_v1.yaml │ │ └── prod_9b_l31_v1.yaml │ │ ├── pls │ │ ├── prod_2b_l20_gemma_65k.yaml │ │ ├── prod_8b_l20_llama_131k.yaml │ │ └── prod_9b_l20_gemma_131k.yaml │ │ └── reps │ │ ├── README.md │ │ ├── dataset │ │ ├── concept100.yaml │ │ └── concept500.yaml │ │ └── experiments │ │ ├── c_lora_g2-2b_axbench.yaml │ │ ├── c_lora_g2-2b_axbench_suppress.yaml │ │ ├── c_lora_g2-9b_axbench.yaml │ │ ├── c_lora_g2-9b_axbench_suppress.yaml │ │ ├── c_lora_g3-12b_axbench.yaml │ │ ├── c_lora_g3-12b_concept100.yaml │ │ ├── c_lora_g3-12b_concept100_suppress.yaml │ │ ├── c_lora_g3-27b_axbench.yaml │ │ ├── c_lora_g3-27b_concept100.yaml │ │ ├── c_lora_g3-27b_concept100_suppress.yaml │ │ ├── 
c_loreft_g2-2b_axbench.yaml │ │ ├── c_loreft_g2-2b_axbench_suppress.yaml │ │ ├── c_loreft_g2-9b_axbench.yaml │ │ ├── c_loreft_g2-9b_axbench_suppress.yaml │ │ ├── c_loreft_g3-12b_axbench.yaml │ │ ├── c_loreft_g3-12b_concept100.yaml │ │ ├── c_loreft_g3-12b_concept100_suppress.yaml │ │ ├── c_loreft_g3-27b_axbench.yaml │ │ ├── c_loreft_g3-27b_concept100.yaml │ │ ├── c_loreft_g3-27b_concept100_suppress.yaml │ │ ├── c_vector_g2-2b_axbench.yaml │ │ ├── c_vector_g2-2b_axbench_attack.yaml │ │ ├── c_vector_g2-2b_axbench_overwrite_append.yaml │ │ ├── c_vector_g2-2b_axbench_overwrite_prepend.yaml │ │ ├── c_vector_g2-2b_axbench_suppress.yaml │ │ ├── c_vector_g2-2b_axbench_suppress_rule.yaml │ │ ├── c_vector_g2-2b_nfs_axbench.yaml │ │ ├── c_vector_g2-9b_axbench.yaml │ │ ├── c_vector_g2-9b_axbench_attack.yaml │ │ ├── c_vector_g2-9b_axbench_overwrite_append.yaml │ │ ├── c_vector_g2-9b_axbench_overwrite_prepend.yaml │ │ ├── c_vector_g2-9b_axbench_suppress.yaml │ │ ├── c_vector_g2-9b_axbench_suppress_rule.yaml │ │ ├── c_vector_g2-9b_nfs_axbench.yaml │ │ ├── c_vector_g3-12b_axbench.yaml │ │ ├── c_vector_g3-12b_axbench_overwrite_append.yaml │ │ ├── c_vector_g3-12b_axbench_overwrite_prepend.yaml │ │ ├── c_vector_g3-12b_axbench_suppress.yaml │ │ ├── c_vector_g3-12b_concept100.yaml │ │ ├── c_vector_g3-12b_concept100_suppress.yaml │ │ ├── c_vector_g3-27b_axbench.yaml │ │ ├── c_vector_g3-27b_axbench_overwrite_append.yaml │ │ ├── c_vector_g3-27b_axbench_overwrite_prepend.yaml │ │ ├── c_vector_g3-27b_axbench_suppress.yaml │ │ ├── c_vector_g3-27b_axbench_suppress_overwrite_prepend.yaml │ │ ├── c_vector_g3-27b_concept100.yaml │ │ ├── c_vector_g3-27b_concept100_suppress.yaml │ │ ├── p_embedding_dps_g2-2b_axbench.yaml │ │ ├── p_embedding_dps_g2-9b_axbench.yaml │ │ ├── p_lora_bipo_g2-2b_axbench.yaml │ │ ├── p_lora_bipo_g2-9b_axbench.yaml │ │ ├── p_lora_dps_g2-2b_axbench.yaml │ │ ├── p_lora_dps_g2-2b_axbench_suppress.yaml │ │ ├── p_lora_dps_g2-2b_nfs_axbench.yaml │ │ ├── p_lora_dps_g2-9b_axbench.yaml │ │ ├── p_lora_dps_g2-9b_axbench_suppress.yaml │ │ ├── p_lora_dps_g2-9b_nfs_axbench.yaml │ │ ├── p_lora_dps_g3-12b_axbench.yaml │ │ ├── p_lora_dps_g3-12b_concept100.yaml │ │ ├── p_lora_dps_g3-12b_concept100_suppress.yaml │ │ ├── p_lora_dps_g3-27b_axbench.yaml │ │ ├── p_lora_dps_g3-27b_concept100.yaml │ │ ├── p_lora_dps_g3-27b_concept100_suppress.yaml │ │ ├── p_loreft_bipo_g2-2b_axbench.yaml │ │ ├── p_loreft_bipo_g2-9b_axbench.yaml │ │ ├── p_loreft_dps_g2-2b_axbench.yaml │ │ ├── p_loreft_dps_g2-2b_axbench_suppress.yaml │ │ ├── p_loreft_dps_g2-2b_nfs_axbench.yaml │ │ ├── p_loreft_dps_g2-9b_axbench.yaml │ │ ├── p_loreft_dps_g2-9b_axbench_suppress.yaml │ │ ├── p_loreft_dps_g2-9b_nfs_axbench.yaml │ │ ├── p_loreft_dps_g3-12b_axbench.yaml │ │ ├── p_loreft_dps_g3-12b_concept100.yaml │ │ ├── p_loreft_dps_g3-12b_concept100_suppress.yaml │ │ ├── p_loreft_dps_g3-27b_axbench.yaml │ │ ├── p_loreft_dps_g3-27b_concept100.yaml │ │ ├── p_loreft_dps_g3-27b_concept100_suppress.yaml │ │ ├── p_prefix_dps_g2-2b_axbench.yaml │ │ ├── p_prefix_dps_g2-9b_axbench.yaml │ │ ├── p_vector_bipo_g2-2b_axbench.yaml │ │ ├── p_vector_bipo_g2-2b_lf_axbench.yaml │ │ ├── p_vector_bipo_g2-9b_axbench.yaml │ │ ├── p_vector_bipo_g2-9b_lf_axbench.yaml │ │ ├── p_vector_dps_g2-2b_axbench.yaml │ │ ├── p_vector_dps_g2-2b_axbench_attack.yaml │ │ ├── p_vector_dps_g2-2b_axbench_suppress.yaml │ │ ├── p_vector_dps_g2-2b_axbench_suppress_rule.yaml │ │ ├── p_vector_dps_g2-2b_nfs_axbench.yaml │ │ ├── p_vector_dps_g2-9b_axbench.yaml │ │ ├── p_vector_dps_g2-9b_axbench_attack.yaml │ │ 
├── p_vector_dps_g2-9b_axbench_suppress.yaml │ │ ├── p_vector_dps_g2-9b_axbench_suppress_rule.yam │ │ ├── p_vector_dps_g2-9b_axbench_suppress_rule.yaml │ │ ├── p_vector_dps_g2-9b_nfs_axbench.yaml │ │ ├── p_vector_dps_g3-12b_axbench.yaml │ │ ├── p_vector_dps_g3-12b_axbench_suppress_rule.yaml │ │ ├── p_vector_dps_g3-12b_concept100.yaml │ │ ├── p_vector_dps_g3-27b_axbench.yaml │ │ ├── p_vector_dps_g3-27b_axbench_suppress.yaml │ │ ├── p_vector_dps_g3-27b_axbench_suppress_rule.yaml │ │ ├── p_vector_dps_g3-27b_concept100.yaml │ │ ├── p_vector_g2-2b_axbench_overwrite_append.yaml │ │ ├── p_vector_g2-2b_axbench_overwrite_prepend.yaml │ │ ├── p_vector_g2-9b_axbench_overwrite_append.yaml │ │ ├── p_vector_g2-9b_axbench_overwrite_prepend.yaml │ │ ├── p_vector_g3-12b_axbench_overwrite_append.yaml │ │ ├── p_vector_g3-12b_axbench_overwrite_prepend.yaml │ │ ├── p_vector_g3-27b_axbench_overwrite_append.yaml │ │ ├── p_vector_g3-27b_axbench_overwrite_prepend.yaml │ │ ├── prompt_steering_g2-2b_concept20_suppress.yaml │ │ ├── prompt_steering_g2-2b_concept20_suppress_overwrite_append.yaml │ │ ├── prompt_steering_g2-2b_concept20_suppress_overwrite_prepend.yaml │ │ ├── prompt_steering_g2-2b_concept500_suppress.yaml │ │ ├── prompt_steering_g2-2b_prompt_rule.yaml │ │ ├── prompt_steering_g2-9b_concept20_suppress.yaml │ │ ├── prompt_steering_g2-9b_concept20_suppress_overwrite_append.yaml │ │ ├── prompt_steering_g2-9b_concept20_suppress_overwrite_prepend.yaml │ │ ├── prompt_steering_g2-9b_concept500_suppress.yaml │ │ ├── prompt_steering_g3-12b_concept100.yaml │ │ ├── prompt_steering_g3-12b_concept20_suppress.yaml │ │ ├── prompt_steering_g3-12b_concept20_suppress_overwrite_append.yaml │ │ ├── prompt_steering_g3-12b_concept20_suppress_overwrite_prepend.yaml │ │ ├── prompt_steering_g3-12b_concept20_suppress_rule.yaml │ │ ├── prompt_steering_g3-27b_concept100.yaml │ │ ├── prompt_steering_g3-27b_concept20_suppress.yaml │ │ ├── prompt_steering_g3-27b_concept20_suppress_overwrite_append.yaml │ │ ├── prompt_steering_g3-27b_concept20_suppress_overwrite_prepend.yaml │ │ └── prompt_steering_g3-27b_concept20_suppress_rule.yaml ├── templates │ ├── __init__.py │ ├── html_templates.py │ └── prompt_templates.py ├── tests │ ├── README.md │ ├── test_released_artifacts.py │ └── unit_tests │ │ ├── test_dataset.py │ │ ├── test_prompt_utils.py │ │ └── test_sae.py └── utils │ ├── __init__.py │ ├── constants.py │ ├── data_utils.py │ ├── dataset.py │ ├── model_utils.py │ ├── plot_utils.py │ └── prompt_utils.py ├── hypersteer_requirement.txt ├── pyproject.toml └── uv.lock /.gitattributes: -------------------------------------------------------------------------------- 1 | axbench/examples/* linguist-vendored 2 | axbench/scripts/*.ipynb linguist-vendored 3 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.12 2 | -------------------------------------------------------------------------------- /axbench/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils.plot_utils import * 2 | from .utils.dataset import * 3 | from .utils.constants import * 4 | from .utils.prompt_utils import * 5 | from .utils.model_utils import * 6 | 7 | from .templates.html_templates import * 8 | from .templates.prompt_templates import * 9 | 10 | from .evaluators.aucroc import * 11 | from .evaluators.ppl import * 12 | from .evaluators.lm_judge import * 13 | from .evaluators.hard_negative 
import * 14 | from .evaluators.winrate import * 15 | from .evaluators.latent_stats import * 16 | 17 | from .models.sft import * 18 | from .models.lora import * 19 | from .models.reft import * 20 | from .models.lsreft import * 21 | from .models.steering_vector import * 22 | from .models.sae import * 23 | from .models.probe import * 24 | from .models.ig import * 25 | from .models.random import * 26 | from .models.mean import * 27 | from .models.prompt import * 28 | from .models.bow import * 29 | from .models.language_models import * 30 | from .models.preference_lora import * 31 | from .models.preference_reft import * 32 | from .models.concept_lora import * 33 | from .models.concept_reft import * 34 | from .models.preference_vector import * 35 | from .models.concept_vector import * 36 | from .models.hypersteer import * 37 | 38 | from .models.hypernet.configuration_hypernet import * 39 | from .models.hypernet.layers import * 40 | from .models.hypernet.modeling_hypernet import * 41 | from .models.hypernet.utils import * 42 | 43 | from .scripts.args.eval_args import * 44 | from .scripts.args.training_args import * 45 | from .scripts.args.dataset_args import * 46 | 47 | from .scripts.evaluate import * 48 | from .scripts.inference import * 49 | -------------------------------------------------------------------------------- /axbench/concept10/prod_2b_l10_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_2b_l10_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept10/prod_2b_l10_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_2b_l10_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_2b_l10_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_2b_l10_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_2b_l10_v1/inference/latent_inference_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_2b_l10_v1/inference/latent_inference_state.pkl -------------------------------------------------------------------------------- /axbench/concept10/prod_2b_l20_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_2b_l20_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept10/prod_2b_l20_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_2b_l20_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_2b_l20_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_2b_l20_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l20_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l20_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l20_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l20_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l20_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l20_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l20_v1/inference/latent_inference_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l20_v1/inference/latent_inference_state.pkl -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l31_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l31_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l31_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l31_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l31_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l31_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/concept10/prod_9b_l31_v1/inference/latent_inference_state.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept10/prod_9b_l31_v1/inference/latent_inference_state.pkl -------------------------------------------------------------------------------- /axbench/concept16k/README.md: -------------------------------------------------------------------------------- 1 | ## How to access the 16k concept dataset? 2 | 3 | 1. Download the dataset from Google Drive: [concept16k.zip](https://drive.google.com/file/d/1eNYXN0eAVmu2nuOqDNyiMI5IEq6faAcp/view?usp=sharing) 4 | 5 | 2. Unzip the downloaded file: 6 | 7 | ```bash 8 | unzip concept16k.zip 9 | ``` 10 | 11 | 3. Move the unzipped folder to the `axbench/data` directory: 12 | 13 | ```bash 14 | mv concept16k axbench/data/ 15 | ``` -------------------------------------------------------------------------------- /axbench/concept16k_v2/README.md: -------------------------------------------------------------------------------- 1 | ## How to access the 16k concept dataset v2? 2 | 3 | Please refer to our HuggingFace page: https://huggingface.co/datasets/pyvene/axbench-concept16k_v2. -------------------------------------------------------------------------------- /axbench/concept500/prod_2b_l10_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l10_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept500/prod_2b_l10_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l10_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/concept500/prod_2b_l10_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l10_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/concept500/prod_2b_l10_v1/inference/latent_inference_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l10_v1/inference/latent_inference_state.pkl -------------------------------------------------------------------------------- /axbench/concept500/prod_2b_l20_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l20_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept500/prod_2b_l20_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l20_v1/generate/train_data.parquet -------------------------------------------------------------------------------- 
/axbench/concept500/prod_2b_l20_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l20_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/concept500/prod_2b_l20_v1/inference/latent_inference_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_2b_l20_v1/inference/latent_inference_state.pkl -------------------------------------------------------------------------------- /axbench/concept500/prod_9b_l20_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l20_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept500/prod_9b_l20_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l20_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/concept500/prod_9b_l20_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l20_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/concept500/prod_9b_l20_v1/inference/latent_inference_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l20_v1/inference/latent_inference_state.pkl -------------------------------------------------------------------------------- /axbench/concept500/prod_9b_l31_v1/generate/generate_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l31_v1/generate/generate_state.pkl -------------------------------------------------------------------------------- /axbench/concept500/prod_9b_l31_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l31_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/concept500/prod_9b_l31_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l31_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- 
/axbench/concept500/prod_9b_l31_v1/inference/latent_inference_state.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/concept500/prod_9b_l31_v1/inference/latent_inference_state.pkl -------------------------------------------------------------------------------- /axbench/conceptFD/README.md: -------------------------------------------------------------------------------- 1 | ## What is ConceptFD? 2 | 3 | This folder contains training and evaluation data for a concept list that is from "Enhancing Automated Interpretability with Output-Centric Feature Descriptions" - Yoav Gur-Arieh, Roy Mayan, Chen Agassy, Atticus Geiger, Mor Geva. 2025. 4 | 5 | The goal is to evaluate SAEs on high-quality concept lists. -------------------------------------------------------------------------------- /axbench/conceptFD/prod_2b_l10_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/conceptFD/prod_2b_l10_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/conceptFD/prod_2b_l10_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/conceptFD/prod_2b_l10_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/conceptFD/prod_2b_l20_v1/generate/train_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/conceptFD/prod_2b_l20_v1/generate/train_data.parquet -------------------------------------------------------------------------------- /axbench/conceptFD/prod_2b_l20_v1/inference/latent_eval_data.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/conceptFD/prod_2b_l20_v1/inference/latent_eval_data.parquet -------------------------------------------------------------------------------- /axbench/data/download-2b.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the start and end layer numbers 4 | START=0 # Replace with the desired start layer 5 | END=25 # Replace with the desired end layer 6 | 7 | # Base URL for the file download 8 | BASE_URL="https://neuronpedia-exports.s3.amazonaws.com/explanations-only/gemma-2-" 9 | 10 | # Loop over the range of layers from START to END 11 | for (( i=START; i<=END; i++ )); do 12 | # Construct the URLs for both resolutions for the current layer 13 | FILE_URL_16K="${BASE_URL}2b_${i}-gemmascope-res-16k.json" 14 | FILE_URL_65K="${BASE_URL}2b_${i}-gemmascope-res-65k.json" 15 | 16 | # Download the file for 16k resolution 17 | echo "Downloading ${FILE_URL_16K}..." 18 | curl -O "${FILE_URL_16K}" 19 | 20 | # Download the file for 65k resolution 21 | echo "Downloading ${FILE_URL_65K}..." 22 | curl -O "${FILE_URL_65K}" 23 | done 24 | 25 | echo "Download completed." 
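# --- Added note (not part of the original script) ---
# Each export above is assumed to be a JSON list of explanation records with
# fields such as "modelId", "layer", "index", and "description" (the same schema
# that axbench/data/process-feature-description.py emits). An optional sanity
# check on one downloaded file might look like:
#
#   python -c "import json; d = json.load(open('gemma-2-2b_10-gemmascope-res-16k.json')); print(len(d), d[0]['description'])"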
-------------------------------------------------------------------------------- /axbench/data/download-9b.sh: -------------------------------------------------------------------------------- 1 | wget https://neuronpedia-exports.s3.amazonaws.com/explanations-only/gemma-2-9b-it_20-gemmascope-res-131k.json 2 | wget https://neuronpedia-exports.s3.amazonaws.com/explanations-only/gemma-2-9b-it_31-gemmascope-res-131k.json 3 | -------------------------------------------------------------------------------- /axbench/data/download-alpaca.sh: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/datasets/tatsu-lab/alpaca_eval/resolve/main/alpaca_eval.json 2 | -------------------------------------------------------------------------------- /axbench/data/download-others.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/yoavgur/Feature-Descriptions.git -------------------------------------------------------------------------------- /axbench/data/download-pls.sh: -------------------------------------------------------------------------------- 1 | 2 | wget https://neuronpedia-exports.s3.amazonaws.com/explanations-only/gemma-2-2b_20-gemmascope-res-65k.json 3 | wget https://neuronpedia-exports.s3.amazonaws.com/explanations-only/gemma-2-9b-it_20-gemmascope-res-131k.json 4 | wget https://nlp.stanford.edu/~wuzhengx/downloads/llama3.1-8b_20-llamascope-res-131k.json 5 | -------------------------------------------------------------------------------- /axbench/data/process-feature-description.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | def parse_feature_description_to_json(df, model, layer, component, width): 5 | all_raw_df = df[(df["Layer"] == layer) & (df["Sae Type"] == component) & (df["Sae Size"] == width)] 6 | high_quality_df = all_raw_df[ 7 | (all_raw_df["Ensemble Concat (All) Input Success"] == True) & 8 | (all_raw_df["Ensemble Concat (All) Output Success"] == True) 9 | ].reset_index(drop=True) 10 | 11 | output_json = [] 12 | for index, row in high_quality_df.iterrows(): 13 | output_json += [ 14 | { 15 | "modelId":model, 16 | "layer":f"{layer}-gemmascope-{component}-{width}", 17 | "index":row["Feature"], 18 | "description":row["Ensemble Raw (All) Description"], 19 | "explanationModelName":"FeatureDescription", 20 | "typeName":"oai_token-act-pair" 21 | } 22 | ] 23 | with open(f'{model}_{layer}-featured-{component}-{width}.json', 'w') as f: 24 | json.dump(output_json, f) 25 | 26 | df = pd.read_csv("./Feature-Descriptions/descriptions/gemma-2-2b.csv") 27 | parse_feature_description_to_json(df, "gemma-2-2b", 10, "res", "16k") 28 | 29 | df = pd.read_csv("./Feature-Descriptions/descriptions/gemma-2-2b.csv") 30 | parse_feature_description_to_json(df, "gemma-2-2b", 20, "res", "16k") -------------------------------------------------------------------------------- /axbench/demo/concepts.csv: -------------------------------------------------------------------------------- 1 | terms related to artificiality and deception,https://www.neuronpedia.org/gemma-2-2b/20-gemmascope-res-16k/8927 2 | terms related to employment and employees,https://www.neuronpedia.org/gemma-2-2b/20-gemmascope-res-16k/7490 3 | phrases related to beliefs and religion,https://www.neuronpedia.org/gemma-2-2b/20-gemmascope-res-16k/56 4 | references to sports statistics and performance 
metrics,https://www.neuronpedia.org/gemma-2-2b/20-gemmascope-res-16k/6535 5 | -------------------------------------------------------------------------------- /axbench/demo/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if nvidia-smi command exists 4 | if ! command -v nvidia-smi &> /dev/null; then 5 | echo "nvidia-smi could not be found. Please ensure NVIDIA drivers are installed." 6 | exit 1 7 | fi 8 | 9 | # Get the number of GPUs 10 | gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 11 | 12 | python axbench/scripts/generate.py --config axbench/demo/sweep/simple.yaml --dump_dir axbench/demo 13 | 14 | torchrun --nproc_per_node=$gpu_count axbench/scripts/train.py \ 15 | --config axbench/demo/sweep/simple.yaml --dump_dir axbench/demo 16 | 17 | torchrun --nproc_per_node=$gpu_count axbench/scripts/inference.py --config axbench/demo/sweep/simple.yaml --mode latent --dump_dir axbench/demo 18 | 19 | torchrun --nproc_per_node=$gpu_count axbench/scripts/inference.py --config axbench/demo/sweep/simple.yaml --mode steering --dump_dir axbench/demo 20 | 21 | python axbench/scripts/evaluate.py --config axbench/demo/sweep/simple.yaml --mode latent --dump_dir axbench/demo 22 | 23 | python axbench/scripts/evaluate.py --config axbench/demo/sweep/simple.yaml --mode steering --dump_dir axbench/demo 24 | -------------------------------------------------------------------------------- /axbench/demo/hypersteer_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if nvidia-smi command exists 4 | if ! command -v nvidia-smi &> /dev/null; then 5 | echo "nvidia-smi could not be found. Please ensure NVIDIA drivers are installed." 
6 | exit 1 7 | fi 8 | 9 | # Get the number of GPUs 10 | gpu_count=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) 11 | 12 | python axbench/scripts/generate.py --config axbench/demo/sweep/hypersteer_simple.yaml --dump_dir axbench/demo 13 | 14 | torchrun --nproc_per_node=$gpu_count axbench/scripts/train.py \ 15 | --config axbench/demo/sweep/hypersteer_simple.yaml --dump_dir axbench/demo 16 | 17 | torchrun --nproc_per_node=$gpu_count axbench/scripts/inference.py --config axbench/demo/sweep/hypersteer_simple.yaml --mode steering --dump_dir axbench/demo 18 | 19 | python axbench/scripts/evaluate.py --config axbench/demo/sweep/hypersteer_simple.yaml --mode steering --dump_dir axbench/demo 20 | -------------------------------------------------------------------------------- /axbench/evaluators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/evaluators/__init__.py -------------------------------------------------------------------------------- /axbench/evaluators/aucroc.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.metrics import roc_curve, auc 3 | import numpy as np 4 | from .evaluator import Evaluator 5 | 6 | 7 | class AUCROCEvaluator(Evaluator): 8 | def __init__(self, model_name, **kwargs): 9 | self.model_name = model_name 10 | 11 | def __str__(self): 12 | return 'AUCROCEvaluator' 13 | 14 | def compute_metrics( 15 | self, data, 16 | class_labels={"positive": 1, "negative": 0, "hard negative seen": 0, "hard negative unseen": 0}, 17 | write_to_dir=None): 18 | data = data.copy() 19 | 20 | # Normalize the activation columns 21 | max_acts = data[f'{self.model_name}_max_act'] 22 | global_max_act = max_acts.max() 23 | data['normalized_max'] = (max_acts - max_acts.min()) / (max_acts.max() - max_acts.min()) 24 | 25 | # Apply class labels 26 | data['label'] = data['category'].map(class_labels) 27 | filtered_data = data.dropna(subset=['label']) 28 | filtered_data.loc[:, 'normalized_max'] = filtered_data['normalized_max'].fillna(0) 29 | 30 | # Compute ROC metrics for max_act 31 | fpr, tpr, thresholds = roc_curve(filtered_data['label'], filtered_data['normalized_max']) 32 | roc_auc = auc(fpr, tpr) 33 | j_scores = tpr - fpr 34 | optimal_idx = np.argmax(j_scores) 35 | optimal_threshold = thresholds[optimal_idx] 36 | 37 | # Prepare output dictionary 38 | metrics = { 39 | "max_act": float(global_max_act), 40 | "roc_auc": float(roc_auc), 41 | "optimal_threshold": float(optimal_threshold), 42 | "roc_curve": { 43 | "fpr": fpr.tolist(), 44 | "tpr": tpr.tolist(), 45 | # "thresholds": thresholds.tolist() 46 | } 47 | } 48 | return metrics 49 | 50 | -------------------------------------------------------------------------------- /axbench/evaluators/evaluator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class Evaluator(ABC): 5 | 6 | def fit(self, examples): 7 | """ 8 | This is a placeholder in case then evaluator 9 | actually needs to be trained. 
10 | """ 11 | pass 12 | 13 | @abstractmethod 14 | def compute_metrics(self, examples): 15 | pass 16 | 17 | 18 | -------------------------------------------------------------------------------- /axbench/evaluators/ppl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from .evaluator import Evaluator 4 | 5 | 6 | class PerplexityEvaluator(Evaluator): 7 | def __init__(self, model_name, **kwargs): 8 | self.model_name = model_name 9 | 10 | def __str__(self): 11 | return 'PerplexityEvaluator' 12 | 13 | def compute_metrics(self, data, write_to_dir=None): 14 | data = data.copy() 15 | metrics = { 16 | "perplexity": [], 17 | "strength": [], 18 | "factor": [] 19 | } 20 | 21 | # group by factor only and compute means 22 | grouped = data.groupby("factor") 23 | for factor, group in grouped: 24 | perplexity = group[f"{self.model_name}_perplexity"].mean() 25 | metrics["perplexity"].append(perplexity) 26 | metrics["factor"].append(factor) 27 | if f"{self.model_name}_strength" in group.columns: 28 | strength = group[f"{self.model_name}_strength"].mean() 29 | metrics["strength"].append(strength) 30 | return metrics 31 | -------------------------------------------------------------------------------- /axbench/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/models/__init__.py -------------------------------------------------------------------------------- /axbench/models/hypernet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/models/hypernet/__init__.py -------------------------------------------------------------------------------- /axbench/rm_demo.sh: -------------------------------------------------------------------------------- 1 | rm -r axbench/demo/generate 2 | rm -r axbench/demo/train 3 | rm -r axbench/demo/inference 4 | rm -r axbench/demo/evaluate 5 | rm -r axbench/demo/lm_cache 6 | -------------------------------------------------------------------------------- /axbench/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/scripts/__init__.py -------------------------------------------------------------------------------- /axbench/scripts/args/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/scripts/args/__init__.py -------------------------------------------------------------------------------- /axbench/sweep/aryaman/sae_selection_2b_l10_concept10.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 1 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | GemmaScopeSAEMaxAUC: 19 | batch_size: 6 20 | n_epochs: 1 21 | 
binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["GemmaScopeSAEMaxAUC"] 26 | model_name: "google/gemma-2-2b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 16 31 | # steering related params 32 | steering_intervention_type: "addition" # clamping 33 | steering_model_name: "google/gemma-2-2b-it" 34 | steering_datasets: ["AlpacaEval"] 35 | steering_batch_size: 10 36 | steering_output_length: 128 37 | steering_layers: [10] 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | evaluate: 47 | models: ["GemmaScopeSAEMaxAUC"] 48 | latent_evaluators: [ 49 | "AUCROCEvaluator", 50 | "HardNegativeEvaluator", 51 | ] 52 | steering_evaluators: [ 53 | "PerplexityEvaluator", 54 | "LMJudgeEvaluator", 55 | ] 56 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 57 | # Number of processes to run in parallel for steering evaluation. 58 | num_of_workers: 32 59 | lm_model: "gpt-4o-mini" 60 | run_winrate: false 61 | winrate_baseline: "PromptSteering" 62 | # master data dir is shared across all jobs. 63 | master_data_dir: "axbench/data" 64 | -------------------------------------------------------------------------------- /axbench/sweep/aryaman/sae_selection_9b_l20.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | GemmaScopeSAEMaxAUC: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | # GemmaScopeSAEMaxDiff: 24 | # batch_size: 6 25 | # n_epochs: 1 26 | # binarize_dataset: true 27 | # low_rank_dimension: 1 28 | inference: 29 | use_bf16: true 30 | # models: ["GemmaScopeSAEMaxAUC", "GemmaScopeSAEMaxDiff"] 31 | models: ["GemmaScopeSAEMaxAUC"] 32 | model_name: "google/gemma-2-9b-it" 33 | # latent related params 34 | output_length: 128 35 | latent_num_of_examples: 36 36 | latent_batch_size: 16 37 | # steering related params 38 | steering_intervention_type: "addition" # clamping 39 | steering_model_name: "google/gemma-2-9b-it" 40 | steering_datasets: ["AlpacaEval"] 41 | steering_batch_size: 5 42 | steering_output_length: 128 43 | steering_layers: [20] 44 | steering_num_of_examples: 10 # number of examples per concept and per factor 45 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 46 | # master data dir is shared across all jobs. 
47 | master_data_dir: "axbench/data" 48 | seed: 42 49 | lm_model: "gpt-4o-mini" 50 | # generation related params 51 | temperature: 1.0 52 | evaluate: 53 | # models: ["GemmaScopeSAEMaxAUC", "GemmaScopeSAEMaxDiff"] 54 | models: ["GemmaScopeSAEMaxAUC"] 55 | latent_evaluators: [ 56 | "AUCROCEvaluator", 57 | "HardNegativeEvaluator", 58 | ] 59 | steering_evaluators: [ 60 | "PerplexityEvaluator", 61 | "LMJudgeEvaluator", 62 | ] 63 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 64 | # Number of processes to run in parallel for steering evaluation. 65 | num_of_workers: 32 66 | lm_model: "gpt-4o-mini" 67 | run_winrate: false 68 | winrate_baseline: "PromptSteering" 69 | # master data dir is shared across all jobs. 70 | master_data_dir: "axbench/data" 71 | -------------------------------------------------------------------------------- /axbench/sweep/aryaman/simple.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 1 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | # inference: 33 | # use_bf16: true 34 | # models: ["LsReFT"] 35 | # model_name: "google/gemma-2-2b-it" 36 | # # latent related params 37 | # output_length: 128 38 | # latent_num_of_examples: 20 39 | # latent_batch_size: 16 40 | # # steering related params 41 | # steering_intervention_type: "addition" # clamping 42 | # steering_model_name: "google/gemma-2-2b-it" 43 | # steering_datasets: ["AlpacaEval"] 44 | # steering_batch_size: 10 45 | # steering_output_length: 128 46 | # steering_layers: [10] 47 | # steering_num_of_examples: 10 # number of examples per concept and per factor 48 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # # master data dir is shared across all jobs. 50 | # master_data_dir: "axbench/data" 51 | # seed: 42 52 | # lm_model: "gpt-4o-mini" 53 | # # generation related params 54 | # temperature: 1.0 55 | # evaluate: 56 | # models: ["LsReFT"] 57 | # latent_evaluators: [ 58 | # "AUCROCEvaluator", 59 | # "HardNegativeEvaluator", 60 | # ] 61 | # steering_evaluators: [ 62 | # "PerplexityEvaluator", 63 | # "LMJudgeEvaluator", 64 | # ] 65 | # winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # # Number of processes to run in parallel for steering evaluation. 67 | # num_of_workers: 32 68 | # lm_model: "gpt-4o-mini" 69 | # run_winrate: false 70 | # winrate_baseline: "PromptSteering" 71 | # # master data dir is shared across all jobs. 
72 | # master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/bow.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | BoW: 19 | binarize_dataset: true 20 | bow_penalty: "l2" # l1, l2 21 | bow_C: 100 # 0.001, 0.01, 0.1, 1, 10, 100 22 | inference: 23 | use_bf16: true 24 | models: ["BoW"] 25 | model_name: "google/gemma-2-2b-it" 26 | # latent related params 27 | output_length: 128 28 | latent_num_of_examples: 36 29 | latent_batch_size: 16 30 | # steering related params 31 | steering_intervention_type: "addition" # clamping 32 | steering_model_name: "google/gemma-2-2b-it" 33 | steering_datasets: ["AlpacaEval"] 34 | steering_batch_size: 10 35 | steering_output_length: 128 36 | steering_layers: [10] 37 | steering_num_of_examples: 10 # number of examples per concept and per factor 38 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 39 | # master data dir is shared across all jobs. 40 | master_data_dir: "axbench/data" 41 | seed: 42 42 | lm_model: "gpt-4o-mini" 43 | # generation related params 44 | temperature: 1.0 45 | evaluate: 46 | models: ["BoW"] 47 | latent_evaluators: [ 48 | "AUCROCEvaluator", 49 | "HardNegativeEvaluator", 50 | ] 51 | steering_evaluators: [ 52 | "PerplexityEvaluator", 53 | "LMJudgeEvaluator", 54 | ] 55 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 56 | # Number of processes to run in parallel for steering evaluation. 57 | num_of_workers: 32 58 | lm_model: "gpt-4o-mini" 59 | run_winrate: false 60 | winrate_baseline: "PromptSteering" 61 | # master data dir is shared across all jobs. 
62 | master_data_dir: "axbench/data" 63 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/gemmascope_axbench_max_act.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | disable_neuronpedia_max_act: true 27 | steering_intervention_type: "addition" 28 | steering_model_name: "google/gemma-2-2b-it" 29 | steering_datasets: ["AlpacaEval"] 30 | steering_batch_size: 10 31 | steering_output_length: 128 32 | steering_layers: [10] 33 | steering_num_of_examples: 10 # number of examples per concept and per factor 34 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 35 | # steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 36 | # master data dir is shared across all jobs. 37 | master_data_dir: "axbench/data" 38 | seed: 42 39 | lm_model: "gpt-4o-mini" 40 | # generation related params 41 | temperature: 1.0 42 | evaluate: 43 | models: ["GemmaScopeSAE"] 44 | latent_evaluators: [ 45 | "AUCROCEvaluator", 46 | "HardNegativeEvaluator", 47 | ] 48 | steering_evaluators: [ 49 | "PerplexityEvaluator", 50 | "LMJudgeEvaluator", 51 | ] 52 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 53 | # Number of processes to run in parallel for steering evaluation. 54 | num_of_workers: 32 55 | lm_model: "gpt-4o-mini" 56 | run_winrate: false 57 | winrate_baseline: "PromptSteering" 58 | # master data dir is shared across all jobs. 
59 | master_data_dir: "axbench/data" 60 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/gemmascope_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "clamping" # clamping 27 | steering_model_name: "google/gemma-2-2b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 10 30 | steering_output_length: 128 31 | steering_layers: [10] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/gemmascope_fd.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-featured-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["PromptSteering", "GemmaScopeSAE"] 26 | model_name: "google/gemma-2-2b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 16 31 | # steering related params 32 | steering_intervention_type: "addition" # clamping 33 | steering_model_name: "google/gemma-2-2b-it" 34 | steering_datasets: ["AlpacaEval"] 35 | steering_batch_size: 10 36 | steering_output_length: 128 37 | steering_layers: [10] 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | evaluate: 47 | models: ["PromptSteering", "GemmaScopeSAE"] 48 | latent_evaluators: [ 49 | "AUCROCEvaluator", 50 | "HardNegativeEvaluator", 51 | ] 52 | steering_evaluators: [ 53 | "PerplexityEvaluator", 54 | "LMJudgeEvaluator", 55 | ] 56 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 57 | # Number of processes to run in parallel for steering evaluation. 58 | num_of_workers: 32 59 | lm_model: "gpt-4o-mini" 60 | run_winrate: false 61 | winrate_baseline: "PromptSteering" 62 | # master data dir is shared across all jobs. 
63 | master_data_dir: "axbench/data" 64 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/gemmascope_min_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "min_clamping" # clamping 27 | steering_model_name: "google/gemma-2-2b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 10 30 | steering_output_length: 128 31 | steering_layers: [10] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/lora.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LoRA: 19 | batch_size: 18 20 | gradient_accumulation_steps: 2 21 | n_epochs: 24 22 | lr: 0.0009 23 | weight_decay: 0.00 24 | low_rank_dimension: 4 25 | lora_layers: [5, 10, 15, 20] 26 | lora_components: ["o_proj"] 27 | lora_alpha: 32 28 | binarize_dataset: false 29 | train_on_negative: false 30 | exclude_bos: true 31 | inference: 32 | use_bf16: true 33 | models: ["LoRA"] 34 | model_name: "google/gemma-2-2b-it" 35 | # latent related params 36 | output_length: 128 37 | latent_num_of_examples: 36 38 | latent_batch_size: 16 39 | # steering related params 40 | steering_intervention_type: "addition" # clamping 41 | steering_model_name: "google/gemma-2-2b-it" 42 | steering_datasets: ["AlpacaEval"] 43 | steering_batch_size: 10 44 | steering_output_length: 128 45 | steering_layers: [10] 46 | steering_num_of_examples: 10 # number of examples per concept and per factor 47 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 48 | # master data dir is shared across all jobs. 49 | master_data_dir: "axbench/data" 50 | seed: 42 51 | lm_model: "gpt-4o-mini" 52 | # generation related params 53 | temperature: 1.0 54 | evaluate: 55 | models: ["LoRA"] 56 | latent_evaluators: [ 57 | "AUCROCEvaluator", 58 | "HardNegativeEvaluator", 59 | ] 60 | steering_evaluators: [ 61 | "PerplexityEvaluator", 62 | "LMJudgeEvaluator", 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/lsreft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-2b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-2b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 10 45 | steering_output_length: 128 46 | steering_layers: [10] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/lsreft_fd.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-featured-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-2b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-2b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 10 45 | steering_output_length: 128 46 | steering_layers: [10] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/lsreft_scaling_law.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 10 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | max_num_of_examples: 144 # [6, 12, 24, 48, 72, 96, 120, 144] 18 | models: 19 | LsReFT: 20 | batch_size: 6 21 | gradient_accumulation_steps: 1 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-2b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-2b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 10 45 | steering_output_length: 128 46 | steering_layers: [10] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/lsreft_synergy.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | use_synergy: true 33 | inference: 34 | use_bf16: true 35 | models: ["LsReFT"] 36 | model_name: "google/gemma-2-2b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-2b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 10 46 | steering_output_length: 128 47 | steering_layers: [10] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["LsReFT"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/no_grad.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | PCA: 24 | batch_size: 6 25 | n_epochs: 1 26 | binarize_dataset: true 27 | low_rank_dimension: 1 28 | LAT: 29 | batch_size: 6 30 | n_epochs: 1 31 | binarize_dataset: true 32 | low_rank_dimension: 1 33 | inference: 34 | use_bf16: true 35 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 36 | model_name: "google/gemma-2-2b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-2b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 10 46 | steering_output_length: 128 47 | steering_layers: [10] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/probe.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LinearProbe: 19 | batch_size: 12 20 | gradient_accumulation_steps: 4 21 | n_epochs: 24 22 | lr: 0.005 23 | weight_decay: 0.001 24 | coeff_l1_loss: 0.000 25 | binarize_dataset: true 26 | low_rank_dimension: 1 27 | inference: 28 | use_bf16: true 29 | models: ["LinearProbe"] 30 | model_name: "google/gemma-2-2b-it" 31 | # latent related params 32 | output_length: 128 33 | latent_num_of_examples: 36 34 | latent_batch_size: 16 35 | # steering related params 36 | steering_intervention_type: "addition" # clamping 37 | steering_model_name: "google/gemma-2-2b-it" 38 | steering_datasets: ["AlpacaEval"] 39 | steering_batch_size: 10 40 | steering_output_length: 128 41 | steering_layers: [10] 42 | steering_num_of_examples: 10 # number of examples per concept and per factor 43 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 44 | # master data dir is shared across all jobs. 45 | master_data_dir: "axbench/data" 46 | seed: 42 47 | lm_model: "gpt-4o-mini" 48 | # generation related params 49 | temperature: 1.0 50 | evaluate: 51 | models: ["LinearProbe"] 52 | latent_evaluators: [ 53 | "AUCROCEvaluator", 54 | "HardNegativeEvaluator", 55 | ] 56 | steering_evaluators: [ 57 | "PerplexityEvaluator", 58 | "LMJudgeEvaluator", 59 | ] 60 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 61 | # Number of processes to run in parallel for steering evaluation. 62 | num_of_workers: 32 63 | lm_model: "gpt-4o-mini" 64 | run_winrate: false 65 | winrate_baseline: "PromptSteering" 66 | # master data dir is shared across all jobs. 
67 | master_data_dir: "axbench/data" 68 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/prompt_detection.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["PromptDetection"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 256 25 | # steering related params 26 | steering_intervention_type: "addition" # clamping 27 | steering_model_name: "google/gemma-2-2b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 10 30 | steering_output_length: 128 31 | steering_layers: [10] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | # master data dir is shared across all jobs. 35 | master_data_dir: "axbench/data" 36 | seed: 42 37 | lm_model: "gpt-4o-mini" 38 | # generation related params 39 | temperature: 1.0 40 | evaluate: 41 | models: ["PromptDetection"] 42 | latent_evaluators: [ 43 | "AUCROCEvaluator", 44 | "HardNegativeEvaluator", 45 | ] 46 | steering_evaluators: [ 47 | "PerplexityEvaluator", 48 | "LMJudgeEvaluator", 49 | ] 50 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 51 | # Number of processes to run in parallel for steering evaluation. 52 | num_of_workers: 32 53 | lm_model: "gpt-4o-mini" 54 | run_winrate: false 55 | winrate_baseline: "PromptSteering" 56 | # master data dir is shared across all jobs. 
57 | master_data_dir: "axbench/data" 58 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/sft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | max_concepts: 20 18 | models: 19 | SFT: 20 | batch_size: 18 21 | gradient_accumulation_steps: 4 22 | n_epochs: 8 23 | lr: 0.00004 24 | weight_decay: 0.00 25 | binarize_dataset: false 26 | train_on_negative: false 27 | exclude_bos: true 28 | inference: 29 | use_bf16: true 30 | models: ["SFT"] 31 | model_name: "google/gemma-2-2b-it" 32 | # latent related params 33 | output_length: 128 34 | latent_num_of_examples: 36 35 | latent_batch_size: 16 36 | # steering related params 37 | steering_intervention_type: "addition" # clamping 38 | steering_model_name: "google/gemma-2-2b-it" 39 | steering_datasets: ["AlpacaEval"] 40 | steering_batch_size: 10 41 | steering_output_length: 128 42 | steering_layers: [10] 43 | steering_num_of_examples: 10 # number of examples per concept and per factor 44 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 45 | # master data dir is shared across all jobs. 46 | master_data_dir: "axbench/data" 47 | seed: 42 48 | lm_model: "gpt-4o-mini" 49 | # generation related params 50 | temperature: 1.0 51 | evaluate: 52 | models: ["SFT"] 53 | latent_evaluators: [ 54 | "AUCROCEvaluator", 55 | "HardNegativeEvaluator", 56 | ] 57 | steering_evaluators: [ 58 | "PerplexityEvaluator", 59 | "LMJudgeEvaluator", 60 | ] 61 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 62 | # Number of processes to run in parallel for steering evaluation. 63 | num_of_workers: 32 64 | lm_model: "gpt-4o-mini" 65 | run_winrate: false 66 | winrate_baseline: "PromptSteering" 67 | # master data dir is shared across all jobs. 
68 | master_data_dir: "axbench/data" 69 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/simple_prompt_steering.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["SimplePromptSteering"] 26 | model_name: "google/gemma-2-2b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 16 31 | # steering related params 32 | steering_intervention_type: "addition" # clamping 33 | steering_model_name: "google/gemma-2-2b-it" 34 | steering_datasets: ["AlpacaEval"] 35 | steering_batch_size: 10 36 | steering_output_length: 128 37 | steering_layers: [10] 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | evaluate: 47 | models: ["SimplePromptSteering"] 48 | latent_evaluators: [ 49 | "AUCROCEvaluator", 50 | "HardNegativeEvaluator", 51 | ] 52 | steering_evaluators: [ 53 | "PerplexityEvaluator", 54 | "LMJudgeEvaluator", 55 | ] 56 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 57 | # Number of processes to run in parallel for steering evaluation. 58 | num_of_workers: 32 59 | lm_model: "gpt-4o-mini" 60 | run_winrate: false 61 | winrate_baseline: "PromptSteering" 62 | # master data dir is shared across all jobs. 63 | master_data_dir: "axbench/data" 64 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l10/steering_vec.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 10 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | SteeringVector: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | low_rank_dimension: 1 25 | intervention_positions: "all" 26 | intervention_type: "addition" # clamping 27 | binarize_dataset: false 28 | train_on_negative: false 29 | exclude_bos: true 30 | inference: 31 | use_bf16: true 32 | # if you are only verifying latents, you don't have to include steering models.
33 | models: ["SteeringVector"] 34 | model_name: "google/gemma-2-2b-it" 35 | # latent related params 36 | output_length: 128 37 | latent_num_of_examples: 36 38 | latent_batch_size: 16 39 | # steering related params 40 | steering_intervention_type: "addition" # clamping 41 | steering_model_name: "google/gemma-2-2b-it" 42 | steering_datasets: ["AlpacaEval"] 43 | steering_batch_size: 10 44 | steering_output_length: 128 45 | steering_layers: [10] 46 | steering_num_of_examples: 10 # number of examples per concept and per factor 47 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 48 | # master data dir is shared across all jobs. 49 | master_data_dir: "axbench/data" 50 | seed: 42 51 | lm_model: "gpt-4o-mini" 52 | # generation related params 53 | temperature: 1.0 54 | evaluate: 55 | models: ["SteeringVector"] 56 | latent_evaluators: [ 57 | "AUCROCEvaluator", 58 | "HardNegativeEvaluator", 59 | ] 60 | steering_evaluators: [ 61 | "PerplexityEvaluator", 62 | "LMJudgeEvaluator", 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 71 | master_data_dir: "axbench/data" 72 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/16k_diffmean.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["DiffMean"] 26 | model_name: "google/gemma-2-2b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 36 31 | imbalance_factor: 2 32 | # steering related params 33 | steering_intervention_type: "addition" # clamping 34 | steering_model_name: "google/gemma-2-2b-it" 35 | steering_datasets: ["AlpacaEval"] 36 | steering_batch_size: 10 37 | steering_output_length: 128 38 | steering_layers: [20] 39 | steering_num_of_examples: 10 # number of examples per concept and per factor 40 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 41 | # master data dir is shared across all jobs. 
42 | master_data_dir: "axbench/data" 43 | seed: 42 44 | lm_model: "gpt-4o-mini" 45 | # generation related params 46 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/16k_diffmean_crossfit.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/16k_lsreft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-2b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 36 40 | imbalance_factor: 2 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-2b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 10 46 | steering_output_length: 128 47 | steering_layers: [20] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 
51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/16k_lsreft_crossfit.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/bow.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | BoW: 19 | binarize_dataset: true 20 | bow_penalty: "l2" # l1, l2 21 | bow_C: 100 # 0.001, 0.01, 0.1, 1, 10, 100 22 | inference: 23 | use_bf16: true 24 | models: ["BoW"] 25 | model_name: "google/gemma-2-2b-it" 26 | # latent related params 27 | output_length: 128 28 | latent_num_of_examples: 36 29 | latent_batch_size: 16 30 | # steering related params 31 | steering_intervention_type: "addition" # clamping 32 | steering_model_name: "google/gemma-2-2b-it" 33 | steering_datasets: ["AlpacaEval"] 34 | steering_batch_size: 10 35 | steering_output_length: 128 36 | steering_layers: [20] 37 | steering_num_of_examples: 10 # number of examples per concept and per factor 38 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 39 | # master data dir is shared across all jobs. 40 | master_data_dir: "axbench/data" 41 | seed: 42 42 | lm_model: "gpt-4o-mini" 43 | # generation related params 44 | temperature: 1.0 45 | evaluate: 46 | models: ["BoW"] 47 | latent_evaluators: [ 48 | "AUCROCEvaluator", 49 | "HardNegativeEvaluator", 50 | ] 51 | steering_evaluators: [ 52 | "PerplexityEvaluator", 53 | "LMJudgeEvaluator", 54 | ] 55 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 56 | # Number of processes to run in parallel for steering evaluation. 57 | num_of_workers: 32 58 | lm_model: "gpt-4o-mini" 59 | run_winrate: false 60 | winrate_baseline: "PromptSteering" 61 | # master data dir is shared across all jobs. 
62 | master_data_dir: "axbench/data" 63 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/gemmascope_axbench_max_act.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | disable_neuronpedia_max_act: true 27 | steering_intervention_type: "addition" 28 | steering_model_name: "google/gemma-2-2b-it" 29 | steering_datasets: ["AlpacaEval"] 30 | steering_batch_size: 10 31 | steering_output_length: 128 32 | steering_layers: [20] 33 | steering_num_of_examples: 10 # number of examples per concept and per factor 34 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 35 | # steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 36 | # master data dir is shared across all jobs. 37 | master_data_dir: "axbench/data" 38 | seed: 42 39 | lm_model: "gpt-4o-mini" 40 | # generation related params 41 | temperature: 1.0 42 | evaluate: 43 | models: ["GemmaScopeSAE"] 44 | latent_evaluators: [ 45 | "AUCROCEvaluator", 46 | "HardNegativeEvaluator", 47 | ] 48 | steering_evaluators: [ 49 | "PerplexityEvaluator", 50 | "LMJudgeEvaluator", 51 | ] 52 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 53 | # Number of processes to run in parallel for steering evaluation. 54 | num_of_workers: 32 55 | lm_model: "gpt-4o-mini" 56 | run_winrate: false 57 | winrate_baseline: "PromptSteering" 58 | # master data dir is shared across all jobs. 
59 | master_data_dir: "axbench/data" 60 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/gemmascope_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "clamping" # clamping 27 | steering_model_name: "google/gemma-2-2b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 10 30 | steering_output_length: 128 31 | steering_layers: [20] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/gemmascope_fd.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-featured-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["PromptSteering", "GemmaScopeSAE"] 26 | model_name: "google/gemma-2-2b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 16 31 | # steering related params 32 | steering_intervention_type: "addition" # clamping 33 | steering_model_name: "google/gemma-2-2b-it" 34 | steering_datasets: ["AlpacaEval"] 35 | steering_batch_size: 10 36 | steering_output_length: 128 37 | steering_layers: [20] 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | evaluate: 47 | models: ["PromptSteering", "GemmaScopeSAE"] 48 | latent_evaluators: [ 49 | "AUCROCEvaluator", 50 | "HardNegativeEvaluator", 51 | ] 52 | steering_evaluators: [ 53 | "PerplexityEvaluator", 54 | "LMJudgeEvaluator", 55 | ] 56 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 57 | # Number of processes to run in parallel for steering evaluation. 58 | num_of_workers: 32 59 | lm_model: "gpt-4o-mini" 60 | run_winrate: false 61 | winrate_baseline: "PromptSteering" 62 | # master data dir is shared across all jobs. 
63 | master_data_dir: "axbench/data" 64 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/gemmascope_min_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "min_clamping" # clamping 27 | steering_model_name: "google/gemma-2-2b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 10 30 | steering_output_length: 128 31 | steering_layers: [20] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/lora.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LoRA: 19 | batch_size: 18 20 | gradient_accumulation_steps: 2 21 | n_epochs: 24 22 | lr: 0.0009 23 | weight_decay: 0.00 24 | low_rank_dimension: 4 25 | lora_layers: [5, 10, 15, 20] 26 | lora_components: ["o_proj"] 27 | lora_alpha: 32 28 | binarize_dataset: false 29 | train_on_negative: false 30 | exclude_bos: true 31 | inference: 32 | use_bf16: true 33 | models: ["LoRA"] 34 | model_name: "google/gemma-2-2b-it" 35 | # latent related params 36 | output_length: 128 37 | latent_num_of_examples: 36 38 | latent_batch_size: 16 39 | # steering related params 40 | steering_intervention_type: "addition" # clamping 41 | steering_model_name: "google/gemma-2-2b-it" 42 | steering_datasets: ["AlpacaEval"] 43 | steering_batch_size: 10 44 | steering_output_length: 128 45 | steering_layers: [20] 46 | steering_num_of_examples: 10 # number of examples per concept and per factor 47 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 48 | # master data dir is shared across all jobs. 49 | master_data_dir: "axbench/data" 50 | seed: 42 51 | lm_model: "gpt-4o-mini" 52 | # generation related params 53 | temperature: 1.0 54 | evaluate: 55 | models: ["LoRA"] 56 | latent_evaluators: [ 57 | "AUCROCEvaluator", 58 | "HardNegativeEvaluator", 59 | ] 60 | steering_evaluators: [ 61 | "PerplexityEvaluator", 62 | "LMJudgeEvaluator", 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/lsreft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-2b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-2b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 10 45 | steering_output_length: 128 46 | steering_layers: [20] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/lsreft_fd.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-featured-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-2b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-2b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 10 45 | steering_output_length: 128 46 | steering_layers: [20] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/lsreft_scaling_law.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 10 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | max_num_of_examples: 144 # [6, 12, 24, 48, 72, 96, 120, 144] 18 | models: 19 | LsReFT: 20 | batch_size: 6 21 | gradient_accumulation_steps: 1 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-2b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-2b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 10 45 | steering_output_length: 128 46 | steering_layers: [20] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/lsreft_synergy.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | use_synergy: true 33 | inference: 34 | use_bf16: true 35 | models: ["LsReFT"] 36 | model_name: "google/gemma-2-2b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-2b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 10 46 | steering_output_length: 128 47 | steering_layers: [20] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["LsReFT"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
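# The LsReFT block in this file matches the one in lsreft_fd.yaml except for the
# added use_synergy: true flag; the inference and evaluate sections are shared
# with the other 2b l20 LsReFT sweeps.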
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/no_grad.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | PCA: 24 | batch_size: 6 25 | n_epochs: 1 26 | binarize_dataset: true 27 | low_rank_dimension: 1 28 | LAT: 29 | batch_size: 6 30 | n_epochs: 1 31 | binarize_dataset: true 32 | low_rank_dimension: 1 33 | inference: 34 | use_bf16: true 35 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 36 | model_name: "google/gemma-2-2b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-2b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 10 46 | steering_output_length: 128 47 | steering_layers: [20] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
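# Presumably "no_grad" groups the baselines that need no gradient-based training:
# DiffMean, PCA and LAT are fit in a single pass over binarized data
# (n_epochs: 1, low_rank_dimension: 1), while PromptSteering and GemmaScopeSAE
# appear only at inference and evaluation time and have no train block at all.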
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/probe.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LinearProbe: 19 | batch_size: 12 20 | gradient_accumulation_steps: 4 21 | n_epochs: 24 22 | lr: 0.005 23 | weight_decay: 0.001 24 | coeff_l1_loss: 0.000 25 | binarize_dataset: true 26 | low_rank_dimension: 1 27 | inference: 28 | use_bf16: true 29 | models: ["LinearProbe"] 30 | model_name: "google/gemma-2-2b-it" 31 | # latent related params 32 | output_length: 128 33 | latent_num_of_examples: 36 34 | latent_batch_size: 16 35 | # steering related params 36 | steering_intervention_type: "addition" # clamping 37 | steering_model_name: "google/gemma-2-2b-it" 38 | steering_datasets: ["AlpacaEval"] 39 | steering_batch_size: 10 40 | steering_output_length: 128 41 | steering_layers: [20] 42 | steering_num_of_examples: 10 # number of examples per concept and per factor 43 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 44 | # master data dir is shared across all jobs. 45 | master_data_dir: "axbench/data" 46 | seed: 42 47 | lm_model: "gpt-4o-mini" 48 | # generation related params 49 | temperature: 1.0 50 | evaluate: 51 | models: ["LinearProbe"] 52 | latent_evaluators: [ 53 | "AUCROCEvaluator", 54 | "HardNegativeEvaluator", 55 | ] 56 | steering_evaluators: [ 57 | "PerplexityEvaluator", 58 | "LMJudgeEvaluator", 59 | ] 60 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 61 | # Number of processes to run in parallel for steering evaluation. 62 | num_of_workers: 32 63 | lm_model: "gpt-4o-mini" 64 | run_winrate: false 65 | winrate_baseline: "PromptSteering" 66 | # master data dir is shared across all jobs. 
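# Rough steering budget, assuming the grid expands as the comments above suggest:
# 10 examples per concept per factor x 14 steering_factors gives about
# 10 * 14 = 140 steered generations per concept for each evaluated method.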
67 | master_data_dir: "axbench/data" 68 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/prompt_detection.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["PromptDetection"] 20 | model_name: "google/gemma-2-2b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 256 25 | # steering related params 26 | steering_intervention_type: "addition" # clamping 27 | steering_model_name: "google/gemma-2-2b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 10 30 | steering_output_length: 128 31 | steering_layers: [20] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | # master data dir is shared across all jobs. 35 | master_data_dir: "axbench/data" 36 | seed: 42 37 | lm_model: "gpt-4o-mini" 38 | # generation related params 39 | temperature: 1.0 40 | evaluate: 41 | models: ["PromptDetection"] 42 | latent_evaluators: [ 43 | "AUCROCEvaluator", 44 | "HardNegativeEvaluator", 45 | ] 46 | steering_evaluators: [ 47 | "PerplexityEvaluator", 48 | "LMJudgeEvaluator", 49 | ] 50 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 51 | # Number of processes to run in parallel for steering evaluation. 52 | num_of_workers: 32 53 | lm_model: "gpt-4o-mini" 54 | run_winrate: false 55 | winrate_baseline: "PromptSteering" 56 | # master data dir is shared across all jobs. 
57 | master_data_dir: "axbench/data" 58 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/sft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | max_concepts: 20 18 | models: 19 | SFT: 20 | batch_size: 18 21 | gradient_accumulation_steps: 4 22 | n_epochs: 8 23 | lr: 0.00004 24 | weight_decay: 0.00 25 | binarize_dataset: false 26 | train_on_negative: false 27 | exclude_bos: true 28 | inference: 29 | use_bf16: true 30 | models: ["SFT"] 31 | model_name: "google/gemma-2-2b-it" 32 | # latent related params 33 | output_length: 128 34 | latent_num_of_examples: 36 35 | latent_batch_size: 16 36 | # steering related params 37 | steering_intervention_type: "addition" # clamping 38 | steering_model_name: "google/gemma-2-2b-it" 39 | steering_datasets: ["AlpacaEval"] 40 | steering_batch_size: 10 41 | steering_output_length: 128 42 | steering_layers: [20] 43 | steering_num_of_examples: 10 # number of examples per concept and per factor 44 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 45 | # master data dir is shared across all jobs. 46 | master_data_dir: "axbench/data" 47 | seed: 42 48 | lm_model: "gpt-4o-mini" 49 | # generation related params 50 | temperature: 1.0 51 | evaluate: 52 | models: ["SFT"] 53 | latent_evaluators: [ 54 | "AUCROCEvaluator", 55 | "HardNegativeEvaluator", 56 | ] 57 | steering_evaluators: [ 58 | "PerplexityEvaluator", 59 | "LMJudgeEvaluator", 60 | ] 61 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 62 | # Number of processes to run in parallel for steering evaluation. 63 | num_of_workers: 32 64 | lm_model: "gpt-4o-mini" 65 | run_winrate: false 66 | winrate_baseline: "PromptSteering" 67 | # master data dir is shared across all jobs. 
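# Note: train.max_concepts is capped at 20 here even though generation covers up
# to 500 concepts, presumably because SFT updates the full model per concept and
# is far more expensive to run than the vector- or ReFT-based baselines.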
68 | master_data_dir: "axbench/data" 69 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/simple_prompt_steering.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["SimplePromptSteering"] 26 | model_name: "google/gemma-2-2b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 16 31 | # steering related params 32 | steering_intervention_type: "addition" # clamping 33 | steering_model_name: "google/gemma-2-2b-it" 34 | steering_datasets: ["AlpacaEval"] 35 | steering_batch_size: 10 36 | steering_output_length: 128 37 | steering_layers: [20] 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | evaluate: 47 | models: ["SimplePromptSteering"] 48 | latent_evaluators: [ 49 | "AUCROCEvaluator", 50 | "HardNegativeEvaluator", 51 | ] 52 | steering_evaluators: [ 53 | "PerplexityEvaluator", 54 | "LMJudgeEvaluator", 55 | ] 56 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 57 | # Number of processes to run in parallel for steering evaluation. 58 | num_of_workers: 32 59 | lm_model: "gpt-4o-mini" 60 | run_winrate: false 61 | winrate_baseline: "PromptSteering" 62 | # master data dir is shared across all jobs. 63 | master_data_dir: "axbench/data" 64 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/2b/l20/steering_vec.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | SteeringVector: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | low_rank_dimension: 1 25 | intervention_positions: "all" 26 | intervention_type: "addition" # clamping 27 | binarize_dataset: false 28 | train_on_negative: false 29 | exclude_bos: true 30 | inference: 31 | use_bf16: true 32 | # if you are verifying latent, you dont have to include steering models. 
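# (i.e. for a detection-only run this models list could presumably be trimmed to
# the latent-capable methods, leaving the steering_* settings below unused.)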
33 | models: ["SteeringVector"] 34 | model_name: "google/gemma-2-2b-it" 35 | # latent related params 36 | output_length: 128 37 | latent_num_of_examples: 36 38 | latent_batch_size: 16 39 | # steering related params 40 | steering_intervention_type: "addition" # clamping 41 | steering_model_name: "google/gemma-2-2b-it" 42 | steering_datasets: ["AlpacaEval"] 43 | steering_batch_size: 10 44 | steering_output_length: 128 45 | steering_layers: [20] 46 | steering_num_of_examples: 10 # number of examples per concept and per factor 47 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 48 | # master data dir is shared across all jobs. 49 | master_data_dir: "axbench/data" 50 | seed: 42 51 | lm_model: "gpt-4o-mini" 52 | # generation related params 53 | temperature: 1.0 54 | evaluate: 55 | models: ["SteeringVector"] 56 | latent_evaluators: [ 57 | "AUCROCEvaluator", 58 | "HardNegativeEvaluator", 59 | ] 60 | steering_evaluators: [ 61 | "PerplexityEvaluator", 62 | "LMJudgeEvaluator", 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 71 | master_data_dir: "axbench/data" 72 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/16k_diffmean.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["DiffMean"] 26 | model_name: "google/gemma-2-9b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 36 31 | imbalance_factor: 2 32 | # steering related params 33 | steering_intervention_type: "addition" # clamping 34 | steering_model_name: "google/gemma-2-9b-it" 35 | steering_datasets: ["AlpacaEval"] 36 | steering_batch_size: 5 37 | steering_output_length: 128 38 | steering_layers: [20] 39 | steering_num_of_examples: 10 # number of examples per concept and per factor 40 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 41 | # master data dir is shared across all jobs. 
42 | master_data_dir: "axbench/data" 43 | seed: 42 44 | lm_model: "gpt-4o-mini" 45 | # generation related params 46 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/16k_diffmean_crossfit.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/16k_lsreft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-9b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 36 40 | imbalance_factor: 2 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-9b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 5 46 | steering_output_length: 128 47 | steering_layers: [20] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 
51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/16k_lsreft_crossfit.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/bow.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | BoW: 19 | binarize_dataset: true 20 | bow_penalty: "l2" # l1, l2 21 | bow_C: 100 # 0.001, 0.01, 0.1, 1, 10, 100 22 | inference: 23 | use_bf16: true 24 | models: ["BoW"] 25 | model_name: "google/gemma-2-9b-it" 26 | # latent related params 27 | output_length: 128 28 | latent_num_of_examples: 36 29 | latent_batch_size: 16 30 | # steering related params 31 | steering_intervention_type: "addition" # clamping 32 | steering_model_name: "google/gemma-2-9b-it" 33 | steering_datasets: ["AlpacaEval"] 34 | steering_batch_size: 10 35 | steering_output_length: 128 36 | steering_layers: [20] 37 | steering_num_of_examples: 10 # number of examples per concept and per factor 38 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 39 | # master data dir is shared across all jobs. 40 | master_data_dir: "axbench/data" 41 | seed: 42 42 | lm_model: "gpt-4o-mini" 43 | # generation related params 44 | temperature: 1.0 45 | evaluate: 46 | models: ["BoW"] 47 | latent_evaluators: [ 48 | "AUCROCEvaluator", 49 | "HardNegativeEvaluator", 50 | ] 51 | steering_evaluators: [ 52 | "PerplexityEvaluator", 53 | "LMJudgeEvaluator", 54 | ] 55 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 56 | # Number of processes to run in parallel for steering evaluation. 57 | num_of_workers: 32 58 | lm_model: "gpt-4o-mini" 59 | run_winrate: false 60 | winrate_baseline: "PromptSteering" 61 | # master data dir is shared across all jobs. 
62 | master_data_dir: "axbench/data" 63 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/gemmascope_axbench_max_act.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | disable_neuronpedia_max_act: true 27 | steering_intervention_type: "addition" 28 | steering_model_name: "google/gemma-2-9b-it" 29 | steering_datasets: ["AlpacaEval"] 30 | steering_batch_size: 5 31 | steering_output_length: 128 32 | steering_layers: [20] 33 | steering_num_of_examples: 10 # number of examples per concept and per factor 34 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 35 | # steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 36 | steering_factors: [0.4, 1.0, 1.4, 1.8, 2.2, 2.6, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] # SAE using AxBench max act 37 | # master data dir is shared across all jobs. 38 | master_data_dir: "axbench/data" 39 | seed: 42 40 | lm_model: "gpt-4o-mini" 41 | # generation related params 42 | temperature: 1.0 43 | evaluate: 44 | models: ["GemmaScopeSAE"] 45 | latent_evaluators: [ 46 | "AUCROCEvaluator", 47 | "HardNegativeEvaluator", 48 | ] 49 | steering_evaluators: [ 50 | "PerplexityEvaluator", 51 | "LMJudgeEvaluator", 52 | ] 53 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 54 | # Number of processes to run in parallel for steering evaluation. 55 | num_of_workers: 32 56 | lm_model: "gpt-4o-mini" 57 | run_winrate: false 58 | winrate_baseline: "PromptSteering" 59 | # master data dir is shared across all jobs. 
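# Hedged reading: disable_neuronpedia_max_act: true presumably rescales SAE
# steering by a max activation estimated on AxBench data rather than the
# Neuronpedia-provided value, which is why this file keeps an addition-style
# factor grid (0.4 to 10.0) instead of the larger clamping-only grid used by the
# gemmascope_clamp and gemmascope_min_clamp configs.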
60 | master_data_dir: "axbench/data" 61 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/gemmascope_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "clamping" # clamping 27 | steering_model_name: "google/gemma-2-9b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 5 30 | steering_output_length: 128 31 | steering_layers: [20] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/gemmascope_min_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "min_clamping" # clamping 27 | steering_model_name: "google/gemma-2-9b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 5 30 | steering_output_length: 128 31 | steering_layers: [20] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/lora.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LoRA: 19 | batch_size: 18 20 | gradient_accumulation_steps: 2 21 | n_epochs: 24 22 | lr: 0.005 23 | weight_decay: 0.00 24 | low_rank_dimension: 4 25 | lora_layers: [12, 20, 31, 39] 26 | lora_components: ["o_proj"] 27 | lora_alpha: 32 28 | binarize_dataset: false 29 | train_on_negative: false 30 | exclude_bos: true 31 | inference: 32 | use_bf16: true 33 | models: ["LoRA"] 34 | model_name: "google/gemma-2-9b-it" 35 | # latent related params 36 | output_length: 128 37 | latent_num_of_examples: 36 38 | latent_batch_size: 16 39 | # steering related params 40 | steering_intervention_type: "addition" # clamping 41 | steering_model_name: "google/gemma-2-9b-it" 42 | steering_datasets: ["AlpacaEval"] 43 | steering_batch_size: 5 44 | steering_output_length: 128 45 | steering_layers: [20] 46 | steering_num_of_examples: 10 # number of examples per concept and per factor 47 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 48 | # master data dir is shared across all jobs. 49 | master_data_dir: "axbench/data" 50 | seed: 42 51 | lm_model: "gpt-4o-mini" 52 | # generation related params 53 | temperature: 1.0 54 | evaluate: 55 | models: ["LoRA"] 56 | latent_evaluators: [ 57 | "AUCROCEvaluator", 58 | "HardNegativeEvaluator", 59 | ] 60 | steering_evaluators: [ 61 | "PerplexityEvaluator", 62 | "LMJudgeEvaluator", 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/lsreft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-9b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-9b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 5 45 | steering_output_length: 128 46 | steering_layers: [20] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
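# Relative to the 2b l20 LsReFT sweeps, the 9B recipe halves the learning rate
# (0.005 vs 0.01) and the steering batch size (5 vs 10), presumably to fit the
# larger model; topk, the latent L1 coefficient, the steering factor grid, and
# the evaluators are unchanged.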
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/lsreft_scaling_law.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 10 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | max_num_of_examples: 144 # [6, 12, 24, 48, 72, 96, 120, 144] 18 | models: 19 | LsReFT: 20 | batch_size: 6 21 | gradient_accumulation_steps: 1 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-9b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-9b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 5 45 | steering_output_length: 128 46 | steering_layers: [20] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/lsreft_synergy.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | use_synergy: true 33 | inference: 34 | use_bf16: true 35 | models: ["LsReFT"] 36 | model_name: "google/gemma-2-9b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-9b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 5 46 | steering_output_length: 128 47 | steering_layers: [20] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["LsReFT"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/no_grad.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | PCA: 24 | batch_size: 6 25 | n_epochs: 1 26 | binarize_dataset: true 27 | low_rank_dimension: 1 28 | LAT: 29 | batch_size: 6 30 | n_epochs: 1 31 | binarize_dataset: true 32 | low_rank_dimension: 1 33 | inference: 34 | use_bf16: true 35 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 36 | model_name: "google/gemma-2-9b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-9b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 5 46 | steering_output_length: 128 47 | steering_layers: [20] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/probe.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LinearProbe: 19 | batch_size: 12 20 | gradient_accumulation_steps: 4 21 | n_epochs: 24 22 | lr: 0.001 23 | weight_decay: 0.0001 24 | coeff_l1_loss: 0.000 25 | binarize_dataset: true 26 | low_rank_dimension: 1 27 | inference: 28 | use_bf16: true 29 | models: ["LinearProbe"] 30 | model_name: "google/gemma-2-9b-it" 31 | # latent related params 32 | output_length: 128 33 | latent_num_of_examples: 36 34 | latent_batch_size: 16 35 | # steering related params 36 | steering_intervention_type: "addition" # clamping 37 | steering_model_name: "google/gemma-2-9b-it" 38 | steering_datasets: ["AlpacaEval"] 39 | steering_batch_size: 5 40 | steering_output_length: 128 41 | steering_layers: [20] 42 | steering_num_of_examples: 10 # number of examples per concept and per factor 43 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 44 | # master data dir is shared across all jobs. 45 | master_data_dir: "axbench/data" 46 | seed: 42 47 | lm_model: "gpt-4o-mini" 48 | # generation related params 49 | temperature: 1.0 50 | evaluate: 51 | models: ["LinearProbe"] 52 | latent_evaluators: [ 53 | "AUCROCEvaluator", 54 | "HardNegativeEvaluator", 55 | ] 56 | steering_evaluators: [ 57 | "PerplexityEvaluator", 58 | "LMJudgeEvaluator", 59 | ] 60 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 61 | # Number of processes to run in parallel for steering evaluation. 62 | num_of_workers: 32 63 | lm_model: "gpt-4o-mini" 64 | run_winrate: false 65 | winrate_baseline: "PromptSteering" 66 | # master data dir is shared across all jobs. 
67 | master_data_dir: "axbench/data" 68 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/prompt_detection.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["PromptDetection"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 128 25 | # steering related params 26 | steering_intervention_type: "addition" # clamping 27 | steering_model_name: "google/gemma-2-9b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 5 30 | steering_output_length: 128 31 | steering_layers: [20] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | # master data dir is shared across all jobs. 35 | master_data_dir: "axbench/data" 36 | seed: 42 37 | lm_model: "gpt-4o-mini" 38 | # generation related params 39 | temperature: 1.0 40 | evaluate: 41 | models: ["PromptDetection"] 42 | latent_evaluators: [ 43 | "AUCROCEvaluator", 44 | "HardNegativeEvaluator", 45 | ] 46 | steering_evaluators: [ 47 | "PerplexityEvaluator", 48 | "LMJudgeEvaluator", 49 | ] 50 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 51 | # Number of processes to run in parallel for steering evaluation. 52 | num_of_workers: 32 53 | lm_model: "gpt-4o-mini" 54 | run_winrate: false 55 | winrate_baseline: "PromptSteering" 56 | # master data dir is shared across all jobs. 
57 | master_data_dir: "axbench/data" 58 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/simple_prompt_steering.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["SimplePromptSteering"] 26 | model_name: "google/gemma-2-9b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 16 31 | # steering related params 32 | steering_intervention_type: "addition" # clamping 33 | steering_model_name: "google/gemma-2-9b-it" 34 | steering_datasets: ["AlpacaEval"] 35 | steering_batch_size: 5 36 | steering_output_length: 128 37 | steering_layers: [20] 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | evaluate: 47 | models: ["SimplePromptSteering"] 48 | latent_evaluators: [ 49 | "AUCROCEvaluator", 50 | "HardNegativeEvaluator", 51 | ] 52 | steering_evaluators: [ 53 | "PerplexityEvaluator", 54 | "LMJudgeEvaluator", 55 | ] 56 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 57 | # Number of processes to run in parallel for steering evaluation. 58 | num_of_workers: 32 59 | lm_model: "gpt-4o-mini" 60 | run_winrate: false 61 | winrate_baseline: "PromptSteering" 62 | # master data dir is shared across all jobs. 
63 | master_data_dir: "axbench/data" 64 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l20/steering_vec.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | SteeringVector: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | low_rank_dimension: 1 25 | intervention_positions: "all" 26 | intervention_type: "addition" # clamping 27 | binarize_dataset: false 28 | train_on_negative: false 29 | exclude_bos: true 30 | inference: 31 | use_bf16: true 32 | models: ["SteeringVector"] 33 | model_name: "google/gemma-2-9b-it" 34 | # latent related params 35 | output_length: 128 36 | latent_num_of_examples: 36 37 | latent_batch_size: 16 38 | # steering related params 39 | steering_intervention_type: "addition" # clamping 40 | steering_model_name: "google/gemma-2-9b-it" 41 | steering_datasets: ["AlpacaEval"] 42 | steering_batch_size: 5 43 | steering_output_length: 128 44 | steering_layers: [20] 45 | steering_num_of_examples: 10 # number of examples per concept and per factor 46 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 47 | # master data dir is shared across all jobs. 48 | master_data_dir: "axbench/data" 49 | seed: 42 50 | lm_model: "gpt-4o-mini" 51 | # generation related params 52 | temperature: 1.0 53 | evaluate: 54 | models: ["SteeringVector"] 55 | latent_evaluators: [ 56 | "AUCROCEvaluator", 57 | "HardNegativeEvaluator", 58 | ] 59 | steering_evaluators: [ 60 | "PerplexityEvaluator", 61 | "LMJudgeEvaluator", 62 | ] 63 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 64 | # Number of processes to run in parallel for steering evaluation. 65 | num_of_workers: 32 66 | lm_model: "gpt-4o-mini" 67 | run_winrate: false 68 | winrate_baseline: "PromptSteering" 69 | # master data dir is shared across all jobs. 
70 | master_data_dir: "axbench/data" 71 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/bow.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | BoW: 19 | binarize_dataset: true 20 | bow_penalty: "l2" # l1, l2 21 | bow_C: 100 # 0.001, 0.01, 0.1, 1, 10, 100 22 | inference: 23 | use_bf16: true 24 | models: ["BoW"] 25 | model_name: "google/gemma-2-9b-it" 26 | # latent related params 27 | output_length: 128 28 | latent_num_of_examples: 36 29 | latent_batch_size: 16 30 | # steering related params 31 | steering_intervention_type: "addition" # clamping 32 | steering_model_name: "google/gemma-2-9b-it" 33 | steering_datasets: ["AlpacaEval"] 34 | steering_batch_size: 10 35 | steering_output_length: 128 36 | steering_layers: [31] 37 | steering_num_of_examples: 10 # number of examples per concept and per factor 38 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 39 | # master data dir is shared across all jobs. 40 | master_data_dir: "axbench/data" 41 | seed: 42 42 | lm_model: "gpt-4o-mini" 43 | # generation related params 44 | temperature: 1.0 45 | evaluate: 46 | models: ["BoW"] 47 | latent_evaluators: [ 48 | "AUCROCEvaluator", 49 | "HardNegativeEvaluator", 50 | ] 51 | steering_evaluators: [ 52 | "PerplexityEvaluator", 53 | "LMJudgeEvaluator", 54 | ] 55 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 56 | # Number of processes to run in parallel for steering evaluation. 57 | num_of_workers: 32 58 | lm_model: "gpt-4o-mini" 59 | run_winrate: false 60 | winrate_baseline: "PromptSteering" 61 | # master data dir is shared across all jobs. 
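# The l31 sweeps mirror the l20 family: layer and steering_layers move to 31 and
# concept_path switches to the layer-31 list (gemma-2-9b_31-gemmascope-res-16k.json);
# the remaining hyperparameters appear to be carried over unchanged.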
62 | master_data_dir: "axbench/data" 63 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/gemmascope_axbench_max_act.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | disable_neuronpedia_max_act: true 27 | steering_intervention_type: "addition" 28 | steering_model_name: "google/gemma-2-9b-it" 29 | steering_datasets: ["AlpacaEval"] 30 | steering_batch_size: 5 31 | steering_output_length: 128 32 | steering_layers: [31] 33 | steering_num_of_examples: 10 # number of examples per concept and per factor 34 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 35 | # steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 36 | steering_factors: [0.4, 1.0, 1.4, 1.8, 2.2, 2.6, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] # SAE using AxBench max act 37 | # master data dir is shared across all jobs. 38 | master_data_dir: "axbench/data" 39 | seed: 42 40 | lm_model: "gpt-4o-mini" 41 | # generation related params 42 | temperature: 1.0 43 | evaluate: 44 | models: ["GemmaScopeSAE"] 45 | latent_evaluators: [ 46 | "AUCROCEvaluator", 47 | "HardNegativeEvaluator", 48 | ] 49 | steering_evaluators: [ 50 | "PerplexityEvaluator", 51 | "LMJudgeEvaluator", 52 | ] 53 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 54 | # Number of processes to run in parallel for steering evaluation. 55 | num_of_workers: 32 56 | lm_model: "gpt-4o-mini" 57 | run_winrate: false 58 | winrate_baseline: "PromptSteering" 59 | # master data dir is shared across all jobs. 
60 | master_data_dir: "axbench/data" 61 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/gemmascope_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "clamping" # clamping 27 | steering_model_name: "google/gemma-2-9b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 5 30 | steering_output_length: 128 31 | steering_layers: [31] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/gemmascope_min_clamp.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["GemmaScopeSAE"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 16 25 | # steering related params 26 | steering_intervention_type: "min_clamping" # clamping 27 | steering_model_name: "google/gemma-2-9b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 5 30 | steering_output_length: 128 31 | steering_layers: [31] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | # steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | steering_factors: [0.4, 0.8, 1.2, 1.6, 2.0, 3.0, 4.0, 6.0, 8.0, 10.0, 20.0, 40.0, 60.0, 100.0] # SAE clamping only 35 | # master data dir is shared across all jobs. 36 | master_data_dir: "axbench/data" 37 | seed: 42 38 | lm_model: "gpt-4o-mini" 39 | # generation related params 40 | temperature: 1.0 41 | evaluate: 42 | models: ["GemmaScopeSAE"] 43 | latent_evaluators: [ 44 | "AUCROCEvaluator", 45 | "HardNegativeEvaluator", 46 | ] 47 | steering_evaluators: [ 48 | "PerplexityEvaluator", 49 | "LMJudgeEvaluator", 50 | ] 51 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 52 | # Number of processes to run in parallel for steering evaluation. 53 | num_of_workers: 32 54 | lm_model: "gpt-4o-mini" 55 | run_winrate: false 56 | winrate_baseline: "PromptSteering" 57 | # master data dir is shared across all jobs. 
58 | master_data_dir: "axbench/data" 59 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/lora.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LoRA: 19 | batch_size: 9 20 | gradient_accumulation_steps: 4 21 | n_epochs: 24 22 | lr: 0.005 23 | weight_decay: 0.00 24 | low_rank_dimension: 4 25 | lora_layers: [12, 20, 31, 39] 26 | lora_components: ["o_proj"] 27 | lora_alpha: 32 28 | binarize_dataset: false 29 | train_on_negative: false 30 | exclude_bos: true 31 | inference: 32 | use_bf16: true 33 | models: ["LoRA"] 34 | model_name: "google/gemma-2-9b-it" 35 | # latent related params 36 | output_length: 128 37 | latent_num_of_examples: 36 38 | latent_batch_size: 16 39 | # steering related params 40 | steering_intervention_type: "addition" # clamping 41 | steering_model_name: "google/gemma-2-9b-it" 42 | steering_datasets: ["AlpacaEval"] 43 | steering_batch_size: 5 44 | steering_output_length: 128 45 | steering_layers: [31] 46 | steering_num_of_examples: 10 # number of examples per concept and per factor 47 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 48 | # master data dir is shared across all jobs. 49 | master_data_dir: "axbench/data" 50 | seed: 42 51 | lm_model: "gpt-4o-mini" 52 | # generation related params 53 | temperature: 1.0 54 | evaluate: 55 | models: ["LoRA"] 56 | latent_evaluators: [ 57 | "AUCROCEvaluator", 58 | "HardNegativeEvaluator", 59 | ] 60 | steering_evaluators: [ 61 | "PerplexityEvaluator", 62 | "LMJudgeEvaluator", 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/lsreft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-9b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-9b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 5 45 | steering_output_length: 128 46 | steering_layers: [31] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/lsreft_scaling_law.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 10 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | max_num_of_examples: 144 # [6, 12, 24, 48, 72, 96, 120, 144] 18 | models: 19 | LsReFT: 20 | batch_size: 6 21 | gradient_accumulation_steps: 1 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "google/gemma-2-9b-it" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "google/gemma-2-9b-it" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 5 45 | steering_output_length: 128 46 | steering_layers: [31] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 
72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/lsreft_synergy.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | use_synergy: true 33 | inference: 34 | use_bf16: true 35 | models: ["LsReFT"] 36 | model_name: "google/gemma-2-9b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-9b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 5 46 | steering_output_length: 128 47 | steering_layers: [31] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["LsReFT"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/no_grad.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | PCA: 24 | batch_size: 6 25 | n_epochs: 1 26 | binarize_dataset: true 27 | low_rank_dimension: 1 28 | LAT: 29 | batch_size: 6 30 | n_epochs: 1 31 | binarize_dataset: true 32 | low_rank_dimension: 1 33 | inference: 34 | use_bf16: true 35 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 36 | model_name: "google/gemma-2-9b-it" 37 | # latent related params 38 | output_length: 128 39 | latent_num_of_examples: 36 40 | latent_batch_size: 16 41 | # steering related params 42 | steering_intervention_type: "addition" # clamping 43 | steering_model_name: "google/gemma-2-9b-it" 44 | steering_datasets: ["AlpacaEval"] 45 | steering_batch_size: 5 46 | steering_output_length: 128 47 | steering_layers: [31] 48 | steering_num_of_examples: 10 # number of examples per concept and per factor 49 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 50 | # master data dir is shared across all jobs. 51 | master_data_dir: "axbench/data" 52 | seed: 42 53 | lm_model: "gpt-4o-mini" 54 | # generation related params 55 | temperature: 1.0 56 | evaluate: 57 | models: ["PromptSteering", "DiffMean", "PCA", "LAT", "GemmaScopeSAE"] 58 | latent_evaluators: [ 59 | "AUCROCEvaluator", 60 | "HardNegativeEvaluator", 61 | ] 62 | steering_evaluators: [ 63 | "PerplexityEvaluator", 64 | "LMJudgeEvaluator", 65 | ] 66 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 67 | # Number of processes to run in parallel for steering evaluation. 68 | num_of_workers: 32 69 | lm_model: "gpt-4o-mini" 70 | run_winrate: false 71 | winrate_baseline: "PromptSteering" 72 | # master data dir is shared across all jobs. 
73 | master_data_dir: "axbench/data" 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/probe.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LinearProbe: 19 | batch_size: 12 20 | gradient_accumulation_steps: 4 21 | n_epochs: 24 22 | lr: 0.001 23 | weight_decay: 0.0001 24 | coeff_l1_loss: 0.000 25 | binarize_dataset: true 26 | low_rank_dimension: 1 27 | inference: 28 | use_bf16: true 29 | models: ["LinearProbe"] 30 | model_name: "google/gemma-2-9b-it" 31 | # latent related params 32 | output_length: 128 33 | latent_num_of_examples: 36 34 | latent_batch_size: 16 35 | # steering related params 36 | steering_intervention_type: "addition" # clamping 37 | steering_model_name: "google/gemma-2-9b-it" 38 | steering_datasets: ["AlpacaEval"] 39 | steering_batch_size: 5 40 | steering_output_length: 128 41 | steering_layers: [31] 42 | steering_num_of_examples: 10 # number of examples per concept and per factor 43 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 44 | # master data dir is shared across all jobs. 45 | master_data_dir: "axbench/data" 46 | seed: 42 47 | lm_model: "gpt-4o-mini" 48 | # generation related params 49 | temperature: 1.0 50 | evaluate: 51 | models: ["LinearProbe"] 52 | latent_evaluators: [ 53 | "AUCROCEvaluator", 54 | "HardNegativeEvaluator", 55 | ] 56 | steering_evaluators: [ 57 | "PerplexityEvaluator", 58 | "LMJudgeEvaluator", 59 | ] 60 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 61 | # Number of processes to run in parallel for steering evaluation. 62 | num_of_workers: 32 63 | lm_model: "gpt-4o-mini" 64 | run_winrate: false 65 | winrate_baseline: "PromptSteering" 66 | # master data dir is shared across all jobs. 
67 | master_data_dir: "axbench/data" 68 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/prompt_detection.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | inference: 18 | use_bf16: true 19 | models: ["PromptDetection"] 20 | model_name: "google/gemma-2-9b-it" 21 | # latent related params 22 | output_length: 128 23 | latent_num_of_examples: 36 24 | latent_batch_size: 128 25 | # steering related params 26 | steering_intervention_type: "addition" # clamping 27 | steering_model_name: "google/gemma-2-9b-it" 28 | steering_datasets: ["AlpacaEval"] 29 | steering_batch_size: 5 30 | steering_output_length: 128 31 | steering_layers: [31] 32 | steering_num_of_examples: 10 # number of examples per concept and per factor 33 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 34 | # master data dir is shared across all jobs. 35 | master_data_dir: "axbench/data" 36 | seed: 42 37 | lm_model: "gpt-4o-mini" 38 | # generation related params 39 | temperature: 1.0 40 | evaluate: 41 | models: ["PromptDetection"] 42 | latent_evaluators: [ 43 | "AUCROCEvaluator", 44 | "HardNegativeEvaluator", 45 | ] 46 | steering_evaluators: [ 47 | "PerplexityEvaluator", 48 | "LMJudgeEvaluator", 49 | ] 50 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 51 | # Number of processes to run in parallel for steering evaluation. 52 | num_of_workers: 32 53 | lm_model: "gpt-4o-mini" 54 | run_winrate: false 55 | winrate_baseline: "PromptSteering" 56 | # master data dir is shared across all jobs. 
57 | master_data_dir: "axbench/data" 58 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/simple_prompt_steering.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | inference: 24 | use_bf16: true 25 | models: ["SimplePromptSteering"] 26 | model_name: "google/gemma-2-9b-it" 27 | # latent related params 28 | output_length: 128 29 | latent_num_of_examples: 36 30 | latent_batch_size: 16 31 | # steering related params 32 | steering_intervention_type: "addition" # clamping 33 | steering_model_name: "google/gemma-2-9b-it" 34 | steering_datasets: ["AlpacaEval"] 35 | steering_batch_size: 5 36 | steering_output_length: 128 37 | steering_layers: [31] 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | evaluate: 47 | models: ["SimplePromptSteering"] 48 | latent_evaluators: [ 49 | "AUCROCEvaluator", 50 | "HardNegativeEvaluator", 51 | ] 52 | steering_evaluators: [ 53 | "PerplexityEvaluator", 54 | "LMJudgeEvaluator", 55 | ] 56 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 57 | # Number of processes to run in parallel for steering evaluation. 58 | num_of_workers: 32 59 | lm_model: "gpt-4o-mini" 60 | run_winrate: false 61 | winrate_baseline: "PromptSteering" 62 | # master data dir is shared across all jobs. 
63 | master_data_dir: "axbench/data" 64 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/9b/l31/steering_vec.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b_31-gemmascope-res-16k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 31 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | SteeringVector: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | low_rank_dimension: 1 25 | intervention_positions: "all" 26 | intervention_type: "addition" # clamping 27 | binarize_dataset: false 28 | train_on_negative: false 29 | exclude_bos: true 30 | inference: 31 | use_bf16: true 32 | models: ["SteeringVector"] 33 | model_name: "google/gemma-2-9b-it" 34 | # latent related params 35 | output_length: 128 36 | latent_num_of_examples: 36 37 | latent_batch_size: 16 38 | # steering related params 39 | steering_intervention_type: "addition" # clamping 40 | steering_model_name: "google/gemma-2-9b-it" 41 | steering_datasets: ["AlpacaEval"] 42 | steering_batch_size: 5 43 | steering_output_length: 128 44 | steering_layers: [31] 45 | steering_num_of_examples: 10 # number of examples per concept and per factor 46 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 47 | # master data dir is shared across all jobs. 48 | master_data_dir: "axbench/data" 49 | seed: 42 50 | lm_model: "gpt-4o-mini" 51 | # generation related params 52 | temperature: 1.0 53 | evaluate: 54 | models: ["SteeringVector"] 55 | latent_evaluators: [ 56 | "AUCROCEvaluator", 57 | "HardNegativeEvaluator", 58 | ] 59 | steering_evaluators: [ 60 | "PerplexityEvaluator", 61 | "LMJudgeEvaluator", 62 | ] 63 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 64 | # Number of processes to run in parallel for steering evaluation. 65 | num_of_workers: 32 66 | lm_model: "gpt-4o-mini" 67 | run_winrate: false 68 | winrate_baseline: "PromptSteering" 69 | # master data dir is shared across all jobs. 
70 | master_data_dir: "axbench/data" 71 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/llama_8b/l20/lsreft.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini-2024-07-18" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/llama3.1-8b_20-llamascope-res-131k.json" 6 | max_concepts: 500 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "meta-llama/Llama-3.1-8B-Instruct" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true 32 | inference: 33 | use_bf16: true 34 | models: ["LsReFT"] 35 | model_name: "meta-llama/Llama-3.1-8B-Instruct" 36 | # latent related params 37 | output_length: 128 38 | latent_num_of_examples: 36 39 | latent_batch_size: 16 40 | # steering related params 41 | steering_intervention_type: "addition" # clamping 42 | steering_model_name: "meta-llama/Llama-3.1-8B-Instruct" 43 | steering_datasets: ["AlpacaEval"] 44 | steering_batch_size: 5 45 | steering_output_length: 128 46 | steering_layers: [20] 47 | steering_num_of_examples: 10 # number of examples per concept and per factor 48 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 49 | # master data dir is shared across all jobs. 50 | master_data_dir: "axbench/data" 51 | seed: 42 52 | lm_model: "gpt-4o-mini-2024-07-18" 53 | # generation related params 54 | temperature: 1.0 55 | evaluate: 56 | models: ["LsReFT"] 57 | latent_evaluators: [ 58 | "AUCROCEvaluator", 59 | "HardNegativeEvaluator", 60 | ] 61 | steering_evaluators: [ 62 | "PerplexityEvaluator", 63 | "LMJudgeEvaluator", 64 | ] 65 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 66 | # Number of processes to run in parallel for steering evaluation. 67 | num_of_workers: 32 68 | lm_model: "gpt-4o-mini-2024-07-18" 69 | run_winrate: false 70 | winrate_baseline: "PromptSteering" 71 | # master data dir is shared across all jobs. 72 | master_data_dir: "axbench/data" 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/others/prod_2b_l10_fd_v1.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-featured-res-16k.json" 6 | max_concepts: 500 # the feature-description set has far fewer than 500 concepts in its original release. 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | inference_batch_size: 16 # for generating DPO dataset.
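# note: unlike the training sweeps above, this feature-description (FD) config defines
# no train or evaluate section; the inference block below only runs the PromptSteering
# baseline on google/gemma-2-2b-it, steering at layer 10.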
12 | inference: 13 | use_bf16: true 14 | models: ["PromptSteering"] 15 | model_name: "google/gemma-2-2b-it" 16 | # latent related params 17 | output_length: 128 18 | latent_num_of_examples: 36 19 | latent_batch_size: 16 20 | # steering related params 21 | steering_intervention_type: "addition" # clamping 22 | steering_model_name: "google/gemma-2-2b-it" 23 | steering_datasets: ["AlpacaEval"] 24 | steering_batch_size: 10 25 | steering_output_length: 128 26 | steering_layers: [10] 27 | steering_num_of_examples: 10 # number of examples per concept and per factor 28 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 29 | # master data dir is shared across all jobs. 30 | master_data_dir: "axbench/data" 31 | seed: 42 32 | lm_model: "gpt-4o-mini" 33 | # generation related params 34 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/others/prod_2b_l10_v1.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_10-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/others/prod_2b_l20_fd_v1.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-featured-res-16k.json" 6 | max_concepts: 500 # the feature-description set has far fewer than 500 concepts in its original release. 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | inference: 12 | use_bf16: true 13 | models: ["PromptSteering"] 14 | model_name: "google/gemma-2-2b-it" 15 | # latent related params 16 | output_length: 128 17 | latent_num_of_examples: 36 18 | latent_batch_size: 16 19 | # steering related params 20 | steering_intervention_type: "addition" # clamping 21 | steering_model_name: "google/gemma-2-2b-it" 22 | steering_datasets: ["AlpacaEval"] 23 | steering_batch_size: 10 24 | steering_output_length: 128 25 | steering_layers: [20] 26 | steering_num_of_examples: 10 # number of examples per concept and per factor 27 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 28 | # master data dir is shared across all jobs.
29 | master_data_dir: "axbench/data" 30 | seed: 42 31 | lm_model: "gpt-4o-mini" 32 | # generation related params 33 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/others/prod_2b_l20_v1.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-16k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/others/prod_9b_l20_v1.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b-it_20-gemmascope-res-131k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/others/prod_9b_l31_v1.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b-it_31-gemmascope-res-131k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/pls/prod_2b_l20_gemma_65k.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini-2024-07-18" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-2b_20-gemmascope-res-65k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-2b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.01 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/pls/prod_9b_l20_gemma_131k.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini-2024-07-18" 3 | output_length: 128 4 | num_of_examples: 144 5 | concept_path: "axbench/data/gemma-2-9b-it_20-gemmascope-res-131k.json" 6 | max_concepts: 16000 7 | master_data_dir: "axbench/data" 8 | dataset_category: "instruction" 9 | lm_use_cache: false 10 | seed: 42 11 | train: 12 | model_name: "google/gemma-2-9b-it" 13 | layer: 20 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | LsReFT: 19 | batch_size: 6 20 | gradient_accumulation_steps: 1 21 | n_epochs: 3 22 | lr: 0.005 23 | weight_decay: 0.00 24 | topk: 8 25 | coeff_latent_l1_loss: 0.005 26 | low_rank_dimension: 1 27 | 
intervention_positions: "all" 28 | intervention_type: "addition" # clamping 29 | binarize_dataset: false 30 | train_on_negative: true 31 | exclude_bos: true -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/dataset/concept100.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | num_of_examples: 144 4 | concept_path: "axbench/data/gemma-2-9b-it_20-gemmascope-res-131k.json" # there is no special reason why we pick this; we just treat it as a concept scraper. 5 | max_concepts: 100 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | inference_batch_size: 16 # for generating DPO dataset. 11 | disable_local_model: true 12 | inference: 13 | use_bf16: true 14 | models: ["PromptSteering"] 15 | # latent related params 16 | latent_num_of_examples: 36 17 | latent_batch_size: 16 18 | # steering related params 19 | steering_intervention_type: "addition" # clamping 20 | steering_model_name: "use_cmd_to_define" 21 | steering_datasets: ["AlpacaEval"] 22 | steering_batch_size: 10 23 | steering_output_length: 128 24 | steering_layers: [10] 25 | steering_num_of_examples: 10 # number of examples per concept and per factor 26 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 27 | # master data dir is shared across all jobs. 28 | master_data_dir: "axbench/data" 29 | seed: 42 30 | lm_model: "gpt-4o-mini" 31 | # generation related params 32 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/dataset/concept500.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 10 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | inference: 12 | use_bf16: true 13 | models: ["PromptSteering"] 14 | # latent related params 15 | latent_num_of_examples: 36 16 | latent_batch_size: 16 17 | # steering related params 18 | steering_intervention_type: "addition" # clamping 19 | steering_model_name: "use_cmd_to_define" 20 | steering_datasets: ["AlpacaEval"] 21 | steering_batch_size: 10 22 | steering_output_length: 128 23 | steering_layers: [10] 24 | steering_num_of_examples: 10 # number of examples per concept and per factor 25 | steering_factors: [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 2.5, 3.0, 4.0, 5.0] # number of steering factors per example 26 | # master data dir is shared across all jobs.
27 | master_data_dir: "axbench/data" 28 | seed: 42 29 | lm_model: "gpt-4o-mini" 30 | # generation related params 31 | temperature: 1.0 -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/c_vector_g3-27b_axbench_suppress_overwrite_prepend.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/sweep/wuzhengx/reps/experiments/c_vector_g3-27b_axbench_suppress_overwrite_prepend.yaml -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/p_vector_dps_g2-9b_axbench_suppress_rule.yam: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/sweep/wuzhengx/reps/experiments/p_vector_dps_g2-9b_axbench_suppress_rule.yam -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/p_vector_dps_g3-27b_axbench_suppress.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/sweep/wuzhengx/reps/experiments/p_vector_dps_g3-27b_axbench_suppress.yaml -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/p_vector_dps_g3-27b_axbench_suppress_rule.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/sweep/wuzhengx/reps/experiments/p_vector_dps_g3-27b_axbench_suppress_rule.yaml -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g2-2b_concept500_suppress.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-2-2b-it" 25 | steer_dataset_type: "concept" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEvalSuppress"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 
41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-2-2b-it" 48 | model_name: "google/gemma-2-2b-it" 49 | defense: ["prepend_original"] 50 | # steering_layer: 20 51 | # latent_layer: 20 52 | evaluate: 53 | models: ["PromptSteering"] 54 | latent_evaluators: [ 55 | "AUCROCEvaluator", 56 | "HardNegativeEvaluator", 57 | ] 58 | steering_evaluators: [ 59 | # "PerplexityEvaluator", 60 | "LMJudgeEvaluator", 61 | ] 62 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 63 | # Number of processes to run in parallel for steering evaluation. 64 | num_of_workers: 32 65 | lm_model: "gpt-4o-mini" 66 | run_winrate: false 67 | winrate_baseline: "PromptSteering" 68 | # master data dir is shared across all jobs. 69 | master_data_dir: "axbench/data" 70 | steer_dataset_type: "concept" 71 | defense: ["prepend_original"] 72 | 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g2-2b_prompt_rule.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-2-2b-it" 25 | steer_dataset_type: "concept" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEvalSuppress"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-2-2b-it" 48 | model_name: "google/gemma-2-2b-it" 49 | defense: ["prepend_original"] 50 | # steering_layer: 20 51 | # latent_layer: 20 52 | evaluate: 53 | models: ["PromptSteering"] 54 | latent_evaluators: [ 55 | "AUCROCEvaluator", 56 | "HardNegativeEvaluator", 57 | ] 58 | steering_evaluators: [ 59 | # "PerplexityEvaluator", 60 | "LMJudgeEvaluator", 61 | ] 62 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 63 | # Number of processes to run in parallel for steering evaluation. 64 | num_of_workers: 32 65 | lm_model: "gpt-4o-mini" 66 | run_winrate: false 67 | winrate_baseline: "PromptSteering" 68 | # master data dir is shared across all jobs. 
69 | master_data_dir: "axbench/data" 70 | steer_dataset_type: "concept" 71 | defense: ["prepend_original"] 72 | 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g2-9b_concept20_suppress.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | steer_data_type: "rule" ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-2-9b-it" 25 | steer_data_type: "rule" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEvalSuppress"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-2-9b-it" 48 | model_name: "google/gemma-2-9b-it" 49 | defense: ["prepend_original"] 50 | steer_data_type: "rule" 51 | # steering_layer: 20 52 | # latent_layer: 20 53 | evaluate: 54 | models: ["PromptSteering"] 55 | latent_evaluators: [ 56 | "AUCROCEvaluator", 57 | "HardNegativeEvaluator", 58 | ] 59 | steering_evaluators: [ 60 | # "PerplexityEvaluator", 61 | "LMJudgeEvaluator", 62 | "RuleEvaluator" 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | steer_dataset_type: "rule" 73 | defense: ["prepend_original"] 74 | 75 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g2-9b_concept500_suppress.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-2-9b-it" 25 | steer_dataset_type: "concept" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEvalSuppress"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-2-9b-it" 48 | model_name: "google/gemma-2-9b-it" 49 | defense: ["prepend_original"] 50 | # steering_layer: 20 51 | # latent_layer: 20 52 | evaluate: 53 | models: ["PromptSteering"] 54 | latent_evaluators: [ 55 | "AUCROCEvaluator", 56 | "HardNegativeEvaluator", 57 | ] 58 | steering_evaluators: [ 59 | # "PerplexityEvaluator", 60 | "LMJudgeEvaluator", 61 | ] 62 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 63 | # Number of processes to run in parallel for steering evaluation. 64 | num_of_workers: 32 65 | lm_model: "gpt-4o-mini" 66 | run_winrate: false 67 | winrate_baseline: "PromptSteering" 68 | # master data dir is shared across all jobs. 
69 | master_data_dir: "axbench/data" 70 | steer_dataset_type: "concept" 71 | defense: ["prepend_original"] 72 | 73 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-12b_concept100.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-3-12b-it" 25 | steer_dataset_type: "concept" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEval"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-3-12b-it" 48 | model_name: "google/gemma-3-12b-it" 49 | # steering_layer: 20 50 | # latent_layer: 20 51 | evaluate: 52 | models: ["PromptSteering"] 53 | latent_evaluators: [ 54 | "AUCROCEvaluator", 55 | "HardNegativeEvaluator", 56 | ] 57 | steering_evaluators: [ 58 | # "PerplexityEvaluator", 59 | "LMJudgeEvaluator", 60 | ] 61 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 62 | # Number of processes to run in parallel for steering evaluation. 63 | num_of_workers: 32 64 | lm_model: "gpt-4o-mini" 65 | run_winrate: false 66 | winrate_baseline: "PromptSteering" 67 | # master data dir is shared across all jobs. 
68 | master_data_dir: "axbench/data" 69 | steer_dataset_type: "concept" 70 | 71 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-12b_concept20_suppress.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-3-12b-it" 25 | steer_data_type: "rule" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEvalSuppress"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-3-12b-it" 48 | model_name: "google/gemma-3-12b-it" 49 | defense: ["prepend_original"] 50 | # steering_layer: 20 51 | # latent_layer: 20 52 | evaluate: 53 | models: ["PromptSteering"] 54 | latent_evaluators: [ 55 | "AUCROCEvaluator", 56 | "HardNegativeEvaluator", 57 | ] 58 | steering_evaluators: [ 59 | # "PerplexityEvaluator", 60 | "LMJudgeEvaluator", 61 | "RuleEvaluator" 62 | ] 63 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 64 | # Number of processes to run in parallel for steering evaluation. 65 | num_of_workers: 32 66 | lm_model: "gpt-4o-mini" 67 | run_winrate: false 68 | winrate_baseline: "PromptSteering" 69 | # master data dir is shared across all jobs. 
70 | master_data_dir: "axbench/data" 71 | steer_dataset_type: "concept" 72 | defense: ["prepend_original"] 73 | 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-12b_concept20_suppress_overwrite_append.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | steer_data_type: "rule" 12 | ### use cmd to define ### 13 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 14 | train: 15 | component: "res" 16 | seed: 42 17 | use_bf16: true 18 | models: 19 | DiffMean: 20 | batch_size: 6 21 | n_epochs: 1 22 | binarize_dataset: true 23 | low_rank_dimension: 1 24 | output_length: 768 # use_cmd_to_define 25 | model_name: "google/gemma-3-12b-it" 26 | steer_data_type: "rule" 27 | inference: 28 | use_bf16: true 29 | models: ["PromptSteering"] 30 | # latent related params 31 | output_length: 768 32 | latent_num_of_examples: 36 33 | latent_batch_size: 16 34 | # steering related params 35 | steering_intervention_type: "addition" # clamping 36 | steering_datasets: ["AttackOverwrite"] 37 | steering_batch_size: 10 38 | steering_output_length: 768 39 | steering_num_of_examples: 10 # number of examples per concept and per factor 40 | steering_factors: [1.0] # number of steering factors per example 41 | # master data dir is shared across all jobs. 42 | master_data_dir: "axbench/data" 43 | seed: 42 44 | lm_model: "gpt-4o-mini" 45 | # generation related params 46 | temperature: 1.0 47 | ### use cmd to define ### 48 | steering_model_name: "google/gemma-3-12b-it" 49 | model_name: "google/gemma-3-12b-it" 50 | defense: ["append_original"] 51 | # steering_layer: 20 52 | # latent_layer: 20 53 | evaluate: 54 | models: ["PromptSteering"] 55 | latent_evaluators: [ 56 | "AUCROCEvaluator", 57 | "HardNegativeEvaluator", 58 | ] 59 | steering_evaluators: [ 60 | # "PerplexityEvaluator", 61 | "LMJudgeEvaluator", 62 | "RuleEvaluator" 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | steer_dataset_type: "rule" 73 | defense: ["append_original"] 74 | 75 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-12b_concept20_suppress_rule.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | steer_data_type: "rule" 12 | ### use cmd to define ### 13 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 14 | train: 15 | component: "res" 16 | seed: 42 17 | use_bf16: true 18 | models: 19 | DiffMean: 20 | batch_size: 6 21 | n_epochs: 1 22 | binarize_dataset: true 23 | low_rank_dimension: 1 24 | output_length: 768 # use_cmd_to_define 25 | model_name: "google/gemma-3-12b-it" 26 | steer_data_type: "rule" 27 | inference: 28 | use_bf16: true 29 | models: ["PromptSteering"] 30 | # latent related params 31 | output_length: 768 32 | latent_num_of_examples: 36 33 | latent_batch_size: 16 34 | # steering related params 35 | steering_intervention_type: "addition" # clamping 36 | steering_datasets: ["AlpacaEvalSuppress"] 37 | steering_batch_size: 10 38 | steering_output_length: 768 39 | steering_num_of_examples: 10 # number of examples per concept and per factor 40 | steering_factors: [1.0] # number of steering factors per example 41 | # master data dir is shared across all jobs. 42 | master_data_dir: "axbench/data" 43 | seed: 42 44 | lm_model: "gpt-4o-mini" 45 | # generation related params 46 | temperature: 1.0 47 | ### use cmd to define ### 48 | steering_model_name: "google/gemma-3-12b-it" 49 | model_name: "google/gemma-3-12b-it" 50 | defense: ["prepend_original"] 51 | # steering_layer: 20 52 | # latent_layer: 20 53 | evaluate: 54 | models: ["PromptSteering"] 55 | latent_evaluators: [ 56 | "AUCROCEvaluator", 57 | "HardNegativeEvaluator", 58 | ] 59 | steering_evaluators: [ 60 | # "PerplexityEvaluator", 61 | "LMJudgeEvaluator", 62 | "RuleEvaluator" 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | steer_dataset_type: "concept" 73 | defense: ["prepend_original"] 74 | 75 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-27b_concept100.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 24 | model_name: "google/gemma-3-27b-it" 25 | steer_dataset_type: "concept" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEval"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-3-27b-it" 48 | model_name: "google/gemma-3-27b-it" 49 | # steering_layer: 20 50 | # latent_layer: 20 51 | evaluate: 52 | models: ["PromptSteering"] 53 | latent_evaluators: [ 54 | "AUCROCEvaluator", 55 | "HardNegativeEvaluator", 56 | ] 57 | steering_evaluators: [ 58 | # "PerplexityEvaluator", 59 | "LMJudgeEvaluator", 60 | ] 61 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 62 | # Number of processes to run in parallel for steering evaluation. 63 | num_of_workers: 32 64 | lm_model: "gpt-4o-mini" 65 | run_winrate: false 66 | winrate_baseline: "PromptSteering" 67 | # master data dir is shared across all jobs. 
68 | master_data_dir: "axbench/data" 69 | steer_dataset_type: "concept" 70 | 71 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-27b_concept20_suppress.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-3-27b-it" 25 | steer_data_type: "rule" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEvalSuppress"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-3-27b-it" 48 | model_name: "google/gemma-3-27b-it" 49 | defense: ["prepend_original"] 50 | # steering_layer: 20 51 | # latent_layer: 20 52 | evaluate: 53 | models: ["PromptSteering"] 54 | latent_evaluators: [ 55 | "AUCROCEvaluator", 56 | "HardNegativeEvaluator", 57 | ] 58 | steering_evaluators: [ 59 | # "PerplexityEvaluator", 60 | "LMJudgeEvaluator", 61 | "RuleEvaluator" 62 | ] 63 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 64 | # Number of processes to run in parallel for steering evaluation. 65 | num_of_workers: 32 66 | lm_model: "gpt-4o-mini" 67 | run_winrate: false 68 | winrate_baseline: "PromptSteering" 69 | # master data dir is shared across all jobs. 
70 | master_data_dir: "axbench/data" 71 | steer_dataset_type: "concept" 72 | defense: ["prepend_original"] 73 | 74 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-27b_concept20_suppress_overwrite_append.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | steer_data_type: "rule" 12 | ### use cmd to define ### 13 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 14 | train: 15 | component: "res" 16 | seed: 42 17 | use_bf16: true 18 | models: 19 | DiffMean: 20 | batch_size: 6 21 | n_epochs: 1 22 | binarize_dataset: true 23 | low_rank_dimension: 1 24 | output_length: 768 # use_cmd_to_define 25 | model_name: "google/gemma-3-27b-it" 26 | steer_data_type: "rule" 27 | inference: 28 | use_bf16: true 29 | models: ["PromptSteering"] 30 | # latent related params 31 | output_length: 768 32 | latent_num_of_examples: 36 33 | latent_batch_size: 16 34 | # steering related params 35 | steering_intervention_type: "addition" # clamping 36 | steering_datasets: ["AttackOverwrite"] 37 | steering_batch_size: 10 38 | steering_output_length: 768 39 | steering_num_of_examples: 10 # number of examples per concept and per factor 40 | steering_factors: [1.0] # number of steering factors per example 41 | # master data dir is shared across all jobs. 42 | master_data_dir: "axbench/data" 43 | seed: 42 44 | lm_model: "gpt-4o-mini" 45 | # generation related params 46 | temperature: 1.0 47 | ### use cmd to define ### 48 | steering_model_name: "google/gemma-3-27b-it" 49 | model_name: "google/gemma-3-27b-it" 50 | defense: ["append_original"] 51 | # steering_layer: 20 52 | # latent_layer: 20 53 | evaluate: 54 | models: ["PromptSteering"] 55 | latent_evaluators: [ 56 | "AUCROCEvaluator", 57 | "HardNegativeEvaluator", 58 | ] 59 | steering_evaluators: [ 60 | # "PerplexityEvaluator", 61 | "LMJudgeEvaluator", 62 | "RuleEvaluator" 63 | ] 64 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 65 | # Number of processes to run in parallel for steering evaluation. 66 | num_of_workers: 32 67 | lm_model: "gpt-4o-mini" 68 | run_winrate: false 69 | winrate_baseline: "PromptSteering" 70 | # master data dir is shared across all jobs. 
71 | master_data_dir: "axbench/data" 72 | steer_dataset_type: "rule" 73 | defense: ["append_original"] 74 | 75 | -------------------------------------------------------------------------------- /axbench/sweep/wuzhengx/reps/experiments/prompt_steering_g3-27b_concept20_suppress_rule.yaml: -------------------------------------------------------------------------------- 1 | generate: 2 | lm_model: "gpt-4o-mini" 3 | output_length: 128 4 | num_of_examples: 144 5 | max_concepts: 500 6 | master_data_dir: "axbench/data" 7 | dataset_category: "instruction" 8 | lm_use_cache: false 9 | seed: 42 10 | keep_orig_axbench_format: true 11 | steer_data_type: "rule" ### use cmd to define ### 12 | # concept_path: "axbench/data/gemma-2-9b_20-gemmascope-res-16k.json" 13 | train: 14 | component: "res" 15 | seed: 42 16 | use_bf16: true 17 | models: 18 | DiffMean: 19 | batch_size: 6 20 | n_epochs: 1 21 | binarize_dataset: true 22 | low_rank_dimension: 1 23 | output_length: 768 # use_cmd_to_define 24 | model_name: "google/gemma-3-27b-it" 25 | steer_data_type: "rule" 26 | inference: 27 | use_bf16: true 28 | models: ["PromptSteering"] 29 | # latent related params 30 | output_length: 768 31 | latent_num_of_examples: 36 32 | latent_batch_size: 16 33 | # steering related params 34 | steering_intervention_type: "addition" # clamping 35 | steering_datasets: ["AlpacaEvalSuppress"] 36 | steering_batch_size: 10 37 | steering_output_length: 768 38 | steering_num_of_examples: 10 # number of examples per concept and per factor 39 | steering_factors: [1.0] # number of steering factors per example 40 | # master data dir is shared across all jobs. 41 | master_data_dir: "axbench/data" 42 | seed: 42 43 | lm_model: "gpt-4o-mini" 44 | # generation related params 45 | temperature: 1.0 46 | ### use cmd to define ### 47 | steering_model_name: "google/gemma-3-27b-it" 48 | model_name: "google/gemma-3-27b-it" 49 | defense: ["prepend_original"] 50 | # steering_layer: 20 51 | # latent_layer: 20 52 | evaluate: 53 | models: ["PromptSteering"] 54 | latent_evaluators: [ 55 | "AUCROCEvaluator", 56 | "HardNegativeEvaluator", 57 | ] 58 | steering_evaluators: [ 59 | # "PerplexityEvaluator", 60 | "LMJudgeEvaluator", 61 | "RuleEvaluator" 62 | ] 63 | winrate_split_ratio: 0.5 # this is for steering only, we use a separate partition for factor selection. 64 | # Number of processes to run in parallel for steering evaluation. 65 | num_of_workers: 32 66 | lm_model: "gpt-4o-mini" 67 | run_winrate: false 68 | winrate_baseline: "PromptSteering" 69 | # master data dir is shared across all jobs. 70 | master_data_dir: "axbench/data" 71 | steer_dataset_type: "concept" 72 | defense: ["prepend_original"] 73 | 74 | -------------------------------------------------------------------------------- /axbench/templates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/templates/__init__.py -------------------------------------------------------------------------------- /axbench/tests/README.md: -------------------------------------------------------------------------------- 1 | ## How to run our script-based tests for released datasets 2 | 3 | This script runs basic checks on the Concept10, Concept500, and Concept16K datasets. 4 | 5 | ```bash 6 | python axbench/tests/test_released_datasets.py 7 | ``` 8 | 9 | 10 | ## How to run our unit tests 11 | 12 | Once the dataset test passes, you can run the unit tests for functional modules.
13 | 14 | ```bash 15 | python -m unittest discover -s axbench/tests/unit_tests 16 | ``` -------------------------------------------------------------------------------- /axbench/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanfordnlp/axbench/bc9189153f475de725298fc8031bc0e48ff1132d/axbench/utils/__init__.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "axbench" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "asyncio>=3.4.3", 9 | "datasets>=3.0.2", 10 | "einops>=0.8.0", 11 | "httpx>=0.27.2", 12 | "openai>=1.52.1", 13 | "pyvene>=0.1.8", 14 | "torch>=2.5.0", 15 | "transformers>=4.42.4", 16 | "wandb>=0.18.5", 17 | "scikit-learn>=1.5.2", 18 | "seaborn>=0.12.2", 19 | "pyreft>=0.0.8", 20 | "peft>=0.13.2", 21 | "jupyter>=1.1.1", 22 | "adjusttext>=1.3.0", 23 | "altair>=5.5.0", 24 | "umap-learn>=0.5.7", 25 | ] 26 | 27 | [build-system] 28 | requires = ["hatchling"] 29 | build-backend = "hatchling.build" 30 | --------------------------------------------------------------------------------
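For reference, a minimal setup-and-test sketch that ties the `pyproject.toml` above to the instructions in `axbench/tests/README.md`. It assumes Python 3.12+ (per `requires-python`) and that an editable `pip` install against the hatchling build backend is acceptable; the project may prefer a different installation workflow, so treat the first command as an assumption rather than the official setup path.

```bash
# Assumed setup: an editable install resolves the dependencies declared in
# pyproject.toml (requires-python = ">=3.12", build backend = hatchling).
pip install -e .

# Script-based checks on the released datasets (from axbench/tests/README.md).
python axbench/tests/test_released_datasets.py

# Unit tests for the functional modules, run once the dataset checks pass.
python -m unittest discover -s axbench/tests/unit_tests
```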