├── LICENSE ├── README.md ├── TransformerLens ├── README.md ├── makefile ├── poetry.lock ├── pyproject.toml └── transformer_lens │ ├── ActivationCache.py │ ├── FactoredMatrix.py │ ├── HookedEncoder.py │ ├── HookedEncoderDecoder.py │ ├── HookedTransformer.py │ ├── HookedTransformerConfig.py │ ├── SVDInterpreter.py │ ├── __init__.py │ ├── components │ ├── __init__.py │ ├── abstract_attention.py │ ├── attention.py │ ├── bert_block.py │ ├── bert_embed.py │ ├── bert_mlm_head.py │ ├── embed.py │ ├── grouped_query_attention.py │ ├── layer_norm.py │ ├── layer_norm_pre.py │ ├── mlps │ │ ├── can_be_used_as_mlp.py │ │ ├── gated_mlp.py │ │ ├── gated_mlp_4bit.py │ │ ├── mlp.py │ │ └── moe.py │ ├── pos_embed.py │ ├── rms_norm.py │ ├── rms_norm_pre.py │ ├── t5_attention.py │ ├── t5_block.py │ ├── token_typed_embed.py │ ├── transformer_block.py │ └── unembed.py │ ├── evals.py │ ├── factories │ ├── activation_function_factory.py │ └── mlp_factory.py │ ├── head_detector.py │ ├── hook_points.py │ ├── loading_from_pretrained.py │ ├── past_key_value_caching.py │ ├── patching.py │ ├── pretrained │ ├── __init__.py │ └── weight_conversions │ │ ├── __init__.py │ │ ├── bert.py │ │ ├── bloom.py │ │ ├── coder.py │ │ ├── gemma.py │ │ ├── gpt2.py │ │ ├── gptj.py │ │ ├── llama.py │ │ ├── mingpt.py │ │ ├── mistral.py │ │ ├── mixtral.py │ │ ├── nanogpt.py │ │ ├── neel_solu_old.py │ │ ├── neo.py │ │ ├── neox.py │ │ ├── opt.py │ │ ├── phi.py │ │ ├── phi3.py │ │ ├── qwen.py │ │ ├── qwen2.py │ │ └── t5.py │ ├── train.py │ ├── utilities │ ├── __init__.py │ ├── activation_functions.py │ ├── addmm.py │ ├── attention.py │ └── devices.py │ └── utils.py ├── baseline ├── base_safety_mmlu.py ├── caa_safety_mmlu.py ├── generate_vectors.py ├── generate_vectors_sys.py ├── model_wrapper.py ├── our_sae_caa_safety.py ├── our_sae_caa_safety_mmlu.py ├── sae_caa_safety.py ├── sae_caa_safety_mmlu.py ├── steering_base.py ├── steering_caa.py ├── steering_r1_qwen_gsm.py ├── steering_r1_qwen_gsm.sh ├── steering_r1_qwen_gsm_max_len.py ├── steering_r1_qwen_gsm_max_len.sh ├── test_r1.ipynb ├── top1_sae_caa_safety.py ├── top1_sae_caa_safety_mmlu.py └── utils │ ├── helpers.py │ ├── infer_utils.py │ ├── input_format.py │ ├── input_format_time.py │ ├── test_helpers.py │ └── tokenize.py ├── dataloader.py ├── evaluate_safety ├── eval_DINM.py ├── eval_fluency.py ├── eval_gemma_gsm.py └── eval_realtoxicity_api.py ├── generate_sae_caa_vector.py ├── requirements.txt ├── run_eval.sh ├── run_generate_vector.sh ├── run_main_table.sh ├── sae_feature_selection.py ├── sae_utils.py └── scripts ├── eval ├── run_DINM.sh ├── run_gemma_gsm.sh └── run_realtoxicity.sh ├── experiment └── main │ ├── gemma-it │ ├── base.sh │ ├── caa.sh │ ├── our_STA.sh │ ├── prompt_auto.sh │ └── prompt_hand.sh │ └── gemma │ ├── base.sh │ ├── caa.sh │ ├── our_STA.sh │ ├── prompt_auto.sh │ └── prompt_hand.sh └── generate_vector └── gemma ├── caa ├── generate_vectors_gemma_it_DINM.sh └── generate_vectors_gemma_pt_DINM.sh └── sta ├── run_save_gemma_it_act-and-fre_trim_DINM.sh ├── run_save_gemma_pt_act-and-fre_trim_DINM.sh ├── run_selection_safe_gemma_it_DINM.sh └── run_selection_safe_gemma_pt_DINM.sh /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/README.md -------------------------------------------------------------------------------- /TransformerLens/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/README.md -------------------------------------------------------------------------------- /TransformerLens/makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/makefile -------------------------------------------------------------------------------- /TransformerLens/poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/poetry.lock -------------------------------------------------------------------------------- /TransformerLens/pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/pyproject.toml -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/ActivationCache.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/ActivationCache.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/FactoredMatrix.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/FactoredMatrix.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/HookedEncoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/HookedEncoder.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/HookedEncoderDecoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/HookedEncoderDecoder.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/HookedTransformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/HookedTransformer.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/HookedTransformerConfig.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/HookedTransformerConfig.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/SVDInterpreter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/SVDInterpreter.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/__init__.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/__init__.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/abstract_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/abstract_attention.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/attention.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/bert_block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/bert_block.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/bert_embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/bert_embed.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/bert_mlm_head.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/bert_mlm_head.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/embed.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/grouped_query_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/grouped_query_attention.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/layer_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/layer_norm.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/layer_norm_pre.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/layer_norm_pre.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/mlps/can_be_used_as_mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/mlps/can_be_used_as_mlp.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/mlps/gated_mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/mlps/gated_mlp.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/mlps/gated_mlp_4bit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/mlps/gated_mlp_4bit.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/mlps/mlp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/mlps/mlp.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/mlps/moe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/mlps/moe.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/pos_embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/pos_embed.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/rms_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/rms_norm.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/rms_norm_pre.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/rms_norm_pre.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/t5_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/t5_attention.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/t5_block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/t5_block.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/token_typed_embed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/token_typed_embed.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/transformer_block.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/transformer_block.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/components/unembed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/components/unembed.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/evals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/evals.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/factories/activation_function_factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/factories/activation_function_factory.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/factories/mlp_factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/factories/mlp_factory.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/head_detector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/head_detector.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/hook_points.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/hook_points.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/loading_from_pretrained.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/loading_from_pretrained.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/past_key_value_caching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/past_key_value_caching.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/patching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/patching.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/__init__.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/bert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/bert.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/bloom.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/bloom.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/coder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/coder.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/gemma.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/gemma.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/gpt2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/gpt2.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/gptj.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/gptj.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/llama.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/mingpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/mingpt.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/mistral.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/mistral.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/mixtral.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/mixtral.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/nanogpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/nanogpt.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/neel_solu_old.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/neel_solu_old.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/neo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/neo.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/neox.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/neox.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/opt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/opt.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/phi.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/phi.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/phi3.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/phi3.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/qwen.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/qwen2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/qwen2.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/pretrained/weight_conversions/t5.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/pretrained/weight_conversions/t5.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/train.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/utilities/activation_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/utilities/activation_functions.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/utilities/addmm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/utilities/addmm.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/utilities/attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/utilities/attention.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/utilities/devices.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/utilities/devices.py -------------------------------------------------------------------------------- /TransformerLens/transformer_lens/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/TransformerLens/transformer_lens/utils.py -------------------------------------------------------------------------------- /baseline/base_safety_mmlu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/base_safety_mmlu.py -------------------------------------------------------------------------------- /baseline/caa_safety_mmlu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/caa_safety_mmlu.py -------------------------------------------------------------------------------- /baseline/generate_vectors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/generate_vectors.py -------------------------------------------------------------------------------- /baseline/generate_vectors_sys.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/generate_vectors_sys.py -------------------------------------------------------------------------------- /baseline/model_wrapper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/model_wrapper.py -------------------------------------------------------------------------------- /baseline/our_sae_caa_safety.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/our_sae_caa_safety.py -------------------------------------------------------------------------------- /baseline/our_sae_caa_safety_mmlu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/our_sae_caa_safety_mmlu.py -------------------------------------------------------------------------------- /baseline/sae_caa_safety.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/sae_caa_safety.py -------------------------------------------------------------------------------- /baseline/sae_caa_safety_mmlu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/sae_caa_safety_mmlu.py -------------------------------------------------------------------------------- /baseline/steering_base.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/steering_base.py -------------------------------------------------------------------------------- /baseline/steering_caa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/steering_caa.py -------------------------------------------------------------------------------- /baseline/steering_r1_qwen_gsm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/steering_r1_qwen_gsm.py -------------------------------------------------------------------------------- /baseline/steering_r1_qwen_gsm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/steering_r1_qwen_gsm.sh -------------------------------------------------------------------------------- /baseline/steering_r1_qwen_gsm_max_len.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/steering_r1_qwen_gsm_max_len.py -------------------------------------------------------------------------------- /baseline/steering_r1_qwen_gsm_max_len.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/steering_r1_qwen_gsm_max_len.sh -------------------------------------------------------------------------------- /baseline/test_r1.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/test_r1.ipynb -------------------------------------------------------------------------------- /baseline/top1_sae_caa_safety.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/top1_sae_caa_safety.py -------------------------------------------------------------------------------- /baseline/top1_sae_caa_safety_mmlu.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/top1_sae_caa_safety_mmlu.py -------------------------------------------------------------------------------- /baseline/utils/helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/utils/helpers.py -------------------------------------------------------------------------------- /baseline/utils/infer_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/utils/infer_utils.py -------------------------------------------------------------------------------- /baseline/utils/input_format.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/utils/input_format.py -------------------------------------------------------------------------------- /baseline/utils/input_format_time.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/utils/input_format_time.py -------------------------------------------------------------------------------- /baseline/utils/test_helpers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/utils/test_helpers.py -------------------------------------------------------------------------------- /baseline/utils/tokenize.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/baseline/utils/tokenize.py -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/dataloader.py -------------------------------------------------------------------------------- /evaluate_safety/eval_DINM.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/evaluate_safety/eval_DINM.py -------------------------------------------------------------------------------- /evaluate_safety/eval_fluency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/evaluate_safety/eval_fluency.py -------------------------------------------------------------------------------- /evaluate_safety/eval_gemma_gsm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/evaluate_safety/eval_gemma_gsm.py -------------------------------------------------------------------------------- /evaluate_safety/eval_realtoxicity_api.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/evaluate_safety/eval_realtoxicity_api.py -------------------------------------------------------------------------------- /generate_sae_caa_vector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/generate_sae_caa_vector.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/requirements.txt -------------------------------------------------------------------------------- /run_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/run_eval.sh -------------------------------------------------------------------------------- /run_generate_vector.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/run_generate_vector.sh -------------------------------------------------------------------------------- /run_main_table.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/run_main_table.sh -------------------------------------------------------------------------------- /sae_feature_selection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/sae_feature_selection.py -------------------------------------------------------------------------------- /sae_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/sae_utils.py -------------------------------------------------------------------------------- /scripts/eval/run_DINM.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/eval/run_DINM.sh -------------------------------------------------------------------------------- /scripts/eval/run_gemma_gsm.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/eval/run_gemma_gsm.sh -------------------------------------------------------------------------------- /scripts/eval/run_realtoxicity.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/eval/run_realtoxicity.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma-it/base.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma-it/base.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma-it/caa.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma-it/caa.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma-it/our_STA.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma-it/our_STA.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma-it/prompt_auto.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma-it/prompt_auto.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma-it/prompt_hand.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma-it/prompt_hand.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma/base.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma/base.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma/caa.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma/caa.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma/our_STA.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma/our_STA.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma/prompt_auto.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma/prompt_auto.sh -------------------------------------------------------------------------------- /scripts/experiment/main/gemma/prompt_hand.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/experiment/main/gemma/prompt_hand.sh -------------------------------------------------------------------------------- /scripts/generate_vector/gemma/caa/generate_vectors_gemma_it_DINM.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/generate_vector/gemma/caa/generate_vectors_gemma_it_DINM.sh -------------------------------------------------------------------------------- /scripts/generate_vector/gemma/caa/generate_vectors_gemma_pt_DINM.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/generate_vector/gemma/caa/generate_vectors_gemma_pt_DINM.sh -------------------------------------------------------------------------------- /scripts/generate_vector/gemma/sta/run_save_gemma_it_act-and-fre_trim_DINM.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/generate_vector/gemma/sta/run_save_gemma_it_act-and-fre_trim_DINM.sh -------------------------------------------------------------------------------- /scripts/generate_vector/gemma/sta/run_save_gemma_pt_act-and-fre_trim_DINM.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/generate_vector/gemma/sta/run_save_gemma_pt_act-and-fre_trim_DINM.sh -------------------------------------------------------------------------------- /scripts/generate_vector/gemma/sta/run_selection_safe_gemma_it_DINM.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/generate_vector/gemma/sta/run_selection_safe_gemma_it_DINM.sh -------------------------------------------------------------------------------- /scripts/generate_vector/gemma/sta/run_selection_safe_gemma_pt_DINM.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjunlp/steer-target-atoms/HEAD/scripts/generate_vector/gemma/sta/run_selection_safe_gemma_pt_DINM.sh --------------------------------------------------------------------------------