├── .gitignore ├── README.md ├── artifacts ├── .gitkeep ├── acts │ └── .gitkeep ├── responses │ └── .gitkeep └── vecs │ └── .gitkeep ├── backup_ao.sh ├── backup_so.sh ├── caa_behaviors_testing.py ├── caa_refusal_testing.py ├── datasets ├── CAA │ ├── generate │ │ ├── coordinate-other-ais │ │ │ ├── generate_balanced_dataset.json │ │ │ ├── generate_dataset.json │ │ │ └── generate_openresponse_dataset.json │ │ ├── corrigible-neutral-HHH │ │ │ ├── generate_balanced_dataset.json │ │ │ ├── generate_dataset.json │ │ │ └── generate_openresponse_dataset.json │ │ ├── hallucination │ │ │ ├── generate_balanced_dataset.json │ │ │ ├── generate_dataset.json │ │ │ └── generate_openresponse_dataset.json │ │ ├── myopic-reward │ │ │ ├── generate_balanced_dataset.json │ │ │ ├── generate_dataset.json │ │ │ └── generate_openresponse_dataset.json │ │ ├── refusal │ │ │ ├── generate_balanced_dataset.json │ │ │ ├── generate_dataset.json │ │ │ └── generate_openresponse_dataset.json │ │ ├── survival-instinct │ │ │ ├── generate_balanced_dataset.json │ │ │ ├── generate_dataset.json │ │ │ └── generate_openresponse_dataset.json │ │ └── sycophancy │ │ │ ├── generate_balanced_dataset.json │ │ │ ├── generate_dataset.json │ │ │ └── generate_openresponse_dataset.json │ ├── raw │ │ ├── coordinate-other-ais │ │ │ └── dataset.json │ │ ├── corrigible-neutral-HHH │ │ │ └── dataset.json │ │ ├── hallucination │ │ │ ├── alluding_questioning.csv │ │ │ ├── dataset.json │ │ │ ├── direct_questions.csv │ │ │ ├── make_dataset.py │ │ │ └── questioning_assuming_statement.csv │ │ ├── myopic-reward │ │ │ └── dataset.json │ │ ├── refusal │ │ │ └── dataset.json │ │ ├── survival-instinct │ │ │ └── dataset.json │ │ └── sycophancy │ │ │ ├── dataset.json │ │ │ ├── make_dataset.py │ │ │ ├── sycophancy_on_nlp_survey.jsonl │ │ │ └── sycophancy_on_political_typology_quiz.jsonl │ └── test │ │ ├── coordinate-other-ais │ │ ├── test_dataset_ab.json │ │ └── test_dataset_open_ended.json │ │ ├── corrigible-neutral-HHH │ │ ├── test_dataset_ab.json │ │ └── test_dataset_open_ended.json │ │ ├── hallucination │ │ ├── test_dataset_ab.json │ │ └── test_dataset_open_ended.json │ │ ├── mmlu │ │ ├── mmlu.json │ │ └── mmlu_full.json │ │ ├── myopic-reward │ │ ├── test_dataset_ab.json │ │ └── test_dataset_open_ended.json │ │ ├── refusal │ │ ├── test_dataset_ab.json │ │ └── test_dataset_open_ended.json │ │ ├── survival-instinct │ │ ├── test_dataset_ab.json │ │ └── test_dataset_open_ended.json │ │ ├── sycophancy │ │ ├── test_dataset_ab.json │ │ └── test_dataset_open_ended.json │ │ └── truthfulqa │ │ └── truthful_qa.json ├── harm_dataset.csv ├── make_caa_dataset.py ├── rimsky_refusal_generate_dataset.json └── rimsky_refusal_generate_open_dataset.json ├── llama-poking.py ├── llama3-poking.py ├── logs └── .gitkeep ├── plots └── .gitkeep ├── prettify.py ├── requirements.txt ├── runs ├── run0709_AO.py ├── run0709_SO.py ├── run0710_abeval_AO.py ├── run0710_abeval_SO.py ├── run0710_quad.py ├── run0710pm_AO.py ├── run0710pm_SO.py ├── run0711_abwide.py ├── run0711_gen.py ├── run0711pm_AO.py ├── run0711pm_SO.py ├── run0712_dswide_AO.py ├── run0712_dswide_SO.py ├── run0712pm_AO.py ├── run0712pm_SO.py ├── run0713_AO.py ├── run0713_SO.py ├── run0713_toks_AO.py ├── run0713_toks_SO.py ├── run0715_lall_AO.py ├── run0715_lall_SO.py ├── run0715_lallfine_AO.py ├── run0715_lallfine_SO.py ├── run0715_test.py ├── run0715_zbeh_AO.py ├── run0715_zbeh_SO.py ├── run0716_beh_AO.py ├── run0716_beh_SO.py ├── run0716_logbeh_AO.py ├── run0716_logbeh_SO.py ├── run0718_fill_AO.py ├── run0718_fill_SO.py ├── run0718_fill_remnant_AO.py ├── run0719_scrub_SO.py ├── run0720_fill_AO.py ├── run0721_scruborth_SO.py ├── run0724_llama31_AO.py ├── run0725_llama31_SO.py ├── run0726_llama31_SO.py ├── run0729_llama2_NO.py ├── run0729_llama2_SO.py ├── run0730_quadlda_SO.py ├── run0731_open_AO.py └── run0802_openfill_AO.py ├── server.sh ├── smartcopy.py ├── steering ├── __init__.py ├── caa_test_open_ended.py ├── common.py ├── eval.py ├── experiment.py ├── generate_vectors.py ├── gpu_graph.py ├── leace-poking.py ├── plots.py ├── profile_steer.py ├── refusal_test_open_ended.py ├── scrub.py ├── steer.py └── utils.py ├── test_server.sh ├── vector-poking.py └── vector-var.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/README.md -------------------------------------------------------------------------------- /artifacts/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /artifacts/acts/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /artifacts/responses/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /artifacts/vecs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /backup_ao.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/backup_ao.sh -------------------------------------------------------------------------------- /backup_so.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/backup_so.sh -------------------------------------------------------------------------------- /caa_behaviors_testing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/caa_behaviors_testing.py -------------------------------------------------------------------------------- /caa_refusal_testing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/caa_refusal_testing.py -------------------------------------------------------------------------------- /datasets/CAA/generate/coordinate-other-ais/generate_balanced_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/coordinate-other-ais/generate_balanced_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/coordinate-other-ais/generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/coordinate-other-ais/generate_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/coordinate-other-ais/generate_openresponse_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/coordinate-other-ais/generate_openresponse_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/corrigible-neutral-HHH/generate_balanced_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/corrigible-neutral-HHH/generate_balanced_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/corrigible-neutral-HHH/generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/corrigible-neutral-HHH/generate_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/corrigible-neutral-HHH/generate_openresponse_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/corrigible-neutral-HHH/generate_openresponse_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/hallucination/generate_balanced_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/hallucination/generate_balanced_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/hallucination/generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/hallucination/generate_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/hallucination/generate_openresponse_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/hallucination/generate_openresponse_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/myopic-reward/generate_balanced_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/myopic-reward/generate_balanced_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/myopic-reward/generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/myopic-reward/generate_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/myopic-reward/generate_openresponse_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/myopic-reward/generate_openresponse_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/refusal/generate_balanced_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/refusal/generate_balanced_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/refusal/generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/refusal/generate_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/refusal/generate_openresponse_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/refusal/generate_openresponse_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/survival-instinct/generate_balanced_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/survival-instinct/generate_balanced_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/survival-instinct/generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/survival-instinct/generate_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/survival-instinct/generate_openresponse_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/survival-instinct/generate_openresponse_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/sycophancy/generate_balanced_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/sycophancy/generate_balanced_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/sycophancy/generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/sycophancy/generate_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/generate/sycophancy/generate_openresponse_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/generate/sycophancy/generate_openresponse_dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/coordinate-other-ais/dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/coordinate-other-ais/dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/corrigible-neutral-HHH/dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/corrigible-neutral-HHH/dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/hallucination/alluding_questioning.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/hallucination/alluding_questioning.csv -------------------------------------------------------------------------------- /datasets/CAA/raw/hallucination/dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/hallucination/dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/hallucination/direct_questions.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/hallucination/direct_questions.csv -------------------------------------------------------------------------------- /datasets/CAA/raw/hallucination/make_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/hallucination/make_dataset.py -------------------------------------------------------------------------------- /datasets/CAA/raw/hallucination/questioning_assuming_statement.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/hallucination/questioning_assuming_statement.csv -------------------------------------------------------------------------------- /datasets/CAA/raw/myopic-reward/dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/myopic-reward/dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/refusal/dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/refusal/dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/survival-instinct/dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/survival-instinct/dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/sycophancy/dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/sycophancy/dataset.json -------------------------------------------------------------------------------- /datasets/CAA/raw/sycophancy/make_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/sycophancy/make_dataset.py -------------------------------------------------------------------------------- /datasets/CAA/raw/sycophancy/sycophancy_on_nlp_survey.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/sycophancy/sycophancy_on_nlp_survey.jsonl -------------------------------------------------------------------------------- /datasets/CAA/raw/sycophancy/sycophancy_on_political_typology_quiz.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/raw/sycophancy/sycophancy_on_political_typology_quiz.jsonl -------------------------------------------------------------------------------- /datasets/CAA/test/coordinate-other-ais/test_dataset_ab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/coordinate-other-ais/test_dataset_ab.json -------------------------------------------------------------------------------- /datasets/CAA/test/coordinate-other-ais/test_dataset_open_ended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/coordinate-other-ais/test_dataset_open_ended.json -------------------------------------------------------------------------------- /datasets/CAA/test/corrigible-neutral-HHH/test_dataset_ab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/corrigible-neutral-HHH/test_dataset_ab.json -------------------------------------------------------------------------------- /datasets/CAA/test/corrigible-neutral-HHH/test_dataset_open_ended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/corrigible-neutral-HHH/test_dataset_open_ended.json -------------------------------------------------------------------------------- /datasets/CAA/test/hallucination/test_dataset_ab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/hallucination/test_dataset_ab.json -------------------------------------------------------------------------------- /datasets/CAA/test/hallucination/test_dataset_open_ended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/hallucination/test_dataset_open_ended.json -------------------------------------------------------------------------------- /datasets/CAA/test/mmlu/mmlu.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/mmlu/mmlu.json -------------------------------------------------------------------------------- /datasets/CAA/test/mmlu/mmlu_full.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/mmlu/mmlu_full.json -------------------------------------------------------------------------------- /datasets/CAA/test/myopic-reward/test_dataset_ab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/myopic-reward/test_dataset_ab.json -------------------------------------------------------------------------------- /datasets/CAA/test/myopic-reward/test_dataset_open_ended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/myopic-reward/test_dataset_open_ended.json -------------------------------------------------------------------------------- /datasets/CAA/test/refusal/test_dataset_ab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/refusal/test_dataset_ab.json -------------------------------------------------------------------------------- /datasets/CAA/test/refusal/test_dataset_open_ended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/refusal/test_dataset_open_ended.json -------------------------------------------------------------------------------- /datasets/CAA/test/survival-instinct/test_dataset_ab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/survival-instinct/test_dataset_ab.json -------------------------------------------------------------------------------- /datasets/CAA/test/survival-instinct/test_dataset_open_ended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/survival-instinct/test_dataset_open_ended.json -------------------------------------------------------------------------------- /datasets/CAA/test/sycophancy/test_dataset_ab.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/sycophancy/test_dataset_ab.json -------------------------------------------------------------------------------- /datasets/CAA/test/sycophancy/test_dataset_open_ended.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/sycophancy/test_dataset_open_ended.json -------------------------------------------------------------------------------- /datasets/CAA/test/truthfulqa/truthful_qa.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/CAA/test/truthfulqa/truthful_qa.json -------------------------------------------------------------------------------- /datasets/harm_dataset.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/harm_dataset.csv -------------------------------------------------------------------------------- /datasets/make_caa_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/make_caa_dataset.py -------------------------------------------------------------------------------- /datasets/rimsky_refusal_generate_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/rimsky_refusal_generate_dataset.json -------------------------------------------------------------------------------- /datasets/rimsky_refusal_generate_open_dataset.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/datasets/rimsky_refusal_generate_open_dataset.json -------------------------------------------------------------------------------- /llama-poking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/llama-poking.py -------------------------------------------------------------------------------- /llama3-poking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/llama3-poking.py -------------------------------------------------------------------------------- /logs/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /plots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prettify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/prettify.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/requirements.txt -------------------------------------------------------------------------------- /runs/run0709_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0709_AO.py -------------------------------------------------------------------------------- /runs/run0709_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0709_SO.py -------------------------------------------------------------------------------- /runs/run0710_abeval_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0710_abeval_AO.py -------------------------------------------------------------------------------- /runs/run0710_abeval_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0710_abeval_SO.py -------------------------------------------------------------------------------- /runs/run0710_quad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0710_quad.py -------------------------------------------------------------------------------- /runs/run0710pm_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0710pm_AO.py -------------------------------------------------------------------------------- /runs/run0710pm_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0710pm_SO.py -------------------------------------------------------------------------------- /runs/run0711_abwide.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0711_abwide.py -------------------------------------------------------------------------------- /runs/run0711_gen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0711_gen.py -------------------------------------------------------------------------------- /runs/run0711pm_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0711pm_AO.py -------------------------------------------------------------------------------- /runs/run0711pm_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0711pm_SO.py -------------------------------------------------------------------------------- /runs/run0712_dswide_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0712_dswide_AO.py -------------------------------------------------------------------------------- /runs/run0712_dswide_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0712_dswide_SO.py -------------------------------------------------------------------------------- /runs/run0712pm_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0712pm_AO.py -------------------------------------------------------------------------------- /runs/run0712pm_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0712pm_SO.py -------------------------------------------------------------------------------- /runs/run0713_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0713_AO.py -------------------------------------------------------------------------------- /runs/run0713_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0713_SO.py -------------------------------------------------------------------------------- /runs/run0713_toks_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0713_toks_AO.py -------------------------------------------------------------------------------- /runs/run0713_toks_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0713_toks_SO.py -------------------------------------------------------------------------------- /runs/run0715_lall_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0715_lall_AO.py -------------------------------------------------------------------------------- /runs/run0715_lall_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0715_lall_SO.py -------------------------------------------------------------------------------- /runs/run0715_lallfine_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0715_lallfine_AO.py -------------------------------------------------------------------------------- /runs/run0715_lallfine_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0715_lallfine_SO.py -------------------------------------------------------------------------------- /runs/run0715_test.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0715_test.py -------------------------------------------------------------------------------- /runs/run0715_zbeh_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0715_zbeh_AO.py -------------------------------------------------------------------------------- /runs/run0715_zbeh_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0715_zbeh_SO.py -------------------------------------------------------------------------------- /runs/run0716_beh_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0716_beh_AO.py -------------------------------------------------------------------------------- /runs/run0716_beh_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0716_beh_SO.py -------------------------------------------------------------------------------- /runs/run0716_logbeh_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0716_logbeh_AO.py -------------------------------------------------------------------------------- /runs/run0716_logbeh_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0716_logbeh_SO.py -------------------------------------------------------------------------------- /runs/run0718_fill_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0718_fill_AO.py -------------------------------------------------------------------------------- /runs/run0718_fill_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0718_fill_SO.py -------------------------------------------------------------------------------- /runs/run0718_fill_remnant_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0718_fill_remnant_AO.py -------------------------------------------------------------------------------- /runs/run0719_scrub_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0719_scrub_SO.py -------------------------------------------------------------------------------- /runs/run0720_fill_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0720_fill_AO.py -------------------------------------------------------------------------------- /runs/run0721_scruborth_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0721_scruborth_SO.py -------------------------------------------------------------------------------- /runs/run0724_llama31_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0724_llama31_AO.py -------------------------------------------------------------------------------- /runs/run0725_llama31_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0725_llama31_SO.py -------------------------------------------------------------------------------- /runs/run0726_llama31_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0726_llama31_SO.py -------------------------------------------------------------------------------- /runs/run0729_llama2_NO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0729_llama2_NO.py -------------------------------------------------------------------------------- /runs/run0729_llama2_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0729_llama2_SO.py -------------------------------------------------------------------------------- /runs/run0730_quadlda_SO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0730_quadlda_SO.py -------------------------------------------------------------------------------- /runs/run0731_open_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0731_open_AO.py -------------------------------------------------------------------------------- /runs/run0802_openfill_AO.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/runs/run0802_openfill_AO.py -------------------------------------------------------------------------------- /server.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/server.sh -------------------------------------------------------------------------------- /smartcopy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/smartcopy.py -------------------------------------------------------------------------------- /steering/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /steering/caa_test_open_ended.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/caa_test_open_ended.py -------------------------------------------------------------------------------- /steering/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/common.py -------------------------------------------------------------------------------- /steering/eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/eval.py -------------------------------------------------------------------------------- /steering/experiment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/experiment.py -------------------------------------------------------------------------------- /steering/generate_vectors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/generate_vectors.py -------------------------------------------------------------------------------- /steering/gpu_graph.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/gpu_graph.py -------------------------------------------------------------------------------- /steering/leace-poking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/leace-poking.py -------------------------------------------------------------------------------- /steering/plots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/plots.py -------------------------------------------------------------------------------- /steering/profile_steer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/profile_steer.py -------------------------------------------------------------------------------- /steering/refusal_test_open_ended.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/refusal_test_open_ended.py -------------------------------------------------------------------------------- /steering/scrub.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/scrub.py -------------------------------------------------------------------------------- /steering/steer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/steer.py -------------------------------------------------------------------------------- /steering/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/steering/utils.py -------------------------------------------------------------------------------- /test_server.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/test_server.sh -------------------------------------------------------------------------------- /vector-poking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/vector-poking.py -------------------------------------------------------------------------------- /vector-var.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/steering-llama3/HEAD/vector-var.py --------------------------------------------------------------------------------