├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── elk_generalization ├── __init__.py ├── anomaly │ ├── __init__.py │ ├── anomaly_experiment.py │ ├── detect_anomaly.py │ └── run_anomaly.py ├── datasets │ ├── __init__.py │ ├── binary_operation_dataset.py │ ├── books_dataset.py │ ├── cities_dataset.py │ ├── create_datasets.py │ ├── loader_utils.py │ ├── nli_dataset.py │ ├── quirky_dataset.py │ ├── sciq_dataset.py │ ├── sentiment_dataset.py │ └── unary_operation_dataset.py ├── elk │ ├── __init__.py │ ├── burns_norm.py │ ├── ccs.py │ ├── ccs_losses.py │ ├── classifier.py │ ├── crc.py │ ├── extract_hiddens.py │ ├── lda.py │ ├── lr_classifier.py │ ├── mean_diff.py │ ├── random_baseline.py │ ├── roc_auc.py │ ├── run_transfers.py │ └── transfer.py ├── interventions │ ├── intervene.py │ └── run_intervene.py ├── results │ ├── __init__.py │ ├── figures.ipynb │ └── viz.py ├── training │ ├── __init__.py │ ├── lora-sft.sh │ ├── run_sft.py │ └── sft.py └── utils.py ├── pyproject.toml ├── ruff.toml ├── setup.py └── test ├── __init__.py └── dataset_viz.ipynb /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/README.md -------------------------------------------------------------------------------- /elk_generalization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/__init__.py -------------------------------------------------------------------------------- /elk_generalization/anomaly/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elk_generalization/anomaly/anomaly_experiment.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/anomaly/anomaly_experiment.py -------------------------------------------------------------------------------- /elk_generalization/anomaly/detect_anomaly.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/anomaly/detect_anomaly.py -------------------------------------------------------------------------------- /elk_generalization/anomaly/run_anomaly.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/anomaly/run_anomaly.py -------------------------------------------------------------------------------- /elk_generalization/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elk_generalization/datasets/binary_operation_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/binary_operation_dataset.py -------------------------------------------------------------------------------- /elk_generalization/datasets/books_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/books_dataset.py -------------------------------------------------------------------------------- /elk_generalization/datasets/cities_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/cities_dataset.py -------------------------------------------------------------------------------- /elk_generalization/datasets/create_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/create_datasets.py -------------------------------------------------------------------------------- /elk_generalization/datasets/loader_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/loader_utils.py -------------------------------------------------------------------------------- /elk_generalization/datasets/nli_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/nli_dataset.py -------------------------------------------------------------------------------- /elk_generalization/datasets/quirky_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/quirky_dataset.py -------------------------------------------------------------------------------- /elk_generalization/datasets/sciq_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/sciq_dataset.py -------------------------------------------------------------------------------- /elk_generalization/datasets/sentiment_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/sentiment_dataset.py -------------------------------------------------------------------------------- /elk_generalization/datasets/unary_operation_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/datasets/unary_operation_dataset.py -------------------------------------------------------------------------------- /elk_generalization/elk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elk_generalization/elk/burns_norm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/burns_norm.py -------------------------------------------------------------------------------- /elk_generalization/elk/ccs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/ccs.py -------------------------------------------------------------------------------- /elk_generalization/elk/ccs_losses.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/ccs_losses.py -------------------------------------------------------------------------------- /elk_generalization/elk/classifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/classifier.py -------------------------------------------------------------------------------- /elk_generalization/elk/crc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/crc.py -------------------------------------------------------------------------------- /elk_generalization/elk/extract_hiddens.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/extract_hiddens.py -------------------------------------------------------------------------------- /elk_generalization/elk/lda.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/lda.py -------------------------------------------------------------------------------- /elk_generalization/elk/lr_classifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/lr_classifier.py -------------------------------------------------------------------------------- /elk_generalization/elk/mean_diff.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/mean_diff.py -------------------------------------------------------------------------------- /elk_generalization/elk/random_baseline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/random_baseline.py -------------------------------------------------------------------------------- /elk_generalization/elk/roc_auc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/roc_auc.py -------------------------------------------------------------------------------- /elk_generalization/elk/run_transfers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/run_transfers.py -------------------------------------------------------------------------------- /elk_generalization/elk/transfer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/elk/transfer.py -------------------------------------------------------------------------------- /elk_generalization/interventions/intervene.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/interventions/intervene.py -------------------------------------------------------------------------------- /elk_generalization/interventions/run_intervene.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/interventions/run_intervene.py -------------------------------------------------------------------------------- /elk_generalization/results/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elk_generalization/results/figures.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/results/figures.ipynb -------------------------------------------------------------------------------- /elk_generalization/results/viz.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/results/viz.py -------------------------------------------------------------------------------- /elk_generalization/training/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /elk_generalization/training/lora-sft.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/training/lora-sft.sh -------------------------------------------------------------------------------- /elk_generalization/training/run_sft.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/training/run_sft.py -------------------------------------------------------------------------------- /elk_generalization/training/sft.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/training/sft.py -------------------------------------------------------------------------------- /elk_generalization/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/elk_generalization/utils.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/pyproject.toml -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/ruff.toml -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/setup.py -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/dataset_viz.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/elk-generalization/HEAD/test/dataset_viz.ipynb --------------------------------------------------------------------------------