├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── README.md ├── dataset ├── __init__.py ├── sequence_dataset.py └── text │ ├── c4.py │ ├── chunked_setencepiece_lm_dataset.py │ ├── enwik8.py │ ├── lm_dataset.py │ ├── pes2o.py │ ├── tokenizers │ └── sentencepiece.py │ └── wikitext_sentence_piece.py ├── framework ├── .gitignore ├── __init__.py ├── data_structures │ ├── __init__.py │ ├── dotdict.py │ └── vocabulary.py ├── helpers │ ├── __init__.py │ ├── argument_parser.py │ ├── distributed.py │ ├── saver.py │ ├── stopping_parallel_loop.py │ └── training_helper.py ├── layers │ ├── __init__.py │ ├── gumbel_sigmoid.py │ ├── gumbel_softmax.py │ └── positional_encoding.py ├── loader │ ├── __init__.py │ ├── collate.py │ ├── dataset_splitter.py │ └── sampler.py ├── utils │ ├── __init__.py │ ├── average.py │ ├── cossim.py │ ├── decompose_factors.py │ ├── distributed_ops.py │ ├── download.py │ ├── entropy.py │ ├── gpu_allocator.py │ ├── init.py │ ├── lockfile.py │ ├── parallel_map.py │ ├── port.py │ ├── process.py │ ├── seed.py │ ├── set_lr.py │ ├── time_meter.py │ └── universal.py └── visualize │ ├── __init__.py │ ├── plot.py │ └── tensorboard.py ├── interfaces ├── __init__.py ├── language_model_interface.py ├── model_interface.py └── result.py ├── layers ├── __init__.py ├── cvmm.py ├── layer_with_visualization.py ├── logging_layer.py ├── moe_layer.py ├── once_per_iter_layer.py ├── regularized_layer.py └── transformer │ ├── __init__.py │ ├── fast_rope_attention.py │ ├── fast_rope_transformer.py │ ├── full_moe_relative_attention.py │ ├── moa.py │ ├── moe_attention_relative_transformer.py │ ├── multi_head_attention.py │ ├── multi_head_relative_pos_attention.py │ ├── relative_moe_transformer.py │ ├── relative_preln_transformer.py │ ├── relative_transformer.py │ ├── rotary_pos_encoding.py │ ├── transformer.py │ └── transformer_preln.py ├── main.py ├── models ├── __init__.py └── transformer_language_model.py ├── optimizer ├── __init__.py └── step_lr_sched.py ├── paper ├── .gitignore ├── common.py ├── config.json ├── framework ├── layers ├── lib │ ├── __init__.py │ ├── common.py │ ├── config.py │ ├── cross_validate_stats.py │ ├── get_ckpt.py │ ├── matplotlib_config.py │ ├── run_command.py │ ├── source.py │ └── stat_tracker.py ├── my_vs_moa.py ├── plot_attention.py ├── plot_components.py ├── plot_components_wandb.py ├── plot_datasets.py ├── plot_fullmoe.py ├── plot_hyperparams.py ├── plot_rope.py ├── plot_rope2.py └── run_tests.py ├── requrements.txt ├── run.py ├── sweeps ├── c4_baseline_big.yaml ├── c4_baseline_big_h4.yaml ├── c4_big_fullmoe_h4_matchfix.yaml ├── c4_moeatt_big_h4_matched.yaml ├── c4_moeatt_small_k3.yaml ├── c4_small_fullmoe.yaml ├── c4_xl.yaml ├── c4_xl_h2.yaml ├── enwik8_baseline.yaml ├── enwik8_baseline_h2.yaml ├── enwik8_moeatt.yaml ├── matched_projection_search │ ├── wikitext103_moeatt_matched_k_only.yaml │ ├── wikitext103_moeatt_matched_ko.yaml │ ├── wikitext103_moeatt_matched_l_ff.yaml │ ├── wikitext103_moeatt_matched_o_only.yaml │ ├── wikitext103_moeatt_matched_q_only.yaml │ ├── wikitext103_moeatt_matched_qk.yaml │ ├── wikitext103_moeatt_matched_qko.yaml │ ├── wikitext103_moeatt_matched_qkv.yaml │ ├── wikitext103_moeatt_matched_qo.yaml │ ├── wikitext103_moeatt_matched_v_only.yaml │ ├── wikitext103_moeatt_matched_vk.yaml │ ├── wikitext103_moeatt_matched_vkqo.yaml │ ├── wikitext103_moeatt_matched_vok.yaml │ ├── wikitext103_moeatt_matched_voq.yaml │ └── wikitext103_moeatt_matched_vq.yaml ├── pes2o_baseline_big.yaml ├── pes2o_baseline_big_h4.yaml ├── pes2o_big_fullmoe_h4_matchfix.yaml ├── pes2o_moeatt_big_h4_matched_norminit.yaml ├── pes2o_moeatt_small_k3.yaml ├── pes2o_small_fullmoe.yaml ├── pes2o_xl.yaml ├── pes2o_xl_h2.yaml ├── wikitext103_baseline_big.yaml ├── wikitext103_baseline_big_h2.yaml ├── wikitext103_baseline_big_rope.yaml ├── wikitext103_baseline_big_rope_h2.yaml ├── wikitext103_big_fullmoe_h4_k2_matchfix.yaml ├── wikitext103_moeatt_big_h2_matched_k4.yaml ├── wikitext103_moeatt_big_h4_matched_k2_rope.yaml ├── wikitext103_moeatt_rope_matched_k3.yaml ├── wikitext103_small_fullmoe.yaml ├── wikitext103_xl.yaml ├── wikitext103_xl_h2.yaml ├── wikitext103_xl_rope.yaml └── wikitext103_xl_rope_h2.yaml └── tasks ├── __init__.py ├── simple ├── __init__.py ├── language_model │ ├── __init__.py │ ├── c4_transformer.py │ ├── enwik8_transformer.py │ ├── pes2o_transformer.py │ ├── transformer_lm_mixin.py │ ├── transformer_relu_analyze.py │ └── wikitext103_sp_transformer.py └── simple_task.py ├── task.py └── task_db.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/.gitignore -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/.vscode/launch.json -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/.vscode/settings.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/README.md -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/sequence_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/sequence_dataset.py -------------------------------------------------------------------------------- /dataset/text/c4.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/text/c4.py -------------------------------------------------------------------------------- /dataset/text/chunked_setencepiece_lm_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/text/chunked_setencepiece_lm_dataset.py -------------------------------------------------------------------------------- /dataset/text/enwik8.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/text/enwik8.py -------------------------------------------------------------------------------- /dataset/text/lm_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/text/lm_dataset.py -------------------------------------------------------------------------------- /dataset/text/pes2o.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/text/pes2o.py -------------------------------------------------------------------------------- /dataset/text/tokenizers/sentencepiece.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/text/tokenizers/sentencepiece.py -------------------------------------------------------------------------------- /dataset/text/wikitext_sentence_piece.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/dataset/text/wikitext_sentence_piece.py -------------------------------------------------------------------------------- /framework/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | -------------------------------------------------------------------------------- /framework/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/__init__.py -------------------------------------------------------------------------------- /framework/data_structures/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/data_structures/__init__.py -------------------------------------------------------------------------------- /framework/data_structures/dotdict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/data_structures/dotdict.py -------------------------------------------------------------------------------- /framework/data_structures/vocabulary.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/data_structures/vocabulary.py -------------------------------------------------------------------------------- /framework/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/helpers/__init__.py -------------------------------------------------------------------------------- /framework/helpers/argument_parser.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/helpers/argument_parser.py -------------------------------------------------------------------------------- /framework/helpers/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/helpers/distributed.py -------------------------------------------------------------------------------- /framework/helpers/saver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/helpers/saver.py -------------------------------------------------------------------------------- /framework/helpers/stopping_parallel_loop.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/helpers/stopping_parallel_loop.py -------------------------------------------------------------------------------- /framework/helpers/training_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/helpers/training_helper.py -------------------------------------------------------------------------------- /framework/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/layers/__init__.py -------------------------------------------------------------------------------- /framework/layers/gumbel_sigmoid.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/layers/gumbel_sigmoid.py -------------------------------------------------------------------------------- /framework/layers/gumbel_softmax.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/layers/gumbel_softmax.py -------------------------------------------------------------------------------- /framework/layers/positional_encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/layers/positional_encoding.py -------------------------------------------------------------------------------- /framework/loader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/loader/__init__.py -------------------------------------------------------------------------------- /framework/loader/collate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/loader/collate.py -------------------------------------------------------------------------------- /framework/loader/dataset_splitter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/loader/dataset_splitter.py -------------------------------------------------------------------------------- /framework/loader/sampler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/loader/sampler.py -------------------------------------------------------------------------------- /framework/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/__init__.py -------------------------------------------------------------------------------- /framework/utils/average.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/average.py -------------------------------------------------------------------------------- /framework/utils/cossim.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/cossim.py -------------------------------------------------------------------------------- /framework/utils/decompose_factors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/decompose_factors.py -------------------------------------------------------------------------------- /framework/utils/distributed_ops.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/distributed_ops.py -------------------------------------------------------------------------------- /framework/utils/download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/download.py -------------------------------------------------------------------------------- /framework/utils/entropy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/entropy.py -------------------------------------------------------------------------------- /framework/utils/gpu_allocator.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/gpu_allocator.py -------------------------------------------------------------------------------- /framework/utils/init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/init.py -------------------------------------------------------------------------------- /framework/utils/lockfile.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/lockfile.py -------------------------------------------------------------------------------- /framework/utils/parallel_map.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/parallel_map.py -------------------------------------------------------------------------------- /framework/utils/port.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/port.py -------------------------------------------------------------------------------- /framework/utils/process.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/process.py -------------------------------------------------------------------------------- /framework/utils/seed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/seed.py -------------------------------------------------------------------------------- /framework/utils/set_lr.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/set_lr.py -------------------------------------------------------------------------------- /framework/utils/time_meter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/time_meter.py -------------------------------------------------------------------------------- /framework/utils/universal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/utils/universal.py -------------------------------------------------------------------------------- /framework/visualize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/visualize/__init__.py -------------------------------------------------------------------------------- /framework/visualize/plot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/visualize/plot.py -------------------------------------------------------------------------------- /framework/visualize/tensorboard.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/framework/visualize/tensorboard.py -------------------------------------------------------------------------------- /interfaces/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/interfaces/__init__.py -------------------------------------------------------------------------------- /interfaces/language_model_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/interfaces/language_model_interface.py -------------------------------------------------------------------------------- /interfaces/model_interface.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/interfaces/model_interface.py -------------------------------------------------------------------------------- /interfaces/result.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/interfaces/result.py -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/__init__.py -------------------------------------------------------------------------------- /layers/cvmm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/cvmm.py -------------------------------------------------------------------------------- /layers/layer_with_visualization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/layer_with_visualization.py -------------------------------------------------------------------------------- /layers/logging_layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/logging_layer.py -------------------------------------------------------------------------------- /layers/moe_layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/moe_layer.py -------------------------------------------------------------------------------- /layers/once_per_iter_layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/once_per_iter_layer.py -------------------------------------------------------------------------------- /layers/regularized_layer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/regularized_layer.py -------------------------------------------------------------------------------- /layers/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/__init__.py -------------------------------------------------------------------------------- /layers/transformer/fast_rope_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/fast_rope_attention.py -------------------------------------------------------------------------------- /layers/transformer/fast_rope_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/fast_rope_transformer.py -------------------------------------------------------------------------------- /layers/transformer/full_moe_relative_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/full_moe_relative_attention.py -------------------------------------------------------------------------------- /layers/transformer/moa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/moa.py -------------------------------------------------------------------------------- /layers/transformer/moe_attention_relative_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/moe_attention_relative_transformer.py -------------------------------------------------------------------------------- /layers/transformer/multi_head_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/multi_head_attention.py -------------------------------------------------------------------------------- /layers/transformer/multi_head_relative_pos_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/multi_head_relative_pos_attention.py -------------------------------------------------------------------------------- /layers/transformer/relative_moe_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/relative_moe_transformer.py -------------------------------------------------------------------------------- /layers/transformer/relative_preln_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/relative_preln_transformer.py -------------------------------------------------------------------------------- /layers/transformer/relative_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/relative_transformer.py -------------------------------------------------------------------------------- /layers/transformer/rotary_pos_encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/rotary_pos_encoding.py -------------------------------------------------------------------------------- /layers/transformer/transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/transformer.py -------------------------------------------------------------------------------- /layers/transformer/transformer_preln.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/layers/transformer/transformer_preln.py -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/main.py -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/models/__init__.py -------------------------------------------------------------------------------- /models/transformer_language_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/models/transformer_language_model.py -------------------------------------------------------------------------------- /optimizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/optimizer/__init__.py -------------------------------------------------------------------------------- /optimizer/step_lr_sched.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/optimizer/step_lr_sched.py -------------------------------------------------------------------------------- /paper/.gitignore: -------------------------------------------------------------------------------- 1 | checkpoints* 2 | coocurence* 3 | cache 4 | att_plots -------------------------------------------------------------------------------- /paper/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/common.py -------------------------------------------------------------------------------- /paper/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/config.json -------------------------------------------------------------------------------- /paper/framework: -------------------------------------------------------------------------------- 1 | ../framework -------------------------------------------------------------------------------- /paper/layers: -------------------------------------------------------------------------------- 1 | ../layers -------------------------------------------------------------------------------- /paper/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/__init__.py -------------------------------------------------------------------------------- /paper/lib/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/common.py -------------------------------------------------------------------------------- /paper/lib/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/config.py -------------------------------------------------------------------------------- /paper/lib/cross_validate_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/cross_validate_stats.py -------------------------------------------------------------------------------- /paper/lib/get_ckpt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/get_ckpt.py -------------------------------------------------------------------------------- /paper/lib/matplotlib_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/matplotlib_config.py -------------------------------------------------------------------------------- /paper/lib/run_command.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/run_command.py -------------------------------------------------------------------------------- /paper/lib/source.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/source.py -------------------------------------------------------------------------------- /paper/lib/stat_tracker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/lib/stat_tracker.py -------------------------------------------------------------------------------- /paper/my_vs_moa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/my_vs_moa.py -------------------------------------------------------------------------------- /paper/plot_attention.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_attention.py -------------------------------------------------------------------------------- /paper/plot_components.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_components.py -------------------------------------------------------------------------------- /paper/plot_components_wandb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_components_wandb.py -------------------------------------------------------------------------------- /paper/plot_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_datasets.py -------------------------------------------------------------------------------- /paper/plot_fullmoe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_fullmoe.py -------------------------------------------------------------------------------- /paper/plot_hyperparams.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_hyperparams.py -------------------------------------------------------------------------------- /paper/plot_rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_rope.py -------------------------------------------------------------------------------- /paper/plot_rope2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/plot_rope2.py -------------------------------------------------------------------------------- /paper/run_tests.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/paper/run_tests.py -------------------------------------------------------------------------------- /requrements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/requrements.txt -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/run.py -------------------------------------------------------------------------------- /sweeps/c4_baseline_big.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_baseline_big.yaml -------------------------------------------------------------------------------- /sweeps/c4_baseline_big_h4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_baseline_big_h4.yaml -------------------------------------------------------------------------------- /sweeps/c4_big_fullmoe_h4_matchfix.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_big_fullmoe_h4_matchfix.yaml -------------------------------------------------------------------------------- /sweeps/c4_moeatt_big_h4_matched.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_moeatt_big_h4_matched.yaml -------------------------------------------------------------------------------- /sweeps/c4_moeatt_small_k3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_moeatt_small_k3.yaml -------------------------------------------------------------------------------- /sweeps/c4_small_fullmoe.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_small_fullmoe.yaml -------------------------------------------------------------------------------- /sweeps/c4_xl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_xl.yaml -------------------------------------------------------------------------------- /sweeps/c4_xl_h2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/c4_xl_h2.yaml -------------------------------------------------------------------------------- /sweeps/enwik8_baseline.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/enwik8_baseline.yaml -------------------------------------------------------------------------------- /sweeps/enwik8_baseline_h2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/enwik8_baseline_h2.yaml -------------------------------------------------------------------------------- /sweeps/enwik8_moeatt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/enwik8_moeatt.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_k_only.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_k_only.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_ko.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_ko.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_l_ff.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_l_ff.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_o_only.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_o_only.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_q_only.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_q_only.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_qk.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_qk.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_qko.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_qko.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_qkv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_qkv.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_qo.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_qo.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_v_only.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_v_only.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_vk.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_vk.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_vkqo.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_vkqo.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_vok.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_vok.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_voq.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_voq.yaml -------------------------------------------------------------------------------- /sweeps/matched_projection_search/wikitext103_moeatt_matched_vq.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/matched_projection_search/wikitext103_moeatt_matched_vq.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_baseline_big.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_baseline_big.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_baseline_big_h4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_baseline_big_h4.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_big_fullmoe_h4_matchfix.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_big_fullmoe_h4_matchfix.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_moeatt_big_h4_matched_norminit.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_moeatt_big_h4_matched_norminit.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_moeatt_small_k3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_moeatt_small_k3.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_small_fullmoe.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_small_fullmoe.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_xl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_xl.yaml -------------------------------------------------------------------------------- /sweeps/pes2o_xl_h2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/pes2o_xl_h2.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_baseline_big.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_baseline_big.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_baseline_big_h2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_baseline_big_h2.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_baseline_big_rope.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_baseline_big_rope.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_baseline_big_rope_h2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_baseline_big_rope_h2.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_big_fullmoe_h4_k2_matchfix.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_big_fullmoe_h4_k2_matchfix.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_moeatt_big_h2_matched_k4.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_moeatt_big_h2_matched_k4.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_moeatt_big_h4_matched_k2_rope.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_moeatt_big_h4_matched_k2_rope.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_moeatt_rope_matched_k3.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_moeatt_rope_matched_k3.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_small_fullmoe.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_small_fullmoe.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_xl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_xl.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_xl_h2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_xl_h2.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_xl_rope.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_xl_rope.yaml -------------------------------------------------------------------------------- /sweeps/wikitext103_xl_rope_h2.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/sweeps/wikitext103_xl_rope_h2.yaml -------------------------------------------------------------------------------- /tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/__init__.py -------------------------------------------------------------------------------- /tasks/simple/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/__init__.py -------------------------------------------------------------------------------- /tasks/simple/language_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/language_model/__init__.py -------------------------------------------------------------------------------- /tasks/simple/language_model/c4_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/language_model/c4_transformer.py -------------------------------------------------------------------------------- /tasks/simple/language_model/enwik8_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/language_model/enwik8_transformer.py -------------------------------------------------------------------------------- /tasks/simple/language_model/pes2o_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/language_model/pes2o_transformer.py -------------------------------------------------------------------------------- /tasks/simple/language_model/transformer_lm_mixin.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/language_model/transformer_lm_mixin.py -------------------------------------------------------------------------------- /tasks/simple/language_model/transformer_relu_analyze.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/language_model/transformer_relu_analyze.py -------------------------------------------------------------------------------- /tasks/simple/language_model/wikitext103_sp_transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/language_model/wikitext103_sp_transformer.py -------------------------------------------------------------------------------- /tasks/simple/simple_task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/simple/simple_task.py -------------------------------------------------------------------------------- /tasks/task.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/task.py -------------------------------------------------------------------------------- /tasks/task_db.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertCsordas/moe_attention/HEAD/tasks/task_db.py --------------------------------------------------------------------------------