├── .gitignore ├── README.md ├── cache_t0.sh ├── configs ├── exp │ ├── LayerNorm │ │ ├── post_layernorm.gin │ │ ├── pre_layernorm.gin │ │ ├── reset_optim.gin │ │ ├── training.gin │ │ └── training_pp.gin │ ├── PositionEmbedding │ │ ├── abs_pos.gin │ │ ├── alibi.gin │ │ ├── no_relpos.gin │ │ ├── relpos.gin │ │ └── rotary.gin │ ├── batch_size.gin │ ├── gptj.gin │ ├── memorization.gin │ ├── optim.gin │ ├── partition.gin │ ├── ratio.gin │ └── scaling.gin ├── size │ ├── 110m │ │ └── vanilla.gin │ ├── 1_6b │ │ ├── deep.gin │ │ ├── vanilla.gin │ │ └── wide.gin │ ├── 200m │ │ ├── deep.gin │ │ ├── vanilla.gin │ │ ├── vanilla_gpt.gin │ │ └── wide.gin │ ├── 25m │ │ └── vanilla.gin │ ├── 3_1b │ │ └── vanilla.gin │ ├── 470m │ │ └── vanilla.gin │ ├── 60m │ │ └── vanilla.gin │ └── 920m │ │ └── vanilla.gin ├── t5v2 │ ├── base.gin │ └── large.gin └── task │ ├── eval │ └── t0_eval.gin │ ├── finetune │ ├── codexglue │ │ ├── code_to_text_go.gin │ │ ├── code_to_text_java.gin │ │ ├── code_to_text_javascript.gin │ │ ├── code_to_text_php.gin │ │ ├── code_to_text_python.gin │ │ └── code_to_text_ruby.gin │ ├── extend_1024.gin │ ├── extend_2048-causal.gin │ ├── extend_2048-prefix.gin │ ├── extend_2048.gin │ ├── extend_4096.gin │ ├── extend_512.gin │ ├── extend_8192.gin │ ├── flan2021.gin │ ├── flan2021_t5.gin │ ├── flan2022.gin │ ├── flan2022_t5.gin │ ├── natural_sglue.gin │ ├── natural_sglue_t5.gin │ ├── pile_mlm.gin │ ├── pile_prefix_lm.gin │ ├── pile_prefix_lm_causal.gin │ ├── pile_ul2r.gin │ ├── sglue.gin │ ├── sglue_t5.gin │ └── t0_train.gin │ └── pretrain │ ├── c4_mixed_objective.gin │ ├── c4_mlm.gin │ ├── pile_causal.gin │ ├── pile_mixed_objective.gin │ ├── pile_mlm.gin │ ├── pile_mlm_causal.gin │ └── pile_prefix_lm.gin ├── convert_weights ├── configs │ ├── base_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ ├── base_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json │ ├── large_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ ├── large_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json │ ├── xl_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ ├── xl_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json │ ├── xxl_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ └── xxl_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json ├── convert-t5.txt ├── convert_t5v1_checkpoint_to_pytorch.py ├── convert_t5v2_checkpoint_to_pytorch.py ├── convert_t5x_checkpoint_to_flax.py ├── convert_t5x_checkpoint_to_pytorch.py ├── scripts │ ├── convert_v1.sh │ └── convert_v2.sh ├── upload-codexglue.sh ├── upload-multiple.sh ├── upload-t5x.sh └── upload.sh ├── data ├── __init__.py ├── bigbenchlite │ ├── __init__.py │ └── tasks.py ├── c4 │ ├── __init__.py │ ├── c4_utils.py │ └── tasks.py ├── codexglue │ ├── __init__.py │ └── tasks.py ├── flan │ ├── __init__.py │ ├── tasks.py │ └── tasks_alt.py ├── metrics.py ├── p3 │ ├── __init__.py │ └── 
tasks.py ├── pile │ ├── __init__.py │ ├── pile_utils.py │ └── tasks.py ├── preprocessors.py ├── preprocessors_test.py ├── sglue │ ├── __init__.py │ ├── postprocessors.py │ ├── preprocessors.py │ ├── tasks.py │ ├── tasks_natural.py │ └── tasks_t5.py ├── utils.py └── vocab.py ├── evals ├── eval-all.sh ├── eval-bbh.sh ├── eval-codexglue.sh ├── eval-cot.sh ├── eval-held_in.sh ├── eval-mmlu.sh └── eval-sglue.sh ├── experiments ├── benchmarks │ ├── t5-v1.1-lm100k │ │ ├── finetune_base_code_to_text.sh │ │ ├── finetune_large_code_to_text.sh │ │ ├── sglue_base.sh │ │ ├── sglue_large.sh │ │ ├── t0-train_base.sh │ │ └── t0-train_large.sh │ ├── t5-v1.1-lm_adapt │ │ ├── base_flan.sh │ │ ├── base_flan2021.sh │ │ ├── large_flan.sh │ │ ├── large_flan2021.sh │ │ ├── xl_flan.sh │ │ ├── xl_flan2021.sh │ │ ├── xxl_flan.sh │ │ └── xxl_flan2021.sh │ └── t5-v1.1 │ │ ├── base_code_to_text.sh │ │ ├── base_flan2021.sh │ │ ├── base_flan2022.sh │ │ ├── base_sglue.sh │ │ ├── base_t0-train.sh │ │ ├── large_code_to_text.sh │ │ ├── large_flan2021.sh │ │ ├── large_flan2022.sh │ │ ├── large_sglue.sh │ │ ├── large_t0-train.sh │ │ ├── xl_code_to_text.sh │ │ ├── xl_flan2021.sh │ │ ├── xl_flan2022.sh │ │ ├── xl_sglue.sh │ │ ├── xxl_code_to_text.sh │ │ ├── xxl_flan2021.sh │ │ ├── xxl_flan2022.sh │ │ └── xxl_sglue.sh ├── improved_t5 │ ├── ablations │ │ ├── v1-1_xl_flan2021_submix.sh │ │ └── v2_xl_flan2021_submix.sh │ ├── eval_bf16.sh │ ├── eval_fp16.sh │ ├── lm_adapt │ │ ├── finetune_base_flan.sh │ │ ├── finetune_large_flan.sh │ │ ├── finetune_xl_flan.sh │ │ ├── finetune_xxl_flan.sh │ │ ├── lm_adapt_base.sh │ │ ├── lm_adapt_large.sh │ │ ├── lm_adapt_xl.sh │ │ ├── lm_adapt_xxl.sh │ │ ├── s_causal │ │ │ ├── finetune_base_flan.sh │ │ │ ├── finetune_large_flan.sh │ │ │ ├── finetune_xl_flan.sh │ │ │ ├── finetune_xxl_flan.sh │ │ │ ├── lm_adapt_base.sh │ │ │ ├── lm_adapt_large.sh │ │ │ ├── lm_adapt_xl.sh │ │ │ └── lm_adapt_xxl.sh │ │ └── ul2r │ │ │ ├── finetune_base_ns_flan2021.sh │ │ │ ├── finetune_xl_flan2021.sh │ │ │ ├── finetune_xl_flan2022.sh │ │ │ ├── lm_adapt_base.sh │ │ │ ├── lm_adapt_base_ns.sh │ │ │ ├── lm_adapt_large.sh │ │ │ ├── lm_adapt_xl.sh │ │ │ └── lm_adapt_xxl.sh │ ├── mlm │ │ ├── extend-causal.sh │ │ ├── extend-prefix.sh │ │ ├── finetune_code_to_text.sh │ │ ├── finetune_flan2022.sh │ │ ├── finetune_sglue.sh │ │ ├── finetune_t0.sh │ │ ├── pretrain_mlm.sh │ │ └── pretrain_mlm_causal.sh │ └── ul2_causal │ │ ├── finetune_base_code_to_text.sh │ │ ├── finetune_base_flan.sh │ │ ├── finetune_base_sglue.sh │ │ ├── finetune_large_code_to_text.sh │ │ ├── finetune_large_flan.sh │ │ ├── finetune_large_sglue.sh │ │ ├── finetune_xl_code_to_text.sh │ │ ├── finetune_xl_flan2021.sh │ │ ├── finetune_xl_sglue.sh │ │ ├── finetune_xxl_flan.sh │ │ ├── finetune_xxl_sglue.sh │ │ ├── pretrain_base.sh │ │ ├── pretrain_large.sh │ │ ├── pretrain_xl.sh │ │ └── pretrain_xxl.sh └── preliminary │ ├── layernorm │ ├── eval │ │ ├── t0_eval_adamw_post.sh │ │ ├── t0_eval_adamw_post_rotary.sh │ │ ├── t0_eval_adamw_pre.sh │ │ ├── t0_eval_alibi_relpos.sh │ │ └── t0_eval_base_lm100k.sh │ ├── finetune │ │ ├── t0_train_adafactor_post.sh │ │ ├── t0_train_adafactor_pre.sh │ │ ├── t0_train_adamw_post.sh │ │ ├── t0_train_adamw_post_rotary.sh │ │ ├── t0_train_adamw_pre.sh │ │ └── t0_train_base_lm100k.sh │ ├── pretrain │ │ ├── pile_mlm_adafactor_post.sh │ │ ├── pile_mlm_adafactor_pre.sh │ │ ├── pile_mlm_adamw_post.sh │ │ ├── pile_mlm_adamw_post_rotary.sh │ │ ├── pile_mlm_adamw_pre.sh │ │ └── pile_mlm_adamw_pre_rotary.sh │ └── t0_eval.py │ ├── 
mixed_pretraining_objectives │ ├── finetune │ │ ├── 0_10 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_15 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_25 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_50 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_60 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ └── 0_75 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ ├── pretrain_c4 │ │ ├── 920m_c4_mlm_0_10.sh │ │ ├── 920m_c4_mlm_0_15.sh │ │ ├── 920m_c4_mlm_0_25.sh │ │ ├── 920m_c4_mlm_0_50.sh │ │ ├── 920m_c4_mlm_0_75.sh │ │ └── 920m_c4_mlm_1_00.sh │ ├── pretrain_pile │ │ ├── 920m_pile_mix_0_10.sh │ │ ├── 920m_pile_mix_0_15.sh │ │ ├── 920m_pile_mix_0_25.sh │ │ ├── 920m_pile_mix_0_50.sh │ │ ├── 920m_pile_mix_0_60.sh │ │ └── 920m_pile_mix_0_75.sh │ ├── sglue_graph.py │ ├── super_glue_performance_0-10.png │ └── super_glue_performance_flop_256k.png │ ├── partition │ ├── pretrain_pile_1.sh │ ├── pretrain_pile_2.sh │ ├── pretrain_pile_4.sh │ └── pretrain_pile_8.sh │ ├── positional_embeddings │ ├── alibi │ │ ├── eval_perplexity │ │ │ ├── no_alibi_1024_1024.sh │ │ │ ├── no_alibi_1024_114.sh │ │ │ ├── no_alibi_1024_2048.sh │ │ │ ├── no_alibi_1024_256.sh │ │ │ ├── no_alibi_1024_512.sh │ │ │ ├── no_alibi_1_1024.sh │ │ │ ├── no_alibi_1_114.sh │ │ │ ├── no_alibi_1_2048.sh │ │ │ ├── no_alibi_1_256.sh │ │ │ ├── no_alibi_1_512.sh │ │ │ ├── no_alibi_512_1024.sh │ │ │ ├── no_alibi_512_114.sh │ │ │ ├── no_alibi_512_2048.sh │ │ │ ├── no_alibi_512_256.sh │ │ │ ├── no_alibi_512_512.sh │ │ │ ├── with_alibi_1024_1024.sh │ │ │ ├── with_alibi_1024_114.sh │ │ │ ├── with_alibi_1024_2048.sh │ │ │ ├── with_alibi_1024_256.sh │ │ │ ├── with_alibi_1024_512.sh │ │ │ ├── with_alibi_1_1024.sh │ │ │ ├── with_alibi_1_114.sh │ │ │ ├── with_alibi_1_2048.sh │ │ │ ├── with_alibi_1_256.sh │ │ │ ├── with_alibi_1_512.sh │ │ │ ├── with_alibi_512_1024.sh │ │ │ ├── with_alibi_512_114.sh │ │ │ ├── 
with_alibi_512_2048.sh │ │ │ ├── with_alibi_512_256.sh │ │ │ └── with_alibi_512_512.sh │ │ ├── finetune_sglue_prefix_lm_no_alibi.sh │ │ ├── finetune_sglue_prefix_lm_with_alibi.sh │ │ ├── finetune_sglue_prefix_lm_with_alibi_plus_relpos.sh │ │ ├── pretrain_pile_prefix_lm_no_alibi.sh │ │ ├── pretrain_pile_prefix_lm_with_alibi.sh │ │ └── pretrain_pile_prefix_lm_with_alibi_plus_relpos.sh │ ├── pretrain_rotary_pile_mlm.sh │ └── rotary │ │ ├── eval │ │ ├── t0_eval_alibi_relpos.sh │ │ ├── t0_eval_benchmark.sh │ │ ├── t0_eval_metro_learning_only_training.sh │ │ ├── t0_eval_metro_no_alibi.sh │ │ └── t0_eval_metro_training.sh │ │ ├── finetune │ │ └── sglue │ │ │ ├── sglue_train_benchmark.sh │ │ │ ├── sglue_train_rotary.sh │ │ │ └── sglue_train_rotary_relpos.sh │ │ ├── pretrain_benchmark_pile_mlm.sh │ │ ├── pretrain_rotary_relpos_pile_mlm.sh │ │ ├── sglue_graph.py │ │ └── t0_eval.py │ └── scaling_laws │ ├── 110m │ ├── sglue_finetune_110m_16000.sh │ ├── sglue_finetune_110m_32000.sh │ ├── sglue_finetune_110m_48000.sh │ ├── sglue_finetune_110m_64000.sh │ └── sglue_finetune_110m_80000.sh │ ├── 1_6b │ ├── sglue_finetune_1_6b_128000.sh │ ├── sglue_finetune_1_6b_192000.sh │ ├── sglue_finetune_1_6b_256000.sh │ ├── sglue_finetune_1_6b_320000.sh │ ├── sglue_finetune_1_6b_384000.sh │ ├── sglue_finetune_1_6b_424000.sh │ ├── sglue_finetune_1_6b_448000.sh │ ├── sglue_finetune_1_6b_512000.sh │ └── sglue_finetune_1_6b_64000.sh │ ├── 25m │ ├── sglue_finetune_25m_16000.sh │ ├── sglue_finetune_25m_24000.sh │ ├── sglue_finetune_25m_32000.sh │ ├── sglue_finetune_25m_40000.sh │ ├── sglue_finetune_25m_48000.sh │ ├── sglue_finetune_25m_56000.sh │ ├── sglue_finetune_25m_64000.sh │ └── sglue_finetune_25m_8000.sh │ ├── 920m │ ├── pretrain_c4.sh │ ├── pretrain_pile.sh │ ├── sglue_finetune_920m_128000.sh │ ├── sglue_finetune_920m_160000.sh │ ├── sglue_finetune_920m_192000.sh │ ├── sglue_finetune_920m_224000.sh │ ├── sglue_finetune_920m_256000.sh │ ├── sglue_finetune_920m_32000.sh │ ├── sglue_finetune_920m_64000.sh │ └── sglue_finetune_920m_96000.sh │ ├── sglue_finetune.sh │ ├── sglue_graph.py │ ├── sglue_graph_.py │ ├── super_glue_performance.png │ └── super_glue_performance_flop.png ├── models ├── decoder_t5 │ ├── __init__.py │ └── modeling_decoder_t5.py └── scalable_t5 │ ├── README.md │ ├── __init__.py │ ├── alibi_position_biases.py │ ├── alibi_position_biases_test.py │ ├── layers.py │ ├── layers_test.py │ ├── local_tiny.gin │ ├── mt5 │ ├── __init__.py │ ├── base.gin │ ├── large.gin │ ├── small.gin │ ├── xl.gin │ └── xxl.gin │ ├── network.py │ ├── network_test.py │ ├── rotary_embedding.py │ ├── rotary_embedding_test.py │ └── t5_1_1 │ ├── __init__.py │ ├── base.gin │ ├── examples │ ├── __init__.py │ └── wmt19_ende_from_scratch.gin │ ├── large.gin │ ├── small.gin │ ├── xl.gin │ └── xxl.gin ├── setup.py └── tpu-scripts ├── kill.sh ├── run.sh ├── send.sh └── setup.sh /cache_t0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=g40 3 | #SBATCH --job-name=pile-t5x 4 | #SBATCH --nodes=1 5 | #SBATCH --gpus-per-node=8 6 | #SBATCH --ntasks-per-node=12 7 | #SBATCH --output=/fsx/lintangsutawika/improved_t5/logs/%x_%j.out 8 | #SBATCH --exclusive 9 | #SBATCH --requeue 10 | #SBATCH --account=neox 11 | 12 | source /fsx/lintangsutawika/t5_env/bin/activate 13 | 14 | srun --account neox \ 15 | seqio_cache_tasks \ 16 | --tasks="anli_must_be_true_r1" \ 17 | --output_cache_dir=/fsx/lintangsutawika/data \ 18 | --module_import=t0.seqio_tasks \ 19 | --alsologtostderr 20 | 
-------------------------------------------------------------------------------- /configs/exp/LayerNorm/post_layernorm.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.pre_layer_norm = False -------------------------------------------------------------------------------- /configs/exp/LayerNorm/pre_layernorm.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.pre_layer_norm = True 2 | -------------------------------------------------------------------------------- /configs/exp/LayerNorm/reset_optim.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | from t5x import utils 4 | 5 | utils.RestoreCheckpointConfig: 6 | assignment_map = [(".*param_states.*", None)] 7 | fallback_to_scratch = True 8 | -------------------------------------------------------------------------------- /configs/exp/LayerNorm/training.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import optax 4 | 5 | from t5x import utils 6 | from t5x import optimizers 7 | 8 | BATCH_SIZE = 2048 9 | DROPOUT_RATE = 0.1 10 | 11 | # ------------------- Optimizer ------------------------------------------------ 12 | # `learning_rate` is set by `Trainer.learning_rate_fn`. 13 | OPTIMIZER = @optimizers.chain() 14 | 15 | optimizers.chain: 16 | transformations = [@optax.adamw()] 17 | 18 | optax.adamw: 19 | learning_rate = @utils.create_learning_rate_scheduler() 20 | eps = 1e-06 21 | b1 = 0.9 22 | b2 = 0.98 23 | weight_decay = 0.01 24 | 25 | utils.create_learning_rate_scheduler: 26 | factors = 'linear_decay' 27 | decay_factor = 8e-06 28 | base_learning_rate = 4e-04 29 | warmup_steps = 10000 30 | -------------------------------------------------------------------------------- /configs/exp/LayerNorm/training_pp.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import optax 4 | 5 | from t5x import utils 6 | from t5x import optimizers 7 | 8 | BATCH_SIZE = 2048 9 | DROPOUT_RATE = 0.1 10 | 11 | # ------------------- Optimizer ------------------------------------------------ 12 | # `learning_rate` is set by `Trainer.learning_rate_fn`. 
13 | OPTIMIZER = @optimizers.chain() 14 | 15 | optimizers.chain: 16 | transformations = [@optax.clip(), @optax.adamw()] 17 | 18 | optax.clip: 19 | max_delta = 2.0 20 | 21 | optax.adamw: 22 | learning_rate = @utils.create_learning_rate_scheduler() 23 | eps = 1e-06 24 | b1 = 0.9 25 | b2 = 0.98 26 | weight_decay = 0.01 27 | 28 | utils.create_learning_rate_scheduler: 29 | factors = 'linear_decay' 30 | decay_factor = 8e-06 31 | base_learning_rate = 4e-04 32 | warmup_steps = 10000 33 | -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/abs_pos.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_abs_pos_embedding = True -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/alibi.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_alibi = True -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/no_relpos.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_rel_pos = False -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/relpos.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_rel_pos = True -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/rotary.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_rotary_embedding = True -------------------------------------------------------------------------------- /configs/exp/batch_size.gin: -------------------------------------------------------------------------------- 1 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/gptj.gin: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | network.T5Config.gptj = True 3 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/memorization.gin: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | 3 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/optim.gin: -------------------------------------------------------------------------------- 1 | TRAIN_STEPS = 32000 2 | BATCH_SIZE = 2048 3 | DECAY_STEPS = 32000 4 | LEARNING_RATE = 2.0e-4 5 | 6 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/partition.gin: -------------------------------------------------------------------------------- 1 | NUM_PARTITIONS = 1 2 | TRAIN_STEPS = 100 3 | SAVING_PERIOD = 100 4 | 5 | partitioning.PjitPartitioner: 6 | num_partitions = %NUM_PARTITIONS -------------------------------------------------------------------------------- /configs/exp/ratio.gin: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | 3 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/scaling.gin: 
-------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | 3 | utils.SaveCheckpointConfig: 4 | keep = None 5 | period = %SAVING_PERIOD -------------------------------------------------------------------------------- /configs/size/110m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 640 3 | num_heads = 5 4 | num_encoder_layers = 10 5 | num_decoder_layers = 10 6 | head_dim = 128 7 | mlp_dim = 1920 -------------------------------------------------------------------------------- /configs/size/1_6b/deep.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1024 3 | num_heads = 8 4 | num_encoder_layers = 48 5 | num_decoder_layers = 48 6 | head_dim = 128 7 | mlp_dim = 3072 8 | -------------------------------------------------------------------------------- /configs/size/1_6b/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1536 3 | num_heads = 12 4 | num_encoder_layers = 24 5 | num_decoder_layers = 24 6 | head_dim = 128 7 | mlp_dim = 4096 8 | -------------------------------------------------------------------------------- /configs/size/1_6b/wide.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 2048 3 | num_heads = 16 4 | num_encoder_layers = 12 5 | num_decoder_layers = 12 6 | head_dim = 128 7 | mlp_dim = 6144 8 | -------------------------------------------------------------------------------- /configs/size/200m/deep.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 512 3 | num_heads = 4 4 | num_encoder_layers = 24 5 | num_decoder_layers = 24 6 | head_dim = 128 7 | mlp_dim = 1536 -------------------------------------------------------------------------------- /configs/size/200m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 768 3 | num_heads = 6 4 | num_encoder_layers = 12 5 | num_decoder_layers = 12 6 | head_dim = 128 7 | mlp_dim = 2048 -------------------------------------------------------------------------------- /configs/size/200m/vanilla_gpt.gin: -------------------------------------------------------------------------------- 1 | network.TransformerConfig: 2 | emb_dim = 768 3 | num_heads = 6 4 | num_layers = 12 5 | head_dim = 128 6 | mlp_dim = 2048 7 | -------------------------------------------------------------------------------- /configs/size/200m/wide.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1024 3 | num_heads = 8 4 | num_encoder_layers = 6 5 | num_decoder_layers = 6 6 | head_dim = 128 7 | mlp_dim = 3072 -------------------------------------------------------------------------------- /configs/size/25m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 384 3 | num_heads = 3 4 | num_encoder_layers = 6 5 | num_decoder_layers = 6 6 | head_dim = 128 7 | mlp_dim = 1152 -------------------------------------------------------------------------------- /configs/size/3_1b/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1920 3 | num_heads = 15 4 | 
num_encoder_layers = 20 5 | num_decoder_layers = 20 6 | head_dim = 128 7 | mlp_dim = 5760 -------------------------------------------------------------------------------- /configs/size/470m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1024 3 | num_heads = 8 4 | num_encoder_layers = 16 5 | num_decoder_layers = 16 6 | head_dim = 128 7 | mlp_dim = 3072 -------------------------------------------------------------------------------- /configs/size/60m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 512 3 | num_heads = 4 4 | num_encoder_layers = 8 5 | num_decoder_layers = 8 6 | head_dim = 128 7 | mlp_dim = 1536 -------------------------------------------------------------------------------- /configs/size/920m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1280 3 | num_heads = 10 4 | num_encoder_layers = 20 5 | num_decoder_layers = 20 6 | head_dim = 128 7 | mlp_dim = 3840 -------------------------------------------------------------------------------- /configs/task/eval/t0_eval.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import __main__ as train_script 4 | from t5x import utils 5 | 6 | import data.p3.tasks 7 | 8 | include 't5x/configs/runs/finetune.gin' 9 | 10 | MIXTURE_OR_TASK_NAME = "t0_eval_score_eval" 11 | TASK_FEATURE_LENGTHS = {'inputs': 1024, 'targets': 256} 12 | 13 | DROPOUT_RATE = 0.1 14 | BATCH_SIZE = 1024 15 | EVAL_STEPS = 100 16 | EVAL_PERIOD = %SAVING_PERIOD 17 | 18 | train_script.train: 19 | run_eval_before_training = True -------------------------------------------------------------------------------- /configs/task/finetune/t0_train.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import __main__ as train_script 4 | from t5x import utils 5 | 6 | import data.p3.tasks 7 | 8 | include 't5x/configs/runs/finetune.gin' 9 | 10 | MIXTURE_OR_TASK_NAME = "t0_train" 11 | TASK_FEATURE_LENGTHS = {'inputs': 1024, 'targets': 256} 12 | 13 | DROPOUT_RATE = 0.1 14 | BATCH_SIZE = 1024 15 | EVAL_STEPS = 100 16 | EVAL_PERIOD = %SAVING_PERIOD 17 | 18 | utils.SaveCheckpointConfig: 19 | period = %SAVING_PERIOD 20 | -------------------------------------------------------------------------------- /configs/task/pretrain/c4_mlm.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import __main__ as train_script 4 | from t5x import partitioning 5 | from t5x import utils 6 | from t5x import trainer 7 | 8 | import data.c4.tasks 9 | 10 | include 't5x/configs/runs/pretrain.gin' 11 | 12 | MIXTURE_OR_TASK_NAME = "c4_eye_span_corruption" 13 | TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 114} -------------------------------------------------------------------------------- /convert_weights/configs/base_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/hugging_face/t5/t5-v1_1-base", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 2048, 7 | "d_kv": 64, 8 | "d_model": 768, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": 
"gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 12, 18 | "num_heads": 12, 19 | "num_layers": 12, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } -------------------------------------------------------------------------------- /convert_weights/configs/base_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/base_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/base_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/base_v2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "UMT5ForConditionalGeneration" 4 | ], 5 | "d_ff": 2048, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "gelu_new", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 2, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "is_gated_act": true, 16 | "layer_norm_epsilon": 1e-06, 17 | "model_type": "umt5", 18 | "num_decoder_layers": 12, 19 | "num_heads": 12, 20 | "num_layers": 12, 21 | "output_past": true, 22 | "pad_token_id": 0, 23 | "relative_attention_max_distance": 128, 24 | "relative_attention_num_buckets": 32, 25 | "scalable_attention": true, 26 | "tie_word_embeddings": false, 27 | "tokenizer_class": "LlamaTokenizerFast", 28 | "torch_dtype": "bfloat16", 29 | "transformers_version": "4.31.0", 30 | "use_cache": true, 31 | "vocab_size": 32128 32 | } 33 | -------------------------------------------------------------------------------- /convert_weights/configs/base_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/base_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/base_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/configs/large_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/hugging_face/t5/t5-v1_1-large", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 2816, 7 | "d_kv": 64, 8 | "d_model": 1024, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | 
"layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 24, 18 | "num_heads": 16, 19 | "num_layers": 24, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } -------------------------------------------------------------------------------- /convert_weights/configs/large_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/large_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/large_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/large_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/large_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/large_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/configs/xl_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/t5/t5-v1_1-xl", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 5120, 7 | "d_kv": 64, 8 | "d_model": 2048, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 24, 18 | "num_heads": 32, 19 | "num_layers": 24, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } -------------------------------------------------------------------------------- /convert_weights/configs/xl_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/xl_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xl_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/xl_v2/config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "UMT5ForConditionalGeneration" 4 | ], 5 | "d_ff": 5120, 6 | "d_kv": 64, 7 | "d_model": 2048, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "gelu_new", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 2, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "is_gated_act": true, 16 | "layer_norm_epsilon": 1e-06, 17 | "model_type": "umt5", 18 | "num_decoder_layers": 24, 19 | "num_heads": 32, 20 | "num_layers": 24, 21 | "output_past": true, 22 | "pad_token_id": 0, 23 | "relative_attention_max_distance": 128, 24 | "relative_attention_num_buckets": 32, 25 | "scalable_attention": true, 26 | "tie_word_embeddings": false, 27 | "tokenizer_class": "LlamaTokenizerFast", 28 | "torch_dtype": "bfloat16", 29 | "transformers_version": "4.31.0", 30 | "use_cache": true, 31 | "vocab_size": 32128 32 | } 33 | -------------------------------------------------------------------------------- /convert_weights/configs/xl_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/xl_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xl_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/t5/t5-v1_1-xxl", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 10240, 7 | "d_kv": 64, 8 | "d_model": 4096, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 24, 18 | "num_heads": 64, 19 | "num_layers": 24, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } 26 | -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xxl_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xxl_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/scripts/convert_v1.sh: -------------------------------------------------------------------------------- 1 | mkdir -p $3 2 | 3 | python convert_t5v1_checkpoint_to_pytorch.py \ 4 | --config_file configs/${1}_v1/config.json \ 5 | --t5x_checkpoint_path $2 \ 6 | --pytorch_dump_path $3 7 | 8 | cp configs/${1}_v1/* $3 -------------------------------------------------------------------------------- /convert_weights/scripts/convert_v2.sh: -------------------------------------------------------------------------------- 1 | mkdir -p $3 2 | 3 | python convert_t5v2_checkpoint_to_pytorch.py \ 4 | --config_file configs/${1}_v2/config.json \ 5 | --t5x_checkpoint_path $2 \ 6 | --pytorch_dump_path $3 \ 7 | --scalable_attention 8 | 9 | cp configs/${1}_v2/* $3 -------------------------------------------------------------------------------- /convert_weights/upload-codexglue.sh: -------------------------------------------------------------------------------- 1 | SIZE=$1 2 | LANG=$2 3 | HF_MODEL_PATH=$3 4 | HF_PATH=$4 5 | T5X_PATH=$5 6 | 7 | mkdir -p "${HF_PATH}" 8 | git lfs install 9 | git clone "https://huggingface.co/${HF_MODEL_PATH}" "${HF_PATH}" 10 | git -C "${HF_PATH}" remote set-url origin "https://${HF_USERNAME}:${HF_KEY}@huggingface.co/${HF_MODEL_PATH}" 11 | huggingface-cli lfs-enable-largefiles "${HF_PATH}" 12 | 13 | # Switch branch 14 | git -C "${HF_PATH}" checkout -b "$LANG" 15 | git -C "${HF_PATH}" config http.postBuffer 524288000 16 | 17 | bash scripts/convert_v2.sh ${SIZE} $T5X_PATH $HF_PATH 18 | 19 | git -C "${HF_PATH}" add . 20 | git -C "${HF_PATH}" commit -am "add files for finetuning on $LANG" 21 | git -C "${HF_PATH}" push origin "$LANG" 22 | git -C "${HF_PATH}" checkout main 23 | -------------------------------------------------------------------------------- /convert_weights/upload.sh: -------------------------------------------------------------------------------- 1 | SIZE=$1 2 | HF_MODEL_PATH=$2 3 | HF_PATH=$3 4 | T5X_PATH=$4 5 | 6 | mkdir -p "${HF_PATH}" 7 | git lfs install 8 | git clone "https://huggingface.co/${HF_MODEL_PATH}" "${HF_PATH}" 9 | git -C "${HF_PATH}" remote set-url origin "https://${HF_USERNAME}:${HF_KEY}@huggingface.co/${HF_MODEL_PATH}" 10 | huggingface-cli lfs-enable-largefiles "${HF_PATH}" 11 | 12 | # in main branch 13 | git -C "${HF_PATH}" checkout main 14 | git -C "${HF_PATH}" config http.postBuffer 524288000 15 | 16 | bash scripts/convert_v2.sh ${SIZE} $T5X_PATH $HF_PATH 17 | 18 | git -C "${HF_PATH}" add . 
19 | git -C "${HF_PATH}" commit -am "add files" 20 | git -C "${HF_PATH}" push origin main 21 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 2 | 3 | from data.utils import * 4 | from data.vocab import * 5 | from data.metrics import * 6 | from data.preprocessors import * 7 | -------------------------------------------------------------------------------- /data/bigbenchlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/bigbenchlite/__init__.py -------------------------------------------------------------------------------- /data/bigbenchlite/tasks.py: -------------------------------------------------------------------------------- 1 | import seqio 2 | from bigbench.bbseqio import task_api 3 | from bigbench.bbseqio import tasks 4 | 5 | from t5x.data.vocab import DEFAULT_OUTPUT_FEATURES, get_default_vocabulary 6 | 7 | default_vocab = task_api.SeqIOVocabulary( 8 | name="default", 9 | description="default vocab", 10 | vocabulary=get_default_vocabulary()) 11 | 12 | # Register BIG-bench lite tasks. 13 | # bigbench:bigbench_lite_v1.mix.default_vocab.0_shot.all_examples 14 | num_shots = 0 15 | tasks.register_bigbench_lite(num_shots, default_vocab) -------------------------------------------------------------------------------- /data/c4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/c4/__init__.py -------------------------------------------------------------------------------- /data/c4/c4_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 2 | import os 3 | 4 | def get_c4_files(path): 5 | """name of the c4 files""" 6 | 7 | num_files_c4=1024 8 | file_list = [os.path.join(path, f"c4-train.{i:05}-of-01024.json") for i in range(num_files_c4)] 9 | 10 | return { 11 | "train": file_list[:-1], 12 | "validation": file_list[-2:-1], 13 | "test": file_list[-1:], 14 | } 15 | -------------------------------------------------------------------------------- /data/codexglue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/codexglue/__init__.py -------------------------------------------------------------------------------- /data/flan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/flan/__init__.py -------------------------------------------------------------------------------- /data/metrics.py: -------------------------------------------------------------------------------- 1 | import seqio 2 | import numpy as np 3 | 4 | from typing import Sequence 5 | 6 | def perplexity(targets: Sequence[str], scores: Sequence[int]): 7 | 8 | cross_entropy = -np.mean(scores)/len(targets) 9 | perplexity = np.exp(cross_entropy) 10 | 11 | return { 12 | "perplexity": seqio.metrics.Scalar(perplexity) 13 | } 14 | -------------------------------------------------------------------------------- /data/p3/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/p3/__init__.py -------------------------------------------------------------------------------- /data/pile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/pile/__init__.py -------------------------------------------------------------------------------- /data/pile/pile_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 2 | import os 3 | 4 | def get_pile_files(path): 5 |     """name of the pile files""" 6 | 7 |     file_list = [os.path.join(path, f"{i:02}.txt") for i in range(20)] 8 |     return { 9 |         "train": file_list[:-1], 10 |         "validation": file_list[-1:], 11 |         "test": file_list[-1:], 12 |     } 13 | 14 | def get_minipile_files(path, num_files): 15 |     """name of the minipile files""" 16 | 17 |     file_list = [os.path.join(path, f"shuffled_00_x0{i:02}.txt") for i in range(num_files)] 18 |     return { 19 |         "train": file_list[:-1], 20 |         "validation": file_list[-1:], 21 |         "test": file_list[-1:], 22 |     } 23 | -------------------------------------------------------------------------------- /data/sglue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/sglue/__init__.py -------------------------------------------------------------------------------- /evals/eval-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | echo "Evaluating ${MODEL}" 8 | echo "BBH" 9 | bash eval-bbh.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 10 | echo "MMLU" 11 | bash eval-mmlu.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 12 | echo "Held In" 13 | bash eval-held_in.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 14 | echo "CoT" 15 | bash eval-cot.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 16 | -------------------------------------------------------------------------------- /evals/eval-bbh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | for TASK in bbh_zeroshot bbh_fewshot 8 | do 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 8 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | done 17 | -------------------------------------------------------------------------------- /evals/eval-codexglue.sh: -------------------------------------------------------------------------------- 1 | MODEL_PATH=$1 2 | MODEL=$2 3 | LM_EVAL=$3 4 | EXTRA=$4 5 | 6 | for LANG in go java php python ruby javascript; do 7 | ${LM_EVAL} \ 8 |     --model hf \ 9 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA},revision=${LANG}" \ 10 |     --tasks "code2text_${LANG}" \ 11 |     --batch_size 4 \ 12 |     --output "output/codexglue_code2text/${MODEL}/${LANG}/" \ 13 |     --log_samples 14 | done 15 | 
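All of the eval-*.sh scripts above take the same four positional arguments (MODEL_PATH, MODEL, LM_EVAL, EXTRA) and splice them into lm-eval's model arguments as "pretrained=${MODEL_PATH}${MODEL}${EXTRA}". A usage sketch, run from the evals/ directory since eval-all.sh calls its sibling scripts by relative path; the org name, model name, and dtype suffix below are hypothetical values for illustration, not taken from this repo:

#!/bin/bash
# Hypothetical invocation: evaluate "EleutherAI/pile-t5-base" with bf16 weights
# on the BBH, MMLU, held-in, and CoT suites via eval-all.sh.
cd evals
bash eval-all.sh "EleutherAI/" "pile-t5-base" "lm_eval" ",dtype=bfloat16"
# Inside each script this expands to, for example:
#   lm_eval --model hf \
#     --model_args "pretrained=EleutherAI/pile-t5-base,dtype=bfloat16" \
#     --tasks bbh_zeroshot --batch_size 8 \
#     --output "output/bbh_zeroshot/pile-t5-base" --log_samples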
-------------------------------------------------------------------------------- /evals/eval-cot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | for TASK in bbh_cot_zeroshot bbh_cot_fewshot mmlu_flan_cot_zeroshot mmlu_flan_cot_fewshot 8 | do 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 4 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | done 17 | -------------------------------------------------------------------------------- /evals/eval-held_in.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | TASK=flan_held_in 8 | 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 8 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | -------------------------------------------------------------------------------- /evals/eval-mmlu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | for TASK in mmlu_generative mmlu_flan_n_shot_generative 8 | do 9 | for NUM in 0 5 10 | do 11 | ${LM_EVAL} \ 12 |     --model hf \ 13 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 14 |     --tasks ${TASK} \ 15 |     --batch_size 8 \ 16 |     --output "output/${TASK}/${MODEL}-${NUM}-shot" \ 17 |     --num_fewshot $NUM \ 18 |     --log_samples 19 | done 20 | done 21 | 22 | for TASK in mmlu mmlu_flan_n_shot_loglikelihood 23 | do 24 | for NUM in 0 5 25 | do 26 | ${LM_EVAL} \ 27 |     --model hf \ 28 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 29 |     --tasks ${TASK} \ 30 |     --batch_size 8 \ 31 |     --output "output/${TASK}/${MODEL}-${NUM}-shot" \ 32 |     --num_fewshot $NUM \ 33 |     --log_samples 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /evals/eval-sglue.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | export TASK="super-glue-t5-prompt" 8 | 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA},truncation=True,max_length=512" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 8 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/sglue_base.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 |     --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 4 |     --gin_file="configs/task/finetune/sglue.gin" \ 5 |     --gin.TRAIN_STEPS=1_128_000 \ 6 |     --gin.SAVING_PERIOD=4000 \ 7 |     --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/sglue_finetune\" \ 8 |     --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \ 9 |     --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 |     --alsologtostderr -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/sglue_large.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 |     --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 4 |     --gin_file="configs/task/finetune/sglue.gin" \ 5 |     --gin.TRAIN_STEPS=1_128_000 \ 6 |     --gin.SAVING_PERIOD=4000 \ 7 |
--gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/sglue_finetune\" \ 8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \ 9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 | --alsologtostderr -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/t0-train_base.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 4 | --gin_file="configs/task/finetune/t0_train.gin" \ 5 | --gin.TRAIN_STEPS=1_128_000 \ 6 | --gin.SAVING_PERIOD=4000 \ 7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/t0-train_finetune\" \ 8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \ 9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 | --alsologtostderr -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/t0-train_large.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 4 | --gin_file="configs/task/finetune/t0_train.gin" \ 5 | --gin.TRAIN_STEPS=1_128_000 \ 6 | --gin.SAVING_PERIOD=4000 \ 7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/t0-train_finetune\" \ 8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \ 9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 | --alsologtostderr 11 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/base_flan.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/finetune/flan_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_184_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_base/flan_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_base/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/base_flan2021.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_184_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_base/flan2021submix_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_base/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/large_flan.sh: 
-------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 6 | --gin_file="configs/task/finetune/flan_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_164_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_large/flan_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_large/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/large_flan2021.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_164_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_large/flan2021_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_large/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/xl_flan.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \ 6 | --gin_file="configs/task/finetune/flan_t5.gin" \ 7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \ 8 | --gin.TRAIN_STEPS=1_138_000 \ 9 | --gin.SAVING_PERIOD=10_000 \ 10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xl/flan_finetune\" \ 11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xl/checkpoint_1100000\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr 14 | # --multiprocess_gpu \ 15 | # --coordinator_address=${ADDR} \ 16 | # --process_count=${SLURM_NTASKS} \ 17 | # --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/xl_flan2021.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \ 6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \ 7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \ 8 | --gin.TRAIN_STEPS=1_138_000 \ 9 | --gin.SAVING_PERIOD=10_000 \ 10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xl/flan2021_finetune\" \ 11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xl/checkpoint_1100000\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr 14 | # --multiprocess_gpu \ 15 | # --coordinator_address=${ADDR} \ 16 | # --process_count=${SLURM_NTASKS} \ 17 | # --process_index=${SLURM_PROCID} 18 | 
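Each finetune script in this directory accepts ADDR as its first argument but leaves the multi-process flags at the bottom commented out, so by default it runs as a single process. A minimal sketch of how that commented block could be re-enabled for a multi-node SLURM run; the scontrol-based host resolution and port 2222 are assumptions for illustration, not taken from this repo:

#!/bin/bash
# Hypothetical multi-node variant of xl_flan2021.sh: derive a coordinator
# address from the first host in the SLURM allocation (port 2222 is an
# arbitrary assumed choice), then launch one t5x.train process per srun task.
ADDR="$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1):2222"

srun python -m t5x.train \
    --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
    --gin_file="configs/task/finetune/flan2021_t5.gin" \
    --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
    --gin.TRAIN_STEPS=1_138_000 \
    --gin.SAVING_PERIOD=10_000 \
    --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xl/flan2021_finetune\" \
    --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xl/checkpoint_1100000\" \
    --gin.USE_CACHED_TASKS=False \
    --alsologtostderr \
    --multiprocess_gpu \
    --coordinator_address=${ADDR} \
    --process_count=${SLURM_NTASKS} \
    --process_index=${SLURM_PROCID}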
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1-lm_adapt/xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_114_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xxl/flan_finetune\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xxl/checkpoint_1100000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1-lm_adapt/xxl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_114_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xxl/flan2021_finetune\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xxl/checkpoint_1100000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.TRAIN_STEPS=1_084_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/finetune_flan2021\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.TRAIN_STEPS=1_084_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/finetune_flan2022\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.TRAIN_STEPS=1_262_144 \
6 | --gin.SAVING_PERIOD=5000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/sglue_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
9 | --gin.USE_CACHED_TASKS=False \
10 | --alsologtostderr
11 |
12 | # --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
13 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_t0-train.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
4 | --gin_file="configs/task/finetune/t0_train.gin" \
5 | --gin.TRAIN_STEPS=1_128_000 \
6 | --gin.SAVING_PERIOD=4000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/t0-train_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
10 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.TRAIN_STEPS=1_064_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/finetune_flan2021\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.TRAIN_STEPS=1_064_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/finetune_flan2022\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.TRAIN_STEPS=1_262_144 \
6 | --gin.SAVING_PERIOD=5000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/sglue_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
9 | --gin.USE_CACHED_TASKS=False \
10 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_t0-train.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
4 | --gin_file="configs/task/finetune/t0_train.gin" \
5 | --gin.TRAIN_STEPS=1_128_000 \
6 | --gin.SAVING_PERIOD=4000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/t0-train_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
10 | --alsologtostderr
11 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_038_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xl/finetune_flan2021\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xl_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_038_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xl/finetune_flan2022\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xl_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.TRAIN_STEPS=1_262_144 \
6 | --gin.SAVING_PERIOD=5000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xl/sglue_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
9 | --gin.USE_CACHED_TASKS=False \
10 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xxl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=2 \
8 | --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
9 | --gin.TRAIN_STEPS=1_014_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xxl/finetune_flan2021\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xxl/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xxl_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.TRAIN_STEPS=1_014_000 \
8 | --gin.SAVING_PERIOD=2000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xxl/finetune_flan2022\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xxl/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 | # --gin.trainer.Trainer.num_microbatches=32 \
18 | # --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=1 \
19 | # --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xxl_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=1 \
6 | --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
7 | --gin.TRAIN_STEPS=1_262_144 \
8 | --gin.SAVING_PERIOD=5000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xxl/sglue_finetune\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xxl/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/improved_t5/ablations/v1-1_xl_flan2021_submix.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
8 | --gin.MIXTURE_OR_TASK_NAME=\"flan2021_submix_original_t5\" \
9 | --gin.TRAIN_STEPS=1_038_000 \
10 | --gin.SAVING_PERIOD=2_000 \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/ablations/v1_1_xl_flan2021_submix\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/eval_bf16.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/pile_mlm.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.TRAIN_STEPS=1000000 \
10 | --gin.SAVING_PERIOD=10000 \
11 | --gin.network.T5Config.dtype=\"bfloat16\" \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/pile_bf16\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/improved_t5/eval_fp16.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/pile_mlm.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.TRAIN_STEPS=1000000 \
10 | --gin.SAVING_PERIOD=10000 \
11 | --gin.network.T5Config.dtype=\"float16\" \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/pile_fp16\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/finetune_base_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_184_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/finetune_large_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_164_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/finetune_xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_114_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
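
In these scripts TRAIN_STEPS is the absolute step at which t5x stops, not a number of additional steps: the lm_adapt runs above resume from checkpoint_1000000 and stop at 1_100_000, i.e. a 100k-step adaptation budget. A sketch of the same arithmetic, in the style the preliminary scripts later in this repo use (START_STEP and ADAPT_STEPS are illustrative names, not variables from these files):

    START_STEP=1000000    # step of the pretrained checkpoint being resumed
    ADAPT_STEPS=100000    # prefix-LM adaptation budget
    let "TRAIN_STEPS = START_STEP + ADAPT_STEPS"    # 1100000, the value hard-coded above
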
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000/lm_adapt\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/finetune_base_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_184_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_sc_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_sc/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/finetune_large_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_164_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_sc_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_sc/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/finetune_xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_114_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_sc_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_sc/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_sc\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_sc\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000/lm_adapt_sc\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_sc\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
9 | --gin.TRAIN_STEPS=1_008_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/ul2r\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_base_ns.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
9 | --gin.TRAIN_STEPS=1_008_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm_ns/checkpoint_1000000/ul2r\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm_ns/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
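
The escaped-quote patterns on the --gin overrides above all collapse to the same shape after shell processing: \" becomes a literal double quote, while unescaped '...' or "..." pairs are consumed by the shell, so gin always receives a double-quoted string literal. A runnable check (the paths are illustrative, not repo values):

    # Each line prints the identical token the training scripts pass to gin:
    echo --gin.MODEL_DIR=\"gs://bucket/run\"
    echo --gin.MODEL_DIR=\"'gs://bucket/run'\"
    PREFIX="gs://bucket/"; echo --gin.MODEL_DIR=\"${PREFIX}'run'\"
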
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
9 | --gin.TRAIN_STEPS=1_008_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/ul2r\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
10 | --gin.TRAIN_STEPS=1_008_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000/ul2r\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
10 | --gin.TRAIN_STEPS=1_008_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/ul2r\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/finetune_flan2022.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | STEP=$2
3 | INIT_DIR=$3
4 | MODEL_DIR=$4
5 |
6 | python -m t5x.train \
7 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin_file="configs/task/finetune/flan2022.gin" \
11 | --gin.train.use_orbax=False \
12 | --gin.TRAIN_STEPS=${STEP} \
13 | --gin.SAVING_PERIOD=2000 \
14 | --gin.INITIAL_CHECKPOINT_PATH=\"${INIT_DIR}\" \
15 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
16 | --gin.USE_CACHED_TASKS=False \
17 | --alsologtostderr
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --multiprocess_gpu \
20 | # --coordinator_address=${ADDR} \
21 | # --process_count=${SLURM_NTASKS} \
22 | # --process_index=${SLURM_PROCID}
23 |
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/finetune_sglue.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | START_STEP=$2
3 | INIT_DIR=$3
4 | MODEL_DIR=$4
5 |
6 | TRAIN_STEPS=$(( ${START_STEP} + 262144 ))
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
10 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
11 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
12 | --gin_file="configs/task/finetune/sglue.gin" \
13 | --gin.train.use_orbax=False \
14 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
15 | --gin.SAVING_PERIOD=5000 \
16 | --gin.train.infer_eval_dataset_cfg=None \
17 | --gin.INITIAL_CHECKPOINT_PATH=\"${INIT_DIR}\" \
18 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
19 | --gin.USE_CACHED_TASKS=False \
20 | --alsologtostderr
21 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
22 | # --gin.Trainer.num_microbatches=2 \
23 |
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/finetune_t0.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | STEP=$2
3 | INIT_DIR=$3
4 | MODEL_DIR=$4
5 |
6 | python -m t5x.train \
7 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin_file="configs/task/finetune/t0_train.gin" \
11 | --gin.train.use_orbax=False \
12 | --gin.TRAIN_STEPS=${STEP} \
13 | --gin.SAVING_PERIOD=2000 \
14 | --gin.INITIAL_CHECKPOINT_PATH=\"${INIT_DIR}\" \
15 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
16 | --gin.USE_CACHED_TASKS=False \
17 | --alsologtostderr
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --multiprocess_gpu \
20 | # --coordinator_address=${ADDR} \
21 | # --process_count=${SLURM_NTASKS} \
22 | # --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/pretrain_mlm.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | TRAIN_STEPS=$2
3 | MODEL_DIR=$3
4 |
5 | python -m t5x.train \
6 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
7 | --gin_file="configs/task/pretrain/pile_mlm.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.train.use_orbax=False \
11 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 | # --gin.Trainer.num_microbatches=2 \
17 | # --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=2 \
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
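
pretrain_mlm.sh takes the model size, the absolute stopping step, and the output directory as positional arguments, and reads the tokenizer from ${GCP_BUCKET}/vocabs/tokenizer.model. A hypothetical invocation (the bucket name is an assumption, not a value from this repo):

    export GCP_BUCKET=gs://my-bucket
    bash experiments/improved_t5/mlm/pretrain_mlm.sh large 2000000 gs://my-bucket/ckpts/v2_large_mlm
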
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/pretrain_mlm_causal.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | TRAIN_STEPS=$2
3 | MODEL_DIR=$3
4 |
5 | python -m t5x.train \
6 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
7 | --gin_file="configs/task/pretrain/pile_mlm_causal.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.train.use_orbax=False \
11 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 | # --gin.Trainer.num_microbatches=2 \
17 | # --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=2 \
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
20 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_base_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_084_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000/flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_base_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_large_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_064_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000/flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_large_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/flan2021.gin" \
10 | --gin.TRAIN_STEPS=1_038_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000/finetune_flan2021\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 | # --multiprocess_gpu \
17 | # --coordinator_address=${ADDR} \
18 | # --process_count=${SLURM_NTASKS} \
19 | # --process_index=${SLURM_PROCID}
20 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xl_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_038_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000/flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xxl_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000\" \
13 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
10 | --gin.TRAIN_STEPS=2000000 \
11 | --gin.SAVING_PERIOD=10000 \
12 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_base/'\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
10 | --gin.TRAIN_STEPS=2000000 \
11 | --gin.SAVING_PERIOD=10000 \
12 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_large/'\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
11 | --gin.TRAIN_STEPS=1000000 \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_xl/'\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
11 | --gin.TRAIN_STEPS=1000000 \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_xxl/'\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/preliminary/layernorm/eval/t0_eval_alibi_relpos.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | python -m t5x.train \
7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
8 | --gin_file="configs/task/eval/t0_eval.gin" \
9 | --gin.TRAIN_STEPS=135000 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_eval/'\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_train/checkpoint_135000'\" \
13 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
14 | --gin.USE_CACHED_TASKS=True \
15 | --alsologtostderr \
16 | --multiprocess_gpu \
17 | --coordinator_address=${ADDR} \
18 | --process_count=${SLURM_NTASKS} \
19 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/layernorm/eval/t0_eval_base_lm100k.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 | export PREFIX="/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/with_abs_pos/"
3 |
4 | ADDR=$1
5 | MODEL_DIR=$2
6 |
7 | python -m t5x.train \
8 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
--gin_file="configs/task/eval/t0_eval.gin" \ 10 | --gin_file="configs/exp/LayerNorm/reset_optim.gin" \ 11 | --gin.TRAIN_STEPS=1110000 \ 12 | --gin.SAVING_PERIOD=5000 \ 13 | --gin.MODEL_DIR=\"${PREFIX}'base_lm100k/finetune_t0_eval/'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"${PREFIX}'base_lm100k/finetune_t0_train/checkpoint_1110000/'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --gin.USE_CACHED_TASKS=True \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} 22 | 23 | -------------------------------------------------------------------------------- /experiments/preliminary/layernorm/pretrain/pile_mlm_adafactor_post.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin_file="configs/exp/LayerNorm/post_layernorm.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/pile_mlm_adafactor_post/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/layernorm/pretrain/pile_mlm_adamw_post.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 5 | export PREFIX="/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/with_abs_pos/" 6 | 7 | python -m t5x.train \ 8 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 10 | --gin_file="configs/exp/LayerNorm/training.gin" \ 11 | --gin_file="configs/exp/LayerNorm/post_layernorm.gin" \ 12 | --gin.TRAIN_STEPS=125000 \ 13 | --gin.SAVING_PERIOD=25000 \ 14 | --gin.MODEL_DIR=\"${PREFIX}'pile_mlm_adamw_post/'\" \ 15 | --gin.USE_CACHED_TASKS=False \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/layernorm/pretrain/pile_mlm_adamw_pre.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 5 | export PREFIX="/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/with_abs_pos/" 6 | 7 | python -m t5x.train \ 8 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 10 | --gin_file="configs/exp/LayerNorm/training.gin" \ 11 | --gin_file="configs/exp/LayerNorm/pre_layernorm.gin" \ 12 | --gin.TRAIN_STEPS=125000 \ 13 | --gin.SAVING_PERIOD=25000 \ 14 | --gin.MODEL_DIR=\"${PREFIX}'pile_mlm_adamw_pre/'\" \ 15 | --gin.USE_CACHED_TASKS=False \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- 
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_10/sglue_finetune_920m_32000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 32000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/finetune_32k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/checkpoint_32000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_10/sglue_finetune_920m_64000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 64000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/finetune_64k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/checkpoint_64000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_10/sglue_finetune_920m_96000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 96000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/finetune_96k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/checkpoint_96000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_15/sglue_finetune_920m_32000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 32000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/finetune_32k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/checkpoint_32000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_15/sglue_finetune_920m_64000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 64000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/finetune_64k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/checkpoint_64000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_15/sglue_finetune_920m_96000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 96000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/finetune_96k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/checkpoint_96000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_128000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 128000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_128k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_128000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_160000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 160000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_160k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_160000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_192000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 192000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_192k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_192000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_224000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 224000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_224k/'\" \
15 |
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_224000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_256000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 256000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_256k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_256000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | 
--coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_128000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 128000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_128k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_128000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_160000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 160000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_160k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_160000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- 
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_192000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 192000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_192k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_192000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_224000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 224000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_224k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_224000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_256000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 256000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_256k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_256000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 
2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_128000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 128000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | 
--gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_128k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_128000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_160000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 160000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_160k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_160000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_192000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 192000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_192k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_192000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_224000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 224000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_224k/'\" \ 15 | 
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_224000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | 
--coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_75/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_75/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_75/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- 
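The sglue_finetune scripts above are identical up to two values: the UL2/causal mixture tag of the pretraining run (0_10 through 0_75) and the pretraining checkpoint step they resume from, with every run finetuning for a further 128,000 steps on SuperGLUE. Below is a minimal sketch of one parameterized launcher that could stand in for the whole family; it is editorial, not a file in this repository, and RATIO, STEP, and ADDR are illustrative parameter names that assume the directory layout and `sglue_*k` naming used by the 0_25 and later scripts.

#!/bin/bash
# Sketch only: collapses the per-ratio/per-step sglue_finetune_920m_*.sh
# scripts above into one launcher. RATIO (e.g. 0_25) and STEP (e.g. 32000)
# are illustrative parameters, not part of the original repository.
RATIO=$1
STEP=$2
ADDR=$3

export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
CKPT_ROOT="/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_${RATIO}"

# Finetune for 128k steps on top of the selected pretraining checkpoint.
let "TRAIN_STEPS = STEP + 128000"

python -m t5x.train \
  --gin_file="models/scalable_t5/t5_1_1/base.gin" \
  --gin_file="configs/size/920m/vanilla.gin" \
  --gin_file="configs/task/finetune/sglue.gin" \
  --gin.TRAIN_STEPS=${TRAIN_STEPS} \
  --gin.SAVING_PERIOD=2_000 \
  --gin.MODEL_DIR=\""${CKPT_ROOT}/sglue_$((STEP / 1000))k/"\" \
  --gin.INITIAL_CHECKPOINT_PATH=\""${CKPT_ROOT}/checkpoint_${STEP}/"\" \
  --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
  --alsologtostderr \
  --multiprocess_gpu \
  --coordinator_address=${ADDR} \
  --process_count=${SLURM_NTASKS} \
  --process_index=${SLURM_PROCID}

Note the quoting: because the paths now contain shell variables, the escaped double quote sits outside a double-quoted expansion (\""…"\") rather than outside single quotes as in the original scripts, so gin still receives MODEL_DIR="…" while the variables expand. Invocation would look like, e.g., running this script with arguments 0_25 32000 and the coordinator address, under the same srun setup as the originals.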
/experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_10'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_10/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_15'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_15/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_25'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_25/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_50'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_50/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 |
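Each of these scripts takes the coordinator address as its first positional argument and reads SLURM_NTASKS and SLURM_PROCID from the environment, so they are meant to be launched once per process by srun inside a Slurm allocation. A hypothetical submission wrapper is sketched below; the node count, GPU count, and port are assumptions, since the real sbatch files are not part of this repository.

#!/bin/bash
# Sketch only: illustrates how the training scripts in this directory are
# driven under Slurm. All #SBATCH values and the port are assumptions; the
# repository does not include the real submission files.
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8

# Use the first node of the allocation as the JAX distributed coordinator.
COORD_HOST=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
COORD_ADDR="${COORD_HOST}:29500"

# srun launches one copy of the script per task; each copy picks up its own
# SLURM_PROCID and passes the shared coordinator address as $1.
srun bash experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_10.sh "${COORD_ADDR}"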
-------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_75.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_75'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_75/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_1_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_1_00'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_1_00/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_10'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_15'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 |
--alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_25'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_50'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_60.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_60'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_75.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | 
--gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_75'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_0-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_0-10.png -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_flop_256k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_flop_256k.png -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_1.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_1/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=1 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_1/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_2.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_2/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=2 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_2/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_4.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_4/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=4 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_4/'\" \ 12 | 
--gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_8.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_8/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=8 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_8/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 1024}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 114}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 2048}" \ 10 |
--gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 256}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 512}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 1024}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 |
--gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 114}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 2048}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 256}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 512}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 |
--gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 1024}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 114}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 2048}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 256}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 |
--gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 512}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 1024}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 114}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 2048}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_256.sh:
-------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 256}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 512}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 1024}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 114}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 |
--coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 2048}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 256}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 512}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 |
--gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 1024}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_114.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.BATCH_SIZE=2048 \ 9 | --gin.TRAIN_STEPS=125000 \ 10 | --gin.SAVING_PERIOD=25000 \ 11 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 114}" \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_dot_relpos/'\" \ 13 | --gin.USE_CACHED_TASKS=False \ 14 | --alsologtostderr \ 15 | --multiprocess_gpu \ 16 | --coordinator_address=${ADDR} \ 17 | --process_count=${SLURM_NTASKS} \ 18 | --process_index=${SLURM_PROCID} 19 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 2048}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_256.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.BATCH_SIZE=2048 \ 9 | --gin.TRAIN_STEPS=125000 \ 10 | --gin.SAVING_PERIOD=25000 \ 11 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 256}" \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_dot_relpos/'\" \ 13 | --gin.USE_CACHED_TASKS=False \ 14 | --alsologtostderr \ 15 | --multiprocess_gpu \ 16 | --coordinator_address=${ADDR} \ 17 | --process_count=${SLURM_NTASKS} \ 18 | --process_index=${SLURM_PROCID} 19 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_512.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m 
t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.BATCH_SIZE=2048 \ 9 | --gin.TRAIN_STEPS=125000 \ 10 | --gin.SAVING_PERIOD=25000 \ 11 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 512}" \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_dot_relpos/'\" \ 13 | --gin.USE_CACHED_TASKS=False \ 14 | --alsologtostderr \ 15 | --multiprocess_gpu \ 16 | --coordinator_address=${ADDR} \ 17 | --process_count=${SLURM_NTASKS} \ 18 | --process_index=${SLURM_PROCID} 19 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/finetune_sglue_prefix_lm_no_alibi.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/finetune/sglue.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=2000 \ 11 | --gin.BATCH_SIZE=2048 \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/sglue_finetune_no_alibi/'\" \ 13 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/checkpoint_125000/'\" \ 14 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 15 | --alsologtostderr \ 16 | --multiprocess_gpu \ 17 | --coordinator_address=${ADDR} \ 18 | --process_count=${SLURM_NTASKS} \ 19 | --process_index=${SLURM_PROCID} 20 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/finetune_sglue_prefix_lm_with_alibi.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/finetune/sglue.gin" \ 9 | --gin_file="configs/exp/alibi.gin" \ 10 | --gin.TRAIN_STEPS=135000 \ 11 | --gin.SAVING_PERIOD=2000 \ 12 | --gin.BATCH_SIZE=2048 \ 13 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/sglue_finetune_with_alibi/'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/checkpoint_125000/'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_no_alibi.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | 
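The launch scripts in this experiment assume a SLURM allocation: SLURM_NTASKS and SLURM_PROCID are injected by srun, while ADDR (the JAX multiprocess coordinator address) is read from the first positional argument in the scripts that begin with ADDR=$1 and must already be exported by the caller in the eval_perplexity scripts above, which reference ${ADDR} without setting it. The wrapper below is a minimal sketch of one way to supply all three; the node count, partition name, and port are hypothetical placeholders, not values taken from this repository.

    #!/bin/bash
    #SBATCH --nodes=4                 # hypothetical node count
    #SBATCH --ntasks-per-node=1       # one t5x process per node
    #SBATCH --partition=gpu           # hypothetical partition name

    # Use the first node of the allocation as the JAX coordinator.
    MASTER_NODE=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1)
    export ADDR="${MASTER_NODE}:12345"   # port chosen arbitrarily

    # srun starts one task per node and sets SLURM_PROCID for each;
    # the training script reads SLURM_NTASKS and SLURM_PROCID directly.
    srun bash experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_no_alibi.sh "${ADDR}"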
-------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_with_alibi.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_with_alibi_plus_relpos.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi_plus_relpos.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=5000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_plus_relpos/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/pretrain_rotary_pile_mlm.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin_file="configs/exp/rotary.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/rotary_pile_mlm/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/eval/t0_eval_alibi_relpos.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/eval/t0_eval.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=5000 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_eval/'\" \ 12 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_train/checkpoint_135000'\" \ 13 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 14 | --gin.USE_CACHED_TASKS=True \ 15 | --alsologtostderr \ 16 | --multiprocess_gpu \ 17 | --coordinator_address=${ADDR} \ 18 | --process_count=${SLURM_NTASKS} \ 19 | --process_index=${SLURM_PROCID} 
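Note that these eval scripts read ADDR=$1 and MODEL_DIR=$2 but then hard-code --gin.MODEL_DIR and --gin.INITIAL_CHECKPOINT_PATH, so the second argument is effectively ignored. Below is a hedged sketch of the same launch with the paths taken from the command line instead; the t5x flags are the ones used throughout this repository, and only the variable plumbing (including a third argument for the checkpoint) is new.

    ADDR=$1        # JAX coordinator address, e.g. node001:12345
    MODEL_DIR=$2   # directory where eval results are written
    CKPT=$3        # fine-tuned checkpoint to restore

    python -m t5x.train \
        --gin_file="models/scalable_t5/t5_1_1/base.gin" \
        --gin_file="configs/task/eval/t0_eval.gin" \
        --gin.TRAIN_STEPS=135000 \
        --gin.SAVING_PERIOD=5000 \
        --gin.MODEL_DIR=\"'${MODEL_DIR}'\" \
        --gin.INITIAL_CHECKPOINT_PATH=\"'${CKPT}'\" \
        --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
        --gin.USE_CACHED_TASKS=True \
        --alsologtostderr \
        --multiprocess_gpu \
        --coordinator_address=${ADDR} \
        --process_count=${SLURM_NTASKS} \
        --process_index=${SLURM_PROCID}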
-------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/eval/t0_eval_benchmark.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/eval/t0_eval.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=5000 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/benchmark_pile_mlm/finetune_t0_eval/'\" \ 12 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/benchmark_pile_mlm/finetune_t0_train/checkpoint_135000'\" \ 13 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 14 | --gin.USE_CACHED_TASKS=True \ 15 | --alsologtostderr \ 16 | --multiprocess_gpu \ 17 | --coordinator_address=${ADDR} \ 18 | --process_count=${SLURM_NTASKS} \ 19 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/finetune/sglue/sglue_train_benchmark.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/finetune/sglue.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=2000 \ 11 | --gin.BATCH_SIZE=2048 \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/benchmark_pile_mlm/finetune_sglue/'\" \ 13 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/benchmark_pile_mlm/checkpoint_125000/'\" \ 14 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 15 | --gin.USE_CACHED_TASKS=True \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/pretrain_benchmark_pile_mlm.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/benchmark_pile_mlm/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/pretrain_rotary_relpos_pile_mlm.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin_file="configs/exp/rotary_relpos.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/rotary_relpos_pile_mlm/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | 
--alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_16000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=144_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_16k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_16000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_32000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=160_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_32k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_32000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_48000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=176_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_48k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_48000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_64000.sh: 
-------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_80000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=208_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_80k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_80000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_128000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=256_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_128k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_128000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_192000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | 
--gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=320_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_192k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_192000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_256000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=384_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_256k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_256000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_320000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=448_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_320k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_320000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_384000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=512_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_384k_finetune'\" \ 14 | 
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_384000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_424000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=552_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_424k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_424000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_448000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=576_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_448k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_448000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_512000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=640_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_512k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_512000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 
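A pattern worth noting in these scaling-law scripts: t5x counts TRAIN_STEPS cumulatively from the restored checkpoint, so every script sets TRAIN_STEPS to the pretraining checkpoint step plus 128,000 (e.g. checkpoint_512000 with TRAIN_STEPS=640_000), meaning each run fine-tunes on SuperGLUE for 128k additional steps. Since the scripts differ only in that step count, they could be generated from a single template; the loop below is a sketch of that idea (the generator itself is new, while the paths and flags mirror the scripts above).

    #!/bin/bash
    # Sketch: regenerate the per-checkpoint SuperGLUE fine-tuning scripts for
    # one model size. FINETUNE_STEPS is the constant offset observed in every
    # script in this directory; the checkpoint list is an example.
    SIZE=1_6b
    FINETUNE_STEPS=128000
    for CKPT in 64000 128000 192000 256000 320000 384000 448000 512000; do
      cat > "sglue_finetune_${SIZE}_${CKPT}.sh" <<EOF
    export T5X_DIR="/fsx/lintangsutawika/t5x/"
    export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs"
    export CACHED_DATA_DIR="/fsx/lintangsutawika/data"

    ADDR=\$1
    MODEL_DIR=\$2

    python -m t5x.train \\
      --gin_file="configs/finetune_sglue.gin" \\
      --gin_file="\${CONFIG_PATH}/size/${SIZE}/vanilla.gin" \\
      --gin_file="\${CONFIG_PATH}/mode/gpu.gin" \\
      --gin.TRAIN_STEPS=$((CKPT + FINETUNE_STEPS)) \\
      --gin.MODEL_DIR=\\"'/fsx/aran/jax/ckpts/scaling/${SIZE}/vanilla_$((CKPT / 1000))k_finetune'\\" \\
      --gin.INITIAL_CHECKPOINT_PATH=\\"'/fsx/aran/jax/ckpts/scaling/${SIZE}/checkpoint_${CKPT}'\\" \\
      --seqio_additional_cache_dirs="\${CACHED_DATA_DIR}" \\
      --alsologtostderr \\
      --multiprocess_gpu \\
      --coordinator_address=\${ADDR} \\
      --process_count=\${SLURM_NTASKS} \\
      --process_index=\${SLURM_PROCID}
    EOF
    done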
-------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_64000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_16000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=144_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_16k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_16000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_24000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=152_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_24k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_24000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_32000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 
3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=160_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_32k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_32000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_40000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=168_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_40k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_40000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_48000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=176_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_48k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_48000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_56000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=184_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_56k_finetune'\" \ 14 | 
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_56000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_64000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_8000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=136_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_8k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_8000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/pretrain_c4.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 6 | --gin_file="configs/size/920m/vanilla.gin" \ 7 | --gin_file="configs/exp/scaling.gin" \ 8 | --gin.TRAIN_STEPS=256000 \ 9 | --gin.SAVING_PERIOD=32000 \ 10 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m_c4/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/pretrain_pile.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 6 | 
--gin_file="configs/size/920m/vanilla.gin" \ 7 | --gin_file="configs/exp/scaling.gin" \ 8 | --gin.TRAIN_STEPS=256000 \ 9 | --gin.SAVING_PERIOD=32000 \ 10 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m_pile/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_128000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=256_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_128k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_128000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_160000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=288_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_160k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_160000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_192000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=320_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_192k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_192000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | 
--process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_224000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=352_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_224k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_224000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_256000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=384_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_256k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_256000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=160_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_32k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_32000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | 
export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=224_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_96k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_96000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/super_glue_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance.png -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/super_glue_performance_flop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance_flop.png -------------------------------------------------------------------------------- /models/decoder_t5/__init__.py: -------------------------------------------------------------------------------- 1 | # from typing import TYPE_CHECKING 2 | 3 | # _import_structure = {"configuration_t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config", "T5OnnxConfig"]} 4 | 5 | # _import_structure["modeling_t5"] = [ 6 | # # "T5_PRETRAINED_MODEL_ARCHIVE_LIST", 7 | # # "T5EncoderModel", 8 | # "DecoderT5ForConditionalGeneration", 9 | # # "T5Model", 10 | # # "T5PreTrainedModel", 11 | # # "load_tf_weights_in_t5", 12 | # ] -------------------------------------------------------------------------------- /models/scalable_t5/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The T5X Authors. 
--------------------------------------------------------------------------------
/experiments/preliminary/scaling_laws/super_glue_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance.png
--------------------------------------------------------------------------------
/experiments/preliminary/scaling_laws/super_glue_performance_flop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance_flop.png
--------------------------------------------------------------------------------
/models/decoder_t5/__init__.py:
--------------------------------------------------------------------------------
# from typing import TYPE_CHECKING

# _import_structure = {"configuration_t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config", "T5OnnxConfig"]}

# _import_structure["modeling_t5"] = [
#     # "T5_PRETRAINED_MODEL_ARCHIVE_LIST",
#     # "T5EncoderModel",
#     "DecoderT5ForConditionalGeneration",
#     # "T5Model",
#     # "T5PreTrainedModel",
#     # "load_tf_weights_in_t5",
# ]
--------------------------------------------------------------------------------
/models/scalable_t5/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/large.gin:
--------------------------------------------------------------------------------
# mT5 Large model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 1024
  num_heads = 16
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 2816
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/small.gin:
--------------------------------------------------------------------------------
# mT5 Small model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 512
  num_heads = 6
  num_encoder_layers = 8
  num_decoder_layers = 8
  head_dim = 64
  mlp_dim = 1024
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/xl.gin:
--------------------------------------------------------------------------------
# mT5 XL model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 2048
  num_heads = 32
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 5120
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/xxl.gin:
--------------------------------------------------------------------------------
# mT5 XXL model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 4096
  num_heads = 64
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 10240
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/examples/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/large.gin:
--------------------------------------------------------------------------------
# T5.1.1 Large model.

include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 1024
  num_heads = 16
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 2816
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/small.gin:
--------------------------------------------------------------------------------
# T5.1.1 Small model.

include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 512
  num_heads = 6
  num_encoder_layers = 8
  num_decoder_layers = 8
  head_dim = 64
  mlp_dim = 1024
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/xl.gin:
--------------------------------------------------------------------------------
# T5.1.1 XL model.

import __main__ as train_script

from t5x import partitioning
include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 2048
  num_heads = 32
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 5120
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/xxl.gin:
--------------------------------------------------------------------------------
# T5.1.1 XXL model.

include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 4096
  num_heads = 64
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 10240
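
Each size gin above follows the same recipe: include the family's base.gin, then override individual fields of network.T5Config. In the experiment scripts these files are stacked with task configs through repeated --gin_file flags, and single bindings can be overridden last on the command line; a hypothetical composition sketch (the task-gin path, output directory, and dropout override are illustrative, not from the repository):

# Sketch: layer a task config, a size config, and an ad-hoc gin binding.
# Later --gin_file flags and --gin. bindings take precedence over earlier ones.
python -m t5x.train \
  --gin_file="configs/finetune_sglue.gin" \
  --gin_file="models/scalable_t5/t5_1_1/large.gin" \
  --gin.network.T5Config.dropout_rate=0.1 \
  --gin.MODEL_DIR=\"/tmp/t5_1_1_large_sglue\" \
  --alsologtostderr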
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='improved_t5',
    packages=[
        'data',
    ]
)
--------------------------------------------------------------------------------
/tpu-scripts/kill.sh:
--------------------------------------------------------------------------------
gcloud compute tpus tpu-vm ssh $1 \
  --worker=all \
  --zone us-central2-b \
  --command "pkill -9 -f train.py; rm -f /tmp/libtpu_lockfile"
--------------------------------------------------------------------------------
/tpu-scripts/run.sh:
--------------------------------------------------------------------------------
#pkill train.py
#rm -f /tmp/libtpu_lockfile

# --worker=all
gcloud compute tpus tpu-vm ssh $1 \
  --worker=all \
  --zone us-central2-b \
  --command "$2"
--------------------------------------------------------------------------------
/tpu-scripts/send.sh:
--------------------------------------------------------------------------------
gcloud compute tpus tpu-vm scp $2 $1:$3 --worker=all --zone us-central2-b
--------------------------------------------------------------------------------
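
All three helpers above take the TPU VM name as their first argument; run.sh additionally takes a command string, and send.sh takes a local source and a remote destination. A typical session might look like this (the VM name and paths are placeholders):

# Copy a launcher to every worker, run it on all workers, then kill a stuck run.
bash tpu-scripts/send.sh my-tpu-v4 pretrain_large.sh /home/user/pretrain_large.sh
bash tpu-scripts/run.sh my-tpu-v4 "bash /home/user/pretrain_large.sh"
bash tpu-scripts/kill.sh my-tpu-v4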