├── .gitignore ├── README.md ├── cache_t0.sh ├── configs ├── exp │ ├── LayerNorm │ │ ├── post_layernorm.gin │ │ ├── pre_layernorm.gin │ │ ├── reset_optim.gin │ │ ├── training.gin │ │ └── training_pp.gin │ ├── PositionEmbedding │ │ ├── abs_pos.gin │ │ ├── alibi.gin │ │ ├── no_relpos.gin │ │ ├── relpos.gin │ │ └── rotary.gin │ ├── batch_size.gin │ ├── gptj.gin │ ├── memorization.gin │ ├── optim.gin │ ├── partition.gin │ ├── ratio.gin │ └── scaling.gin ├── size │ ├── 110m │ │ └── vanilla.gin │ ├── 1_6b │ │ ├── deep.gin │ │ ├── vanilla.gin │ │ └── wide.gin │ ├── 200m │ │ ├── deep.gin │ │ ├── vanilla.gin │ │ ├── vanilla_gpt.gin │ │ └── wide.gin │ ├── 25m │ │ └── vanilla.gin │ ├── 3_1b │ │ └── vanilla.gin │ ├── 470m │ │ └── vanilla.gin │ ├── 60m │ │ └── vanilla.gin │ └── 920m │ │ └── vanilla.gin ├── t5v2 │ ├── base.gin │ └── large.gin └── task │ ├── eval │ └── t0_eval.gin │ ├── finetune │ ├── codexglue │ │ ├── code_to_text_go.gin │ │ ├── code_to_text_java.gin │ │ ├── code_to_text_javascript.gin │ │ ├── code_to_text_php.gin │ │ ├── code_to_text_python.gin │ │ └── code_to_text_ruby.gin │ ├── extend_1024.gin │ ├── extend_2048-causal.gin │ ├── extend_2048-prefix.gin │ ├── extend_2048.gin │ ├── extend_4096.gin │ ├── extend_512.gin │ ├── extend_8192.gin │ ├── flan2021.gin │ ├── flan2021_t5.gin │ ├── flan2022.gin │ ├── flan2022_t5.gin │ ├── natural_sglue.gin │ ├── natural_sglue_t5.gin │ ├── pile_mlm.gin │ ├── pile_prefix_lm.gin │ ├── pile_prefix_lm_causal.gin │ ├── pile_ul2r.gin │ ├── sglue.gin │ ├── sglue_t5.gin │ └── t0_train.gin │ └── pretrain │ ├── c4_mixed_objective.gin │ ├── c4_mlm.gin │ ├── pile_causal.gin │ ├── pile_mixed_objective.gin │ ├── pile_mlm.gin │ ├── pile_mlm_causal.gin │ └── pile_prefix_lm.gin ├── convert_weights ├── configs │ ├── base_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ ├── base_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json │ ├── large_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ ├── large_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json │ ├── xl_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ ├── xl_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json │ ├── xxl_v1 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── spiece.model │ │ └── tokenizer_config.json │ └── xxl_v2 │ │ ├── config.json │ │ ├── generation_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.model │ │ └── tokenizer_config.json ├── convert-t5.txt ├── convert_t5v1_checkpoint_to_pytorch.py ├── convert_t5v2_checkpoint_to_pytorch.py ├── convert_t5x_checkpoint_to_flax.py ├── convert_t5x_checkpoint_to_pytorch.py ├── scripts │ ├── convert_v1.sh │ └── convert_v2.sh ├── upload-codexglue.sh ├── upload-multiple.sh ├── upload-t5x.sh └── upload.sh ├── data ├── __init__.py ├── bigbenchlite │ ├── __init__.py │ └── tasks.py ├── c4 │ ├── __init__.py │ ├── c4_utils.py │ └── tasks.py ├── codexglue │ ├── __init__.py │ └── tasks.py ├── flan │ ├── __init__.py │ ├── tasks.py │ └── tasks_alt.py ├── metrics.py ├── p3 │ ├── __init__.py │ └── 
tasks.py ├── pile │ ├── __init__.py │ ├── pile_utils.py │ └── tasks.py ├── preprocessors.py ├── preprocessors_test.py ├── sglue │ ├── __init__.py │ ├── postprocessors.py │ ├── preprocessors.py │ ├── tasks.py │ ├── tasks_natural.py │ └── tasks_t5.py ├── utils.py └── vocab.py ├── evals ├── eval-all.sh ├── eval-bbh.sh ├── eval-codexglue.sh ├── eval-cot.sh ├── eval-held_in.sh ├── eval-mmlu.sh └── eval-sglue.sh ├── experiments ├── benchmarks │ ├── t5-v1.1-lm100k │ │ ├── finetune_base_code_to_text.sh │ │ ├── finetune_large_code_to_text.sh │ │ ├── sglue_base.sh │ │ ├── sglue_large.sh │ │ ├── t0-train_base.sh │ │ └── t0-train_large.sh │ ├── t5-v1.1-lm_adapt │ │ ├── base_flan.sh │ │ ├── base_flan2021.sh │ │ ├── large_flan.sh │ │ ├── large_flan2021.sh │ │ ├── xl_flan.sh │ │ ├── xl_flan2021.sh │ │ ├── xxl_flan.sh │ │ └── xxl_flan2021.sh │ └── t5-v1.1 │ │ ├── base_code_to_text.sh │ │ ├── base_flan2021.sh │ │ ├── base_flan2022.sh │ │ ├── base_sglue.sh │ │ ├── base_t0-train.sh │ │ ├── large_code_to_text.sh │ │ ├── large_flan2021.sh │ │ ├── large_flan2022.sh │ │ ├── large_sglue.sh │ │ ├── large_t0-train.sh │ │ ├── xl_code_to_text.sh │ │ ├── xl_flan2021.sh │ │ ├── xl_flan2022.sh │ │ ├── xl_sglue.sh │ │ ├── xxl_code_to_text.sh │ │ ├── xxl_flan2021.sh │ │ ├── xxl_flan2022.sh │ │ └── xxl_sglue.sh ├── improved_t5 │ ├── ablations │ │ ├── v1-1_xl_flan2021_submix.sh │ │ └── v2_xl_flan2021_submix.sh │ ├── eval_bf16.sh │ ├── eval_fp16.sh │ ├── lm_adapt │ │ ├── finetune_base_flan.sh │ │ ├── finetune_large_flan.sh │ │ ├── finetune_xl_flan.sh │ │ ├── finetune_xxl_flan.sh │ │ ├── lm_adapt_base.sh │ │ ├── lm_adapt_large.sh │ │ ├── lm_adapt_xl.sh │ │ ├── lm_adapt_xxl.sh │ │ ├── s_causal │ │ │ ├── finetune_base_flan.sh │ │ │ ├── finetune_large_flan.sh │ │ │ ├── finetune_xl_flan.sh │ │ │ ├── finetune_xxl_flan.sh │ │ │ ├── lm_adapt_base.sh │ │ │ ├── lm_adapt_large.sh │ │ │ ├── lm_adapt_xl.sh │ │ │ └── lm_adapt_xxl.sh │ │ └── ul2r │ │ │ ├── finetune_base_ns_flan2021.sh │ │ │ ├── finetune_xl_flan2021.sh │ │ │ ├── finetune_xl_flan2022.sh │ │ │ ├── lm_adapt_base.sh │ │ │ ├── lm_adapt_base_ns.sh │ │ │ ├── lm_adapt_large.sh │ │ │ ├── lm_adapt_xl.sh │ │ │ └── lm_adapt_xxl.sh │ ├── mlm │ │ ├── extend-causal.sh │ │ ├── extend-prefix.sh │ │ ├── finetune_code_to_text.sh │ │ ├── finetune_flan2022.sh │ │ ├── finetune_sglue.sh │ │ ├── finetune_t0.sh │ │ ├── pretrain_mlm.sh │ │ └── pretrain_mlm_causal.sh │ └── ul2_causal │ │ ├── finetune_base_code_to_text.sh │ │ ├── finetune_base_flan.sh │ │ ├── finetune_base_sglue.sh │ │ ├── finetune_large_code_to_text.sh │ │ ├── finetune_large_flan.sh │ │ ├── finetune_large_sglue.sh │ │ ├── finetune_xl_code_to_text.sh │ │ ├── finetune_xl_flan2021.sh │ │ ├── finetune_xl_sglue.sh │ │ ├── finetune_xxl_flan.sh │ │ ├── finetune_xxl_sglue.sh │ │ ├── pretrain_base.sh │ │ ├── pretrain_large.sh │ │ ├── pretrain_xl.sh │ │ └── pretrain_xxl.sh └── preliminary │ ├── layernorm │ ├── eval │ │ ├── t0_eval_adamw_post.sh │ │ ├── t0_eval_adamw_post_rotary.sh │ │ ├── t0_eval_adamw_pre.sh │ │ ├── t0_eval_alibi_relpos.sh │ │ └── t0_eval_base_lm100k.sh │ ├── finetune │ │ ├── t0_train_adafactor_post.sh │ │ ├── t0_train_adafactor_pre.sh │ │ ├── t0_train_adamw_post.sh │ │ ├── t0_train_adamw_post_rotary.sh │ │ ├── t0_train_adamw_pre.sh │ │ └── t0_train_base_lm100k.sh │ ├── pretrain │ │ ├── pile_mlm_adafactor_post.sh │ │ ├── pile_mlm_adafactor_pre.sh │ │ ├── pile_mlm_adamw_post.sh │ │ ├── pile_mlm_adamw_post_rotary.sh │ │ ├── pile_mlm_adamw_pre.sh │ │ └── pile_mlm_adamw_pre_rotary.sh │ └── t0_eval.py │ ├── 
mixed_pretraining_objectives │ ├── finetune │ │ ├── 0_10 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_15 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_25 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_50 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ ├── 0_60 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ │ └── 0_75 │ │ │ ├── sglue_finetune_920m_128000.sh │ │ │ ├── sglue_finetune_920m_160000.sh │ │ │ ├── sglue_finetune_920m_192000.sh │ │ │ ├── sglue_finetune_920m_224000.sh │ │ │ ├── sglue_finetune_920m_256000.sh │ │ │ ├── sglue_finetune_920m_32000.sh │ │ │ ├── sglue_finetune_920m_64000.sh │ │ │ └── sglue_finetune_920m_96000.sh │ ├── pretrain_c4 │ │ ├── 920m_c4_mlm_0_10.sh │ │ ├── 920m_c4_mlm_0_15.sh │ │ ├── 920m_c4_mlm_0_25.sh │ │ ├── 920m_c4_mlm_0_50.sh │ │ ├── 920m_c4_mlm_0_75.sh │ │ └── 920m_c4_mlm_1_00.sh │ ├── pretrain_pile │ │ ├── 920m_pile_mix_0_10.sh │ │ ├── 920m_pile_mix_0_15.sh │ │ ├── 920m_pile_mix_0_25.sh │ │ ├── 920m_pile_mix_0_50.sh │ │ ├── 920m_pile_mix_0_60.sh │ │ └── 920m_pile_mix_0_75.sh │ ├── sglue_graph.py │ ├── super_glue_performance_0-10.png │ └── super_glue_performance_flop_256k.png │ ├── partition │ ├── pretrain_pile_1.sh │ ├── pretrain_pile_2.sh │ ├── pretrain_pile_4.sh │ └── pretrain_pile_8.sh │ ├── positional_embeddings │ ├── alibi │ │ ├── eval_perplexity │ │ │ ├── no_alibi_1024_1024.sh │ │ │ ├── no_alibi_1024_114.sh │ │ │ ├── no_alibi_1024_2048.sh │ │ │ ├── no_alibi_1024_256.sh │ │ │ ├── no_alibi_1024_512.sh │ │ │ ├── no_alibi_1_1024.sh │ │ │ ├── no_alibi_1_114.sh │ │ │ ├── no_alibi_1_2048.sh │ │ │ ├── no_alibi_1_256.sh │ │ │ ├── no_alibi_1_512.sh │ │ │ ├── no_alibi_512_1024.sh │ │ │ ├── no_alibi_512_114.sh │ │ │ ├── no_alibi_512_2048.sh │ │ │ ├── no_alibi_512_256.sh │ │ │ ├── no_alibi_512_512.sh │ │ │ ├── with_alibi_1024_1024.sh │ │ │ ├── with_alibi_1024_114.sh │ │ │ ├── with_alibi_1024_2048.sh │ │ │ ├── with_alibi_1024_256.sh │ │ │ ├── with_alibi_1024_512.sh │ │ │ ├── with_alibi_1_1024.sh │ │ │ ├── with_alibi_1_114.sh │ │ │ ├── with_alibi_1_2048.sh │ │ │ ├── with_alibi_1_256.sh │ │ │ ├── with_alibi_1_512.sh │ │ │ ├── with_alibi_512_1024.sh │ │ │ ├── with_alibi_512_114.sh │ │ │ ├── 
with_alibi_512_2048.sh │ │ │ ├── with_alibi_512_256.sh │ │ │ └── with_alibi_512_512.sh │ │ ├── finetune_sglue_prefix_lm_no_alibi.sh │ │ ├── finetune_sglue_prefix_lm_with_alibi.sh │ │ ├── finetune_sglue_prefix_lm_with_alibi_plus_relpos.sh │ │ ├── pretrain_pile_prefix_lm_no_alibi.sh │ │ ├── pretrain_pile_prefix_lm_with_alibi.sh │ │ └── pretrain_pile_prefix_lm_with_alibi_plus_relpos.sh │ ├── pretrain_rotary_pile_mlm.sh │ └── rotary │ │ ├── eval │ │ ├── t0_eval_alibi_relpos.sh │ │ ├── t0_eval_benchmark.sh │ │ ├── t0_eval_metro_learning_only_training.sh │ │ ├── t0_eval_metro_no_alibi.sh │ │ └── t0_eval_metro_training.sh │ │ ├── finetune │ │ └── sglue │ │ │ ├── sglue_train_benchmark.sh │ │ │ ├── sglue_train_rotary.sh │ │ │ └── sglue_train_rotary_relpos.sh │ │ ├── pretrain_benchmark_pile_mlm.sh │ │ ├── pretrain_rotary_relpos_pile_mlm.sh │ │ ├── sglue_graph.py │ │ └── t0_eval.py │ └── scaling_laws │ ├── 110m │ ├── sglue_finetune_110m_16000.sh │ ├── sglue_finetune_110m_32000.sh │ ├── sglue_finetune_110m_48000.sh │ ├── sglue_finetune_110m_64000.sh │ └── sglue_finetune_110m_80000.sh │ ├── 1_6b │ ├── sglue_finetune_1_6b_128000.sh │ ├── sglue_finetune_1_6b_192000.sh │ ├── sglue_finetune_1_6b_256000.sh │ ├── sglue_finetune_1_6b_320000.sh │ ├── sglue_finetune_1_6b_384000.sh │ ├── sglue_finetune_1_6b_424000.sh │ ├── sglue_finetune_1_6b_448000.sh │ ├── sglue_finetune_1_6b_512000.sh │ └── sglue_finetune_1_6b_64000.sh │ ├── 25m │ ├── sglue_finetune_25m_16000.sh │ ├── sglue_finetune_25m_24000.sh │ ├── sglue_finetune_25m_32000.sh │ ├── sglue_finetune_25m_40000.sh │ ├── sglue_finetune_25m_48000.sh │ ├── sglue_finetune_25m_56000.sh │ ├── sglue_finetune_25m_64000.sh │ └── sglue_finetune_25m_8000.sh │ ├── 920m │ ├── pretrain_c4.sh │ ├── pretrain_pile.sh │ ├── sglue_finetune_920m_128000.sh │ ├── sglue_finetune_920m_160000.sh │ ├── sglue_finetune_920m_192000.sh │ ├── sglue_finetune_920m_224000.sh │ ├── sglue_finetune_920m_256000.sh │ ├── sglue_finetune_920m_32000.sh │ ├── sglue_finetune_920m_64000.sh │ └── sglue_finetune_920m_96000.sh │ ├── sglue_finetune.sh │ ├── sglue_graph.py │ ├── sglue_graph_.py │ ├── super_glue_performance.png │ └── super_glue_performance_flop.png ├── models ├── decoder_t5 │ ├── __init__.py │ └── modeling_decoder_t5.py └── scalable_t5 │ ├── README.md │ ├── __init__.py │ ├── alibi_position_biases.py │ ├── alibi_position_biases_test.py │ ├── layers.py │ ├── layers_test.py │ ├── local_tiny.gin │ ├── mt5 │ ├── __init__.py │ ├── base.gin │ ├── large.gin │ ├── small.gin │ ├── xl.gin │ └── xxl.gin │ ├── network.py │ ├── network_test.py │ ├── rotary_embedding.py │ ├── rotary_embedding_test.py │ └── t5_1_1 │ ├── __init__.py │ ├── base.gin │ ├── examples │ ├── __init__.py │ └── wmt19_ende_from_scratch.gin │ ├── large.gin │ ├── small.gin │ ├── xl.gin │ └── xxl.gin ├── setup.py └── tpu-scripts ├── kill.sh ├── run.sh ├── send.sh └── setup.sh /cache_t0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=g40 3 | #SBATCH --job-name=pile-t5x 4 | #SBATCH --nodes=1 5 | #SBATCH --gpus-per-node=8 6 | #SBATCH --ntasks-per-node=12 7 | #SBATCH --output=/fsx/lintangsutawika/improved_t5/logs/%x_%j.out 8 | #SBATCH --exclusive 9 | #SBATCH --requeue 10 | #SBATCH --account=neox 11 | 12 | source /fsx/lintangsutawika/t5_env/bin/activate 13 | 14 | srun --account neox \ 15 | seqio_cache_tasks \ 16 | --tasks="anli_must_be_true_r1" \ 17 | --output_cache_dir=/fsx/lintangsutawika/data \ 18 | --module_import=t0.seqio_tasks \ 19 | --alsologtostderr 20 | 
-------------------------------------------------------------------------------- /configs/exp/LayerNorm/post_layernorm.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.pre_layer_norm = False -------------------------------------------------------------------------------- /configs/exp/LayerNorm/pre_layernorm.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.pre_layer_norm = True 2 | -------------------------------------------------------------------------------- /configs/exp/LayerNorm/reset_optim.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | from t5x import utils 4 | 5 | utils.RestoreCheckpointConfig: 6 | assignment_map = [(".*param_states.*", None)] 7 | fallback_to_scratch = True 8 | -------------------------------------------------------------------------------- /configs/exp/LayerNorm/training.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import optax 4 | 5 | from t5x import utils 6 | from t5x import optimizers 7 | 8 | BATCH_SIZE = 2048 9 | DROPOUT_RATE = 0.1 10 | 11 | # ------------------- Optimizer ------------------------------------------------ 12 | # `learning_rate` is set by `Trainer.learning_rate_fn`. 13 | OPTIMIZER = @optimizers.chain() 14 | 15 | optimizers.chain: 16 | transformations = [@optax.adamw()] 17 | 18 | optax.adamw: 19 | learning_rate = @utils.create_learning_rate_scheduler() 20 | eps = 1e-06 21 | b1 = 0.9 22 | b2 = 0.98 23 | weight_decay = 0.01 24 | 25 | utils.create_learning_rate_scheduler: 26 | factors = 'linear_decay' 27 | decay_factor = 8e-06 28 | base_learning_rate = 4e-04 29 | warmup_steps = 10000 30 | -------------------------------------------------------------------------------- /configs/exp/LayerNorm/training_pp.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import optax 4 | 5 | from t5x import utils 6 | from t5x import optimizers 7 | 8 | BATCH_SIZE = 2048 9 | DROPOUT_RATE = 0.1 10 | 11 | # ------------------- Optimizer ------------------------------------------------ 12 | # `learning_rate` is set by `Trainer.learning_rate_fn`. 
13 | OPTIMIZER = @optimizers.chain() 14 | 15 | optimizers.chain: 16 | transformations = [@optax.clip(), @optax.adamw()] 17 | 18 | optax.clip: 19 | max_delta = 2.0 20 | 21 | optax.adamw: 22 | learning_rate = @utils.create_learning_rate_scheduler() 23 | eps = 1e-06 24 | b1 = 0.9 25 | b2 = 0.98 26 | weight_decay = 0.01 27 | 28 | utils.create_learning_rate_scheduler: 29 | factors = 'linear_decay' 30 | decay_factor = 8e-06 31 | base_learning_rate = 4e-04 32 | warmup_steps = 10000 33 | -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/abs_pos.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_abs_pos_embedding = True -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/alibi.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_alibi = True -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/no_relpos.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_rel_pos = False -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/relpos.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_rel_pos = True -------------------------------------------------------------------------------- /configs/exp/PositionEmbedding/rotary.gin: -------------------------------------------------------------------------------- 1 | network.T5Config.use_rotary_embedding = True -------------------------------------------------------------------------------- /configs/exp/batch_size.gin: -------------------------------------------------------------------------------- 1 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/gptj.gin: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | network.T5Config.gptj = True 3 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/memorization.gin: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | 3 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/optim.gin: -------------------------------------------------------------------------------- 1 | TRAIN_STEPS = 32000 2 | BATCH_SIZE = 2048 3 | DECAY_STEPS = 32000 4 | LEARNING_RATE = 2.0e-4 5 | 6 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/partition.gin: -------------------------------------------------------------------------------- 1 | NUM_PARTITIONS = 1 2 | TRAIN_STEPS = 100 3 | SAVING_PERIOD = 100 4 | 5 | partitioning.PjitPartitioner: 6 | num_partitions = %NUM_PARTITIONS -------------------------------------------------------------------------------- /configs/exp/ratio.gin: -------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | 3 | utils.SaveCheckpointConfig.keep = 1 -------------------------------------------------------------------------------- /configs/exp/scaling.gin: 
-------------------------------------------------------------------------------- 1 | BATCH_SIZE = 2048 2 | 3 | utils.SaveCheckpointConfig: 4 | keep = None 5 | period = %SAVING_PERIOD -------------------------------------------------------------------------------- /configs/size/110m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 640 3 | num_heads = 5 4 | num_encoder_layers = 10 5 | num_decoder_layers = 10 6 | head_dim = 128 7 | mlp_dim = 1920 -------------------------------------------------------------------------------- /configs/size/1_6b/deep.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1024 3 | num_heads = 8 4 | num_encoder_layers = 48 5 | num_decoder_layers = 48 6 | head_dim = 128 7 | mlp_dim = 3072 8 | -------------------------------------------------------------------------------- /configs/size/1_6b/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1536 3 | num_heads = 12 4 | num_encoder_layers = 24 5 | num_decoder_layers = 24 6 | head_dim = 128 7 | mlp_dim = 4096 8 | -------------------------------------------------------------------------------- /configs/size/1_6b/wide.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 2048 3 | num_heads = 16 4 | num_encoder_layers = 12 5 | num_decoder_layers = 12 6 | head_dim = 128 7 | mlp_dim = 6144 8 | -------------------------------------------------------------------------------- /configs/size/200m/deep.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 512 3 | num_heads = 4 4 | num_encoder_layers = 24 5 | num_decoder_layers = 24 6 | head_dim = 128 7 | mlp_dim = 1536 -------------------------------------------------------------------------------- /configs/size/200m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 768 3 | num_heads = 6 4 | num_encoder_layers = 12 5 | num_decoder_layers = 12 6 | head_dim = 128 7 | mlp_dim = 2048 -------------------------------------------------------------------------------- /configs/size/200m/vanilla_gpt.gin: -------------------------------------------------------------------------------- 1 | network.TransformerConfig: 2 | emb_dim = 768 3 | num_heads = 6 4 | num_layers = 12 5 | head_dim = 128 6 | mlp_dim = 2048 7 | -------------------------------------------------------------------------------- /configs/size/200m/wide.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1024 3 | num_heads = 8 4 | num_encoder_layers = 6 5 | num_decoder_layers = 6 6 | head_dim = 128 7 | mlp_dim = 3072 -------------------------------------------------------------------------------- /configs/size/25m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 384 3 | num_heads = 3 4 | num_encoder_layers = 6 5 | num_decoder_layers = 6 6 | head_dim = 128 7 | mlp_dim = 1152 -------------------------------------------------------------------------------- /configs/size/3_1b/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1920 3 | num_heads = 15 4 | 
num_encoder_layers = 20 5 | num_decoder_layers = 20 6 | head_dim = 128 7 | mlp_dim = 5760 -------------------------------------------------------------------------------- /configs/size/470m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1024 3 | num_heads = 8 4 | num_encoder_layers = 16 5 | num_decoder_layers = 16 6 | head_dim = 128 7 | mlp_dim = 3072 -------------------------------------------------------------------------------- /configs/size/60m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 512 3 | num_heads = 4 4 | num_encoder_layers = 8 5 | num_decoder_layers = 8 6 | head_dim = 128 7 | mlp_dim = 1536 -------------------------------------------------------------------------------- /configs/size/920m/vanilla.gin: -------------------------------------------------------------------------------- 1 | network.T5Config: 2 | emb_dim = 1280 3 | num_heads = 10 4 | num_encoder_layers = 20 5 | num_decoder_layers = 20 6 | head_dim = 128 7 | mlp_dim = 3840 -------------------------------------------------------------------------------- /configs/task/eval/t0_eval.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import __main__ as train_script 4 | from t5x import utils 5 | 6 | import data.p3.tasks 7 | 8 | include 't5x/configs/runs/finetune.gin' 9 | 10 | MIXTURE_OR_TASK_NAME = "t0_eval_score_eval" 11 | TASK_FEATURE_LENGTHS = {'inputs': 1024, 'targets': 256} 12 | 13 | DROPOUT_RATE = 0.1 14 | BATCH_SIZE = 1024 15 | EVAL_STEPS = 100 16 | EVAL_PERIOD = %SAVING_PERIOD 17 | 18 | train_script.train: 19 | run_eval_before_training = True -------------------------------------------------------------------------------- /configs/task/finetune/t0_train.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import __main__ as train_script 4 | from t5x import utils 5 | 6 | import data.p3.tasks 7 | 8 | include 't5x/configs/runs/finetune.gin' 9 | 10 | MIXTURE_OR_TASK_NAME = "t0_train" 11 | TASK_FEATURE_LENGTHS = {'inputs': 1024, 'targets': 256} 12 | 13 | DROPOUT_RATE = 0.1 14 | BATCH_SIZE = 1024 15 | EVAL_STEPS = 100 16 | EVAL_PERIOD = %SAVING_PERIOD 17 | 18 | utils.SaveCheckpointConfig: 19 | period = %SAVING_PERIOD 20 | -------------------------------------------------------------------------------- /configs/task/pretrain/c4_mlm.gin: -------------------------------------------------------------------------------- 1 | from __gin__ import dynamic_registration 2 | 3 | import __main__ as train_script 4 | from t5x import partitioning 5 | from t5x import utils 6 | from t5x import trainer 7 | 8 | import data.c4.tasks 9 | 10 | include 't5x/configs/runs/pretrain.gin' 11 | 12 | MIXTURE_OR_TASK_NAME = "c4_eye_span_corruption" 13 | TASK_FEATURE_LENGTHS = {"inputs": 512, "targets": 114} -------------------------------------------------------------------------------- /convert_weights/configs/base_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/hugging_face/t5/t5-v1_1-base", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 2048, 7 | "d_kv": 64, 8 | "d_model": 768, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": 
"gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 12, 18 | "num_heads": 12, 19 | "num_layers": 12, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } -------------------------------------------------------------------------------- /convert_weights/configs/base_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/base_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/base_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/base_v2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "UMT5ForConditionalGeneration" 4 | ], 5 | "d_ff": 2048, 6 | "d_kv": 64, 7 | "d_model": 768, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "gelu_new", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 2, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "is_gated_act": true, 16 | "layer_norm_epsilon": 1e-06, 17 | "model_type": "umt5", 18 | "num_decoder_layers": 12, 19 | "num_heads": 12, 20 | "num_layers": 12, 21 | "output_past": true, 22 | "pad_token_id": 0, 23 | "relative_attention_max_distance": 128, 24 | "relative_attention_num_buckets": 32, 25 | "scalable_attention": true, 26 | "tie_word_embeddings": false, 27 | "tokenizer_class": "LlamaTokenizerFast", 28 | "torch_dtype": "bfloat16", 29 | "transformers_version": "4.31.0", 30 | "use_cache": true, 31 | "vocab_size": 32128 32 | } 33 | -------------------------------------------------------------------------------- /convert_weights/configs/base_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/base_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/base_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/configs/large_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/hugging_face/t5/t5-v1_1-large", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 2816, 7 | "d_kv": 64, 8 | "d_model": 1024, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | 
"layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 24, 18 | "num_heads": 16, 19 | "num_layers": 24, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } -------------------------------------------------------------------------------- /convert_weights/configs/large_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/large_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/large_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/large_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/large_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/large_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/configs/xl_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/t5/t5-v1_1-xl", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 5120, 7 | "d_kv": 64, 8 | "d_model": 2048, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 24, 18 | "num_heads": 32, 19 | "num_layers": 24, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } -------------------------------------------------------------------------------- /convert_weights/configs/xl_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/xl_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xl_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/xl_v2/config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "UMT5ForConditionalGeneration" 4 | ], 5 | "d_ff": 5120, 6 | "d_kv": 64, 7 | "d_model": 2048, 8 | "decoder_start_token_id": 0, 9 | "dense_act_fn": "gelu_new", 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 2, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "is_gated_act": true, 16 | "layer_norm_epsilon": 1e-06, 17 | "model_type": "umt5", 18 | "num_decoder_layers": 24, 19 | "num_heads": 32, 20 | "num_layers": 24, 21 | "output_past": true, 22 | "pad_token_id": 0, 23 | "relative_attention_max_distance": 128, 24 | "relative_attention_num_buckets": 32, 25 | "scalable_attention": true, 26 | "tie_word_embeddings": false, 27 | "tokenizer_class": "LlamaTokenizerFast", 28 | "torch_dtype": "bfloat16", 29 | "transformers_version": "4.31.0", 30 | "use_cache": true, 31 | "vocab_size": 32128 32 | } 33 | -------------------------------------------------------------------------------- /convert_weights/configs/xl_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/xl_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xl_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v1/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/patrick/t5/t5-v1_1-xxl", 3 | "architectures": [ 4 | "T5ForConditionalGeneration" 5 | ], 6 | "d_ff": 10240, 7 | "d_kv": 64, 8 | "d_model": 4096, 9 | "decoder_start_token_id": 0, 10 | "dropout_rate": 0.1, 11 | "eos_token_id": 1, 12 | "feed_forward_proj": "gated-gelu", 13 | "initializer_factor": 1.0, 14 | "is_encoder_decoder": true, 15 | "layer_norm_epsilon": 1e-06, 16 | "model_type": "t5", 17 | "num_decoder_layers": 24, 18 | "num_heads": 64, 19 | "num_layers": 24, 20 | "output_past": true, 21 | "pad_token_id": 0, 22 | "relative_attention_num_buckets": 32, 23 | "tie_word_embeddings": false, 24 | "vocab_size": 32128 25 | } 26 | -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v1/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 1, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.27.0.dev0" 7 | } -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v1/spiece.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xxl_v1/spiece.model -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v2/generation_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"_from_model_config": true, 3 | "decoder_start_token_id": 0, 4 | "eos_token_id": 2, 5 | "pad_token_id": 0, 6 | "transformers_version": "4.31.0" 7 | } 8 | -------------------------------------------------------------------------------- /convert_weights/configs/xxl_v2/tokenizer.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/convert_weights/configs/xxl_v2/tokenizer.model -------------------------------------------------------------------------------- /convert_weights/scripts/convert_v1.sh: -------------------------------------------------------------------------------- 1 | mkdir -p $3 2 | 3 | python convert_t5v1_checkpoint_to_pytorch.py \ 4 | --config_file configs/${1}_v1/config.json \ 5 | --t5x_checkpoint_path $2 \ 6 | --pytorch_dump_path $3 7 | 8 | cp configs/${1}_v1/* $3 -------------------------------------------------------------------------------- /convert_weights/scripts/convert_v2.sh: -------------------------------------------------------------------------------- 1 | mkdir -p $3 2 | 3 | python convert_t5v2_checkpoint_to_pytorch.py \ 4 | --config_file configs/${1}_v2/config.json \ 5 | --t5x_checkpoint_path $2 \ 6 | --pytorch_dump_path $3 \ 7 | --scalable_attention 8 | 9 | cp configs/${1}_v2/* $3 -------------------------------------------------------------------------------- /convert_weights/upload-codexglue.sh: -------------------------------------------------------------------------------- 1 | SIZE=$1 2 | LANG=$2 3 | HF_MODEL_PATH=$3 4 | HF_PATH=$4 5 | T5X_PATH=$5 6 | 7 | mkdir -p "${HF_PATH}" 8 | git lfs install 9 | git clone "https://huggingface.co/${HF_MODEL_PATH}" "${HF_PATH}" 10 | git -C "${HF_PATH}" remote set-url origin "https://${HF_USERNAME}:${HF_KEY}@huggingface.co/${HF_MODEL_PATH}" 11 | huggingface-cli lfs-enable-largefiles "${HF_PATH}" 12 | 13 | # Switch branch 14 | git -C "${HF_PATH}" checkout -b "$LANG" 15 | git -C "${HF_PATH}" config http.postBuffer 524288000 16 | 17 | bash scripts/convert_v2.sh ${SIZE} $T5X_PATH $HF_PATH 18 | 19 | git -C "${HF_PATH}" add . 20 | git -C "${HF_PATH}" commit -am "add files for finetuning on $LANG" 21 | git -C "${HF_PATH}" push origin "$LANG" 22 | git -C "${HF_PATH}" checkout main 23 | -------------------------------------------------------------------------------- /convert_weights/upload.sh: -------------------------------------------------------------------------------- 1 | SIZE=$1 2 | HF_MODEL_PATH=$2 3 | HF_PATH=$3 4 | T5X_PATH=$4 5 | 6 | mkdir -p "${HF_PATH}" 7 | git lfs install 8 | git clone "https://huggingface.co/${HF_MODEL_PATH}" "${HF_PATH}" 9 | git -C "${HF_PATH}" remote set-url origin "https://${HF_USERNAME}:${HF_KEY}@huggingface.co/${HF_MODEL_PATH}" 10 | huggingface-cli lfs-enable-largefiles "${HF_PATH}" 11 | 12 | # in main branch 13 | git -C "${HF_PATH}" checkout main 14 | git -C "${HF_PATH}" config http.postBuffer 524288000 15 | 16 | bash scripts/convert_v2.sh ${SIZE} $T5X_PATH $HF_PATH 17 | 18 | git -C "${HF_PATH}" add . 
19 | git -C "${HF_PATH}" commit -am "add files" 20 | git -C "${HF_PATH}" push origin main 21 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 2 | 3 | from data.utils import * 4 | from data.vocab import * 5 | from data.metrics import * 6 | from data.preprocessors import * 7 | -------------------------------------------------------------------------------- /data/bigbenchlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/bigbenchlite/__init__.py -------------------------------------------------------------------------------- /data/bigbenchlite/tasks.py: -------------------------------------------------------------------------------- 1 | import seqio 2 | from bigbench.bbseqio import task_api 3 | from bigbench.bbseqio import tasks 4 | 5 | from t5x.data.vocab import DEFAULT_OUTPUT_FEATURES, get_default_vocabulary 6 | 7 | default_vocab = task_api.SeqIOVocabulary( 8 | name="default", 9 | description="default vocab", 10 | vocabulary=get_default_vocabulary()) 11 | 12 | # Register BIG-bench lite tasks. 13 | # bigbench:bigbench_lite_v1.mix.default_vocab.0_shot.all_examples 14 | num_shots = 0 15 | tasks.register_bigbench_lite(num_shots, default_vocab) -------------------------------------------------------------------------------- /data/c4/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/c4/__init__.py -------------------------------------------------------------------------------- /data/c4/c4_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 2 | import os 3 | 4 | def get_c4_files(path): 5 | """name of the c4 files""" 6 | 7 | num_files_c4=1024 8 | file_list = [os.path.join(path, f"c4-train.{i:05}-of-01024.json") for i in range(num_files_c4)] 9 | 10 | return { 11 | "train": file_list[:-1], 12 | "validation": file_list[-2:-1], 13 | "test": file_list[-1:], 14 | } 15 | -------------------------------------------------------------------------------- /data/codexglue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/codexglue/__init__.py -------------------------------------------------------------------------------- /data/flan/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/flan/__init__.py -------------------------------------------------------------------------------- /data/metrics.py: -------------------------------------------------------------------------------- 1 | import seqio 2 | import numpy as np 3 | 4 | from typing import Sequence 5 | 6 | def perplexity(targets: Sequence[str], scores: Sequence[int]): 7 | 8 | cross_entropy = -np.mean(scores)/len(targets) 9 | perplexity = np.exp(cross_entropy) 10 | 11 | return { 12 | "perplexity": seqio.metrics.Scalar(perplexity) 13 | } 14 | -------------------------------------------------------------------------------- /data/p3/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/p3/__init__.py -------------------------------------------------------------------------------- /data/pile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/pile/__init__.py -------------------------------------------------------------------------------- /data/pile/pile_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 2 | import os 3 | 4 | def get_pile_files(path): 5 |     """name of the pile files""" 6 | 7 |     file_list = [os.path.join(path, f"{i:02}.txt") for i in range(20)] 8 |     return { 9 |         "train": file_list[:-1], 10 |         "validation": file_list[-1:], 11 |         "test": file_list[-1:], 12 |     } 13 | 14 | def get_minipile_files(path, num_files): 15 |     """name of the minipile files""" 16 | 17 |     file_list = [os.path.join(path, f"shuffled_00_x0{i:02}.txt") for i in range(num_files)] 18 |     return { 19 |         "train": file_list[:-1], 20 |         "validation": file_list[-1:], 21 |         "test": file_list[-1:], 22 |     } 23 | -------------------------------------------------------------------------------- /data/sglue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/data/sglue/__init__.py -------------------------------------------------------------------------------- /evals/eval-all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | echo "Evaluating ${MODEL}" 8 | echo "BBH" 9 | bash eval-bbh.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 10 | echo "MMLU" 11 | bash eval-mmlu.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 12 | echo "Held In" 13 | bash eval-held_in.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 14 | echo "CoT" 15 | bash eval-cot.sh ${MODEL_PATH} ${MODEL} ${LM_EVAL} ${EXTRA} 16 | -------------------------------------------------------------------------------- /evals/eval-bbh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | for TASK in bbh_zeroshot bbh_fewshot 8 | do 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 8 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | done 17 | -------------------------------------------------------------------------------- /evals/eval-codexglue.sh: -------------------------------------------------------------------------------- 1 | MODEL_PATH=$1 2 | MODEL=$2 3 | LM_EVAL=$3 4 | EXTRA=$4 5 | 6 | for LANG in go java php python ruby javascript; do 7 | ${LM_EVAL} \ 8 |     --model hf \ 9 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA},revision=${LANG}" \ 10 |     --tasks "code2text_${LANG}" \ 11 |     --batch_size 4 \ 12 |     --output "output/codexglue_code2text/${MODEL}/${LANG}/" \ 13 |     --log_samples 14 | done 15 | 
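All of the eval-*.sh scripts above take the same four positional arguments (MODEL_PATH, MODEL, LM_EVAL, EXTRA) and splice them into lm-eval's model arguments as "pretrained=${MODEL_PATH}${MODEL}${EXTRA}". A usage sketch, run from the evals/ directory since eval-all.sh calls its sibling scripts by relative path; the org name, model name, and dtype suffix below are hypothetical values for illustration, not taken from this repo:

#!/bin/bash
# Hypothetical invocation: evaluate "EleutherAI/pile-t5-base" with bf16 weights
# on the BBH, MMLU, held-in, and CoT suites via eval-all.sh.
cd evals
bash eval-all.sh "EleutherAI/" "pile-t5-base" "lm_eval" ",dtype=bfloat16"
# Inside each script this expands to, for example:
#   lm_eval --model hf \
#     --model_args "pretrained=EleutherAI/pile-t5-base,dtype=bfloat16" \
#     --tasks bbh_zeroshot --batch_size 8 \
#     --output "output/bbh_zeroshot/pile-t5-base" --log_samples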
-------------------------------------------------------------------------------- /evals/eval-cot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | for TASK in bbh_cot_zeroshot bbh_cot_fewshot mmlu_flan_cot_zeroshot mmlu_flan_cot_fewshot 8 | do 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 4 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | done 17 | -------------------------------------------------------------------------------- /evals/eval-held_in.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | TASK=flan_held_in 8 | 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 8 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | -------------------------------------------------------------------------------- /evals/eval-mmlu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | for TASK in mmlu_generative mmlu_flan_n_shot_generative 8 | do 9 | for NUM in 0 5 10 | do 11 | ${LM_EVAL} \ 12 |     --model hf \ 13 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 14 |     --tasks ${TASK} \ 15 |     --batch_size 8 \ 16 |     --output "output/${TASK}/${MODEL}-${NUM}-shot" \ 17 |     --num_fewshot $NUM \ 18 |     --log_samples 19 | done 20 | done 21 | 22 | for TASK in mmlu mmlu_flan_n_shot_loglikelihood 23 | do 24 | for NUM in 0 5 25 | do 26 | ${LM_EVAL} \ 27 |     --model hf \ 28 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA}" \ 29 |     --tasks ${TASK} \ 30 |     --batch_size 8 \ 31 |     --output "output/${TASK}/${MODEL}-${NUM}-shot" \ 32 |     --num_fewshot $NUM \ 33 |     --log_samples 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /evals/eval-sglue.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MODEL_PATH=$1 3 | MODEL=$2 4 | LM_EVAL=$3 5 | EXTRA=$4 6 | 7 | export TASK="super-glue-t5-prompt" 8 | 9 | ${LM_EVAL} \ 10 |     --model hf \ 11 |     --model_args "pretrained=${MODEL_PATH}${MODEL}${EXTRA},truncation=True,max_length=512" \ 12 |     --tasks ${TASK} \ 13 |     --batch_size 8 \ 14 |     --output "output/${TASK}/${MODEL}" \ 15 |     --log_samples 16 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/sglue_base.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 |     --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 4 |     --gin_file="configs/task/finetune/sglue.gin" \ 5 |     --gin.TRAIN_STEPS=1_128_000 \ 6 |     --gin.SAVING_PERIOD=4000 \ 7 |     --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/sglue_finetune\" \ 8 |     --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \ 9 |     --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 |     --alsologtostderr -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/sglue_large.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 |     --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 4 |     --gin_file="configs/task/finetune/sglue.gin" \ 5 |     --gin.TRAIN_STEPS=1_128_000 \ 6 |     --gin.SAVING_PERIOD=4000 \ 7 |
--gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/sglue_finetune\" \ 8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \ 9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 | --alsologtostderr -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/t0-train_base.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 4 | --gin_file="configs/task/finetune/t0_train.gin" \ 5 | --gin.TRAIN_STEPS=1_128_000 \ 6 | --gin.SAVING_PERIOD=4000 \ 7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/t0-train_finetune\" \ 8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \ 9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 | --alsologtostderr -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm100k/t0-train_large.sh: -------------------------------------------------------------------------------- 1 | 2 | python3 -m t5x.train \ 3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 4 | --gin_file="configs/task/finetune/t0_train.gin" \ 5 | --gin.TRAIN_STEPS=1_128_000 \ 6 | --gin.SAVING_PERIOD=4000 \ 7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/t0-train_finetune\" \ 8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \ 9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \ 10 | --alsologtostderr 11 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/base_flan.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/finetune/flan_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_184_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_base/flan_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_base/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/base_flan2021.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_184_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_base/flan2021submix_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_base/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/large_flan.sh: 
-------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 6 | --gin_file="configs/task/finetune/flan_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_164_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_large/flan_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_large/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/large_flan2021.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \ 6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \ 7 | --gin.TRAIN_STEPS=1_164_000 \ 8 | --gin.SAVING_PERIOD=10_000 \ 9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_large/flan2021_finetune\" \ 10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_large/checkpoint_1100000\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr 13 | # --multiprocess_gpu \ 14 | # --coordinator_address=${ADDR} \ 15 | # --process_count=${SLURM_NTASKS} \ 16 | # --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/xl_flan.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \ 6 | --gin_file="configs/task/finetune/flan_t5.gin" \ 7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \ 8 | --gin.TRAIN_STEPS=1_138_000 \ 9 | --gin.SAVING_PERIOD=10_000 \ 10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xl/flan_finetune\" \ 11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xl/checkpoint_1100000\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr 14 | # --multiprocess_gpu \ 15 | # --coordinator_address=${ADDR} \ 16 | # --process_count=${SLURM_NTASKS} \ 17 | # --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/benchmarks/t5-v1.1-lm_adapt/xl_flan2021.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \ 6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \ 7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \ 8 | --gin.TRAIN_STEPS=1_138_000 \ 9 | --gin.SAVING_PERIOD=10_000 \ 10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xl/flan2021_finetune\" \ 11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xl/checkpoint_1100000\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr 14 | # --multiprocess_gpu \ 15 | # --coordinator_address=${ADDR} \ 16 | # --process_count=${SLURM_NTASKS} \ 17 | # --process_index=${SLURM_PROCID} 18 | 
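Each finetune script in this directory accepts ADDR as its first argument but leaves the multi-process flags at the bottom commented out, so by default it runs as a single process. A minimal sketch of how that commented block could be re-enabled for a multi-node SLURM run; the scontrol-based host resolution and port 2222 are assumptions for illustration, not taken from this repo:

#!/bin/bash
# Hypothetical multi-node variant of xl_flan2021.sh: derive a coordinator
# address from the first host in the SLURM allocation (port 2222 is an
# arbitrary assumed choice), then launch one t5x.train process per srun task.
ADDR="$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1):2222"

srun python -m t5x.train \
    --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
    --gin_file="configs/task/finetune/flan2021_t5.gin" \
    --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
    --gin.TRAIN_STEPS=1_138_000 \
    --gin.SAVING_PERIOD=10_000 \
    --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xl/flan2021_finetune\" \
    --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xl/checkpoint_1100000\" \
    --gin.USE_CACHED_TASKS=False \
    --alsologtostderr \
    --multiprocess_gpu \
    --coordinator_address=${ADDR} \
    --process_count=${SLURM_NTASKS} \
    --process_index=${SLURM_PROCID}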
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1-lm_adapt/xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_114_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xxl/flan_finetune\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xxl/checkpoint_1100000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1-lm_adapt/xxl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_114_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_lm100k_xxl/flan2021_finetune\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_lm100k_xxl/checkpoint_1100000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.TRAIN_STEPS=1_084_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/finetune_flan2021\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.TRAIN_STEPS=1_084_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/finetune_flan2022\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.TRAIN_STEPS=1_262_144 \
6 | --gin.SAVING_PERIOD=5000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/sglue_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
9 | --gin.USE_CACHED_TASKS=False \
10 | --alsologtostderr
11 |
12 | # --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
13 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/base_t0-train.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
4 | --gin_file="configs/task/finetune/t0_train.gin" \
5 | --gin.TRAIN_STEPS=1_128_000 \
6 | --gin.SAVING_PERIOD=4000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_base/t0-train_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_base/checkpoint_1000000\" \
9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
10 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.TRAIN_STEPS=1_064_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/finetune_flan2021\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.TRAIN_STEPS=1_064_000 \
8 | --gin.SAVING_PERIOD=10_000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/finetune_flan2022\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.TRAIN_STEPS=1_262_144 \
6 | --gin.SAVING_PERIOD=5000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/sglue_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
9 | --gin.USE_CACHED_TASKS=False \
10 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/large_t0-train.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/large.gin" \
4 | --gin_file="configs/task/finetune/t0_train.gin" \
5 | --gin.TRAIN_STEPS=1_128_000 \
6 | --gin.SAVING_PERIOD=4000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_large/t0-train_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_large/checkpoint_1000000\" \
9 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
10 | --alsologtostderr
11 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_038_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xl/finetune_flan2021\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xl_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.TRAIN_STEPS=1_038_000 \
9 | --gin.SAVING_PERIOD=10_000 \
10 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xl/finetune_flan2022\" \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
12 | --gin.USE_CACHED_TASKS=False \
13 | --alsologtostderr
14 | # --multiprocess_gpu \
15 | # --coordinator_address=${ADDR} \
16 | # --process_count=${SLURM_NTASKS} \
17 | # --process_index=${SLURM_PROCID}
18 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xl_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.TRAIN_STEPS=1_262_144 \
6 | --gin.SAVING_PERIOD=5000 \
7 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xl/sglue_finetune\" \
8 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
9 | --gin.USE_CACHED_TASKS=False \
10 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xxl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
7 | --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=2 \
8 | --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
9 | --gin.TRAIN_STEPS=1_014_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xxl/finetune_flan2021\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xxl/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xxl_flan2022.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/finetune/flan2022_t5.gin" \
7 | --gin.TRAIN_STEPS=1_014_000 \
8 | --gin.SAVING_PERIOD=2000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xxl/finetune_flan2022\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xxl/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
13 | # --multiprocess_gpu \
14 | # --coordinator_address=${ADDR} \
15 | # --process_count=${SLURM_NTASKS} \
16 | # --process_index=${SLURM_PROCID}
17 | # --gin.trainer.Trainer.num_microbatches=32 \
18 | # --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=1 \
19 | # --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
--------------------------------------------------------------------------------
/experiments/benchmarks/t5-v1.1/xxl_sglue.sh:
--------------------------------------------------------------------------------
1 |
2 | python3 -m t5x.train \
3 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xxl.gin" \
4 | --gin_file="configs/task/finetune/sglue_t5.gin" \
5 | --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=1 \
6 | --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
7 | --gin.TRAIN_STEPS=1_262_144 \
8 | --gin.SAVING_PERIOD=5000 \
9 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/t5_1_1_xxl/sglue_finetune\" \
10 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xxl/checkpoint_1000000\" \
11 | --gin.USE_CACHED_TASKS=False \
12 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/improved_t5/ablations/v1-1_xl_flan2021_submix.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin_file="configs/task/finetune/flan2021_t5.gin" \
8 | --gin.MIXTURE_OR_TASK_NAME=\"flan2021_submix_original_t5\" \
9 | --gin.TRAIN_STEPS=1_038_000 \
10 | --gin.SAVING_PERIOD=2_000 \
11 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://t5-data/pretrained_models/t5x/t5_1_1_xl/checkpoint_1000000\" \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/ablations/v1_1_xl_flan2021_submix\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/eval_bf16.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/pile_mlm.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.TRAIN_STEPS=1000000 \
10 | --gin.SAVING_PERIOD=10000 \
11 | --gin.network.T5Config.dtype=\"bfloat16\" \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/pile_bf16\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/improved_t5/eval_fp16.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/finetune/pile_mlm.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.TRAIN_STEPS=1000000 \
10 | --gin.SAVING_PERIOD=10000 \
11 | --gin.network.T5Config.dtype=\"float16\" \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/pile_fp16\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/finetune_base_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_184_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/finetune_large_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_164_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/finetune_xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_114_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
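
In these scripts TRAIN_STEPS is the absolute step at which t5x stops, not a number of additional steps: the lm_adapt runs above resume from checkpoint_1000000 and stop at 1_100_000, i.e. a 100k-step adaptation budget. A sketch of the same arithmetic, in the style the preliminary scripts later in this repo use (START_STEP and ADAPT_STEPS are illustrative names, not variables from these files):

    START_STEP=1000000    # step of the pretrained checkpoint being resumed
    ADAPT_STEPS=100000    # prefix-LM adaptation budget
    let "TRAIN_STEPS = START_STEP + ADAPT_STEPS"    # 1100000, the value hard-coded above
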
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000/lm_adapt\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/lm_adapt_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/finetune_base_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_184_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_sc_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_sc/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/finetune_large_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_164_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_sc_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_sc/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/finetune_xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_114_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_sc_flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_sc/checkpoint_1100000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/lm_adapt_sc\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
9 | --gin.TRAIN_STEPS=1_100_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/lm_adapt_sc\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000/lm_adapt_sc\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/s_causal/lm_adapt_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_prefix_lm_causal.gin" \
10 | --gin.TRAIN_STEPS=1_100_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/lm_adapt_sc\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
9 | --gin.TRAIN_STEPS=1_008_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000/ul2r\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_base_ns.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="../t5x/t5x/examples/t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
9 | --gin.TRAIN_STEPS=1_008_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base_mlm_ns/checkpoint_1000000/ul2r\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base_mlm_ns/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
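
The escaped-quote patterns on the --gin overrides above all collapse to the same shape after shell processing: \" becomes a literal double quote, while unescaped '...' or "..." pairs are consumed by the shell, so gin always receives a double-quoted string literal. A runnable check (the paths are illustrative, not repo values):

    # Each line prints the identical token the training scripts pass to gin:
    echo --gin.MODEL_DIR=\"gs://bucket/run\"
    echo --gin.MODEL_DIR=\"'gs://bucket/run'\"
    PREFIX="gs://bucket/"; echo --gin.MODEL_DIR=\"${PREFIX}'run'\"
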
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
9 | --gin.TRAIN_STEPS=1_008_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000/ul2r\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large_mlm/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
10 | --gin.TRAIN_STEPS=1_008_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000/ul2r\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/lm_adapt/ul2r/lm_adapt_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/pile_ul2r.gin" \
10 | --gin.TRAIN_STEPS=1_008_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000/ul2r\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl_mlm/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/finetune_flan2022.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | STEP=$2
3 | INIT_DIR=$3
4 | MODEL_DIR=$4
5 |
6 | python -m t5x.train \
7 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin_file="configs/task/finetune/flan2022.gin" \
11 | --gin.train.use_orbax=False \
12 | --gin.TRAIN_STEPS=${STEP} \
13 | --gin.SAVING_PERIOD=2000 \
14 | --gin.INITIAL_CHECKPOINT_PATH=\"${INIT_DIR}\" \
15 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
16 | --gin.USE_CACHED_TASKS=False \
17 | --alsologtostderr
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --multiprocess_gpu \
20 | # --coordinator_address=${ADDR} \
21 | # --process_count=${SLURM_NTASKS} \
22 | # --process_index=${SLURM_PROCID}
23 |
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/finetune_sglue.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | START_STEP=$2
3 | INIT_DIR=$3
4 | MODEL_DIR=$4
5 |
6 | TRAIN_STEPS=$(( ${START_STEP} + 262144 ))
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
10 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
11 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
12 | --gin_file="configs/task/finetune/sglue.gin" \
13 | --gin.train.use_orbax=False \
14 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
15 | --gin.SAVING_PERIOD=5000 \
16 | --gin.train.infer_eval_dataset_cfg=None \
17 | --gin.INITIAL_CHECKPOINT_PATH=\"${INIT_DIR}\" \
18 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
19 | --gin.USE_CACHED_TASKS=False \
20 | --alsologtostderr
21 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
22 | # --gin.Trainer.num_microbatches=2 \
23 |
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/finetune_t0.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | STEP=$2
3 | INIT_DIR=$3
4 | MODEL_DIR=$4
5 |
6 | python -m t5x.train \
7 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin_file="configs/task/finetune/t0_train.gin" \
11 | --gin.train.use_orbax=False \
12 | --gin.TRAIN_STEPS=${STEP} \
13 | --gin.SAVING_PERIOD=2000 \
14 | --gin.INITIAL_CHECKPOINT_PATH=\"${INIT_DIR}\" \
15 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
16 | --gin.USE_CACHED_TASKS=False \
17 | --alsologtostderr
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --multiprocess_gpu \
20 | # --coordinator_address=${ADDR} \
21 | # --process_count=${SLURM_NTASKS} \
22 | # --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/pretrain_mlm.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | TRAIN_STEPS=$2
3 | MODEL_DIR=$3
4 |
5 | python -m t5x.train \
6 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
7 | --gin_file="configs/task/pretrain/pile_mlm.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.train.use_orbax=False \
11 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 | # --gin.Trainer.num_microbatches=2 \
17 | # --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=2 \
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
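
pretrain_mlm.sh takes the model size, the absolute stopping step, and the output directory as positional arguments, and reads the tokenizer from ${GCP_BUCKET}/vocabs/tokenizer.model. A hypothetical invocation (the bucket name is an assumption, not a value from this repo):

    export GCP_BUCKET=gs://my-bucket
    bash experiments/improved_t5/mlm/pretrain_mlm.sh large 2000000 gs://my-bucket/ckpts/v2_large_mlm
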
--------------------------------------------------------------------------------
/experiments/improved_t5/mlm/pretrain_mlm_causal.sh:
--------------------------------------------------------------------------------
1 | SIZE=$1
2 | TRAIN_STEPS=$2
3 | MODEL_DIR=$3
4 |
5 | python -m t5x.train \
6 | --gin_file="models/scalable_t5/t5_1_1/${SIZE}.gin" \
7 | --gin_file="configs/task/pretrain/pile_mlm_causal.gin" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""${GCP_BUCKET}/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.train.use_orbax=False \
11 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"${MODEL_DIR}\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 | # --gin.Trainer.num_microbatches=2 \
17 | # --gin.partitioning.standard_logical_axis_rules.activation_partitioning_dims=2 \
18 | # --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 8, 1)" \
19 | # --gin.partitioning.standard_logical_axis_rules.parameter_partitioning_dims=2 \
20 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_base_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_084_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000/flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_base_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_base/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_large_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_064_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000/flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_large_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_large/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xl_flan2021.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin_file="configs/task/finetune/flan2021.gin" \
10 | --gin.TRAIN_STEPS=1_038_000 \
11 | --gin.SAVING_PERIOD=10_000 \
12 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000/finetune_flan2021\" \
13 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 | # --multiprocess_gpu \
17 | # --coordinator_address=${ADDR} \
18 | # --process_count=${SLURM_NTASKS} \
19 | # --process_index=${SLURM_PROCID}
20 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xl_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xl/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xxl_flan.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/flan.gin" \
9 | --gin.TRAIN_STEPS=1_038_000 \
10 | --gin.SAVING_PERIOD=10_000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000/flan_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/finetune_xxl_sglue.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
7 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
8 | --gin_file="configs/task/finetune/sglue.gin" \
9 | --gin.TRAIN_STEPS=1_262_144 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000/sglue_finetune\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"gs://improved-t5/ckpts/v2_xxl/checkpoint_1000000\" \
13 | --seqio_additional_cache_dirs=\"gs://improved-t5/data\" \
14 | --alsologtostderr
15 | # --multiprocess_gpu \
16 | # --coordinator_address=${ADDR} \
17 | # --process_count=${SLURM_NTASKS} \
18 | # --process_index=${SLURM_PROCID}
19 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_base.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
10 | --gin.TRAIN_STEPS=2000000 \
11 | --gin.SAVING_PERIOD=10000 \
12 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_base/'\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_large.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/large.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
8 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
9 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
10 | --gin.TRAIN_STEPS=2000000 \
11 | --gin.SAVING_PERIOD=10000 \
12 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_large/'\" \
13 | --gin.USE_CACHED_TASKS=False \
14 | --alsologtostderr
15 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_xl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
11 | --gin.TRAIN_STEPS=1000000 \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_xl/'\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/improved_t5/ul2_causal/pretrain_xxl.sh:
--------------------------------------------------------------------------------
1 | ADDR=$1
2 | MODEL_DIR=$2
3 |
4 | python -m t5x.train \
5 | --gin_file="models/scalable_t5/t5_1_1/xxl.gin" \
6 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \
7 | --gin.partitioning.PjitPartitioner.model_parallel_submesh="(1, 1, 2, 1)" \
8 | --gin.seqio.SentencePieceVocabulary.sentencepiece_model_file=\""gs://improved-t5/vocabs/tokenizer.model"\" \
9 | --gin.seqio.SentencePieceVocabulary.extra_ids=100 \
10 | --gin.MIXTURE_OR_TASK_NAME=\""pile_ul2_causal_0_50"\" \
11 | --gin.TRAIN_STEPS=1000000 \
12 | --gin.SAVING_PERIOD=10000 \
13 | --gin.MODEL_DIR=\"'gs://improved-t5/ckpts/v2_xxl/'\" \
14 | --gin.USE_CACHED_TASKS=False \
15 | --alsologtostderr
16 |
--------------------------------------------------------------------------------
/experiments/preliminary/layernorm/eval/t0_eval_alibi_relpos.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | python -m t5x.train \
7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
8 | --gin_file="configs/task/eval/t0_eval.gin" \
9 | --gin.TRAIN_STEPS=135000 \
10 | --gin.SAVING_PERIOD=5000 \
11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_eval/'\" \
12 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_train/checkpoint_135000'\" \
13 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
14 | --gin.USE_CACHED_TASKS=True \
15 | --alsologtostderr \
16 | --multiprocess_gpu \
17 | --coordinator_address=${ADDR} \
18 | --process_count=${SLURM_NTASKS} \
19 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/layernorm/eval/t0_eval_base_lm100k.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 | export PREFIX="/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/with_abs_pos/"
3 |
4 | ADDR=$1
5 | MODEL_DIR=$2
6 |
7 | python -m t5x.train \
8 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
--gin_file="configs/task/eval/t0_eval.gin" \ 10 | --gin_file="configs/exp/LayerNorm/reset_optim.gin" \ 11 | --gin.TRAIN_STEPS=1110000 \ 12 | --gin.SAVING_PERIOD=5000 \ 13 | --gin.MODEL_DIR=\"${PREFIX}'base_lm100k/finetune_t0_eval/'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"${PREFIX}'base_lm100k/finetune_t0_train/checkpoint_1110000/'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --gin.USE_CACHED_TASKS=True \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} 22 | 23 | -------------------------------------------------------------------------------- /experiments/preliminary/layernorm/pretrain/pile_mlm_adafactor_post.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin_file="configs/exp/LayerNorm/post_layernorm.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/pile_mlm_adafactor_post/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/layernorm/pretrain/pile_mlm_adamw_post.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 5 | export PREFIX="/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/with_abs_pos/" 6 | 7 | python -m t5x.train \ 8 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 10 | --gin_file="configs/exp/LayerNorm/training.gin" \ 11 | --gin_file="configs/exp/LayerNorm/post_layernorm.gin" \ 12 | --gin.TRAIN_STEPS=125000 \ 13 | --gin.SAVING_PERIOD=25000 \ 14 | --gin.MODEL_DIR=\"${PREFIX}'pile_mlm_adamw_post/'\" \ 15 | --gin.USE_CACHED_TASKS=False \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/layernorm/pretrain/pile_mlm_adamw_pre.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 5 | export PREFIX="/fsx/lintangsutawika/improved_t5/ckpts/LayerNorm/with_abs_pos/" 6 | 7 | python -m t5x.train \ 8 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 10 | --gin_file="configs/exp/LayerNorm/training.gin" \ 11 | --gin_file="configs/exp/LayerNorm/pre_layernorm.gin" \ 12 | --gin.TRAIN_STEPS=125000 \ 13 | --gin.SAVING_PERIOD=25000 \ 14 | --gin.MODEL_DIR=\"${PREFIX}'pile_mlm_adamw_pre/'\" \ 15 | --gin.USE_CACHED_TASKS=False \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- 
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_10/sglue_finetune_920m_32000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 32000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/finetune_32k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/checkpoint_32000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_10/sglue_finetune_920m_64000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 64000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/finetune_64k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/checkpoint_64000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_10/sglue_finetune_920m_96000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 96000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/finetune_96k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/checkpoint_96000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_15/sglue_finetune_920m_32000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 32000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/finetune_32k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/checkpoint_32000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_15/sglue_finetune_920m_64000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 64000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/finetune_64k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/checkpoint_64000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_15/sglue_finetune_920m_96000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 96000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/finetune_96k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/checkpoint_96000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_128000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 128000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_128k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_128000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_160000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 160000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_160k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_160000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_192000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 192000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_192k/'\" \
15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_192000/'\" \
16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
17 | --alsologtostderr \
18 | --multiprocess_gpu \
19 | --coordinator_address=${ADDR} \
20 | --process_count=${SLURM_NTASKS} \
21 | --process_index=${SLURM_PROCID}
--------------------------------------------------------------------------------
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_224000.sh:
--------------------------------------------------------------------------------
1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
2 |
3 | ADDR=$1
4 | MODEL_DIR=$2
5 |
6 | let "TRAIN_STEPS = 224000 + 128000"
7 |
8 | python -m t5x.train \
9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \
10 | --gin_file="configs/size/920m/vanilla.gin" \
11 | --gin_file="configs/task/finetune/sglue.gin" \
12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \
13 | --gin.SAVING_PERIOD=2_000 \
14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_224k/'\" \
15 |
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_224000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_256000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 256000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_256k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_256000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | 
--coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_25/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_128000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 128000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_128k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_128000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_160000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 160000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_160k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_160000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- 
/experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_192000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 192000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_192k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_192000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_224000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 224000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_224k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_224000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_256000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 256000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_256k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_256000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 
2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_50/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_128000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 128000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | 
--gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_128k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_128000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_160000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 160000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_160k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_160000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_192000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 192000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_192k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_192000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_224000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 224000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_224k/'\" \ 15 | 
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_224000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_60/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | 
--coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_75/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 32000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/sglue_32k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/checkpoint_32000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_75/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 64000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/sglue_64k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/checkpoint_64000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/finetune/0_75/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | let "TRAIN_STEPS = 96000 + 128000" 7 | 8 | python -m t5x.train \ 9 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="configs/task/finetune/sglue.gin" \ 12 | --gin.TRAIN_STEPS=${TRAIN_STEPS} \ 13 | --gin.SAVING_PERIOD=2_000 \ 14 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/sglue_96k/'\" \ 15 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/checkpoint_96000/'\" \ 16 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 17 | --alsologtostderr \ 18 | --multiprocess_gpu \ 19 | --coordinator_address=${ADDR} \ 20 | --process_count=${SLURM_NTASKS} \ 21 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- 
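The sglue_finetune scripts above are identical up to two values: the UL2/causal mixture tag of the pretraining run (0_10 through 0_75) and the pretraining checkpoint step they resume from, with every run finetuning for a further 128,000 steps on SuperGLUE. Below is a minimal sketch of one parameterized launcher that could stand in for the whole family; it is editorial, not a file in this repository, and RATIO, STEP, and ADDR are illustrative parameter names that assume the directory layout and `sglue_*k` naming used by the 0_25 and later scripts.

#!/bin/bash
# Sketch only: collapses the per-ratio/per-step sglue_finetune_920m_*.sh
# scripts above into one launcher. RATIO (e.g. 0_25) and STEP (e.g. 32000)
# are illustrative parameters, not part of the original repository.
RATIO=$1
STEP=$2
ADDR=$3

export CACHED_DATA_DIR="/fsx/lintangsutawika/data"
CKPT_ROOT="/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_${RATIO}"

# Finetune for 128k steps on top of the selected pretraining checkpoint.
let "TRAIN_STEPS = STEP + 128000"

python -m t5x.train \
  --gin_file="models/scalable_t5/t5_1_1/base.gin" \
  --gin_file="configs/size/920m/vanilla.gin" \
  --gin_file="configs/task/finetune/sglue.gin" \
  --gin.TRAIN_STEPS=${TRAIN_STEPS} \
  --gin.SAVING_PERIOD=2_000 \
  --gin.MODEL_DIR=\""${CKPT_ROOT}/sglue_$((STEP / 1000))k/"\" \
  --gin.INITIAL_CHECKPOINT_PATH=\""${CKPT_ROOT}/checkpoint_${STEP}/"\" \
  --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
  --alsologtostderr \
  --multiprocess_gpu \
  --coordinator_address=${ADDR} \
  --process_count=${SLURM_NTASKS} \
  --process_index=${SLURM_PROCID}

Note the quoting: because the paths now contain shell variables, the escaped double quote sits outside a double-quoted expansion (\""…"\") rather than outside single quotes as in the original scripts, so gin still receives MODEL_DIR="…" while the variables expand. Invocation would look like, e.g., running this script with arguments 0_25 32000 and the coordinator address, under the same srun setup as the originals.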
/experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_10'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_10/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_15'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_15/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_25'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_25/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_50'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_50/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 |
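Each of these scripts takes the coordinator address as its first positional argument and reads SLURM_NTASKS and SLURM_PROCID from the environment, so they are meant to be launched once per process by srun inside a Slurm allocation. A hypothetical submission wrapper is sketched below; the node count, GPU count, and port are assumptions, since the real sbatch files are not part of this repository.

#!/bin/bash
# Sketch only: illustrates how the training scripts in this directory are
# driven under Slurm. All #SBATCH values and the port are assumptions; the
# repository does not include the real submission files.
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8

# Use the first node of the allocation as the JAX distributed coordinator.
COORD_HOST=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
COORD_ADDR="${COORD_HOST}:29500"

# srun launches one copy of the script per task; each copy picks up its own
# SLURM_PROCID and passes the shared coordinator address as $1.
srun bash experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_10.sh "${COORD_ADDR}"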
-------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_0_75.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_0_75'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_0_75/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_c4/920m_c4_mlm_1_00.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'c4_mlm_1_00'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/ckpts/base_c4_mlm_1_00/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_10'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_10/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_15.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_15'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_15/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 |
--alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_25.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_25'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_25/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_50.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_50'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_50/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_60.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | --gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_60'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_60/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/pretrain_pile/920m_pile_mix_0_75.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/size/920m/vanilla.gin" \ 9 | --gin_file="configs/task/pretrain/pile_mixed_objective.gin" \ 10 | 
--gin.MIXTURE_OR_TASK_NAME=\"'pile_ul2_causal_0_75'\" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/920m_pile_ul2_causal_0_75/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --gin.TRAIN_STEPS=256000 \ 14 | --gin.SAVING_PERIOD=32000 \ 15 | --gin.BATCH_SIZE=2048 \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_0-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_0-10.png -------------------------------------------------------------------------------- /experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_flop_256k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/mixed_pretraining_objectives/super_glue_performance_flop_256k.png -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_1.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_1/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=1 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_1/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_2.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_2/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=2 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_2/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_4.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_4/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=4 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_4/'\" \ 12 | 
--gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/partition/pretrain_pile_8.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | rm -rf /fsx/lintangsutawika/improved_t5/ckpts/partition_8/ 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/xl.gin" \ 8 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 9 | --gin_file="configs/exp/partition.gin" \ 10 | --gin.NUM_PARTITIONS=8 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/partition_8/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 1024}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 114}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 2048}" \ 10 |
--gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 256}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1024_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 512}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 1024}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 |
--gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 114}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.EVAL_BATCH_SIZE=128 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 2048}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 256}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_1_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1, 'targets': 512}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 |
--gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 1024}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 114}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 2048}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 | --gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 256}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/no_alibi_512_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/partition.gin" \ 5 |
--gin.BATCH_SIZE=2048 \ 6 | --gin.TRAIN_STEPS=125000 \ 7 | --gin.SAVING_PERIOD=25000 \ 8 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 512}" \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 1024}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 114}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 2048}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_256.sh:
-------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 256}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1024_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 512}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 1024}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_114.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 114}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 |
--coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 2048}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_256.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 256}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_1_512.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.EVAL_BATCH_SIZE=128 \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 1024, 'targets': 512}" \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 12 | --gin.USE_CACHED_TASKS=False \ 13 | --alsologtostderr \ 14 | --multiprocess_gpu \ 15 | --coordinator_address=${ADDR} \ 16 | --process_count=${SLURM_NTASKS} \ 17 | --process_index=${SLURM_PROCID} 18 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_1024.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 |
--gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 1024}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_114.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.BATCH_SIZE=2048 \ 9 | --gin.TRAIN_STEPS=125000 \ 10 | --gin.SAVING_PERIOD=25000 \ 11 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 114}" \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_dot_relpos/'\" \ 13 | --gin.USE_CACHED_TASKS=False \ 14 | --alsologtostderr \ 15 | --multiprocess_gpu \ 16 | --coordinator_address=${ADDR} \ 17 | --process_count=${SLURM_NTASKS} \ 18 | --process_index=${SLURM_PROCID} 19 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_2048.sh: -------------------------------------------------------------------------------- 1 | python -m t5x.train \ 2 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 3 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 4 | --gin_file="configs/exp/alibi.gin" \ 5 | --gin_file="configs/task/partition.gin" \ 6 | --gin.BATCH_SIZE=2048 \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 2048}" \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_256.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.BATCH_SIZE=2048 \ 9 | --gin.TRAIN_STEPS=125000 \ 10 | --gin.SAVING_PERIOD=25000 \ 11 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 256}" \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_dot_relpos/'\" \ 13 | --gin.USE_CACHED_TASKS=False \ 14 | --alsologtostderr \ 15 | --multiprocess_gpu \ 16 | --coordinator_address=${ADDR} \ 17 | --process_count=${SLURM_NTASKS} \ 18 | --process_index=${SLURM_PROCID} 19 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/eval_perplexity/with_alibi_512_512.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m 
t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.BATCH_SIZE=2048 \ 9 | --gin.TRAIN_STEPS=125000 \ 10 | --gin.SAVING_PERIOD=25000 \ 11 | --gin.TASK_FEATURE_LENGTHS="{'inputs': 512, 'targets': 512}" \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_dot_relpos/'\" \ 13 | --gin.USE_CACHED_TASKS=False \ 14 | --alsologtostderr \ 15 | --multiprocess_gpu \ 16 | --coordinator_address=${ADDR} \ 17 | --process_count=${SLURM_NTASKS} \ 18 | --process_index=${SLURM_PROCID} 19 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/finetune_sglue_prefix_lm_no_alibi.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/finetune/sglue.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=2000 \ 11 | --gin.BATCH_SIZE=2048 \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/sglue_finetune_no_alibi/'\" \ 13 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/checkpoint_125000/'\" \ 14 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 15 | --alsologtostderr \ 16 | --multiprocess_gpu \ 17 | --coordinator_address=${ADDR} \ 18 | --process_count=${SLURM_NTASKS} \ 19 | --process_index=${SLURM_PROCID} 20 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/finetune_sglue_prefix_lm_with_alibi.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/finetune/sglue.gin" \ 9 | --gin_file="configs/exp/alibi.gin" \ 10 | --gin.TRAIN_STEPS=135000 \ 11 | --gin.SAVING_PERIOD=2000 \ 12 | --gin.BATCH_SIZE=2048 \ 13 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/sglue_finetune_with_alibi/'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/checkpoint_125000/'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 21 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_no_alibi.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_no_alibi/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | 
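The launch scripts in this experiment assume a SLURM allocation: SLURM_NTASKS and SLURM_PROCID are injected by srun, while ADDR (the JAX multiprocess coordinator address) is read from the first positional argument in the scripts that begin with ADDR=$1 and must already be exported by the caller in the eval_perplexity scripts above, which reference ${ADDR} without setting it. The wrapper below is a minimal sketch of one way to supply all three; the node count, partition name, and port are hypothetical placeholders, not values taken from this repository.

    #!/bin/bash
    #SBATCH --nodes=4                 # hypothetical node count
    #SBATCH --ntasks-per-node=1       # one t5x process per node
    #SBATCH --partition=gpu           # hypothetical partition name

    # Use the first node of the allocation as the JAX coordinator.
    MASTER_NODE=$(scontrol show hostnames "${SLURM_JOB_NODELIST}" | head -n 1)
    export ADDR="${MASTER_NODE}:12345"   # port chosen arbitrarily

    # srun starts one task per node and sets SLURM_PROCID for each;
    # the training script reads SLURM_NTASKS and SLURM_PROCID directly.
    srun bash experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_no_alibi.sh "${ADDR}"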
-------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_with_alibi.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/alibi/pretrain_pile_prefix_lm_with_alibi_plus_relpos.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_prefix_lm.gin" \ 7 | --gin_file="configs/exp/alibi_plus_relpos.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=5000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/pile_prefix_lm_with_alibi_plus_relpos/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/pretrain_rotary_pile_mlm.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin_file="configs/exp/rotary.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/rotary_pile_mlm/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/eval/t0_eval_alibi_relpos.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/eval/t0_eval.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=5000 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_eval/'\" \ 12 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/alibi_relpos_pile_mlm/finetune_t0_train/checkpoint_135000'\" \ 13 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 14 | --gin.USE_CACHED_TASKS=True \ 15 | --alsologtostderr \ 16 | --multiprocess_gpu \ 17 | --coordinator_address=${ADDR} \ 18 | --process_count=${SLURM_NTASKS} \ 19 | --process_index=${SLURM_PROCID} 
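Note that these eval scripts read ADDR=$1 and MODEL_DIR=$2 but then hard-code --gin.MODEL_DIR and --gin.INITIAL_CHECKPOINT_PATH, so the second argument is effectively ignored. Below is a hedged sketch of the same launch with the paths taken from the command line instead; the t5x flags are the ones used throughout this repository, and only the variable plumbing (including a third argument for the checkpoint) is new.

    ADDR=$1        # JAX coordinator address, e.g. node001:12345
    MODEL_DIR=$2   # directory where eval results are written
    CKPT=$3        # fine-tuned checkpoint to restore

    python -m t5x.train \
        --gin_file="models/scalable_t5/t5_1_1/base.gin" \
        --gin_file="configs/task/eval/t0_eval.gin" \
        --gin.TRAIN_STEPS=135000 \
        --gin.SAVING_PERIOD=5000 \
        --gin.MODEL_DIR=\"'${MODEL_DIR}'\" \
        --gin.INITIAL_CHECKPOINT_PATH=\"'${CKPT}'\" \
        --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \
        --gin.USE_CACHED_TASKS=True \
        --alsologtostderr \
        --multiprocess_gpu \
        --coordinator_address=${ADDR} \
        --process_count=${SLURM_NTASKS} \
        --process_index=${SLURM_PROCID}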
-------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/eval/t0_eval_benchmark.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/eval/t0_eval.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=5000 \ 11 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/benchmark_pile_mlm/finetune_t0_eval/'\" \ 12 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/METRO/benchmark_pile_mlm/finetune_t0_train/checkpoint_135000'\" \ 13 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 14 | --gin.USE_CACHED_TASKS=True \ 15 | --alsologtostderr \ 16 | --multiprocess_gpu \ 17 | --coordinator_address=${ADDR} \ 18 | --process_count=${SLURM_NTASKS} \ 19 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/finetune/sglue/sglue_train_benchmark.sh: -------------------------------------------------------------------------------- 1 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 2 | 3 | ADDR=$1 4 | MODEL_DIR=$2 5 | 6 | python -m t5x.train \ 7 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 8 | --gin_file="configs/task/finetune/sglue.gin" \ 9 | --gin.TRAIN_STEPS=135000 \ 10 | --gin.SAVING_PERIOD=2000 \ 11 | --gin.BATCH_SIZE=2048 \ 12 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/benchmark_pile_mlm/finetune_sglue/'\" \ 13 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/benchmark_pile_mlm/checkpoint_125000/'\" \ 14 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 15 | --gin.USE_CACHED_TASKS=True \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/pretrain_benchmark_pile_mlm.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin.TRAIN_STEPS=125000 \ 8 | --gin.SAVING_PERIOD=25000 \ 9 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/benchmark_pile_mlm/'\" \ 10 | --gin.USE_CACHED_TASKS=False \ 11 | --alsologtostderr \ 12 | --multiprocess_gpu \ 13 | --coordinator_address=${ADDR} \ 14 | --process_count=${SLURM_NTASKS} \ 15 | --process_index=${SLURM_PROCID} 16 | -------------------------------------------------------------------------------- /experiments/preliminary/positional_embeddings/rotary/pretrain_rotary_relpos_pile_mlm.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="models/scalable_t5/t5_1_1/base.gin" \ 6 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 7 | --gin_file="configs/exp/rotary_relpos.gin" \ 8 | --gin.TRAIN_STEPS=125000 \ 9 | --gin.SAVING_PERIOD=25000 \ 10 | --gin.MODEL_DIR=\"'/fsx/lintangsutawika/improved_t5/ckpts/rotary/rotary_relpos_pile_mlm/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | 
--alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_16000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=144_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_16k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_16000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_32000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=160_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_32k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_32000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_48000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=176_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_48k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_48000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_64000.sh: 
-------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/110m/sglue_finetune_110m_80000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/110m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=208_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/110m/vanilla_80k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/110m/checkpoint_80000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_128000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=256_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_128k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_128000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_192000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | 
--gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=320_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_192k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_192000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_256000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=384_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_256k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_256000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_320000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=448_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_320k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_320000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_384000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=512_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_384k_finetune'\" \ 14 | 
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_384000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_424000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=552_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_424k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_424000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_448000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=576_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_448k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_448000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_512000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=640_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_512k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_512000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} 
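A pattern worth noting in these scaling-law scripts: t5x counts TRAIN_STEPS cumulatively from the restored checkpoint, so every script sets TRAIN_STEPS to the pretraining checkpoint step plus 128,000 (e.g. checkpoint_512000 with TRAIN_STEPS=640_000), meaning each run fine-tunes on SuperGLUE for 128k additional steps. Since the scripts differ only in that step count, they could be generated from a single template; the loop below is a sketch of that idea (the generator itself is new, while the paths and flags mirror the scripts above).

    #!/bin/bash
    # Sketch: regenerate the per-checkpoint SuperGLUE fine-tuning scripts for
    # one model size. FINETUNE_STEPS is the constant offset observed in every
    # script in this directory; the checkpoint list is an example.
    SIZE=1_6b
    FINETUNE_STEPS=128000
    for CKPT in 64000 128000 192000 256000 320000 384000 448000 512000; do
      cat > "sglue_finetune_${SIZE}_${CKPT}.sh" <<EOF
    export T5X_DIR="/fsx/lintangsutawika/t5x/"
    export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs"
    export CACHED_DATA_DIR="/fsx/lintangsutawika/data"

    ADDR=\$1
    MODEL_DIR=\$2

    python -m t5x.train \\
      --gin_file="configs/finetune_sglue.gin" \\
      --gin_file="\${CONFIG_PATH}/size/${SIZE}/vanilla.gin" \\
      --gin_file="\${CONFIG_PATH}/mode/gpu.gin" \\
      --gin.TRAIN_STEPS=$((CKPT + FINETUNE_STEPS)) \\
      --gin.MODEL_DIR=\\"'/fsx/aran/jax/ckpts/scaling/${SIZE}/vanilla_$((CKPT / 1000))k_finetune'\\" \\
      --gin.INITIAL_CHECKPOINT_PATH=\\"'/fsx/aran/jax/ckpts/scaling/${SIZE}/checkpoint_${CKPT}'\\" \\
      --seqio_additional_cache_dirs="\${CACHED_DATA_DIR}" \\
      --alsologtostderr \\
      --multiprocess_gpu \\
      --coordinator_address=\${ADDR} \\
      --process_count=\${SLURM_NTASKS} \\
      --process_index=\${SLURM_PROCID}
    EOF
    done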
-------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/1_6b/sglue_finetune_1_6b_64000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="${CONFIG_PATH}/size/1_6b/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/1_6b/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/1_6b/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_16000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=144_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_16k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_16000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_24000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=152_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_24k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_24000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_32000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 
3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=160_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_32k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_32000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_40000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=168_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_40k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_40000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_48000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=176_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_48k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_48000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_56000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=184_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_56k_finetune'\" \ 14 | 
--gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_56000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_64000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/25m/sglue_finetune_25m_8000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/25m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=136_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/25m/vanilla_8k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/25m/checkpoint_8000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/pretrain_c4.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="configs/task/pretrain/c4_mlm.gin" \ 6 | --gin_file="configs/size/920m/vanilla.gin" \ 7 | --gin_file="configs/exp/scaling.gin" \ 8 | --gin.TRAIN_STEPS=256000 \ 9 | --gin.SAVING_PERIOD=32000 \ 10 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m_c4/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/pretrain_pile.sh: -------------------------------------------------------------------------------- 1 | ADDR=$1 2 | MODEL_DIR=$2 3 | 4 | python -m t5x.train \ 5 | --gin_file="configs/task/pretrain/pile_mlm.gin" \ 6 | 
--gin_file="configs/size/920m/vanilla.gin" \ 7 | --gin_file="configs/exp/scaling.gin" \ 8 | --gin.TRAIN_STEPS=256000 \ 9 | --gin.SAVING_PERIOD=32000 \ 10 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m_pile/'\" \ 11 | --gin.USE_CACHED_TASKS=False \ 12 | --alsologtostderr \ 13 | --multiprocess_gpu \ 14 | --coordinator_address=${ADDR} \ 15 | --process_count=${SLURM_NTASKS} \ 16 | --process_index=${SLURM_PROCID} 17 | -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_128000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=256_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_128k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_128000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_160000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=288_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_160k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_160000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_192000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=320_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_192k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_192000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | 
--process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_224000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=352_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_224k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_224000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_256000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=384_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_256k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_256000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_32000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=160_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_32k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_32000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_64000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | 
export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=192_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_64k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_64000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/920m/sglue_finetune_920m_96000.sh: -------------------------------------------------------------------------------- 1 | export T5X_DIR="/fsx/lintangsutawika/t5x/" 2 | export CONFIG_PATH="/fsx/aran/jax/t5x_2/architecture-objective/experiments/configs" 3 | export CACHED_DATA_DIR="/fsx/lintangsutawika/data" 4 | 5 | ADDR=$1 6 | MODEL_DIR=$2 7 | 8 | python -m t5x.train \ 9 | --gin_file="configs/finetune_sglue.gin" \ 10 | --gin_file="configs/size/920m/vanilla.gin" \ 11 | --gin_file="${CONFIG_PATH}/mode/gpu.gin" \ 12 | --gin.TRAIN_STEPS=224_000 \ 13 | --gin.MODEL_DIR=\"'/fsx/aran/jax/ckpts/scaling/920m/vanilla_96k_finetune'\" \ 14 | --gin.INITIAL_CHECKPOINT_PATH=\"'/fsx/aran/jax/ckpts/scaling/920m/checkpoint_96000'\" \ 15 | --seqio_additional_cache_dirs="${CACHED_DATA_DIR}" \ 16 | --alsologtostderr \ 17 | --multiprocess_gpu \ 18 | --coordinator_address=${ADDR} \ 19 | --process_count=${SLURM_NTASKS} \ 20 | --process_index=${SLURM_PROCID} -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/super_glue_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance.png -------------------------------------------------------------------------------- /experiments/preliminary/scaling_laws/super_glue_performance_flop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance_flop.png -------------------------------------------------------------------------------- /models/decoder_t5/__init__.py: -------------------------------------------------------------------------------- 1 | # from typing import TYPE_CHECKING 2 | 3 | # _import_structure = {"configuration_t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config", "T5OnnxConfig"]} 4 | 5 | # _import_structure["modeling_t5"] = [ 6 | # # "T5_PRETRAINED_MODEL_ARCHIVE_LIST", 7 | # # "T5EncoderModel", 8 | # "DecoderT5ForConditionalGeneration", 9 | # # "T5Model", 10 | # # "T5PreTrainedModel", 11 | # # "load_tf_weights_in_t5", 12 | # ] -------------------------------------------------------------------------------- /models/scalable_t5/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The T5X Authors. 
--------------------------------------------------------------------------------
/experiments/preliminary/scaling_laws/super_glue_performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance.png
--------------------------------------------------------------------------------
/experiments/preliminary/scaling_laws/super_glue_performance_flop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EleutherAI/improved-t5/08beb733e5df07d24fbf95f720d7e87e09c547cd/experiments/preliminary/scaling_laws/super_glue_performance_flop.png
--------------------------------------------------------------------------------
/models/decoder_t5/__init__.py:
--------------------------------------------------------------------------------
# from typing import TYPE_CHECKING

# _import_structure = {"configuration_t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config", "T5OnnxConfig"]}

# _import_structure["modeling_t5"] = [
#     # "T5_PRETRAINED_MODEL_ARCHIVE_LIST",
#     # "T5EncoderModel",
#     "DecoderT5ForConditionalGeneration",
#     # "T5Model",
#     # "T5PreTrainedModel",
#     # "load_tf_weights_in_t5",
# ]
--------------------------------------------------------------------------------
/models/scalable_t5/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/large.gin:
--------------------------------------------------------------------------------
# mT5 Large model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 1024
  num_heads = 16
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 2816
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/small.gin:
--------------------------------------------------------------------------------
# mT5 Small model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 512
  num_heads = 6
  num_encoder_layers = 8
  num_decoder_layers = 8
  head_dim = 64
  mlp_dim = 1024
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/xl.gin:
--------------------------------------------------------------------------------
# mT5 XL model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 2048
  num_heads = 32
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 5120
--------------------------------------------------------------------------------
/models/scalable_t5/mt5/xxl.gin:
--------------------------------------------------------------------------------
# mT5 XXL model.

include 't5x/examples/scalable_t5/mt5/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 4096
  num_heads = 64
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 10240
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/examples/__init__.py:
--------------------------------------------------------------------------------
# Copyright 2022 The T5X Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This empty file is needed for loading the gin files in this directory.
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/large.gin:
--------------------------------------------------------------------------------
# T5.1.1 Large model.

include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 1024
  num_heads = 16
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 2816
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/small.gin:
--------------------------------------------------------------------------------
# T5.1.1 Small model.

include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 512
  num_heads = 6
  num_encoder_layers = 8
  num_decoder_layers = 8
  head_dim = 64
  mlp_dim = 1024
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/xl.gin:
--------------------------------------------------------------------------------
# T5.1.1 XL model.

import __main__ as train_script

from t5x import partitioning
include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 2048
  num_heads = 32
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 5120
--------------------------------------------------------------------------------
/models/scalable_t5/t5_1_1/xxl.gin:
--------------------------------------------------------------------------------
# T5.1.1 XXL model.

include 'models/scalable_t5/t5_1_1/base.gin'  # imports vocab, optimizer and model.

# ------------------- Network specification overrides --------------------------
network.Transformer.config = @network.T5Config()
network.T5Config:
  emb_dim = 4096
  num_heads = 64
  num_encoder_layers = 24
  num_decoder_layers = 24
  head_dim = 64
  mlp_dim = 10240
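
Each size gin above follows the same recipe: include the family's base.gin, then override individual fields of network.T5Config. In the experiment scripts these files are stacked with task configs through repeated --gin_file flags, and single bindings can be overridden last on the command line; a hypothetical composition sketch (the task-gin path, output directory, and dropout override are illustrative, not from the repository):

# Sketch: layer a task config, a size config, and an ad-hoc gin binding.
# Later --gin_file flags and --gin. bindings take precedence over earlier ones.
python -m t5x.train \
  --gin_file="configs/finetune_sglue.gin" \
  --gin_file="models/scalable_t5/t5_1_1/large.gin" \
  --gin.network.T5Config.dropout_rate=0.1 \
  --gin.MODEL_DIR=\"/tmp/t5_1_1_large_sglue\" \
  --alsologtostderr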
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='improved_t5',
    packages=[
        'data',
    ]
)
--------------------------------------------------------------------------------
/tpu-scripts/kill.sh:
--------------------------------------------------------------------------------
gcloud compute tpus tpu-vm ssh $1 \
  --worker=all \
  --zone us-central2-b \
  --command "pkill -9 -f train.py; rm -f /tmp/libtpu_lockfile"
--------------------------------------------------------------------------------
/tpu-scripts/run.sh:
--------------------------------------------------------------------------------
#pkill train.py
#rm -f /tmp/libtpu_lockfile

# --worker=all
gcloud compute tpus tpu-vm ssh $1 \
  --worker=all \
  --zone us-central2-b \
  --command "$2"
--------------------------------------------------------------------------------
/tpu-scripts/send.sh:
--------------------------------------------------------------------------------
gcloud compute tpus tpu-vm scp $2 $1:$3 --worker=all --zone us-central2-b
--------------------------------------------------------------------------------
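
All three helpers above take the TPU VM name as their first argument; run.sh additionally takes a command string, and send.sh takes a local source and a remote destination. A typical session might look like this (the VM name and paths are placeholders):

# Copy a launcher to every worker, run it on all workers, then kill a stuck run.
bash tpu-scripts/send.sh my-tpu-v4 pretrain_large.sh /home/user/pretrain_large.sh
bash tpu-scripts/run.sh my-tpu-v4 "bash /home/user/pretrain_large.sh"
bash tpu-scripts/kill.sh my-tpu-v4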