├── assets
    ├── logo.png
    └── training_loss.png
├── requirements.txt
├── .gitmodules
├── run_qa.sh
├── run_chitchat.sh
├── train_tokenizer.py
├── run_summarization.sh
├── .gitignore
├── t5_tokenizer_model.py
├── scripts
    ├── run_summarization.py
    ├── run_chitchat.py
    ├── trainer_seq2seq_qa.py
    └── run_qa.py
├── LICENSE
└── README.md


/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LazarusNLP/IndoT5/HEAD/assets/logo.png


--------------------------------------------------------------------------------
/assets/training_loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LazarusNLP/IndoT5/HEAD/assets/training_loss.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | datargs
2 | datasets
3 | tokenizers
4 | sentencepiece
5 | transformers
6 | rouge_score
7 | evaluate


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "nanoT5"]
2 | 	path = nanoT5
3 | 	url = https://github.com/LazarusNLP/nanoT5.git
4 | 	branch = ind
5 | 


--------------------------------------------------------------------------------
/run_qa.sh:
--------------------------------------------------------------------------------
 1 | python scripts/run_qa.py \
 2 |     --model-checkpoint LazarusNLP/IndoNanoT5-base \
 3 |     --dataset-name LazarusNLP/indonlg \
 4 |     --dataset-config question_answering \
 5 |     --context-column-name context \
 6 |     --question-column-name input \
 7 |     --answer-column-name references \
 8 |     --id-column-name gem_id \
 9 |     --input-max-length 512 \
10 |     --target-max-length 512 \
11 |     --num-beams 5 \
12 |     --output-dir outputs/indo-nanot5-tydiqa \
13 |     --num-train-epochs 50 \
14 |     --optim adamw_torch_fused \
15 |     --learning-rate 1e-5 \
16 |     --weight-decay 0.01 \
17 |     --per-device-train-batch-size 8 \
18 |     --per-device-eval-batch-size 16 \
19 |     --hub-model-id LazarusNLP/IndoNanoT5-base-TyDiQA


--------------------------------------------------------------------------------
/run_chitchat.sh:
--------------------------------------------------------------------------------
 1 | python scripts/run_chitchat.py \
 2 |     --model-checkpoint LazarusNLP/IndoNanoT5-base \
 3 |     --dataset-name LazarusNLP/indonlg \
 4 |     --dataset-config xpersona \
 5 |     --context-column-name context \
 6 |     --input-column-name input \
 7 |     --target-column-name target \
 8 |     --use-persona \
 9 |     --input-max-length 512 \
10 |     --target-max-length 512 \
11 |     --num-beams 5 \
12 |     --output-dir outputs/indo-nanot5-xpersona \
13 |     --num-train-epochs 50 \
14 |     --early-stopping-patience 5 \
15 |     --early-stopping-threshold 0.0 \
16 |     --optim adamw_torch_fused \
17 |     --learning-rate 1e-5 \
18 |     --weight-decay 0.01 \
19 |     --per-device-train-batch-size 8 \
20 |     --per-device-eval-batch-size 16 \
21 |     --hub-model-id LazarusNLP/IndoNanoT5-base-XPersona


--------------------------------------------------------------------------------
/train_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from datargs import parse
 3 | 
 4 | from datasets import load_dataset
 5 | from transformers import T5Config, AutoTokenizer
 6 | 
 7 | from t5_tokenizer_model import SentencePieceUnigramTokenizer
 8 | 
 9 | 
@dataclass
class Args:
    """Command-line options for training the tokenizer and publishing it."""

    # Target vocabulary size handed to the Unigram trainer.
    vocab_size: int = 32000
    # Number of texts grouped into each batch fed to the trainer.
    batch_length: int = 1000
    # Dataset identifier, configuration and split passed to `load_dataset`.
    dataset_name: str = "uonlp/CulturaX"
    dataset_config: str = "id"
    dataset_split: str = "train"
    # Directory that receives `tokenizer.json`.
    output_dir: str = "outputs/indonesian-t5-base/"
    # Checkpoint whose `T5Config` is loaded and pushed alongside the tokenizer.
    base_model_config: str = "google/t5-v1_1-base"
    # Hub repository that receives both the tokenizer and the config.
    hf_repo_id: str = "LazarusNLP/IndoNanoT5-base"
20 | 
21 | 
def main(args: Args):
    """Train a SentencePiece Unigram tokenizer and publish it with a T5 config.

    Streams the `text` field of the configured dataset, trains the tokenizer on
    it, writes `tokenizer.json` and the base model config into
    `args.output_dir`, then pushes the tokenizer and the config to
    `args.hf_repo_id`.

    Args:
        args: Parsed command-line options (see `Args`).
    """
    import os

    # Stream the corpus rather than materializing it before training.
    dataset = load_dataset(args.dataset_name, args.dataset_config, split=args.dataset_split, streaming=True)

    def batch_iterator():
        """Yield lists of `args.batch_length` texts; the last list may be shorter."""
        batch = []
        for example in dataset:
            batch.append(example["text"])
            if len(batch) == args.batch_length:
                yield batch
                batch = []
        if batch:  # yield last batch
            yield batch

    tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
    tokenizer.train_from_iterator(
        iterator=batch_iterator(),
        vocab_size=args.vocab_size,
        show_progress=True,
    )

    # Make sure the target directory exists before writing into it.
    os.makedirs(args.output_dir, exist_ok=True)
    tokenizer.save(os.path.join(args.output_dir, "tokenizer.json"))

    # Create model config based on T5v1.1 and store it next to tokenizer.json.
    # AutoTokenizer resolves the tokenizer class from the directory's config
    # files, which a bare tokenizer.json does not provide.
    config = T5Config.from_pretrained(args.base_model_config)
    config.save_pretrained(args.output_dir)

    # Create HF T5 Tokenizer and push it, together with the config, to HF Hub
    tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
    tokenizer.push_to_hub(args.hf_repo_id)
    config.push_to_hub(args.hf_repo_id)
52 | 
53 | 
54 | if __name__ == "__main__":
55 |     args = parse(Args)
56 |     main(args)
57 | 


--------------------------------------------------------------------------------
/run_summarization.sh:
--------------------------------------------------------------------------------
 1 | python scripts/run_summarization.py \
 2 |     --model-checkpoint LazarusNLP/IndoNanoT5-base \
 3 |     --dataset-name LazarusNLP/indonlg \
 4 |     --dataset-config indosum \
 5 |     --input-column-name input \
 6 |     --target-column-name target \
 7 |     --input-max-length 512 \
 8 |     --target-max-length 512 \
 9 |     --num-beams 5 \
10 |     --output-dir outputs/indo-nanot5-indosum \
11 |     --num-train-epochs 5 \
12 |     --optim adamw_torch_fused \
13 |     --learning-rate 1e-3 \
14 |     --weight-decay 0.01 \
15 |     --per-device-train-batch-size 8 \
16 |     --per-device-eval-batch-size 16 \
17 |     --hub-model-id LazarusNLP/IndoNanoT5-base-IndoSum
18 | 
19 | python scripts/run_summarization.py \
20 |     --model-checkpoint LazarusNLP/IndoNanoT5-base \
21 |     --dataset-name LazarusNLP/indonlg \
22 |     --dataset-config liputan6_canonical \
23 |     --input-column-name input \
24 |     --target-column-name target \
25 |     --input-max-length 512 \
26 |     --target-max-length 512 \
27 |     --num-beams 5 \
28 |     --output-dir outputs/indo-nanot5-liputan6-canonical \
29 |     --num-train-epochs 50 \
30 |     --optim adamw_torch_fused \
31 |     --learning-rate 1e-5 \
32 |     --weight-decay 0.01 \
33 |     --per-device-train-batch-size 8 \
34 |     --per-device-eval-batch-size 16 \
35 |     --hub-model-id LazarusNLP/IndoNanoT5-base-Liputan6-Canonical
36 | 
37 | # eval Canonical model on Extreme test set
38 | python scripts/run_summarization.py \
39 |     --model-checkpoint LazarusNLP/IndoNanoT5-base-Liputan6-Canonical \
40 |     --dataset-name LazarusNLP/indonlg \
41 |     --dataset-config liputan6_extreme \
42 |     --input-column-name input \
43 |     --target-column-name target \
44 |     --input-max-length 512 \
45 |     --target-max-length 512 \
46 |     --num-beams 5 \
47 |     --output-dir outputs/indo-nanot5-liputan6-extreme \
48 |     --per-device-eval-batch-size 16 \
49 |     --do-eval-only \
50 |     --hub-model-id LazarusNLP/IndoNanoT5-base-Liputan6-Extreme


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 
162 | outputs/
163 | notebooks/


--------------------------------------------------------------------------------
/t5_tokenizer_model.py:
--------------------------------------------------------------------------------
  1 | # Copied from https://github.com/huggingface/transformers/blob/main/examples/flax/language-modeling/t5_tokenizer_model.py
  2 | 
  3 | #!/usr/bin/env python3
  4 | import json
  5 | from typing import Iterator, List, Union
  6 | 
  7 | from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
  8 | from tokenizers.implementations.base_tokenizer import BaseTokenizer
  9 | from tokenizers.models import Unigram
 10 | from tokenizers.processors import TemplateProcessing
 11 | 
 12 | 
 13 | class SentencePieceUnigramTokenizer(BaseTokenizer):
 14 |     """
 15 |     This class is a copy of `DeDLOC's tokenizer implementation <https://github.com/yandex-research/DeDLOC/blob/main/sahajbert/tokenizer/tokenizer_model.py>`__ .
 16 | 
 17 |     Custom SentencePiece Unigram Tokenizer with NMT, NKFC, spaces and lower-casing characters normalization
 18 |     Represents the Unigram algorithm, with the pretokenization used by SentencePiece
 19 |     """
 20 | 
 21 |     def __init__(
 22 |         self,
 23 |         replacement: str = "▁",
 24 |         add_prefix_space: bool = True,
 25 |         unk_token: Union[str, AddedToken] = "<unk>",
 26 |         eos_token: Union[str, AddedToken] = "</s>",
 27 |         pad_token: Union[str, AddedToken] = "<pad>",
 28 |     ):
 29 |         self.special_tokens = {
 30 |             "pad": {"id": 0, "token": pad_token},
 31 |             "eos": {"id": 1, "token": eos_token},
 32 |             "unk": {"id": 2, "token": unk_token},
 33 |         }
 34 | 
 35 |         self.special_tokens_list = [None] * len(self.special_tokens)
 36 |         for token_dict in self.special_tokens.values():
 37 |             self.special_tokens_list[token_dict["id"]] = token_dict["token"]
 38 | 
 39 |         tokenizer = Tokenizer(Unigram())
 40 | 
 41 |         tokenizer.normalizer = normalizers.Sequence(
 42 |             [
 43 |                 normalizers.Nmt(),
 44 |                 normalizers.NFKC(),
 45 |                 normalizers.Replace(Regex(" {2,}"), " "),
 46 |                 normalizers.Lowercase(),
 47 |             ]
 48 |         )
 49 |         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
 50 |             [
 51 |                 pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
 52 |                 pre_tokenizers.Digits(individual_digits=True),
 53 |                 pre_tokenizers.Punctuation(),
 54 |             ]
 55 |         )
 56 |         tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
 57 | 
 58 |         tokenizer.post_processor = TemplateProcessing(
 59 |             single=f"$A {self.special_tokens['eos']['token']}",
 60 |             special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
 61 |         )
 62 | 
 63 |         parameters = {
 64 |             "model": "SentencePieceUnigram",
 65 |             "replacement": replacement,
 66 |             "add_prefix_space": add_prefix_space,
 67 |         }
 68 | 
 69 |         super().__init__(tokenizer, parameters)
 70 | 
 71 |     def train(
 72 |         self,
 73 |         files: Union[str, List[str]],
 74 |         vocab_size: int = 8000,
 75 |         show_progress: bool = True,
 76 |     ):
 77 |         """Train the model using the given files"""
 78 | 
 79 |         trainer = trainers.UnigramTrainer(
 80 |             vocab_size=vocab_size,
 81 |             special_tokens=self.special_tokens_list,
 82 |             show_progress=show_progress,
 83 |         )
 84 | 
 85 |         if isinstance(files, str):
 86 |             files = [files]
 87 |         self._tokenizer.train(files, trainer=trainer)
 88 | 
 89 |         self.add_unk_id()
 90 | 
 91 |     def train_from_iterator(
 92 |         self,
 93 |         iterator: Union[Iterator[str], Iterator[Iterator[str]]],
 94 |         vocab_size: int = 8000,
 95 |         show_progress: bool = True,
 96 |     ):
 97 |         """Train the model using the given iterator"""
 98 | 
 99 |         trainer = trainers.UnigramTrainer(
100 |             vocab_size=vocab_size,
101 |             special_tokens=self.special_tokens_list,
102 |             show_progress=show_progress,
103 |         )
104 | 
105 |         self._tokenizer.train_from_iterator(iterator, trainer=trainer)
106 | 
107 |         self.add_unk_id()
108 | 
109 |     def add_unk_id(self):
110 |         tokenizer_json = json.loads(self._tokenizer.to_str())
111 | 
112 |         tokenizer_json["model"]["unk_id"] = self.special_tokens["unk"]["id"]
113 | 
114 |         self._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))


--------------------------------------------------------------------------------
/scripts/run_summarization.py:
--------------------------------------------------------------------------------
  1 | # Modified from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/summarization/run_summarization.py
  2 | 
  3 | import json
  4 | from dataclasses import dataclass
  5 | from datargs import parse
  6 | 
  7 | import evaluate
  8 | import numpy as np
  9 | from datasets import load_dataset
 10 | from transformers import (
 11 |     AutoTokenizer,
 12 |     AutoModelForSeq2SeqLM,
 13 |     DataCollatorForSeq2Seq,
 14 |     Seq2SeqTrainingArguments,
 15 |     Seq2SeqTrainer,
 16 |     EarlyStoppingCallback,
 17 | )
 18 | 
 19 | 
 20 | @dataclass
 21 | class Args:
 22 |     model_checkpoint: str = "LazarusNLP/IndoNanoT5-base"
 23 |     dataset_name: str = "LazarusNLP/indonlg"
 24 |     dataset_config: str = "indosum"
 25 |     input_column_name: str = "input"
 26 |     target_column_name: str = "target"
 27 |     input_max_length: int = 512
 28 |     target_max_length: int = 512
 29 |     num_beams: int = 5
 30 |     output_dir: str = "outputs/indo-nanot5-indosum"
 31 |     num_train_epochs: int = 5
 32 |     early_stopping_patience: int = 5
 33 |     early_stopping_threshold: float = 0.0
 34 |     optim: str = "adamw_torch_fused"
 35 |     learning_rate: float = 1e-5
 36 |     weight_decay: float = 0.01
 37 |     per_device_train_batch_size: int = 8
 38 |     per_device_eval_batch_size: int = 16
 39 |     hub_model_id: str = "LazarusNLP/IndoNanoT5-base-IndoSum"
 40 |     do_eval_only: bool = False
 41 | 
 42 | 
 43 | def main(args: Args):
 44 |     # load dataset, tokenizer, model
 45 |     dataset = load_dataset(args.dataset_name, args.dataset_config, trust_remote_code=True)
 46 |     tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
 47 |     model = AutoModelForSeq2SeqLM.from_pretrained(args.model_checkpoint)
 48 | 
 49 |     def preprocess_function(examples):
 50 |         model_inputs = tokenizer(examples[args.input_column_name], max_length=args.input_max_length, truncation=True)
 51 |         labels = tokenizer(
 52 |             text_target=examples[args.target_column_name], max_length=args.target_max_length, truncation=True
 53 |         )
 54 |         model_inputs["labels"] = labels["input_ids"]
 55 |         return model_inputs
 56 | 
 57 |     tokenized_dataset = dataset.map(preprocess_function, batched=True)
 58 | 
 59 |     # prepare s2s collator
 60 |     data_collator = DataCollatorForSeq2Seq(
 61 |         tokenizer=tokenizer, model=args.model_checkpoint, label_pad_token_id=tokenizer.pad_token_id
 62 |     )
 63 | 
 64 |     # ROUGE metric for evaluation
 65 |     rouge = evaluate.load("rouge")
 66 | 
 67 |     def compute_metrics(eval_pred):
 68 |         predictions, labels = eval_pred
 69 |         predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
 70 |         decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
 71 |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
 72 |         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 73 | 
 74 |         result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
 75 | 
 76 |         prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
 77 |         result["gen_len"] = np.mean(prediction_lens)
 78 | 
 79 |         return {k: round(v, 4) for k, v in result.items()}
 80 | 
 81 |     callbacks = [EarlyStoppingCallback(args.early_stopping_patience, args.early_stopping_threshold)]
 82 | 
 83 |     training_args = Seq2SeqTrainingArguments(
 84 |         output_dir=args.output_dir,
 85 |         evaluation_strategy="epoch",
 86 |         save_strategy="epoch",
 87 |         per_device_train_batch_size=args.per_device_train_batch_size,
 88 |         per_device_eval_batch_size=args.per_device_eval_batch_size,
 89 |         optim=args.optim,
 90 |         learning_rate=args.learning_rate,
 91 |         weight_decay=args.weight_decay,
 92 |         num_train_epochs=args.num_train_epochs,
 93 |         save_total_limit=3,
 94 |         predict_with_generate=True,
 95 |         load_best_model_at_end=True,
 96 |         metric_for_best_model="rouge1",
 97 |         bf16=True,
 98 |         report_to="tensorboard",
 99 |         push_to_hub=True,
100 |         hub_model_id=args.hub_model_id,
101 |         hub_private_repo=True,
102 |     )
103 | 
104 |     trainer = Seq2SeqTrainer(
105 |         model=model,
106 |         args=training_args,
107 |         train_dataset=tokenized_dataset["train"],
108 |         eval_dataset=tokenized_dataset["validation"],
109 |         tokenizer=tokenizer,
110 |         data_collator=data_collator,
111 |         compute_metrics=compute_metrics,
112 |         callbacks=callbacks,
113 |     )
114 | 
115 |     if args.do_eval_only:
116 |         results = trainer.evaluate(
117 |             tokenized_dataset["test"], max_length=args.target_max_length, num_beams=args.num_beams
118 |         )
119 |         with open(f"{args.output_dir}/eval_results.json", "w") as f:
120 |             f.write(json.dumps(results))
121 |         return
122 | 
123 |     trainer.train()
124 | 
125 |     trainer.evaluate(tokenized_dataset["test"], max_length=args.target_max_length, num_beams=args.num_beams)
126 | 
127 |     trainer.push_to_hub()
128 | 
129 | 
130 | if __name__ == "__main__":
131 |     args = parse(Args)
132 |     main(args)
133 | 


--------------------------------------------------------------------------------
/scripts/run_chitchat.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | from datargs import parse
  3 | 
  4 | import evaluate
  5 | import numpy as np
  6 | from datasets import load_dataset
  7 | from transformers import (
  8 |     AutoTokenizer,
  9 |     AutoModelForSeq2SeqLM,
 10 |     DataCollatorForSeq2Seq,
 11 |     Seq2SeqTrainingArguments,
 12 |     Seq2SeqTrainer,
 13 |     EarlyStoppingCallback,
 14 | )
 15 | 
 16 | 
 17 | @dataclass
 18 | class Args:
 19 |     model_checkpoint: str = "LazarusNLP/IndoNanoT5-base"
 20 |     dataset_name: str = "LazarusNLP/indonlg"
 21 |     dataset_config: str = "xpersona"
 22 |     context_column_name: str = "context"
 23 |     input_column_name: str = "input"
 24 |     target_column_name: str = "target"
 25 |     use_persona: bool = False
 26 |     input_max_length: int = 512
 27 |     target_max_length: int = 512
 28 |     num_beams: int = 5
 29 |     output_dir: str = "outputs/indo-nanot5-xpersona"
 30 |     num_train_epochs: int = 50
 31 |     early_stopping_patience: int = 5
 32 |     early_stopping_threshold: float = 0.0
 33 |     optim: str = "adamw_torch_fused"
 34 |     learning_rate: float = 1e-5
 35 |     weight_decay: float = 0.01
 36 |     per_device_train_batch_size: int = 8
 37 |     per_device_eval_batch_size: int = 16
 38 |     hub_model_id: str = "LazarusNLP/IndoNanoT5-base-XPersona"
 39 | 
 40 | 
 41 | def main(args: Args):
 42 |     # load dataset, tokenizer, model
 43 |     dataset = load_dataset(args.dataset_name, args.dataset_config, trust_remote_code=True)
 44 |     tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
 45 |     model = AutoModelForSeq2SeqLM.from_pretrained(args.model_checkpoint)
 46 | 
 47 |     def preprocess_function(examples):
 48 |         def generate_history(history):
 49 |             # take turns between user and system
 50 |             history = [f"U: {text}" if i % 2 == 0 else f"S: {text}" for i, text in enumerate(history)]
 51 |             # prompt for next system response
 52 |             history.append("S: ")
 53 |             return history
 54 | 
 55 |         def generate_input(history, persona=None) -> str:
 56 |             # try removing conversations from the start until we are within max token length
 57 |             while history:
 58 |                 combined_input = f"{persona} | {' | '.join(history)}" if persona else " | ".join(history)
 59 |                 tokenized_input = tokenizer(combined_input, truncation=False)
 60 |                 # return input once it is within sequence length
 61 |                 if len(tokenized_input["input_ids"]) <= args.input_max_length:
 62 |                     return combined_input
 63 |                 # otherwise, remove oldest chat history
 64 |                 history.pop(0)
 65 | 
 66 |             raise NotImplementedError
 67 | 
 68 |         persona = [f"P: {' '.join(ex)}" if args.use_persona else None for ex in examples[args.context_column_name]]
 69 |         history = [generate_history(ex) for ex in examples[args.input_column_name]]
 70 |         inputs = [generate_input(h, p) for h, p in zip(history, persona)]
 71 |         targets = examples[args.target_column_name]
 72 | 
 73 |         model_inputs = tokenizer(inputs, max_length=args.input_max_length, truncation=False)
 74 |         labels = tokenizer(text_target=targets, max_length=args.target_max_length, truncation=True)
 75 |         model_inputs["labels"] = labels["input_ids"]
 76 |         return model_inputs
 77 | 
 78 |     tokenized_dataset = dataset.map(preprocess_function, batched=True)
 79 | 
 80 |     data_collator = DataCollatorForSeq2Seq(
 81 |         tokenizer=tokenizer, model=args.model_checkpoint, label_pad_token_id=tokenizer.pad_token_id
 82 |     )
 83 | 
 84 |     bleu = evaluate.load("bleu")
 85 |     sacrebleu = evaluate.load("sacrebleu")
 86 | 
 87 |     def compute_metrics(eval_pred):
 88 |         predictions, labels = eval_pred
 89 |         predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
 90 |         decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
 91 |         labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
 92 |         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 93 | 
 94 |         bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)["bleu"]
 95 |         sacrebleu_score = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)["score"]
 96 | 
 97 |         return {"bleu": round(bleu_score * 100, 4), "sacrebleu": round(sacrebleu_score, 4)}
 98 | 
 99 |     callbacks = [EarlyStoppingCallback(args.early_stopping_patience, args.early_stopping_threshold)]
100 | 
101 |     training_args = Seq2SeqTrainingArguments(
102 |         output_dir=args.output_dir,
103 |         evaluation_strategy="epoch",
104 |         save_strategy="epoch",
105 |         per_device_train_batch_size=args.per_device_train_batch_size,
106 |         per_device_eval_batch_size=args.per_device_eval_batch_size,
107 |         optim=args.optim,
108 |         learning_rate=args.learning_rate,
109 |         weight_decay=args.weight_decay,
110 |         num_train_epochs=args.num_train_epochs,
111 |         save_total_limit=5,
112 |         predict_with_generate=True,
113 |         metric_for_best_model="sacrebleu",
114 |         load_best_model_at_end=True,
115 |         bf16=True,
116 |         report_to="tensorboard",
117 |         push_to_hub=True,
118 |         hub_model_id=args.hub_model_id,
119 |         hub_private_repo=True,
120 |     )
121 | 
122 |     trainer = Seq2SeqTrainer(
123 |         model=model,
124 |         args=training_args,
125 |         train_dataset=tokenized_dataset["train"],
126 |         eval_dataset=tokenized_dataset["validation"],
127 |         tokenizer=tokenizer,
128 |         data_collator=data_collator,
129 |         compute_metrics=compute_metrics,
130 |         callbacks=callbacks,
131 |     )
132 | 
133 |     trainer.train()
134 | 
135 |     trainer.evaluate(tokenized_dataset["test"], max_length=args.target_max_length, num_beams=args.num_beams)
136 | 
137 |     trainer.push_to_hub()
138 | 
139 | 
140 | if __name__ == "__main__":
141 |     args = parse(Args)
142 |     main(args)
143 | 


--------------------------------------------------------------------------------
/scripts/trainer_seq2seq_qa.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2021 The HuggingFace Team All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """
 16 | A subclass of `Trainer` specific to Question-Answering tasks
 17 | """
 18 | import math
 19 | import time
 20 | from typing import Dict, List, Optional
 21 | 
 22 | from torch.utils.data import Dataset
 23 | 
 24 | from transformers import Seq2SeqTrainer, is_torch_tpu_available
 25 | from transformers.trainer_utils import PredictionOutput, speed_metrics
 26 | 
 27 | 
 28 | if is_torch_tpu_available(check_device=False):
 29 |     import torch_xla.core.xla_model as xm
 30 |     import torch_xla.debug.metrics as met
 31 | 
 32 | 
 33 | class QuestionAnsweringSeq2SeqTrainer(Seq2SeqTrainer):
 34 |     def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
 35 |         super().__init__(*args, **kwargs)
 36 |         self.eval_examples = eval_examples
 37 |         self.post_process_function = post_process_function
 38 | 
 39 |     # def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
 40 |     def evaluate(
 41 |         self,
 42 |         eval_dataset: Optional[Dataset] = None,
 43 |         eval_examples=None,
 44 |         ignore_keys: Optional[List[str]] = None,
 45 |         metric_key_prefix: str = "eval",
 46 |         **gen_kwargs,
 47 |     ) -> Dict[str, float]:
 48 |         gen_kwargs = gen_kwargs.copy()
 49 | 
 50 |         # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the
 51 |         # training args
 52 |         if gen_kwargs.get("max_length") is None and self.args.generation_max_length is not None:
 53 |             gen_kwargs["max_length"] = self.args.generation_max_length
 54 |         if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None:
 55 |             gen_kwargs["num_beams"] = self.args.generation_num_beams
 56 |         self._gen_kwargs = gen_kwargs
 57 | 
 58 |         eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
 59 |         eval_dataloader = self.get_eval_dataloader(eval_dataset)
 60 |         eval_examples = self.eval_examples if eval_examples is None else eval_examples
 61 | 
 62 |         # Temporarily disable metric computation, we will do it in the loop here.
 63 |         compute_metrics = self.compute_metrics
 64 |         self.compute_metrics = None
 65 |         start_time = time.time()
 66 |         eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
 67 |         try:
 68 |             output = eval_loop(
 69 |                 eval_dataloader,
 70 |                 description="Evaluation",
 71 |                 # No point gathering the predictions if there are no metrics, otherwise we defer to
 72 |                 # self.args.prediction_loss_only
 73 |                 prediction_loss_only=True if compute_metrics is None else None,
 74 |                 ignore_keys=ignore_keys,
 75 |                 metric_key_prefix=metric_key_prefix,
 76 |             )
 77 |         finally:
 78 |             self.compute_metrics = compute_metrics
 79 |         total_batch_size = self.args.eval_batch_size * self.args.world_size
 80 |         if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
 81 |             start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
 82 |         output.metrics.update(
 83 |             speed_metrics(
 84 |                 metric_key_prefix,
 85 |                 start_time,
 86 |                 num_samples=output.num_samples,
 87 |                 num_steps=math.ceil(output.num_samples / total_batch_size),
 88 |             )
 89 |         )
 90 | 
 91 |         if self.post_process_function is not None and self.compute_metrics is not None and self.args.should_save:
 92 |             # Only the main node write the results by default
 93 |             eval_preds = self.post_process_function(eval_examples, eval_dataset, output)
 94 |             metrics = self.compute_metrics(eval_preds)
 95 | 
 96 |             # Prefix all keys with metric_key_prefix + '_'
 97 |             for key in list(metrics.keys()):
 98 |                 if not key.startswith(f"{metric_key_prefix}_"):
 99 |                     metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
100 | 
101 |             metrics.update(output.metrics)
102 |         else:
103 |             metrics = output.metrics
104 | 
105 |         if self.args.should_log:
106 |             # Only the main node log the results by default
107 |             self.log(metrics)
108 | 
109 |         if self.args.tpu_metrics_debug or self.args.debug:
110 |             # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
111 |             xm.master_print(met.metrics_report())
112 | 
113 |         self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
114 |         return metrics
115 | 
116 |     def predict(
117 |         self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test", **gen_kwargs
118 |     ):
119 |         self._gen_kwargs = gen_kwargs.copy()
120 | 
121 |         predict_dataloader = self.get_test_dataloader(predict_dataset)
122 | 
123 |         # Temporarily disable metric computation, we will do it in the loop here.
124 |         compute_metrics = self.compute_metrics
125 |         self.compute_metrics = None
126 |         start_time = time.time()
127 |         eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
128 |         try:
129 |             output = eval_loop(
130 |                 predict_dataloader,
131 |                 description="Prediction",
132 |                 # No point gathering the predictions if there are no metrics, otherwise we defer to
133 |                 # self.args.prediction_loss_only
134 |                 prediction_loss_only=True if compute_metrics is None else None,
135 |                 ignore_keys=ignore_keys,
136 |                 metric_key_prefix=metric_key_prefix,
137 |             )
138 |         finally:
139 |             self.compute_metrics = compute_metrics
140 | 
141 |         total_batch_size = self.args.eval_batch_size * self.args.world_size
142 |         if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
143 |             start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
144 |         output.metrics.update(
145 |             speed_metrics(
146 |                 metric_key_prefix,
147 |                 start_time,
148 |                 num_samples=output.num_samples,
149 |                 num_steps=math.ceil(output.num_samples / total_batch_size),
150 |             )
151 |         )
152 |         if self.post_process_function is None or self.compute_metrics is None:
153 |             return output
154 | 
155 |         predictions = self.post_process_function(predict_examples, predict_dataset, output, "predict")
156 |         metrics = self.compute_metrics(predictions)
157 | 
158 |         # Prefix all keys with metric_key_prefix + '_'
159 |         for key in list(metrics.keys()):
160 |             if not key.startswith(f"{metric_key_prefix}_"):
161 |                 metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
162 |         metrics.update(output.metrics)
163 |         return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)


--------------------------------------------------------------------------------
/scripts/run_qa.py:
--------------------------------------------------------------------------------
  1 | # Modified from: https://github.com/huggingface/transformers/blob/main/examples/pytorch/question-answering/run_seq2seq_qa.py
  2 | 
  3 | from dataclasses import dataclass
  4 | from datargs import parse
  5 | 
  6 | import evaluate
  7 | import numpy as np
  8 | from datasets import load_dataset
  9 | from transformers.trainer_utils import EvalPrediction
 10 | from transformers import (
 11 |     AutoTokenizer,
 12 |     AutoModelForSeq2SeqLM,
 13 |     DataCollatorForSeq2Seq,
 14 |     Seq2SeqTrainingArguments,
 15 |     EarlyStoppingCallback,
 16 | )
 17 | 
 18 | from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
 19 | 
 20 | 
@dataclass
class Args:
    """Command-line options for the question-answering fine-tuning script.

    Parsed with ``datargs.parse``; the defaults below apply to any option that
    is not given on the command line.
    """

    # Checkpoint passed to the tokenizer and model `from_pretrained` calls.
    model_checkpoint: str = "LazarusNLP/IndoNanoT5-base"
    # Dataset name and configuration passed to `load_dataset`.
    dataset_name: str = "LazarusNLP/indonlg"
    dataset_config: str = "question_answering"
    # Columns read to build the "question: ... context: ..." inputs and the answer targets.
    context_column_name: str = "context"
    question_column_name: str = "input"
    answer_column_name: str = "references"
    # Column holding the example id, used to map tokenized features back to examples.
    id_column_name: str = "gem_id"
    # Truncation length for tokenized inputs.
    input_max_length: int = 512
    # Truncation length for tokenized targets; also the generation `max_length` of the final test evaluation.
    target_max_length: int = 512
    # Number of beams used by the final test evaluation.
    num_beams: int = 5
    # Values forwarded to `Seq2SeqTrainingArguments`.
    output_dir: str = "outputs/indo-nanot5-tydiqa"
    num_train_epochs: int = 50
    # Values forwarded to `EarlyStoppingCallback`.
    early_stopping_patience: int = 5
    early_stopping_threshold: float = 0.01
    # Values forwarded to `Seq2SeqTrainingArguments`.
    optim: str = "adamw_torch_fused"
    learning_rate: float = 1e-5
    weight_decay: float = 0.01
    per_device_train_batch_size: int = 8
    per_device_eval_batch_size: int = 16
    # Hub repository id passed as `hub_model_id` to the training arguments.
    hub_model_id: str = "LazarusNLP/IndoNanoT5-base-TyDiQA"
 43 | 
 44 | 
 45 | def main(args: Args):
 46 |     # load dataset, tokenizer, model
 47 |     dataset = load_dataset(args.dataset_name, args.dataset_config, trust_remote_code=True)
 48 |     tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
 49 |     model = AutoModelForSeq2SeqLM.from_pretrained(args.model_checkpoint)
 50 | 
 51 |     def preprocess_squad_batch(examples, question_column, context_column, answer_column):
 52 |         questions = examples[question_column]
 53 |         contexts = examples[context_column]
 54 |         answers = examples[answer_column]
 55 | 
 56 |         def generate_input(_question, _context):
 57 |             return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()])
 58 | 
 59 |         inputs = [generate_input(question, context) for question, context in zip(questions, contexts)]
 60 |         targets = [answer[0] if len(answer) > 0 else "" for answer in answers]
 61 |         return inputs, targets
 62 | 
 63 |     def preprocess_function(examples):
 64 |         inputs, targets = preprocess_squad_batch(
 65 |             examples, args.question_column_name, args.context_column_name, args.answer_column_name
 66 |         )
 67 | 
 68 |         model_inputs = tokenizer(inputs, max_length=args.input_max_length, truncation=True)
 69 |         labels = tokenizer(text_target=targets, max_length=args.target_max_length, truncation=True)
 70 |         model_inputs["labels"] = labels["input_ids"]
 71 |         return model_inputs
 72 | 
 73 |     def preprocess_validation_function(examples):
 74 |         inputs, targets = preprocess_squad_batch(
 75 |             examples, args.question_column_name, args.context_column_name, args.answer_column_name
 76 |         )
 77 | 
 78 |         model_inputs = tokenizer(
 79 |             inputs,
 80 |             max_length=args.input_max_length,
 81 |             truncation=True,
 82 |             return_overflowing_tokens=True,
 83 |             return_offsets_mapping=True,
 84 |         )
 85 |         # Tokenize targets with the `text_target` keyword argument
 86 |         labels = tokenizer(text_target=targets, max_length=args.target_max_length, truncation=True)
 87 | 
 88 |         # Since one example might give us several features if it has a long context, we need a map from a feature to
 89 |         # its corresponding example. This key gives us just that.
 90 |         sample_mapping = model_inputs.pop("overflow_to_sample_mapping")
 91 | 
 92 |         # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
 93 |         # corresponding example_id and we will store the offset mappings.
 94 |         model_inputs["example_id"] = []
 95 |         # Augment the overflowing tokens to the labels
 96 |         labels_out = []
 97 | 
 98 |         for i in range(len(model_inputs["input_ids"])):
 99 |             # One example can give several spans, this is the index of the example containing this span of text.
100 |             sample_index = sample_mapping[i]
101 |             model_inputs["example_id"].append(examples[args.id_column_name][sample_index])
102 |             labels_out.append(labels["input_ids"][sample_index])
103 | 
104 |         model_inputs["labels"] = labels_out
105 |         return model_inputs
106 | 
107 |     train_dataset = dataset["train"]
108 |     validation_dataset = dataset["validation"]
109 |     test_dataset = dataset["test"]
110 | 
111 |     tokenized_train_dataset = train_dataset.map(
112 |         preprocess_function, batched=True, remove_columns=train_dataset.column_names
113 |     )
114 |     tokenized_validation_dataset = validation_dataset.map(
115 |         preprocess_validation_function, batched=True, remove_columns=validation_dataset.column_names
116 |     )
117 |     tokenized_test_dataset = test_dataset.map(
118 |         preprocess_validation_function, batched=True, remove_columns=test_dataset.column_names
119 |     )
120 | 
121 |     # prepare s2s collator
122 |     data_collator = DataCollatorForSeq2Seq(
123 |         tokenizer=tokenizer, model=args.model_checkpoint, label_pad_token_id=tokenizer.pad_token_id
124 |     )
125 | 
126 |     # SQuAD v2 metric for evaluation
127 |     squad_v2 = evaluate.load("squad_v2")
128 | 
129 |     def compute_metrics(p: EvalPrediction):
130 |         return squad_v2.compute(predictions=p.predictions, references=p.label_ids)
131 | 
132 |     def post_processing_function(examples, features, outputs, stage="eval"):
133 |         # Decode the predicted tokens.
134 |         preds = outputs.predictions
135 |         if isinstance(preds, tuple):
136 |             preds = preds[0]
137 |         # Replace -100s used for padding as we can't decode them
138 |         preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
139 |         decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
140 | 
141 |         # Build a map example to its corresponding features.
142 |         example_id_to_index = {k: i for i, k in enumerate(examples[args.id_column_name])}
143 |         feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}
144 |         predictions = {}
145 |         # Let's loop over all the examples!
146 |         for example_index, example in enumerate(examples):
147 |             # This is the index of the feature associated to the current example.
148 |             feature_index = feature_per_example[example_index]
149 |             predictions[example[args.id_column_name]] = decoded_preds[feature_index]
150 | 
151 |         # Format the result to the format the metric expects.
152 |         formatted_predictions = [
153 |             {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
154 |         ]
155 | 
156 |         references = [
157 |             {"id": ex[args.id_column_name], "answers": {"answer_start": [0], "text": ex[args.answer_column_name]}}
158 |             for ex in examples
159 |         ]
160 |         return EvalPrediction(predictions=formatted_predictions, label_ids=references)
161 | 
162 |     callbacks = [EarlyStoppingCallback(args.early_stopping_patience, args.early_stopping_threshold)]
163 | 
164 |     training_args = Seq2SeqTrainingArguments(
165 |         output_dir=args.output_dir,
166 |         evaluation_strategy="epoch",
167 |         save_strategy="epoch",
168 |         per_device_train_batch_size=args.per_device_train_batch_size,
169 |         per_device_eval_batch_size=args.per_device_eval_batch_size,
170 |         optim=args.optim,
171 |         learning_rate=args.learning_rate,
172 |         weight_decay=args.weight_decay,
173 |         num_train_epochs=args.num_train_epochs,
174 |         save_total_limit=3,
175 |         predict_with_generate=True,
176 |         load_best_model_at_end=True,
177 |         metric_for_best_model="exact",
178 |         bf16=True,
179 |         report_to="tensorboard",
180 |         push_to_hub=True,
181 |         hub_model_id=args.hub_model_id,
182 |         hub_private_repo=True,
183 |     )
184 | 
185 |     trainer = QuestionAnsweringSeq2SeqTrainer(
186 |         model=model,
187 |         args=training_args,
188 |         train_dataset=tokenized_train_dataset,
189 |         eval_dataset=tokenized_validation_dataset,
190 |         eval_examples=validation_dataset,
191 |         tokenizer=tokenizer,
192 |         data_collator=data_collator,
193 |         compute_metrics=compute_metrics,
194 |         callbacks=callbacks,
195 |         post_process_function=post_processing_function,
196 |     )
197 | 
198 |     trainer.train()
199 | 
200 |     trainer.evaluate(tokenized_test_dataset, test_dataset, max_length=args.target_max_length, num_beams=args.num_beams)
201 | 
202 |     trainer.push_to_hub()
203 | 
204 | 
if __name__ == "__main__":
    # Build `Args` from the command line via datargs, then run the fine-tuning pipeline.
    args = parse(Args)
    main(args)
208 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2024 Lazarus NLP
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # IndoT5: T5 Language Models for the Indonesian Language!
  2 | 
  3 | <div align="center">
  4 | 
  5 | <a href="https://huggingface.co/collections/LazarusNLP/indonesian-t5-language-models-65c1b9a0f6342b3eb3d6d450"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Collections-yellow"></img></a>
  6 | 
  7 | </div>
  8 | 
  9 | This project focuses on pre-training a [T5](https://arxiv.org/abs/1910.10683) (Text-to-Text Transfer Transformer) model specifically for the Indonesian language, using [nanoT5](https://github.com/PiotrNawrot/nanoT5) as its training framework. Our aim is to provide fully open-source, budget-constrained, sequence-to-sequence language models for Indonesia that are on par with state-of-the-art models!
 10 | 
 11 | <p align="center">
 12 |     <img src="https://raw.githubusercontent.com/LazarusNLP/IndoT5/main/assets/logo.png" alt="logo" width="400"/>
 13 | </p>
 14 | 
 15 | ## Pre-trained Models
 16 | 
 17 | | Model                                                                           | #params | Dataset                                                          |
 18 | | ------------------------------------------------------------------------------- | :-----: | ---------------------------------------------------------------- |
 19 | | [LazarusNLP/IndoNanoT5-base](https://huggingface.co/LazarusNLP/IndoNanoT5-base) |  248M   | [uonlp/CulturaX](https://huggingface.co/datasets/uonlp/CulturaX) |
 20 | 
 21 | ## Results
 22 | 
 23 | We evaluate our models on [IndoNLG](https://github.com/IndoNLP/indonlg), which consists of multiple downstream generation tasks in Indonesian. The dataset also supports Javanese and Sundanese, but as our model is currently monolingual, we fine-tune on Indonesian tasks only.
 24 | 
 25 | > IndoNLG baseline results are obtained from the [official IndoNLG paper](https://aclanthology.org/2021.emnlp-main.699/).
 26 | 
 27 | ### IndoSum
 28 | 
 29 | | Model                                                                           | #params |   R1 ↑    |   R2 ↑    |   RL ↑    |
 30 | | ------------------------------------------------------------------------------- | :-----: | :-------: | :-------: | :-------: |
 31 | | Scratch                                                                         |  132M   |   70.52   |   65.43   |   68.35   |
 32 | | mBART Large                                                                     |  610M   |   74.65   |   70.43   |   72.54   |
 33 | | mT5 Small                                                                       |  300M   |   74.04   |   69.64   |   71.89   |
 34 | | IndoBART                                                                        |  132M   |   70.67   |   65.59   |   68.18   |
 35 | | IndoGPT                                                                         |  117M   |   74.49   |   70.34   |   72.46   |
 36 | | *Our work*                                                                      |
 37 | | [LazarusNLP/IndoNanoT5-base](https://huggingface.co/LazarusNLP/IndoNanoT5-base) |  248M   | **75.29** | **71.23** | **73.30** |
 38 | 
 39 | ### Liputan6 Canonical
 40 | 
 41 | | Model                                                                           | #params |   R1 ↑    |   R2 ↑    |   RL ↑    |
 42 | | ------------------------------------------------------------------------------- | :-----: | :-------: | :-------: | :-------: |
 43 | | Scratch                                                                         |  132M   |   38.14   |   20.67   |   31.85   |
 44 | | See et al. (2017)                                                               |   22M   |   36.09   |   19.19   |   29.81   |
 45 | | Koto et al. (2020)                                                              |  153M   | **41.06** | **22.83** | **34.23** |
 46 | | mBART Large                                                                     |  610M   |   39.17   |   21.75   |   32.85   |
 47 | | mT5 Small                                                                       |  300M   |   39.69   |   22.03   |   33.28   |
 48 | | IndoBART                                                                        |  132M   |   39.87   |   22.24   |   33.50   |
 49 | | IndoGPT                                                                         |  117M   |   37.41   |   20.61   |   31.54   |
 50 | | *Our work*                                                                      |
 51 | | [LazarusNLP/IndoNanoT5-base](https://huggingface.co/LazarusNLP/IndoNanoT5-base) |  248M   |   39.76   |   22.29   |   33.46   |
 52 | 
 53 | ### Liputan6 Extreme
 54 | 
 55 | | Model                                                                           | #params |   R1 ↑    |   R2 ↑    |   RL ↑    |
 56 | | ------------------------------------------------------------------------------- | :-----: | :-------: | :-------: | :-------: |
 57 | | Scratch                                                                         |  132M   |   32.47   |   13.45   |   25.52   |
 58 | | See et al. (2017)                                                               |   22M   |   30.39   |   12.03   |   23.55   |
 59 | | Koto et al. (2020)                                                              |  153M   | **34.84** | **15.03** | **27.44** |
 60 | | mBART Large                                                                     |  610M   |   32.87   |   13.79   |   25.91   |
 61 | | mT5 Small                                                                       |  300M   |   33.37   |   14.01   |   26.21   |
 62 | | IndoBART                                                                        |  132M   |   33.58   |   14.45   |   26.68   |
 63 | | IndoGPT                                                                         |  117M   |   31.45   |   13.09   |   24.91   |
 64 | | *Our work*                                                                      |
 65 | | [LazarusNLP/IndoNanoT5-base](https://huggingface.co/LazarusNLP/IndoNanoT5-base) |  248M   |   33.26   |   14.17   |   26.21   |
 66 | 
 67 | ### TyDiQA
 68 | 
 69 | | Model                                                                           | #params |   EM ↑    |   F1 ↑    |
 70 | | ------------------------------------------------------------------------------- | :-----: | :-------: | :-------: |
 71 | | Scratch                                                                         |  132M   |   21.40   |   29.77   |
 72 | | mBART Large                                                                     |  610M   | **62.69** | **76.41** |
 73 | | mT5 Small                                                                       |  300M   |   35.67   |   51.90   |
 74 | | IndoBART                                                                        |  132M   |   57.31   |   69.59   |
 75 | | IndoGPT                                                                         |  117M   |   50.18   |   63.97   |
 76 | | *Our work*                                                                      |
 77 | | [LazarusNLP/IndoNanoT5-base](https://huggingface.co/LazarusNLP/IndoNanoT5-base) |  248M   |   58.94   |   72.19   |
 78 | 
 79 | ### XPersona
 80 | 
 81 | | Model                                                                           | #params | SacreBLEU ↑ |  BLEU ↑  |
 82 | | ------------------------------------------------------------------------------- | :-----: | :---------: | :------: |
 83 | | Scratch                                                                         |  132M   |    1.86     |   1.86   |
 84 | | CausalBERT $^\dagger$                                                           |  110M   |    2.24     |   2.23   |
 85 | | mBART Large                                                                     |  610M   |    2.57     |   2.56   |
 86 | | mT5 Small                                                                       |  300M   |    1.90     |   1.89   |
 87 | | IndoBART                                                                        |  132M   |    2.93     |   2.93   |
 88 | | IndoGPT                                                                         |  117M   |    2.02     |   2.02   |
 89 | | *Our work* $^\dagger$                                                           |
 90 | | [LazarusNLP/IndoNanoT5-base](https://huggingface.co/LazarusNLP/IndoNanoT5-base) |  248M   |  **4.07**   | **4.07** |
 91 | 
 92 | > $^\dagger$ Our models are trained with additional persona information, just like the original CausalBERT baseline. The remaining models are not trained with persona information. Our findings suggest that persona information is crucial for this task, serving a similar purpose to system prompts in recent LLM development.
 93 | 
 94 | ## Installation
 95 | 
 96 | ```sh
 97 | git clone https://github.com/LazarusNLP/IndoT5.git
 98 | cd IndoT5
 99 | git submodule update --init # clone nanoT5 submodule
100 | pip install -r requirements.txt
101 | pip install -r nanoT5/requirements.txt
102 | ```
103 | 
104 | ## Dataset
105 | 
106 | We leverage the existing [uonlp/CulturaX](https://huggingface.co/datasets/uonlp/CulturaX) dataset which contains 23M Indonesian documents, collected and cleaned from the [OSCAR](https://oscar-corpus.com/) corpora and [mc4](https://huggingface.co/datasets/mc4). We selected this dataset as it is sufficiently large and has been deduplicated. More details can be found in their dataset card.
107 | 
108 | Since this dataset is rather large, we utilize the dataset streaming feature of Hugging Face datasets, which is thankfully also supported in nanoT5. This feature is likewise usable during tokenizer training.
109 | 
110 | ## Train SentencePiece Tokenizer
111 | 
112 | We first need to train a SentencePiece tokenizer on our pre-training corpus. We followed the uncased T5 tokenizer training implementation from [HuggingFace](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#train-tokenizer-2). We then initialize a T5 config based on [google/t5-v1_1-base](https://huggingface.co/google/t5-v1_1-base) and the newly trained tokenizer. Both the tokenizer and the config are then saved for loading later.
113 | 
114 | To train the SentencePiece tokenizer, run `train_tokenizer.py` with the desired arguments:
115 | 
116 | ```sh
117 | python train_tokenizer.py \
118 |     --vocab-size 32000 \
119 |     --dataset-name uonlp/CulturaX \
120 |     --dataset-config id \
121 |     --output-dir outputs/indonesian-t5-base/ \
122 |     --base-model-config google/t5-v1_1-base \
123 |     --hf-repo-id LazarusNLP/IndoNanoT5-base
124 | ```
125 | 
126 | It took us about an hour to train the tokenizer.
127 | 
128 | ## Pre-train T5
129 | 
130 | NanoT5 handles most of the training process and exposes a clean API to pre-train a T5 model from scratch. We follow their default training configuration, with the exception of a lower learning rate which is specific to our dataset. Other than that, running pre-training is as simple as:
131 | 
132 | ```sh
133 | python -m nanoT5.main \
134 |     optim.name=adamwscale \
135 |     optim.lr_scheduler=cosine \
136 |     optim.base_lr=5e-3 \
137 |     model.name=LazarusNLP/IndoNanoT5-base \
138 |     model.compile=true \
139 |     data.num_workers=16
140 | ```
141 | 
142 | We achieved a negative log-likelihood loss of 2.082 and an accuracy of 57.4% on a held-out subset (1%) of the pre-training corpus.
143 | 
144 | ### Experiments
145 | 
146 | We experimented with different learning rates, optimizers, and layer initialization strategies. Whilst we found that the default scaled AdamW optimizer worked best for our baseline results, we aim to further improve the results. Specifically, we aim to experiment with:
147 | 
148 | - [x] Initializing `lm_head` weights with `std=1/sqrt(d_model)`
149 | - [ ] (Unscaled) AdamW Optimizer
150 | - [ ] [NAdamW Optimizer](https://pytorch.org/docs/2.2/generated/torch.optim.NAdam.html#torch.optim.NAdam)
151 | - [ ] [Shampoo](https://arxiv.org/abs/1802.09568) and [CASPR](https://openreview.net/forum?id=8j9hz8DVi8) Optimizers
152 | 
153 | This growing list of ideas stems from a fruitful discussion [here](https://github.com/PiotrNawrot/nanoT5/issues/25).
154 | 
155 | <details>
156 |   <summary>Training Losses</summary>
157 | 
158 |   <img src="https://raw.githubusercontent.com/LazarusNLP/IndoT5/main/assets/training_loss.png"/>
159 | </details>
160 | 
161 | ## Fine-tune T5
162 | 
163 | NanoT5 supports fine-tuning on a downstream dataset like Super Natural-Instructions (SNI). However, since adapting it to other downstream datasets requires further customization of the fine-tuning code, we opted to develop our own fine-tuning scripts based on Hugging Face's [sample fine-tuning code](https://github.com/huggingface/transformers/tree/main/examples/pytorch).
164 | 
165 | In particular, we developed fine-tuning scripts for 3 IndoNLG tasks, namely: summarization, question-answering, and chit-chat (conversational), which you can find in [scripts](https://github.com/LazarusNLP/IndoT5/tree/main/scripts).
166 | 
167 | ### Summarization
168 | 
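169 | To fine-tune for summarization, run the following command and modify accordingly. The inline `#` comments are for explanation only; remove them before running, since a comment placed after `\` breaks the shell line continuation: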
170 | 
171 | ```sh
172 | python scripts/run_summarization.py \
173 |     --model-checkpoint LazarusNLP/IndoNanoT5-base \ # pre-trained model checkpoint
174 |     --dataset-name LazarusNLP/indonlg \ # Hugging Face 🤗 dataset name
175 |     --dataset-config indosum \ # dataset config
176 |     --input-column-name input \ # input column (text passage) name in dataset
177 |     --target-column-name target \ # target column (summary) name in dataset
178 |     --input-max-length 512 \
179 |     --target-max-length 512 \
180 |     --num-beams 5 \ # beam width during beam search
181 |     --output-dir outputs/indo-nanot5-indosum \
182 |     --num-train-epochs 5 \
183 |     --optim adamw_torch_fused \ # any optimizer supported in Hugging Face 🤗 transformers
184 |     --learning-rate 1e-3 \
185 |     --weight-decay 0.01 \
186 |     --per-device-train-batch-size 8 \
187 |     --per-device-eval-batch-size 16 \
188 |     --hub-model-id LazarusNLP/IndoNanoT5-base-IndoSum # Hugging Face 🤗 Hub repo name
189 | ```
190 | 
191 | IndoNLG summarization recipes are provided [here](https://github.com/LazarusNLP/IndoT5/blob/main/run_summarization.sh).
192 | 
193 | ### Question-Answering
194 | 
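195 | To fine-tune for question-answering, run the following command and modify accordingly. The inline `#` comments are for explanation only; remove them before running, since a comment placed after `\` breaks the shell line continuation: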
196 | 
197 | ```sh
198 | python scripts/run_qa.py \
199 |     --model-checkpoint LazarusNLP/IndoNanoT5-base \
200 |     --dataset-name LazarusNLP/indonlg \
201 |     --dataset-config question_answering \
202 |     --context-column-name context \ # context/passage column name
203 |     --question-column-name input \ # question column name
204 |     --answer-column-name references \ # answer column name, must be list
205 |     --id-column-name gem_id \ # question-answer pair id
206 |     --input-max-length 512 \
207 |     --target-max-length 512 \
208 |     --num-beams 5 \
209 |     --output-dir outputs/indo-nanot5-tydiqa \
210 |     --num-train-epochs 50 \
211 |     --optim adamw_torch_fused \
212 |     --learning-rate 1e-5 \
213 |     --weight-decay 0.01 \
214 |     --per-device-train-batch-size 8 \
215 |     --per-device-eval-batch-size 16 \
216 |     --hub-model-id LazarusNLP/IndoNanoT5-base-TyDiQA
217 | ```
218 | 
219 | The IndoNLG question-answering recipe is provided [here](https://github.com/LazarusNLP/IndoT5/blob/main/run_qa.sh).
220 | 
221 | ### Chit-chat
222 | 
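223 | To fine-tune for chit-chat, run the following command and modify accordingly. The inline `#` comments are for explanation only; remove them before running, since a comment placed after `\` breaks the shell line continuation: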
224 | 
225 | ```sh
226 | python scripts/run_chitchat.py \
227 |     --model-checkpoint LazarusNLP/IndoNanoT5-base \
228 |     --dataset-name LazarusNLP/indonlg \
229 |     --dataset-config xpersona \
230 |     --context-column-name context \ # context/persona column name
231 |     --question-column-name input \ # conversation history/dialogues column name
232 |     --answer-column-name references \ # response column name
233 |     --use-persona \ # whether to use persona or not
234 |     --input-max-length 512 \
235 |     --target-max-length 512 \
236 |     --num-beams 5 \
237 |     --output-dir outputs/indo-nanot5-xpersona \
238 |     --num-train-epochs 50 \
239 |     --optim adamw_torch_fused \
240 |     --learning-rate 1e-5 \
241 |     --weight-decay 0.01 \
242 |     --per-device-train-batch-size 8 \
243 |     --per-device-eval-batch-size 16 \
244 |     --hub-model-id LazarusNLP/IndoNanoT5-base-XPersona
245 | ```
246 | 
247 | ## Acknowledgements
248 | 
249 | Thanks to [@PiotrNawrot](https://github.com/PiotrNawrot) and [@Birch-san](https://github.com/Birch-san) for the engaging discussion and ideas.
250 | 
251 | ## References
252 | 
253 | ```bibtex
254 | @article{Nawrot2023nanoT5AP,
255 |   title={nanoT5: A PyTorch Framework for Pre-training and Fine-tuning T5-style Models with Limited Resources},
256 |   author={Piotr Nawrot},
257 |   journal={ArXiv},
258 |   year={2023},
259 |   volume={abs/2309.02373},
260 | }
261 | ```
262 | 
263 | ## Credits
264 | 
265 | IndoT5 is developed with love by:
266 | 
267 | <div style="display: flex;">
268 | <a href="https://github.com/anantoj">
269 |     <img src="https://github.com/anantoj.png" alt="GitHub Profile" style="border-radius: 50%;width: 64px;border: solid 0px #fff;margin:0 4px;">
270 | </a>
271 | 
272 | <a href="https://github.com/DavidSamuell">
273 |     <img src="https://github.com/DavidSamuell.png" alt="GitHub Profile" style="border-radius: 50%;width: 64px;border: solid 0px #fff;margin:0 4px;">
274 | </a>
275 | 
276 | <a href="https://github.com/stevenlimcorn">
277 |     <img src="https://github.com/stevenlimcorn.png" alt="GitHub Profile" style="border-radius: 50%;width: 64px;border: solid 0px #fff;margin:0 4px;">
278 | </a>
279 | 
280 | <a href="https://github.com/w11wo">
281 |     <img src="https://github.com/w11wo.png" alt="GitHub Profile" style="border-radius: 50%;width: 64px;border: solid 0px #fff;margin:0 4px;">
282 | </a>
283 | </div>


--------------------------------------------------------------------------------