├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── configs ├── deepspeed_eval_config.yaml ├── deepspeed_eval_config_bf16.yaml ├── deepspeed_eval_config_zero3_bf16.yaml ├── deepspeed_train_config.yaml └── deepspeed_train_config_bf16.yaml ├── evals ├── alma_13b.sh ├── alma_13b_lora.sh ├── alma_13b_lora_no_parallel.sh ├── alma_13b_no_parallel.sh ├── alma_13b_r.sh ├── alma_13b_r_wmt23.sh ├── alma_7b.sh ├── alma_7b_lora.sh ├── alma_7b_lora_no_parallel.sh ├── alma_7b_no_parallel.sh ├── eval_generation.sh ├── eval_generation_wmt23.sh ├── eval_other_models.sh └── llama-2-13b-5-shot.sh ├── figures ├── alma.jpg ├── alma_logo.png ├── alma_origin_logo.png ├── almar.png └── xalma.png ├── human_written_data ├── Filtered-5-shot │ ├── shots.cs-en.json │ ├── shots.de-en.json │ ├── shots.de-fr.json │ ├── shots.en-cs.json │ ├── shots.en-de.json │ ├── shots.en-ha.json │ ├── shots.en-is.json │ ├── shots.en-ja.json │ ├── shots.en-ru.json │ ├── shots.en-uk.json │ ├── shots.en-zh.json │ ├── shots.fr-de.json │ ├── shots.ha-en.json │ ├── shots.is-en.json │ ├── shots.ja-en.json │ ├── shots.ru-en.json │ ├── shots.uk-en.json │ └── shots.zh-en.json ├── HW-5-shot │ ├── shots.cs-en.json │ ├── shots.de-en.json │ ├── shots.en-cs.json │ ├── shots.en-de.json │ ├── shots.en-is.json │ ├── shots.en-ru.json │ ├── shots.en-zh.json │ ├── shots.is-en.json │ ├── shots.ru-en.json │ └── shots.zh-en.json ├── csen │ ├── test.cs-en.cs │ ├── test.cs-en.en │ ├── test.cs-en.json │ ├── test.en-cs.cs │ ├── test.en-cs.en │ ├── test.en-cs.json │ ├── train.cs-en.cs │ ├── train.cs-en.en │ ├── train.cs-en.json │ └── valid.cs-en.json ├── deen │ ├── test.de-en.json │ ├── test.en-de.json │ ├── train.de-en.de │ ├── train.de-en.en │ ├── train.de-en.json │ └── valid.de-en.json ├── isen │ ├── test.en-is.json │ ├── test.is-en.json │ ├── train.is-en.en │ ├── train.is-en.is │ └── train.is-en.json ├── ruen │ ├── test.en-ru.json │ ├── test.ru-en.json │ ├── train.ru-en.en │ ├── train.ru-en.json │ ├── train.ru-en.ru │ └── valid.ru-en.json └── zhen │ ├── test.en-zh.json │ ├── test.zh-en.json │ ├── train.zh-en.en │ ├── train.zh-en.json │ ├── train.zh-en.zh │ └── valid.zh-en.json ├── install_alma.sh ├── modeling_xalma.py ├── outputs ├── wmt22_outputs │ ├── ALMA-13B-LoRA │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── ALMA-13B-R │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── GPT-4-1106-preview │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── MADLAD-10B │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de 
│ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── README.md │ ├── WMT_Winners │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── gpt3.5-text-davinci-003 │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ └── wmt-testset │ │ ├── csen │ │ ├── test.cs-en.cs │ │ └── test.cs-en.en │ │ ├── deen │ │ ├── test.de-en.de │ │ └── test.de-en.en │ │ ├── encs │ │ ├── test.en-cs.cs │ │ └── test.en-cs.en │ │ ├── ende │ │ ├── test.en-de.de │ │ └── test.en-de.en │ │ ├── enis │ │ ├── test.en-is.en │ │ └── test.en-is.is │ │ ├── enru │ │ ├── test.en-ru.en │ │ └── test.en-ru.ru │ │ ├── enzh │ │ ├── test.en-zh.en │ │ └── test.en-zh.zh │ │ ├── isen │ │ ├── test.is-en.en │ │ └── test.is-en.is │ │ ├── ruen │ │ ├── test.ru-en.en │ │ └── test.ru-en.ru │ │ └── zhen │ │ ├── test.zh-en.en │ │ └── test.zh-en.zh └── wmt23_outputs │ ├── ALMA-13B-R │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ ├── MADLAD-10B │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ ├── TowerInstruct-7B-v0.1 │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ ├── WMT_Winners │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ └── wmt-testset │ ├── deen │ ├── test.de-en.de │ └── test.de-en.en │ ├── ende │ ├── test.en-de.de │ └── test.en-de.en │ ├── enru │ ├── test.en-ru.en │ └── test.en-ru.ru │ ├── enzh │ ├── test.en-zh.en │ └── test.en-zh.zh │ ├── ruen │ ├── test.ru-en.en │ └── test.ru-en.ru │ └── zhen │ ├── test.zh-en.en │ └── test.zh-en.zh ├── run_cpo_llmmt.py ├── run_llmmt.py ├── runs ├── cpo_ft.sh ├── mono_ft.sh ├── parallel_ft.sh └── parallel_ft_lora.sh └── utils ├── __init__.py ├── arguments.py ├── cpo_config.py ├── cpo_trainer.py ├── trainer_llmmt.py ├── ul2collator.py └── utils.py /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | ALMA 3 |

4 | 5 |
6 | 7 | # ALMA: Advanced Language Model-based translator 8 |
9 | 10 |

11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | follow on Twitter 21 |

22 | 23 | ALMA has three generations: ALMA (1st), ALMA-R (2nd), and **X-ALMA (3rd, NEW!)**. 24 | 25 | [**ALMA**](https://arxiv.org/abs/2309.11674) (**A**dvanced **L**anguage **M**odel-based Tr**A**nslator) is a many-to-many LLM-based translation model, which adopts a new translation paradigm: it begins with fine-tuning on monolingual data and is further optimized using high-quality parallel data. This two-step fine-tuning process ensures strong translation performance. 26 | 27 | **[ALMA-R](https://arxiv.org/pdf/2401.08417v2.pdf)** builds upon ALMA models with further LoRA fine-tuning using our proposed **Contrastive Preference Optimization (CPO)**, as opposed to the supervised fine-tuning used in ALMA. CPO fine-tuning requires our [triplet preference data](https://huggingface.co/datasets/haoranxu/ALMA-R-Preference) for preference learning. ALMA-R can now match or even exceed GPT-4 and the WMT winners! 28 | 29 | **[X-ALMA](https://arxiv.org/pdf/2410.03115) (NEW!) extends ALMA(-R) from 6 languages to 50 and ensures top-tier performance across all 50 diverse languages, regardless of their resource levels. This is achieved by a plug-and-play language-specific module architecture and a carefully designed 5-step training recipe with a novel *Adaptive-Rejection Preference Optimization* method.** 30 | 31 | *Old ALMA repos:* 32 | - The original **ALMA** repository can be found [here](https://github.com/fe1ixxu/ALMA/tree/a3cc7877752779346312bb07798172eadc83d692). 33 | - The original **ALMA-R** repository can be found [here](https://github.com/fe1ixxu/ALMA/tree/ac120eb44c609ad9a386d617172d40432c2c0df6). 34 | 35 | # News 🌟 36 | ⭐ Jan. 22, 2025: **X-ALMA** has been accepted at **ICLR 2025**! 37 | 38 | ⭐ Oct. 6, 2024: **X-ALMA** is out! Please find the [paper here](https://arxiv.org/pdf/2410.03115) and [models & datasets here](https://huggingface.co/collections/haoranxu/x-alma-66fde464ef90be465920abaa). 39 | 40 | ⭐ Jun. 20, 2024: We want to give a shout-out to [SimPO](https://arxiv.org/pdf/2405.14734), which shares a similar reference-free preference learning framework with CPO but is more stable thanks to its length normalization and target reward margin. The most exciting part is that CPO and SimPO can potentially be used together! Learn more about [CPO-SimPO](https://github.com/fe1ixxu/CPO_SIMPO)! 41 | 42 | ⭐ May 1, 2024: The CPO paper has been accepted at **ICML 2024**! 43 | 44 | ⭐ Mar. 22, 2024: The CPO method is now merged into [huggingface trl](https://github.com/huggingface/trl)! See details [here](https://github.com/huggingface/trl/pull/1382). 45 | 46 | ⭐ Jan. 16, 2024: **ALMA-R** is released! Please check out our new paper for more details: [Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation](https://arxiv.org/abs/2401.08417). 47 | 48 | ⭐ Jan. 16, 2024: The ALMA paper, [A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models](https://arxiv.org/abs/2309.11674), has been accepted at **ICLR 2024**! Check out more details [here](https://openreview.net/forum?id=farT6XXntP)!
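Since CPO is mentioned throughout the news above (and implemented in this repo in `utils/cpo_trainer.py` and `run_cpo_llmmt.py`), here is a rough, illustrative sketch of the idea behind its objective: a reference-free preference term, as in DPO but without the frozen reference model, plus a negative log-likelihood term on the preferred translation. The function and tensor names below are ours for illustration, not this repository's API; see the CPO paper for the exact formulation.

```
import torch
import torch.nn.functional as F

def cpo_loss_sketch(chosen_logps, rejected_logps, beta=0.1):
    """Illustrative CPO-style objective (a sketch, not this repo's exact code).

    chosen_logps / rejected_logps: log-probabilities of the preferred and
    dispreferred translations under the current policy, shape (batch,).
    """
    # Preference term: DPO-like, but with no frozen reference model.
    prefer_loss = -F.logsigmoid(beta * (chosen_logps - rejected_logps)).mean()
    # Behavior-cloning term: NLL on the preferred translation.
    nll_loss = -chosen_logps.mean()
    return prefer_loss + nll_loss
```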
49 | 50 | # Contents 📄 51 | - [Download ALMA Models and Dataset](#download-alma-models-and-dataset-) 52 | - [A Quick Start](#a-quick-start) 53 | - [Environment Setup](#environment-setup-) 54 | - [Evaluation](#evaluation-) 55 | - [Training](#training-) 56 | - [FAQs](#faqs-) 57 | 58 | :star: Supports :star: 59 | - AMD and Nvidia Cards 60 | - Data Parallel Evaluation 61 | - Also supports LLaMA-1, LLaMA-2, OPT, Falcon, BLOOM, and MPT 62 | - LoRA Fine-tuning 63 | - Monolingual data fine-tuning and parallel data fine-tuning 64 | 65 |

66 | 67 |

68 | 69 | # Download ALMA Models and Dataset 🚀 70 | 71 | We release seven translation models in the ALMA series: 72 | 73 | Model checkpoints are released on Hugging Face: 74 | | Models | Base Model Link | LoRA Link | 75 | |:-------------:|:---------------:|:---------:| 76 | | ALMA-7B (1st gen) | [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | - | 77 | | ALMA-7B-LoRA (1st gen) | [haoranxu/ALMA-7B-Pretrain](https://huggingface.co/haoranxu/ALMA-7B-Pretrain) | [haoranxu/ALMA-7B-Pretrain-LoRA](https://huggingface.co/haoranxu/ALMA-7B-Pretrain-LoRA) | 78 | | ALMA-7B-R (2nd gen) | [haoranxu/ALMA-7B-R (LoRA merged)](https://huggingface.co/haoranxu/ALMA-7B-R) | - | 79 | | ALMA-13B (1st gen) | [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | - | 80 | | ALMA-13B-LoRA (1st gen) | [haoranxu/ALMA-13B-Pretrain](https://huggingface.co/haoranxu/ALMA-13B-Pretrain) | [haoranxu/ALMA-13B-Pretrain-LoRA](https://huggingface.co/haoranxu/ALMA-13B-Pretrain-LoRA) | 81 | | ALMA-13B-R (2nd gen) | [haoranxu/ALMA-13B-R (LoRA merged)](https://huggingface.co/haoranxu/ALMA-13B-R) | - | 82 | | **X-ALMA (NEW, 3rd gen)** | [X-ALMA Models](https://huggingface.co/collections/haoranxu/x-alma-66fde464ef90be465920abaa) | - | 83 | 84 | **Note that `ALMA-7B-Pretrain` and `ALMA-13B-Pretrain` are NOT translation models. They have only undergone stage 1 monolingual fine-tuning (20B tokens for the 7B model and 12B tokens for the 13B model) and should be used in conjunction with their LoRA models.** 85 | 86 | *We have also provided the WMT'22 and WMT'23 translation outputs from ALMA-13B-LoRA and ALMA-13B-R in the `outputs` directory. These outputs also include those of our baselines and can be directly accessed and used for subsequent evaluations.* 87 | 88 | Datasets used by ALMA and ALMA-R are also released on Hugging Face now (NEW!): 89 | | Datasets | Train / Validation | Test | 90 | |:-------------:|:---------------:|:---------:| 91 | | ALMA Human-Written Parallel Data | [Parallel train and validation](https://huggingface.co/datasets/haoranxu/ALMA-Human-Parallel) | [WMT'22](https://huggingface.co/datasets/haoranxu/WMT22-Test) | 92 | | ALMA-R Triplet Preference Data | [Triplet Preference Data](https://huggingface.co/datasets/haoranxu/ALMA-R-Preference) | [WMT'22](https://huggingface.co/datasets/haoranxu/WMT22-Test) and [WMT'23](https://huggingface.co/datasets/haoranxu/WMT23-Test) | 93 | | **X-ALMA Data** | 50-language [parallel data](https://huggingface.co/datasets/haoranxu/X-ALMA-Parallel-Data) and [preference data](https://huggingface.co/datasets/haoranxu/X-ALMA-Preference) | [WMT'23](https://huggingface.co/datasets/haoranxu/WMT23-Test) and [FLORES-200](https://huggingface.co/datasets/haoranxu/X-ALMA-Parallel-Data) | 94 | 95 | 96 | # A Quick Start 97 | X-ALMA is designed with a plug-and-play architecture consisting of two components: a base model and language-specific modules, where each module is shared by the languages within a group. 98 | There are three ways to load X-ALMA for translation. Below is an example of translating "我爱机器翻译。" into English (X-ALMA is also able to do multilingual open-ended QA).
99 | 100 | **The first way**: loading the merged model, where the language-specific module has been merged into the base model (Recommended): 101 | ``` 102 | import torch 103 | from transformers import AutoModelForCausalLM 104 | from transformers import AutoTokenizer 105 | from peft import PeftModel 106 | 107 | GROUP2LANG = { 108 | 1: ["da", "nl", "de", "is", "no", "sv", "af"], 109 | 2: ["ca", "ro", "gl", "it", "pt", "es"], 110 | 3: ["bg", "mk", "sr", "uk", "ru"], 111 | 4: ["id", "ms", "th", "vi", "mg", "fr"], 112 | 5: ["hu", "el", "cs", "pl", "lt", "lv"], 113 | 6: ["ka", "zh", "ja", "ko", "fi", "et"], 114 | 7: ["gu", "hi", "mr", "ne", "ur"], 115 | 8: ["az", "kk", "ky", "tr", "uz", "ar", "he", "fa"], 116 | } 117 | LANG2GROUP = {lang: str(group) for group, langs in GROUP2LANG.items() for lang in langs} 118 | group_id = LANG2GROUP["zh"] 119 | 120 | model = AutoModelForCausalLM.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", torch_dtype=torch.float16, device_map="auto") 121 | tokenizer = AutoTokenizer.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", padding_side='left') 122 | 123 | # Add the source sentence into the prompt template 124 | prompt="Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:" 125 | 126 | # X-ALMA needs a chat template, but ALMA and ALMA-R do not. 127 | chat_style_prompt = [{"role": "user", "content": prompt}] 128 | prompt = tokenizer.apply_chat_template(chat_style_prompt, tokenize=False, add_generation_prompt=True) 129 | 130 | input_ids = tokenizer(prompt, return_tensors="pt", padding=True, max_length=40, truncation=True).input_ids.cuda() 131 | 132 | # Translation 133 | with torch.no_grad(): 134 | generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9) 135 | outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 136 | print(outputs) 137 | ``` 138 | 139 | **The second way**: loading the base model and a language-specific module (Recommended): 140 | ``` 141 | model = AutoModelForCausalLM.from_pretrained("haoranxu/X-ALMA-13B-Pretrain", torch_dtype=torch.float16, device_map="auto") 142 | model = PeftModel.from_pretrained(model, f"haoranxu/X-ALMA-13B-Group{group_id}") 143 | tokenizer = AutoTokenizer.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", padding_side='left') 144 | ``` 145 | 146 | **The third way**: loading the base model with all language-specific modules, MoE-style (requires large GPU memory): 147 | ``` 148 | from modeling_xalma import XALMAForCausalLM 149 | model = XALMAForCausalLM.from_pretrained("haoranxu/X-ALMA", torch_dtype=torch.float16, device_map="auto") 150 | tokenizer = AutoTokenizer.from_pretrained("haoranxu/X-ALMA", padding_side='left') 151 | 152 | # Pass `lang="zh"` to tell the model which language group to use during generation (needed only for this third loading method). 153 | generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9, lang="zh") 154 | ``` 155 | 156 | 157 | The ALMA and ALMA-R translation prompt is: 158 | ``` 159 | Translate this from <source language> into <target language>: 160 | <source language>: <source sentence> 161 | <target language>: 162 | ``` 163 | 164 | The X-ALMA translation prompt is: 165 | ``` 166 | [INST] Translate this from <source language> into <target language>: 167 | <source language>: <source sentence> 168 | <target language>: [/INST] 169 | ``` 170 | 171 | # Environment Setup 🔧 172 | ``` 173 | conda create -n xalma python=3.11 174 | conda activate xalma 175 | ``` 176 | If you use **AMD GPUs**, please first install torch with ROCm.
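For example, a ROCm build of torch can typically be installed from the dedicated PyTorch wheel index. This is a sketch: the ROCm version in the URL below is an assumption, so pick the index matching your system from the official PyTorch installation page.

```
# Example only: replace rocm6.2 with the ROCm version installed on your machine.
pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.2
```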
177 | 178 | Then install the other dependencies: 179 | ``` 180 | bash install_alma.sh 181 | ``` 182 | # Evaluation 💻 183 | ### Evaluation on X-ALMA 184 | This is a quick start for evaluating our X-ALMA model. To produce translation outputs for FLORES-200 in both the en→cs and cs→en directions, run the following command (to evaluate on WMT'23 instead, simply pass `--override_test_data_path haoranxu/WMT23-Test`). **Note that you don't need to enable `--chat_style` for ALMA and ALMA-R; it is only for X-ALMA.** 185 | 186 | ``` 187 | accelerate launch --config_file configs/deepspeed_eval_config_bf16.yaml \ 188 | run_llmmt.py \ 189 | --model_name_or_path haoranxu/X-ALMA-13B-Group5 \ 190 | --do_predict \ 191 | --low_cpu_mem_usage \ 192 | --language_pairs en-cs,cs-en \ 193 | --mmt_data_path placeholder \ 194 | --override_test_data_path haoranxu/FLORES-200 \ 195 | --per_device_eval_batch_size 1 \ 196 | --output_dir ./your_output_dir/ \ 197 | --predict_with_generate \ 198 | --max_new_tokens 256 \ 199 | --max_source_length 256 \ 200 | --bf16 \ 201 | --seed 42 \ 202 | --num_beams 5 \ 203 | --overwrite_cache \ 204 | --overwrite_output_dir \ 205 | --chat_style # `--chat_style` is only for X-ALMA; you don't need to enable it for ALMA and ALMA-R 206 | 207 | ``` 208 | The generated outputs will be saved in `your_output_dir`. The translation file for the en→cs direction is named `test-en-cs`, and the file for the cs→en direction is `test-cs-en`. 209 | The variable `${test_pairs}` used in the evaluation scripts denotes the translation directions you wish to evaluate; it supports testing multiple directions at once, e.g., `de-en,en-de,en-cs,cs-en`. 210 | 211 | Please see more examples for evaluating ALMA(-R) under the `./evals` folder. 212 | 213 | **Note that this performs data-parallel evaluation supported by DeepSpeed: that is, it places a single full copy of the model on each available GPU and splits batches across GPUs, so evaluating on K GPUs is roughly K times faster than on one.** For those with limited GPU memory, we offer an alternative: pass `--multi_gpu_one_model` to distribute a single model across multiple GPUs. Please see the evaluation examples in `evals/alma_13b_r.sh` or the `evals/*no_parallel` files. 214 | 215 | # Training 🔥 216 | Here we show how to: 217 | - run Contrastive Preference Optimization on top of ALMA models (ALMA→ALMA-R) 218 | - fine-tune LLaMA-2-7B on monolingual OSCAR data (stage 1) 219 | - fine-tune on human-written parallel data once stage 1 is completed, including full-weight and LoRA fine-tuning (stage 2) 220 | 221 | Please note that we do not share the training process for X-ALMA specifically, as it would require releasing numerous intermediate checkpoints, making the process overly complex. 222 | 223 | ### CPO Fine-Tuning 224 | To run CPO fine-tuning with our triplet preference data, use the following command: 225 | ``` 226 | bash runs/cpo_ft.sh ${your_output_dir} 227 | ``` 228 | ### OSCAR Monolingual Fine-Tuning 229 | To execute the OSCAR monolingual fine-tuning, use the following command: 230 | ``` 231 | bash runs/mono_ft.sh ${your_output_dir} 232 | ``` 233 | ### Parallel Data Fine-Tuning (Full-Weight) 234 | Once the monolingual data fine-tuning is complete, proceed to the parallel data fine-tuning using the full-weight approach. Execute the following command: 235 | ``` 236 | bash runs/parallel_ft.sh ${your_output_dir} ${training_pairs} 237 | ``` 238 | where `training_pairs` denotes the translation directions you want to train on.
The default is all 10 directions: `de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru`. 239 | 240 | ### Parallel Data Fine-Tuning (LoRA) 241 | In stage 2, there is also an option to employ LoRA for fine-tuning on the parallel data. To do so, execute the following command: 242 | ``` 243 | bash runs/parallel_ft_lora.sh ${your_output_dir} ${training_pairs} 244 | ``` 245 | 246 | # FAQs ❓ 247 | ### What language directions do ALMA and ALMA-R support? 248 | Currently, ALMA supports 10 directions: English↔German, English↔Czech, English↔Icelandic, English↔Chinese, English↔Russian. However, it may surprise us in other directions :) 249 | 250 | ### What language directions does X-ALMA support? 251 | X-ALMA supports 50 languages and 98 translation directions (into and from English): da,nl,de,is,no,sv,af,ca,ro,gl,it,pt,es,bg,mk,sr,uk,ru,id,ms,th,vi,mg,fr,hu,el,cs,pl,lt,lv,ka,zh,ja,ko,fi,et,gu,hi,mr,ne,ur,az,kk,ky,tr,uz,ar,he,fa 252 | 253 | ### When should I stop fine-tuning at stage 1? 254 | Our 7B and 13B models are trained on 20B and 12B tokens, respectively. However, as indicated in the paper, fine-tuning on 1B tokens should already boost performance substantially. The number of steps required to fine-tune on 1 billion tokens also varies with your batch size. In our case, the effective batch size is 16 GPUs × 4 (batch size per GPU) × 4 (gradient accumulation steps) = 256 sequences. With a sequence length of 512, we need approximately 10^9 / (256 × 512) ≈ 8,000 steps to train on 1 billion tokens. You may, of course, fine-tune for more steps to get better performance. 255 | 256 | ### How do I decide the interleave probability at stage 1? 257 | Please find the rationale behind the stage 1 interleave probability selection in Appendix D.1 of the [paper](https://arxiv.org/pdf/2309.11674.pdf)! 258 | 259 | # Reference 260 | Please find more details about the ALMA models in our [paper](https://arxiv.org/abs/2309.11674) or in this [summary](https://notes.aimodels.fyi/alma-a-new-training-method-that-boosts-translation-performance-for-large-language-models/) of the paper. 261 | ``` 262 | @inproceedings{ 263 | xu2024a, 264 | title={A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models}, 265 | author={Haoran Xu and Young Jin Kim and Amr Sharaf and Hany Hassan Awadalla}, 266 | booktitle={The Twelfth International Conference on Learning Representations}, 267 | year={2024}, 268 | url={https://openreview.net/forum?id=farT6XXntP} 269 | } 270 | ``` 271 | 272 | Please also find more detailed information about the ALMA-R model and Contrastive Preference Optimization in the [paper](https://arxiv.org/pdf/2401.08417v2.pdf).
273 | ``` 274 | @inproceedings{ 275 | xu2024contrastive, 276 | title={Contrastive Preference Optimization: Pushing the Boundaries of {LLM} Performance in Machine Translation}, 277 | author={Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim}, 278 | booktitle={Forty-first International Conference on Machine Learning}, 279 | year={2024}, 280 | url={https://openreview.net/forum?id=51iwkioZpn} 281 | } 282 | ``` 283 | 284 | Please find details about X-ALMA in the latest [paper](https://arxiv.org/pdf/2410.03115) 285 | ``` 286 | @inproceedings{ 287 | xu2025xalma, 288 | title={X-{ALMA}: Plug \& Play Modules and Adaptive Rejection for Quality Translation at Scale}, 289 | author={Haoran Xu and Kenton Murray and Philipp Koehn and Hieu Hoang and Akiko Eriguchi and Huda Khayrallah}, 290 | booktitle={The Thirteenth International Conference on Learning Representations}, 291 | year={2025}, 292 | url={https://openreview.net/forum?id=csbf1p8xUq} 293 | } 294 | ``` 295 | -------------------------------------------------------------------------------- /configs/deepspeed_eval_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: MULTI_GPU 3 | downcast_bf16: 'no' 4 | gpu_ids: all 5 | machine_rank: 0 6 | main_training_function: main 7 | mixed_precision: fp16 8 | num_machines: 1 9 | num_processes: 8 10 | rdzv_backend: static 11 | same_network: true 12 | tpu_env: [] 13 | tpu_use_cluster: false 14 | tpu_use_sudo: false 15 | use_cpu: false 16 | -------------------------------------------------------------------------------- /configs/deepspeed_eval_config_bf16.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: MULTI_GPU 3 | downcast_bf16: 'no' 4 | gpu_ids: all 5 | machine_rank: 0 6 | main_training_function: main 7 | mixed_precision: bf16 8 | num_machines: 1 9 | num_processes: 8 10 | rdzv_backend: static 11 | same_network: true 12 | tpu_env: [] 13 | tpu_use_cluster: false 14 | tpu_use_sudo: false 15 | use_cpu: false 16 | -------------------------------------------------------------------------------- /configs/deepspeed_eval_config_zero3_bf16.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero3_save_16bit_model: false 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /configs/deepspeed_train_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 4 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: cpu 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | 
mixed_precision: fp16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /configs/deepspeed_train_config_bf16.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: cpu 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /evals/alma_13b.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | ## Generation 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml\ 8 | run_llmmt.py \ 9 | --model_name_or_path haoranxu/ALMA-13B \ 10 | --do_predict \ 11 | --low_cpu_mem_usage \ 12 | --language_pairs ${TEST_PAIRS} \ 13 | --mmt_data_path ./human_written_data/ \ 14 | --per_device_eval_batch_size 2 \ 15 | --output_dir ${OUTPUT_DIR} \ 16 | --predict_with_generate \ 17 | --max_new_tokens 256 \ 18 | --max_source_length 256 \ 19 | --fp16 \ 20 | --seed 42 \ 21 | --num_beams 5 \ 22 | --overwrite_cache \ 23 | --overwrite_output_dir \ 24 | 25 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 26 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 27 | run_llmmt.py \ 28 | --model_name_or_path haoranxu/ALMA-13B \ 29 | --do_predict \ 30 | --low_cpu_mem_usage \ 31 | --language_pairs zh-en \ 32 | --mmt_data_path ./human_written_data/ \ 33 | --per_device_eval_batch_size 2 \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --predict_with_generate \ 36 | --max_new_tokens 256 \ 37 | --max_source_length 512 \ 38 | --fp16 \ 39 | --seed 42 \ 40 | --num_beams 5 \ 41 | --overwrite_cache \ 42 | --overwrite_output_dir \ 43 | fi 44 | 45 | ## Evaluation (BLEU, COMET) 46 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_13b_lora.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 9 | --do_predict \ 10 | --low_cpu_mem_usage \ 11 | --language_pairs ${TEST_PAIRS} \ 12 | --mmt_data_path ./human_written_data/ \ 13 | --per_device_eval_batch_size 2 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --use_peft \ 16 | --peft_model_id 
haoranxu/ALMA-13B-Pretrain-LoRA \ 17 | --predict_with_generate \ 18 | --max_new_tokens 256 \ 19 | --max_source_length 256 \ 20 | --fp16 \ 21 | --seed 42 \ 22 | --num_beams 5 \ 23 | --overwrite_cache \ 24 | --overwrite_output_dir \ 25 | 26 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 27 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 28 | run_llmmt.py \ 29 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 30 | --do_predict \ 31 | --low_cpu_mem_usage \ 32 | --language_pairs zh-en \ 33 | --mmt_data_path ./human_written_data/ \ 34 | --per_device_eval_batch_size 2 \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --use_peft \ 37 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 38 | --predict_with_generate \ 39 | --max_new_tokens 256 \ 40 | --max_source_length 512 \ 41 | --fp16 \ 42 | --seed 42 \ 43 | --num_beams 5 \ 44 | --overwrite_cache \ 45 | --overwrite_output_dir \ 46 | fi 47 | 48 | ## Evaluation (BLEU, COMET) 49 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} 50 | -------------------------------------------------------------------------------- /evals/alma_13b_lora_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | python \ 5 | run_llmmt.py \ 6 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 7 | --do_predict \ 8 | --low_cpu_mem_usage \ 9 | --language_pairs ${TEST_PAIRS} \ 10 | --mmt_data_path ./human_written_data/ \ 11 | --per_device_eval_batch_size 2 \ 12 | --output_dir ${OUTPUT_DIR} \ 13 | --use_peft \ 14 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 15 | --predict_with_generate \ 16 | --max_new_tokens 256 \ 17 | --max_source_length 256 \ 18 | --fp16 \ 19 | --seed 42 \ 20 | --num_beams 5 \ 21 | --overwrite_cache \ 22 | --overwrite_output_dir \ 23 | --multi_gpu_one_model 24 | 25 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 26 | python \ 27 | run_llmmt.py \ 28 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 29 | --do_predict \ 30 | --low_cpu_mem_usage \ 31 | --language_pairs zh-en \ 32 | --mmt_data_path ./human_written_data/ \ 33 | --per_device_eval_batch_size 2 \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --use_peft \ 36 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 37 | --predict_with_generate \ 38 | --max_new_tokens 256 \ 39 | --max_source_length 512 \ 40 | --fp16 \ 41 | --seed 42 \ 42 | --num_beams 5 \ 43 | --overwrite_cache \ 44 | --overwrite_output_dir \ 45 | --multi_gpu_one_model 46 | fi 47 | 48 | ## Evaluation (BLEU, COMET) 49 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} 50 | -------------------------------------------------------------------------------- /evals/alma_13b_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | ## Generation 5 | python \ 6 | run_llmmt.py \ 7 | --model_name_or_path haoranxu/ALMA-13B \ 8 | --do_predict \ 9 | --low_cpu_mem_usage \ 10 | --language_pairs ${TEST_PAIRS} \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --per_device_eval_batch_size 2 \ 13 | --output_dir ${OUTPUT_DIR} \ 14 | --predict_with_generate \ 15 | --max_new_tokens 256 \ 16 | --max_source_length 256 \ 17 | --fp16 \ 18 | --seed 42 \ 19 | --num_beams 5 \ 20 | --overwrite_cache \ 21 | --overwrite_output_dir \ 22 | 
--multi_gpu_one_model 23 | 24 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 25 | python \ 26 | run_llmmt.py \ 27 | --model_name_or_path haoranxu/ALMA-13B \ 28 | --do_predict \ 29 | --low_cpu_mem_usage \ 30 | --language_pairs zh-en \ 31 | --mmt_data_path ./human_written_data/ \ 32 | --per_device_eval_batch_size 2 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 256 \ 36 | --max_source_length 512 \ 37 | --fp16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir \ 42 | --multi_gpu_one_model 43 | fi 44 | 45 | ## Evaluation (BLEU, COMET) 46 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_13b_r.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-r/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-13B-R \ 9 | --do_predict \ 10 | --low_cpu_mem_usage \ 11 | --language_pairs ${TEST_PAIRS} \ 12 | --mmt_data_path ./human_written_data/ \ 13 | --per_device_eval_batch_size 4 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --predict_with_generate \ 16 | --max_new_tokens 256 \ 17 | --max_source_length 256 \ 18 | --bf16 \ 19 | --seed 42 \ 20 | --num_beams 5 \ 21 | --overwrite_cache \ 22 | --overwrite_output_dir 23 | 24 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 25 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 26 | run_llmmt.py \ 27 | --model_name_or_path haoranxu/ALMA-13B-R \ 28 | --do_predict \ 29 | --low_cpu_mem_usage \ 30 | --language_pairs zh-en \ 31 | --mmt_data_path ./human_written_data/ \ 32 | --per_device_eval_batch_size 4 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 256 \ 36 | --max_source_length 512 \ 37 | --bf16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir 42 | fi 43 | 44 | ## Evaluation (BLEU, COMET) 45 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} 46 | -------------------------------------------------------------------------------- /evals/alma_13b_r_wmt23.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-r-wmt23/"} 2 | # random port between 30000 and 50000 3 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 4 | 5 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml\ 6 | run_llmmt.py \ 7 | --model_name_or_path haoranxu/ALMA-13B-R \ 8 | --do_predict \ 9 | --low_cpu_mem_usage \ 10 | --language_pairs en-ru,en-zh \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --override_test_data_path haoranxu/WMT23-Test \ 13 | --per_device_eval_batch_size 4 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --predict_with_generate \ 16 | --max_new_tokens 256 \ 17 | --max_source_length 256 \ 18 | --bf16 \ 19 | --seed 42 \ 20 | --num_beams 5 \ 21 | --overwrite_cache \ 22 | --overwrite_output_dir \ 23 | 24 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 25 | run_llmmt.py \ 26 | --model_name_or_path haoranxu/ALMA-13B-R \ 27 | --do_predict \ 28 
| --low_cpu_mem_usage \ 29 | --language_pairs ru-en,de-en,en-de,zh-en \ 30 | --mmt_data_path ./human_written_data/ \ 31 | --override_test_data_path haoranxu/WMT23-Test \ 32 | --per_device_eval_batch_size 4 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 512 \ 36 | --max_source_length 1024 \ 37 | --bf16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir \ 42 | 43 | bash ./evals/eval_generation_wmt23.sh ${OUTPUT_DIR} de-en,zh-en,ru-en,en-de,en-ru,en-zh 44 | -------------------------------------------------------------------------------- /evals/alma_7b.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | ## Generation 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 8 | run_llmmt.py \ 9 | --model_name_or_path haoranxu/ALMA-7B \ 10 | --do_predict \ 11 | --low_cpu_mem_usage \ 12 | --language_pairs ${TEST_PAIRS} \ 13 | --mmt_data_path ./human_written_data/ \ 14 | --per_device_eval_batch_size 2 \ 15 | --output_dir ${OUTPUT_DIR} \ 16 | --predict_with_generate \ 17 | --max_new_tokens 256 \ 18 | --max_source_length 256 \ 19 | --fp16 \ 20 | --seed 42 \ 21 | --num_beams 5 \ 22 | --overwrite_cache \ 23 | --overwrite_output_dir 24 | 25 | ## Some tokenized zh source sentence is longer than 256, here we set 512 26 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 27 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 28 | run_llmmt.py \ 29 | --model_name_or_path "haoranxu/ALMA-7B" \ 30 | --do_predict \ 31 | --low_cpu_mem_usage \ 32 | --language_pairs zh-en \ 33 | --mmt_data_path ./human_written_data/ \ 34 | --per_device_eval_batch_size 2 \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --predict_with_generate \ 37 | --max_new_tokens 256 \ 38 | --max_source_length 512 \ 39 | --fp16 \ 40 | --seed 42 \ 41 | --num_beams 5 \ 42 | --overwrite_cache \ 43 | --overwrite_output_dir 44 | fi 45 | 46 | ## Evaluation (BLEU, COMET) 47 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_7b_lora.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | # random port between 30000 and 50000 5 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 6 | 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 8 | run_llmmt.py \ 9 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 10 | --do_predict \ 11 | --low_cpu_mem_usage \ 12 | --language_pairs ${TEST_PAIRS} \ 13 | --mmt_data_path ./human_written_data/ \ 14 | --per_device_eval_batch_size 2 \ 15 | --output_dir ${OUTPUT_DIR} \ 16 | --use_peft \ 17 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 18 | --predict_with_generate \ 19 | --max_new_tokens 256 \ 20 | --max_source_length 256 \ 21 | --fp16 \ 22 | --seed 42 \ 23 | --num_beams 5 \ 24 | --overwrite_cache \ 25 | --overwrite_output_dir 26 | 27 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 28 | accelerate launch --main_process_port ${port} --config_file 
configs/deepspeed_eval_config_bf16.yaml \ 29 | run_llmmt.py \ 30 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 31 | --do_predict \ 32 | --low_cpu_mem_usage \ 33 | --language_pairs zh-en \ 34 | --mmt_data_path ./human_written_data/ \ 35 | --per_device_eval_batch_size 2 \ 36 | --output_dir ${OUTPUT_DIR} \ 37 | --use_peft \ 38 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 39 | --predict_with_generate \ 40 | --max_new_tokens 256 \ 41 | --max_source_length 512 \ 42 | --fp16 \ 43 | --seed 42 \ 44 | --num_beams 5 \ 45 | --overwrite_cache \ 46 | --overwrite_output_dir 47 | 48 | fi 49 | 50 | ## Evaluation (BLEU, COMET) 51 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_7b_lora_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | python \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 9 | --do_predict \ 10 | --low_cpu_mem_usage \ 11 | --language_pairs ${TEST_PAIRS} \ 12 | --mmt_data_path ./human_written_data/ \ 13 | --per_device_eval_batch_size 2 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --use_peft \ 16 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 17 | --predict_with_generate \ 18 | --max_new_tokens 256 \ 19 | --max_source_length 256 \ 20 | --fp16 \ 21 | --seed 42 \ 22 | --num_beams 5 \ 23 | --overwrite_cache \ 24 | --overwrite_output_dir \ 25 | --multi_gpu_one_model 26 | 27 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 28 | python \ 29 | run_llmmt.py \ 30 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 31 | --do_predict \ 32 | --low_cpu_mem_usage \ 33 | --language_pairs zh-en \ 34 | --mmt_data_path ./human_written_data/ \ 35 | --per_device_eval_batch_size 2 \ 36 | --output_dir ${OUTPUT_DIR} \ 37 | --use_peft \ 38 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 39 | --predict_with_generate \ 40 | --max_new_tokens 256 \ 41 | --max_source_length 512 \ 42 | --fp16 \ 43 | --seed 42 \ 44 | --num_beams 5 \ 45 | --overwrite_cache \ 46 | --overwrite_output_dir \ 47 | --multi_gpu_one_model 48 | 49 | fi 50 | 51 | ## Evaluation (BLEU, COMET) 52 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_7b_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | ## Generation 5 | python \ 6 | run_llmmt.py \ 7 | --model_name_or_path "haoranxu/ALMA-7B" \ 8 | --do_predict \ 9 | --low_cpu_mem_usage \ 10 | --language_pairs ${TEST_PAIRS} \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --per_device_eval_batch_size 2 \ 13 | --output_dir ${OUTPUT_DIR} \ 14 | --predict_with_generate \ 15 | --max_new_tokens 256 \ 16 | --max_source_length 256 \ 17 | --fp16 \ 18 | --seed 42 \ 19 | --num_beams 5 \ 20 | --overwrite_cache \ 21 | --overwrite_output_dir \ 22 | --multi_gpu_one_model 23 | 24 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 25 | python \ 26 | run_llmmt.py \ 27 | --model_name_or_path "haoranxu/ALMA-7B" \ 28 | --do_predict \ 29 | --low_cpu_mem_usage \ 30 | --language_pairs zh-en \ 31 | --mmt_data_path ./human_written_data/ \ 32 | 
--per_device_eval_batch_size 2 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 256 \ 36 | --max_source_length 512 \ 37 | --fp16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir \ 42 | --multi_gpu_one_model 43 | fi 44 | 45 | ## Evaluation (BLEU, COMET) 46 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/eval_generation.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1} 2 | TEST_PAIRS=${2} 3 | 4 | ## Evaluation 5 | for pair in ${TEST_PAIRS//,/ }; do 6 | src=$(echo ${pair} | cut -d "-" -f 1) 7 | tgt=$(echo ${pair} | cut -d "-" -f 2) 8 | TOK="13a" 9 | if [ ${tgt} == "zh" ]; then 10 | TOK="zh" 11 | elif [ ${tgt} == "ja" ]; then 12 | TOK="ja-mecab" 13 | fi 14 | echo "--------------------Results for ${pair}-------------------------------------" 15 | src_path=./outputs/wmt22_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${src} 16 | tgt_path=./outputs/wmt22_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${tgt} 17 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 18 | SACREBLEU_FORMAT=text sacrebleu -tok ${TOK} -w 2 ${tgt_path} < ${output_path} > ${output_path}.bleu 19 | cat ${output_path}.bleu 20 | comet-score -s ${src_path} -t ${output_path} -r ${tgt_path} --batch_size 256 --model Unbabel/wmt22-comet-da --gpus 1 > ${output_path}.comet 21 | comet-score -s ${src_path} -t ${output_path} --batch_size 256 --model Unbabel/wmt22-cometkiwi-da --gpus 1 > ${output_path}.cometkiwi 22 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/wmt23-cometkiwi-da-xxl --gpus 1 > ${output_path}.cometkiwi_10b 23 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/XCOMET-XXL --gpus 1 --to_json ${output_path}.xcomet.output.json > ${output_path}.xcomet_10b 24 | tail -n 1 ${output_path}.comet 25 | done 26 | 27 | for pair in ${TEST_PAIRS//,/ }; do 28 | src=$(echo ${pair} | cut -d "-" -f 1) 29 | tgt=$(echo ${pair} | cut -d "-" -f 2) 30 | echo "---------------------------${src}-${tgt}-------------------------------" 31 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 32 | cat ${output_path}.bleu 33 | tail -n 1 ${output_path}.comet 34 | tail -n 1 ${output_path}.cometkiwi 35 | tail -n 1 ${output_path}.cometkiwi_10b 36 | tail -n 2 ${output_path}.xcomet_10b 37 | done 38 | -------------------------------------------------------------------------------- /evals/eval_generation_wmt23.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1} 2 | TEST_PAIRS=${2} 3 | 4 | ## Evaluation 5 | for pair in ${TEST_PAIRS//,/ }; do 6 | src=$(echo ${pair} | cut -d "-" -f 1) 7 | tgt=$(echo ${pair} | cut -d "-" -f 2) 8 | TOK="13a" 9 | if [ ${tgt} == "zh" ]; then 10 | TOK="zh" 11 | elif [ ${tgt} == "ja" ]; then 12 | TOK="ja-mecab" 13 | fi 14 | echo "--------------------Results for ${pair}-------------------------------------" 15 | src_path=./outputs/wmt23_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${src} 16 | tgt_path=./outputs/wmt23_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${tgt} 17 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 18 | SACREBLEU_FORMAT=text sacrebleu -tok ${TOK} -w 2 ${tgt_path} < ${output_path} > ${output_path}.bleu 19 | cat ${output_path}.bleu 20 | comet-score -s ${src_path} -t ${output_path} -r ${tgt_path} --batch_size 256 --model Unbabel/wmt22-comet-da --gpus 1 > 
${output_path}.comet 21 | comet-score -s ${src_path} -t ${output_path} --batch_size 256 --model Unbabel/wmt22-cometkiwi-da --gpus 1 > ${output_path}.cometkiwi 22 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/wmt23-cometkiwi-da-xxl --gpus 1 > ${output_path}.cometkiwi_10b 23 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/XCOMET-XXL --gpus 1 --to_json ${output_path}.xcomet.output.json > ${output_path}.xcomet_10b 24 | tail -n 1 ${output_path}.comet 25 | done 26 | 27 | for pair in ${TEST_PAIRS//,/ }; do 28 | src=$(echo ${pair} | cut -d "-" -f 1) 29 | tgt=$(echo ${pair} | cut -d "-" -f 2) 30 | echo "---------------------------${src}-${tgt}-------------------------------" 31 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 32 | cat ${output_path}.bleu 33 | tail -n 1 ${output_path}.comet 34 | tail -n 1 ${output_path}.cometkiwi 35 | tail -n 1 ${output_path}.cometkiwi_10b 36 | tail -n 2 ${output_path}.xcomet_10b 37 | done 38 | -------------------------------------------------------------------------------- /evals/eval_other_models.sh: -------------------------------------------------------------------------------- 1 | MODEL_NAME=${1} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | MODEL="${MODEL_NAME//\//-}" 4 | OUTPUT_DIR=outputs-${MODEL} 5 | 6 | export HF_DATASETS_CACHE=".cache/huggingface_cache/datasets" 7 | export TRANSFORMERS_CACHE=".cache/models/" 8 | # random port between 30000 and 50000 9 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 10 | 11 | if [[ ${MODEL_NAME} == "meta-llama/Llama-2-7b-hf" ]]; then 12 | REVISION="--model_revision 637a748546bb9abca62b0684183cc362bc1ece6d" 13 | elif [[ ${MODEL_NAME} == "meta-llama/Llama-2-13b-hf" ]]; then 14 | REVISION="--model_revision 9474c6d222f45e7eb328c0f6b55501e7da67c9c3" 15 | fi 16 | 17 | ## Generation 18 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config.yaml \ 19 | run_llmmt.py \ 20 | --model_name_or_path ${MODEL_NAME} \ 21 | --do_predict \ 22 | --low_cpu_mem_usage \ 23 | --language_pairs ${TEST_PAIRS} \ 24 | --mmt_data_path ./human_written_data/ \ 25 | --per_device_eval_batch_size 2 \ 26 | --output_dir ${OUTPUT_DIR} \ 27 | --predict_with_generate \ 28 | --max_new_tokens 256 \ 29 | --max_source_length 256 \ 30 | --fp16 \ 31 | --seed 42 \ 32 | --num_beams 5 \ 33 | --overwrite_cache \ 34 | --overwrite_output_dir \ 35 | ${REVISION} 36 | 37 | ## Evaluation (BLEU, COMET) 38 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/llama-2-13b-5-shot.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-llama-2-13b-5-shot/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | # random port between 30000 and 50000 5 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 6 | 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 8 | run_llmmt.py \ 9 | --model_name_or_path meta-llama/Llama-2-13b-hf \ 10 | --model_revision 9474c6d222f45e7eb328c0f6b55501e7da67c9c3 \ 11 | --do_predict \ 12 | --low_cpu_mem_usage \ 13 | --language_pairs ${TEST_PAIRS} \ 14 | --mmt_data_path ./human_written_data/ \ 15 | --per_device_eval_batch_size 2 \ 16 | --output_dir ${OUTPUT_DIR} \ 17 | --predict_with_generate \ 18 | --max_new_tokens 256 \ 19 | --max_source_length 768 \ 20 | --fp16 \
21 | --seed 42 \ 22 | --num_beams 1 \ 23 | --few_shot_eval_path ./human_written_data/HR-5-shot/ \ 24 | --overwrite_cache \ 25 | --overwrite_output_dir 26 | 27 | ## Evaluation (BLEU, COMET) 28 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /figures/alma.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/alma.jpg -------------------------------------------------------------------------------- /figures/alma_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/alma_logo.png -------------------------------------------------------------------------------- /figures/alma_origin_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/alma_origin_logo.png -------------------------------------------------------------------------------- /figures/almar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/almar.png -------------------------------------------------------------------------------- /figures/xalma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/xalma.png -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.cs-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Tak\u00e9 v r\u00e1mci t\u00e9to oblasti je Hotel Tropico Mallorca Island a Tropico Playa.", 4 | "target": "Also within the area is the Hotel Tropico Mallorca Island and the Tropico Playa." 5 | }, 6 | { 7 | "source": "Hotely ve m\u011bst\u011b Furmanovka Hotely \u2013 Furmanovka Hledat hotely \u2013 Furmanovka", 8 | "target": "Furmanovka Hotels Furmanovka Hotels Search for hotels in Furmanovka" 9 | }, 10 | { 11 | "source": "Kone\u010dn\u011b m\u016f\u017eete zjistit v\u0161e, co pot\u0159ebujete v\u011bd\u011bt o va\u0161em GAF souboru... okam\u017eit\u011b!", 12 | "target": "Finally, you can now discover everything you need to know about your GAF file... instantly!" 13 | }, 14 | { 15 | "source": "Copyright \u00a9 AQE advisors, a.s., V\u0161echna pr\u00e1va vyhrazena", 16 | "target": "Copyright \u00a9 AQE advisors, a.s., All rights reserved" 17 | }, 18 | { 19 | "source": "AirPrint a logo AirPrint jsou ochrann\u00e9 zn\u00e1mky spole\u010dnosti Apple Inc.", 20 | "target": "AirPrint and the AirPrint logo are trademarks of Apple Inc." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.de-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Hotel Inglaterra in Granada, Spain - Besten Preise Garantiert | Lets Book Hotel", 4 | "target": "Hotel Inglaterra in Granada, Spain - Best Rates Guaranteed | Lets Book Hotel" 5 | }, 6 | { 7 | "source": "Porsche Design Cervo 2.0 Laptoptasche 13\u2033 Leder schwarz", 8 | "target": "Porsche Design Cervo 2.0 Laptop bag 13\u2033 leather black" 9 | }, 10 | { 11 | "source": "Fl\u00fcge von Brisbane nach Amsterdam via Shanghai Pudong", 12 | "target": "Flights from Brisbane to Amsterdam via Shanghai Pudong" 13 | }, 14 | { 15 | "source": "Hans Reiter (Ministerium f\u00fcr Wissenschaft, Forschung, und Kunst Baden-W\u00fcrttemberg) Foto: Johannes Zimmermann, Stuttgart", 16 | "target": "Hans Reiter (Baden-W\u00fcrttemberg Ministry of Science, Research, and Art) Photo: Johannes Zimmermann, Stuttgart" 17 | }, 18 | { 19 | "source": "Das Bild \"Mountain landscape at sunset\" von Pavlo Vakhrushev ist bei Fotolia lizenzfrei ab 2 Credits erh\u00e4ltlich (Credit ab 0,74 \u20ac).", 20 | "target": "The photo \"Mountain landscape at sunset\" from Pavlo Vakhrushev is available on Fotolia under a royalty-free license from 2 credits (Credit from $0.74)." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.de-fr.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Seit 2000 haben sich aus der Notwendigkeit des Wiederaufbaus nach den unrechtm\u00e4\u00dfigen und ungerechten Kriegen im Kosovo, in Serbien und in Afghanistan neue Schwerpunkte ergeben.", 4 | "target": "Depuis 2000, de nouvelles priorit\u00e9s se sont fait jour en raison des besoins de reconstruction faisant suite aux guerres ill\u00e9gales et iniques au Kosovo, en Serbie et en Afghanistan." 5 | }, 6 | { 7 | "source": "Levetiracetam SUN muss zweimal t\u00e4glich verabreicht werden, morgens und abends, jeden Tag ungef\u00e4hr zur gleichen Uhrzeit.", 8 | "target": "Levetiracetam SUN doit \u00eatre administr\u00e9 2 fois par jour, une fois le matin et une fois le soir, approximativement \u00e0 la m\u00eame heure chaque jour." 9 | }, 10 | { 11 | "source": "Wir verl\u00e4ngern unsere w\u00e4rmsten Gr\u00fc\u00dfe und laden Sie ein, am Econo Lodge City Star Brisbane zu bleiben.", 12 | "target": "Nous prolongeons nos salutations plus chaudes et vous invitons \u00e0 rester au Econo Lodge City Star Brisbane." 13 | }, 14 | { 15 | "source": "Sie sehen Wettervorhersage in Elizabethtown. Sie sehen auch die Wettervorhersage in Vereinigte Staaten,", 16 | "target": "vous regardez Pr\u00e9visions m\u00e9t\u00e9orologiques dans Elizabethtown. 
Voir aussi Pr\u00e9visions m\u00e9t\u00e9orologiques pour le pays \u00c9tats-Unis," 17 | }, 18 | { 19 | "source": "AVG Free Edition 2016.71.7597 Freigegeben: 16 Mai 2016 (Vor 4 Wochen) Technische Details | Change Log", 20 | "target": "AVG Free Edition 2016.71.7597 Sorties: 16 mai 2016 (Il y a 4 semaines) D\u00e9tails techniques | Journal des changements" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-cs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Free download pdf User's Manual for Garmin GNC 420AW GPS", 4 | "target": "Sta\u017een\u00ed zdarma pdf U\u017eivatelsk\u00fd manu\u00e1l for Garmin GNC 420AW GPS" 5 | }, 6 | { 7 | "source": "Games developed by NetEnt, Amatic, Pragmatic Play and more providers at WildBlaster.com.", 8 | "target": "Games vyvinut\u00fd NetEnt, Amatic, Pragmatic Play a v\u00edce poskytovatel\u016f na WildBlaster.com." 9 | }, 10 | { 11 | "source": "Article 14 Entry into force This Decision shall enter into force on the date of its adoption.", 12 | "target": "\u010cl\u00e1nek 14 Vstup v platnost Toto rozhodnut\u00ed vstupuje v platnost dnem p\u0159ijet\u00ed." 13 | }, 14 | { 15 | "source": "Russak Forum4 Photos3 Russak Czech Republic, Europe Send a message Facebook Twitter Google+ Vehicle's owners (1) 1994 Sephia / Shuma / Mentor 20 photos Fuel gasoline.", 16 | "target": "Russak F\u00f3rum4 Fotky3 Russak \u010cesk\u00e1 republika, Evropa Poslat zpr\u00e1vu Facebook Twitter Google+ Majitel vozidel (1) 1994 Sephia / Shuma / Mentor 20 fotek Palivo benz\u00edn." 17 | }, 18 | { 19 | "source": "Please inform KeyBarcelona Plaza Universidad Apartment - Gran Via in advance of your expected arrival time.", 20 | "target": "Informujte pros\u00edm KeyBarcelona Plaza Universidad Apartment - Gran Via o sv\u00e9m p\u0159edpokl\u00e1dan\u00e9m \u010dase p\u0159\u00edjezdu p\u0159edem." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-de.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Hotels near General Limousine Service Roma Day Tours", 4 | "target": "Hotels in der N\u00e4he von General Limousine Service Roma Day Tours" 5 | }, 6 | { 7 | "source": "Do you have a question concerning ZyXEL Communications 2602RL-D3A?", 8 | "target": "Haben Sie eine Frage bez\u00fcglich ZyXEL Communications 2602RL-D3A?" 9 | }, 10 | { 11 | "source": "How long does it take to get from Duisburg to London?", 12 | "target": "Wie lange dauert es von Duisburg nach London zu kommen?" 13 | }, 14 | { 15 | "source": "Explore.exe is located in the C:\\Windows\\System32 folder.", 16 | "target": "Explore.exe befindet sich im Ordner C:\\Windows\\System32." 17 | }, 18 | { 19 | "source": "Find the best hotels in Corniglio and plan your trip", 20 | "target": "Finden Sie die besten Hotels in Corniglio und planen Sie Ihre Reise" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-ha.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "What preparations did Abram have to make for his journey , and why might that have involved sacrifice ?", 4 | "target": "Wa\u0257anne shirye - shirye Abram ya yi don tafiyarsa , kuma me ya sa lalle ya \u0199unshi sadaukarwa ?" 
5 | }, 6 | { 7 | "source": "The Bible also gives wise counsel about human relations , urging us to love one another and to treat others with respect , dignity , and kindness .", 8 | "target": "Littafi Mai Tsarki ya kuma ba da shawara mai kyau game da dangantakar \u2019 yan Adam , ya aririce mu mu \u0199aunaci juna kuma mu bi da juna cikin girmamawa , mutunci , da kuma kirki ." 9 | }, 10 | { 11 | "source": "EPDM sheet rubber is optimally suited for use outside and at high temperatures due to properties such as resistance to heat, sunlight, ozone, acids, alkaline and oxygen-containing solvents.", 12 | "target": "Rubin rubutun EPDM yana dacewa da kyau don amfani a waje da kuma yanayin zafi saboda kaddarorin irin su jure yanayin zafi, hasken rana, ozone, acid, alkaline da kuma oxygen-hade." 13 | }, 14 | { 15 | "source": "8. What would we be able to anticipate from you in the initial 90 days?", 16 | "target": "8. Menene za mu iya jira daga gare ku a farkon kwanakin 90?" 17 | }, 18 | { 19 | "source": "Capacity(t/h) Feed Both at Center and Sides 120-180 200-260 300-380 450-520", 20 | "target": "\u0198imar (t / h) Ciyar Dukansu a Cibiyar da Sides 120-180 200-260 300-380 450-520" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-is.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "FOOD SERVICE: \u2022 The Florida State Fire College has a full-service cafeteria operated by MCTAE Culinary School.", 4 | "target": "Food Service: \u2022 The Florida State Fire College me\u00f0 allri \u00fej\u00f3nustu m\u00f6tuneyti reki\u00f0 af MCTAE Culinary School." 5 | }, 6 | { 7 | "source": "Luxury homes for sale in Spain Found 60 Properties", 8 | "target": "L\u00faxus heimili til s\u00f6lu \u00e1 Sp\u00e1ni Hafa fundist 60 Properties Ver\u00f0 fr\u00e1" 9 | }, 10 | { 11 | "source": "At the time, Belgium was part of the Burgundian empire, Rome, Spain, was part of the Netherlands and even France .", 12 | "target": "\u00c1 \u00feeim t\u00edma var Belg\u00eda hluti af Bourgogne heimsveldinu, R\u00f3m, Sp\u00e1ni, var hluti af Hollandi og jafnvel Frakklandi." 13 | }, 14 | { 15 | "source": "Yummy, the Neptune burger was the best of the best, the flavor well yes oh yes.", 16 | "target": "Yummy, The Neptune hamborgari var the bestur af the bestur, brag\u00f0i\u00f0 vel j\u00e1 \u00f3 j\u00e1." 17 | }, 18 | { 19 | "source": "This applies no less to economic performance than other criteria: Economic growth, research and development, tehnological innovation, productivity pr. hour of work, job creation, participation in the labour-market, (especially female participation), equality of the sexes, level of education, social mobility, absence of poverty, health and longevity, quality of infrastructure, access to unspoilt nature, the overall quality of life. 
Less inequality than in most places.", 20 | "target": "\u00deetta \u00e1 ekki s\u00ed\u00f0ur vi\u00f0 um hagr\u00e6na m\u00e6likvar\u00f0a en a\u00f0ra: Hagv\u00f6xt, framlei\u00f0ni pr vinnustund, ranns\u00f3knir og \u00fer\u00f3un, t\u00e6knin\u00fdjungar og \u00fatbrei\u00f0slu \u00feeirra, sk\u00f6pun starfa, \u00fe\u00e1ttt\u00f6ku \u00e1 vinnumarka\u00f0i (s\u00e9rstaklega \u00fe\u00e1ttt\u00f6ku kvenna), jafnr\u00e6\u00f0i kynjanna, menntunarstig og starfs\u00fej\u00e1lfun, f\u00e9lagslegan hreyfanleika, heilbrig\u00f0i og langl\u00edfi, g\u00e6\u00f0i innvi\u00f0a, \u00fatr\u00fdmingu f\u00e1t\u00e6ktar, a\u00f0gang a\u00f0 \u00f3spilltri n\u00e1tt\u00faru, almenn l\u00edfsg\u00e6\u00f0i." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-ja.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "About ADI Corporate Information Executive Team Vincent Roche Attention Internet Explorer users: Analog.com no longer supports IE 11.", 4 | "target": "ADI\u306b\u3064\u3044\u3066 \u4f1a\u793e\u6982\u8981 \u7d4c\u55b6\u9663 Vincent Roche Internet Explorer\u3092\u304a\u4f7f\u3044\u306e\u304a\u5ba2\u69d8\u3078\u306e\u6ce8\u610f: Analog.com\u30b5\u30a4\u30c8\u306f\u3001IE11\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u304a\u308a\u307e\u305b\u3093\u3002" 5 | }, 6 | { 7 | "source": "Simplify performance tuning and troubleshooting with Azure SQL Database \u2013 Azure \u30d6\u30ed\u30b0 \u307e\u3068\u3081", 8 | "target": "Azure SQL Database \u3067\u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u306e\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0\u3068\u30c8\u30e9\u30d6\u30eb\u30b7\u30e5\u30fc\u30c6\u30a3\u30f3\u30b0\u3092\u7c21\u7d20\u5316 \u2013 Azure \u30d6\u30ed\u30b0 \u307e\u3068\u3081" 9 | }, 10 | { 11 | "source": "Ant Financial has so far raised a staggering $14 billion this year alone in Series C funding.", 12 | "target": "Ant Financial\u306f\u3053\u308c\u307e\u3067\u3001\u30b7\u30ea\u30fc\u30baC\u306e\u8cc7\u91d1\u8abf\u9054\u3067\u4eca\u5e74\u3060\u3051\u3067$ 14\u5104\u3092\u9a5a\u7570\u7684\u306b\u4e0a\u3052\u3066\u3044\u307e\u3059\u3002" 13 | }, 14 | { 15 | "source": "On Windows 8.1, completely uninstalling DirectX 9, 10 and 11 without consequences is possible only if they are not installed during a system update.", 16 | "target": "Windows 8.1\u3067\u306f\u3001DirectX 9\u300110\u3001\u304a\u3088\u307311\u3092\u30b7\u30b9\u30c6\u30e0\u306e\u66f4\u65b0\u4e2d\u306b\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u306a\u3044\u5834\u5408\u306b\u306e\u307f\u3001\u5f71\u97ff\u3092\u4e0e\u3048\u308b\u3053\u3068\u306a\u304f\u5b8c\u5168\u306b\u30a2\u30f3\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3067\u304d\u307e\u3059\u3002" 17 | }, 18 | { 19 | "source": "At 6:16 pm the entire cloudbank is illuminated, but by 6:40 pm only the top of the cloudbank is illuminated.", 20 | "target": "\u5348\u5f8c6:16\u306b\u3001\u96f2\u306e\u571f\u624b\u5168\u4f53\u304c\u7167\u3089\u3055\u308c\u307e\u3059\u304c\u3001\u5348\u5f8c6:40\u307e\u3067\u306b\u306f\u3001\u96f2\u306e\u571f\u624b\u306e\u9802\u70b9\u3060\u3051\u304c\u7167\u3089\u3055\u308c\u307e\u3059\u3002" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-ru.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "2016-2017 - master of production training of the department \u00abTransport equipment 
and technologies\u00bb, Kazakh Agrotechnical University named after S.Seifullin, Astana", 4 | "target": "2016-2017 \u0433\u0433. - \u043c\u0430\u0441\u0442\u0435\u0440 \u043f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043a\u0430\u0444\u0435\u0434\u0440\u044b \u00ab\u0422\u0440\u0430\u043d\u0441\u043f\u043e\u0440\u0442\u043d\u0430\u044f \u0442\u0435\u0445\u043d\u0438\u043a\u0430 \u0438 \u0442\u0435\u0445\u043d\u043e\u043b\u043e\u0433\u0438\u0439\u00bb, \u041a\u0430\u0437\u0430\u0445\u0441\u043a\u0438\u0439 \u0430\u0433\u0440\u043e\u0442\u0435\u0445\u043d\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0443\u043d\u0438\u0432\u0435\u0440\u0441\u0438\u0442\u0435\u0442 \u0438\u043c.\u0421.\u0421\u0435\u0439\u0444\u0443\u043b\u043b\u0438\u043d\u0430, \u0433. \u0410\u0441\u0442\u0430\u043d\u0430" 5 | }, 6 | { 7 | "source": "Is it also possible to use standards from the EN 301489 series?", 8 | "target": "\u041c\u043e\u0436\u043d\u043e \u043b\u0438 \u0442\u0430\u043a\u0436\u0435 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0441\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u044b \u0441\u0435\u0440\u0438\u0438 EN 301489?" 9 | }, 10 | { 11 | "source": "Chairman: Mr. Hamaneh (Vice-Chairman) (Islamic Republic of Iran)", 12 | "target": "\u041f\u0440\u0435\u0434\u0441\u0435\u0434\u0430\u0442\u0435\u043b\u044c: \u0433-\u043d \u0425\u0430\u043c\u0430\u043d\u0435 (\u0437\u0430\u043c\u0435\u0441\u0442\u0438\u0442\u0435\u043b\u044c \u041f\u0440\u0435\u0434\u0441\u0435\u0434\u0430\u0442\u0435\u043b\u044f) (\u0418\u0441\u043b\u0430\u043c\u0441\u043a\u0430\u044f \u0420\u0435\u0441\u043f\u0443\u0431\u043b\u0438\u043a\u0430 \u0418\u0440\u0430\u043d)" 13 | }, 14 | { 15 | "source": "The report covers the period from 1 January 2006 to 15 March 2007.", 16 | "target": "\u0414\u043e\u043a\u043b\u0430\u0434 \u043e\u0445\u0432\u0430\u0442\u044b\u0432\u0430\u0435\u0442 \u043f\u0435\u0440\u0438\u043e\u0434 \u0441 1 \u044f\u043d\u0432\u0430\u0440\u044f 2006 \u0433\u043e\u0434\u0430 \u043f\u043e 15 \u043c\u0430\u0440\u0442\u0430 2007 \u0433\u043e\u0434\u0430." 17 | }, 18 | { 19 | "source": "2 years ago 1:10:00 ProPorn small tits, beauty, erotic, girlfriend", 20 | "target": "2 \u0433\u043e\u0434\u0430 \u043d\u0430\u0437\u0430\u0434 1:10:00 ProPorn \u043c\u0430\u043b\u0435\u043d\u044c\u043a\u0438\u0435 \u0441\u0438\u0441\u044c\u043a\u0438, \u043a\u0440\u0430\u0441\u0430\u0432\u0438\u0446\u044b, \u044d\u0440\u043e\u0442\u0438\u043a\u0430, \u043f\u043e\u0434\u0440\u0443\u0433\u0430" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-uk.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "You are here: Home>Tech blog>Joomla FAQ>Joomla 1.5 and Joomla 1.7, 2.5, 3.x How to load modules using ajax", 4 | "target": "\u0412\u0438 \u0442\u0443\u0442: \u0413\u043e\u043b\u043e\u0432\u043d\u0430>\u0422\u0435\u0445\u043d\u0456\u0447\u043d\u0438\u0439 \u0431\u043b\u043e\u0433>\u0427\u0430\u041f\u0438 \u043f\u043e Joomla>Joomla 1.5 \u0442\u0430 1.7, 2.5, 3.x. 
\u042f\u043a \u0437\u0430\u0432\u0430\u043d\u0442\u0430\u0436\u0443\u0432\u0430\u0442\u0438 \u043c\u043e\u0434\u0443\u043b\u0456 \u0437\u0430 \u0434\u043e\u043f\u043e\u043c\u043e\u0433\u043e\u044e Ajax" 5 | }, 6 | { 7 | "source": "Western Hungary - Photo album, picture galleries (985 photos / 27 galleries)", 8 | "target": "\u0417\u0430\u0445\u0456\u0434\u043d\u0430 \u0423\u0433\u043e\u0440\u0449\u0438\u043d\u0430 - \u0424\u043e\u0442\u043e\u0430\u043b\u044c\u0431\u043e\u043c, \u0444\u043e\u0442\u043e\u0433\u0440\u0430\u0444\u0456\u0457, \u0433\u0430\u043b\u0435\u0440\u0435\u0457 \u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u044c (985 \u0444\u043e\u0442\u043e\u0433\u0440\u0430\u0444\u0456\u0457 / 27 \u0433\u0430\u043b\u0435\u0440\u0435\u0457)" 9 | }, 10 | { 11 | "source": "Exchange of scientific information and publications; joint research and publication of results Poland", 12 | "target": "\u041e\u0431\u043c\u0456\u043d \u043d\u0430\u0443\u043a\u043e\u0432\u043e\u044e \u0456\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0456\u0454\u044e \u0456 \u043f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u044f\u043c\u0438; \u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043d\u043d\u044f \u0441\u043f\u0456\u043b\u044c\u043d\u0438\u0445 \u043d\u0430\u0443\u043a\u043e\u0432\u0438\u0445 \u0434\u043e\u0441\u043b\u0456\u0434\u0436\u0435\u043d\u044c \u0456 \u0441\u0443\u043c\u0456\u0441\u043d\u0430 \u043f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u044f \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u0456\u0432 \u041f\u043e\u043b\u044c\u0449\u0430" 13 | }, 14 | { 15 | "source": "Look at the air fares Batumi \u2014 Odessa for the next two weeks and choose the option that fits you, in price and in time.", 16 | "target": "\u041f\u0435\u0440\u0435\u0433\u043b\u044f\u043d\u044c\u0442\u0435 \u0446\u0456\u043d\u0438 \u043d\u0430 \u0430\u0432\u0456\u0430\u043f\u0435\u0440\u0435\u043b\u0456\u0442 \u0411\u0430\u0442\u0443\u043c\u0456 \u2014 \u041e\u0434\u0435\u0441\u0430 \u043d\u0430 \u043d\u0430\u0439\u0431\u043b\u0438\u0436\u0447\u0456 \u0434\u0432\u0430 \u0442\u0438\u0436\u043d\u0456 \u0456 \u0432\u0438\u0431\u0435\u0440\u0456\u0442\u044c \u043e\u043f\u0442\u0438\u043c\u0430\u043b\u044c\u043d\u0438\u0439 \u0432\u0430\u0440\u0456\u0430\u043d\u0442, \u044f\u043a\u0438\u0439 \u043f\u0456\u0434\u0445\u043e\u0434\u0438\u0442\u044c \u0432\u0430\u043c \u0456 \u0437\u0430 \u0432\u0430\u0440\u0442\u0456\u0441\u0442\u044e, \u0456 \u0437\u0430 \u0447\u0430\u0441\u043e\u043c." 17 | }, 18 | { 19 | "source": "Exhibition: 1984. Kyiv. Shevchenko \u2013 artist. To 170\u2013th birthday [Shevchenko \u2013 artist: Exhibition Catalogue. \u2013 K., 1986. \u2013 p. 8].", 20 | "target": "\u0412\u0438\u0441\u0442\u0430\u0432\u043a\u0430: 1984. \u041a\u0438\u0457\u0432. \u0428\u0435\u0432\u0447\u0435\u043d\u043a\u043e-\u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a. \u0414\u043e 170-\u0440\u0456\u0447\u0447\u044f \u0432\u0456\u0434 \u0434\u043d\u044f \u043d\u0430\u0440\u043e\u0434\u0436\u0435\u043d\u043d\u044f [\u0428\u0435\u0432\u0447\u0435\u043d\u043a\u043e-\u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a: \u041a\u0430\u0442\u0430\u043b\u043e\u0433 \u0432\u0438\u0441\u0442\u0430\u0432\u043a\u0438. \u2013 \u041a., 1986. \u2013 \u0421. 8]." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-zh.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "In 2001, 38% of reported HIV infections and 33% of AIDS cases were in women (see tables 12.7 and 12.8 in annex 2, p. 14-15).", 4 | "target": "2001\u5e74\uff0c\u5728\u62a5\u544a\u7684\u827e\u6ecb\u75c5\u75c5\u6bd2\u611f\u67d3\u548c\u827e\u6ecb\u75c5\u75c5\u4f8b\u4e2d\uff0c\u5987\u5973\u5206\u522b\u5360 38\uff05\u548c33\uff05\uff08\u89c1\u9644\u4ef62\u886812.7 \u548c\u8868 12.8\uff0c\u7b2c14-15\u9875\uff09\u3002" 5 | }, 6 | { 7 | "source": "Where Are You? I'm Here", 8 | "target": "\u4f60\u5728\u54ea\u88e1\uff1f\u6211\u5728\u9019\u88e1" 9 | }, 10 | { 11 | "source": "9. Future needs in environmental monitoring", 12 | "target": "9. \u73af\u5883\u76d1\u6d4b\u65b9\u9762\u4eca\u540e\u7684\u9700\u8981" 13 | }, 14 | { 15 | "source": "Central Bank of Nigeria", 16 | "target": "\u5c3c\u65e5\u5229\u4e9a\u4e2d\u592e\u94f6\u884c (\u82f1\u8bed : Central Bank of Nigeria )" 17 | }, 18 | { 19 | "source": "Apache CentOs Linux PHP Server", 20 | "target": "\u963f\u5e15\u5947 CentOs Linux PHP Server" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.fr-de.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Depuis 2000, de nouvelles priorit\u00e9s se sont fait jour en raison des besoins de reconstruction faisant suite aux guerres ill\u00e9gales et iniques au Kosovo, en Serbie et en Afghanistan.", 4 | "target": "Seit 2000 haben sich aus der Notwendigkeit des Wiederaufbaus nach den unrechtm\u00e4\u00dfigen und ungerechten Kriegen im Kosovo, in Serbien und in Afghanistan neue Schwerpunkte ergeben." 5 | }, 6 | { 7 | "source": "Levetiracetam SUN doit \u00eatre administr\u00e9 2 fois par jour, une fois le matin et une fois le soir, approximativement \u00e0 la m\u00eame heure chaque jour.", 8 | "target": "Levetiracetam SUN muss zweimal t\u00e4glich verabreicht werden, morgens und abends, jeden Tag ungef\u00e4hr zur gleichen Uhrzeit." 9 | }, 10 | { 11 | "source": "Nous prolongeons nos salutations plus chaudes et vous invitons \u00e0 rester au Econo Lodge City Star Brisbane.", 12 | "target": "Wir verl\u00e4ngern unsere w\u00e4rmsten Gr\u00fc\u00dfe und laden Sie ein, am Econo Lodge City Star Brisbane zu bleiben." 13 | }, 14 | { 15 | "source": "vous regardez Pr\u00e9visions m\u00e9t\u00e9orologiques dans Elizabethtown. Voir aussi Pr\u00e9visions m\u00e9t\u00e9orologiques pour le pays \u00c9tats-Unis,", 16 | "target": "Sie sehen Wettervorhersage in Elizabethtown. 
Sie sehen auch die Wettervorhersage in Vereinigte Staaten," 17 | }, 18 | { 19 | "source": "AVG Free Edition 2016.71.7597 Sorties: 16 mai 2016 (Il y a 4 semaines) D\u00e9tails techniques | Journal des changements", 20 | "target": "AVG Free Edition 2016.71.7597 Freigegeben: 16 Mai 2016 (Vor 4 Wochen) Technische Details | Change Log" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.ha-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Wa\u0257anne shirye - shirye Abram ya yi don tafiyarsa , kuma me ya sa lalle ya \u0199unshi sadaukarwa ?", 4 | "target": "What preparations did Abram have to make for his journey , and why might that have involved sacrifice ?" 5 | }, 6 | { 7 | "source": "Littafi Mai Tsarki ya kuma ba da shawara mai kyau game da dangantakar \u2019 yan Adam , ya aririce mu mu \u0199aunaci juna kuma mu bi da juna cikin girmamawa , mutunci , da kuma kirki .", 8 | "target": "The Bible also gives wise counsel about human relations , urging us to love one another and to treat others with respect , dignity , and kindness ." 9 | }, 10 | { 11 | "source": "Rubin rubutun EPDM yana dacewa da kyau don amfani a waje da kuma yanayin zafi saboda kaddarorin irin su jure yanayin zafi, hasken rana, ozone, acid, alkaline da kuma oxygen-hade.", 12 | "target": "EPDM sheet rubber is optimally suited for use outside and at high temperatures due to properties such as resistance to heat, sunlight, ozone, acids, alkaline and oxygen-containing solvents." 13 | }, 14 | { 15 | "source": "8. Menene za mu iya jira daga gare ku a farkon kwanakin 90?", 16 | "target": "8. What would we be able to anticipate from you in the initial 90 days?" 17 | }, 18 | { 19 | "source": "\u0198imar (t / h) Ciyar Dukansu a Cibiyar da Sides 120-180 200-260 300-380 450-520", 20 | "target": "Capacity(t/h) Feed Both at Center and Sides 120-180 200-260 300-380 450-520" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.is-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Finndu fr\u00e1b\u00e6r ver\u00f0 me\u00f0 Autatlantis \u00e1 Proserpine Flugv\u00f6llur, sj\u00e1\u00f0u einkunnir vi\u00f0skiptavina - og b\u00f3ka\u00f0u \u00e1 netinu, hratt og \u00f6rugglega", 4 | "target": "Find great prices with Autatlantis at Proserpine Airport, see customer ratings - and book online, quickly and easily" 5 | }, 6 | { 7 | "source": "Li\u00f0i\u00f0 n\u00e6r Geo drengur, st\u00falka Milly og gr\u00e6nt Android v\u00e9lmenni Botha.", 8 | "target": "The team includes Geo boy, girl Milly and green android robot Botha." 9 | }, 10 | { 11 | "source": "The Crow Lake Verkefni\u00f0 spannar \u00ferj\u00fa s\u00fdslum \u00ed Su\u00f0ur-Dakota og er st\u00e6rsta vindur verkefni \u00ed Bandar\u00edkjunum \u00e1tti eing\u00f6ngu vi\u00f0 samstarfsverkefni, me\u00f0 afkastagetu upp \u00e1 151,5 megav\u00f6tt.", 12 | "target": "The Crow Lake project spans three counties in South Dakota and is the largest wind project in the United States owned solely by a cooperative, with a capacity of 151.5 megawatts." 13 | }, 14 | { 15 | "source": "All Slot Mobile spilav\u00edti\u00f0 er eitt hi\u00f0 fullkomna d\u00e6mi um framfarir \u00ed spilav\u00edtum.", 16 | "target": "The All Slot Mobile Casino is one of the perfect examples of advancement in casino gaming." 
17 | }, 18 | { 19 | "source": "Heiti og flokkunarfr\u00e6\u00f0ileg hugt\u00f6k, eins og \u00feau eru skilgreind \u00ed tegundaskr\u00e1num \u00ed tilskipunum 2009/147/EB (fuglatilskipunin) og 92/43/EBE (vistger\u00f0atilskipunin).", 20 | "target": "Names and taxonomic concepts as defined by the species lists in Directives 2009/147/EC (Birds Directive) and 92/43/EEC (Habitats Directive)." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.ja-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u7269\u7406\u7684\u30bb\u30ad\u30e5\u30ea\u30c6\u30a3\u3068\u74b0\u5883\u5236\u5fa1 Claris\u3067\u306f\u3001\u30db\u30b9\u30c6\u30a3\u30f3\u30b0\u30b5\u30fc\u30d3\u30b9\u306b Amazon Web Services(AWS)\u3092\u5229\u7528\u3057\u3066\u3044\u307e\u3059\u3002", 4 | "target": "Physical Security and Environmental Controls Claris uses Amazon Web Services (AWS) for its hosting needs." 5 | }, 6 | { 7 | "source": "SOC \u306f\u3001\u91cd\u5927\u306a\u640d\u5bb3\u304c\u767a\u751f \u3059\u308b\u524d\u306b\u3001\u653b\u6483\u3092\u5148\u53d6\u308a\u3057\u305f\u308a\u3001\u89e3\u6c7a\u3057\u305f\u308a\u3059\u308b\u305f\u3081\u306e\u30b9\u30de\u30fc \u30c8\u3067\u52b9\u7387\u7684\u306a\u691c\u51fa\u3001\u8abf\u67fb\u3001\u5fdc\u7b54\u6a5f\u80fd\u3092\u5099\u3048\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002", 8 | "target": "SOCs need to be empowered with smart and efficient detect, investigate, and respond capabilities to preempt attacks or resolve them before significant damage occurs." 9 | }, 10 | { 11 | "source": "\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3059\u308b\u30d5\u30a1\u30a4\u30eb\u306e\u66f8\u5f0f (CAB\u3001XPI\u3001\u307e\u305f\u306f CRX ) \u306f\u30af\u30e9\u30a4\u30a2\u30f3\u30c8\u306e\u30d6\u30e9\u30a6\u30b6\u30fc\u306b\u3088\u308a\u7570\u306a\u308a\u307e\u3059\u3002", 12 | "target": "The format of the file that is downloaded (CAB, XPI, or CRX) will depend on the client's browser." 13 | }, 14 | { 15 | "source": "Citrix ADM \u30c7\u30a3\u30b6\u30b9\u30bf\u30ea\u30ab\u30d0\u30ea(DR)\u6a5f\u80fd\u306f\u3001\u9ad8\u53ef\u7528\u6027\u30e2\u30fc\u30c9\u3067\u5c55\u958b\u3055\u308c\u305fCitrix ADM \u5b8c\u5168\u306a\u30b7\u30b9\u30c6\u30e0\u30d0\u30c3\u30af\u30a2\u30c3\u30d7\u3068\u30ea\u30ab\u30d0\u30ea\u6a5f\u80fd\u3092\u63d0\u4f9b\u3057\u307e\u3059\u3002", 16 | "target": "The Citrix ADM disaster recovery (DR) feature provides full system backup and recovery capabilities for Citrix ADM deployed in high availability mode." 17 | }, 18 | { 19 | "source": "\u3053\u306e ASIC \u304a\u3088\u3073 SoC \u306e\u5b8c\u5168\u306a\u308b\u30bd\u30ea\u30e5\u30fc\u30b7\u30e7\u30f3\u306f\u3001\u30cf\u30fc\u30c9\u30a6\u30a7\u30a2\u30d9\u30fc\u30b9\u3067\u3001\u6700\u5927 4096 x 4096 \u307e\u3067\u306eISO/IEC 14496-10 Advanced Video Coding Standard (MPEG-4 Part 10)\u898f\u683c\u306b\u5b8c\u5168\u6e96\u62e0\u3057\u305f\u30c7\u30b3\u30fc\u30c9\u304c\u53ef\u80fd\u3067\u3059\u3002", 20 | "target": "The perfect solution for ASICs and SoCs is hardware-based and capable of full ISO/IEC 14496-10 Advanced Video Coding Standard (MPEG-4 Part 10) compliance decoding up to a resolution of 4096 x 4096." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.ru-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u042d\u0440\u0438\u0445 \u0421\u0438\u0433\u0430\u043b \u00ab\u041c\u0443\u0436\u0447\u0438\u043d\u0430, \u0436\u0435\u043d\u0449\u0438\u043d\u0430 \u0438 \u0440\u0435\u0431\u0451\u043d\u043e\u043a\u00bb, 1982 \u2014 \u0438\u044e\u043b\u044c-\u0430\u0432\u0433\u0443\u0441\u0442, \u2116 4.", 4 | "target": "Erich Segal \u00abMan, Woman and Child\u00bb, 1982 \u2013 July\u2013August , \u21164." 5 | }, 6 | { 7 | "source": "\u0421\u0430\u0439\u0442 \u0438 \u0423\u0441\u043b\u0443\u0433\u0430 \u043f\u0440\u0435\u0434\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u044e\u0442\u0441\u044f \u0438 \u0443\u043f\u0440\u0430\u0432\u043b\u044f\u044e\u0442\u0441\u044f Tyche Technologies AG. \u0421\u0430\u0439\u0442 www.greeklesibians.lesbianscrowd.com.", 8 | "target": "The Website and the service is hosted and administered by Tyche Technologies AG. Website www.greeklesibians.lesbianscrowd.com." 9 | }, 10 | { 11 | "source": "\u041e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 \u0430\u043d\u0433\u043b\u0438\u0439\u0441\u043a\u043e\u043c\u0443 \u0441\u043e speaking24.com - \u0441\u043b\u043e\u0432\u043e \u0434\u043d\u044f - TO EXHORT", 12 | "target": "Learn English with speaking24.com - word of the day - TO EXHORT" 13 | }, 14 | { 15 | "source": "\u041c\u0435\u043d\u0434\u0435\u043b\u0435\u0435\u0432\u0441\u043a (\u0420\u0435\u0441\u043f\u0443\u0431\u043b\u0438\u043a\u0430 \u0422\u0430\u0442\u0430\u0440\u0441\u0442\u0430\u043d) 85549251 ** \u0422\u0435\u043b\u0435\u0444\u043e\u043d", 16 | "target": "Mendeleevsk (The Republic Of Tatarstan) 85549251 ** Phone" 17 | }, 18 | { 19 | "source": "Moncler \u0420\u043e\u0434 \u041f\u0430\u043b\u044c\u0442\u043e \u0412\u043d\u0438\u0437 \u041c\u0443\u0436\u0447\u0438\u043d\u044b \u0427\u0435\u0440\u043d\u044b\u0439, Moncler\u043e\u0434\u0435\u0436\u0434\u0430 \u043e\u0449\u0443\u0449\u0435\u043d\u0438\u0435 \u043d\u0430\u0447\u0438\u043d\u0430\u044f \u0441 \u043c\u043e\u0434\u044b \u0441\u0442\u043e\u043b\u0438\u0446\u0430 \u043c\u0438\u0440\u0430, \u0424\u0440\u0430\u043d\u0446\u0438\u044f. Moncler \u0431\u044b\u043b \u043d\u0430\u0439\u0434\u0435\u043d \u0432 1950 \u0433\u043e\u0434\u0443 \u0420\u0435\u043d\u0435 Ramilion . \u042d\u0442\u0430 \u043a\u043e\u043b\u043b\u0435\u043a\u0446\u0438\u044f \u0431\u044b\u043b\u0430 \u0441\u043e\u0437\u0434\u0430\u043d\u0430 \u0434\u043b\u044f \u0438\u043d\u0442\u0435\u0440\u043f\u0440\u0435\u0442\u0430\u0446\u0438\u0438 \u0433\u043e\u0440\u043e\u0434\u0441\u043a\u043e\u0439 \u0438 \u0441\u043e\u0432\u0440\u0435\u043c\u0435\u043d\u043d\u044b\u0439 \u0441\u0442\u0438\u043b\u044c \u0433\u043b\u0430\u0437\u0430\u043c\u0438 \u043a\u043b\u0430\u0441\u0441 \u0438 \u044d\u043b\u0435\u0433\u0430\u043d\u0442\u043d\u043e\u0441\u0442\u044c . Moncler \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u0441\u043e\u0431\u043e\u0439 \u0441\u0431\u043e\u0440\u043d\u0438\u043a, \u0441\u043f\u0435\u0446\u0438\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u044e\u0449\u0430\u044f\u0441\u044f \u043d\u0430 \u0432\u0435\u0440\u0445\u043d\u044e\u044e \u043e\u0434\u0435\u0436\u0434\u0443 .", 20 | "target": "Moncler Rod Coat Down Men Black,Moncler a clothing sensation beginning in the fashion capital of the world, France. Moncler was found in the 1950's by Rene Ramilion. 
This collection has been created to interpret an urban and modern style through the eyes of class and elegance. Moncler is a collection specialising in outdoor clothing." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.uk-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0443 \u043c\u0456\u0441\u0442\u0456 \u041c\u0430\u0434\u0430\u0440\u0445 (v4), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0432 \u043a\u0440\u0430\u0457\u043d\u0456 \u041c\u0430\u0440\u043e\u043a\u043a\u043e (v4), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0443 \u043c\u0456\u0441\u0442\u0456 \u041c\u0430\u0434\u0430\u0440\u0445 (v1), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0432 \u043a\u0440\u0430\u0457\u043d\u0456 \u041c\u0430\u0440\u043e\u043a\u043a\u043e (v1), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0443 \u043c\u0456\u0441\u0442\u0456 \u041c\u0430\u0434\u0430\u0440\u0445 (v2), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0432 \u043a\u0440\u0430\u0457\u043d\u0456 \u041c\u0430\u0440\u043e\u043a\u043a\u043e (v2)", 4 | "target": "weather forecast in Madagh (v4), weather forecast in Morocco (v4), weather forecast in Madagh (v1), weather forecast in Morocco (v1), weather forecast in Madagh (v2), weather forecast in Morocco (v2)" 5 | }, 6 | { 7 | "source": "- \u043f\u0440\u0456\u0437\u0432\u0438\u0449\u0435, \u0456\u043c'\u044f, \u043f\u043e \u0431\u0430\u0442\u044c\u043a\u043e\u0432\u0456 (\u0437\u0430 \u043d\u0430\u044f\u0432\u043d\u043e\u0441\u0442\u0456), \u0434\u0430\u0442\u0430 \u043d\u0430\u0440\u043e\u0434\u0436\u0435\u043d\u043d\u044f \u0431\u043e\u0440\u0436\u043d\u0438\u043a\u0430 \u2014 \u0444\u0456\u0437\u0438\u0447\u043d\u043e\u0457 \u043e\u0441\u043e\u0431\u0438 \u0430\u0431\u043e \u043d\u0430\u0439\u043c\u0435\u043d\u0443\u0432\u0430\u043d\u043d\u044f, \u0456\u0434\u0435\u043d\u0442\u0438\u0444\u0456\u043a\u0430\u0446\u0456\u0439\u043d\u0438\u0439 \u043a\u043e\u0434 \u044e\u0440\u0438\u0434\u0438\u0447\u043d\u043e\u0457 \u043e\u0441\u043e\u0431\u0438 \u0443 \u0404\u0434\u0438\u043d\u043e\u043c\u0443 \u0434\u0435\u0440\u0436\u0430\u0432\u043d\u043e\u043c\u0443 \u0440\u0435\u0454\u0441\u0442\u0440\u0456 \u044e\u0440\u0438\u0434\u0438\u0447\u043d\u0438\u0445 \u043e\u0441\u0456\u0431, \u0444\u0456\u0437\u0438\u0447\u043d\u0438\u0445 \u043e\u0441\u0456\u0431-\u043f\u0456\u0434\u043f\u0440\u0438\u0454\u043c\u0446\u0456\u0432 \u0442\u0430 \u0433\u0440\u043e\u043c\u0430\u0434\u0441\u044c\u043a\u0438\u0445 \u0444\u043e\u0440\u043c\u0443\u0432\u0430\u043d\u044c \u0431\u043e\u0440\u0436\u043d\u0438\u043a\u0430 \u2014 \u044e\u0440\u0438\u0434\u0438\u0447\u043d\u043e\u0457 \u043e\u0441\u043e\u0431\u0438;", 8 | "target": "- surname, name, patronymic (if any), date of birth of the debtor - an individual or name, identification code of the legal entity in the Unified State Register of Legal Entities, individuals-entrepreneurs and public entities of the debtor - a legal entity;" 9 | }, 10 | { 11 | "source": "\u041f\u0435\u0440\u0435\u0433\u043b\u044f\u043d\u044c\u0442\u0435 \u0446\u0456\u043d\u0438 \u043d\u0430 \u0430\u0432\u0456\u0430\u043f\u0435\u0440\u0435\u043b\u0456\u0442 
\u0412\u0435\u0440\u043e\u043d\u0430 \u2014 \u0421\u0430\u043d\u043a\u0442-\u041f\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433 \u043d\u0430 \u043d\u0430\u0439\u0431\u043b\u0438\u0436\u0447\u0456 \u0434\u0432\u0430 \u0442\u0438\u0436\u043d\u0456 \u0456 \u0432\u0438\u0431\u0435\u0440\u0456\u0442\u044c \u043e\u043f\u0442\u0438\u043c\u0430\u043b\u044c\u043d\u0438\u0439 \u0432\u0430\u0440\u0456\u0430\u043d\u0442, \u044f\u043a\u0438\u0439 \u043f\u0456\u0434\u0445\u043e\u0434\u0438\u0442\u044c \u0432\u0430\u043c \u0456 \u0437\u0430 \u0432\u0430\u0440\u0442\u0456\u0441\u0442\u044e, \u0456 \u0437\u0430 \u0447\u0430\u0441\u043e\u043c.", 12 | "target": "Look at the air fares Verona \u2014 St Petersburg for the next two weeks and choose the option that fits you, in price and in time." 13 | }, 14 | { 15 | "source": "Accace Ukraine \u0440\u0430\u0437\u043e\u043c \u0437 \u043a\u043e\u043c\u0430\u043d\u0434\u043e\u044e EBA Education \u0437\u0430\u043f\u0440\u043e\u0448\u0443\u044e\u0442\u044c \u0432\u0430\u0441 \u0432\u0437\u044f\u0442\u0438 \u0443\u0447\u0430\u0441\u0442\u044c \u0443 EBA Education Update: \u0410\u0443\u0434\u0438\u0442 \u0435\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e\u0441\u0442\u0456.", 16 | "target": "EBA Education Team together with Accace Ukraine invite you to join the EBA Education Update: Performance Audit." 17 | }, 18 | { 19 | "source": "Almatherm , IP \u0432 \u0410\u043b\u043c\u0430\u0442\u0438 _ \u0406\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d Almatherm , IP \u0410\u043b\u043c\u0430\u0442\u0438 (\u041a\u0430\u0437\u0430\u0445\u0441\u0442\u0430\u043d)", 20 | "target": "Almatherm , IP in Almaty _ Online-store Almatherm , IP Almaty (Kazakhstan)" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.zh-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u5728\u4ed6\u7684\u4f5c\u54c1\u5f00\u59cb\u5728\u610f\u5927\u5229\u4e8e1924\u5e74\u5728\u5f88\u5927\u7a0b\u5ea6\u4e0a\u4f5c\u98ce\"\u610f\u5927\u5229\u4ee3\u6570\u51e0\u4f55 , \" Zariski\u610f\u8bc6\u5230,\u6574\u4e2a\u95ee\u9898\u9700\u8981\u9002\u5f53\u7684\u57fa\u7840\u3002", 4 | "target": "After beginning his work in Italy in 1924 very much in the style of \"Italian algebraic geometry,\" Zariski realised that the whole subject needed proper foundations." 5 | }, 6 | { 7 | "source": "\u9776\u5411DSB\u5f62\u6210\u540e,\u7ec6\u80de\u901a\u5e38\u4f7f\u7528\u4e24\u79cdDNA\u4fee\u590d\u9014\u5f84\u4e2d\u7684\u4e00\u79cd\u6765\u5b58\u6d3b:\u975e\u540c\u6e90\u672b\u7aef\u8fde\u63a5(NHEJ)\u6216\u540c\u6e90\u4f9d\u8d56\u6027\u4fee\u590d(HDR ) \u3002", 8 | "target": "Following targeted DSB formation, the cell typically uses one of two DNA repair pathways to survive: non-homologous end joining (NHEJ) or homology dependent repair (HDR)." 9 | }, 10 | { 11 | "source": "\u300a\u85c9\u7531\u539f\u5b50\u5f48\u62b5\u9054\u706b\u661f\uff1a\u7375\u6236\u5ea7\u8a08\u5283\u79d8\u53f2\u300b\uff08To Mars by A-Bomb: The Secret History of Project Orion\uff09\u662f\u4e00\u90e82003\u5e74\u82f1\u570b\u5ee3\u64ad\u516c\u53f8\uff08BBC\uff09\u95dc\u65bc\u8a72\u8a08\u5283\u7684\u7d00\u9304\u7247\u3002", 12 | "target": "To Mars by A-Bomb: The Secret History of Project Orion was a 2003 BBC documentary film about the project." 
13 | }, 14 | { 15 | "source": "1998\u5e74\u8fdb\u884c\u7684\u4e00\u9879\u8c03\u67e5\u53d1\u73b0\uff0c\u65e5\u672c\u670929.5%\u7684\u4eba\u53e3\u76f8\u4fe1\u6765\u751f\uff0c\u8fd8\u6709\u53e6\u591640%\u613f\u610f\u76f8\u4fe1\uff0c\u5e76\u4e14\u53c8\u4ee5\u5e74\u8f7b\u4eba\u76f8\u4fe1\u7684\u6bd4\u4f8b\u6700\u9ad8\u3002", 16 | "target": "A 1998 survey found that 29.5% of the Japanese population believed in an afterlife, and a further 40% wanted to believe; belief was highest among the young." 17 | }, 18 | { 19 | "source": "2001\u5e7412\u6708\uff0cRTECS\u88abNIOSH\u8f6c\u8ba9\u7ed9\u4e86\u7231\u601d\u552f\u5c14MDL\uff08Elsevier MDL\uff09\uff0c\u4e00\u5bb6\u79c1\u8425\u516c\u53f8\u3002", 20 | "target": "In December 2001 RTECS was transferred from NIOSH to the private company Elsevier MDL." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.cs-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Podle všeho šlo zcela určitě o pokračující bitvu.", 4 | "target": "It appears that this was definitely an ongoing battle." 5 | }, 6 | { 7 | "source": "Nová doba přinesla nové návody.", 8 | "target": "The new age brought new instructions." 9 | }, 10 | { 11 | "source": "Jenže změnila název a IČO a nebylo co řešit.", 12 | "target": "The company then changed its company ID number, and that was it." 13 | }, 14 | { 15 | "source": "Chodí s dopomocí a jezdí na kole.", 16 | "target": "He's walking with help, and riding a bike." 17 | }, 18 | { 19 | "source": "V Londýně zemřel cyklista po srážce s automobilem.", 20 | "target": "A cyclist has died in a collision involving a car in London." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.de-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Die Lehrstellen beim Kreis sind offenbar sehr begehrt.", 4 | "target": "The apprenticeship positions at the district are apparently very coveted." 5 | }, 6 | { 7 | "source": "Michael Rottmann, Manager der Halle, berichtet, dass bereits nach der Berufswahlmesse am 19. September ein entsprechender Schriftzug auf dem Werbeschild am Treppenaufgang zum Gempt-Bistro entdeckt worden sei.", 8 | "target": "Michael Rottmann, manager of the hall, reports that writing was found on the advertising sign on the staircase to the Gempt Bistro on September 19 after the career choice fair." 9 | }, 10 | { 11 | "source": "Die CDU mit ihrem Spitzenkandidaten, dem Innenminister Lorenz Caffier, hat schon erlebt, wie es ist, wenn man zur falschen Zeit am richtigen Ort Wahlkampf macht.", 12 | "target": "The CDU and its top-level candidate, Minister of the Interior Lorenz Caffier, has already experienced what happens when you fight an election in the right place at the wrong time." 13 | }, 14 | { 15 | "source": "Sechs Monate Bauarbeiten, das ist schon brutal.", 16 | "target": "Six months of construction works, that's brutal." 17 | }, 18 | { 19 | "source": "Die Därme der Mäuse wurden DNA-sequenziert und es wurden sechs Bakterienarten gefunden, die in den Mäusen mit den Immunzellen vorhanden waren, aber ohne sie bei den Mäusen fehlten.", 20 | "target": "The guts of the mice were DNA sequenced and it was found six bacterial species present in the mice with the immune cells but absent from the mice without them." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-cs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Garcia said he appreciates Lomachenko as \"a tremendous fighter,\" and may attend his Saturday bout.", 4 | "target": "Garcia uvedl, že uznává Lomačenka jako „úžasného boxera“ a mohl by se zúčastnit jeho sobotního zápasu." 5 | }, 6 | { 7 | "source": "My supervisor, wo was Czech, once told me about the French football team and said something along the lines of:", 8 | "target": "Moje vedoucí, Češka, mi jednou vyprávěla o francouzském fotbalovém týmu a pronesla něco ve smyslu: Vždyť to nejsou Francouzi, ale Afričané." 9 | }, 10 | { 11 | "source": "The report, being considered by Theresa May, also calls for tougher checks on registration to prevent the electoral register being used for immigration and benefit fraud.", 12 | "target": "Zpráva, kterou nyní hodnotí Theresa Mayová, také žádá o důkladnější kontroly při registracích, aby se zabránilo využívání volebních seznamů pro imigrační podvody a podvody s dávkami." 13 | }, 14 | { 15 | "source": "As Miller scolded Acosta: \"I don't want to get off into a whole thing about history here, but the Statue of Liberty is ... a symbol of American liberty lighting the world.", 16 | "target": "Miller pokáral Acostu slovy: „Nechci se tady pouštět do historie, ale Socha Svobody je... symbol americké svobody, který vrhá světlo na celý svět." 17 | }, 18 | { 19 | "source": "People will also not be able to vote at the Consulate General in Donetsk.", 20 | "target": "Volit se nebude ani na generálním konzulátu v Doněcku." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-de.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "According to Rouhani, European signatories of the 2015 nuclear deal urged him to meet with US President Donald Trump, promising that Washington would lift \"all\" sanctions in return.", 4 | "target": "Laut Rouhani drängten ihn europäische Unterzeichnerstaaten der Nuklearvereinbarung von 2015 zu einem Treffen mit dem amerikanischen Präsidenten Donald Trump, und versprachen, dass Washington in Gegenzug „alle“ Sanktionen aufheben würden." 5 | }, 6 | { 7 | "source": "My last stop is a shop said to have inspired a fresh generation of vinyl lovers: Urban Outfitters.", 8 | "target": "Mein letzter Halt ist ein Geschäft, von dem gesagt wird, dass eine neue Generation von Plattenliebhabern inspiriert haben soll: Urban Outfitters." 9 | }, 10 | { 11 | "source": "\"You've really just got to dig deep and worry about your own match,\" Spieth said.", 12 | "target": "„Du musst dich wirklich stark konzentrieren und dir nur Sorgen um dein eigenes Spiel machen\", sagte Spieth." 13 | }, 14 | { 15 | "source": "The Report opens with plea for open debate and the formation of a consensus in the United States about the policy towards the Middle East.", 16 | "target": "Der Bericht fängt an mit der Bitte um einen offenen Diskurs und um Konsensbildung in den USA über eine Strategie für den Mittleren Osten." 17 | }, 18 | { 19 | "source": "Who should go to a sleep lab, and what happens there?", 20 | "target": "Wer sollte ins Schlaflabor, und was passiert da?" 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-is.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "In the year 1970, Raymond Damadian, a medical doctor and research scientist, discovered the basis for using magnetic resonance imaging as a tool for medical diagnosis.", 4 | "target": "Árið 1970 uppgötvaði Raymond Damatian, læknir og rannsakandi, undirstöðuatriðin við að nota segulómun við læknisfræðilega greiningu." 5 | }, 6 | { 7 | "source": "It is related to but usually not involving alpine style ski touring or mountaineering, the latter ones done in steep terrain and requiring much stiffer skis and boots.", 8 | "target": "Þetta er skyld skíðaíþrótt en felur yfirleitt ekki í sér alpagreinar né fjallaklifur, en síðarnefndu íþróttagreinarnar fara fram í miklum bratta og krefjast notkunar á mun stífari skíðum og skíðaskóm." 9 | }, 10 | { 11 | "source": "In late 2015, TogiNet established AstroNet Radio as a subsidiary station.", 12 | "target": "Síðari hluta árs 2015 stofnaði TogiNet útvarpsstöðina AstroNet sem dótturstöð." 13 | }, 14 | { 15 | "source": "Boating is a national pastime in Finland, with a boat to every seven or eight people.", 16 | "target": "Siglingar á bátum eru vinsæl dægrastytting í Finnlandi þar sem er einn bátur á hverja sjö til átta einstaklinga." 17 | }, 18 | { 19 | "source": "Dustin \"Goldust\" Runnels commented that \"Luna was as freaky as me...maybe even more...love her and will miss her...hopefully she's in a better place.\"", 20 | "target": "Dustin \"Goldust\" Runnels sagði: \"Luna var jafn klikkuð og ég ... jafnvel klikkaðri ... elska hana og mun sakna hennar ... vonandi er hún á betri stað.\"" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-ru.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "During this period of European history, the Catholic Church, which had become rich and powerful, came under scrutiny.", 4 | "target": "В течение этого периода европейской истории, католическая церковь, ставшая богатой и сильной, оказалась объектом пристального внимания." 5 | }, 6 | { 7 | "source": "House Intelligence Committee Chairman Adam Schiff (D-Calif.) condemned Trump's attacks on the whistleblower and his or her sources.", 8 | "target": "Председатель Комитета палаты представителей США по разведке Адам Шифф осудил нападение Трампа на осведомителя и его источники." 9 | }, 10 | { 11 | "source": "According to the RT Telegram channel, the journalists wore special protective gear and were there for less than five minutes, since the ruins of the reactor that was destroyed by the explosion still emit 40 thousand times more radiation than normal.", 12 | "target": "Как сообщается в Telegram-канале RT, журналисты находились там в специальных защитных костюмах менее пяти минут, так как руины разрушенного взрывом энергоблока до сих пор источают радиацию, в 40 тысяч раз превышающую норму." 13 | }, 14 | { 15 | "source": "\"Best Screenplay\" was awarded to the work of Sergey Dmitrenko based on stories told by Fazil Iskander in the book \"Sandro from Chegem\".", 16 | "target": "\"Лучший киносценарий\" - работа Сергея Дмитренко по мотивам историй, рассказанных Фазилем Искандером в книге \"Сандро из Чегема\"." 
17 | }, 18 | { 19 | "source": "An action participant, Nevroz Duman claimed that the aim of the march is to show that the society is united and consolidated about this issue.", 20 | "target": "Одна из участниц акции Невроз Думан заявила, что цель демонстрации - показать единство общества и солидарность в данной проблеме." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-zh.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "But playing can be tough.", 4 | "target": "但是踢球有时会很艰难。" 5 | }, 6 | { 7 | "source": "\"In the coming months, we will consult on introducing a complete ban on the advertising and promotion of vapour products.\"", 8 | "target": "“在未来几个月,我们将就全面禁止雾化产品的广告和促销进行磋商。”" 9 | }, 10 | { 11 | "source": "A traveller with piles of money might consider a round the world flight, broken up with stays in many of these hotels.", 12 | "target": "一名腰缠万贯的旅客可能会考虑乘飞机环游世界,并在旅途中陆续入住许多家这样的酒店。" 13 | }, 14 | { 15 | "source": "This is the second institution of higher learning jointly established by a US$130 million donation by the Li Ka Shing Foundation. With effect from next year, the Foundation will commit another RMB2 billion to build Shantou University.", 16 | "target": "该校为李嘉诚基金会捐资1.3亿美元合作筹建的第二所高校,而李嘉诚基金会明年起将再投资20亿元(人民币,下同)建设汕头大学。" 17 | }, 18 | { 19 | "source": "The 25th Asia-Pacific Economic Cooperation (APEC) Economic Leaders' Meeting was held in the central Vietnam city of Da Nang from the 10th to 11th. The progress of the Free Trade Area of the Asia-Pacific (FTAAP) will be one of the highlights of this meeting.", 20 | "target": "亚太经合组织(APEC)第二十五次领导人非正式会议将于10日至11日在越南中部城市岘港举行,亚太自贸区(FTAAP)进程是此次会议讨论的热点之一。" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.is-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Vörubílstjórinn, sem er 64 ára, slasaðist ekki við áreksturinn.", 4 | "target": "The truck driver, who is aged 64, was not injured in the crash." 5 | }, 6 | { 7 | "source": "Það er auðveldlega hægt að gera með því að nota tiltölulega hljóðláta vekjaraklukku til að koma þér til meðvitundar án þess að vekja þig að fullu.", 8 | "target": "This can be easily done by using a relatively quiet alarm clock to bring you to consciousness without fully waking you." 9 | }, 10 | { 11 | "source": "Hsieh gaf í skyn þegar kosningarnar fóru fram að Ma kynni að flýja land um leið og neyðarástand yrði.", 12 | "target": "Hsieh implied during the election that Ma might flee the country during a time of crisis." 13 | }, 14 | { 15 | "source": "Þú verður alltaf að bóka beint hjá flugfélaginu í gegnum síma.", 16 | "target": "In all cases, you must book by phone directly with the airline." 17 | }, 18 | { 19 | "source": "Kannski er algengasta tegund ferðaþjónustu sú sem fólk tengir við ferðalög: Afþreyingarferðamennska.", 20 | "target": "Perhaps the most common type of tourism is what most people associate with traveling: Recreation tourism." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.ru-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Материалы переданы в Пресненский районный суд Москвы для рассмотрения по существу.", 4 | "target": "The files have been handed over to Moscow’s Presnensky District Court for consideration on the merits." 5 | }, 6 | { 7 | "source": "По словам Бондарева, за три года в Сирии уничтожены десятки тысяч объектов террористов - речь идет, например, о складах боеприпасов, укрепрайонах и штабах.", 8 | "target": "In Bondarev's words, after three years in Syria tens of thousands of terrorist targets have been destroyed - including, for example, ammunition stockpiles, fortifications, and headquarters." 9 | }, 10 | { 11 | "source": "Мне показали эту квартиру в Эрлс Корт, и здесь были такие же высокие потолки.", 12 | "target": "I was shown this apartment in Earls Court and it had the same high ceilings." 13 | }, 14 | { 15 | "source": "\"Я сказал, нигга, убирайся из моего дома!\".", 16 | "target": "\"I said n---a, get off my property!\"" 17 | }, 18 | { 19 | "source": "Однако говорить о триумфе Европы пока преждевременно.", 20 | "target": "Talk of European glory remains premature, though." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.zh-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "这将造福中国人民,也将造福世界各国人民。", 4 | "target": "This will benefit the Chinese people, and benefit all the peoples of the world as well." 5 | }, 6 | { 7 | "source": "人工智能有很强的科幻色彩,但它其实是计算机科学非常重要的一个分支,研究的是机器的行为、学习和智能适应。", 8 | "target": "Although AI has a strong connotation of science fiction, AI forms a very important branch of computer science, dealing with behavior, learning and intelligent adaptation in a machine." 9 | }, 10 | { 11 | "source": "井贤栋表示,过去3年“蚂蚁森林”影响了5亿人,未来3年“蚂蚁森林”的目标是带动全球10亿人参与低碳行动。", 12 | "target": "Jing said \"Ant Forest\" had affected 500 million people in the past 3 years, and its goal in the next 3 years is to inspire 1 billion people worldwide to take part in low-carbon activities." 13 | }, 14 | { 15 | "source": "接下来,研究人员等母狮体内的荷尔蒙含量达到合适程度时,为它进行人工授精。", 16 | "target": "Next, the researchers waited for the hormones in the female lion to reach a proper level so as to conduct the artificial insemination in it." 17 | }, 18 | { 19 | "source": "划转工作时间紧,任务重 。 ”", 20 | "target": "The transfer work is tough job in limited time. 
\"" 21 | } 22 | ] -------------------------------------------------------------------------------- /install_alma.sh: -------------------------------------------------------------------------------- 1 | pip install transformers==4.51.1 2 | pip install peft==0.13.0 3 | pip install sentencepiece 4 | pip install sacrebleu 5 | pip install ipython 6 | pip install datasets 7 | pip install evaluate 8 | pip3 install deepspeed==0.15.1 9 | pip install einops 10 | pip install wandb 11 | pip install zstandard 12 | pip install accelerate==0.34.2 13 | pip install jsonlines 14 | pip install trl 15 | -------------------------------------------------------------------------------- /outputs/wmt22_outputs/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /run_cpo_llmmt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import logging 5 | import os 6 | import sys 7 | import json 8 | 9 | import datasets 10 | import torch 11 | from datasets import load_dataset 12 | 13 | import transformers 14 | from transformers import ( 15 | HfArgumentParser, 16 | set_seed, 17 | ) 18 | from utils.utils import preprocess_cpo_data, load_tokenizer, load_model, SavePeftModelCallback 19 | from utils.arguments import ModelArguments, DataTrainingArguments 20 | from trl import CPOTrainer, CPOConfig 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | def main(): 25 | # See all possible arguments in src/transformers/training_args.py 26 | # or by passing the --help flag to this script. 27 | # We now keep distinct sets of args, for a cleaner separation of concerns. 28 | 29 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CPOConfig)) 30 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 31 | # If we pass only one argument to the script and it's the path to a json file, 32 | # let's parse it to get our arguments. 33 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 34 | else: 35 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 36 | 37 | # Setup logging 38 | logging.basicConfig( 39 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 40 | datefmt="%m/%d/%Y %H:%M:%S", 41 | handlers=[logging.StreamHandler(sys.stdout)], 42 | ) 43 | 44 | if training_args.should_log: 45 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
46 | transformers.utils.logging.set_verbosity_info() 47 | 48 | log_level = training_args.get_process_log_level() 49 | logger.setLevel(log_level) 50 | datasets.utils.logging.set_verbosity(log_level) 51 | transformers.utils.logging.set_verbosity(log_level) 52 | transformers.utils.logging.enable_default_handler() 53 | transformers.utils.logging.enable_explicit_format() 54 | 55 | # Log on each process the small summary: 56 | logger.warning( 57 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 58 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 59 | ) 60 | logger.info(f"Training/evaluation parameters {training_args}") 61 | 62 | # Get the datasets 63 | pairs = set(data_args.language_pairs.split(",")) 64 | train_raw_data, valid_raw_data, test_raw_data = {}, None, None 65 | seen = set() 66 | ## load cpo dataset 67 | train_raw_data["mmt"] = {} 68 | for pair in pairs: 69 | src_lang, tgt_lang = pair.split("-") 70 | first_lang = src_lang if src_lang != "en" else tgt_lang 71 | second_lang = "en" 72 | if (first_lang, second_lang) not in seen and training_args.do_train: 73 | train_raw_data["mmt"][f"{first_lang}-{second_lang}"] = load_dataset( 74 | data_args.cpo_data_path, 75 | f"{first_lang}-{second_lang}", 76 | cache_dir=model_args.cache_dir, 77 | use_auth_token=True if model_args.use_auth_token else None, 78 | streaming=data_args.streaming, 79 | ) 80 | seen.add((first_lang, second_lang)) 81 | 82 | # load tokenizer 83 | set_seed(training_args.seed) 84 | tokenizer = load_tokenizer(data_args, model_args, training_args, logger) 85 | 86 | shots_eval_dict = {} 87 | if data_args.few_shot_eval_path: 88 | for lg_pair in pairs: # test_raw_data is None in this script, so iterate over the requested language pairs 89 | pair_shot_path = os.path.join(data_args.few_shot_eval_path, f"shots.{lg_pair}.json") 90 | if not os.path.isfile(pair_shot_path): 91 | raise ValueError(f"Make sure the language pair {lg_pair} is in the few shot eval folder!") 92 | with open(pair_shot_path) as f: 93 | shots_eval_dict[lg_pair] = json.load(f) 94 | 95 | # Preprocess data 96 | train_datasets, eval_datasets, test_datasets = preprocess_cpo_data(train_raw_data, valid_raw_data, test_raw_data, pairs, tokenizer, shots_eval_dict, data_args, training_args, model_args) 97 | 98 | # Load model 99 | model = load_model(data_args, model_args, training_args, tokenizer, logger) 100 | 101 | # Initialize our Trainer 102 | trainer = CPOTrainer( 103 | model, 104 | args=training_args, 105 | train_dataset=train_datasets, 106 | eval_dataset=eval_datasets, 107 | tokenizer=tokenizer, 108 | callbacks=[SavePeftModelCallback] if model_args.use_peft else None, 109 | ) 110 | # Training 111 | if training_args.do_train: 112 | checkpoint = None 113 | if training_args.resume_from_checkpoint is not None: 114 | checkpoint = training_args.resume_from_checkpoint 115 | 116 | trainer.train(resume_from_checkpoint=checkpoint) 117 | 118 | trainer.save_state() 119 | if model_args.use_peft: 120 | if torch.distributed.get_rank() == 0: 121 | model.save_pretrained(training_args.output_dir) 122 | else: 123 | trainer.save_model() # Saves the tokenizer too for easy upload 124 | 125 | def _mp_fn(index): 126 | # For xla_spawn (TPUs) 127 | main() 128 | 129 | 130 | if __name__ == "__main__": 131 | main() -------------------------------------------------------------------------------- /run_llmmt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import logging 5 | 
import copy 6 | import math 7 | import os 8 | import sys 9 | import json 10 | import random 11 | from dataclasses import dataclass, field 12 | from itertools import chain 13 | from typing import Optional 14 | import numpy as np 15 | import jsonlines 16 | 17 | import datasets 18 | import evaluate 19 | import torch 20 | from datasets import load_dataset 21 | 22 | import transformers 23 | from transformers import ( 24 | CONFIG_MAPPING, 25 | MODEL_FOR_CAUSAL_LM_MAPPING, 26 | AutoConfig, 27 | AutoModelForCausalLM, 28 | AutoTokenizer, 29 | HfArgumentParser, 30 | Trainer, 31 | TrainingArguments, 32 | Seq2SeqTrainingArguments, 33 | default_data_collator, 34 | is_torch_tpu_available, 35 | set_seed, 36 | LlamaTokenizer, 37 | ) 38 | from transformers.testing_utils import CaptureLogger 39 | from transformers.trainer_utils import get_last_checkpoint 40 | from transformers.utils import check_min_version, send_example_telemetry 41 | from transformers.utils.versions import require_version 42 | from peft import LoraConfig, get_peft_model, TaskType 43 | from peft import PeftModel, PeftConfig 44 | from collections import defaultdict 45 | from transformers.trainer_callback import TrainerCallback 46 | from datasets import concatenate_datasets, interleave_datasets 47 | from utils.trainer_llmmt import LlmmtTrainer 48 | from utils.utils import LANG_TABLE, load_mmt_dataset, get_preprocessed_data, clean_outputstring, load_a_single_text_file, load_tokenizer, load_model, SavePeftModelCallback, get_key_suffix, NLLB_CODE, ISO1_ISO3_map 49 | from utils.arguments import ModelArguments, DataTrainingArguments 50 | from utils.ul2collator import DataCollatorForUL2 51 | 52 | logger = logging.getLogger(__name__) 53 | 54 | from peft import get_peft_config, get_peft_model, LoraConfig, TaskType 55 | 56 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 57 | 58 | 59 | def main(): 60 | # See all possible arguments in src/transformers/training_args.py 61 | # or by passing the --help flag to this script. 62 | # We now keep distinct sets of args, for a cleaner separation of concerns. 63 | 64 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) 65 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 66 | # If we pass only one argument to the script and it's the path to a json file, 67 | # let's parse it to get our arguments. 68 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 69 | else: 70 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 71 | 72 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 73 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 74 | send_example_telemetry("run_llmmt", model_args, data_args) 75 | 76 | # Setup logging 77 | logging.basicConfig( 78 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 79 | datefmt="%m/%d/%Y %H:%M:%S", 80 | handlers=[logging.StreamHandler(sys.stdout)], 81 | ) 82 | 83 | if training_args.should_log: 84 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
85 | transformers.utils.logging.set_verbosity_info() 86 | 87 | log_level = training_args.get_process_log_level() 88 | logger.setLevel(log_level) 89 | datasets.utils.logging.set_verbosity(log_level) 90 | transformers.utils.logging.set_verbosity(log_level) 91 | transformers.utils.logging.enable_default_handler() 92 | transformers.utils.logging.enable_explicit_format() 93 | 94 | # Log on each process the small summary: 95 | logger.warning( 96 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 97 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 98 | ) 99 | logger.info(f"Training/evaluation parameters {training_args}") 100 | 101 | # Get the datasets 102 | pairs = data_args.language_pairs.split(",") 103 | train_raw_data, valid_raw_data, test_raw_data = {}, None, {} 104 | if data_args.text_test_file: 105 | test_raw_data["mmt"] = load_a_single_text_file(pairs, data_args, model_args) 106 | elif data_args.mmt_data_path: 107 | mmt_train_raw_data, valid_raw_data, mmt_test_raw_data = load_mmt_dataset(pairs, data_args, model_args, training_args, logger) 108 | train_raw_data["mmt"] = mmt_train_raw_data 109 | test_raw_data["mmt"] = mmt_test_raw_data 110 | 111 | load_kwargs = { 112 | 'cache_dir': model_args.cache_dir, 113 | 'token': True if model_args.use_auth_token else None, 114 | 'streaming': data_args.streaming, 115 | "trust_remote_code": True, 116 | } 117 | 118 | if data_args.aya_datasets: 119 | considered_languages_ISO1 = sorted(set(lang for p in pairs for lang in p.split('-'))) 120 | considered_languages_ISO3 = [ISO1_ISO3_map[lang] for lang in considered_languages_ISO1] 121 | if training_args.do_train: 122 | aya_train_raw_data = load_dataset( 123 | data_args.aya_datasets, 124 | **load_kwargs, 125 | )['train'] 126 | 127 | train_raw_data["aya"] = aya_train_raw_data.filter(lambda x: x['language_code'] in considered_languages_ISO3) 128 | if training_args.do_predict: 129 | test_raw_data["aya"] = {} 130 | aya_test_raw_data = load_dataset( 131 | data_args.aya_datasets, 132 | **load_kwargs, 133 | )['test'] 134 | for lg in considered_languages_ISO1: 135 | sub_aya_test_raw_data = aya_test_raw_data.filter(lambda x: x['language_code'] in [ISO1_ISO3_map[lg]]) 136 | if len(sub_aya_test_raw_data) > 0: 137 | test_raw_data["aya"][lg] = sub_aya_test_raw_data 138 | 139 | if data_args.mono_data_path: 140 | train_raw_data["mono"] = load_dataset( 141 | "json", 142 | data_files=data_args.mono_data_path, 143 | **load_kwargs, 144 | ) 145 | 146 | if data_args.nllb_pretrain_data_path: 147 | if data_args.nllb_interleave_probs: 148 | interleave_probs = [float(p) for p in data_args.nllb_interleave_probs.split(",")] 149 | else: 150 | interleave_probs = [1/len(pairs)] * len(pairs) 151 | 152 | nllb_raw_data = [] 153 | for lg_pair in pairs: 154 | src_lang, tgt_lang = lg_pair.split("-") 155 | src_lang, tgt_lang = NLLB_CODE[src_lang], NLLB_CODE[tgt_lang] 156 | language_key = f"{src_lang}-{tgt_lang}" if src_lang < tgt_lang else f"{tgt_lang}-{src_lang}" 157 | 158 | if src_lang == "tha_Thai" or tgt_lang == "tha_Thai": 159 | lg_dataset = load_dataset( 160 | "Helsinki-NLP/opus-100", 161 | "en-th", 162 | **load_kwargs, 163 | )["train"].shuffle(seed=training_args.seed) 164 | else: 165 | lg_dataset = load_dataset( 166 | data_args.nllb_pretrain_data_path, 167 | language_key, 168 | **load_kwargs, 169 | )['train'].shuffle(seed=training_args.seed) 170 | 171 | def normalize_example(example): 172 | lg1, lg2 =
example["translation"].keys() 173 | if random.random() < 0.5: 174 | combined_translation = example["translation"][lg1] + " " + example["translation"][lg2] 175 | else: 176 | combined_translation = example["translation"][lg2] + " " + example["translation"][lg1] 177 | return { 178 | "raw_text": combined_translation, 179 | } 180 | 181 | lg_dataset = lg_dataset.map(normalize_example, remove_columns=lg_dataset.column_names) 182 | nllb_raw_data.append(lg_dataset) 183 | 184 | train_raw_data["nllb_pretrain"] = interleave_datasets(nllb_raw_data, probabilities=interleave_probs, seed=training_args.seed, stopping_strategy="all_exhausted") 185 | 186 | if data_args.oscar_data_path: 187 | oscar_langs = data_args.oscar_data_lang.split(",") 188 | if data_args.interleave_probs: 189 | interleave_probs = [float(p) for p in data_args.interleave_probs.split(",")] 190 | else: 191 | interleave_probs = [1/len(oscar_langs)] * len(oscar_langs) 192 | oscar_langs = [x for x, _ in sorted(zip(oscar_langs, interleave_probs), key=lambda zippair: zippair[1])] 193 | interleave_probs = sorted(interleave_probs) 194 | oscar_train_raw_data = [] 195 | 196 | for lg in oscar_langs: 197 | oscar_lg_data = load_dataset( 198 | data_args.oscar_data_path, 199 | lg, 200 | **load_kwargs 201 | )['train'].shuffle(seed=training_args.seed) 202 | # if data_args.oscar_data_path != "cc100" else 203 | # load_dataset( 204 | # data_args.oscar_data_path, 205 | # lang=lg, 206 | # **load_kwargs 207 | # )['train'].shuffle(seed=training_args.seed) 208 | 209 | def normalize_oscar_example(example): 210 | return { 211 | "raw_text": example["text"], 212 | } 213 | oscar_lg_data = oscar_lg_data.map(normalize_oscar_example, remove_columns=oscar_lg_data.column_names) 214 | oscar_train_raw_data.append(oscar_lg_data) 215 | train_raw_data["oscar"] = interleave_datasets(oscar_train_raw_data, probabilities=interleave_probs, seed=training_args.seed, stopping_strategy="all_exhausted") 216 | 217 | if "nllb_pretrain" in train_raw_data: 218 | ## only for nllb pretrain and oscar: 219 | train_raw_data["oscar"] = interleave_datasets([train_raw_data["oscar"], train_raw_data["nllb_pretrain"]], probabilities=[0.5, 0.5], seed=training_args.seed, stopping_strategy="all_exhausted").shuffle(seed=training_args.seed) 220 | train_raw_data.pop("nllb_pretrain") 221 | 222 | # load tokenizer 223 | set_seed(training_args.seed) 224 | tokenizer = load_tokenizer(data_args, model_args, training_args, logger) 225 | if data_args.use_ul2: 226 | assert data_args.use_prefix_lm, "Must enable use prefix language model" 227 | 228 | shots_eval_dict = {} 229 | if data_args.few_shot_eval_path: 230 | for lg_pair in test_raw_data["mmt"].keys(): 231 | pair_shot_path = os.path.join(data_args.few_shot_eval_path, f"shots.{lg_pair}.json") 232 | if not os.path.isfile(pair_shot_path): 233 | ValueError(f"Make sure the language pair {lg_pair} is in the few shot eval folder!") 234 | with open(pair_shot_path) as f: 235 | shots_eval_dict[lg_pair] = json.load(f) 236 | 237 | if model_args.chat_style: 238 | dummy_sentence = "This is a dummy sentence" 239 | chat_dummy_sentence = [{"role": "user", "content": dummy_sentence}] 240 | dummy_sentence_with_speical_tokens = tokenizer.apply_chat_template(chat_dummy_sentence, tokenize=False, add_generation_prompt=True) 241 | encoded = tokenizer.encode(dummy_sentence_with_speical_tokens, add_special_tokens=False) 242 | decoded_text = tokenizer.decode(encoded, skip_special_tokens=True) 243 | begin_prefix = decoded_text.split(dummy_sentence, 1)[0].strip() 244 | additional_suffix = 
decoded_text.split(dummy_sentence, 1)[-1] 245 | else: 246 | begin_prefix = "" 247 | additional_suffix = "" 248 | 249 | train_datasets, eval_datasets, test_datasets = get_preprocessed_data(train_raw_data, valid_raw_data, test_raw_data, pairs, tokenizer, shots_eval_dict, data_args, training_args, model_args) 250 | metric = evaluate.load("sacrebleu") 251 | 252 | # Load model 253 | model = load_model(data_args, model_args, training_args, tokenizer, logger) 254 | collate_fn = DataCollatorForUL2(model, tokenizer) if data_args.use_ul2 else default_data_collator 255 | 256 | # Initialize our Trainer 257 | trainer = LlmmtTrainer( 258 | model=model, 259 | args=training_args, 260 | train_dataset=train_datasets if training_args.do_train else None, 261 | eval_dataset=eval_datasets if training_args.do_eval else None, 262 | tokenizer=tokenizer, 263 | data_collator=collate_fn, 264 | callbacks=[SavePeftModelCallback] if model_args.use_peft else None, 265 | ) 266 | 267 | # Training 268 | if training_args.do_train: 269 | checkpoint = None 270 | if training_args.resume_from_checkpoint is not None: 271 | checkpoint = training_args.resume_from_checkpoint 272 | 273 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 274 | 275 | trainer.save_state() 276 | if model_args.use_peft: 277 | model.save_pretrained(training_args.output_dir) 278 | else: 279 | trainer.save_model() # Saves the tokenizer too for easy upload 280 | # Prediction 281 | if training_args.do_predict: 282 | trainer.args.prediction_loss_only = False 283 | if data_args.mmt_data_path: 284 | lg_pairs = sorted(test_datasets["mmt"].keys()) # make sure each device prints in the same order 285 | for lg_pair in lg_pairs: 286 | test_dataset = test_datasets["mmt"][lg_pair] 287 | src_lang, tgt_lang = lg_pair.split("-") 288 | logger.info(f"*** Prediction for {lg_pair} ***") 289 | if model_args.encoder_decoder_type == "nllb": 290 | preds, _, _ = trainer.predict( 291 | test_dataset=test_dataset, 292 | max_new_tokens=data_args.max_new_tokens, 293 | num_beams=data_args.num_beams, 294 | metric_key_prefix="test", 295 | use_cache=True, 296 | forced_bos_token_id=tokenizer.lang_code_to_id[NLLB_CODE[tgt_lang]], 297 | ) 298 | else: 299 | preds, _, _ = trainer.predict( 300 | test_dataset=test_dataset, 301 | max_new_tokens=data_args.max_new_tokens, 302 | num_beams=data_args.num_beams, 303 | metric_key_prefix="test", 304 | use_cache=True, 305 | ) 306 | 307 | # Replace -100s used for padding as we can't decode them 308 | if int(torch.cuda.current_device()) == 0: 309 | preds = np.where(preds != -100, preds, tokenizer.pad_token_id) 310 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 311 | 312 | # Some simple post-processing 313 | decoded_preds = [pred.strip() for pred in decoded_preds] 314 | 315 | for idx in range(data_args.display_num_translations): 316 | print("------------------------") 317 | print(decoded_preds[idx]) 318 | 319 | with open(os.path.join(training_args.output_dir, f"test-{src_lang}-{tgt_lang}{data_args.suffix_eval_file}"), "w", encoding="utf-8") as f: 320 | suffix = get_key_suffix(tgt_lang, data_args, additional_suffix) 321 | if len(shots_eval_dict) != 0: 322 | split_idx = len(shots_eval_dict[lg_pair]) + 1 323 | else: 324 | split_idx = 1 325 | for pred in decoded_preds: 326 | # For encoder-decoder models the output is the translation itself; otherwise it is prompt + output and must be cleaned 327 | pred = clean_outputstring(pred, suffix, logger, split_idx) if not model_args.encoder_decoder_type else pred.strip() 328 | f.writelines([pred, "\n"]) 329 |
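# A worked sketch of the chat-style splitting used in the Aya block below, assuming a
# Llama-2-style template where begin_prefix == "[INST]" and additional_suffix == "[/INST]"
# (the actual values are derived from the tokenizer's chat template earlier in main()):
#   pred     = "[INST] Translate this. [/INST] Translation here."
#   question = pred.split("[/INST]")[0].split("[INST]")[1].strip()  # -> "Translate this."
#   response = pred.split("[/INST]")[1].strip()                     # -> "Translation here."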
330 | if data_args.aya_datasets: 331 | langs = sorted(test_datasets["aya"].keys()) # make sure each device prints in the same order 332 | for lg in langs: 333 | test_dataset = test_datasets["aya"][lg] 334 | logger.info(f"*** Prediction aya for {lg} ***") 335 | preds, _, _ = trainer.predict( 336 | test_dataset=test_dataset, 337 | max_new_tokens=data_args.max_new_tokens, 338 | num_beams=data_args.num_beams, 339 | metric_key_prefix="test", 340 | use_cache=True, 341 | ) 342 | 343 | # Replace -100s used for padding as we can't decode them 344 | if int(torch.cuda.current_device()) == 0: 345 | preds = np.where(preds != -100, preds, tokenizer.pad_token_id) 346 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 347 | 348 | # Some simple post-processing 349 | decoded_preds = [pred.strip() for pred in decoded_preds] 350 | 351 | for idx in range(data_args.display_num_translations): 352 | print("------------------------") 353 | print(decoded_preds[idx]) 354 | 355 | with jsonlines.open(os.path.join(training_args.output_dir, f"aya-test-{lg}.json"), "w") as f: 356 | for pred in decoded_preds: 357 | # For decoder-only models the prediction still contains the prompt, so split it into question/response below 358 | # pred = clean_outputstring(pred, suffix, logger, split_idx) if not model_args.encoder_decoder_type else pred.strip() 359 | try: 360 | if begin_prefix or additional_suffix: 361 | question = pred.split(additional_suffix)[0].split(begin_prefix)[1].strip() 362 | response = pred.split(additional_suffix)[1].strip() 363 | else: 364 | question = "" 365 | response = pred.strip() 366 | json_input = { 367 | "question": question, 368 | "response": response, 369 | } 370 | f.write(json_input) 371 | except Exception: 372 | json_input = { 373 | "question": pred, 374 | "response": "TODO", 375 | } 376 | f.write(json_input) 377 | print(f"Error in saving aya test {lg} json file.
The output is {pred}") 378 | continue 379 | 380 | 381 | def _mp_fn(index): 382 | # For xla_spawn (TPUs) 383 | main() 384 | 385 | 386 | if __name__ == "__main__": 387 | main() 388 | 389 | -------------------------------------------------------------------------------- /runs/cpo_ft.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./alma-7b-dpo-ft"} 2 | pairs=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config_bf16.yaml \ 7 | run_cpo_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 9 | --tokenizer_name haoranxu/ALMA-13B-Pretrain \ 10 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 11 | --cpo_scorer kiwi_xcomet \ 12 | --beta 0.1 \ 13 | --use_peft \ 14 | --use_fast_tokenizer False \ 15 | --cpo_data_path haoranxu/ALMA-R-Preference \ 16 | --do_train \ 17 | --language_pairs ${pairs} \ 18 | --low_cpu_mem_usage \ 19 | --bf16 \ 20 | --learning_rate 1e-4 \ 21 | --weight_decay 0.01 \ 22 | --gradient_accumulation_steps 1 \ 23 | --lr_scheduler_type inverse_sqrt \ 24 | --warmup_ratio 0.01 \ 25 | --ignore_pad_token_for_loss \ 26 | --ignore_prompt_token_for_loss \ 27 | --per_device_train_batch_size 2 \ 28 | --evaluation_strategy no \ 29 | --save_strategy steps \ 30 | --save_total_limit 1 \ 31 | --logging_strategy steps \ 32 | --logging_steps 0.05 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --num_train_epochs 1 \ 35 | --prediction_loss_only \ 36 | --max_new_tokens 256 \ 37 | --max_source_length 256 \ 38 | --max_prompt_length 256 \ 39 | --max_length 512 \ 40 | --seed 42 \ 41 | --overwrite_output_dir \ 42 | --report_to none \ 43 | --overwrite_cache -------------------------------------------------------------------------------- /runs/mono_ft.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./llama-2-7b-oscar-ft"} 2 | # random port between 30000 and 50000 3 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 4 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config.yaml \ 5 | run_llmmt.py \ 6 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 7 | --oscar_data_path oscar-corpus/OSCAR-2301 \ 8 | --oscar_data_lang en,ru,cs,zh,is,de \ 9 | --interleave_probs "0.17,0.22,0.14,0.19,0.08,0.2" \ 10 | --streaming \ 11 | --max_steps 600000 \ 12 | --do_train \ 13 | --low_cpu_mem_usage \ 14 | --fp16 \ 15 | --learning_rate 2e-5 \ 16 | --weight_decay 0.01 \ 17 | --gradient_accumulation_steps 4 \ 18 | --lr_scheduler_type cosine \ 19 | --warmup_ratio 0.01 \ 20 | --ignore_pad_token_for_loss \ 21 | --ignore_prompt_token_for_loss \ 22 | --per_device_train_batch_size 4 \ 23 | --per_device_eval_batch_size 4 \ 24 | --save_strategy steps \ 25 | --save_steps 2000 \ 26 | --save_total_limit 1 \ 27 | --logging_strategy steps \ 28 | --logging_steps 1 \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --max_new_tokens 256 \ 31 | --max_source_length 256 \ 32 | --seed 42 \ 33 | --overwrite_output_dir \ 34 | --report_to none 35 | -------------------------------------------------------------------------------- /runs/parallel_ft.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./alma-7b-parallel-ft"} 2 | pairs=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | # random port between 30000 and 50000 5 | 
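# ($RANDOM is uniform over 0..32767, so RANDOM % 20001 falls in 0..20000 and the +30000
#  offset lands in [30000, 50000]; the modulo makes the low end of the range slightly more
#  likely, which is harmless when picking a free rendezvous port.)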
port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config.yaml \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 9 | --mmt_data_path ./human_written_data/ \ 10 | --do_train \ 11 | --do_eval \ 12 | --do_predict \ 13 | --language_pairs ${pairs} \ 14 | --load_best_model_at_end \ 15 | --low_cpu_mem_usage \ 16 | --fp16 \ 17 | --learning_rate 2e-5 \ 18 | --weight_decay 0.01 \ 19 | --gradient_accumulation_steps 4 \ 20 | --lr_scheduler_type inverse_sqrt \ 21 | --warmup_ratio 0.01 \ 22 | --ignore_pad_token_for_loss \ 23 | --ignore_prompt_token_for_loss \ 24 | --per_device_train_batch_size 4 \ 25 | --per_device_eval_batch_size 4 \ 26 | --evaluation_strategy steps \ 27 | --eval_steps 0.1 \ 28 | --save_strategy steps \ 29 | --save_steps 0.1 \ 30 | --save_total_limit 1 \ 31 | --logging_strategy steps \ 32 | --logging_steps 0.05 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --num_train_epochs 1 \ 35 | --predict_with_generate \ 36 | --prediction_loss_only \ 37 | --max_new_tokens 256 \ 38 | --max_source_length 256 \ 39 | --seed 42 \ 40 | --overwrite_output_dir \ 41 | --num_beams 5 \ 42 | --ddp_timeout 999999 \ 43 | --report_to none \ 44 | --overwrite_cache 45 | 46 | -------------------------------------------------------------------------------- /runs/parallel_ft_lora.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./alma-7b-parallel-ft-lora"} 2 | pairs=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | LORA_RANK=${3:-"16"} 4 | 5 | # random port between 30000 and 50000 6 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 7 | 8 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config_bf16.yaml \ 9 | run_llmmt.py \ 10 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --use_peft \ 13 | --lora_rank ${LORA_RANK} \ 14 | --do_train \ 15 | --do_eval \ 16 | --do_predict \ 17 | --language_pairs ${pairs} \ 18 | --load_best_model_at_end \ 19 | --low_cpu_mem_usage \ 20 | --fp16 \ 21 | --learning_rate 2e-3 \ 22 | --weight_decay 0.01 \ 23 | --gradient_accumulation_steps 4 \ 24 | --lr_scheduler_type inverse_sqrt \ 25 | --warmup_ratio 0.01 \ 26 | --ignore_pad_token_for_loss \ 27 | --ignore_prompt_token_for_loss \ 28 | --per_device_train_batch_size 4 \ 29 | --per_device_eval_batch_size 4 \ 30 | --evaluation_strategy steps \ 31 | --eval_steps 0.05 \ 32 | --save_strategy steps \ 33 | --save_steps 0.05 \ 34 | --save_total_limit 1 \ 35 | --logging_strategy steps \ 36 | --logging_steps 0.05 \ 37 | --output_dir ${OUTPUT_DIR} \ 38 | --num_train_epochs 1 \ 39 | --predict_with_generate \ 40 | --prediction_loss_only \ 41 | --max_new_tokens 256 \ 42 | --max_source_length 256 \ 43 | --seed 42 \ 44 | --overwrite_output_dir \ 45 | --num_beams 5 \ 46 | --ddp_timeout 999999 \ 47 | --report_to none \ 48 | --overwrite_cache 49 | 50 | ## Evaluation (BLEU, COMET) 51 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${pairs} -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/utils/__init__.py -------------------------------------------------------------------------------- /utils/arguments.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | from transformers import MODEL_FOR_CAUSAL_LM_MAPPING 4 | from transformers.utils.versions import require_version 5 | 6 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 7 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 8 | @dataclass 9 | class ModelArguments: 10 | """ 11 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 12 | """ 13 | 14 | model_name_or_path: Optional[str] = field( 15 | default=None, 16 | metadata={ 17 | "help": ( 18 | "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." 19 | ) 20 | }, 21 | ) 22 | model_type: Optional[str] = field( 23 | default=None, 24 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 25 | ) 26 | config_overrides: Optional[str] = field( 27 | default=None, 28 | metadata={ 29 | "help": ( 30 | "Override some existing default config settings when a model is trained from scratch. Example: " 31 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 32 | ) 33 | }, 34 | ) 35 | config_name: Optional[str] = field( 36 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 37 | ) 38 | tokenizer_name: Optional[str] = field( 39 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 40 | ) 41 | cache_dir: Optional[str] = field( 42 | default=None, 43 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 44 | ) 45 | use_fast_tokenizer: bool = field( 46 | default=True, 47 | metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."}, 48 | ) 49 | model_revision: str = field( 50 | default="main", 51 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 52 | ) 53 | use_auth_token: bool = field( 54 | default=False, 55 | metadata={ 56 | "help": ( 57 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 58 | "with private models)." 59 | ) 60 | }, 61 | ) 62 | use_flash_attention_2: bool = field( 63 | default=False, 64 | metadata={ 65 | "help": ( 66 | "Will enable Flash Attention 2" 67 | ) 68 | }, 69 | ) 70 | torch_dtype: Optional[str] = field( 71 | default=None, 72 | metadata={ 73 | "help": ( 74 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 75 | "dtype will be automatically derived from the model's weights." 76 | ), 77 | "choices": ["auto", "bfloat16", "float16", "float32"], 78 | }, 79 | ) 80 | low_cpu_mem_usage: bool = field( 81 | default=False, 82 | metadata={ 83 | "help": ( 84 | "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." 85 | " Setting it to True will benefit LLM loading time and RAM consumption."
86 | ) 87 | }, 88 | ) 89 | encoder_decoder_type: str = field( 90 | default="", 91 | metadata={"help": "The encoder-decoder model type (e.g., 'nllb'); leave empty for decoder-only models."}, 92 | ) 93 | 94 | chat_style: bool = field( 95 | default=False, 96 | metadata={ 97 | "help": ( 98 | "Whether to use chat-style decoding" 99 | ) 100 | }, 101 | ) 102 | 103 | load_in_8bit: bool = field( 104 | default=False, 105 | metadata={ 106 | "help": ( 107 | "Whether to load the model in int8" 108 | ) 109 | }, 110 | ) 111 | use_peft: bool = field( 112 | default=False, 113 | metadata={ 114 | "help": ( 115 | "Whether to use PEFT (parameter-efficient fine-tuning)" 116 | ) 117 | }, 118 | ) 119 | lora_rank: int = field( 120 | default=16, 121 | metadata={ 122 | "help": ( 123 | "The rank for LoRA" 124 | ) 125 | }, 126 | ) 127 | multi_gpu_one_model: bool = field( 128 | default=False, 129 | metadata={ 130 | "help": "Use multiple GPUs to load one model." 131 | }, 132 | ) 133 | peft_model_id: str = field( 134 | default="", 135 | metadata={ 136 | "help": ( 137 | "PEFT model location" 138 | ) 139 | }, 140 | ) 141 | def __post_init__(self): 142 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 143 | raise ValueError( 144 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 145 | ) 146 | 147 | 148 | @dataclass 149 | class DataTrainingArguments: 150 | """ 151 | Arguments pertaining to what data we are going to input our model for training and eval. 152 | """ 153 | language_pairs: str = field(default="", metadata={"help": "training language pairs"}) 154 | dataset_name: Optional[str] = field( 155 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 156 | ) 157 | dataset_config_name: Optional[str] = field( 158 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 159 | ) 160 | mmt_data_path: Optional[str] = field(default=None, metadata={"help": "The input MMT training data path."}) 161 | override_test_data_path: Optional[str] = field(default=None, metadata={"help": "This will override the default test data in the mmt data"}) 162 | cpo_data_path: Optional[str] = field(default=None, metadata={"help": "The input CPO training data path."}) 163 | mono_data_path: Optional[str] = field(default=None, metadata={"help": "The input monolingual training data path."}) 164 | oscar_data_path: Optional[str] = field(default=None, metadata={"help": "The input Oscar monolingual dataset name."}) 165 | nllb_pretrain_data_path: Optional[str] = field(default=None, metadata={"help": "The input NLLB pretrain (parallel) data path."}) 166 | oscar_data_lang: Optional[str] = field(default=None, metadata={"help": "The Oscar monolingual data languages, split by commas."}) 167 | text_test_file: Optional[str] = field(default=None, metadata={"help": "A single test data file in text format; this will override mmt_data_path and override_test_data_path"}) 168 | aya_datasets: Optional[str] = field(default=None, metadata={"help": "The datasets for the Aya model."}) 169 | 170 | max_train_samples: Optional[int] = field( 171 | default=None, 172 | metadata={ 173 | "help": ( 174 | "For debugging purposes or quicker training, truncate the number of training examples to this " 175 | "value if set."
176 | ) 177 | }, 178 | ) 179 | max_eval_samples: Optional[int] = field( 180 | default=None, 181 | metadata={ 182 | "help": ( 183 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 184 | "value if set." 185 | ) 186 | }, 187 | ) 188 | max_test_samples: Optional[int] = field( 189 | default=None, 190 | metadata={ 191 | "help": ( 192 | "For debugging purposes, truncate the number of test examples to this " 193 | "value if set." 194 | ) 195 | }, 196 | ) 197 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 198 | block_size: Optional[int] = field( 199 | default=None, 200 | metadata={ 201 | "help": ( 202 | "Optional input sequence length after tokenization. " 203 | "The training dataset will be truncated in blocks of this size for training. " 204 | "Defaults to the model max input length for single sentence inputs (take into account special tokens)." 205 | ) 206 | }, 207 | ) 208 | overwrite_cache: bool = field( 209 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 210 | ) 211 | validation_split_percentage: Optional[int] = field( 212 | default=5, 213 | metadata={ 214 | "help": "The percentage of the train set used as validation set in case there's no validation split" 215 | }, 216 | ) 217 | preprocessing_num_workers: Optional[int] = field( 218 | default=None, 219 | metadata={"help": "The number of processes to use for the preprocessing."}, 220 | ) 221 | keep_linebreaks: bool = field( 222 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 223 | ) 224 | ignore_pad_token_for_loss: bool = field( 225 | default=True, 226 | metadata={ 227 | "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." 228 | }, 229 | ) 230 | ignore_prompt_token_for_loss: bool = field( 231 | default=False, 232 | metadata={ 233 | "help": "Whether to ignore the prompt tokens in the loss computation or not." 234 | }, 235 | ) 236 | use_ul2: bool = field( 237 | default=False, 238 | metadata={ 239 | "help": "Whether to enable the mixture of denoisers from the UL2 model." 240 | }, 241 | ) 242 | max_source_length: Optional[int] = field( 243 | default=256, 244 | metadata={ 245 | "help": ( 246 | "The maximum total input sequence length after tokenization. Sequences longer " 247 | "than this will be truncated, sequences shorter will be padded." 248 | ) 249 | }, 250 | ) 251 | max_new_tokens: Optional[int] = field( 252 | default=256, 253 | metadata={ 254 | "help": ( 255 | "The maximum number of new tokens to generate, excluding the prompt." 256 | ) 257 | }, 258 | ) 259 | num_beams: Optional[int] = field( 260 | default=5, 261 | metadata={ 262 | "help": ( 263 | "Beam size for generation" 264 | ) 265 | } 266 | ) 267 | 268 | display_num_translations: Optional[int] = field( 269 | default=10, 270 | metadata={ 271 | "help": ( 272 | "The number of translations to display after generation." 273 | ) 274 | } 275 | ) 276 | 277 | right_pad: bool = field( 278 | default=False, 279 | metadata={ 280 | "help": "Use right padding for training, especially for models like MPT." 281 | }, 282 | ) 283 | 284 | use_prefix_lm: bool = field( 285 | default=False, 286 | metadata={ 287 | "help": "Use a prefix language model, especially for models like MPT."
288 | }, 289 | ) 290 | few_shot_eval_path: str = field( 291 | default="", 292 | metadata={ 293 | "help": "The path for few-shot evaluation" 294 | }, 295 | ) 296 | use_target_lang_prompt_eval: bool = field( 297 | default=False, 298 | metadata={ 299 | "help": "Enable prompts in the target language, e.g., in Chinese, the prompt is 将其从英语翻译成汉语 (Translate this from English to Chinese): ......" 300 | }, 301 | ) 302 | 303 | interleave_probs: str = field( 304 | default="", 305 | metadata={ 306 | "help": "Using interleaving to concatenate datasets, with probabilities p1,p2,p3,..., split by commas" 307 | }, 308 | ) 309 | 310 | nllb_interleave_probs: str = field( 311 | default="", 312 | metadata={ 313 | "help": "Using interleaving to concatenate datasets, with probabilities p1,p2,p3,..., split by commas, for NLLB" 314 | }, 315 | ) 316 | 317 | suffix_eval_file: str = field( 318 | default="", 319 | metadata={ 320 | "help": "The suffix for the eval file: test-src-tgt'suffix_eval_file'" 321 | }, 322 | ) 323 | 324 | cpo_scorer: str = field( 325 | default="xcomet_kiwi", 326 | metadata={ 327 | "help": "The scorer for CPO training, e.g., xcomet, kiwi, or both of them (xcomet_kiwi)" 328 | }, 329 | ) 330 | 331 | 332 | # predict_source_lang: str = field(default="", metadata={"help": "The source language for testing"}) 333 | # predict_target_lang: str = field(default="en", metadata={"help": "The target language for testing"}) 334 | 335 | suffix: Optional[str] = field(default="", metadata={"help": "The suffix of the training file."}) 336 | 337 | def __post_init__(self): 338 | if self.streaming: 339 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 340 | -------------------------------------------------------------------------------- /utils/cpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Literal, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class CPOConfig(TrainingArguments): 22 | r""" 23 | CPOConfig collects all training arguments related to the [`CPOTrainer`] class. 24 | 25 | Using [`HfArgumentParser`] we can turn this class into 26 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 27 | command line. 28 | 29 | Parameters: 30 | max_length (`int`, defaults to `None`): 31 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 32 | max_prompt_length (`int`, defaults to `None`): 33 | The maximum length of the prompt. This argument is required if you want to use the default data collator. 34 | max_target_length (`int`, defaults to `None`): 35 | The maximum length of the target.
This argument is required if you want to use the default data collator and your model is an encoder-decoder. 36 | beta (`float`, defaults to 0.1): 37 | The beta factor in CPO loss. 38 | label_smoothing (`float`, defaults to 0): 39 | The label smoothing factor. This argument is required if you want to use the default data collator. 40 | loss_type (`str`, defaults to `sigmoid`): 41 | The type of loss to use. This argument is required if you want to use the default data collator. 42 | label_pad_token_id (`int`, defaults to `-100`): 43 | The label pad token id. This argument is required if you want to use the default data collator. 44 | cpo_alpha (`float`, defaults to `1.0`): 45 | A hyperparameter that controls the strength of the BC regularizer in CPO training. 46 | simpo_gamma (`float`, defaults to `0.5`): 47 | A target reward margin for the SimPO loss, used only when the "simpo" option is enabled. 48 | padding_value (`int`, defaults to `None`): 49 | The padding value if it is different to the tokenizer's pad_token_id. 50 | truncation_mode (`str`, defaults to `keep_end`): 51 | The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. 52 | generate_during_eval (`bool`, defaults to `False`): 53 | Whether to sample and log generations during evaluation step. 54 | is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): 55 | If no model is provided, we need to know if the model_init returns an encoder-decoder. 56 | disable_dropout (`bool`, defaults to `True`): 57 | Whether or not to disable dropouts in `model`. 58 | model_init_kwargs (`Optional[Dict]`, *optional*): 59 | Dict of Optional kwargs to pass when instantiating the model from a string 60 | dataset_num_proc (`Optional[int]`, *optional*): 61 | The number of workers to use to tokenize the data. Defaults to None. 62 | """ 63 | 64 | max_length: Optional[int] = None 65 | max_prompt_length: Optional[int] = None 66 | max_completion_length: Optional[int] = None 67 | max_target_length: Optional[int] = None 68 | 69 | beta: float = 0.1 70 | label_smoothing: float = 0 71 | loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid" 72 | disable_dropout: bool = True 73 | cpo_alpha: float = 1.0 74 | simpo_gamma: float = 0.5 75 | relax_cofficient_1: float = 0.9 76 | relax_cofficient_2: float = 0.4 77 | 78 | label_pad_token_id: int = -100 79 | padding_value: int = None 80 | truncation_mode: str = "keep_end" 81 | generate_during_eval: bool = False 82 | is_encoder_decoder: Optional[bool] = None 83 | 84 | model_init_kwargs: Optional[Dict] = None 85 | 86 | dataset_num_proc: Optional[int] = None 87 | 88 | def __post_init__(self): 89 | if self.loss_type == "kto_pair": 90 | raise ValueError("Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.") 91 | return super().__post_init__() 92 | -------------------------------------------------------------------------------- /utils/trainer_llmmt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from copy import deepcopy 16 | from pathlib import Path 17 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 18 | 19 | import torch 20 | from torch import nn 21 | from torch.utils.data import Dataset 22 | 23 | from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled  # `transformers.deepspeed` was removed; the helper now lives under `transformers.integrations` 24 | from transformers.generation.configuration_utils import GenerationConfig 25 | from transformers.trainer import Trainer 26 | from transformers.utils import logging 27 | 28 | 29 | if TYPE_CHECKING: 30 | from transformers.data.data_collator import DataCollator 31 | from transformers.modeling_utils import PreTrainedModel 32 | from transformers.tokenization_utils_base import PreTrainedTokenizerBase 33 | from transformers.trainer_callback import TrainerCallback 34 | from transformers.trainer_utils import EvalPrediction, PredictionOutput 35 | from transformers.training_args import TrainingArguments 36 | 37 | 38 | logger = logging.get_logger(__name__) 39 | 40 | 41 | class LlmmtTrainer(Trainer): 42 | def __init__( 43 | self, 44 | model: Union["PreTrainedModel", nn.Module] = None, 45 | args: "TrainingArguments" = None, 46 | data_collator: Optional["DataCollator"] = None, 47 | train_dataset: Optional[Dataset] = None, 48 | eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, 49 | tokenizer: Optional["PreTrainedTokenizerBase"] = None, 50 | model_init: Optional[Callable[[], "PreTrainedModel"]] = None, 51 | compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None, 52 | callbacks: Optional[List["TrainerCallback"]] = None, 53 | optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), 54 | preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, 55 | ): 56 | super().__init__( 57 | model=model, 58 | args=args, 59 | data_collator=data_collator, 60 | train_dataset=train_dataset, 61 | eval_dataset=eval_dataset, 62 | tokenizer=tokenizer, 63 | model_init=model_init, 64 | compute_metrics=compute_metrics, 65 | callbacks=callbacks, 66 | optimizers=optimizers, 67 | preprocess_logits_for_metrics=preprocess_logits_for_metrics, 68 | ) 69 | 70 | # Override self.model.generation_config if a GenerationConfig is specified in args. 71 | # Priority: args.generation_config > model.generation_config > default GenerationConfig. 72 | if self.args.generation_config is not None: 73 | gen_config = self.load_generation_config(self.args.generation_config) 74 | self.model.generation_config = gen_config 75 | 76 | @staticmethod 77 | def load_generation_config(gen_config_arg: Union[str, GenerationConfig]) -> GenerationConfig: 78 | """ 79 | Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments. 80 | 81 | Args: 82 | gen_config_arg (`str` or [`~generation.GenerationConfig`]): 83 | `Seq2SeqTrainingArguments.generation_config` argument. 84 | 85 | Returns: 86 | A `~generation.GenerationConfig`.
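Example (illustrative paths; any local file, directory, or Hub id that resolves to a
generation config works):

    load_generation_config(GenerationConfig(num_beams=5))   # returned as a deep copy
    load_generation_config("ckpt/generation_config.json")   # file -> parent dir + file name
    load_generation_config("ckpt/")                         # dir -> default config file name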
87 | """ 88 | 89 | # GenerationConfig provided, nothing to do 90 | if isinstance(gen_config_arg, GenerationConfig): 91 | return deepcopy(gen_config_arg) 92 | 93 | # str or Path 94 | pretrained_model_name = Path(gen_config_arg) if isinstance(gen_config_arg, str) else gen_config_arg 95 | config_file_name = None 96 | 97 | # Figuring if it is path pointing to a file, pointing to a directory or else a model id or URL 98 | # This step is required in order to determine config_file_name 99 | if pretrained_model_name.is_file(): 100 | config_file_name = pretrained_model_name.name 101 | pretrained_model_name = pretrained_model_name.parent 102 | # dir path 103 | elif pretrained_model_name.is_dir(): 104 | pass 105 | # model id or URL 106 | else: 107 | pretrained_model_name = gen_config_arg 108 | 109 | gen_config = GenerationConfig.from_pretrained(pretrained_model_name, config_file_name) 110 | return gen_config 111 | 112 | def evaluate( 113 | self, 114 | eval_dataset: Optional[Dataset] = None, 115 | ignore_keys: Optional[List[str]] = None, 116 | metric_key_prefix: str = "eval", 117 | **gen_kwargs, 118 | ) -> Dict[str, float]: 119 | """ 120 | Run evaluation and returns metrics. 121 | 122 | The calling script will be responsible for providing a method to compute metrics, as they are task-dependent 123 | (pass it to the init `compute_metrics` argument). 124 | 125 | You can also subclass and override this method to inject custom behavior. 126 | 127 | Args: 128 | eval_dataset (`Dataset`, *optional*): 129 | Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns 130 | not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` 131 | method. 132 | ignore_keys (`List[str]`, *optional*): 133 | A list of keys in the output of your model (if it is a dictionary) that should be ignored when 134 | gathering predictions. 135 | metric_key_prefix (`str`, *optional*, defaults to `"eval"`): 136 | An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named 137 | "eval_bleu" if the prefix is `"eval"` (default) 138 | max_length (`int`, *optional*): 139 | The maximum target length to use when predicting with the generate method. 140 | num_beams (`int`, *optional*): 141 | Number of beams for beam search that will be used when predicting with the generate method. 1 means no 142 | beam search. 143 | gen_kwargs: 144 | Additional `generate` specific kwargs. 145 | 146 | Returns: 147 | A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The 148 | dictionary also contains the epoch number which comes from the training state. 149 | """ 150 | 151 | gen_kwargs = gen_kwargs.copy() 152 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 153 | gen_kwargs["max_length"] = self.args.generation_max_length 154 | gen_kwargs["num_beams"] = ( 155 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams 156 | ) 157 | self._gen_kwargs = gen_kwargs 158 | 159 | return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) 160 | 161 | def predict( 162 | self, 163 | test_dataset: Dataset, 164 | ignore_keys: Optional[List[str]] = None, 165 | metric_key_prefix: str = "test", 166 | **gen_kwargs, 167 | ) -> "PredictionOutput": 168 | """ 169 | Run prediction and returns predictions and potential metrics. 
170 | 171 | Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method 172 | will also return metrics, like in `evaluate()`. 173 | 174 | Args: 175 | test_dataset (`Dataset`): 176 | Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the 177 | `model.forward()` method are automatically removed. Has to implement the method `__len__` 178 | ignore_keys (`List[str]`, *optional*): 179 | A list of keys in the output of your model (if it is a dictionary) that should be ignored when 180 | gathering predictions. 181 | metric_key_prefix (`str`, *optional*, defaults to `"eval"`): 182 | An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named 183 | "eval_bleu" if the prefix is `"eval"` (default) 184 | max_length (`int`, *optional*): 185 | The maximum target length to use when predicting with the generate method. 186 | num_beams (`int`, *optional*): 187 | Number of beams for beam search that will be used when predicting with the generate method. 1 means no 188 | beam search. 189 | gen_kwargs: 190 | Additional `generate` specific kwargs. 191 | 192 | 193 | 194 | If your predictions or labels have different sequence lengths (for instance because you're doing dynamic 195 | padding in a token classification task) the predictions will be padded (on the right) to allow for 196 | concatenation into one array. The padding index is -100. 197 | 198 | 199 | 200 | Returns: *NamedTuple* A namedtuple with the following keys: 201 | 202 | - predictions (`np.ndarray`): The predictions on `test_dataset`. 203 | - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). 204 | - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained 205 | labels). 206 | """ 207 | 208 | gen_kwargs = gen_kwargs.copy() 209 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 210 | gen_kwargs["max_length"] = self.args.generation_max_length 211 | gen_kwargs["num_beams"] = ( 212 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams 213 | ) 214 | self._gen_kwargs = gen_kwargs 215 | 216 | return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) 217 | 218 | def prediction_step( 219 | self, 220 | model: nn.Module, 221 | inputs: Dict[str, Union[torch.Tensor, Any]], 222 | prediction_loss_only: bool, 223 | ignore_keys: Optional[List[str]] = None, 224 | ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: 225 | """ 226 | Perform an evaluation step on `model` using `inputs`. 227 | 228 | Subclass and override to inject custom behavior. 229 | 230 | Args: 231 | model (`nn.Module`): 232 | The model to evaluate. 233 | inputs (`Dict[str, Union[torch.Tensor, Any]]`): 234 | The inputs and targets of the model. 235 | 236 | The dictionary will be unpacked before being fed to the model. Most models expect the targets under the 237 | argument `labels`. Check your model's documentation for all accepted arguments. 238 | prediction_loss_only (`bool`): 239 | Whether or not to return the loss only. 240 | 241 | Return: 242 | Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and 243 | labels (each being optional). 
244 | """ 245 | 246 | if not self.args.predict_with_generate or prediction_loss_only: 247 | return super().prediction_step( 248 | model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys 249 | ) 250 | 251 | has_labels = "labels" in inputs 252 | inputs = self._prepare_inputs(inputs) 253 | 254 | # XXX: adapt synced_gpus for fairscale as well 255 | # Priority (handled in generate): 256 | # gen_kwargs > model.generation_config > default GenerationConfig() 257 | gen_kwargs = self._gen_kwargs.copy() 258 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 259 | gen_kwargs["max_length"] = self.model.config.max_length 260 | gen_kwargs["num_beams"] = ( 261 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams 262 | ) 263 | default_synced_gpus = True if is_deepspeed_zero3_enabled() else False 264 | gen_kwargs["synced_gpus"] = ( 265 | gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus 266 | ) 267 | 268 | # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate 269 | # (otherwise, it would continue generating from the padded `decoder_input_ids`) 270 | if ( 271 | "labels" in inputs 272 | and "decoder_input_ids" in inputs 273 | and inputs["labels"].shape == inputs["decoder_input_ids"].shape 274 | ): 275 | inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} 276 | generated_tokens = self.model.generate(**inputs, **gen_kwargs) 277 | 278 | # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop 279 | # TODO: remove this hack when the legacy code that initializes generation_config from a model config is 280 | # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 281 | if self.model.generation_config._from_model_config: 282 | self.model.generation_config._from_model_config = False 283 | 284 | # Retrieves GenerationConfig from model.generation_config 285 | gen_config = self.model.generation_config 286 | # in case the batch is shorter than max length, the output should be padded 287 | if generated_tokens.shape[-1] < gen_config.max_length: 288 | generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) 289 | elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: 290 | generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) 291 | 292 | loss = None 293 | 294 | if self.args.prediction_loss_only: 295 | return loss, None, None 296 | 297 | if has_labels: 298 | labels = inputs["labels"] 299 | if labels.shape[-1] < gen_config.max_length: 300 | labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) 301 | elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: 302 | labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) 303 | else: 304 | labels = None 305 | 306 | return loss, generated_tokens, labels 307 | 308 | def _pad_tensors_to_max_len(self, tensor, max_length): 309 | if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): 310 | # If PAD token is not defined at least EOS token has to be defined 311 | pad_token_id = ( 312 | self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id 313 | ) 314 
| else: 315 | if self.model.config.pad_token_id is not None: 316 | pad_token_id = self.model.config.pad_token_id 317 | else: 318 | raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") 319 | 320 | padded_tensor = pad_token_id * torch.ones( 321 | (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device 322 | ) 323 | padded_tensor[:, : tensor.shape[-1]] = tensor 324 | return padded_tensor 325 | -------------------------------------------------------------------------------- /utils/ul2collator.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections.abc import Mapping 3 | import numpy as np 4 | import torch 5 | from torch.nn import functional as F 6 | from dataclasses import dataclass 7 | from typing import Any, Dict, List, Optional, Tuple, Union 8 | from transformers.tokenization_utils_base import PreTrainedTokenizerBase 9 | from transformers import AutoModelForCausalLM 10 | from transformers.data.data_collator import ( 11 | DataCollatorMixin, 12 | _torch_collate_batch, 13 | ) 14 | import copy 15 | 16 | from transformers import default_data_collator 17 | from .utils import get_first_non_specical_index, get_first_special_index, get_first_special_index_batch 18 | 19 | def random_spans_noise_mask(length, mean_noise_span_length, noise_density): 20 | """ 21 | A copy from https://github.com/EleutherAI/oslo/blob/main/oslo/transformers/tasks/data_t5_pretraining.py#L230 (inception) 22 | This function is a copy of `random_spans_helper` from the T5 preprocessing code. 23 | Noise mask consisting of random spans of noise tokens. 24 | The number of noise tokens and the number of noise spans and non-noise spans 25 | are determined deterministically as follows: 26 | num_noise_tokens = round(length * noise_density) 27 | num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) 28 | Spans alternate between non-noise and noise, beginning with non-noise. 29 | Subject to the above restrictions, all masks are equally likely. 30 | Args: 31 | length: an int32 scalar (length of the incoming token sequence) 32 | mean_noise_span_length: a float - the average length of a noise span 33 | noise_density: a float - approximate density of output mask 34 | Returns: 35 | a boolean tensor with shape [length] 36 | """ 37 | 38 | orig_length = length 39 | 40 | num_noise_tokens = int(np.round(length * noise_density)) 41 | # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. 42 | num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) 43 | num_noise_spans = int(np.round(num_noise_tokens / mean_noise_span_length)) 44 | 45 | # avoid degeneracy by ensuring positive number of noise spans 46 | num_noise_spans = max(num_noise_spans, 1) 47 | num_nonnoise_tokens = length - num_noise_tokens 48 | 49 | # pick the lengths of the noise spans and the non-noise spans 50 | def _random_segmentation(num_items, num_segments): 51 | """Partition a sequence of items randomly into non-empty segments.
52 | Args: 53 | num_items: an integer scalar > 0 54 | num_segments: an integer scalar in [1, num_items] 55 | Returns: 56 | a Tensor with shape [num_segments] containing positive integers that add 57 | up to num_items 58 | """ 59 | mask_indices = np.arange(num_items - 1) < (num_segments - 1) 60 | np.random.shuffle(mask_indices) 61 | first_in_segment = np.pad(mask_indices, [[1, 0]]) 62 | segment_id = np.cumsum(first_in_segment) 63 | # count length of sub segments assuming that list is sorted 64 | _, segment_length = np.unique(segment_id, return_counts=True) 65 | return segment_length 66 | 67 | noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans) 68 | nonnoise_span_lengths = _random_segmentation( 69 | num_nonnoise_tokens, num_noise_spans 70 | ) 71 | 72 | interleaved_span_lengths = np.reshape( 73 | np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), 74 | [num_noise_spans * 2], 75 | ) 76 | span_starts = np.cumsum(interleaved_span_lengths)[:-1] 77 | span_start_indicator = np.zeros((length,), dtype=np.int8) 78 | span_start_indicator[span_starts] = True 79 | span_num = np.cumsum(span_start_indicator) 80 | is_noise = np.equal(span_num % 2, 1) 81 | 82 | return is_noise[:orig_length] 83 | 84 | @dataclass 85 | class DataCollatorForUL2(DataCollatorMixin): 86 | """ 87 | 88 | Data collator used for UL2 89 | 90 | """ 91 | model: AutoModelForCausalLM 92 | tokenizer: PreTrainedTokenizerBase 93 | r_denoising: bool = True 94 | r_probability: float = 0.25 95 | r_denoising_config: Tuple[Tuple] = ((3, 0.15),) 96 | s_denoising: bool = True 97 | s_probability: float = 0.5 98 | x_denoising: bool = True 99 | x_probability: float = 0.25 100 | x_denoising_config: Tuple[Tuple] = ((32, 0.5, 0.5),) 101 | pad_to_multiple_of: Optional[int] = None 102 | tf_experimental_compile: bool = False 103 | return_tensors: str = "pt" 104 | label_pad_token_id: int = -100 105 | 106 | def __post_init__(self): 107 | self.total_task = [0, 1, 2] 108 | task_prob = [] 109 | task_prob.append(self.r_probability if self.r_denoising else 0.0) 110 | task_prob.append(self.s_probability if self.s_denoising else 0.0) 111 | task_prob.append(self.x_probability if self.x_denoising else 0.0) 112 | self.task_prob = task_prob 113 | self.pad_token_id = self.tokenizer.pad_token_id 114 | self.decoder_start_token_id = self.tokenizer.bos_token_id 115 | 116 | def assign_task_type(self, batch_size: int): 117 | ''' 118 | Randomly assign S,R,X to each sentence based on weighted prob 119 | ''' 120 | return random.choices(self.total_task,weights=self.task_prob, k=batch_size) 121 | 122 | def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: 123 | torch.set_printoptions(threshold=10_000) 124 | np.set_printoptions(threshold=10_000) 125 | if torch.rand(1) < -1 or not self.model.training: 126 | return default_data_collator(examples) 127 | 128 | # Handle dict or lists with proper padding and conversion to tensor. 
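# A sampling sketch with the default mixture (R=0.25, S=0.5, X=0.25): for a batch of 8,
# assign_task_type may draw e.g. [1, 0, 1, 2, 1, 1, 0, 2], i.e. each example independently
# becomes an R- (regular span corruption), S- (sequential prefix-LM), or X- (extreme)
# denoising instance; the concrete draw is illustrative, not deterministic.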
        # print(examples)
        task_ids = self.assign_task_type(len(examples))
        task_type = torch.tensor(task_ids)
        lengths = torch.tensor([len(e['input_ids']) for e in examples], dtype=torch.long)
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], Mapping):
            batch = self.tokenizer.pad(examples, return_tensors="pt",
                                       pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {
                "input_ids": _torch_collate_batch(examples, self.tokenizer,
                                                  pad_to_multiple_of=self.pad_to_multiple_of)
            }
        max_length = batch['input_ids'].shape[-1]

        # new_batch = copy.deepcopy(batch)
        new_batch = {
            "input_ids": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
            "labels": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
            "attention_mask": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
            "prefix_mask": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
        }

        _, expanded_length = batch['input_ids'].shape
        input_ids = batch["input_ids"]
        r_denoising_idx = task_type == 0
        r_denoising_idx_num = torch.where(r_denoising_idx)[0]
        if r_denoising_idx.any():
            mask_indices = None
            sub_input_ids = input_ids[r_denoising_idx]
            # union of different denoising settings
            for (mean_span, noise) in self.r_denoising_config:
                _mask_indices = np.array([
                    random_spans_noise_mask(expanded_length, mean_span, noise) for _ in range(len(sub_input_ids))
                ])

                if mask_indices is None:
                    mask_indices = _mask_indices
                else:
                    mask_indices = mask_indices | _mask_indices

            # never mask beyond each example's first pad token
            valid_lengths = get_first_special_index_batch(sub_input_ids, self.pad_token_id)
            for idx, valid_len in enumerate(valid_lengths):
                mask_indices[idx, valid_len:] = False
            input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
            labels_mask = ~mask_indices
            labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))
            _sub_input_ids = self.filter_input_ids(sub_input_ids, input_ids_sentinel)
            _labels = self.filter_input_ids(sub_input_ids, labels_sentinel)

            labels = []
            _input_ids = []
            for idx, _label in enumerate(_labels):
                # pack the corrupted input and its target into one decoder-only
                # sequence; label positions covering the input are ignored (-100)
                label = _label[_label != self.pad_token_id]
                _sub_input_ids_idx = _sub_input_ids[idx][_sub_input_ids[idx] != self.pad_token_id]
                sub_input_len = len(_sub_input_ids_idx)
                _sub_input_ids_idx = np.concatenate((_sub_input_ids_idx, label))
                label = np.concatenate(([self.label_pad_token_id] * sub_input_len, label))
                new_batch['attention_mask'][r_denoising_idx_num[idx]][:len(label)] = 1
                new_batch["prefix_mask"][r_denoising_idx_num[idx]][:sub_input_len] = 1
                if len(label) > max_length:
                    label = torch.from_numpy(label[: max_length])
                    _sub_input_ids_idx = torch.from_numpy(_sub_input_ids_idx[: max_length])
                else:
                    diff = max_length - len(label)
                    label = F.pad(torch.from_numpy(label), (0, diff), 'constant', self.label_pad_token_id)
                    _sub_input_ids_idx = F.pad(torch.from_numpy(_sub_input_ids_idx), (0, diff), 'constant', self.pad_token_id)
                labels.append(label)
                _input_ids.append(_sub_input_ids_idx)
            labels = torch.stack(labels)
            _input_ids = torch.stack(_input_ids)

            new_batch['input_ids'][r_denoising_idx] = _input_ids.long()
            new_batch['labels'][r_denoising_idx] = labels.long()

        s_denoising_idx = task_type == 1
        s_denoising_idx_num = torch.where(s_denoising_idx)[0]

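        # S-denoising (prefix LM): keep the sequence unchanged, flag roughly the
        # first half as a bidirectional prefix via prefix_mask, and reuse the
        # original input_ids/labels so the continuation is learned causally.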
        if s_denoising_idx.any():
            sub_input_ids = input_ids[s_denoising_idx]
            _labels = []
            _input_ids = []

            for idx, input_id in enumerate(sub_input_ids):
                valid_len = get_first_special_index(input_id, self.pad_token_id)
                split = max(valid_len // 2, 2)
                new_batch["prefix_mask"][s_denoising_idx_num[idx]][:split] = 1

            # (disabled) earlier variant that also handled left-padded inputs:
            # for input_id, len_ in zip(sub_input_ids, lengths[s_denoising_idx]):
            #     if self.tokenizer.padding_side == "left":
            #         idx = get_first_non_specical_index(input_id, self.pad_token_id)
            #         valid_len = len_ - idx - 1
            #         split = max(valid_len//2, 2) + idx
            #         diff = expanded_length - split
            #         _input_ids.append(F.pad(input_id[:split], (0, diff), 'constant', self.pad_token_id))
            #         past_seq = input_id[split:]
            #         if past_seq[-1] != self.tokenizer.eos_token_id:
            #             past_seq[-1] = self.tokenizer.eos_token_id
            #         # _labels.append(F.pad(past_seq, (split, 0), 'constant', self.pad_token_id))
            #     else:
            #         valid_len = get_first_special_index(input_id, self.pad_token_id)
            #         split = max(valid_len//2, 2)
            #         # diff = expanded_length - split
            #         # _input_ids.append(F.pad(input_id[:split], (0, diff), 'constant', self.pad_token_id))
            #         # past_seq = input_id[split:]
            #         # past_seq = torch.where(past_seq == self.pad_token_id, self.label_pad_token_id, past_seq)
            #         # _labels.append(F.pad(past_seq, (split, 0), 'constant', self.label_pad_token_id))

            new_batch['input_ids'][s_denoising_idx] = batch['input_ids'][s_denoising_idx]
            new_batch['labels'][s_denoising_idx] = batch['labels'][s_denoising_idx]
            new_batch['attention_mask'][s_denoising_idx] = batch['attention_mask'][s_denoising_idx]

        x_denoising_idx = task_type == 2
        x_denoising_idx_num = torch.where(x_denoising_idx)[0]
        if x_denoising_idx.any():
            sub_input_ids = input_ids[x_denoising_idx]
            mask_indices = []
            valid_lengths = get_first_special_index_batch(sub_input_ids, self.pad_token_id)
            for len_, valid_len in zip(lengths[x_denoising_idx], valid_lengths):
                mask_index = None
                # idx = get_first_non_specical_index(input_id, self.pad_token_id)
                # valid_len = len_ - idx - 1
                # union of the X-denoising settings; cap each mean span length at
                # a fraction (`ratio`) of the example's valid (non-pad) length
                for (mean_span, noise, ratio) in self.x_denoising_config:
                    mean_span = min(mean_span, valid_len * ratio)
                    _mask_index = np.array(
                        random_spans_noise_mask(expanded_length, mean_span, noise)
                    )
                    if mask_index is None:
                        mask_index = _mask_index
                    else:
                        mask_index = mask_index | _mask_index
                mask_index[valid_len:] = False
                mask_indices.append(mask_index[np.newaxis, :])

            mask_indices = np.concatenate(mask_indices, axis=0)
            input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
            labels_mask = ~mask_indices
            labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))
            _sub_input_ids = self.filter_input_ids(sub_input_ids, input_ids_sentinel)
            _labels = self.filter_input_ids(sub_input_ids, labels_sentinel)

            labels = []
            _input_ids = []
            for idx, _label in enumerate(_labels):
                label = _label[_label != self.pad_token_id]
                _sub_input_ids_idx = _sub_input_ids[idx][_sub_input_ids[idx] != self.pad_token_id]
                sub_input_len = len(_sub_input_ids_idx)
                _sub_input_ids_idx = np.concatenate((_sub_input_ids_idx, label))
                label = np.concatenate(([self.label_pad_token_id] * sub_input_len, label))
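                # attend over the packed input+target; flag the corrupted-input
                # prefix as the bidirectional region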
                new_batch['attention_mask'][x_denoising_idx_num[idx]][:len(label)] = 1
                new_batch["prefix_mask"][x_denoising_idx_num[idx]][:sub_input_len] = 1
                if len(label) > max_length:
                    label = torch.from_numpy(label[: max_length])
                    _sub_input_ids_idx = torch.from_numpy(_sub_input_ids_idx[: max_length])
                else:
                    diff = max_length - len(label)
                    label = F.pad(torch.from_numpy(label), (0, diff), 'constant', self.label_pad_token_id)
                    _sub_input_ids_idx = F.pad(torch.from_numpy(_sub_input_ids_idx), (0, diff), 'constant', self.pad_token_id)
                labels.append(label)
                _input_ids.append(_sub_input_ids_idx)
            labels = torch.stack(labels)
            _input_ids = torch.stack(_input_ids)

            new_batch['input_ids'][x_denoising_idx] = _input_ids.long()
            new_batch['labels'][x_denoising_idx] = labels.long()

        # if torch.cuda.current_device() == 0:
        #     print(new_batch)
        #     exit(0)
        ## Override labels
        # if "labels" in batch:
        #     new_batch["labels"] = batch["labels"]
        #     new_batch["attention_mask"] = batch["attention_mask"]

        return new_batch


    def filter_input_ids(self, input_ids, sentinel_ids):
        """
        Puts the sentinel mask on `input_ids` and fuses consecutive mask tokens
        into a single sentinel token by deleting the rest.
        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
        """

        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
        # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
        # masked tokens coming after sentinel tokens and should be removed
        input_ids = []
        for row in input_ids_full:
            collapsed_id = row[row >= 0]
            diff = len(row) - len(collapsed_id)
            collapsed_id = np.pad(collapsed_id, (0, diff), 'constant', constant_values=self.pad_token_id)
            input_ids.append(collapsed_id)
        return np.array(input_ids)

    def create_sentinel_ids(self, mask_indices):
        """
        Sentinel ids creation given the indices that should be masked.
        The start indices of each mask are replaced by the sentinel ids in increasing
        order. Consecutive mask indices to be deleted are replaced with `-1`.
        """
        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
        start_indices[:, 0] = mask_indices[:, 0]

        sentinel_ids = np.where(
            start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices
        )
        # sentinel ids are counted down from the end of the vocabulary
        sentinel_ids = np.where(
            sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0
        )
        sentinel_ids -= mask_indices - start_indices

        return sentinel_ids


    def prepare_decoder_inputs_from_labels(self, batch):
        # decoder_start_token_id has to be defined; in T5 it is usually set to
        # the pad_token_id. See the T5 docs for more information.
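        # e.g. labels [a, b, c, <pad>] become [a, b, c, -100]; shifting right gives
        # decoder_input_ids [<bos>, a, b, c], and any -100 in the shifted labels
        # maps back to <pad> with decoder_attention_mask 0.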
        batch["labels"][batch["labels"] == self.pad_token_id] = self.label_pad_token_id
        shifted_labels = batch["labels"].new_zeros(batch["labels"].shape)
        shifted_labels[..., 1:] = batch["labels"][..., :-1].clone()
        shifted_labels[..., 0] = self.decoder_start_token_id

        batch["decoder_input_ids"] = torch.masked_fill(
            shifted_labels,
            shifted_labels == self.label_pad_token_id,
            self.pad_token_id
        )
        batch["decoder_attention_mask"] = torch.where(
            shifted_labels == self.label_pad_token_id,
            0,
            torch.ones_like(shifted_labels),
        )
        return batch

    def np_prepare_decoder_inputs_from_labels(self, batch):
        batch["labels"][batch["labels"] == self.pad_token_id] = self.label_pad_token_id
        # keep the labels' integer dtype so the resulting ids stay valid token indices
        shifted_labels = np.zeros(batch["labels"].shape, dtype=batch["labels"].dtype)
        shifted_labels[..., 1:] = batch["labels"][..., :-1].copy()
        shifted_labels[..., 0] = self.decoder_start_token_id

        batch["decoder_input_ids"] = np.where(
            shifted_labels == self.label_pad_token_id,
            self.pad_token_id,
            shifted_labels
        )
        batch["decoder_attention_mask"] = np.where(
            shifted_labels == self.label_pad_token_id,
            0,
            np.ones_like(shifted_labels)
        )
        return batch

--------------------------------------------------------------------------------
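Usage note (not part of the repository): below is a minimal sketch of how DataCollatorForUL2 could be wired into a standard PyTorch loop. It assumes the repo root is on PYTHONPATH so `utils.ul2collator` imports; "gpt2" is only a placeholder checkpoint (a real setup would add dedicated sentinel tokens to the vocabulary, since create_sentinel_ids counts down from `len(tokenizer)`); and every example is pre-padded to one length, because tokenizer.pad only pads the standard input keys, not `labels`.

# Hypothetical wiring, for illustration only.
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.ul2collator import DataCollatorForUL2

tokenizer = AutoTokenizer.from_pretrained("gpt2")      # placeholder checkpoint
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token          # the collator needs a pad id
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.train()  # torch_call falls back to default_data_collator in eval mode

collator = DataCollatorForUL2(model=model, tokenizer=tokenizer)

texts = [
    "UL2 mixes R-, S- and X-denoising in a single collator.",
    "Masked spans are replaced by sentinel tokens.",
]
# pre-pad to a fixed length so 'labels' stays rectangular through tokenizer.pad
examples = []
for t in texts:
    enc = tokenizer(t, padding="max_length", max_length=64, truncation=True)
    examples.append({"input_ids": enc["input_ids"],
                     "attention_mask": enc["attention_mask"],
                     "labels": list(enc["input_ids"])})

loader = DataLoader(examples, batch_size=2, collate_fn=collator)
batch = next(iter(loader))
# input_ids / labels / attention_mask / prefix_mask, each of shape (2, 64)
print({k: tuple(v.shape) for k, v in batch.items()})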