├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── configs ├── deepspeed_eval_config.yaml ├── deepspeed_eval_config_bf16.yaml ├── deepspeed_eval_config_zero3_bf16.yaml ├── deepspeed_train_config.yaml └── deepspeed_train_config_bf16.yaml ├── evals ├── alma_13b.sh ├── alma_13b_lora.sh ├── alma_13b_lora_no_parallel.sh ├── alma_13b_no_parallel.sh ├── alma_13b_r.sh ├── alma_13b_r_wmt23.sh ├── alma_7b.sh ├── alma_7b_lora.sh ├── alma_7b_lora_no_parallel.sh ├── alma_7b_no_parallel.sh ├── eval_generation.sh ├── eval_generation_wmt23.sh ├── eval_other_models.sh └── llama-2-13b-5-shot.sh ├── figures ├── alma.jpg ├── alma_logo.png ├── alma_origin_logo.png ├── almar.png └── xalma.png ├── human_written_data ├── Filtered-5-shot │ ├── shots.cs-en.json │ ├── shots.de-en.json │ ├── shots.de-fr.json │ ├── shots.en-cs.json │ ├── shots.en-de.json │ ├── shots.en-ha.json │ ├── shots.en-is.json │ ├── shots.en-ja.json │ ├── shots.en-ru.json │ ├── shots.en-uk.json │ ├── shots.en-zh.json │ ├── shots.fr-de.json │ ├── shots.ha-en.json │ ├── shots.is-en.json │ ├── shots.ja-en.json │ ├── shots.ru-en.json │ ├── shots.uk-en.json │ └── shots.zh-en.json ├── HW-5-shot │ ├── shots.cs-en.json │ ├── shots.de-en.json │ ├── shots.en-cs.json │ ├── shots.en-de.json │ ├── shots.en-is.json │ ├── shots.en-ru.json │ ├── shots.en-zh.json │ ├── shots.is-en.json │ ├── shots.ru-en.json │ └── shots.zh-en.json ├── csen │ ├── test.cs-en.cs │ ├── test.cs-en.en │ ├── test.cs-en.json │ ├── test.en-cs.cs │ ├── test.en-cs.en │ ├── test.en-cs.json │ ├── train.cs-en.cs │ ├── train.cs-en.en │ ├── train.cs-en.json │ └── valid.cs-en.json ├── deen │ ├── test.de-en.json │ ├── test.en-de.json │ ├── train.de-en.de │ ├── train.de-en.en │ ├── train.de-en.json │ └── valid.de-en.json ├── isen │ ├── test.en-is.json │ ├── test.is-en.json │ ├── train.is-en.en │ ├── train.is-en.is │ └── train.is-en.json ├── ruen │ ├── test.en-ru.json │ ├── test.ru-en.json │ ├── train.ru-en.en │ ├── train.ru-en.json │ ├── train.ru-en.ru │ └── valid.ru-en.json └── zhen │ ├── test.en-zh.json │ ├── test.zh-en.json │ ├── train.zh-en.en │ ├── train.zh-en.json │ ├── train.zh-en.zh │ └── valid.zh-en.json ├── install_alma.sh ├── modeling_xalma.py ├── outputs ├── wmt22_outputs │ ├── ALMA-13B-LoRA │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── ALMA-13B-R │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── GPT-4-1106-preview │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── MADLAD-10B │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de 
│ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── README.md │ ├── WMT_Winners │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ ├── gpt3.5-text-davinci-003 │ │ ├── csen │ │ │ └── test.cs-en.en │ │ ├── deen │ │ │ └── test.de-en.en │ │ ├── encs │ │ │ └── test.en-cs.cs │ │ ├── ende │ │ │ └── test.en-de.de │ │ ├── enis │ │ │ └── test.en-is.is │ │ ├── enru │ │ │ └── test.en-ru.ru │ │ ├── enzh │ │ │ └── test.en-zh.zh │ │ ├── isen │ │ │ └── test.is-en.en │ │ ├── ruen │ │ │ └── test.ru-en.en │ │ └── zhen │ │ │ └── test.zh-en.en │ └── wmt-testset │ │ ├── csen │ │ ├── test.cs-en.cs │ │ └── test.cs-en.en │ │ ├── deen │ │ ├── test.de-en.de │ │ └── test.de-en.en │ │ ├── encs │ │ ├── test.en-cs.cs │ │ └── test.en-cs.en │ │ ├── ende │ │ ├── test.en-de.de │ │ └── test.en-de.en │ │ ├── enis │ │ ├── test.en-is.en │ │ └── test.en-is.is │ │ ├── enru │ │ ├── test.en-ru.en │ │ └── test.en-ru.ru │ │ ├── enzh │ │ ├── test.en-zh.en │ │ └── test.en-zh.zh │ │ ├── isen │ │ ├── test.is-en.en │ │ └── test.is-en.is │ │ ├── ruen │ │ ├── test.ru-en.en │ │ └── test.ru-en.ru │ │ └── zhen │ │ ├── test.zh-en.en │ │ └── test.zh-en.zh └── wmt23_outputs │ ├── ALMA-13B-R │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ ├── MADLAD-10B │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ ├── TowerInstruct-7B-v0.1 │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ ├── WMT_Winners │ ├── deen │ │ └── test.de-en.en │ ├── ende │ │ └── test.en-de.de │ ├── enru │ │ └── test.en-ru.ru │ ├── enzh │ │ └── test.en-zh.zh │ ├── ruen │ │ └── test.ru-en.en │ └── zhen │ │ └── test.zh-en.en │ └── wmt-testset │ ├── deen │ ├── test.de-en.de │ └── test.de-en.en │ ├── ende │ ├── test.en-de.de │ └── test.en-de.en │ ├── enru │ ├── test.en-ru.en │ └── test.en-ru.ru │ ├── enzh │ ├── test.en-zh.en │ └── test.en-zh.zh │ ├── ruen │ ├── test.ru-en.en │ └── test.ru-en.ru │ └── zhen │ ├── test.zh-en.en │ └── test.zh-en.zh ├── run_cpo_llmmt.py ├── run_llmmt.py ├── runs ├── cpo_ft.sh ├── mono_ft.sh ├── parallel_ft.sh └── parallel_ft_lora.sh └── utils ├── __init__.py ├── arguments.py ├── cpo_config.py ├── cpo_trainer.py ├── trainer_llmmt.py ├── ul2collator.py └── utils.py /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | ALMA 3 |

4 | 5 |
6 | 7 | # ALMA: Advanced Language Model-based translator 8 |
9 | 10 |

11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | follow on Twitter 21 |

22 | 23 | ALMA has three generations: ALMA (1st), ALMA-R (2nd), and **X-ALMA (3rd, NEW!)**. 24 | 25 | [**ALMA**](https://arxiv.org/abs/2309.11674) (**A**dvanced **L**anguage **M**odel-based Tr**A**nslator) is a many-to-many LLM-based translation model, which adopts a new translation paradigm: it begins with fine-tuning on monolingual data and is further optimized using high-quality parallel data. This two-step fine-tuning process ensures strong translation performance. 26 | 27 | **[ALMA-R](https://arxiv.org/pdf/2401.08417v2.pdf)** builds upon ALMA models with further LoRA fine-tuning using our proposed **Contrastive Preference Optimization (CPO)**, as opposed to the supervised fine-tuning used in ALMA. CPO fine-tuning requires our [triplet preference data](https://huggingface.co/datasets/haoranxu/ALMA-R-Preference) for preference learning. ALMA-R can now match or even exceed GPT-4 and the WMT winners! 28 | 29 | **[X-ALMA](https://arxiv.org/pdf/2410.03115) (NEW!) extends ALMA(-R) from 6 languages to 50 and ensures top-tier performance across all 50 diverse languages, regardless of their resource levels. This is achieved by a plug-and-play language-specific module architecture and a carefully designed 5-step training recipe with a novel *Adaptive-Rejection Preference Optimization* method.** 30 | 31 | *Old ALMA repos:* 32 | - The original **ALMA** repository can be found [here](https://github.com/fe1ixxu/ALMA/tree/a3cc7877752779346312bb07798172eadc83d692). 33 | - The original **ALMA-R** repository can be found [here](https://github.com/fe1ixxu/ALMA/tree/ac120eb44c609ad9a386d617172d40432c2c0df6). 34 | 35 | # News 🌟 36 | ⭐ Jan. 22, 2025: **X-ALMA** has been accepted at **ICLR 2025**! 37 | 38 | ⭐ Oct. 6, 2024: **X-ALMA** is out! Please find the [paper here](https://arxiv.org/pdf/2410.03115) and [models & datasets here](https://huggingface.co/collections/haoranxu/x-alma-66fde464ef90be465920abaa). 39 | 40 | ⭐ Jun. 20, 2024: We want to give a shout-out to [SimPO](https://arxiv.org/pdf/2405.14734), which shares a similar reference-free preference learning framework with CPO but is more stable thanks to its length normalization and target reward margin. The most exciting part is that CPO and SimPO can potentially be used together! Learn more about [CPO-SimPO](https://github.com/fe1ixxu/CPO_SIMPO)! 41 | 42 | ⭐ May 1, 2024: The CPO paper has been accepted at **ICML 2024**! 43 | 44 | ⭐ Mar. 22, 2024: The CPO method is now merged into [huggingface trl](https://github.com/huggingface/trl)! See details [here](https://github.com/huggingface/trl/pull/1382). 45 | 46 | ⭐ Jan. 16, 2024: **ALMA-R** is released! Please check out our new paper for more details: [Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation](https://arxiv.org/abs/2401.08417). 47 | 48 | ⭐ Jan. 16, 2024: The ALMA paper, [A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models](https://arxiv.org/abs/2309.11674), has been accepted at **ICLR 2024**! Check out more details [here](https://openreview.net/forum?id=farT6XXntP)!
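Since CPO is mentioned throughout the news above (and implemented in this repo in `utils/cpo_trainer.py` and `run_cpo_llmmt.py`), here is a rough, illustrative sketch of the idea behind its objective: a reference-free preference term, as in DPO but without the frozen reference model, plus a negative log-likelihood term on the preferred translation. The function and tensor names below are ours for illustration, not this repository's API; see the CPO paper for the exact formulation.

```
import torch
import torch.nn.functional as F

def cpo_loss_sketch(chosen_logps, rejected_logps, beta=0.1):
    """Illustrative CPO-style objective (a sketch, not this repo's exact code).

    chosen_logps / rejected_logps: log-probabilities of the preferred and
    dispreferred translations under the current policy, shape (batch,).
    """
    # Preference term: DPO-like, but with no frozen reference model.
    prefer_loss = -F.logsigmoid(beta * (chosen_logps - rejected_logps)).mean()
    # Behavior-cloning term: NLL on the preferred translation.
    nll_loss = -chosen_logps.mean()
    return prefer_loss + nll_loss
```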
49 | 50 | # Contents 📄 51 | - [Download ALMA Models and Dataset](#download-alma-models-and-dataset-) 52 | - [A Quick Start](#a-quick-start) 53 | - [Environment Setup](#environment-setup-) 54 | - [Evaluation](#evaluation-) 55 | - [Training](#training-) 56 | - [FAQs](#faqs-) 57 | 58 | :star: Supports :star: 59 | - AMD and Nvidia Cards 60 | - Data Parallel Evaluation 61 | - Also supports LLaMA-1, LLaMA-2, OPT, Falcon, BLOOM, and MPT 62 | - LoRA Fine-tuning 63 | - Monolingual data fine-tuning and parallel data fine-tuning 64 | 65 |

66 | 67 |

68 | 69 | # Download ALMA Models and Dataset 🚀 70 | 71 | We release seven translation models in the ALMA series: 72 | 73 | Model checkpoints are released on Hugging Face: 74 | | Models | Base Model Link | LoRA Link | 75 | |:-------------:|:---------------:|:---------:| 76 | | ALMA-7B (1st gen) | [haoranxu/ALMA-7B](https://huggingface.co/haoranxu/ALMA-7B) | - | 77 | | ALMA-7B-LoRA (1st gen) | [haoranxu/ALMA-7B-Pretrain](https://huggingface.co/haoranxu/ALMA-7B-Pretrain) | [haoranxu/ALMA-7B-Pretrain-LoRA](https://huggingface.co/haoranxu/ALMA-7B-Pretrain-LoRA) | 78 | | ALMA-7B-R (2nd gen) | [haoranxu/ALMA-7B-R (LoRA merged)](https://huggingface.co/haoranxu/ALMA-7B-R) | - | 79 | | ALMA-13B (1st gen) | [haoranxu/ALMA-13B](https://huggingface.co/haoranxu/ALMA-13B) | - | 80 | | ALMA-13B-LoRA (1st gen) | [haoranxu/ALMA-13B-Pretrain](https://huggingface.co/haoranxu/ALMA-13B-Pretrain) | [haoranxu/ALMA-13B-Pretrain-LoRA](https://huggingface.co/haoranxu/ALMA-13B-Pretrain-LoRA) | 81 | | ALMA-13B-R (2nd gen) | [haoranxu/ALMA-13B-R (LoRA merged)](https://huggingface.co/haoranxu/ALMA-13B-R) | - | 82 | | **X-ALMA (NEW, 3rd gen)** | [X-ALMA Models](https://huggingface.co/collections/haoranxu/x-alma-66fde464ef90be465920abaa) | - | 83 | 84 | **Note that `ALMA-7B-Pretrain` and `ALMA-13B-Pretrain` are NOT translation models. They have only undergone stage 1 monolingual fine-tuning (20B tokens for the 7B model and 12B tokens for the 13B model) and should be used in conjunction with their LoRA models.** 85 | 86 | *We have also provided the WMT'22 and WMT'23 translation outputs from ALMA-13B-LoRA and ALMA-13B-R in the `outputs` directory. These outputs also include those of our baselines and can be directly accessed and used for subsequent evaluations.* 87 | 88 | Datasets used by ALMA and ALMA-R are also released on Hugging Face now (NEW!): 89 | | Datasets | Train / Validation | Test | 90 | |:-------------:|:---------------:|:---------:| 91 | | ALMA Human-Written Parallel Data | [Parallel train and validation](https://huggingface.co/datasets/haoranxu/ALMA-Human-Parallel) | [WMT'22](https://huggingface.co/datasets/haoranxu/WMT22-Test) | 92 | | ALMA-R Triplet Preference Data | [Triplet Preference Data](https://huggingface.co/datasets/haoranxu/ALMA-R-Preference) | [WMT'22](https://huggingface.co/datasets/haoranxu/WMT22-Test) and [WMT'23](https://huggingface.co/datasets/haoranxu/WMT23-Test) | 93 | | **X-ALMA Data** | 50-language [parallel data](https://huggingface.co/datasets/haoranxu/X-ALMA-Parallel-Data) and [preference data](https://huggingface.co/datasets/haoranxu/X-ALMA-Preference) | [WMT'23](https://huggingface.co/datasets/haoranxu/WMT23-Test) and [FLORES-200](https://huggingface.co/datasets/haoranxu/X-ALMA-Parallel-Data) | 94 | 95 | 96 | # A Quick Start 97 | X-ALMA is designed with a plug-and-play architecture consisting of two components: a base model and language-specific modules, where each module is shared by the languages within a group. 98 | There are three ways to load X-ALMA for translation. Below is an example of translating "我爱机器翻译。" into English (X-ALMA is also able to do multilingual open-ended QA).
99 | 100 | **The first way**: loading the merged model, where the language-specific module has been merged into the base model (Recommended): 101 | ``` 102 | import torch 103 | from transformers import AutoModelForCausalLM 104 | from transformers import AutoTokenizer 105 | from peft import PeftModel 106 | 107 | GROUP2LANG = { 108 | 1: ["da", "nl", "de", "is", "no", "sv", "af"], 109 | 2: ["ca", "ro", "gl", "it", "pt", "es"], 110 | 3: ["bg", "mk", "sr", "uk", "ru"], 111 | 4: ["id", "ms", "th", "vi", "mg", "fr"], 112 | 5: ["hu", "el", "cs", "pl", "lt", "lv"], 113 | 6: ["ka", "zh", "ja", "ko", "fi", "et"], 114 | 7: ["gu", "hi", "mr", "ne", "ur"], 115 | 8: ["az", "kk", "ky", "tr", "uz", "ar", "he", "fa"], 116 | } 117 | LANG2GROUP = {lang: str(group) for group, langs in GROUP2LANG.items() for lang in langs} 118 | group_id = LANG2GROUP["zh"] 119 | 120 | model = AutoModelForCausalLM.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", torch_dtype=torch.float16, device_map="auto") 121 | tokenizer = AutoTokenizer.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", padding_side='left') 122 | 123 | # Add the source sentence into the prompt template 124 | prompt="Translate this from Chinese to English:\nChinese: 我爱机器翻译。\nEnglish:" 125 | 126 | # X-ALMA needs a chat template, but ALMA and ALMA-R do not. 127 | chat_style_prompt = [{"role": "user", "content": prompt}] 128 | prompt = tokenizer.apply_chat_template(chat_style_prompt, tokenize=False, add_generation_prompt=True) 129 | 130 | input_ids = tokenizer(prompt, return_tensors="pt", padding=True, max_length=40, truncation=True).input_ids.cuda() 131 | 132 | # Translation 133 | with torch.no_grad(): 134 | generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9) 135 | outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 136 | print(outputs) 137 | ``` 138 | 139 | **The second way**: loading the base model and a language-specific module (Recommended): 140 | ``` 141 | model = AutoModelForCausalLM.from_pretrained("haoranxu/X-ALMA-13B-Pretrain", torch_dtype=torch.float16, device_map="auto") 142 | model = PeftModel.from_pretrained(model, f"haoranxu/X-ALMA-13B-Group{group_id}") 143 | tokenizer = AutoTokenizer.from_pretrained(f"haoranxu/X-ALMA-13B-Group{group_id}", padding_side='left') 144 | ``` 145 | 146 | **The third way**: loading the base model with all language-specific modules, MoE-style (requires large GPU memory): 147 | ``` 148 | from modeling_xalma import XALMAForCausalLM 149 | model = XALMAForCausalLM.from_pretrained("haoranxu/X-ALMA", torch_dtype=torch.float16, device_map="auto") 150 | tokenizer = AutoTokenizer.from_pretrained("haoranxu/X-ALMA", padding_side='left') 151 | 152 | # Pass `lang="zh"` to tell the model which language group to use during generation (needed only for this third loading method). 153 | generated_ids = model.generate(input_ids=input_ids, num_beams=5, max_new_tokens=20, do_sample=True, temperature=0.6, top_p=0.9, lang="zh") 154 | ``` 155 | 156 | 157 | The ALMA and ALMA-R translation prompt is: 158 | ``` 159 | Translate this from <source language> into <target language>: 160 | <source language>: <source sentence> 161 | <target language>: 162 | ``` 163 | 164 | The X-ALMA translation prompt is: 165 | ``` 166 | [INST] Translate this from <source language> into <target language>: 167 | <source language>: <source sentence> 168 | <target language>: [/INST] 169 | ``` 170 | 171 | # Environment Setup 🔧 172 | ``` 173 | conda create -n xalma python=3.11 174 | conda activate xalma 175 | ``` 176 | If you use **AMD GPUs**, please first install torch with ROCm.
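For example, a ROCm build of torch can typically be installed from the dedicated PyTorch wheel index. This is a sketch: the ROCm version in the URL below is an assumption, so pick the index matching your system from the official PyTorch installation page.

```
# Example only: replace rocm6.2 with the ROCm version installed on your machine.
pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.2
```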
177 | 178 | Then install the other dependencies: 179 | ``` 180 | bash install_alma.sh 181 | ``` 182 | # Evaluation 💻 183 | ### Evaluation on X-ALMA 184 | This is a quick start for evaluating our X-ALMA model. To produce translation outputs for FLORES-200 in both the en→cs and cs→en directions, run the following command (to evaluate on WMT'23 instead, simply pass `--override_test_data_path haoranxu/WMT23-Test`). **Note that you don't need to enable `--chat_style` for ALMA and ALMA-R; it is only for X-ALMA.** 185 | 186 | ``` 187 | accelerate launch --config_file configs/deepspeed_eval_config_bf16.yaml \ 188 | run_llmmt.py \ 189 | --model_name_or_path haoranxu/X-ALMA-13B-Group5 \ 190 | --do_predict \ 191 | --low_cpu_mem_usage \ 192 | --language_pairs en-cs,cs-en \ 193 | --mmt_data_path placeholder \ 194 | --override_test_data_path haoranxu/FLORES-200 \ 195 | --per_device_eval_batch_size 1 \ 196 | --output_dir ./your_output_dir/ \ 197 | --predict_with_generate \ 198 | --max_new_tokens 256 \ 199 | --max_source_length 256 \ 200 | --bf16 \ 201 | --seed 42 \ 202 | --num_beams 5 \ 203 | --overwrite_cache \ 204 | --overwrite_output_dir \ 205 | --chat_style # `--chat_style` is only for X-ALMA; you don't need to enable it for ALMA and ALMA-R 206 | 207 | ``` 208 | The generated outputs will be saved in `your_output_dir`. The translation file for the en→cs direction is named `test-en-cs`, and the file for the cs→en direction is `test-cs-en`. 209 | The variable `${test_pairs}` used in the evaluation scripts denotes the translation directions you wish to evaluate; it supports testing multiple directions at once, e.g., `de-en,en-de,en-cs,cs-en`. 210 | 211 | Please see more examples for evaluating ALMA(-R) under the `./evals` folder. 212 | 213 | **Note that this performs data-parallel evaluation supported by DeepSpeed: that is, it places a single full copy of the model on each available GPU and splits batches across GPUs, so evaluating on K GPUs is roughly K times faster than on one.** For those with limited GPU memory, we offer an alternative: pass `--multi_gpu_one_model` to distribute a single model across multiple GPUs. Please see the evaluation examples in `evals/alma_13b_r.sh` or the `evals/*no_parallel` files. 214 | 215 | # Training 🔥 216 | Here we show how to: 217 | - run Contrastive Preference Optimization on top of ALMA models (ALMA→ALMA-R) 218 | - fine-tune LLaMA-2-7B on monolingual OSCAR data (stage 1) 219 | - fine-tune on human-written parallel data once stage 1 is completed, including full-weight and LoRA fine-tuning (stage 2) 220 | 221 | Please note that we do not share the training process for X-ALMA specifically, as it would require releasing numerous intermediate checkpoints, making the process overly complex. 222 | 223 | ### CPO Fine-Tuning 224 | To run CPO fine-tuning with our triplet preference data, use the following command: 225 | ``` 226 | bash runs/cpo_ft.sh ${your_output_dir} 227 | ``` 228 | ### OSCAR Monolingual Fine-Tuning 229 | To execute the OSCAR monolingual fine-tuning, use the following command: 230 | ``` 231 | bash runs/mono_ft.sh ${your_output_dir} 232 | ``` 233 | ### Parallel Data Fine-Tuning (Full-Weight) 234 | Once the monolingual data fine-tuning is complete, proceed to the parallel data fine-tuning using the full-weight approach. Execute the following command: 235 | ``` 236 | bash runs/parallel_ft.sh ${your_output_dir} ${training_pairs} 237 | ``` 238 | where `training_pairs` denotes the translation directions you want to train on.
The default is all 10 directions: `de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru`. 239 | 240 | ### Parallel Data Fine-Tuning (LoRA) 241 | In stage 2, there is also an option to employ LoRA for fine-tuning on the parallel data. To do so, execute the following command: 242 | ``` 243 | bash runs/parallel_ft_lora.sh ${your_output_dir} ${training_pairs} 244 | ``` 245 | 246 | # FAQs ❓ 247 | ### What language directions do ALMA and ALMA-R support? 248 | Currently, ALMA supports 10 directions: English↔German, English↔Czech, English↔Icelandic, English↔Chinese, English↔Russian. However, it may surprise us in other directions :) 249 | 250 | ### What language directions does X-ALMA support? 251 | X-ALMA supports 50 languages and 98 translation directions (into and from English): da,nl,de,is,no,sv,af,ca,ro,gl,it,pt,es,bg,mk,sr,uk,ru,id,ms,th,vi,mg,fr,hu,el,cs,pl,lt,lv,ka,zh,ja,ko,fi,et,gu,hi,mr,ne,ur,az,kk,ky,tr,uz,ar,he,fa 252 | 253 | ### When should I stop fine-tuning at stage 1? 254 | Our 7B and 13B models are trained on 20B and 12B tokens, respectively. However, as indicated in the paper, fine-tuning on 1B tokens should already boost performance substantially. The number of steps required to fine-tune on 1 billion tokens also varies with your batch size. In our case, the effective batch size is 16 GPUs × 4 (batch size per GPU) × 4 (gradient accumulation steps) = 256 sequences. With a sequence length of 512, we need approximately 10^9 / (256 × 512) ≈ 8,000 steps to train on 1 billion tokens. You may, of course, fine-tune for more steps to get better performance. 255 | 256 | ### How do I decide the interleave probability at stage 1? 257 | Please find the rationale behind the stage 1 interleave probability selection in Appendix D.1 of the [paper](https://arxiv.org/pdf/2309.11674.pdf)! 258 | 259 | # Reference 260 | Please find more details about the ALMA models in our [paper](https://arxiv.org/abs/2309.11674) or in this [summary](https://notes.aimodels.fyi/alma-a-new-training-method-that-boosts-translation-performance-for-large-language-models/) of the paper. 261 | ``` 262 | @inproceedings{ 263 | xu2024a, 264 | title={A Paradigm Shift in Machine Translation: Boosting Translation Performance of Large Language Models}, 265 | author={Haoran Xu and Young Jin Kim and Amr Sharaf and Hany Hassan Awadalla}, 266 | booktitle={The Twelfth International Conference on Learning Representations}, 267 | year={2024}, 268 | url={https://openreview.net/forum?id=farT6XXntP} 269 | } 270 | ``` 271 | 272 | Please also find more detailed information about the ALMA-R model and Contrastive Preference Optimization in the [paper](https://arxiv.org/pdf/2401.08417v2.pdf).
273 | ``` 274 | @inproceedings{ 275 | xu2024contrastive, 276 | title={Contrastive Preference Optimization: Pushing the Boundaries of {LLM} Performance in Machine Translation}, 277 | author={Haoran Xu and Amr Sharaf and Yunmo Chen and Weiting Tan and Lingfeng Shen and Benjamin Van Durme and Kenton Murray and Young Jin Kim}, 278 | booktitle={Forty-first International Conference on Machine Learning}, 279 | year={2024}, 280 | url={https://openreview.net/forum?id=51iwkioZpn} 281 | } 282 | ``` 283 | 284 | Please find details about X-ALMA in the latest [paper](https://arxiv.org/pdf/2410.03115) 285 | ``` 286 | @inproceedings{ 287 | xu2025xalma, 288 | title={X-{ALMA}: Plug \& Play Modules and Adaptive Rejection for Quality Translation at Scale}, 289 | author={Haoran Xu and Kenton Murray and Philipp Koehn and Hieu Hoang and Akiko Eriguchi and Huda Khayrallah}, 290 | booktitle={The Thirteenth International Conference on Learning Representations}, 291 | year={2025}, 292 | url={https://openreview.net/forum?id=csbf1p8xUq} 293 | } 294 | ``` 295 | -------------------------------------------------------------------------------- /configs/deepspeed_eval_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: MULTI_GPU 3 | downcast_bf16: 'no' 4 | gpu_ids: all 5 | machine_rank: 0 6 | main_training_function: main 7 | mixed_precision: fp16 8 | num_machines: 1 9 | num_processes: 8 10 | rdzv_backend: static 11 | same_network: true 12 | tpu_env: [] 13 | tpu_use_cluster: false 14 | tpu_use_sudo: false 15 | use_cpu: false 16 | -------------------------------------------------------------------------------- /configs/deepspeed_eval_config_bf16.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | distributed_type: MULTI_GPU 3 | downcast_bf16: 'no' 4 | gpu_ids: all 5 | machine_rank: 0 6 | main_training_function: main 7 | mixed_precision: bf16 8 | num_machines: 1 9 | num_processes: 8 10 | rdzv_backend: static 11 | same_network: true 12 | tpu_env: [] 13 | tpu_use_cluster: false 14 | tpu_use_sudo: false 15 | use_cpu: false 16 | -------------------------------------------------------------------------------- /configs/deepspeed_eval_config_zero3_bf16.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero3_save_16bit_model: false 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /configs/deepspeed_train_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 4 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: cpu 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | 
mixed_precision: fp16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /configs/deepspeed_train_config_bf16.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | deepspeed_config: 3 | gradient_accumulation_steps: 1 4 | gradient_clipping: 1.0 5 | offload_optimizer_device: none 6 | offload_param_device: cpu 7 | zero3_init_flag: false 8 | zero_stage: 2 9 | distributed_type: DEEPSPEED 10 | downcast_bf16: 'no' 11 | machine_rank: 0 12 | main_training_function: main 13 | mixed_precision: bf16 14 | num_machines: 1 15 | num_processes: 8 16 | rdzv_backend: static 17 | same_network: true 18 | tpu_env: [] 19 | tpu_use_cluster: false 20 | tpu_use_sudo: false 21 | use_cpu: false 22 | -------------------------------------------------------------------------------- /evals/alma_13b.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | ## Generation 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml\ 8 | run_llmmt.py \ 9 | --model_name_or_path haoranxu/ALMA-13B \ 10 | --do_predict \ 11 | --low_cpu_mem_usage \ 12 | --language_pairs ${TEST_PAIRS} \ 13 | --mmt_data_path ./human_written_data/ \ 14 | --per_device_eval_batch_size 2 \ 15 | --output_dir ${OUTPUT_DIR} \ 16 | --predict_with_generate \ 17 | --max_new_tokens 256 \ 18 | --max_source_length 256 \ 19 | --fp16 \ 20 | --seed 42 \ 21 | --num_beams 5 \ 22 | --overwrite_cache \ 23 | --overwrite_output_dir \ 24 | 25 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 26 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 27 | run_llmmt.py \ 28 | --model_name_or_path haoranxu/ALMA-13B \ 29 | --do_predict \ 30 | --low_cpu_mem_usage \ 31 | --language_pairs zh-en \ 32 | --mmt_data_path ./human_written_data/ \ 33 | --per_device_eval_batch_size 2 \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --predict_with_generate \ 36 | --max_new_tokens 256 \ 37 | --max_source_length 512 \ 38 | --fp16 \ 39 | --seed 42 \ 40 | --num_beams 5 \ 41 | --overwrite_cache \ 42 | --overwrite_output_dir \ 43 | fi 44 | 45 | ## Evaluation (BLEU, COMET) 46 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_13b_lora.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 9 | --do_predict \ 10 | --low_cpu_mem_usage \ 11 | --language_pairs ${TEST_PAIRS} \ 12 | --mmt_data_path ./human_written_data/ \ 13 | --per_device_eval_batch_size 2 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --use_peft \ 16 | --peft_model_id 
haoranxu/ALMA-13B-Pretrain-LoRA \ 17 | --predict_with_generate \ 18 | --max_new_tokens 256 \ 19 | --max_source_length 256 \ 20 | --fp16 \ 21 | --seed 42 \ 22 | --num_beams 5 \ 23 | --overwrite_cache \ 24 | --overwrite_output_dir \ 25 | 26 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 27 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 28 | run_llmmt.py \ 29 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 30 | --do_predict \ 31 | --low_cpu_mem_usage \ 32 | --language_pairs zh-en \ 33 | --mmt_data_path ./human_written_data/ \ 34 | --per_device_eval_batch_size 2 \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --use_peft \ 37 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 38 | --predict_with_generate \ 39 | --max_new_tokens 256 \ 40 | --max_source_length 512 \ 41 | --fp16 \ 42 | --seed 42 \ 43 | --num_beams 5 \ 44 | --overwrite_cache \ 45 | --overwrite_output_dir \ 46 | fi 47 | 48 | ## Evaluation (BLEU, COMET) 49 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} 50 | -------------------------------------------------------------------------------- /evals/alma_13b_lora_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | python \ 5 | run_llmmt.py \ 6 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 7 | --do_predict \ 8 | --low_cpu_mem_usage \ 9 | --language_pairs ${TEST_PAIRS} \ 10 | --mmt_data_path ./human_written_data/ \ 11 | --per_device_eval_batch_size 2 \ 12 | --output_dir ${OUTPUT_DIR} \ 13 | --use_peft \ 14 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 15 | --predict_with_generate \ 16 | --max_new_tokens 256 \ 17 | --max_source_length 256 \ 18 | --fp16 \ 19 | --seed 42 \ 20 | --num_beams 5 \ 21 | --overwrite_cache \ 22 | --overwrite_output_dir \ 23 | --multi_gpu_one_model 24 | 25 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 26 | python \ 27 | run_llmmt.py \ 28 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 29 | --do_predict \ 30 | --low_cpu_mem_usage \ 31 | --language_pairs zh-en \ 32 | --mmt_data_path ./human_written_data/ \ 33 | --per_device_eval_batch_size 2 \ 34 | --output_dir ${OUTPUT_DIR} \ 35 | --use_peft \ 36 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 37 | --predict_with_generate \ 38 | --max_new_tokens 256 \ 39 | --max_source_length 512 \ 40 | --fp16 \ 41 | --seed 42 \ 42 | --num_beams 5 \ 43 | --overwrite_cache \ 44 | --overwrite_output_dir \ 45 | --multi_gpu_one_model 46 | fi 47 | 48 | ## Evaluation (BLEU, COMET) 49 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} 50 | -------------------------------------------------------------------------------- /evals/alma_13b_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | ## Generation 5 | python \ 6 | run_llmmt.py \ 7 | --model_name_or_path haoranxu/ALMA-13B \ 8 | --do_predict \ 9 | --low_cpu_mem_usage \ 10 | --language_pairs ${TEST_PAIRS} \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --per_device_eval_batch_size 2 \ 13 | --output_dir ${OUTPUT_DIR} \ 14 | --predict_with_generate \ 15 | --max_new_tokens 256 \ 16 | --max_source_length 256 \ 17 | --fp16 \ 18 | --seed 42 \ 19 | --num_beams 5 \ 20 | --overwrite_cache \ 21 | --overwrite_output_dir \ 22 | 
--multi_gpu_one_model 23 | 24 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 25 | python \ 26 | run_llmmt.py \ 27 | --model_name_or_path haoranxu/ALMA-13B \ 28 | --do_predict \ 29 | --low_cpu_mem_usage \ 30 | --language_pairs zh-en \ 31 | --mmt_data_path ./human_written_data/ \ 32 | --per_device_eval_batch_size 2 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 256 \ 36 | --max_source_length 512 \ 37 | --fp16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir \ 42 | --multi_gpu_one_model 43 | fi 44 | 45 | ## Evaluation (BLEU, COMET) 46 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_13b_r.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-r/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-13B-R \ 9 | --do_predict \ 10 | --low_cpu_mem_usage \ 11 | --language_pairs ${TEST_PAIRS} \ 12 | --mmt_data_path ./human_written_data/ \ 13 | --per_device_eval_batch_size 4 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --predict_with_generate \ 16 | --max_new_tokens 256 \ 17 | --max_source_length 256 \ 18 | --bf16 \ 19 | --seed 42 \ 20 | --num_beams 5 \ 21 | --overwrite_cache \ 22 | --overwrite_output_dir 23 | 24 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 25 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 26 | run_llmmt.py \ 27 | --model_name_or_path haoranxu/ALMA-13B-R \ 28 | --do_predict \ 29 | --low_cpu_mem_usage \ 30 | --language_pairs zh-en \ 31 | --mmt_data_path ./human_written_data/ \ 32 | --per_device_eval_batch_size 4 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 256 \ 36 | --max_source_length 512 \ 37 | --bf16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir 42 | fi 43 | 44 | ## Evaluation (BLEU, COMET) 45 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} 46 | -------------------------------------------------------------------------------- /evals/alma_13b_r_wmt23.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-13b-r-wmt23/"} 2 | # random port between 30000 and 50000 3 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 4 | 5 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml\ 6 | run_llmmt.py \ 7 | --model_name_or_path haoranxu/ALMA-13B-R \ 8 | --do_predict \ 9 | --low_cpu_mem_usage \ 10 | --language_pairs en-ru,en-zh \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --override_test_data_path haoranxu/WMT23-Test \ 13 | --per_device_eval_batch_size 4 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --predict_with_generate \ 16 | --max_new_tokens 256 \ 17 | --max_source_length 256 \ 18 | --bf16 \ 19 | --seed 42 \ 20 | --num_beams 5 \ 21 | --overwrite_cache \ 22 | --overwrite_output_dir \ 23 | 24 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 25 | run_llmmt.py \ 26 | --model_name_or_path haoranxu/ALMA-13B-R \ 27 | --do_predict \ 28 
| --low_cpu_mem_usage \ 29 | --language_pairs ru-en,de-en,en-de,zh-en \ 30 | --mmt_data_path ./human_written_data/ \ 31 | --override_test_data_path haoranxu/WMT23-Test \ 32 | --per_device_eval_batch_size 4 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 512 \ 36 | --max_source_length 1024 \ 37 | --bf16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir \ 42 | 43 | bash ./evals/eval_generation_wmt23.sh ${OUTPUT_DIR} de-en,zh-en,ru-en,en-de,en-ru,en-zh 44 | -------------------------------------------------------------------------------- /evals/alma_7b.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | ## Generation 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 8 | run_llmmt.py \ 9 | --model_name_or_path haoranxu/ALMA-7B \ 10 | --do_predict \ 11 | --low_cpu_mem_usage \ 12 | --language_pairs ${TEST_PAIRS} \ 13 | --mmt_data_path ./human_written_data/ \ 14 | --per_device_eval_batch_size 2 \ 15 | --output_dir ${OUTPUT_DIR} \ 16 | --predict_with_generate \ 17 | --max_new_tokens 256 \ 18 | --max_source_length 256 \ 19 | --fp16 \ 20 | --seed 42 \ 21 | --num_beams 5 \ 22 | --overwrite_cache \ 23 | --overwrite_output_dir 24 | 25 | ## Some tokenized zh source sentence is longer than 256, here we set 512 26 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 27 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 28 | run_llmmt.py \ 29 | --model_name_or_path "haoranxu/ALMA-7B" \ 30 | --do_predict \ 31 | --low_cpu_mem_usage \ 32 | --language_pairs zh-en \ 33 | --mmt_data_path ./human_written_data/ \ 34 | --per_device_eval_batch_size 2 \ 35 | --output_dir ${OUTPUT_DIR} \ 36 | --predict_with_generate \ 37 | --max_new_tokens 256 \ 38 | --max_source_length 512 \ 39 | --fp16 \ 40 | --seed 42 \ 41 | --num_beams 5 \ 42 | --overwrite_cache \ 43 | --overwrite_output_dir 44 | fi 45 | 46 | ## Evaluation (BLEU, COMET) 47 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_7b_lora.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | # random port between 30000 and 50000 5 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 6 | 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 8 | run_llmmt.py \ 9 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 10 | --do_predict \ 11 | --low_cpu_mem_usage \ 12 | --language_pairs ${TEST_PAIRS} \ 13 | --mmt_data_path ./human_written_data/ \ 14 | --per_device_eval_batch_size 2 \ 15 | --output_dir ${OUTPUT_DIR} \ 16 | --use_peft \ 17 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 18 | --predict_with_generate \ 19 | --max_new_tokens 256 \ 20 | --max_source_length 256 \ 21 | --fp16 \ 22 | --seed 42 \ 23 | --num_beams 5 \ 24 | --overwrite_cache \ 25 | --overwrite_output_dir 26 | 27 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 28 | accelerate launch --main_process_port ${port} --config_file 
configs/deepspeed_eval_config_bf16.yaml \ 29 | run_llmmt.py \ 30 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 31 | --do_predict \ 32 | --low_cpu_mem_usage \ 33 | --language_pairs zh-en \ 34 | --mmt_data_path ./human_written_data/ \ 35 | --per_device_eval_batch_size 2 \ 36 | --output_dir ${OUTPUT_DIR} \ 37 | --use_peft \ 38 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 39 | --predict_with_generate \ 40 | --max_new_tokens 256 \ 41 | --max_source_length 512 \ 42 | --fp16 \ 43 | --seed 42 \ 44 | --num_beams 5 \ 45 | --overwrite_cache \ 46 | --overwrite_output_dir 47 | 48 | fi 49 | 50 | ## Evaluation (BLEU, COMET) 51 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_7b_lora_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-lora/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | python \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 9 | --do_predict \ 10 | --low_cpu_mem_usage \ 11 | --language_pairs ${TEST_PAIRS} \ 12 | --mmt_data_path ./human_written_data/ \ 13 | --per_device_eval_batch_size 2 \ 14 | --output_dir ${OUTPUT_DIR} \ 15 | --use_peft \ 16 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 17 | --predict_with_generate \ 18 | --max_new_tokens 256 \ 19 | --max_source_length 256 \ 20 | --fp16 \ 21 | --seed 42 \ 22 | --num_beams 5 \ 23 | --overwrite_cache \ 24 | --overwrite_output_dir \ 25 | --multi_gpu_one_model 26 | 27 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 28 | python \ 29 | run_llmmt.py \ 30 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 31 | --do_predict \ 32 | --low_cpu_mem_usage \ 33 | --language_pairs zh-en \ 34 | --mmt_data_path ./human_written_data/ \ 35 | --per_device_eval_batch_size 2 \ 36 | --output_dir ${OUTPUT_DIR} \ 37 | --use_peft \ 38 | --peft_model_id haoranxu/ALMA-7B-Pretrain-LoRA \ 39 | --predict_with_generate \ 40 | --max_new_tokens 256 \ 41 | --max_source_length 512 \ 42 | --fp16 \ 43 | --seed 42 \ 44 | --num_beams 5 \ 45 | --overwrite_cache \ 46 | --overwrite_output_dir \ 47 | --multi_gpu_one_model 48 | 49 | fi 50 | 51 | ## Evaluation (BLEU, COMET) 52 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/alma_7b_no_parallel.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-alma-7b-full-ft/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | ## Generation 5 | python \ 6 | run_llmmt.py \ 7 | --model_name_or_path "haoranxu/ALMA-7B" \ 8 | --do_predict \ 9 | --low_cpu_mem_usage \ 10 | --language_pairs ${TEST_PAIRS} \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --per_device_eval_batch_size 2 \ 13 | --output_dir ${OUTPUT_DIR} \ 14 | --predict_with_generate \ 15 | --max_new_tokens 256 \ 16 | --max_source_length 256 \ 17 | --fp16 \ 18 | --seed 42 \ 19 | --num_beams 5 \ 20 | --overwrite_cache \ 21 | --overwrite_output_dir \ 22 | --multi_gpu_one_model 23 | 24 | if [[ ${TEST_PAIRS} == *zh-en* ]]; then 25 | python \ 26 | run_llmmt.py \ 27 | --model_name_or_path "haoranxu/ALMA-7B" \ 28 | --do_predict \ 29 | --low_cpu_mem_usage \ 30 | --language_pairs zh-en \ 31 | --mmt_data_path ./human_written_data/ \ 32 | 
--per_device_eval_batch_size 2 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --predict_with_generate \ 35 | --max_new_tokens 256 \ 36 | --max_source_length 512 \ 37 | --fp16 \ 38 | --seed 42 \ 39 | --num_beams 5 \ 40 | --overwrite_cache \ 41 | --overwrite_output_dir \ 42 | --multi_gpu_one_model 43 | fi 44 | 45 | ## Evaluation (BLEU, COMET) 46 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/eval_generation.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1} 2 | TEST_PAIRS=${2} 3 | 4 | ## Evaluation 5 | for pair in ${TEST_PAIRS//,/ }; do 6 | src=$(echo ${pair} | cut -d "-" -f 1) 7 | tgt=$(echo ${pair} | cut -d "-" -f 2) 8 | TOK="13a" 9 | if [ ${tgt} == "zh" ]; then 10 | TOK="zh" 11 | elif [ ${tgt} == "ja" ]; then 12 | TOK="ja-mecab" 13 | fi 14 | echo "--------------------Results for ${pair}-------------------------------------" 15 | src_path=./outputs/wmt22_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${src} 16 | tgt_path=./outputs/wmt22_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${tgt} 17 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 18 | SACREBLEU_FORMAT=text sacrebleu -tok ${TOK} -w 2 ${tgt_path} < ${output_path} > ${output_path}.bleu 19 | cat ${output_path}.bleu 20 | comet-score -s ${src_path} -t ${output_path} -r ${tgt_path} --batch_size 256 --model Unbabel/wmt22-comet-da --gpus 1 > ${output_path}.comet 21 | comet-score -s ${src_path} -t ${output_path} --batch_size 256 --model Unbabel/wmt22-cometkiwi-da --gpus 1 > ${output_path}.cometkiwi 22 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/wmt23-cometkiwi-da-xxl --gpus 1 > ${output_path}.cometkiwi_10b 23 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/XCOMET-XXL --gpus 1 --to_json ${output_path}.xcomet.output.json > ${output_path}.xcomet_10b 24 | tail -n 1 ${output_path}.comet 25 | done 26 | 27 | for pair in ${TEST_PAIRS//,/ }; do 28 | src=$(echo ${pair} | cut -d "-" -f 1) 29 | tgt=$(echo ${pair} | cut -d "-" -f 2) 30 | echo "---------------------------${src}-${tgt}-------------------------------" 31 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 32 | cat ${output_path}.bleu 33 | tail -n 1 ${output_path}.comet 34 | tail -n 1 ${output_path}.cometkiwi 35 | tail -n 1 ${output_path}.cometkiwi_10b 36 | tail -n 2 ${output_path}.xcomet_10b 37 | done 38 | -------------------------------------------------------------------------------- /evals/eval_generation_wmt23.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1} 2 | TEST_PAIRS=${2} 3 | 4 | ## Evaluation 5 | for pair in ${TEST_PAIRS//,/ }; do 6 | src=$(echo ${pair} | cut -d "-" -f 1) 7 | tgt=$(echo ${pair} | cut -d "-" -f 2) 8 | TOK="13a" 9 | if [ ${tgt} == "zh" ]; then 10 | TOK="zh" 11 | elif [ ${tgt} == "ja" ]; then 12 | TOK="ja-mecab" 13 | fi 14 | echo "--------------------Results for ${pair}-------------------------------------" 15 | src_path=./outputs/wmt23_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${src} 16 | tgt_path=./outputs/wmt23_outputs/wmt-testset/${src}${tgt}/test.${src}-${tgt}.${tgt} 17 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 18 | SACREBLEU_FORMAT=text sacrebleu -tok ${TOK} -w 2 ${tgt_path} < ${output_path} > ${output_path}.bleu 19 | cat ${output_path}.bleu 20 | comet-score -s ${src_path} -t ${output_path} -r ${tgt_path} --batch_size 256 --model Unbabel/wmt22-comet-da --gpus 1 > 
${output_path}.comet 21 | comet-score -s ${src_path} -t ${output_path} --batch_size 256 --model Unbabel/wmt22-cometkiwi-da --gpus 1 > ${output_path}.cometkiwi 22 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/wmt23-cometkiwi-da-xxl --gpus 1 > ${output_path}.cometkiwi_10b 23 | comet-score -s ${src_path} -t ${output_path} --batch_size 8 --model Unbabel/XCOMET-XXL --gpus 1 --to_json ${output_path}.xcomet.output.json > ${output_path}.xcomet_10b 24 | tail -n 1 ${output_path}.comet 25 | done 26 | 27 | for pair in ${TEST_PAIRS//,/ }; do 28 | src=$(echo ${pair} | cut -d "-" -f 1) 29 | tgt=$(echo ${pair} | cut -d "-" -f 2) 30 | echo "---------------------------${src}-${tgt}-------------------------------" 31 | output_path=${OUTPUT_DIR}/test-${src}-${tgt} 32 | cat ${output_path}.bleu 33 | tail -n 1 ${output_path}.comet 34 | tail -n 1 ${output_path}.cometkiwi 35 | tail -n 1 ${output_path}.cometkiwi_10b 36 | tail -n 2 ${output_path}.xcomet_10b 37 | done 38 | -------------------------------------------------------------------------------- /evals/eval_other_models.sh: -------------------------------------------------------------------------------- 1 | MODEL_NAME=${1} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | MODEL="${MODEL_NAME//\//-}" 4 | OUTPUT_DIR=outputs-${MODEL} 5 | 6 | export HF_DATASETS_CACHE=".cache/huggingface_cache/datasets" 7 | export TRANSFORMERS_CACHE=".cache/models/" 8 | # random port between 30000 and 50000 9 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 10 | 11 | if [[ ${MODEL_NAME} == "meta-llama/Llama-2-7b-hf" ]]; then 12 | REVISION="--model_revision 637a748546bb9abca62b0684183cc362bc1ece6d" 13 | elif [[ ${MODEL_NAME} == "meta-llama/Llama-2-13b-hf" ]]; then 14 | REVISION="--model_revision 9474c6d222f45e7eb328c0f6b55501e7da67c9c3" 15 | fi 16 | 17 | ## Generation 18 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config.yaml \ 19 | run_llmmt.py \ 20 | --model_name_or_path ${MODEL_NAME} \ 21 | --do_predict \ 22 | --low_cpu_mem_usage \ 23 | --language_pairs ${TEST_PAIRS} \ 24 | --mmt_data_path ./human_written_data/ \ 25 | --per_device_eval_batch_size 2 \ 26 | --output_dir ${OUTPUT_DIR} \ 27 | --predict_with_generate \ 28 | --max_new_tokens 256 \ 29 | --max_source_length 256 \ 30 | --fp16 \ 31 | --seed 42 \ 32 | --num_beams 5 \ 33 | --overwrite_cache \ 34 | --overwrite_output_dir \ 35 | ${REVISION} 36 | 37 | ## Evaluation (BLEU, COMET) 38 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /evals/llama-2-13b-5-shot.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./outputs-llama-2-13b-5-shot/"} 2 | TEST_PAIRS=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | # random port between 30000 and 50000 5 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 6 | 7 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_eval_config_bf16.yaml \ 8 | run_llmmt.py \ 9 | --model_name_or_path meta-llama/Llama-2-13b-hf \ 10 | --model_revision 9474c6d222f45e7eb328c0f6b55501e7da67c9c3 \ 11 | --do_predict \ 12 | --low_cpu_mem_usage \ 13 | --language_pairs ${TEST_PAIRS} \ 14 | --mmt_data_path ./human_written_data/ \ 15 | --per_device_eval_batch_size 2 \ 16 | --output_dir ${OUTPUT_DIR} \ 17 | --predict_with_generate \ 18 | --max_new_tokens 256 \ 19 | --max_source_length 768 \ 20 | --fp16 \
21 | --seed 42 \ 22 | --num_beams 1 \ 23 | --few_shot_eval_path ./human_written_data/HR-5-shot/ \ 24 | --overwrite_cache \ 25 | --overwrite_output_dir 26 | 27 | ## Evaluation (BLEU, COMET) 28 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${TEST_PAIRS} -------------------------------------------------------------------------------- /figures/alma.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/alma.jpg -------------------------------------------------------------------------------- /figures/alma_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/alma_logo.png -------------------------------------------------------------------------------- /figures/alma_origin_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/alma_origin_logo.png -------------------------------------------------------------------------------- /figures/almar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/almar.png -------------------------------------------------------------------------------- /figures/xalma.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/figures/xalma.png -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.cs-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Tak\u00e9 v r\u00e1mci t\u00e9to oblasti je Hotel Tropico Mallorca Island a Tropico Playa.", 4 | "target": "Also within the area is the Hotel Tropico Mallorca Island and the Tropico Playa." 5 | }, 6 | { 7 | "source": "Hotely ve m\u011bst\u011b Furmanovka Hotely \u2013 Furmanovka Hledat hotely \u2013 Furmanovka", 8 | "target": "Furmanovka Hotels Furmanovka Hotels Search for hotels in Furmanovka" 9 | }, 10 | { 11 | "source": "Kone\u010dn\u011b m\u016f\u017eete zjistit v\u0161e, co pot\u0159ebujete v\u011bd\u011bt o va\u0161em GAF souboru... okam\u017eit\u011b!", 12 | "target": "Finally, you can now discover everything you need to know about your GAF file... instantly!" 13 | }, 14 | { 15 | "source": "Copyright \u00a9 AQE advisors, a.s., V\u0161echna pr\u00e1va vyhrazena", 16 | "target": "Copyright \u00a9 AQE advisors, a.s., All rights reserved" 17 | }, 18 | { 19 | "source": "AirPrint a logo AirPrint jsou ochrann\u00e9 zn\u00e1mky spole\u010dnosti Apple Inc.", 20 | "target": "AirPrint and the AirPrint logo are trademarks of Apple Inc." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.de-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Hotel Inglaterra in Granada, Spain - Besten Preise Garantiert | Lets Book Hotel", 4 | "target": "Hotel Inglaterra in Granada, Spain - Best Rates Guaranteed | Lets Book Hotel" 5 | }, 6 | { 7 | "source": "Porsche Design Cervo 2.0 Laptoptasche 13\u2033 Leder schwarz", 8 | "target": "Porsche Design Cervo 2.0 Laptop bag 13\u2033 leather black" 9 | }, 10 | { 11 | "source": "Fl\u00fcge von Brisbane nach Amsterdam via Shanghai Pudong", 12 | "target": "Flights from Brisbane to Amsterdam via Shanghai Pudong" 13 | }, 14 | { 15 | "source": "Hans Reiter (Ministerium f\u00fcr Wissenschaft, Forschung, und Kunst Baden-W\u00fcrttemberg) Foto: Johannes Zimmermann, Stuttgart", 16 | "target": "Hans Reiter (Baden-W\u00fcrttemberg Ministry of Science, Research, and Art) Photo: Johannes Zimmermann, Stuttgart" 17 | }, 18 | { 19 | "source": "Das Bild \"Mountain landscape at sunset\" von Pavlo Vakhrushev ist bei Fotolia lizenzfrei ab 2 Credits erh\u00e4ltlich (Credit ab 0,74 \u20ac).", 20 | "target": "The photo \"Mountain landscape at sunset\" from Pavlo Vakhrushev is available on Fotolia under a royalty-free license from 2 credits (Credit from $0.74)." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.de-fr.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Seit 2000 haben sich aus der Notwendigkeit des Wiederaufbaus nach den unrechtm\u00e4\u00dfigen und ungerechten Kriegen im Kosovo, in Serbien und in Afghanistan neue Schwerpunkte ergeben.", 4 | "target": "Depuis 2000, de nouvelles priorit\u00e9s se sont fait jour en raison des besoins de reconstruction faisant suite aux guerres ill\u00e9gales et iniques au Kosovo, en Serbie et en Afghanistan." 5 | }, 6 | { 7 | "source": "Levetiracetam SUN muss zweimal t\u00e4glich verabreicht werden, morgens und abends, jeden Tag ungef\u00e4hr zur gleichen Uhrzeit.", 8 | "target": "Levetiracetam SUN doit \u00eatre administr\u00e9 2 fois par jour, une fois le matin et une fois le soir, approximativement \u00e0 la m\u00eame heure chaque jour." 9 | }, 10 | { 11 | "source": "Wir verl\u00e4ngern unsere w\u00e4rmsten Gr\u00fc\u00dfe und laden Sie ein, am Econo Lodge City Star Brisbane zu bleiben.", 12 | "target": "Nous prolongeons nos salutations plus chaudes et vous invitons \u00e0 rester au Econo Lodge City Star Brisbane." 13 | }, 14 | { 15 | "source": "Sie sehen Wettervorhersage in Elizabethtown. Sie sehen auch die Wettervorhersage in Vereinigte Staaten,", 16 | "target": "vous regardez Pr\u00e9visions m\u00e9t\u00e9orologiques dans Elizabethtown. 
Voir aussi Pr\u00e9visions m\u00e9t\u00e9orologiques pour le pays \u00c9tats-Unis," 17 | }, 18 | { 19 | "source": "AVG Free Edition 2016.71.7597 Freigegeben: 16 Mai 2016 (Vor 4 Wochen) Technische Details | Change Log", 20 | "target": "AVG Free Edition 2016.71.7597 Sorties: 16 mai 2016 (Il y a 4 semaines) D\u00e9tails techniques | Journal des changements" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-cs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Free download pdf User's Manual for Garmin GNC 420AW GPS", 4 | "target": "Sta\u017een\u00ed zdarma pdf U\u017eivatelsk\u00fd manu\u00e1l for Garmin GNC 420AW GPS" 5 | }, 6 | { 7 | "source": "Games developed by NetEnt, Amatic, Pragmatic Play and more providers at WildBlaster.com.", 8 | "target": "Games vyvinut\u00fd NetEnt, Amatic, Pragmatic Play a v\u00edce poskytovatel\u016f na WildBlaster.com." 9 | }, 10 | { 11 | "source": "Article 14 Entry into force This Decision shall enter into force on the date of its adoption.", 12 | "target": "\u010cl\u00e1nek 14 Vstup v platnost Toto rozhodnut\u00ed vstupuje v platnost dnem p\u0159ijet\u00ed." 13 | }, 14 | { 15 | "source": "Russak Forum4 Photos3 Russak Czech Republic, Europe Send a message Facebook Twitter Google+ Vehicle's owners (1) 1994 Sephia / Shuma / Mentor 20 photos Fuel gasoline.", 16 | "target": "Russak F\u00f3rum4 Fotky3 Russak \u010cesk\u00e1 republika, Evropa Poslat zpr\u00e1vu Facebook Twitter Google+ Majitel vozidel (1) 1994 Sephia / Shuma / Mentor 20 fotek Palivo benz\u00edn." 17 | }, 18 | { 19 | "source": "Please inform KeyBarcelona Plaza Universidad Apartment - Gran Via in advance of your expected arrival time.", 20 | "target": "Informujte pros\u00edm KeyBarcelona Plaza Universidad Apartment - Gran Via o sv\u00e9m p\u0159edpokl\u00e1dan\u00e9m \u010dase p\u0159\u00edjezdu p\u0159edem." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-de.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Hotels near General Limousine Service Roma Day Tours", 4 | "target": "Hotels in der N\u00e4he von General Limousine Service Roma Day Tours" 5 | }, 6 | { 7 | "source": "Do you have a question concerning ZyXEL Communications 2602RL-D3A?", 8 | "target": "Haben Sie eine Frage bez\u00fcglich ZyXEL Communications 2602RL-D3A?" 9 | }, 10 | { 11 | "source": "How long does it take to get from Duisburg to London?", 12 | "target": "Wie lange dauert es von Duisburg nach London zu kommen?" 13 | }, 14 | { 15 | "source": "Explore.exe is located in the C:\\Windows\\System32 folder.", 16 | "target": "Explore.exe befindet sich im Ordner C:\\Windows\\System32." 17 | }, 18 | { 19 | "source": "Find the best hotels in Corniglio and plan your trip", 20 | "target": "Finden Sie die besten Hotels in Corniglio und planen Sie Ihre Reise" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-ha.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "What preparations did Abram have to make for his journey , and why might that have involved sacrifice ?", 4 | "target": "Wa\u0257anne shirye - shirye Abram ya yi don tafiyarsa , kuma me ya sa lalle ya \u0199unshi sadaukarwa ?" 
5 | }, 6 | { 7 | "source": "The Bible also gives wise counsel about human relations , urging us to love one another and to treat others with respect , dignity , and kindness .", 8 | "target": "Littafi Mai Tsarki ya kuma ba da shawara mai kyau game da dangantakar \u2019 yan Adam , ya aririce mu mu \u0199aunaci juna kuma mu bi da juna cikin girmamawa , mutunci , da kuma kirki ." 9 | }, 10 | { 11 | "source": "EPDM sheet rubber is optimally suited for use outside and at high temperatures due to properties such as resistance to heat, sunlight, ozone, acids, alkaline and oxygen-containing solvents.", 12 | "target": "Rubin rubutun EPDM yana dacewa da kyau don amfani a waje da kuma yanayin zafi saboda kaddarorin irin su jure yanayin zafi, hasken rana, ozone, acid, alkaline da kuma oxygen-hade." 13 | }, 14 | { 15 | "source": "8. What would we be able to anticipate from you in the initial 90 days?", 16 | "target": "8. Menene za mu iya jira daga gare ku a farkon kwanakin 90?" 17 | }, 18 | { 19 | "source": "Capacity(t/h) Feed Both at Center and Sides 120-180 200-260 300-380 450-520", 20 | "target": "\u0198imar (t / h) Ciyar Dukansu a Cibiyar da Sides 120-180 200-260 300-380 450-520" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-is.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "FOOD SERVICE: \u2022 The Florida State Fire College has a full-service cafeteria operated by MCTAE Culinary School.", 4 | "target": "Food Service: \u2022 The Florida State Fire College me\u00f0 allri \u00fej\u00f3nustu m\u00f6tuneyti reki\u00f0 af MCTAE Culinary School." 5 | }, 6 | { 7 | "source": "Luxury homes for sale in Spain Found 60 Properties", 8 | "target": "L\u00faxus heimili til s\u00f6lu \u00e1 Sp\u00e1ni Hafa fundist 60 Properties Ver\u00f0 fr\u00e1" 9 | }, 10 | { 11 | "source": "At the time, Belgium was part of the Burgundian empire, Rome, Spain, was part of the Netherlands and even France .", 12 | "target": "\u00c1 \u00feeim t\u00edma var Belg\u00eda hluti af Bourgogne heimsveldinu, R\u00f3m, Sp\u00e1ni, var hluti af Hollandi og jafnvel Frakklandi." 13 | }, 14 | { 15 | "source": "Yummy, the Neptune burger was the best of the best, the flavor well yes oh yes.", 16 | "target": "Yummy, The Neptune hamborgari var the bestur af the bestur, brag\u00f0i\u00f0 vel j\u00e1 \u00f3 j\u00e1." 17 | }, 18 | { 19 | "source": "This applies no less to economic performance than other criteria: Economic growth, research and development, tehnological innovation, productivity pr. hour of work, job creation, participation in the labour-market, (especially female participation), equality of the sexes, level of education, social mobility, absence of poverty, health and longevity, quality of infrastructure, access to unspoilt nature, the overall quality of life. 
Less inequality than in most places.", 20 | "target": "\u00deetta \u00e1 ekki s\u00ed\u00f0ur vi\u00f0 um hagr\u00e6na m\u00e6likvar\u00f0a en a\u00f0ra: Hagv\u00f6xt, framlei\u00f0ni pr vinnustund, ranns\u00f3knir og \u00fer\u00f3un, t\u00e6knin\u00fdjungar og \u00fatbrei\u00f0slu \u00feeirra, sk\u00f6pun starfa, \u00fe\u00e1ttt\u00f6ku \u00e1 vinnumarka\u00f0i (s\u00e9rstaklega \u00fe\u00e1ttt\u00f6ku kvenna), jafnr\u00e6\u00f0i kynjanna, menntunarstig og starfs\u00fej\u00e1lfun, f\u00e9lagslegan hreyfanleika, heilbrig\u00f0i og langl\u00edfi, g\u00e6\u00f0i innvi\u00f0a, \u00fatr\u00fdmingu f\u00e1t\u00e6ktar, a\u00f0gang a\u00f0 \u00f3spilltri n\u00e1tt\u00faru, almenn l\u00edfsg\u00e6\u00f0i." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-ja.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "About ADI Corporate Information Executive Team Vincent Roche Attention Internet Explorer users: Analog.com no longer supports IE 11.", 4 | "target": "ADI\u306b\u3064\u3044\u3066 \u4f1a\u793e\u6982\u8981 \u7d4c\u55b6\u9663 Vincent Roche Internet Explorer\u3092\u304a\u4f7f\u3044\u306e\u304a\u5ba2\u69d8\u3078\u306e\u6ce8\u610f: Analog.com\u30b5\u30a4\u30c8\u306f\u3001IE11\u3092\u30b5\u30dd\u30fc\u30c8\u3057\u3066\u304a\u308a\u307e\u305b\u3093\u3002" 5 | }, 6 | { 7 | "source": "Simplify performance tuning and troubleshooting with Azure SQL Database \u2013 Azure \u30d6\u30ed\u30b0 \u307e\u3068\u3081", 8 | "target": "Azure SQL Database \u3067\u30d1\u30d5\u30a9\u30fc\u30de\u30f3\u30b9\u306e\u30c1\u30e5\u30fc\u30cb\u30f3\u30b0\u3068\u30c8\u30e9\u30d6\u30eb\u30b7\u30e5\u30fc\u30c6\u30a3\u30f3\u30b0\u3092\u7c21\u7d20\u5316 \u2013 Azure \u30d6\u30ed\u30b0 \u307e\u3068\u3081" 9 | }, 10 | { 11 | "source": "Ant Financial has so far raised a staggering $14 billion this year alone in Series C funding.", 12 | "target": "Ant Financial\u306f\u3053\u308c\u307e\u3067\u3001\u30b7\u30ea\u30fc\u30baC\u306e\u8cc7\u91d1\u8abf\u9054\u3067\u4eca\u5e74\u3060\u3051\u3067$ 14\u5104\u3092\u9a5a\u7570\u7684\u306b\u4e0a\u3052\u3066\u3044\u307e\u3059\u3002" 13 | }, 14 | { 15 | "source": "On Windows 8.1, completely uninstalling DirectX 9, 10 and 11 without consequences is possible only if they are not installed during a system update.", 16 | "target": "Windows 8.1\u3067\u306f\u3001DirectX 9\u300110\u3001\u304a\u3088\u307311\u3092\u30b7\u30b9\u30c6\u30e0\u306e\u66f4\u65b0\u4e2d\u306b\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3057\u306a\u3044\u5834\u5408\u306b\u306e\u307f\u3001\u5f71\u97ff\u3092\u4e0e\u3048\u308b\u3053\u3068\u306a\u304f\u5b8c\u5168\u306b\u30a2\u30f3\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb\u3067\u304d\u307e\u3059\u3002" 17 | }, 18 | { 19 | "source": "At 6:16 pm the entire cloudbank is illuminated, but by 6:40 pm only the top of the cloudbank is illuminated.", 20 | "target": "\u5348\u5f8c6:16\u306b\u3001\u96f2\u306e\u571f\u624b\u5168\u4f53\u304c\u7167\u3089\u3055\u308c\u307e\u3059\u304c\u3001\u5348\u5f8c6:40\u307e\u3067\u306b\u306f\u3001\u96f2\u306e\u571f\u624b\u306e\u9802\u70b9\u3060\u3051\u304c\u7167\u3089\u3055\u308c\u307e\u3059\u3002" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-ru.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "2016-2017 - master of production training of the department \u00abTransport equipment 
and technologies\u00bb, Kazakh Agrotechnical University named after S.Seifullin, Astana", 4 | "target": "2016-2017 \u0433\u0433. - \u043c\u0430\u0441\u0442\u0435\u0440 \u043f\u0440\u043e\u0438\u0437\u0432\u043e\u0434\u0441\u0442\u0432\u0435\u043d\u043d\u043e\u0433\u043e \u043e\u0431\u0443\u0447\u0435\u043d\u0438\u044f \u043a\u0430\u0444\u0435\u0434\u0440\u044b \u00ab\u0422\u0440\u0430\u043d\u0441\u043f\u043e\u0440\u0442\u043d\u0430\u044f \u0442\u0435\u0445\u043d\u0438\u043a\u0430 \u0438 \u0442\u0435\u0445\u043d\u043e\u043b\u043e\u0433\u0438\u0439\u00bb, \u041a\u0430\u0437\u0430\u0445\u0441\u043a\u0438\u0439 \u0430\u0433\u0440\u043e\u0442\u0435\u0445\u043d\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0443\u043d\u0438\u0432\u0435\u0440\u0441\u0438\u0442\u0435\u0442 \u0438\u043c.\u0421.\u0421\u0435\u0439\u0444\u0443\u043b\u043b\u0438\u043d\u0430, \u0433. \u0410\u0441\u0442\u0430\u043d\u0430" 5 | }, 6 | { 7 | "source": "Is it also possible to use standards from the EN 301489 series?", 8 | "target": "\u041c\u043e\u0436\u043d\u043e \u043b\u0438 \u0442\u0430\u043a\u0436\u0435 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u044c \u0441\u0442\u0430\u043d\u0434\u0430\u0440\u0442\u044b \u0441\u0435\u0440\u0438\u0438 EN 301489?" 9 | }, 10 | { 11 | "source": "Chairman: Mr. Hamaneh (Vice-Chairman) (Islamic Republic of Iran)", 12 | "target": "\u041f\u0440\u0435\u0434\u0441\u0435\u0434\u0430\u0442\u0435\u043b\u044c: \u0433-\u043d \u0425\u0430\u043c\u0430\u043d\u0435 (\u0437\u0430\u043c\u0435\u0441\u0442\u0438\u0442\u0435\u043b\u044c \u041f\u0440\u0435\u0434\u0441\u0435\u0434\u0430\u0442\u0435\u043b\u044f) (\u0418\u0441\u043b\u0430\u043c\u0441\u043a\u0430\u044f \u0420\u0435\u0441\u043f\u0443\u0431\u043b\u0438\u043a\u0430 \u0418\u0440\u0430\u043d)" 13 | }, 14 | { 15 | "source": "The report covers the period from 1 January 2006 to 15 March 2007.", 16 | "target": "\u0414\u043e\u043a\u043b\u0430\u0434 \u043e\u0445\u0432\u0430\u0442\u044b\u0432\u0430\u0435\u0442 \u043f\u0435\u0440\u0438\u043e\u0434 \u0441 1 \u044f\u043d\u0432\u0430\u0440\u044f 2006 \u0433\u043e\u0434\u0430 \u043f\u043e 15 \u043c\u0430\u0440\u0442\u0430 2007 \u0433\u043e\u0434\u0430." 17 | }, 18 | { 19 | "source": "2 years ago 1:10:00 ProPorn small tits, beauty, erotic, girlfriend", 20 | "target": "2 \u0433\u043e\u0434\u0430 \u043d\u0430\u0437\u0430\u0434 1:10:00 ProPorn \u043c\u0430\u043b\u0435\u043d\u044c\u043a\u0438\u0435 \u0441\u0438\u0441\u044c\u043a\u0438, \u043a\u0440\u0430\u0441\u0430\u0432\u0438\u0446\u044b, \u044d\u0440\u043e\u0442\u0438\u043a\u0430, \u043f\u043e\u0434\u0440\u0443\u0433\u0430" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-uk.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "You are here: Home>Tech blog>Joomla FAQ>Joomla 1.5 and Joomla 1.7, 2.5, 3.x How to load modules using ajax", 4 | "target": "\u0412\u0438 \u0442\u0443\u0442: \u0413\u043e\u043b\u043e\u0432\u043d\u0430>\u0422\u0435\u0445\u043d\u0456\u0447\u043d\u0438\u0439 \u0431\u043b\u043e\u0433>\u0427\u0430\u041f\u0438 \u043f\u043e Joomla>Joomla 1.5 \u0442\u0430 1.7, 2.5, 3.x. 
\u042f\u043a \u0437\u0430\u0432\u0430\u043d\u0442\u0430\u0436\u0443\u0432\u0430\u0442\u0438 \u043c\u043e\u0434\u0443\u043b\u0456 \u0437\u0430 \u0434\u043e\u043f\u043e\u043c\u043e\u0433\u043e\u044e Ajax" 5 | }, 6 | { 7 | "source": "Western Hungary - Photo album, picture galleries (985 photos / 27 galleries)", 8 | "target": "\u0417\u0430\u0445\u0456\u0434\u043d\u0430 \u0423\u0433\u043e\u0440\u0449\u0438\u043d\u0430 - \u0424\u043e\u0442\u043e\u0430\u043b\u044c\u0431\u043e\u043c, \u0444\u043e\u0442\u043e\u0433\u0440\u0430\u0444\u0456\u0457, \u0433\u0430\u043b\u0435\u0440\u0435\u0457 \u0437\u043e\u0431\u0440\u0430\u0436\u0435\u043d\u044c (985 \u0444\u043e\u0442\u043e\u0433\u0440\u0430\u0444\u0456\u0457 / 27 \u0433\u0430\u043b\u0435\u0440\u0435\u0457)" 9 | }, 10 | { 11 | "source": "Exchange of scientific information and publications; joint research and publication of results Poland", 12 | "target": "\u041e\u0431\u043c\u0456\u043d \u043d\u0430\u0443\u043a\u043e\u0432\u043e\u044e \u0456\u043d\u0444\u043e\u0440\u043c\u0430\u0446\u0456\u0454\u044e \u0456 \u043f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u044f\u043c\u0438; \u043f\u0440\u043e\u0432\u0435\u0434\u0435\u043d\u043d\u044f \u0441\u043f\u0456\u043b\u044c\u043d\u0438\u0445 \u043d\u0430\u0443\u043a\u043e\u0432\u0438\u0445 \u0434\u043e\u0441\u043b\u0456\u0434\u0436\u0435\u043d\u044c \u0456 \u0441\u0443\u043c\u0456\u0441\u043d\u0430 \u043f\u0443\u0431\u043b\u0456\u043a\u0430\u0446\u0456\u044f \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u0456\u0432 \u041f\u043e\u043b\u044c\u0449\u0430" 13 | }, 14 | { 15 | "source": "Look at the air fares Batumi \u2014 Odessa for the next two weeks and choose the option that fits you, in price and in time.", 16 | "target": "\u041f\u0435\u0440\u0435\u0433\u043b\u044f\u043d\u044c\u0442\u0435 \u0446\u0456\u043d\u0438 \u043d\u0430 \u0430\u0432\u0456\u0430\u043f\u0435\u0440\u0435\u043b\u0456\u0442 \u0411\u0430\u0442\u0443\u043c\u0456 \u2014 \u041e\u0434\u0435\u0441\u0430 \u043d\u0430 \u043d\u0430\u0439\u0431\u043b\u0438\u0436\u0447\u0456 \u0434\u0432\u0430 \u0442\u0438\u0436\u043d\u0456 \u0456 \u0432\u0438\u0431\u0435\u0440\u0456\u0442\u044c \u043e\u043f\u0442\u0438\u043c\u0430\u043b\u044c\u043d\u0438\u0439 \u0432\u0430\u0440\u0456\u0430\u043d\u0442, \u044f\u043a\u0438\u0439 \u043f\u0456\u0434\u0445\u043e\u0434\u0438\u0442\u044c \u0432\u0430\u043c \u0456 \u0437\u0430 \u0432\u0430\u0440\u0442\u0456\u0441\u0442\u044e, \u0456 \u0437\u0430 \u0447\u0430\u0441\u043e\u043c." 17 | }, 18 | { 19 | "source": "Exhibition: 1984. Kyiv. Shevchenko \u2013 artist. To 170\u2013th birthday [Shevchenko \u2013 artist: Exhibition Catalogue. \u2013 K., 1986. \u2013 p. 8].", 20 | "target": "\u0412\u0438\u0441\u0442\u0430\u0432\u043a\u0430: 1984. \u041a\u0438\u0457\u0432. \u0428\u0435\u0432\u0447\u0435\u043d\u043a\u043e-\u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a. \u0414\u043e 170-\u0440\u0456\u0447\u0447\u044f \u0432\u0456\u0434 \u0434\u043d\u044f \u043d\u0430\u0440\u043e\u0434\u0436\u0435\u043d\u043d\u044f [\u0428\u0435\u0432\u0447\u0435\u043d\u043a\u043e-\u0445\u0443\u0434\u043e\u0436\u043d\u0438\u043a: \u041a\u0430\u0442\u0430\u043b\u043e\u0433 \u0432\u0438\u0441\u0442\u0430\u0432\u043a\u0438. \u2013 \u041a., 1986. \u2013 \u0421. 8]." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.en-zh.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "In 2001, 38% of reported HIV infections and 33% of AIDS cases were in women (see tables 12.7 and 12.8 in annex 2, p. 14-15).", 4 | "target": "2001\u5e74\uff0c\u5728\u62a5\u544a\u7684\u827e\u6ecb\u75c5\u75c5\u6bd2\u611f\u67d3\u548c\u827e\u6ecb\u75c5\u75c5\u4f8b\u4e2d\uff0c\u5987\u5973\u5206\u522b\u5360 38\uff05\u548c33\uff05\uff08\u89c1\u9644\u4ef62\u886812.7 \u548c\u8868 12.8\uff0c\u7b2c14-15\u9875\uff09\u3002" 5 | }, 6 | { 7 | "source": "Where Are You? I'm Here", 8 | "target": "\u4f60\u5728\u54ea\u88e1\uff1f\u6211\u5728\u9019\u88e1" 9 | }, 10 | { 11 | "source": "9. Future needs in environmental monitoring", 12 | "target": "9. \u73af\u5883\u76d1\u6d4b\u65b9\u9762\u4eca\u540e\u7684\u9700\u8981" 13 | }, 14 | { 15 | "source": "Central Bank of Nigeria", 16 | "target": "\u5c3c\u65e5\u5229\u4e9a\u4e2d\u592e\u94f6\u884c (\u82f1\u8bed : Central Bank of Nigeria )" 17 | }, 18 | { 19 | "source": "Apache CentOs Linux PHP Server", 20 | "target": "\u963f\u5e15\u5947 CentOs Linux PHP Server" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.fr-de.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Depuis 2000, de nouvelles priorit\u00e9s se sont fait jour en raison des besoins de reconstruction faisant suite aux guerres ill\u00e9gales et iniques au Kosovo, en Serbie et en Afghanistan.", 4 | "target": "Seit 2000 haben sich aus der Notwendigkeit des Wiederaufbaus nach den unrechtm\u00e4\u00dfigen und ungerechten Kriegen im Kosovo, in Serbien und in Afghanistan neue Schwerpunkte ergeben." 5 | }, 6 | { 7 | "source": "Levetiracetam SUN doit \u00eatre administr\u00e9 2 fois par jour, une fois le matin et une fois le soir, approximativement \u00e0 la m\u00eame heure chaque jour.", 8 | "target": "Levetiracetam SUN muss zweimal t\u00e4glich verabreicht werden, morgens und abends, jeden Tag ungef\u00e4hr zur gleichen Uhrzeit." 9 | }, 10 | { 11 | "source": "Nous prolongeons nos salutations plus chaudes et vous invitons \u00e0 rester au Econo Lodge City Star Brisbane.", 12 | "target": "Wir verl\u00e4ngern unsere w\u00e4rmsten Gr\u00fc\u00dfe und laden Sie ein, am Econo Lodge City Star Brisbane zu bleiben." 13 | }, 14 | { 15 | "source": "vous regardez Pr\u00e9visions m\u00e9t\u00e9orologiques dans Elizabethtown. Voir aussi Pr\u00e9visions m\u00e9t\u00e9orologiques pour le pays \u00c9tats-Unis,", 16 | "target": "Sie sehen Wettervorhersage in Elizabethtown. 
Sie sehen auch die Wettervorhersage in Vereinigte Staaten," 17 | }, 18 | { 19 | "source": "AVG Free Edition 2016.71.7597 Sorties: 16 mai 2016 (Il y a 4 semaines) D\u00e9tails techniques | Journal des changements", 20 | "target": "AVG Free Edition 2016.71.7597 Freigegeben: 16 Mai 2016 (Vor 4 Wochen) Technische Details | Change Log" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.ha-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Wa\u0257anne shirye - shirye Abram ya yi don tafiyarsa , kuma me ya sa lalle ya \u0199unshi sadaukarwa ?", 4 | "target": "What preparations did Abram have to make for his journey , and why might that have involved sacrifice ?" 5 | }, 6 | { 7 | "source": "Littafi Mai Tsarki ya kuma ba da shawara mai kyau game da dangantakar \u2019 yan Adam , ya aririce mu mu \u0199aunaci juna kuma mu bi da juna cikin girmamawa , mutunci , da kuma kirki .", 8 | "target": "The Bible also gives wise counsel about human relations , urging us to love one another and to treat others with respect , dignity , and kindness ." 9 | }, 10 | { 11 | "source": "Rubin rubutun EPDM yana dacewa da kyau don amfani a waje da kuma yanayin zafi saboda kaddarorin irin su jure yanayin zafi, hasken rana, ozone, acid, alkaline da kuma oxygen-hade.", 12 | "target": "EPDM sheet rubber is optimally suited for use outside and at high temperatures due to properties such as resistance to heat, sunlight, ozone, acids, alkaline and oxygen-containing solvents." 13 | }, 14 | { 15 | "source": "8. Menene za mu iya jira daga gare ku a farkon kwanakin 90?", 16 | "target": "8. What would we be able to anticipate from you in the initial 90 days?" 17 | }, 18 | { 19 | "source": "\u0198imar (t / h) Ciyar Dukansu a Cibiyar da Sides 120-180 200-260 300-380 450-520", 20 | "target": "Capacity(t/h) Feed Both at Center and Sides 120-180 200-260 300-380 450-520" 21 | } 22 | ] 23 | -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.is-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Finndu fr\u00e1b\u00e6r ver\u00f0 me\u00f0 Autatlantis \u00e1 Proserpine Flugv\u00f6llur, sj\u00e1\u00f0u einkunnir vi\u00f0skiptavina - og b\u00f3ka\u00f0u \u00e1 netinu, hratt og \u00f6rugglega", 4 | "target": "Find great prices with Autatlantis at Proserpine Airport, see customer ratings - and book online, quickly and easily" 5 | }, 6 | { 7 | "source": "Li\u00f0i\u00f0 n\u00e6r Geo drengur, st\u00falka Milly og gr\u00e6nt Android v\u00e9lmenni Botha.", 8 | "target": "The team includes Geo boy, girl Milly and green android robot Botha." 9 | }, 10 | { 11 | "source": "The Crow Lake Verkefni\u00f0 spannar \u00ferj\u00fa s\u00fdslum \u00ed Su\u00f0ur-Dakota og er st\u00e6rsta vindur verkefni \u00ed Bandar\u00edkjunum \u00e1tti eing\u00f6ngu vi\u00f0 samstarfsverkefni, me\u00f0 afkastagetu upp \u00e1 151,5 megav\u00f6tt.", 12 | "target": "The Crow Lake project spans three counties in South Dakota and is the largest wind project in the United States owned solely by a cooperative, with a capacity of 151.5 megawatts." 13 | }, 14 | { 15 | "source": "All Slot Mobile spilav\u00edti\u00f0 er eitt hi\u00f0 fullkomna d\u00e6mi um framfarir \u00ed spilav\u00edtum.", 16 | "target": "The All Slot Mobile Casino is one of the perfect examples of advancement in casino gaming." 
17 | }, 18 | { 19 | "source": "Heiti og flokkunarfr\u00e6\u00f0ileg hugt\u00f6k, eins og \u00feau eru skilgreind \u00ed tegundaskr\u00e1num \u00ed tilskipunum 2009/147/EB (fuglatilskipunin) og 92/43/EBE (vistger\u00f0atilskipunin).", 20 | "target": "Names and taxonomic concepts as defined by the species lists in Directives 2009/147/EC (Birds Directive) and 92/43/EEC (Habitats Directive)." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.ja-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u7269\u7406\u7684\u30bb\u30ad\u30e5\u30ea\u30c6\u30a3\u3068\u74b0\u5883\u5236\u5fa1 Claris\u3067\u306f\u3001\u30db\u30b9\u30c6\u30a3\u30f3\u30b0\u30b5\u30fc\u30d3\u30b9\u306b Amazon Web Services(AWS)\u3092\u5229\u7528\u3057\u3066\u3044\u307e\u3059\u3002", 4 | "target": "Physical Security and Environmental Controls Claris uses Amazon Web Services (AWS) for its hosting needs." 5 | }, 6 | { 7 | "source": "SOC \u306f\u3001\u91cd\u5927\u306a\u640d\u5bb3\u304c\u767a\u751f \u3059\u308b\u524d\u306b\u3001\u653b\u6483\u3092\u5148\u53d6\u308a\u3057\u305f\u308a\u3001\u89e3\u6c7a\u3057\u305f\u308a\u3059\u308b\u305f\u3081\u306e\u30b9\u30de\u30fc \u30c8\u3067\u52b9\u7387\u7684\u306a\u691c\u51fa\u3001\u8abf\u67fb\u3001\u5fdc\u7b54\u6a5f\u80fd\u3092\u5099\u3048\u308b\u5fc5\u8981\u304c\u3042\u308a\u307e\u3059\u3002", 8 | "target": "SOCs need to be empowered with smart and efficient detect, investigate, and respond capabilities to preempt attacks or resolve them before significant damage occurs." 9 | }, 10 | { 11 | "source": "\u30c0\u30a6\u30f3\u30ed\u30fc\u30c9\u3059\u308b\u30d5\u30a1\u30a4\u30eb\u306e\u66f8\u5f0f (CAB\u3001XPI\u3001\u307e\u305f\u306f CRX ) \u306f\u30af\u30e9\u30a4\u30a2\u30f3\u30c8\u306e\u30d6\u30e9\u30a6\u30b6\u30fc\u306b\u3088\u308a\u7570\u306a\u308a\u307e\u3059\u3002", 12 | "target": "The format of the file that is downloaded (CAB, XPI, or CRX) will depend on the client's browser." 13 | }, 14 | { 15 | "source": "Citrix ADM \u30c7\u30a3\u30b6\u30b9\u30bf\u30ea\u30ab\u30d0\u30ea(DR)\u6a5f\u80fd\u306f\u3001\u9ad8\u53ef\u7528\u6027\u30e2\u30fc\u30c9\u3067\u5c55\u958b\u3055\u308c\u305fCitrix ADM \u5b8c\u5168\u306a\u30b7\u30b9\u30c6\u30e0\u30d0\u30c3\u30af\u30a2\u30c3\u30d7\u3068\u30ea\u30ab\u30d0\u30ea\u6a5f\u80fd\u3092\u63d0\u4f9b\u3057\u307e\u3059\u3002", 16 | "target": "The Citrix ADM disaster recovery (DR) feature provides full system backup and recovery capabilities for Citrix ADM deployed in high availability mode." 17 | }, 18 | { 19 | "source": "\u3053\u306e ASIC \u304a\u3088\u3073 SoC \u306e\u5b8c\u5168\u306a\u308b\u30bd\u30ea\u30e5\u30fc\u30b7\u30e7\u30f3\u306f\u3001\u30cf\u30fc\u30c9\u30a6\u30a7\u30a2\u30d9\u30fc\u30b9\u3067\u3001\u6700\u5927 4096 x 4096 \u307e\u3067\u306eISO/IEC 14496-10 Advanced Video Coding Standard (MPEG-4 Part 10)\u898f\u683c\u306b\u5b8c\u5168\u6e96\u62e0\u3057\u305f\u30c7\u30b3\u30fc\u30c9\u304c\u53ef\u80fd\u3067\u3059\u3002", 20 | "target": "The perfect solution for ASICs and SoCs is hardware-based and capable of full ISO/IEC 14496-10 Advanced Video Coding Standard (MPEG-4 Part 10) compliance decoding up to a resolution of 4096 x 4096." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.ru-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u042d\u0440\u0438\u0445 \u0421\u0438\u0433\u0430\u043b \u00ab\u041c\u0443\u0436\u0447\u0438\u043d\u0430, \u0436\u0435\u043d\u0449\u0438\u043d\u0430 \u0438 \u0440\u0435\u0431\u0451\u043d\u043e\u043a\u00bb, 1982 \u2014 \u0438\u044e\u043b\u044c-\u0430\u0432\u0433\u0443\u0441\u0442, \u2116 4.", 4 | "target": "Erich Segal \u00abMan, Woman and Child\u00bb, 1982 \u2013 July\u2013August , \u21164." 5 | }, 6 | { 7 | "source": "\u0421\u0430\u0439\u0442 \u0438 \u0423\u0441\u043b\u0443\u0433\u0430 \u043f\u0440\u0435\u0434\u043e\u0441\u0442\u0430\u0432\u043b\u044f\u044e\u0442\u0441\u044f \u0438 \u0443\u043f\u0440\u0430\u0432\u043b\u044f\u044e\u0442\u0441\u044f Tyche Technologies AG. \u0421\u0430\u0439\u0442 www.greeklesibians.lesbianscrowd.com.", 8 | "target": "The Website and the service is hosted and administered by Tyche Technologies AG. Website www.greeklesibians.lesbianscrowd.com." 9 | }, 10 | { 11 | "source": "\u041e\u0431\u0443\u0447\u0435\u043d\u0438\u0435 \u0430\u043d\u0433\u043b\u0438\u0439\u0441\u043a\u043e\u043c\u0443 \u0441\u043e speaking24.com - \u0441\u043b\u043e\u0432\u043e \u0434\u043d\u044f - TO EXHORT", 12 | "target": "Learn English with speaking24.com - word of the day - TO EXHORT" 13 | }, 14 | { 15 | "source": "\u041c\u0435\u043d\u0434\u0435\u043b\u0435\u0435\u0432\u0441\u043a (\u0420\u0435\u0441\u043f\u0443\u0431\u043b\u0438\u043a\u0430 \u0422\u0430\u0442\u0430\u0440\u0441\u0442\u0430\u043d) 85549251 ** \u0422\u0435\u043b\u0435\u0444\u043e\u043d", 16 | "target": "Mendeleevsk (The Republic Of Tatarstan) 85549251 ** Phone" 17 | }, 18 | { 19 | "source": "Moncler \u0420\u043e\u0434 \u041f\u0430\u043b\u044c\u0442\u043e \u0412\u043d\u0438\u0437 \u041c\u0443\u0436\u0447\u0438\u043d\u044b \u0427\u0435\u0440\u043d\u044b\u0439, Moncler\u043e\u0434\u0435\u0436\u0434\u0430 \u043e\u0449\u0443\u0449\u0435\u043d\u0438\u0435 \u043d\u0430\u0447\u0438\u043d\u0430\u044f \u0441 \u043c\u043e\u0434\u044b \u0441\u0442\u043e\u043b\u0438\u0446\u0430 \u043c\u0438\u0440\u0430, \u0424\u0440\u0430\u043d\u0446\u0438\u044f. Moncler \u0431\u044b\u043b \u043d\u0430\u0439\u0434\u0435\u043d \u0432 1950 \u0433\u043e\u0434\u0443 \u0420\u0435\u043d\u0435 Ramilion . \u042d\u0442\u0430 \u043a\u043e\u043b\u043b\u0435\u043a\u0446\u0438\u044f \u0431\u044b\u043b\u0430 \u0441\u043e\u0437\u0434\u0430\u043d\u0430 \u0434\u043b\u044f \u0438\u043d\u0442\u0435\u0440\u043f\u0440\u0435\u0442\u0430\u0446\u0438\u0438 \u0433\u043e\u0440\u043e\u0434\u0441\u043a\u043e\u0439 \u0438 \u0441\u043e\u0432\u0440\u0435\u043c\u0435\u043d\u043d\u044b\u0439 \u0441\u0442\u0438\u043b\u044c \u0433\u043b\u0430\u0437\u0430\u043c\u0438 \u043a\u043b\u0430\u0441\u0441 \u0438 \u044d\u043b\u0435\u0433\u0430\u043d\u0442\u043d\u043e\u0441\u0442\u044c . Moncler \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u043b\u044f\u0435\u0442 \u0441\u043e\u0431\u043e\u0439 \u0441\u0431\u043e\u0440\u043d\u0438\u043a, \u0441\u043f\u0435\u0446\u0438\u0430\u043b\u0438\u0437\u0438\u0440\u0443\u044e\u0449\u0430\u044f\u0441\u044f \u043d\u0430 \u0432\u0435\u0440\u0445\u043d\u044e\u044e \u043e\u0434\u0435\u0436\u0434\u0443 .", 20 | "target": "Moncler Rod Coat Down Men Black,Moncler a clothing sensation beginning in the fashion capital of the world, France. Moncler was found in the 1950's by Rene Ramilion. 
This collection has been created to interpret an urban and modern style through the eyes of class and elegance. Moncler is a collection specialising in outdoor clothing." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.uk-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0443 \u043c\u0456\u0441\u0442\u0456 \u041c\u0430\u0434\u0430\u0440\u0445 (v4), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0432 \u043a\u0440\u0430\u0457\u043d\u0456 \u041c\u0430\u0440\u043e\u043a\u043a\u043e (v4), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0443 \u043c\u0456\u0441\u0442\u0456 \u041c\u0430\u0434\u0430\u0440\u0445 (v1), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0432 \u043a\u0440\u0430\u0457\u043d\u0456 \u041c\u0430\u0440\u043e\u043a\u043a\u043e (v1), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0443 \u043c\u0456\u0441\u0442\u0456 \u041c\u0430\u0434\u0430\u0440\u0445 (v2), \u043f\u0440\u043e\u0433\u043d\u043e\u0437 \u043f\u043e\u0433\u043e\u0434\u0438 \u0432 \u043a\u0440\u0430\u0457\u043d\u0456 \u041c\u0430\u0440\u043e\u043a\u043a\u043e (v2)", 4 | "target": "weather forecast in Madagh (v4), weather forecast in Morocco (v4), weather forecast in Madagh (v1), weather forecast in Morocco (v1), weather forecast in Madagh (v2), weather forecast in Morocco (v2)" 5 | }, 6 | { 7 | "source": "- \u043f\u0440\u0456\u0437\u0432\u0438\u0449\u0435, \u0456\u043c'\u044f, \u043f\u043e \u0431\u0430\u0442\u044c\u043a\u043e\u0432\u0456 (\u0437\u0430 \u043d\u0430\u044f\u0432\u043d\u043e\u0441\u0442\u0456), \u0434\u0430\u0442\u0430 \u043d\u0430\u0440\u043e\u0434\u0436\u0435\u043d\u043d\u044f \u0431\u043e\u0440\u0436\u043d\u0438\u043a\u0430 \u2014 \u0444\u0456\u0437\u0438\u0447\u043d\u043e\u0457 \u043e\u0441\u043e\u0431\u0438 \u0430\u0431\u043e \u043d\u0430\u0439\u043c\u0435\u043d\u0443\u0432\u0430\u043d\u043d\u044f, \u0456\u0434\u0435\u043d\u0442\u0438\u0444\u0456\u043a\u0430\u0446\u0456\u0439\u043d\u0438\u0439 \u043a\u043e\u0434 \u044e\u0440\u0438\u0434\u0438\u0447\u043d\u043e\u0457 \u043e\u0441\u043e\u0431\u0438 \u0443 \u0404\u0434\u0438\u043d\u043e\u043c\u0443 \u0434\u0435\u0440\u0436\u0430\u0432\u043d\u043e\u043c\u0443 \u0440\u0435\u0454\u0441\u0442\u0440\u0456 \u044e\u0440\u0438\u0434\u0438\u0447\u043d\u0438\u0445 \u043e\u0441\u0456\u0431, \u0444\u0456\u0437\u0438\u0447\u043d\u0438\u0445 \u043e\u0441\u0456\u0431-\u043f\u0456\u0434\u043f\u0440\u0438\u0454\u043c\u0446\u0456\u0432 \u0442\u0430 \u0433\u0440\u043e\u043c\u0430\u0434\u0441\u044c\u043a\u0438\u0445 \u0444\u043e\u0440\u043c\u0443\u0432\u0430\u043d\u044c \u0431\u043e\u0440\u0436\u043d\u0438\u043a\u0430 \u2014 \u044e\u0440\u0438\u0434\u0438\u0447\u043d\u043e\u0457 \u043e\u0441\u043e\u0431\u0438;", 8 | "target": "- surname, name, patronymic (if any), date of birth of the debtor - an individual or name, identification code of the legal entity in the Unified State Register of Legal Entities, individuals-entrepreneurs and public entities of the debtor - a legal entity;" 9 | }, 10 | { 11 | "source": "\u041f\u0435\u0440\u0435\u0433\u043b\u044f\u043d\u044c\u0442\u0435 \u0446\u0456\u043d\u0438 \u043d\u0430 \u0430\u0432\u0456\u0430\u043f\u0435\u0440\u0435\u043b\u0456\u0442 
\u0412\u0435\u0440\u043e\u043d\u0430 \u2014 \u0421\u0430\u043d\u043a\u0442-\u041f\u0435\u0442\u0435\u0440\u0431\u0443\u0440\u0433 \u043d\u0430 \u043d\u0430\u0439\u0431\u043b\u0438\u0436\u0447\u0456 \u0434\u0432\u0430 \u0442\u0438\u0436\u043d\u0456 \u0456 \u0432\u0438\u0431\u0435\u0440\u0456\u0442\u044c \u043e\u043f\u0442\u0438\u043c\u0430\u043b\u044c\u043d\u0438\u0439 \u0432\u0430\u0440\u0456\u0430\u043d\u0442, \u044f\u043a\u0438\u0439 \u043f\u0456\u0434\u0445\u043e\u0434\u0438\u0442\u044c \u0432\u0430\u043c \u0456 \u0437\u0430 \u0432\u0430\u0440\u0442\u0456\u0441\u0442\u044e, \u0456 \u0437\u0430 \u0447\u0430\u0441\u043e\u043c.", 12 | "target": "Look at the air fares Verona \u2014 St Petersburg for the next two weeks and choose the option that fits you, in price and in time." 13 | }, 14 | { 15 | "source": "Accace Ukraine \u0440\u0430\u0437\u043e\u043c \u0437 \u043a\u043e\u043c\u0430\u043d\u0434\u043e\u044e EBA Education \u0437\u0430\u043f\u0440\u043e\u0448\u0443\u044e\u0442\u044c \u0432\u0430\u0441 \u0432\u0437\u044f\u0442\u0438 \u0443\u0447\u0430\u0441\u0442\u044c \u0443 EBA Education Update: \u0410\u0443\u0434\u0438\u0442 \u0435\u0444\u0435\u043a\u0442\u0438\u0432\u043d\u043e\u0441\u0442\u0456.", 16 | "target": "EBA Education Team together with Accace Ukraine invite you to join the EBA Education Update: Performance Audit." 17 | }, 18 | { 19 | "source": "Almatherm , IP \u0432 \u0410\u043b\u043c\u0430\u0442\u0438 _ \u0406\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043c\u0430\u0433\u0430\u0437\u0438\u043d Almatherm , IP \u0410\u043b\u043c\u0430\u0442\u0438 (\u041a\u0430\u0437\u0430\u0445\u0441\u0442\u0430\u043d)", 20 | "target": "Almatherm , IP in Almaty _ Online-store Almatherm , IP Almaty (Kazakhstan)" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/Filtered-5-shot/shots.zh-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "\u5728\u4ed6\u7684\u4f5c\u54c1\u5f00\u59cb\u5728\u610f\u5927\u5229\u4e8e1924\u5e74\u5728\u5f88\u5927\u7a0b\u5ea6\u4e0a\u4f5c\u98ce\"\u610f\u5927\u5229\u4ee3\u6570\u51e0\u4f55 , \" Zariski\u610f\u8bc6\u5230,\u6574\u4e2a\u95ee\u9898\u9700\u8981\u9002\u5f53\u7684\u57fa\u7840\u3002", 4 | "target": "After beginning his work in Italy in 1924 very much in the style of \"Italian algebraic geometry,\" Zariski realised that the whole subject needed proper foundations." 5 | }, 6 | { 7 | "source": "\u9776\u5411DSB\u5f62\u6210\u540e,\u7ec6\u80de\u901a\u5e38\u4f7f\u7528\u4e24\u79cdDNA\u4fee\u590d\u9014\u5f84\u4e2d\u7684\u4e00\u79cd\u6765\u5b58\u6d3b:\u975e\u540c\u6e90\u672b\u7aef\u8fde\u63a5(NHEJ)\u6216\u540c\u6e90\u4f9d\u8d56\u6027\u4fee\u590d(HDR ) \u3002", 8 | "target": "Following targeted DSB formation, the cell typically uses one of two DNA repair pathways to survive: non-homologous end joining (NHEJ) or homology dependent repair (HDR)." 9 | }, 10 | { 11 | "source": "\u300a\u85c9\u7531\u539f\u5b50\u5f48\u62b5\u9054\u706b\u661f\uff1a\u7375\u6236\u5ea7\u8a08\u5283\u79d8\u53f2\u300b\uff08To Mars by A-Bomb: The Secret History of Project Orion\uff09\u662f\u4e00\u90e82003\u5e74\u82f1\u570b\u5ee3\u64ad\u516c\u53f8\uff08BBC\uff09\u95dc\u65bc\u8a72\u8a08\u5283\u7684\u7d00\u9304\u7247\u3002", 12 | "target": "To Mars by A-Bomb: The Secret History of Project Orion was a 2003 BBC documentary film about the project." 
13 | }, 14 | { 15 | "source": "1998\u5e74\u8fdb\u884c\u7684\u4e00\u9879\u8c03\u67e5\u53d1\u73b0\uff0c\u65e5\u672c\u670929.5%\u7684\u4eba\u53e3\u76f8\u4fe1\u6765\u751f\uff0c\u8fd8\u6709\u53e6\u591640%\u613f\u610f\u76f8\u4fe1\uff0c\u5e76\u4e14\u53c8\u4ee5\u5e74\u8f7b\u4eba\u76f8\u4fe1\u7684\u6bd4\u4f8b\u6700\u9ad8\u3002", 16 | "target": "A 1998 survey found that 29.5% of the Japanese population believed in an afterlife, and a further 40% wanted to believe; belief was highest among the young." 17 | }, 18 | { 19 | "source": "2001\u5e7412\u6708\uff0cRTECS\u88abNIOSH\u8f6c\u8ba9\u7ed9\u4e86\u7231\u601d\u552f\u5c14MDL\uff08Elsevier MDL\uff09\uff0c\u4e00\u5bb6\u79c1\u8425\u516c\u53f8\u3002", 20 | "target": "In December 2001 RTECS was transferred from NIOSH to the private company Elsevier MDL." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.cs-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Podle všeho šlo zcela určitě o pokračující bitvu.", 4 | "target": "It appears that this was definitely an ongoing battle." 5 | }, 6 | { 7 | "source": "Nová doba přinesla nové návody.", 8 | "target": "The new age brought new instructions." 9 | }, 10 | { 11 | "source": "Jenže změnila název a IČO a nebylo co řešit.", 12 | "target": "The company then changed its company ID number, and that was it." 13 | }, 14 | { 15 | "source": "Chodí s dopomocí a jezdí na kole.", 16 | "target": "He's walking with help, and riding a bike." 17 | }, 18 | { 19 | "source": "V Londýně zemřel cyklista po srážce s automobilem.", 20 | "target": "A cyclist has died in a collision involving a car in London." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.de-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Die Lehrstellen beim Kreis sind offenbar sehr begehrt.", 4 | "target": "The apprenticeship positions at the district are apparently very coveted." 5 | }, 6 | { 7 | "source": "Michael Rottmann, Manager der Halle, berichtet, dass bereits nach der Berufswahlmesse am 19. September ein entsprechender Schriftzug auf dem Werbeschild am Treppenaufgang zum Gempt-Bistro entdeckt worden sei.", 8 | "target": "Michael Rottmann, manager of the hall, reports that writing was found on the advertising sign on the staircase to the Gempt Bistro on September 19 after the career choice fair." 9 | }, 10 | { 11 | "source": "Die CDU mit ihrem Spitzenkandidaten, dem Innenminister Lorenz Caffier, hat schon erlebt, wie es ist, wenn man zur falschen Zeit am richtigen Ort Wahlkampf macht.", 12 | "target": "The CDU and its top-level candidate, Minister of the Interior Lorenz Caffier, has already experienced what happens when you fight an election in the right place at the wrong time." 13 | }, 14 | { 15 | "source": "Sechs Monate Bauarbeiten, das ist schon brutal.", 16 | "target": "Six months of construction works, that's brutal." 17 | }, 18 | { 19 | "source": "Die Därme der Mäuse wurden DNA-sequenziert und es wurden sechs Bakterienarten gefunden, die in den Mäusen mit den Immunzellen vorhanden waren, aber ohne sie bei den Mäusen fehlten.", 20 | "target": "The guts of the mice were DNA sequenced and it was found six bacterial species present in the mice with the immune cells but absent from the mice without them." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-cs.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Garcia said he appreciates Lomachenko as \"a tremendous fighter,\" and may attend his Saturday bout.", 4 | "target": "Garcia uvedl, že uznává Lomačenka jako „úžasného boxera“ a mohl by se zúčastnit jeho sobotního zápasu." 5 | }, 6 | { 7 | "source": "My supervisor, wo was Czech, once told me about the French football team and said something along the lines of:", 8 | "target": "Moje vedoucí, Češka, mi jednou vyprávěla o francouzském fotbalovém týmu a pronesla něco ve smyslu: Vždyť to nejsou Francouzi, ale Afričané." 9 | }, 10 | { 11 | "source": "The report, being considered by Theresa May, also calls for tougher checks on registration to prevent the electoral register being used for immigration and benefit fraud.", 12 | "target": "Zpráva, kterou nyní hodnotí Theresa Mayová, také žádá o důkladnější kontroly při registracích, aby se zabránilo využívání volebních seznamů pro imigrační podvody a podvody s dávkami." 13 | }, 14 | { 15 | "source": "As Miller scolded Acosta: \"I don't want to get off into a whole thing about history here, but the Statue of Liberty is ... a symbol of American liberty lighting the world.", 16 | "target": "Miller pokáral Acostu slovy: „Nechci se tady pouštět do historie, ale Socha Svobody je... symbol americké svobody, který vrhá světlo na celý svět." 17 | }, 18 | { 19 | "source": "People will also not be able to vote at the Consulate General in Donetsk.", 20 | "target": "Volit se nebude ani na generálním konzulátu v Doněcku." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-de.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "According to Rouhani, European signatories of the 2015 nuclear deal urged him to meet with US President Donald Trump, promising that Washington would lift \"all\" sanctions in return.", 4 | "target": "Laut Rouhani drängten ihn europäische Unterzeichnerstaaten der Nuklearvereinbarung von 2015 zu einem Treffen mit dem amerikanischen Präsidenten Donald Trump, und versprachen, dass Washington in Gegenzug „alle“ Sanktionen aufheben würden." 5 | }, 6 | { 7 | "source": "My last stop is a shop said to have inspired a fresh generation of vinyl lovers: Urban Outfitters.", 8 | "target": "Mein letzter Halt ist ein Geschäft, von dem gesagt wird, dass eine neue Generation von Plattenliebhabern inspiriert haben soll: Urban Outfitters." 9 | }, 10 | { 11 | "source": "\"You've really just got to dig deep and worry about your own match,\" Spieth said.", 12 | "target": "„Du musst dich wirklich stark konzentrieren und dir nur Sorgen um dein eigenes Spiel machen\", sagte Spieth." 13 | }, 14 | { 15 | "source": "The Report opens with plea for open debate and the formation of a consensus in the United States about the policy towards the Middle East.", 16 | "target": "Der Bericht fängt an mit der Bitte um einen offenen Diskurs und um Konsensbildung in den USA über eine Strategie für den Mittleren Osten." 17 | }, 18 | { 19 | "source": "Who should go to a sleep lab, and what happens there?", 20 | "target": "Wer sollte ins Schlaflabor, und was passiert da?" 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-is.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "In the year 1970, Raymond Damadian, a medical doctor and research scientist, discovered the basis for using magnetic resonance imaging as a tool for medical diagnosis.", 4 | "target": "Árið 1970 uppgötvaði Raymond Damatian, læknir og rannsakandi, undirstöðuatriðin við að nota segulómun við læknisfræðilega greiningu." 5 | }, 6 | { 7 | "source": "It is related to but usually not involving alpine style ski touring or mountaineering, the latter ones done in steep terrain and requiring much stiffer skis and boots.", 8 | "target": "Þetta er skyld skíðaíþrótt en felur yfirleitt ekki í sér alpagreinar né fjallaklifur, en síðarnefndu íþróttagreinarnar fara fram í miklum bratta og krefjast notkunar á mun stífari skíðum og skíðaskóm." 9 | }, 10 | { 11 | "source": "In late 2015, TogiNet established AstroNet Radio as a subsidiary station.", 12 | "target": "Síðari hluta árs 2015 stofnaði TogiNet útvarpsstöðina AstroNet sem dótturstöð." 13 | }, 14 | { 15 | "source": "Boating is a national pastime in Finland, with a boat to every seven or eight people.", 16 | "target": "Siglingar á bátum eru vinsæl dægrastytting í Finnlandi þar sem er einn bátur á hverja sjö til átta einstaklinga." 17 | }, 18 | { 19 | "source": "Dustin \"Goldust\" Runnels commented that \"Luna was as freaky as me...maybe even more...love her and will miss her...hopefully she's in a better place.\"", 20 | "target": "Dustin \"Goldust\" Runnels sagði: \"Luna var jafn klikkuð og ég ... jafnvel klikkaðri ... elska hana og mun sakna hennar ... vonandi er hún á betri stað.\"" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-ru.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "During this period of European history, the Catholic Church, which had become rich and powerful, came under scrutiny.", 4 | "target": "В течение этого периода европейской истории, католическая церковь, ставшая богатой и сильной, оказалась объектом пристального внимания." 5 | }, 6 | { 7 | "source": "House Intelligence Committee Chairman Adam Schiff (D-Calif.) condemned Trump's attacks on the whistleblower and his or her sources.", 8 | "target": "Председатель Комитета палаты представителей США по разведке Адам Шифф осудил нападение Трампа на осведомителя и его источники." 9 | }, 10 | { 11 | "source": "According to the RT Telegram channel, the journalists wore special protective gear and were there for less than five minutes, since the ruins of the reactor that was destroyed by the explosion still emit 40 thousand times more radiation than normal.", 12 | "target": "Как сообщается в Telegram-канале RT, журналисты находились там в специальных защитных костюмах менее пяти минут, так как руины разрушенного взрывом энергоблока до сих пор источают радиацию, в 40 тысяч раз превышающую норму." 13 | }, 14 | { 15 | "source": "\"Best Screenplay\" was awarded to the work of Sergey Dmitrenko based on stories told by Fazil Iskander in the book \"Sandro from Chegem\".", 16 | "target": "\"Лучший киносценарий\" - работа Сергея Дмитренко по мотивам историй, рассказанных Фазилем Искандером в книге \"Сандро из Чегема\"." 
17 | }, 18 | { 19 | "source": "An action participant, Nevroz Duman claimed that the aim of the march is to show that the society is united and consolidated about this issue.", 20 | "target": "Одна из участниц акции Невроз Думан заявила, что цель демонстрации - показать единство общества и солидарность в данной проблеме." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.en-zh.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "But playing can be tough.", 4 | "target": "但是踢球有时会很艰难。" 5 | }, 6 | { 7 | "source": "\"In the coming months, we will consult on introducing a complete ban on the advertising and promotion of vapour products.\"", 8 | "target": "“在未来几个月,我们将就全面禁止雾化产品的广告和促销进行磋商。”" 9 | }, 10 | { 11 | "source": "A traveller with piles of money might consider a round the world flight, broken up with stays in many of these hotels.", 12 | "target": "一名腰缠万贯的旅客可能会考虑乘飞机环游世界,并在旅途中陆续入住许多家这样的酒店。" 13 | }, 14 | { 15 | "source": "This is the second institution of higher learning jointly established by a US$130 million donation by the Li Ka Shing Foundation. With effect from next year, the Foundation will commit another RMB2 billion to build Shantou University.", 16 | "target": "该校为李嘉诚基金会捐资1.3亿美元合作筹建的第二所高校,而李嘉诚基金会明年起将再投资20亿元(人民币,下同)建设汕头大学。" 17 | }, 18 | { 19 | "source": "The 25th Asia-Pacific Economic Cooperation (APEC) Economic Leaders' Meeting was held in the central Vietnam city of Da Nang from the 10th to 11th. The progress of the Free Trade Area of the Asia-Pacific (FTAAP) will be one of the highlights of this meeting.", 20 | "target": "亚太经合组织(APEC)第二十五次领导人非正式会议将于10日至11日在越南中部城市岘港举行,亚太自贸区(FTAAP)进程是此次会议讨论的热点之一。" 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.is-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Vörubílstjórinn, sem er 64 ára, slasaðist ekki við áreksturinn.", 4 | "target": "The truck driver, who is aged 64, was not injured in the crash." 5 | }, 6 | { 7 | "source": "Það er auðveldlega hægt að gera með því að nota tiltölulega hljóðláta vekjaraklukku til að koma þér til meðvitundar án þess að vekja þig að fullu.", 8 | "target": "This can be easily done by using a relatively quiet alarm clock to bring you to consciousness without fully waking you." 9 | }, 10 | { 11 | "source": "Hsieh gaf í skyn þegar kosningarnar fóru fram að Ma kynni að flýja land um leið og neyðarástand yrði.", 12 | "target": "Hsieh implied during the election that Ma might flee the country during a time of crisis." 13 | }, 14 | { 15 | "source": "Þú verður alltaf að bóka beint hjá flugfélaginu í gegnum síma.", 16 | "target": "In all cases, you must book by phone directly with the airline." 17 | }, 18 | { 19 | "source": "Kannski er algengasta tegund ferðaþjónustu sú sem fólk tengir við ferðalög: Afþreyingarferðamennska.", 20 | "target": "Perhaps the most common type of tourism is what most people associate with traveling: Recreation tourism." 
21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.ru-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "Материалы переданы в Пресненский районный суд Москвы для рассмотрения по существу.", 4 | "target": "The files have been handed over to Moscow’s Presnensky District Court for consideration on the merits." 5 | }, 6 | { 7 | "source": "По словам Бондарева, за три года в Сирии уничтожены десятки тысяч объектов террористов - речь идет, например, о складах боеприпасов, укрепрайонах и штабах.", 8 | "target": "In Bondarev's words, after three years in Syria tens of thousands of terrorist targets have been destroyed - including, for example, ammunition stockpiles, fortifications, and headquarters." 9 | }, 10 | { 11 | "source": "Мне показали эту квартиру в Эрлс Корт, и здесь были такие же высокие потолки.", 12 | "target": "I was shown this apartment in Earls Court and it had the same high ceilings." 13 | }, 14 | { 15 | "source": "\"Я сказал, нигга, убирайся из моего дома!\".", 16 | "target": "\"I said n---a, get off my property!\"" 17 | }, 18 | { 19 | "source": "Однако говорить о триумфе Европы пока преждевременно.", 20 | "target": "Talk of European glory remains premature, though." 21 | } 22 | ] -------------------------------------------------------------------------------- /human_written_data/HW-5-shot/shots.zh-en.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "source": "这将造福中国人民,也将造福世界各国人民。", 4 | "target": "This will benefit the Chinese people, and benefit all the peoples of the world as well." 5 | }, 6 | { 7 | "source": "人工智能有很强的科幻色彩,但它其实是计算机科学非常重要的一个分支,研究的是机器的行为、学习和智能适应。", 8 | "target": "Although AI has a strong connotation of science fiction, AI forms a very important branch of computer science, dealing with behavior, learning and intelligent adaptation in a machine." 9 | }, 10 | { 11 | "source": "井贤栋表示,过去3年“蚂蚁森林”影响了5亿人,未来3年“蚂蚁森林”的目标是带动全球10亿人参与低碳行动。", 12 | "target": "Jing said \"Ant Forest\" had affected 500 million people in the past 3 years, and its goal in the next 3 years is to inspire 1 billion people worldwide to take part in low-carbon activities." 13 | }, 14 | { 15 | "source": "接下来,研究人员等母狮体内的荷尔蒙含量达到合适程度时,为它进行人工授精。", 16 | "target": "Next, the researchers waited for the hormones in the female lion to reach a proper level so as to conduct the artificial insemination in it." 17 | }, 18 | { 19 | "source": "划转工作时间紧,任务重 。 ”", 20 | "target": "The transfer work is tough job in limited time. 
\"" 21 | } 22 | ] -------------------------------------------------------------------------------- /install_alma.sh: -------------------------------------------------------------------------------- 1 | pip install transformers==4.51.1 2 | pip install peft==0.13.0 3 | pip install sentencepiece 4 | pip install sacrebleu 5 | pip install ipython 6 | pip install datasets 7 | pip install evaluate 8 | pip3 install deepspeed==0.15.1 9 | pip install einops 10 | pip install wandb 11 | pip install zstandard 12 | pip install accelerate==0.34.2 13 | pip install jsonlines 14 | pip install trl 15 | -------------------------------------------------------------------------------- /outputs/wmt22_outputs/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /run_cpo_llmmt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import logging 5 | import os 6 | import sys 7 | import json 8 | 9 | import datasets 10 | import torch 11 | from datasets import load_dataset 12 | 13 | import transformers 14 | from transformers import ( 15 | HfArgumentParser, 16 | set_seed, 17 | ) 18 | from utils.utils import preprocess_cpo_data, load_tokenizer, load_model, SavePeftModelCallback 19 | from utils.arguments import ModelArguments, DataTrainingArguments 20 | from trl import CPOTrainer, CPOConfig 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | def main(): 25 | # See all possible arguments in src/transformers/training_args.py 26 | # or by passing the --help flag to this script. 27 | # We now keep distinct sets of args, for a cleaner separation of concerns. 28 | 29 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, CPOConfig)) 30 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 31 | # If we pass only one argument to the script and it's the path to a json file, 32 | # let's parse it to get our arguments. 33 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 34 | else: 35 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 36 | 37 | # Setup logging 38 | logging.basicConfig( 39 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 40 | datefmt="%m/%d/%Y %H:%M:%S", 41 | handlers=[logging.StreamHandler(sys.stdout)], 42 | ) 43 | 44 | if training_args.should_log: 45 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
46 | transformers.utils.logging.set_verbosity_info() 47 | 48 | log_level = training_args.get_process_log_level() 49 | logger.setLevel(log_level) 50 | datasets.utils.logging.set_verbosity(log_level) 51 | transformers.utils.logging.set_verbosity(log_level) 52 | transformers.utils.logging.enable_default_handler() 53 | transformers.utils.logging.enable_explicit_format() 54 | 55 | # Log on each process the small summary: 56 | logger.warning( 57 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 58 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 59 | ) 60 | logger.info(f"Training/evaluation parameters {training_args}") 61 | 62 | # Get the datasets 63 | pairs = set(data_args.language_pairs.split(",")) 64 | train_raw_data, valid_raw_data, test_raw_data = {}, None, None 65 | seen = set() 66 | ## load cpo dataset 67 | train_raw_data["mmt"] = {} 68 | for pair in pairs: 69 | src_lang, tgt_lang = pair.split("-") 70 | first_lang = src_lang if src_lang != "en" else tgt_lang 71 | second_lang = "en" 72 | if (first_lang, second_lang) not in seen and training_args.do_train: 73 | train_raw_data["mmt"][f"{first_lang}-{second_lang}"] = load_dataset( 74 | data_args.cpo_data_path, 75 | f"{first_lang}-{second_lang}", 76 | cache_dir=model_args.cache_dir, 77 | use_auth_token=True if model_args.use_auth_token else None, 78 | streaming=data_args.streaming, 79 | ) 80 | seen.add((first_lang, second_lang)) 81 | 82 | # load tokenizer 83 | set_seed(training_args.seed) 84 | tokenizer = load_tokenizer(data_args, model_args, training_args, logger) 85 | 86 | shots_eval_dict = {} 87 | if data_args.few_shot_eval_path: 88 | for lg_pair in pairs: # test_raw_data is None in this script, so iterate over the requested language pairs 89 | pair_shot_path = os.path.join(data_args.few_shot_eval_path, f"shots.{lg_pair}.json") 90 | if not os.path.isfile(pair_shot_path): 91 | raise ValueError(f"Make sure the language pair {lg_pair} is in the few shot eval folder!") 92 | with open(pair_shot_path) as f: 93 | shots_eval_dict[lg_pair] = json.load(f) 94 | 95 | # Preprocess data 96 | train_datasets, eval_datasets, test_datasets = preprocess_cpo_data(train_raw_data, valid_raw_data, test_raw_data, pairs, tokenizer, shots_eval_dict, data_args, training_args, model_args) 97 | 98 | # Load model 99 | model = load_model(data_args, model_args, training_args, tokenizer, logger) 100 | 101 | # Initialize our Trainer 102 | trainer = CPOTrainer( 103 | model, 104 | args=training_args, 105 | train_dataset=train_datasets, 106 | eval_dataset=eval_datasets, 107 | tokenizer=tokenizer, 108 | callbacks=[SavePeftModelCallback] if model_args.use_peft else None, 109 | ) 110 | # Training 111 | if training_args.do_train: 112 | checkpoint = None 113 | if training_args.resume_from_checkpoint is not None: 114 | checkpoint = training_args.resume_from_checkpoint 115 | 116 | trainer.train(resume_from_checkpoint=checkpoint) 117 | 118 | trainer.save_state() 119 | if model_args.use_peft: 120 | if torch.distributed.get_rank() == 0: 121 | model.save_pretrained(training_args.output_dir) 122 | else: 123 | trainer.save_model() # Saves the tokenizer too for easy upload 124 | 125 | def _mp_fn(index): 126 | # For xla_spawn (TPUs) 127 | main() 128 | 129 | 130 | if __name__ == "__main__": 131 | main() -------------------------------------------------------------------------------- /run_llmmt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | import logging 5 | 
import copy 6 | import math 7 | import os 8 | import sys 9 | import json 10 | import random 11 | from dataclasses import dataclass, field 12 | from itertools import chain 13 | from typing import Optional 14 | import numpy as np 15 | import jsonlines 16 | 17 | import datasets 18 | import evaluate 19 | import torch 20 | from datasets import load_dataset 21 | 22 | import transformers 23 | from transformers import ( 24 | CONFIG_MAPPING, 25 | MODEL_FOR_CAUSAL_LM_MAPPING, 26 | AutoConfig, 27 | AutoModelForCausalLM, 28 | AutoTokenizer, 29 | HfArgumentParser, 30 | Trainer, 31 | TrainingArguments, 32 | Seq2SeqTrainingArguments, 33 | default_data_collator, 34 | is_torch_tpu_available, 35 | set_seed, 36 | LlamaTokenizer, 37 | ) 38 | from transformers.testing_utils import CaptureLogger 39 | from transformers.trainer_utils import get_last_checkpoint 40 | from transformers.utils import check_min_version, send_example_telemetry 41 | from transformers.utils.versions import require_version 42 | from peft import LoraConfig, get_peft_model, TaskType 43 | from peft import PeftModel, PeftConfig 44 | from collections import defaultdict 45 | from transformers.trainer_callback import TrainerCallback 46 | from datasets import concatenate_datasets, interleave_datasets 47 | from utils.trainer_llmmt import LlmmtTrainer 48 | from utils.utils import LANG_TABLE, load_mmt_dataset, get_preprocessed_data, clean_outputstring, load_a_single_text_file, load_tokenizer, load_model, SavePeftModelCallback, get_key_suffix, NLLB_CODE, ISO1_ISO3_map 49 | from utils.arguments import ModelArguments, DataTrainingArguments 50 | from utils.ul2collator import DataCollatorForUL2 51 | 52 | logger = logging.getLogger(__name__) 53 | 54 | from peft import get_peft_config, get_peft_model, LoraConfig, TaskType 55 | 56 | from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR 57 | 58 | 59 | def main(): 60 | # See all possible arguments in src/transformers/training_args.py 61 | # or by passing the --help flag to this script. 62 | # We now keep distinct sets of args, for a cleaner separation of concerns. 63 | 64 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) 65 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 66 | # If we pass only one argument to the script and it's the path to a json file, 67 | # let's parse it to get our arguments. 68 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 69 | else: 70 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 71 | 72 | # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The 73 | # information sent is the one passed as arguments along with your Python/PyTorch versions. 74 | send_example_telemetry("run_llmmt", model_args, data_args) 75 | 76 | # Setup logging 77 | logging.basicConfig( 78 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 79 | datefmt="%m/%d/%Y %H:%M:%S", 80 | handlers=[logging.StreamHandler(sys.stdout)], 81 | ) 82 | 83 | if training_args.should_log: 84 | # The default of training_args.log_level is passive, so we set log level at info here to have that default. 
85 | transformers.utils.logging.set_verbosity_info() 86 | 87 | log_level = training_args.get_process_log_level() 88 | logger.setLevel(log_level) 89 | datasets.utils.logging.set_verbosity(log_level) 90 | transformers.utils.logging.set_verbosity(log_level) 91 | transformers.utils.logging.enable_default_handler() 92 | transformers.utils.logging.enable_explicit_format() 93 | 94 | # Log on each process the small summary: 95 | logger.warning( 96 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 97 | + f", distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 98 | ) 99 | logger.info(f"Training/evaluation parameters {training_args}") 100 | 101 | # Get the datasets 102 | pairs = data_args.language_pairs.split(",") 103 | train_raw_data, valid_raw_data, test_raw_data = {}, None, {} 104 | if data_args.text_test_file: 105 | test_raw_data["mmt"] = load_a_single_text_file(pairs, data_args, model_args) 106 | elif data_args.mmt_data_path: 107 | mmt_train_raw_data, valid_raw_data, mmt_test_raw_data = load_mmt_dataset(pairs, data_args, model_args, training_args, logger) 108 | train_raw_data["mmt"] = mmt_train_raw_data 109 | test_raw_data["mmt"] = mmt_test_raw_data 110 | 111 | load_kwargs = { 112 | 'cache_dir': model_args.cache_dir, 113 | 'token': True if model_args.use_auth_token else None, 114 | 'streaming': data_args.streaming, 115 | "trust_remote_code": True, 116 | } 117 | 118 | if data_args.aya_datasets: 119 | considered_languages_ISO1 = sorted(set(lang for p in pairs for lang in p.split('-'))) 120 | considered_languages_ISO3 = [ISO1_ISO3_map[lang] for lang in considered_languages_ISO1] 121 | if training_args.do_train: 122 | aya_train_raw_data = load_dataset( 123 | data_args.aya_datasets, 124 | **load_kwargs, 125 | )['train'] 126 | 127 | train_raw_data["aya"] = aya_train_raw_data.filter(lambda x: x['language_code'] in considered_languages_ISO3) 128 | if training_args.do_predict: 129 | test_raw_data["aya"] = {} 130 | aya_test_raw_data = load_dataset( 131 | data_args.aya_datasets, 132 | **load_kwargs, 133 | )['test'] 134 | for lg in considered_languages_ISO1: 135 | sub_aya_test_raw_data = aya_test_raw_data.filter(lambda x: x['language_code'] in [ISO1_ISO3_map[lg]]) 136 | if len(sub_aya_test_raw_data) > 0: 137 | test_raw_data["aya"][lg] = sub_aya_test_raw_data 138 | 139 | if data_args.mono_data_path: 140 | train_raw_data["mono"] = load_dataset( 141 | "json", 142 | data_files=data_args.mono_data_path, 143 | **load_kwargs, 144 | ) 145 | 146 | if data_args.nllb_pretrain_data_path: 147 | if data_args.nllb_interleave_probs: 148 | interleave_probs = [float(p) for p in data_args.nllb_interleave_probs.split(",")] 149 | else: 150 | interleave_probs = [1/len(pairs)] * len(pairs) 151 | 152 | nllb_raw_data = [] 153 | for lg_pair in pairs: 154 | src_lang, tgt_lang = lg_pair.split("-") 155 | src_lang, tgt_lang = NLLB_CODE[src_lang], NLLB_CODE[tgt_lang] 156 | language_key = f"{src_lang}-{tgt_lang}" if src_lang < tgt_lang else f"{tgt_lang}-{src_lang}" 157 | 158 | if src_lang == "tha_Thai" or tgt_lang == "tha_Thai": 159 | lg_dataset = load_dataset( 160 | "Helsinki-NLP/opus-100", 161 | "en-th", 162 | **load_kwargs, 163 | )["train"].shuffle(seed=training_args.seed) 164 | else: 165 | lg_dataset = load_dataset( 166 | data_args.nllb_pretrain_data_path, 167 | language_key, 168 | **load_kwargs, 169 | )['train'].shuffle(seed=training_args.seed) 170 | 171 | def normalize_example(example): 172 | lg1, lg2 =
example["translation"].keys() 173 | if random.random() < 0.5: 174 | combined_translation = example["translation"][lg1] + " " + example["translation"][lg2] 175 | else: 176 | combined_translation = example["translation"][lg2] + " " + example["translation"][lg1] 177 | return { 178 | "raw_text": combined_translation, 179 | } 180 | 181 | lg_dataset = lg_dataset.map(normalize_example, remove_columns=lg_dataset.column_names) 182 | nllb_raw_data.append(lg_dataset) 183 | 184 | train_raw_data["nllb_pretrain"] = interleave_datasets(nllb_raw_data, probabilities=interleave_probs, seed=training_args.seed, stopping_strategy="all_exhausted") 185 | 186 | if data_args.oscar_data_path: 187 | oscar_langs = data_args.oscar_data_lang.split(",") 188 | if data_args.interleave_probs: 189 | interleave_probs = [float(p) for p in data_args.interleave_probs.split(",")] 190 | else: 191 | interleave_probs = [1/len(oscar_langs)] * len(oscar_langs) 192 | oscar_langs = [x for x, _ in sorted(zip(oscar_langs, interleave_probs), key=lambda zippair: zippair[1])] 193 | interleave_probs = sorted(interleave_probs) 194 | oscar_train_raw_data = [] 195 | 196 | for lg in oscar_langs: 197 | oscar_lg_data = load_dataset( 198 | data_args.oscar_data_path, 199 | lg, 200 | **load_kwargs 201 | )['train'].shuffle(seed=training_args.seed) 202 | # if data_args.oscar_data_path != "cc100" else 203 | # load_dataset( 204 | # data_args.oscar_data_path, 205 | # lang=lg, 206 | # **load_kwargs 207 | # )['train'].shuffle(seed=training_args.seed) 208 | 209 | def normalize_oscar_example(example): 210 | return { 211 | "raw_text": example["text"], 212 | } 213 | oscar_lg_data = oscar_lg_data.map(normalize_oscar_example, remove_columns=oscar_lg_data.column_names) 214 | oscar_train_raw_data.append(oscar_lg_data) 215 | train_raw_data["oscar"] = interleave_datasets(oscar_train_raw_data, probabilities=interleave_probs, seed=training_args.seed, stopping_strategy="all_exhausted") 216 | 217 | if "nllb_pretrain" in train_raw_data: 218 | ## only for nllb pretrain and oscar: 219 | train_raw_data["oscar"] = interleave_datasets([train_raw_data["oscar"], train_raw_data["nllb_pretrain"]], probabilities=[0.5, 0.5], seed=training_args.seed, stopping_strategy="all_exhausted").shuffle(seed=training_args.seed) 220 | train_raw_data.pop("nllb_pretrain") 221 | 222 | # load tokenizer 223 | set_seed(training_args.seed) 224 | tokenizer = load_tokenizer(data_args, model_args, training_args, logger) 225 | if data_args.use_ul2: 226 | assert data_args.use_prefix_lm, "Must enable use prefix language model" 227 | 228 | shots_eval_dict = {} 229 | if data_args.few_shot_eval_path: 230 | for lg_pair in test_raw_data["mmt"].keys(): 231 | pair_shot_path = os.path.join(data_args.few_shot_eval_path, f"shots.{lg_pair}.json") 232 | if not os.path.isfile(pair_shot_path): 233 | ValueError(f"Make sure the language pair {lg_pair} is in the few shot eval folder!") 234 | with open(pair_shot_path) as f: 235 | shots_eval_dict[lg_pair] = json.load(f) 236 | 237 | if model_args.chat_style: 238 | dummy_sentence = "This is a dummy sentence" 239 | chat_dummy_sentence = [{"role": "user", "content": dummy_sentence}] 240 | dummy_sentence_with_speical_tokens = tokenizer.apply_chat_template(chat_dummy_sentence, tokenize=False, add_generation_prompt=True) 241 | encoded = tokenizer.encode(dummy_sentence_with_speical_tokens, add_special_tokens=False) 242 | decoded_text = tokenizer.decode(encoded, skip_special_tokens=True) 243 | begin_prefix = decoded_text.split(dummy_sentence, 1)[0].strip() 244 | additional_suffix = 
decoded_text.split(dummy_sentence, 1)[-1] 245 | else: 246 | begin_prefix = "" 247 | additional_suffix = "" 248 | 249 | train_datasets, eval_datasets, test_datasets = get_preprocessed_data(train_raw_data, valid_raw_data, test_raw_data, pairs, tokenizer, shots_eval_dict, data_args, training_args, model_args) 250 | metric = evaluate.load("sacrebleu") 251 | 252 | # Load model 253 | model = load_model(data_args, model_args, training_args, tokenizer, logger) 254 | collate_fn = DataCollatorForUL2(model, tokenizer) if data_args.use_ul2 else default_data_collator 255 | 256 | # Initialize our Trainer 257 | trainer = LlmmtTrainer( 258 | model=model, 259 | args=training_args, 260 | train_dataset=train_datasets if training_args.do_train else None, 261 | eval_dataset=eval_datasets if training_args.do_eval else None, 262 | tokenizer=tokenizer, 263 | data_collator=collate_fn, 264 | callbacks=[SavePeftModelCallback] if model_args.use_peft else None, 265 | ) 266 | 267 | # Training 268 | if training_args.do_train: 269 | checkpoint = None 270 | if training_args.resume_from_checkpoint is not None: 271 | checkpoint = training_args.resume_from_checkpoint 272 | 273 | train_result = trainer.train(resume_from_checkpoint=checkpoint) 274 | 275 | trainer.save_state() 276 | if model_args.use_peft: 277 | model.save_pretrained(training_args.output_dir) 278 | else: 279 | trainer.save_model() # Saves the tokenizer too for easy upload 280 | # Prediction 281 | if training_args.do_predict: 282 | trainer.args.prediction_loss_only = False 283 | if data_args.mmt_data_path: 284 | lg_pairs = sorted(test_datasets["mmt"].keys()) # make sure each device prints in the same order 285 | for lg_pair in lg_pairs: 286 | test_dataset = test_datasets["mmt"][lg_pair] 287 | src_lang, tgt_lang = lg_pair.split("-") 288 | logger.info(f"*** Prediction for {lg_pair} ***") 289 | if model_args.encoder_decoder_type == "nllb": 290 | preds, _, _ = trainer.predict( 291 | test_dataset=test_dataset, 292 | max_new_tokens=data_args.max_new_tokens, 293 | num_beams=data_args.num_beams, 294 | metric_key_prefix="test", 295 | use_cache=True, 296 | forced_bos_token_id=tokenizer.lang_code_to_id[NLLB_CODE[tgt_lang]], 297 | ) 298 | else: 299 | preds, _, _ = trainer.predict( 300 | test_dataset=test_dataset, 301 | max_new_tokens=data_args.max_new_tokens, 302 | num_beams=data_args.num_beams, 303 | metric_key_prefix="test", 304 | use_cache=True, 305 | ) 306 | 307 | # Replace -100s used for padding as we can't decode them 308 | if int(torch.cuda.current_device()) == 0: 309 | preds = np.where(preds != -100, preds, tokenizer.pad_token_id) 310 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 311 | 312 | # Some simple post-processing 313 | decoded_preds = [pred.strip() for pred in decoded_preds] 314 | 315 | for idx in range(data_args.display_num_translations): 316 | print("------------------------") 317 | print(decoded_preds[idx]) 318 | 319 | with open(os.path.join(training_args.output_dir, f"test-{src_lang}-{tgt_lang}{data_args.suffix_eval_file}"), "w", encoding="utf-8") as f: 320 | suffix = get_key_suffix(tgt_lang, data_args, additional_suffix) 321 | if len(shots_eval_dict) != 0: 322 | split_idx = len(shots_eval_dict[lg_pair]) + 1 323 | else: 324 | split_idx = 1 325 | for pred in decoded_preds: 326 | # For encoder-decoder models the output is the translation itself; otherwise it is prompt + output and must be cleaned 327 | pred = clean_outputstring(pred, suffix, logger, split_idx) if not model_args.encoder_decoder_type else pred.strip() 328 | f.writelines([pred, "\n"]) 329 |
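# A worked sketch of the chat-style splitting used in the Aya block below, assuming a
# Llama-2-style template where begin_prefix == "[INST]" and additional_suffix == "[/INST]"
# (the actual values are derived from the tokenizer's chat template earlier in main()):
#   pred     = "[INST] Translate this. [/INST] Translation here."
#   question = pred.split("[/INST]")[0].split("[INST]")[1].strip()  # -> "Translate this."
#   response = pred.split("[/INST]")[1].strip()                     # -> "Translation here."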
330 | if data_args.aya_datasets: 331 | langs = sorted(test_datasets["aya"].keys()) # make sure each device prints in the same order 332 | for lg in langs: 333 | test_dataset = test_datasets["aya"][lg] 334 | logger.info(f"*** Prediction aya for {lg} ***") 335 | preds, _, _ = trainer.predict( 336 | test_dataset=test_dataset, 337 | max_new_tokens=data_args.max_new_tokens, 338 | num_beams=data_args.num_beams, 339 | metric_key_prefix="test", 340 | use_cache=True, 341 | ) 342 | 343 | # Replace -100s used for padding as we can't decode them 344 | if int(torch.cuda.current_device()) == 0: 345 | preds = np.where(preds != -100, preds, tokenizer.pad_token_id) 346 | decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True) 347 | 348 | # Some simple post-processing 349 | decoded_preds = [pred.strip() for pred in decoded_preds] 350 | 351 | for idx in range(data_args.display_num_translations): 352 | print("------------------------") 353 | print(decoded_preds[idx]) 354 | 355 | with jsonlines.open(os.path.join(training_args.output_dir, f"aya-test-{lg}.json"), "w") as f: 356 | for pred in decoded_preds: 357 | # For decoder-only models the prediction still contains the prompt, so split it into question/response below 358 | # pred = clean_outputstring(pred, suffix, logger, split_idx) if not model_args.encoder_decoder_type else pred.strip() 359 | try: 360 | if begin_prefix or additional_suffix: 361 | question = pred.split(additional_suffix)[0].split(begin_prefix)[1].strip() 362 | response = pred.split(additional_suffix)[1].strip() 363 | else: 364 | question = "" 365 | response = pred.strip() 366 | json_input = { 367 | "question": question, 368 | "response": response, 369 | } 370 | f.write(json_input) 371 | except Exception: 372 | json_input = { 373 | "question": pred, 374 | "response": "TODO", 375 | } 376 | f.write(json_input) 377 | print(f"Error in saving aya test {lg} json file.
The output is {pred}") 378 | continue 379 | 380 | 381 | def _mp_fn(index): 382 | # For xla_spawn (TPUs) 383 | main() 384 | 385 | 386 | if __name__ == "__main__": 387 | main() 388 | 389 | -------------------------------------------------------------------------------- /runs/cpo_ft.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./alma-7b-dpo-ft"} 2 | pairs=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | # random port between 30000 and 50000 4 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 5 | 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config_bf16.yaml \ 7 | run_cpo_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-13B-Pretrain \ 9 | --tokenizer_name haoranxu/ALMA-13B-Pretrain \ 10 | --peft_model_id haoranxu/ALMA-13B-Pretrain-LoRA \ 11 | --cpo_scorer kiwi_xcomet \ 12 | --beta 0.1 \ 13 | --use_peft \ 14 | --use_fast_tokenizer False \ 15 | --cpo_data_path haoranxu/ALMA-R-Preference \ 16 | --do_train \ 17 | --language_pairs ${pairs} \ 18 | --low_cpu_mem_usage \ 19 | --bf16 \ 20 | --learning_rate 1e-4 \ 21 | --weight_decay 0.01 \ 22 | --gradient_accumulation_steps 1 \ 23 | --lr_scheduler_type inverse_sqrt \ 24 | --warmup_ratio 0.01 \ 25 | --ignore_pad_token_for_loss \ 26 | --ignore_prompt_token_for_loss \ 27 | --per_device_train_batch_size 2 \ 28 | --evaluation_strategy no \ 29 | --save_strategy steps \ 30 | --save_total_limit 1 \ 31 | --logging_strategy steps \ 32 | --logging_steps 0.05 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --num_train_epochs 1 \ 35 | --prediction_loss_only \ 36 | --max_new_tokens 256 \ 37 | --max_source_length 256 \ 38 | --max_prompt_length 256 \ 39 | --max_length 512 \ 40 | --seed 42 \ 41 | --overwrite_output_dir \ 42 | --report_to none \ 43 | --overwrite_cache -------------------------------------------------------------------------------- /runs/mono_ft.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./llama-2-7b-oscar-ft"} 2 | # random port between 30000 and 50000 3 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 4 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config.yaml \ 5 | run_llmmt.py \ 6 | --model_name_or_path meta-llama/Llama-2-7b-hf \ 7 | --oscar_data_path oscar-corpus/OSCAR-2301 \ 8 | --oscar_data_lang en,ru,cs,zh,is,de \ 9 | --interleave_probs "0.17,0.22,0.14,0.19,0.08,0.2" \ 10 | --streaming \ 11 | --max_steps 600000 \ 12 | --do_train \ 13 | --low_cpu_mem_usage \ 14 | --fp16 \ 15 | --learning_rate 2e-5 \ 16 | --weight_decay 0.01 \ 17 | --gradient_accumulation_steps 4 \ 18 | --lr_scheduler_type cosine \ 19 | --warmup_ratio 0.01 \ 20 | --ignore_pad_token_for_loss \ 21 | --ignore_prompt_token_for_loss \ 22 | --per_device_train_batch_size 4 \ 23 | --per_device_eval_batch_size 4 \ 24 | --save_strategy steps \ 25 | --save_steps 2000 \ 26 | --save_total_limit 1 \ 27 | --logging_strategy steps \ 28 | --logging_steps 1 \ 29 | --output_dir ${OUTPUT_DIR} \ 30 | --max_new_tokens 256 \ 31 | --max_source_length 256 \ 32 | --seed 42 \ 33 | --overwrite_output_dir \ 34 | --report_to none 35 | -------------------------------------------------------------------------------- /runs/parallel_ft.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./alma-7b-parallel-ft"} 2 | pairs=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | 4 | # random port between 30000 and 50000 5 | 
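# ($RANDOM is uniform over 0..32767, so RANDOM % 20001 falls in 0..20000 and the +30000
#  offset lands in [30000, 50000]; the modulo makes the low end of the range slightly more
#  likely, which is harmless when picking a free rendezvous port.)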
port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 6 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config.yaml \ 7 | run_llmmt.py \ 8 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 9 | --mmt_data_path ./human_written_data/ \ 10 | --do_train \ 11 | --do_eval \ 12 | --do_predict \ 13 | --language_pairs ${pairs} \ 14 | --load_best_model_at_end \ 15 | --low_cpu_mem_usage \ 16 | --fp16 \ 17 | --learning_rate 2e-5 \ 18 | --weight_decay 0.01 \ 19 | --gradient_accumulation_steps 4 \ 20 | --lr_scheduler_type inverse_sqrt \ 21 | --warmup_ratio 0.01 \ 22 | --ignore_pad_token_for_loss \ 23 | --ignore_prompt_token_for_loss \ 24 | --per_device_train_batch_size 4 \ 25 | --per_device_eval_batch_size 4 \ 26 | --evaluation_strategy steps \ 27 | --eval_steps 0.1 \ 28 | --save_strategy steps \ 29 | --save_steps 0.1 \ 30 | --save_total_limit 1 \ 31 | --logging_strategy steps \ 32 | --logging_steps 0.05 \ 33 | --output_dir ${OUTPUT_DIR} \ 34 | --num_train_epochs 1 \ 35 | --predict_with_generate \ 36 | --prediction_loss_only \ 37 | --max_new_tokens 256 \ 38 | --max_source_length 256 \ 39 | --seed 42 \ 40 | --overwrite_output_dir \ 41 | --num_beams 5 \ 42 | --ddp_timeout 999999 \ 43 | --report_to none \ 44 | --overwrite_cache 45 | 46 | -------------------------------------------------------------------------------- /runs/parallel_ft_lora.sh: -------------------------------------------------------------------------------- 1 | OUTPUT_DIR=${1:-"./alma-7b-parallel-ft-lora"} 2 | pairs=${2:-"de-en,cs-en,is-en,zh-en,ru-en,en-de,en-cs,en-is,en-zh,en-ru"} 3 | LORA_RANK=${3:-"16"} 4 | 5 | # random port between 30000 and 50000 6 | port=$(( RANDOM % (50000 - 30000 + 1 ) + 30000 )) 7 | 8 | accelerate launch --main_process_port ${port} --config_file configs/deepspeed_train_config_bf16.yaml \ 9 | run_llmmt.py \ 10 | --model_name_or_path haoranxu/ALMA-7B-Pretrain \ 11 | --mmt_data_path ./human_written_data/ \ 12 | --use_peft \ 13 | --lora_rank ${LORA_RANK} \ 14 | --do_train \ 15 | --do_eval \ 16 | --do_predict \ 17 | --language_pairs ${pairs} \ 18 | --load_best_model_at_end \ 19 | --low_cpu_mem_usage \ 20 | --fp16 \ 21 | --learning_rate 2e-3 \ 22 | --weight_decay 0.01 \ 23 | --gradient_accumulation_steps 4 \ 24 | --lr_scheduler_type inverse_sqrt \ 25 | --warmup_ratio 0.01 \ 26 | --ignore_pad_token_for_loss \ 27 | --ignore_prompt_token_for_loss \ 28 | --per_device_train_batch_size 4 \ 29 | --per_device_eval_batch_size 4 \ 30 | --evaluation_strategy steps \ 31 | --eval_steps 0.05 \ 32 | --save_strategy steps \ 33 | --save_steps 0.05 \ 34 | --save_total_limit 1 \ 35 | --logging_strategy steps \ 36 | --logging_steps 0.05 \ 37 | --output_dir ${OUTPUT_DIR} \ 38 | --num_train_epochs 1 \ 39 | --predict_with_generate \ 40 | --prediction_loss_only \ 41 | --max_new_tokens 256 \ 42 | --max_source_length 256 \ 43 | --seed 42 \ 44 | --overwrite_output_dir \ 45 | --num_beams 5 \ 46 | --ddp_timeout 999999 \ 47 | --report_to none \ 48 | --overwrite_cache 49 | 50 | ## Evaluation (BLEU, COMET) 51 | bash ./evals/eval_generation.sh ${OUTPUT_DIR} ${pairs} -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fe1ixxu/ALMA/e4545ca1f9799f1c7052e16d757ea3ed6b6b4286/utils/__init__.py -------------------------------------------------------------------------------- /utils/arguments.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional 3 | from transformers import MODEL_FOR_CAUSAL_LM_MAPPING 4 | from transformers.utils.versions import require_version 5 | 6 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) 7 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 8 | @dataclass 9 | class ModelArguments: 10 | """ 11 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 12 | """ 13 | 14 | model_name_or_path: Optional[str] = field( 15 | default=None, 16 | metadata={ 17 | "help": ( 18 | "The model checkpoint for weights initialization. Don't set if you want to train a model from scratch." 19 | ) 20 | }, 21 | ) 22 | model_type: Optional[str] = field( 23 | default=None, 24 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 25 | ) 26 | config_overrides: Optional[str] = field( 27 | default=None, 28 | metadata={ 29 | "help": ( 30 | "Override some existing default config settings when a model is trained from scratch. Example: " 31 | "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" 32 | ) 33 | }, 34 | ) 35 | config_name: Optional[str] = field( 36 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 37 | ) 38 | tokenizer_name: Optional[str] = field( 39 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 40 | ) 41 | cache_dir: Optional[str] = field( 42 | default=None, 43 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 44 | ) 45 | use_fast_tokenizer: bool = field( 46 | default=True, 47 | metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."}, 48 | ) 49 | model_revision: str = field( 50 | default="main", 51 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 52 | ) 53 | use_auth_token: bool = field( 54 | default=False, 55 | metadata={ 56 | "help": ( 57 | "Will use the token generated when running `huggingface-cli login` (necessary to use this script " 58 | "with private models)." 59 | ) 60 | }, 61 | ) 62 | use_flash_attention_2: bool = field( 63 | default=False, 64 | metadata={ 65 | "help": ( 66 | "Will enable Flash Attention 2" 67 | ) 68 | }, 69 | ) 70 | torch_dtype: Optional[str] = field( 71 | default=None, 72 | metadata={ 73 | "help": ( 74 | "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " 75 | "dtype will be automatically derived from the model's weights." 76 | ), 77 | "choices": ["auto", "bfloat16", "float16", "float32"], 78 | }, 79 | ) 80 | low_cpu_mem_usage: bool = field( 81 | default=False, 82 | metadata={ 83 | "help": ( 84 | "It is an option to create the model as an empty shell, then only materialize its parameters when the pretrained weights are loaded." 85 | " Setting it to True will benefit LLM loading time and RAM consumption."
86 | ) 87 | }, 88 | ) 89 | encoder_decoder_type: str = field( 90 | default="", 91 | metadata={"help": "The encoder-decoder model type (e.g., 'nllb'); leave empty for decoder-only models."}, 92 | ) 93 | 94 | chat_style: bool = field( 95 | default=False, 96 | metadata={ 97 | "help": ( 98 | "Whether to use chat-style decoding" 99 | ) 100 | }, 101 | ) 102 | 103 | load_in_8bit: bool = field( 104 | default=False, 105 | metadata={ 106 | "help": ( 107 | "Whether to load the model in int8" 108 | ) 109 | }, 110 | ) 111 | use_peft: bool = field( 112 | default=False, 113 | metadata={ 114 | "help": ( 115 | "Whether to use PEFT (parameter-efficient fine-tuning)" 116 | ) 117 | }, 118 | ) 119 | lora_rank: int = field( 120 | default=16, 121 | metadata={ 122 | "help": ( 123 | "The rank for LoRA" 124 | ) 125 | }, 126 | ) 127 | multi_gpu_one_model: bool = field( 128 | default=False, 129 | metadata={ 130 | "help": "Use multiple GPUs to load one model." 131 | }, 132 | ) 133 | peft_model_id: str = field( 134 | default="", 135 | metadata={ 136 | "help": ( 137 | "PEFT model location" 138 | ) 139 | }, 140 | ) 141 | def __post_init__(self): 142 | if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): 143 | raise ValueError( 144 | "--config_overrides can't be used in combination with --config_name or --model_name_or_path" 145 | ) 146 | 147 | 148 | @dataclass 149 | class DataTrainingArguments: 150 | """ 151 | Arguments pertaining to what data we are going to input our model for training and eval. 152 | """ 153 | language_pairs: str = field(default="", metadata={"help": "training language pairs"}) 154 | dataset_name: Optional[str] = field( 155 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 156 | ) 157 | dataset_config_name: Optional[str] = field( 158 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 159 | ) 160 | mmt_data_path: Optional[str] = field(default=None, metadata={"help": "The input MMT training data path."}) 161 | override_test_data_path: Optional[str] = field(default=None, metadata={"help": "This will override the default test data in the mmt data"}) 162 | cpo_data_path: Optional[str] = field(default=None, metadata={"help": "The input CPO training data path."}) 163 | mono_data_path: Optional[str] = field(default=None, metadata={"help": "The input monolingual training data path."}) 164 | oscar_data_path: Optional[str] = field(default=None, metadata={"help": "The input Oscar monolingual dataset name."}) 165 | nllb_pretrain_data_path: Optional[str] = field(default=None, metadata={"help": "The input NLLB pretrain (parallel) data path."}) 166 | oscar_data_lang: Optional[str] = field(default=None, metadata={"help": "The Oscar monolingual data languages, split by commas."}) 167 | text_test_file: Optional[str] = field(default=None, metadata={"help": "A single test data file in text format; this will override mmt_data_path and override_test_data_path"}) 168 | aya_datasets: Optional[str] = field(default=None, metadata={"help": "The datasets for the Aya model."}) 169 | 170 | max_train_samples: Optional[int] = field( 171 | default=None, 172 | metadata={ 173 | "help": ( 174 | "For debugging purposes or quicker training, truncate the number of training examples to this " 175 | "value if set."
176 | ) 177 | }, 178 | ) 179 | max_eval_samples: Optional[int] = field( 180 | default=None, 181 | metadata={ 182 | "help": ( 183 | "For debugging purposes or quicker training, truncate the number of evaluation examples to this " 184 | "value if set." 185 | ) 186 | }, 187 | ) 188 | max_test_samples: Optional[int] = field( 189 | default=None, 190 | metadata={ 191 | "help": ( 192 | "For debugging purposes, truncate the number of test examples to this " 193 | "value if set." 194 | ) 195 | }, 196 | ) 197 | streaming: bool = field(default=False, metadata={"help": "Enable streaming mode"}) 198 | block_size: Optional[int] = field( 199 | default=None, 200 | metadata={ 201 | "help": ( 202 | "Optional input sequence length after tokenization. " 203 | "The training dataset will be truncated in blocks of this size for training. " 204 | "Defaults to the model max input length for single sentence inputs (take into account special tokens)." 205 | ) 206 | }, 207 | ) 208 | overwrite_cache: bool = field( 209 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 210 | ) 211 | validation_split_percentage: Optional[int] = field( 212 | default=5, 213 | metadata={ 214 | "help": "The percentage of the train set used as validation set in case there's no validation split" 215 | }, 216 | ) 217 | preprocessing_num_workers: Optional[int] = field( 218 | default=None, 219 | metadata={"help": "The number of processes to use for the preprocessing."}, 220 | ) 221 | keep_linebreaks: bool = field( 222 | default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."} 223 | ) 224 | ignore_pad_token_for_loss: bool = field( 225 | default=True, 226 | metadata={ 227 | "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." 228 | }, 229 | ) 230 | ignore_prompt_token_for_loss: bool = field( 231 | default=False, 232 | metadata={ 233 | "help": "Whether to ignore the prompt tokens in the loss computation or not." 234 | }, 235 | ) 236 | use_ul2: bool = field( 237 | default=False, 238 | metadata={ 239 | "help": "Whether to enable the mixture of denoisers from the UL2 model." 240 | }, 241 | ) 242 | max_source_length: Optional[int] = field( 243 | default=256, 244 | metadata={ 245 | "help": ( 246 | "The maximum total input sequence length after tokenization. Sequences longer " 247 | "than this will be truncated, sequences shorter will be padded." 248 | ) 249 | }, 250 | ) 251 | max_new_tokens: Optional[int] = field( 252 | default=256, 253 | metadata={ 254 | "help": ( 255 | "The maximum number of new tokens to generate, excluding the prompt." 256 | ) 257 | }, 258 | ) 259 | num_beams: Optional[int] = field( 260 | default=5, 261 | metadata={ 262 | "help": ( 263 | "Beam size for generation" 264 | ) 265 | } 266 | ) 267 | 268 | display_num_translations: Optional[int] = field( 269 | default=10, 270 | metadata={ 271 | "help": ( 272 | "The number of translations to display after generation." 273 | ) 274 | } 275 | ) 276 | 277 | right_pad: bool = field( 278 | default=False, 279 | metadata={ 280 | "help": "Use right padding for training, especially for models like MPT." 281 | }, 282 | ) 283 | 284 | use_prefix_lm: bool = field( 285 | default=False, 286 | metadata={ 287 | "help": "Use a prefix language model, especially for models like MPT."
288 | }, 289 | ) 290 | few_shot_eval_path: str = field( 291 | default="", 292 | metadata={ 293 | "help": "The path for few-shot evaluation" 294 | }, 295 | ) 296 | use_target_lang_prompt_eval: bool = field( 297 | default=False, 298 | metadata={ 299 | "help": "Enable prompts in the target language, e.g., in Chinese, the prompt is 将其从英语翻译成汉语 (Translate this from English to Chinese): ......" 300 | }, 301 | ) 302 | 303 | interleave_probs: str = field( 304 | default="", 305 | metadata={ 306 | "help": "Using interleaving to concatenate datasets, with probabilities p1,p2,p3,..., split by commas" 307 | }, 308 | ) 309 | 310 | nllb_interleave_probs: str = field( 311 | default="", 312 | metadata={ 313 | "help": "Using interleaving to concatenate datasets, with probabilities p1,p2,p3,..., split by commas, for NLLB" 314 | }, 315 | ) 316 | 317 | suffix_eval_file: str = field( 318 | default="", 319 | metadata={ 320 | "help": "The suffix for the eval file: test-src-tgt'suffix_eval_file'" 321 | }, 322 | ) 323 | 324 | cpo_scorer: str = field( 325 | default="xcomet_kiwi", 326 | metadata={ 327 | "help": "The scorer for CPO training, e.g., xcomet, kiwi, or both of them (xcomet_kiwi)" 328 | }, 329 | ) 330 | 331 | 332 | # predict_source_lang: str = field(default="", metadata={"help": "The source language for testing"}) 333 | # predict_target_lang: str = field(default="en", metadata={"help": "The target language for testing"}) 334 | 335 | suffix: Optional[str] = field(default="", metadata={"help": "The suffix of the training file."}) 336 | 337 | def __post_init__(self): 338 | if self.streaming: 339 | require_version("datasets>=2.0.0", "The streaming feature requires `datasets>=2.0.0`") 340 | -------------------------------------------------------------------------------- /utils/cpo_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from dataclasses import dataclass 15 | from typing import Dict, Literal, Optional 16 | 17 | from transformers import TrainingArguments 18 | 19 | 20 | @dataclass 21 | class CPOConfig(TrainingArguments): 22 | r""" 23 | CPOConfig collects all training arguments related to the [`CPOTrainer`] class. 24 | 25 | Using [`HfArgumentParser`] we can turn this class into 26 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 27 | command line. 28 | 29 | Parameters: 30 | max_length (`int`, defaults to `None`): 31 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 32 | max_prompt_length (`int`, defaults to `None`): 33 | The maximum length of the prompt. This argument is required if you want to use the default data collator. 34 | max_target_length (`int`, defaults to `None`): 35 | The maximum length of the target.
This argument is required if you want to use the default data collator and your model is an encoder-decoder. 36 | beta (`float`, defaults to 0.1): 37 | The beta factor in CPO loss. 38 | label_smoothing (`float`, defaults to 0): 39 | The label smoothing factor. This argument is required if you want to use the default data collator. 40 | loss_type (`str`, defaults to `sigmoid`): 41 | The type of loss to use. This argument is required if you want to use the default data collator. 42 | label_pad_token_id (`int`, defaults to `-100`): 43 | The label pad token id. This argument is required if you want to use the default data collator. 44 | cpo_alpha (`float`, defaults to `1.0`): 45 | A hyperparameter that controls the strength of the BC regularizer in CPO training. 46 | simpo_gamma (`float`, defaults to `0.5`): 47 | A target reward margin for the SimPO loss, used only when the "simpo" option is enabled. 48 | padding_value (`int`, defaults to `None`): 49 | The padding value if it is different to the tokenizer's pad_token_id. 50 | truncation_mode (`str`, defaults to `keep_end`): 51 | The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. 52 | generate_during_eval (`bool`, defaults to `False`): 53 | Whether to sample and log generations during evaluation step. 54 | is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): 55 | If no model is provided, we need to know if the model_init returns an encoder-decoder. 56 | disable_dropout (`bool`, defaults to `True`): 57 | Whether or not to disable dropouts in `model`. 58 | model_init_kwargs (`Optional[Dict]`, *optional*): 59 | Dict of Optional kwargs to pass when instantiating the model from a string 60 | dataset_num_proc (`Optional[int]`, *optional*): 61 | The number of workers to use to tokenize the data. Defaults to None. 62 | """ 63 | 64 | max_length: Optional[int] = None 65 | max_prompt_length: Optional[int] = None 66 | max_completion_length: Optional[int] = None 67 | max_target_length: Optional[int] = None 68 | 69 | beta: float = 0.1 70 | label_smoothing: float = 0 71 | loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid" 72 | disable_dropout: bool = True 73 | cpo_alpha: float = 1.0 74 | simpo_gamma: float = 0.5 75 | relax_cofficient_1: float = 0.9 76 | relax_cofficient_2: float = 0.4 77 | 78 | label_pad_token_id: int = -100 79 | padding_value: int = None 80 | truncation_mode: str = "keep_end" 81 | generate_during_eval: bool = False 82 | is_encoder_decoder: Optional[bool] = None 83 | 84 | model_init_kwargs: Optional[Dict] = None 85 | 86 | dataset_num_proc: Optional[int] = None 87 | 88 | def __post_init__(self): 89 | if self.loss_type == "kto_pair": 90 | raise ValueError("Support for kto_pair has been removed in CPOTrainer. Please use KTOTrainer.") 91 | return super().__post_init__() 92 | -------------------------------------------------------------------------------- /utils/trainer_llmmt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from copy import deepcopy 16 | from pathlib import Path 17 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 18 | 19 | import torch 20 | from torch import nn 21 | from torch.utils.data import Dataset 22 | 23 | from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled  # `transformers.deepspeed` was removed; the helper now lives under `transformers.integrations` 24 | from transformers.generation.configuration_utils import GenerationConfig 25 | from transformers.trainer import Trainer 26 | from transformers.utils import logging 27 | 28 | 29 | if TYPE_CHECKING: 30 | from transformers.data.data_collator import DataCollator 31 | from transformers.modeling_utils import PreTrainedModel 32 | from transformers.tokenization_utils_base import PreTrainedTokenizerBase 33 | from transformers.trainer_callback import TrainerCallback 34 | from transformers.trainer_utils import EvalPrediction, PredictionOutput 35 | from transformers.training_args import TrainingArguments 36 | 37 | 38 | logger = logging.get_logger(__name__) 39 | 40 | 41 | class LlmmtTrainer(Trainer): 42 | def __init__( 43 | self, 44 | model: Union["PreTrainedModel", nn.Module] = None, 45 | args: "TrainingArguments" = None, 46 | data_collator: Optional["DataCollator"] = None, 47 | train_dataset: Optional[Dataset] = None, 48 | eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, 49 | tokenizer: Optional["PreTrainedTokenizerBase"] = None, 50 | model_init: Optional[Callable[[], "PreTrainedModel"]] = None, 51 | compute_metrics: Optional[Callable[["EvalPrediction"], Dict]] = None, 52 | callbacks: Optional[List["TrainerCallback"]] = None, 53 | optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), 54 | preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, 55 | ): 56 | super().__init__( 57 | model=model, 58 | args=args, 59 | data_collator=data_collator, 60 | train_dataset=train_dataset, 61 | eval_dataset=eval_dataset, 62 | tokenizer=tokenizer, 63 | model_init=model_init, 64 | compute_metrics=compute_metrics, 65 | callbacks=callbacks, 66 | optimizers=optimizers, 67 | preprocess_logits_for_metrics=preprocess_logits_for_metrics, 68 | ) 69 | 70 | # Override self.model.generation_config if a GenerationConfig is specified in args. 71 | # Priority: args.generation_config > model.generation_config > default GenerationConfig. 72 | if self.args.generation_config is not None: 73 | gen_config = self.load_generation_config(self.args.generation_config) 74 | self.model.generation_config = gen_config 75 | 76 | @staticmethod 77 | def load_generation_config(gen_config_arg: Union[str, GenerationConfig]) -> GenerationConfig: 78 | """ 79 | Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments. 80 | 81 | Args: 82 | gen_config_arg (`str` or [`~generation.GenerationConfig`]): 83 | `Seq2SeqTrainingArguments.generation_config` argument. 84 | 85 | Returns: 86 | A `~generation.GenerationConfig`.
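Example (illustrative paths; any local file, directory, or Hub id that resolves to a
generation config works):

    load_generation_config(GenerationConfig(num_beams=5))   # returned as a deep copy
    load_generation_config("ckpt/generation_config.json")   # file -> parent dir + file name
    load_generation_config("ckpt/")                         # dir -> default config file name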
87 | """ 88 | 89 | # GenerationConfig provided, nothing to do 90 | if isinstance(gen_config_arg, GenerationConfig): 91 | return deepcopy(gen_config_arg) 92 | 93 | # str or Path 94 | pretrained_model_name = Path(gen_config_arg) if isinstance(gen_config_arg, str) else gen_config_arg 95 | config_file_name = None 96 | 97 | # Figuring if it is path pointing to a file, pointing to a directory or else a model id or URL 98 | # This step is required in order to determine config_file_name 99 | if pretrained_model_name.is_file(): 100 | config_file_name = pretrained_model_name.name 101 | pretrained_model_name = pretrained_model_name.parent 102 | # dir path 103 | elif pretrained_model_name.is_dir(): 104 | pass 105 | # model id or URL 106 | else: 107 | pretrained_model_name = gen_config_arg 108 | 109 | gen_config = GenerationConfig.from_pretrained(pretrained_model_name, config_file_name) 110 | return gen_config 111 | 112 | def evaluate( 113 | self, 114 | eval_dataset: Optional[Dataset] = None, 115 | ignore_keys: Optional[List[str]] = None, 116 | metric_key_prefix: str = "eval", 117 | **gen_kwargs, 118 | ) -> Dict[str, float]: 119 | """ 120 | Run evaluation and returns metrics. 121 | 122 | The calling script will be responsible for providing a method to compute metrics, as they are task-dependent 123 | (pass it to the init `compute_metrics` argument). 124 | 125 | You can also subclass and override this method to inject custom behavior. 126 | 127 | Args: 128 | eval_dataset (`Dataset`, *optional*): 129 | Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns 130 | not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` 131 | method. 132 | ignore_keys (`List[str]`, *optional*): 133 | A list of keys in the output of your model (if it is a dictionary) that should be ignored when 134 | gathering predictions. 135 | metric_key_prefix (`str`, *optional*, defaults to `"eval"`): 136 | An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named 137 | "eval_bleu" if the prefix is `"eval"` (default) 138 | max_length (`int`, *optional*): 139 | The maximum target length to use when predicting with the generate method. 140 | num_beams (`int`, *optional*): 141 | Number of beams for beam search that will be used when predicting with the generate method. 1 means no 142 | beam search. 143 | gen_kwargs: 144 | Additional `generate` specific kwargs. 145 | 146 | Returns: 147 | A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The 148 | dictionary also contains the epoch number which comes from the training state. 149 | """ 150 | 151 | gen_kwargs = gen_kwargs.copy() 152 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 153 | gen_kwargs["max_length"] = self.args.generation_max_length 154 | gen_kwargs["num_beams"] = ( 155 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams 156 | ) 157 | self._gen_kwargs = gen_kwargs 158 | 159 | return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) 160 | 161 | def predict( 162 | self, 163 | test_dataset: Dataset, 164 | ignore_keys: Optional[List[str]] = None, 165 | metric_key_prefix: str = "test", 166 | **gen_kwargs, 167 | ) -> "PredictionOutput": 168 | """ 169 | Run prediction and returns predictions and potential metrics. 
170 | 171 | Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method 172 | will also return metrics, like in `evaluate()`. 173 | 174 | Args: 175 | test_dataset (`Dataset`): 176 | Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the 177 | `model.forward()` method are automatically removed. Has to implement the method `__len__` 178 | ignore_keys (`List[str]`, *optional*): 179 | A list of keys in the output of your model (if it is a dictionary) that should be ignored when 180 | gathering predictions. 181 | metric_key_prefix (`str`, *optional*, defaults to `"eval"`): 182 | An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named 183 | "eval_bleu" if the prefix is `"eval"` (default) 184 | max_length (`int`, *optional*): 185 | The maximum target length to use when predicting with the generate method. 186 | num_beams (`int`, *optional*): 187 | Number of beams for beam search that will be used when predicting with the generate method. 1 means no 188 | beam search. 189 | gen_kwargs: 190 | Additional `generate` specific kwargs. 191 | 192 | 193 | 194 | If your predictions or labels have different sequence lengths (for instance because you're doing dynamic 195 | padding in a token classification task) the predictions will be padded (on the right) to allow for 196 | concatenation into one array. The padding index is -100. 197 | 198 | 199 | 200 | Returns: *NamedTuple* A namedtuple with the following keys: 201 | 202 | - predictions (`np.ndarray`): The predictions on `test_dataset`. 203 | - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). 204 | - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained 205 | labels). 206 | """ 207 | 208 | gen_kwargs = gen_kwargs.copy() 209 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 210 | gen_kwargs["max_length"] = self.args.generation_max_length 211 | gen_kwargs["num_beams"] = ( 212 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.args.generation_num_beams 213 | ) 214 | self._gen_kwargs = gen_kwargs 215 | 216 | return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) 217 | 218 | def prediction_step( 219 | self, 220 | model: nn.Module, 221 | inputs: Dict[str, Union[torch.Tensor, Any]], 222 | prediction_loss_only: bool, 223 | ignore_keys: Optional[List[str]] = None, 224 | ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: 225 | """ 226 | Perform an evaluation step on `model` using `inputs`. 227 | 228 | Subclass and override to inject custom behavior. 229 | 230 | Args: 231 | model (`nn.Module`): 232 | The model to evaluate. 233 | inputs (`Dict[str, Union[torch.Tensor, Any]]`): 234 | The inputs and targets of the model. 235 | 236 | The dictionary will be unpacked before being fed to the model. Most models expect the targets under the 237 | argument `labels`. Check your model's documentation for all accepted arguments. 238 | prediction_loss_only (`bool`): 239 | Whether or not to return the loss only. 240 | 241 | Return: 242 | Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and 243 | labels (each being optional). 
244 | """ 245 | 246 | if not self.args.predict_with_generate or prediction_loss_only: 247 | return super().prediction_step( 248 | model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys 249 | ) 250 | 251 | has_labels = "labels" in inputs 252 | inputs = self._prepare_inputs(inputs) 253 | 254 | # XXX: adapt synced_gpus for fairscale as well 255 | # Priority (handled in generate): 256 | # gen_kwargs > model.generation_config > default GenerationConfig() 257 | gen_kwargs = self._gen_kwargs.copy() 258 | if gen_kwargs.get("max_length") is None and gen_kwargs.get("max_new_tokens") is None: 259 | gen_kwargs["max_length"] = self.model.config.max_length 260 | gen_kwargs["num_beams"] = ( 261 | gen_kwargs["num_beams"] if gen_kwargs.get("num_beams") is not None else self.model.config.num_beams 262 | ) 263 | default_synced_gpus = True if is_deepspeed_zero3_enabled() else False 264 | gen_kwargs["synced_gpus"] = ( 265 | gen_kwargs["synced_gpus"] if gen_kwargs.get("synced_gpus") is not None else default_synced_gpus 266 | ) 267 | 268 | # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate 269 | # (otherwise, it would continue generating from the padded `decoder_input_ids`) 270 | if ( 271 | "labels" in inputs 272 | and "decoder_input_ids" in inputs 273 | and inputs["labels"].shape == inputs["decoder_input_ids"].shape 274 | ): 275 | inputs = {k: v for k, v in inputs.items() if k != "decoder_input_ids"} 276 | generated_tokens = self.model.generate(**inputs, **gen_kwargs) 277 | 278 | # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop 279 | # TODO: remove this hack when the legacy code that initializes generation_config from a model config is 280 | # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 281 | if self.model.generation_config._from_model_config: 282 | self.model.generation_config._from_model_config = False 283 | 284 | # Retrieves GenerationConfig from model.generation_config 285 | gen_config = self.model.generation_config 286 | # in case the batch is shorter than max length, the output should be padded 287 | if generated_tokens.shape[-1] < gen_config.max_length: 288 | generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) 289 | elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: 290 | generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) 291 | 292 | loss = None 293 | 294 | if self.args.prediction_loss_only: 295 | return loss, None, None 296 | 297 | if has_labels: 298 | labels = inputs["labels"] 299 | if labels.shape[-1] < gen_config.max_length: 300 | labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) 301 | elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: 302 | labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) 303 | else: 304 | labels = None 305 | 306 | return loss, generated_tokens, labels 307 | 308 | def _pad_tensors_to_max_len(self, tensor, max_length): 309 | if self.tokenizer is not None and hasattr(self.tokenizer, "pad_token_id"): 310 | # If PAD token is not defined at least EOS token has to be defined 311 | pad_token_id = ( 312 | self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id 313 | ) 314 
| else: 315 | if self.model.config.pad_token_id is not None: 316 | pad_token_id = self.model.config.pad_token_id 317 | else: 318 | raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") 319 | 320 | padded_tensor = pad_token_id * torch.ones( 321 | (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device 322 | ) 323 | padded_tensor[:, : tensor.shape[-1]] = tensor 324 | return padded_tensor 325 | -------------------------------------------------------------------------------- /utils/ul2collator.py: -------------------------------------------------------------------------------- 1 | import random 2 | from collections.abc import Mapping 3 | import numpy as np 4 | import torch 5 | from torch.nn import functional as F 6 | from dataclasses import dataclass 7 | from typing import Any, Dict, List, Optional, Tuple, Union 8 | from transformers.tokenization_utils_base import PreTrainedTokenizerBase 9 | from transformers import AutoModelForCausalLM 10 | from transformers.data.data_collator import ( 11 | DataCollatorMixin, 12 | _torch_collate_batch, 13 | ) 14 | import copy 15 | 16 | from transformers import default_data_collator 17 | from .utils import get_first_non_specical_index, get_first_special_index, get_first_special_index_batch 18 | 19 | def random_spans_noise_mask(length, mean_noise_span_length, noise_density): 20 | """ 21 | A copy from https://github.com/EleutherAI/oslo/blob/main/oslo/transformers/tasks/data_t5_pretraining.py#L230 (inception) 22 | This function is a copy of `random_spans_helper` from the T5 preprocessing code. 23 | Noise mask consisting of random spans of noise tokens. 24 | The number of noise tokens and the number of noise spans and non-noise spans 25 | are determined deterministically as follows: 26 | num_noise_tokens = round(length * noise_density) 27 | num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) 28 | Spans alternate between non-noise and noise, beginning with non-noise. 29 | Subject to the above restrictions, all masks are equally likely. 30 | Args: 31 | length: an int32 scalar (length of the incoming token sequence) 32 | mean_noise_span_length: a float - the average length of a noise span 33 | noise_density: a float - approximate density of output mask 34 | Returns: 35 | a boolean tensor with shape [length] 36 | """ 37 | 38 | orig_length = length 39 | 40 | num_noise_tokens = int(np.round(length * noise_density)) 41 | # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. 42 | num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) 43 | num_noise_spans = int(np.round(num_noise_tokens / mean_noise_span_length)) 44 | 45 | # avoid degeneracy by ensuring positive number of noise spans 46 | num_noise_spans = max(num_noise_spans, 1) 47 | num_nonnoise_tokens = length - num_noise_tokens 48 | 49 | # pick the lengths of the noise spans and the non-noise spans 50 | def _random_segmentation(num_items, num_segments): 51 | """Partition a sequence of items randomly into non-empty segments.
52 | Args: 53 | num_items: an integer scalar > 0 54 | num_segments: an integer scalar in [1, num_items] 55 | Returns: 56 | a Tensor with shape [num_segments] containing positive integers that add 57 | up to num_items 58 | """ 59 | mask_indices = np.arange(num_items - 1) < (num_segments - 1) 60 | np.random.shuffle(mask_indices) 61 | first_in_segment = np.pad(mask_indices, [[1, 0]]) 62 | segment_id = np.cumsum(first_in_segment) 63 | # count length of sub segments assuming that list is sorted 64 | _, segment_length = np.unique(segment_id, return_counts=True) 65 | return segment_length 66 | 67 | noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans) 68 | nonnoise_span_lengths = _random_segmentation( 69 | num_nonnoise_tokens, num_noise_spans 70 | ) 71 | 72 | interleaved_span_lengths = np.reshape( 73 | np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), 74 | [num_noise_spans * 2], 75 | ) 76 | span_starts = np.cumsum(interleaved_span_lengths)[:-1] 77 | span_start_indicator = np.zeros((length,), dtype=np.int8) 78 | span_start_indicator[span_starts] = True 79 | span_num = np.cumsum(span_start_indicator) 80 | is_noise = np.equal(span_num % 2, 1) 81 | 82 | return is_noise[:orig_length] 83 | 84 | @dataclass 85 | class DataCollatorForUL2(DataCollatorMixin): 86 | """ 87 | 88 | Data collator used for UL2 89 | 90 | """ 91 | model: AutoModelForCausalLM 92 | tokenizer: PreTrainedTokenizerBase 93 | r_denoising: bool = True 94 | r_probability: float = 0.25 95 | r_denoising_config: Tuple[Tuple] = ((3, 0.15),) 96 | s_denoising: bool = True 97 | s_probability: float = 0.5 98 | x_denoising: bool = True 99 | x_probability: float = 0.25 100 | x_denoising_config: Tuple[Tuple] = ((32, 0.5, 0.5),) 101 | pad_to_multiple_of: Optional[int] = None 102 | tf_experimental_compile: bool = False 103 | return_tensors: str = "pt" 104 | label_pad_token_id: int = -100 105 | 106 | def __post_init__(self): 107 | self.total_task = [0, 1, 2] 108 | task_prob = [] 109 | task_prob.append(self.r_probability if self.r_denoising else 0.0) 110 | task_prob.append(self.s_probability if self.s_denoising else 0.0) 111 | task_prob.append(self.x_probability if self.x_denoising else 0.0) 112 | self.task_prob = task_prob 113 | self.pad_token_id = self.tokenizer.pad_token_id 114 | self.decoder_start_token_id = self.tokenizer.bos_token_id 115 | 116 | def assign_task_type(self, batch_size: int): 117 | ''' 118 | Randomly assign S,R,X to each sentence based on weighted prob 119 | ''' 120 | return random.choices(self.total_task,weights=self.task_prob, k=batch_size) 121 | 122 | def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]: 123 | torch.set_printoptions(threshold=10_000) 124 | np.set_printoptions(threshold=10_000) 125 | if torch.rand(1) < -1 or not self.model.training: 126 | return default_data_collator(examples) 127 | 128 | # Handle dict or lists with proper padding and conversion to tensor. 
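# A sampling sketch with the default mixture (R=0.25, S=0.5, X=0.25): for a batch of 8,
# assign_task_type may draw e.g. [1, 0, 1, 2, 1, 1, 0, 2], i.e. each example independently
# becomes an R- (regular span corruption), S- (sequential prefix-LM), or X- (extreme)
# denoising instance; the concrete draw is illustrative, not deterministic.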
        # print(examples)
        task_ids = self.assign_task_type(len(examples))
        task_type = torch.tensor(task_ids)
        lengths = torch.tensor([len(e['input_ids']) for e in examples], dtype=torch.long)
        # Handle dict or lists with proper padding and conversion to tensor.
        if isinstance(examples[0], Mapping):
            batch = self.tokenizer.pad(examples, return_tensors="pt",
                                       pad_to_multiple_of=self.pad_to_multiple_of)
        else:
            batch = {
                "input_ids": _torch_collate_batch(examples, self.tokenizer,
                                                  pad_to_multiple_of=self.pad_to_multiple_of)
            }
        max_length = batch['input_ids'].shape[-1]

        # new_batch = copy.deepcopy(batch)
        new_batch = {
            "input_ids": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
            "labels": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
            "attention_mask": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
            "prefix_mask": torch.zeros(batch['input_ids'].shape, dtype=torch.long),
        }

        _, expanded_length = batch['input_ids'].shape
        input_ids = batch["input_ids"]
        r_denoising_idx = task_type == 0
        r_denoising_idx_num = torch.where(r_denoising_idx)[0]
        if r_denoising_idx.any():
            mask_indices = None
            sub_input_ids = input_ids[r_denoising_idx]
            # union of different denoising settings
            for (mean_span, noise) in self.r_denoising_config:
                _mask_indices = np.array([
                    random_spans_noise_mask(expanded_length, mean_span, noise) for _ in range(len(sub_input_ids))
                ])

                if mask_indices is None:
                    mask_indices = _mask_indices
                else:
                    mask_indices = mask_indices | _mask_indices

            # never mask beyond each example's first pad token
            valid_lengths = get_first_special_index_batch(sub_input_ids, self.pad_token_id)
            for idx, valid_len in enumerate(valid_lengths):
                mask_indices[idx, valid_len:] = False
            input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
            labels_mask = ~mask_indices
            labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))
            _sub_input_ids = self.filter_input_ids(sub_input_ids, input_ids_sentinel)
            _labels = self.filter_input_ids(sub_input_ids, labels_sentinel)

            labels = []
            _input_ids = []
            for idx, _label in enumerate(_labels):
                # pack the corrupted input and its target into one decoder-only
                # sequence; label positions covering the input are ignored (-100)
                label = _label[_label != self.pad_token_id]
                _sub_input_ids_idx = _sub_input_ids[idx][_sub_input_ids[idx] != self.pad_token_id]
                sub_input_len = len(_sub_input_ids_idx)
                _sub_input_ids_idx = np.concatenate((_sub_input_ids_idx, label))
                label = np.concatenate(([self.label_pad_token_id] * sub_input_len, label))
                new_batch['attention_mask'][r_denoising_idx_num[idx]][:len(label)] = 1
                new_batch["prefix_mask"][r_denoising_idx_num[idx]][:sub_input_len] = 1
                if len(label) > max_length:
                    label = torch.from_numpy(label[: max_length])
                    _sub_input_ids_idx = torch.from_numpy(_sub_input_ids_idx[: max_length])
                else:
                    diff = max_length - len(label)
                    label = F.pad(torch.from_numpy(label), (0, diff), 'constant', self.label_pad_token_id)
                    _sub_input_ids_idx = F.pad(torch.from_numpy(_sub_input_ids_idx), (0, diff), 'constant', self.pad_token_id)
                labels.append(label)
                _input_ids.append(_sub_input_ids_idx)
            labels = torch.stack(labels)
            _input_ids = torch.stack(_input_ids)

            new_batch['input_ids'][r_denoising_idx] = _input_ids.long()
            new_batch['labels'][r_denoising_idx] = labels.long()

        s_denoising_idx = task_type == 1
        s_denoising_idx_num = torch.where(s_denoising_idx)[0]

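        # S-denoising (prefix LM): keep the sequence unchanged, flag roughly the
        # first half as a bidirectional prefix via prefix_mask, and reuse the
        # original input_ids/labels so the continuation is learned causally.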
        if s_denoising_idx.any():
            sub_input_ids = input_ids[s_denoising_idx]
            _labels = []
            _input_ids = []

            for idx, input_id in enumerate(sub_input_ids):
                valid_len = get_first_special_index(input_id, self.pad_token_id)
                split = max(valid_len // 2, 2)
                new_batch["prefix_mask"][s_denoising_idx_num[idx]][:split] = 1

            # (disabled) earlier variant that also handled left-padded inputs:
            # for input_id, len_ in zip(sub_input_ids, lengths[s_denoising_idx]):
            #     if self.tokenizer.padding_side == "left":
            #         idx = get_first_non_specical_index(input_id, self.pad_token_id)
            #         valid_len = len_ - idx - 1
            #         split = max(valid_len//2, 2) + idx
            #         diff = expanded_length - split
            #         _input_ids.append(F.pad(input_id[:split], (0, diff), 'constant', self.pad_token_id))
            #         past_seq = input_id[split:]
            #         if past_seq[-1] != self.tokenizer.eos_token_id:
            #             past_seq[-1] = self.tokenizer.eos_token_id
            #         # _labels.append(F.pad(past_seq, (split, 0), 'constant', self.pad_token_id))
            #     else:
            #         valid_len = get_first_special_index(input_id, self.pad_token_id)
            #         split = max(valid_len//2, 2)
            #         # diff = expanded_length - split
            #         # _input_ids.append(F.pad(input_id[:split], (0, diff), 'constant', self.pad_token_id))
            #         # past_seq = input_id[split:]
            #         # past_seq = torch.where(past_seq == self.pad_token_id, self.label_pad_token_id, past_seq)
            #         # _labels.append(F.pad(past_seq, (split, 0), 'constant', self.label_pad_token_id))

            new_batch['input_ids'][s_denoising_idx] = batch['input_ids'][s_denoising_idx]
            new_batch['labels'][s_denoising_idx] = batch['labels'][s_denoising_idx]
            new_batch['attention_mask'][s_denoising_idx] = batch['attention_mask'][s_denoising_idx]

        x_denoising_idx = task_type == 2
        x_denoising_idx_num = torch.where(x_denoising_idx)[0]
        if x_denoising_idx.any():
            sub_input_ids = input_ids[x_denoising_idx]
            mask_indices = []
            valid_lengths = get_first_special_index_batch(sub_input_ids, self.pad_token_id)
            for len_, valid_len in zip(lengths[x_denoising_idx], valid_lengths):
                mask_index = None
                # idx = get_first_non_specical_index(input_id, self.pad_token_id)
                # valid_len = len_ - idx - 1
                # union of the X-denoising settings; cap each mean span length at
                # a fraction (`ratio`) of the example's valid (non-pad) length
                for (mean_span, noise, ratio) in self.x_denoising_config:
                    mean_span = min(mean_span, valid_len * ratio)
                    _mask_index = np.array(
                        random_spans_noise_mask(expanded_length, mean_span, noise)
                    )
                    if mask_index is None:
                        mask_index = _mask_index
                    else:
                        mask_index = mask_index | _mask_index
                mask_index[valid_len:] = False
                mask_indices.append(mask_index[np.newaxis, :])

            mask_indices = np.concatenate(mask_indices, axis=0)
            input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
            labels_mask = ~mask_indices
            labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))
            _sub_input_ids = self.filter_input_ids(sub_input_ids, input_ids_sentinel)
            _labels = self.filter_input_ids(sub_input_ids, labels_sentinel)

            labels = []
            _input_ids = []
            for idx, _label in enumerate(_labels):
                label = _label[_label != self.pad_token_id]
                _sub_input_ids_idx = _sub_input_ids[idx][_sub_input_ids[idx] != self.pad_token_id]
                sub_input_len = len(_sub_input_ids_idx)
                _sub_input_ids_idx = np.concatenate((_sub_input_ids_idx, label))
                label = np.concatenate(([self.label_pad_token_id] * sub_input_len, label))
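                # attend over the packed input+target; flag the corrupted-input
                # prefix as the bidirectional region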
                new_batch['attention_mask'][x_denoising_idx_num[idx]][:len(label)] = 1
                new_batch["prefix_mask"][x_denoising_idx_num[idx]][:sub_input_len] = 1
                if len(label) > max_length:
                    label = torch.from_numpy(label[: max_length])
                    _sub_input_ids_idx = torch.from_numpy(_sub_input_ids_idx[: max_length])
                else:
                    diff = max_length - len(label)
                    label = F.pad(torch.from_numpy(label), (0, diff), 'constant', self.label_pad_token_id)
                    _sub_input_ids_idx = F.pad(torch.from_numpy(_sub_input_ids_idx), (0, diff), 'constant', self.pad_token_id)
                labels.append(label)
                _input_ids.append(_sub_input_ids_idx)
            labels = torch.stack(labels)
            _input_ids = torch.stack(_input_ids)

            new_batch['input_ids'][x_denoising_idx] = _input_ids.long()
            new_batch['labels'][x_denoising_idx] = labels.long()

        # if torch.cuda.current_device() == 0:
        #     print(new_batch)
        #     exit(0)
        ## Override labels
        # if "labels" in batch:
        #     new_batch["labels"] = batch["labels"]
        #     new_batch["attention_mask"] = batch["attention_mask"]

        return new_batch


    def filter_input_ids(self, input_ids, sentinel_ids):
        """
        Puts the sentinel mask on `input_ids` and fuses consecutive mask tokens
        into a single sentinel token by deleting the rest.
        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
        """

        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
        # input_ids tokens and sentinel tokens are >= 0, tokens < 0 are
        # masked tokens coming after sentinel tokens and should be removed
        input_ids = []
        for row in input_ids_full:
            collapsed_id = row[row >= 0]
            diff = len(row) - len(collapsed_id)
            collapsed_id = np.pad(collapsed_id, (0, diff), 'constant', constant_values=self.pad_token_id)
            input_ids.append(collapsed_id)
        return np.array(input_ids)

    def create_sentinel_ids(self, mask_indices):
        """
        Sentinel ids creation given the indices that should be masked.
        The start indices of each mask are replaced by the sentinel ids in increasing
        order. Consecutive mask indices to be deleted are replaced with `-1`.
        """
        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
        start_indices[:, 0] = mask_indices[:, 0]

        sentinel_ids = np.where(
            start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices
        )
        # sentinel ids are counted down from the end of the vocabulary
        sentinel_ids = np.where(
            sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0
        )
        sentinel_ids -= mask_indices - start_indices

        return sentinel_ids


    def prepare_decoder_inputs_from_labels(self, batch):
        # decoder_start_token_id has to be defined; in T5 it is usually set to
        # the pad_token_id. See the T5 docs for more information.
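        # e.g. labels [a, b, c, <pad>] become [a, b, c, -100]; shifting right gives
        # decoder_input_ids [<bos>, a, b, c], and any -100 in the shifted labels
        # maps back to <pad> with decoder_attention_mask 0.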
        batch["labels"][batch["labels"] == self.pad_token_id] = self.label_pad_token_id
        shifted_labels = batch["labels"].new_zeros(batch["labels"].shape)
        shifted_labels[..., 1:] = batch["labels"][..., :-1].clone()
        shifted_labels[..., 0] = self.decoder_start_token_id

        batch["decoder_input_ids"] = torch.masked_fill(
            shifted_labels,
            shifted_labels == self.label_pad_token_id,
            self.pad_token_id
        )
        batch["decoder_attention_mask"] = torch.where(
            shifted_labels == self.label_pad_token_id,
            0,
            torch.ones_like(shifted_labels),
        )
        return batch

    def np_prepare_decoder_inputs_from_labels(self, batch):
        batch["labels"][batch["labels"] == self.pad_token_id] = self.label_pad_token_id
        # keep the labels' integer dtype so the resulting ids stay valid token indices
        shifted_labels = np.zeros(batch["labels"].shape, dtype=batch["labels"].dtype)
        shifted_labels[..., 1:] = batch["labels"][..., :-1].copy()
        shifted_labels[..., 0] = self.decoder_start_token_id

        batch["decoder_input_ids"] = np.where(
            shifted_labels == self.label_pad_token_id,
            self.pad_token_id,
            shifted_labels
        )
        batch["decoder_attention_mask"] = np.where(
            shifted_labels == self.label_pad_token_id,
            0,
            np.ones_like(shifted_labels)
        )
        return batch

--------------------------------------------------------------------------------
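Usage note (not part of the repository): below is a minimal sketch of how DataCollatorForUL2 could be wired into a standard PyTorch loop. It assumes the repo root is on PYTHONPATH so `utils.ul2collator` imports; "gpt2" is only a placeholder checkpoint (a real setup would add dedicated sentinel tokens to the vocabulary, since create_sentinel_ids counts down from `len(tokenizer)`); and every example is pre-padded to one length, because tokenizer.pad only pads the standard input keys, not `labels`.

# Hypothetical wiring, for illustration only.
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

from utils.ul2collator import DataCollatorForUL2

tokenizer = AutoTokenizer.from_pretrained("gpt2")      # placeholder checkpoint
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token          # the collator needs a pad id
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.train()  # torch_call falls back to default_data_collator in eval mode

collator = DataCollatorForUL2(model=model, tokenizer=tokenizer)

texts = [
    "UL2 mixes R-, S- and X-denoising in a single collator.",
    "Masked spans are replaced by sentinel tokens.",
]
# pre-pad to a fixed length so 'labels' stays rectangular through tokenizer.pad
examples = []
for t in texts:
    enc = tokenizer(t, padding="max_length", max_length=64, truncation=True)
    examples.append({"input_ids": enc["input_ids"],
                     "attention_mask": enc["attention_mask"],
                     "labels": list(enc["input_ids"])})

loader = DataLoader(examples, batch_size=2, collate_fn=collator)
batch = next(iter(loader))
# input_ids / labels / attention_mask / prefix_mask, each of shape (2, 64)
print({k: tuple(v.shape) for k, v in batch.items()})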