├── shared_tasks ├── dynalab │ ├── .gitignore │ ├── requirements.txt │ ├── model_generation.json │ ├── setup_config.json │ ├── Makefile │ ├── README.md │ └── handler.py └── wmt │ └── get_african_flores.sh ├── flores_logo.png ├── flores200 ├── NLLB_GITHUB_BANNER_Flores.png └── README.md ├── previous_releases ├── floresv1 │ ├── flores_logo.png │ ├── data │ │ ├── flores_test_sets.tgz │ │ └── wikipedia_en_ne_si_test_sets.tgz │ ├── scripts │ │ ├── spm_train.py │ │ ├── indic_norm_tok.sh │ │ ├── download_indic.sh │ │ ├── utils.py │ │ ├── shuf.py │ │ ├── spm_decode.py │ │ ├── indic_norm_tok.py │ │ ├── spm_encode.py │ │ ├── train.py │ │ └── translate.py │ ├── reproduce.sh │ ├── prepare-sien.sh │ ├── prepare-neen.sh │ ├── prepare-monolingual.sh │ ├── configs │ │ ├── enne.json │ │ ├── ensi.json │ │ ├── neen.json │ │ └── sien.json │ ├── README.md │ └── download-data.sh └── flores101 │ └── README.md ├── ocr ├── OCR_impact_BT │ ├── lang_codes.source │ ├── evaluate.sh │ ├── translate_mono_books.sh │ ├── translate_mono.sh │ ├── finetune_eval_books.sh │ └── finetune.sh ├── requirements.txt ├── Data │ └── language_codes │ │ ├── languages.csv │ │ └── languages_fonts_codes.csv ├── data_collection │ ├── file_splitter.py │ ├── utils.py │ ├── download_UDHR_data.py │ └── augment_data.py ├── README.md └── OCR_eval │ ├── metrics.py │ ├── google_vision_OCR.py │ └── OCR_eval.py ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── nllb_md └── README.md ├── README.md ├── nllb_seed └── README.md ├── flores_move.py └── toxicity └── README.md /shared_tasks/dynalab/.gitignore: -------------------------------------------------------------------------------- 1 | .dynalab/ 2 | -------------------------------------------------------------------------------- /flores_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/flores/HEAD/flores_logo.png -------------------------------------------------------------------------------- /flores200/NLLB_GITHUB_BANNER_Flores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/flores/HEAD/flores200/NLLB_GITHUB_BANNER_Flores.png -------------------------------------------------------------------------------- /previous_releases/floresv1/flores_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/flores/HEAD/previous_releases/floresv1/flores_logo.png -------------------------------------------------------------------------------- /ocr/OCR_impact_BT/lang_codes.source: -------------------------------------------------------------------------------- 1 | declare -A LANG_CODES=(["pus"]="ps" ["jpn"]="ja" ["khm"]="km" ["npi"]="ne" ["lao"]="lo" ["amh"]="am" ["tam"]="ta") 2 | -------------------------------------------------------------------------------- /previous_releases/floresv1/data/flores_test_sets.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/flores/HEAD/previous_releases/floresv1/data/flores_test_sets.tgz -------------------------------------------------------------------------------- /shared_tasks/dynalab/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece 2 | # Replace this by 1.0.0 once it's out 3 | git+git://github.com/pytorch/fairseq.git@1305008e#egg=fairseq 4 | 
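A few files above, `ocr/OCR_impact_BT/lang_codes.source` defines a Bash associative array that maps ISO 639-3 language codes to the two-letter codes used by the MM100/FLORES-101 models. A minimal sketch of how the `OCR_impact_BT` scripts later in this listing consume that map (the loop itself is illustrative only):

```bash
# Illustrative only: iterate over the LANG_CODES map the way evaluate.sh and
# translate_mono.sh do. Associative arrays need Bash 4 or newer.
source OCR_impact_BT/lang_codes.source

for iso_code in "${!LANG_CODES[@]}"; do        # keys are ISO 639-3 codes, e.g. npi
    mm100_code="${LANG_CODES[${iso_code}]}"    # values are MM100 codes, e.g. ne
    echo "${iso_code} -> ${mm100_code}"
done
```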
-------------------------------------------------------------------------------- /shared_tasks/dynalab/model_generation.json: -------------------------------------------------------------------------------- 1 | { 2 | "dummy": false, 3 | "beam_size": 1, 4 | "max_len_a": 1.3, 5 | "max_len_b": 5, 6 | "min_len": 5 7 | } 8 | -------------------------------------------------------------------------------- /previous_releases/floresv1/data/wikipedia_en_ne_si_test_sets.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/flores/HEAD/previous_releases/floresv1/data/wikipedia_en_ne_si_test_sets.tgz -------------------------------------------------------------------------------- /ocr/requirements.txt: -------------------------------------------------------------------------------- 1 | editdistance 2 | google-cloud-storage 3 | google-cloud-vision 4 | levenshtein 5 | numpy 6 | opencv-python 7 | pandas 8 | pdf2image 9 | pillow 10 | requests 11 | tqdm 12 | xmltodict 13 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 6 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Issues 2 | We use GitHub issues to track public bugs. Please ensure your description is 3 | clear and has sufficient instructions to be able to reproduce the issue. 4 | 5 | ## License 6 | By contributing to this repository, you agree that your contributions will be 7 | licensed under the LICENSE file in the root directory of this source tree. 8 | -------------------------------------------------------------------------------- /shared_tasks/dynalab/setup_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "task": "flores_small1", 3 | "checkpoint": "model.pt", 4 | "handler": "handler.py", 5 | "requirements": true, 6 | "setup": false, 7 | "model_files": [ 8 | "model_generation.json", 9 | "dict.txt", 10 | "sentencepiece.bpe.model" 11 | ], 12 | "exclude": [] 13 | } 14 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/spm_train.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/usr/bin/env python 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import shlex 12 | import sys 13 | 14 | import sentencepiece as spm 15 | 16 | 17 | if __name__ == "__main__": 18 | spm.SentencePieceTrainer.Train(" ".join(map(shlex.quote, sys.argv[1:]))) 19 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/indic_norm_tok.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/bin/bash 8 | 9 | if [ $# -ne 2 ]; then 10 | echo "usage: $0 LANGUAGE INFILE" 11 | exit 1 12 | fi 13 | LANG=$1 14 | INFILE=$2 15 | 16 | ROOT=$(dirname "$0") 17 | 18 | INDICNLP=$ROOT/indic_nlp_library 19 | if [ ! -e $INDICNLP ]; then 20 | exit 1 21 | fi 22 | 23 | python2 $ROOT/indic_norm_tok.py --indic-nlp-path $INDICNLP --language $LANG $INFILE 24 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/download_indic.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/bin/bash 8 | 9 | ROOT=$(dirname "$0") 10 | 11 | INDICNLP=$ROOT/indic_nlp_library 12 | if [ ! -e $INDICNLP ]; then 13 | echo "Cloning Indic NLP Library..." 14 | git -C $ROOT clone https://github.com/anoopkunchukuttan/indic_nlp_library.git 15 | pushd $INDICNLP 16 | git reset --hard 0a5e01f2701e0df5bc1f9905334cd7916d874c16 17 | popd 18 | else 19 | echo "Indic is already pulled from github. Skipping." 20 | fi 21 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/usr/bin/env python 8 | 9 | import os 10 | from subprocess import check_output 11 | import subprocess 12 | 13 | 14 | def check_last_line(filepath, s, is_exact=False): 15 | if not os.path.exists(filepath): 16 | return False 17 | 18 | last_line = check_output(f'tail -n 1 {filepath}', shell=True).decode('utf-8').strip() 19 | return (s == last_line) if is_exact else (s in last_line) 20 | 21 | 22 | def count_line(filename): 23 | try: 24 | return int(check_output(f'wc -l {filename} | cut -d " " -f1', shell=True)) 25 | except (subprocess.CalledProcessError, ValueError) as e: 26 | return 0 27 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/shuf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 
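# Example invocation (mirroring prepare-monolingual.sh, which pipes monolingual
# corpora through this script): shuffle stdin with a fixed seed and keep only the
# first NUM_MONO lines, e.g.
#   cat data/mono/mono.en | ./scripts/shuf.py --seed 42 -n 5000000 > data/mono/mono.sample5000000.en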
7 | # 8 | 9 | 10 | import argparse 11 | import numpy as np 12 | import sys 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--num-lines', '-n', default=None, help='Output the first n lines after shuffling', type=int) 18 | parser.add_argument('--seed', '-s', default=42, help='Random seed', type=int) 19 | args = parser.parse_args() 20 | 21 | lines = [line for line in sys.stdin] 22 | args.num_lines = min(args.num_lines or len(lines), len(lines)) 23 | 24 | np.random.seed(args.seed) 25 | for i in np.random.choice(len(lines), args.num_lines, replace=False): 26 | print(lines[i], end='') 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /ocr/Data/language_codes/languages.csv: -------------------------------------------------------------------------------- 1 | Language,Code 2 | Amharic,amh 3 | Armenian,hye 4 | Asturian,ast 5 | Belarusian,bel 6 | Bengali,ben 7 | Bulgarian,bul 8 | Burmese,mya 9 | Georgian,kat 10 | Greek,ell 11 | Gujarati,guj 12 | Hebrew,heb 13 | Hindi,hin 14 | Japanese,jpn 15 | Kannada,kan 16 | Kazakh,kaz 17 | Khmer,khm 18 | Korean,kor 19 | Kyrgyz,kir 20 | Lao,lao 21 | Macedonian,mkd 22 | Malayalam,mal 23 | Marathi,mar 24 | Nepali,npi 25 | Pashto,pbu 26 | Punjabi,pan 27 | Russian,rus 28 | Serbian,srp 29 | Tajik,tgk 30 | Tamil,tam 31 | Telugu,tel 32 | Thai,tha 33 | Ukrainian,ukr 34 | Urdu,urd 35 | Vietnamese,vie 36 | Turkish,tur 37 | Uzbek,uzn 38 | Wolof,wol 39 | Zulu,zul 40 | Arabic,arb 41 | Cebuano,ceb 42 | Chinese_Simpl,cmn 43 | Fula,fuv 44 | Ganda,lug 45 | Icelandic,isl 46 | Lingala,lin 47 | Maori,mri 48 | Mongolian,khk 49 | Nyanja,nya 50 | Romanian,ron 51 | Sorani_Kurdish,ckb 52 | Shona,sna 53 | Umbundu,umb 54 | Swahili,swh 55 | Somali,som 56 | Swedish,swe 57 | Polish,pol 58 | Slovak,slk 59 | Slovenian,slv 60 | Oromo,gaz 61 | Portuguese_Portugal,por 62 | -------------------------------------------------------------------------------- /shared_tasks/wmt/get_african_flores.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p african_flores 4 | mkdir -p african_flores/dev 5 | mkdir -p african_flores/devtest 6 | 7 | # download flores101 and flores_wmt22_supplement 8 | wget https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz 9 | wget https://dl.fbaipublicfiles.com/flores101/dataset/flores_wmt22_supplement.tar.gz 10 | 11 | # unzip downloaded files. 12 | tar -xf flores101_dataset.tar.gz 13 | tar -xf flores_wmt22_supplement.tar.gz 14 | 15 | 16 | # move flores_wmt22_supplement dev and devtest to african_flores 17 | for lang in kin ssw tsn tso 18 | do 19 | for split in dev devtest 20 | do 21 | cp -r flores_wmt22_supplement/$split/$lang.$split african_flores/$split/ 22 | done 23 | done 24 | 25 | # move flores101 dev and devtest to african_flores 26 | for lang in afr amh eng fra ful hau ibo kam lin lug luo nso nya orm sna som swh umb wol xho yor zul 27 | do 28 | for split in dev devtest 29 | do 30 | cp -r flores101_dataset/$split/$lang.$split african_flores/$split/ 31 | done 32 | done 33 | 34 | # delete archive and unused folders 35 | rm -r flores* 36 | 37 | -------------------------------------------------------------------------------- /previous_releases/floresv1/reproduce.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
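# Example invocation (based on the usage check below): reproduce the iterative
# back-translation baseline for Nepali->English, assuming the binarized data in
# data-bin/wiki_ne_en_bpe5000/ has already been built with prepare-neen.sh and
# prepare-monolingual.sh:
#   bash reproduce.sh ne_en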
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/bin/bash 8 | # Script to reproduce iterative back-translation baseline 9 | 10 | SUPPORT_LANG_PAIRS="ne_en|en_ne|si_en|en_si" 11 | 12 | if [[ $# -lt 1 ]] || ! [[ "$1" =~ ^($SUPPORT_LANG_PAIRS)$ ]]; then 13 | echo "Usage: $0 LANGUAGE_PAIR($SUPPORT_LANG_PAIRS)" 14 | exit 1 15 | fi 16 | 17 | LANG_PAIR=$1 18 | 19 | if [[ $LANG_PAIR = "ne_en" ]]; then 20 | python scripts/train.py --config configs/neen.json --databin $PWD/data-bin/wiki_ne_en_bpe5000/ 21 | elif [[ $LANG_PAIR = "en_ne" ]]; then 22 | python scripts/train.py --config configs/enne.json --databin $PWD/data-bin/wiki_ne_en_bpe5000/ 23 | elif [[ $LANG_PAIR = "si_en" ]]; then 24 | python scripts/train.py --config configs/sien.json --databin $PWD/data-bin/wiki_si_en_bpe5000/ 25 | elif [[ $LANG_PAIR = "en_si" ]]; then 26 | python scripts/train.py --config configs/ensi.json --databin $PWD/data-bin/wiki_si_en_bpe5000/ 27 | fi 28 | -------------------------------------------------------------------------------- /nllb_md/README.md: -------------------------------------------------------------------------------- 1 | # No Language Left Behind Multi Domain 2 | 3 | NLLB Multi Domain is a set of professionally-translated sentences in News, Unscripted informal speech, and Health domains. It is designed to enable assessment of out-of-domain performance and to study domain adaptation for machine translation. Each domain has approximately 3000 sentences. 4 | 5 | -------------------------------------------------------------------------------- 6 | 7 | ## Download 8 | 9 | NLLB-Multi Domain can be downloaded using the following links: 10 | * Unscripted chat [here](https://tinyurl.com/NLLBMDchat) 11 | * News [here](https://tinyurl.com/NLLBMDnews) 12 | * Health [here](https://tinyurl.com/NLLBMDhealth) 13 | 14 | which you can download with the following commands: 15 | 16 | ```bash 17 | wget --trust-server-names https://tinyurl.com/NLLBMDchat 18 | wget --trust-server-names https://tinyurl.com/NLLBMDnews 19 | wget --trust-server-names https://tinyurl.com/NLLBMDhealth 20 | ``` 21 | 22 | ## Languages in NLLB Multi Domain 23 | 24 | Language | FLORES-200 code 25 | ---|--- 26 | Central Aymara | ayr_Latn 27 | Bhojpuri | bho_Deva 28 | Dyula | dyu_Latn 29 | Friulian | fur_Latn 30 | Russian | rus_Cyrl 31 | Wolof | wol_Latn -------------------------------------------------------------------------------- /shared_tasks/dynalab/Makefile: -------------------------------------------------------------------------------- 1 | MODEL_NAME=m2m-124-175m 2 | 3 | # test: simple_test_$(MODEL_NAME) docker_test_$(MODEL_NAME) 4 | test: docker_test_$(MODEL_NAME) 5 | 6 | 7 | # Download and extract baseline model 8 | dl_$(MODEL_NAME): $(MODEL_NAME).pt 9 | 10 | 11 | $(MODEL_NAME).pt: flores101_mm100_175M.tar.gz 12 | tar -xzf $< 13 | mv flores101_mm100_175M/model.pt $@ 14 | touch $@ 15 | mv flores101_mm100_175M/* . 
16 | rm -r flores101_mm100_175M/ 17 | 18 | 19 | flores101_mm100_175M.tar.gz: 20 | wget https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz 21 | 22 | 23 | # Setup dynalab 24 | dyna_init_$(MODEL_NAME): .dynalab/$(MODEL_NAME) 25 | 26 | 27 | .dynalab/$(MODEL_NAME): 28 | dynalab-cli init \ 29 | --name $(MODEL_NAME) \ 30 | --task flores_small1 \ 31 | --handler handler.py \ 32 | --model-checkpoint $(MODEL_NAME).pt \ 33 | --install-requirements \ 34 | --model-files sentencepiece.bpe.model,model_generation.json,dict.txt 35 | 36 | 37 | # Run dynalab tests 38 | simple_test_$(MODEL_NAME): $(MODEL_NAME).pt .dynalab/$(MODEL_NAME) handler.py 39 | # python -m pdb `which dynalab-cli` test --local -n $(MODEL_NAME) 40 | dynalab-cli test --local -n $(MODEL_NAME) 41 | 42 | 43 | docker_test_$(MODEL_NAME): $(MODEL_NAME).pt .dynalab/$(MODEL_NAME) handler.py 44 | dynalab-cli test -n $(MODEL_NAME) 45 | 46 | # .dynalab/$(MODEL_NAME)/tmp/$(MODEL_NAME).mar: .dynalab/$(MODEL_NAME) handler.py $(MODEL_NAME).pt 47 | 48 | 49 | -------------------------------------------------------------------------------- /ocr/OCR_impact_BT/evaluate.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z "$flores101_dataset" ]]; then 4 | echo 'Need to specify the env var flores101_dataset.' 5 | exit 1 6 | fi 7 | 8 | source OCR_impact_BT/lang_codes.source 9 | 10 | SRC_MM100_LANG_CODE=en 11 | 12 | SIZE=20k 13 | 14 | for trg_lang_code in "${!LANG_CODES[@]}"; do 15 | for error_rate in {1..22..2}; do 16 | for error_type in insert delete replace; do 17 | data_type=${error_rate}/${error_type} 18 | root_output="Data/backtranslation/data_cer_$SIZE/${data_type}" 19 | root_checkpoint_out="${root_output}/model_checkpoints" 20 | trg_m100_lang_code="${LANG_CODES[${trg_lang_code}]}" 21 | checkpoint_out=${root_checkpoint_out}_${SRC_MM100_LANG_CODE}_${trg_m100_lang_code} 22 | if [ -f "$checkpoint_out/checkpoint6.pt" ]; then 23 | if [ -s "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_m100_lang_code}/${trg_lang_code}.txt" ]; then 24 | echo "${data_type} ${trg_lang_code}" >> "Data/backtranslation/data_cer_$SIZE/results3.txt" 25 | sacrebleu \ 26 | "${flores101_dataset}/devtest/${trg_lang_code}.devtest" \ 27 | < "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_m100_lang_code}/${trg_lang_code}.txt" \ 28 | --tokenize spm >> "Data/backtranslation/data_cer_$SIZE/results3.txt" 29 | else 30 | echo "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_m100_lang_code}/${trg_lang_code}.txt" doesn\'t exist 31 | fi 32 | fi 33 | done 34 | done 35 | done 36 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/spm_decode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | #!/usr/bin/env python 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import argparse 12 | 13 | import sentencepiece as spm 14 | 15 | 16 | def main(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--model", required=True, 19 | help="sentencepiece model to use for decoding") 20 | parser.add_argument("--input", required=True, help="input file to decode") 21 | parser.add_argument("--input_format", choices=["piece", "id"], default="piece") 22 | args = parser.parse_args() 23 | 24 | sp = spm.SentencePieceProcessor() 25 | sp.Load(args.model) 26 | 27 | if args.input_format == "piece": 28 | def decode(l): 29 | return "".join(sp.DecodePieces(l)) 30 | elif args.input_format == "id": 31 | def decode(l): 32 | return "".join(sp.DecodeIds(l)) 33 | else: 34 | raise NotImplementedError 35 | 36 | def tok2int(tok): 37 | # remap reference-side (represented as <>) to 0 38 | return int(tok) if tok != "<>" else 0 39 | 40 | with open(args.input, "r", encoding="utf-8") as h: 41 | for line in h: 42 | if args.input_format == "id": 43 | print(decode(list(map(tok2int, line.rstrip().split())))) 44 | elif args.input_format == "piece": 45 | print(decode(line.rstrip().split())) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FLORES-200 and NLLB Professionally Translated Datasets: NLLB-Seed, NLLB-MD, and Toxicity-200 2 | 3 | 4 | ⚠️ This repository is no longer being updated ⚠️ 5 | 6 | **Newer versions** of the FLORES and NLLB-Seed datasets managed by the [Open Language Data Initiative](https://www.oldi.org/) are available here: 7 | * [FLORES](https://github.com/openlanguagedata/flores) 8 | * [NLLB-Seed](https://github.com/openlanguagedata/seed) 9 | 10 | Quick-access to the original READMEs: 11 | * [FLORES-200](flores200/README.md) 12 | * [NLLB-Seed](nllb_seed/README.md) 13 | * [NLLB-MD](nllb_md/README.md) 14 | * [Toxicity-200](toxicity/README.md) 15 | 16 | ## Citation 17 | 18 | If you use any of this data in your work, please cite: 19 | 20 | ```bibtex 21 | @article{nllb2022, 22 | author = {NLLB Team, Marta R. Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, 23 | title = {No Language Left Behind: Scaling Human-Centered Machine Translation}, 24 | year = {2022} 25 | } 26 | ``` 27 | 28 | ## Changelog 29 | - 2022-06-30: Released FLORES-200, NLLB-Seed, NLLB-MD, and Toxicity-200 30 | 31 | - 2021-06-04: Released FLORES-101 32 | 33 | ## Licenses 34 | 35 | * FLORES-200: CC-BY-SA 4.0 36 | * NLLB-SEED: CC-BY-SA 4.0 37 | * NLLB-MD: CC-BY-NC 4.0 38 | * Toxicity-200: CC-BY-SA 4.0 -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/indic_norm_tok.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/usr/bin/env python 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import argparse 12 | import fileinput 13 | import os 14 | import sys 15 | 16 | 17 | def main(): 18 | parser = argparse.ArgumentParser() 19 | parser.add_argument("--indic-nlp-path", required=True, 20 | help="path to Indic NLP Library root") 21 | parser.add_argument("--language", required=True) 22 | parser.add_argument("--remove-nuktas", default=False, action="store_true") 23 | parser.add_argument("input", help="input file; use - for stdin") 24 | args = parser.parse_args() 25 | 26 | try: 27 | sys.path.extend([ 28 | args.indic_nlp_path, 29 | os.path.join(args.indic_nlp_path, "src"), 30 | ]) 31 | from indicnlp.tokenize import indic_tokenize 32 | from indicnlp.normalize.indic_normalize import IndicNormalizerFactory 33 | except: 34 | raise Exception( 35 | "Cannot load Indic NLP Library, make sure --indic-nlp-path is correct" 36 | ) 37 | 38 | # create normalizer 39 | factory = IndicNormalizerFactory() 40 | normalizer = factory.get_normalizer( 41 | args.language, remove_nuktas=args.remove_nuktas, 42 | ) 43 | 44 | # normalize and tokenize 45 | for line in fileinput.input([args.input], openhook=fileinput.hook_compressed): 46 | line = normalizer.normalize(line.decode("utf-8")) 47 | line = " ".join(indic_tokenize.trivial_tokenize(line, args.language)) 48 | sys.stdout.write(line.encode("utf-8")) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /nllb_seed/README.md: -------------------------------------------------------------------------------- 1 | # No Language Left Behind Seed Data 2 | 3 | NLLB Seed is a set of professionally-translated sentences in the Wikipedia domain. Data for NLLB-Seed was sampled from Wikimedia’s [List of articles every Wikipedia should have](https://meta.wikimedia.org/wiki/List_of_articles_every_Wikipedia_should_have/Expanded), a collection of topics in different fields of knowledge and human activity. NLLB-Seed consists of around six thousand sentences in 39 languages. NLLB-Seed is meant to be used for training rather than model evaluation. Due to this difference, NLLB-Seed does not go through the human quality assurance process present in FLORES-200. 4 | 5 | -------------------------------------------------------------------------------- 6 | 7 | ## Download 8 | 9 | ⚠️ This repository is no longer being updated ⚠️ 10 | 11 | **For newer versions of this dataset**, see the Open Language Data Initiative's [FLORES](https://github.com/openlanguagedata/flores) and [NLLB-Seed](https://github.com/openlanguagedata/seed) repositories. 12 | 13 | The original version of the dataset can still be downloaded [here](https://tinyurl.com/NLLBSeed).
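A minimal download sketch, following the same `wget --trust-server-names` pattern used in the NLLB-MD README of this repository (the filename and archive format returned after the redirect are assumptions, so adjust the extraction step accordingly):

```bash
# --trust-server-names keeps the real filename after the tinyurl redirect.
wget --trust-server-names https://tinyurl.com/NLLBSeed
# Assumed archive name and format; replace with whatever the server actually returns.
tar -xzf NLLB-Seed.tar.gz
```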
14 | 15 | ## Languages in NLLB - Seed 16 | 17 | Language | FLORES-200 code 18 | ---|--- 19 | Acehnese (Arabic script) | ace_Arab 20 | Acehnese (Latin script) | ace_Latn 21 | Moroccan Arabic | ary_Arab 22 | Egyptian Arabic | arz_Arab 23 | Bambara | bam_Latn 24 | Balinese | ban_Latn 25 | Bhojpuri | bho_Deva 26 | Banjar (Arabic script) | bjn_Arab 27 | Banjar (Latin script) | bjn_Latn 28 | Buginese | bug_Latn 29 | Crimean Tatar | crh_Latn 30 | Southwestern Dinka | dik_Latn 31 | Dzongkha | dzo_Tibt 32 | Friulian | fur_Latn 33 | Nigerian Fulfulde | fuv_Latn 34 | Guarani | grn_Latn 35 | Chhattisgarhi | hne_Deva 36 | Kashmiri (Arabic script) | kas_Arab 37 | Kashmiri (Devanagari script) | kas_Deva 38 | Central Kanuri (Arabic script) | knc_Arab 39 | Central Kanuri (Latin script) | knc_Latn 40 | Ligurian | lij_Latn 41 | Limburgish | lim_Latn 42 | Lombard | lmo_Latn 43 | Latgalian | ltg_Latn 44 | Magahi | mag_Deva 45 | Meitei (Bengali script) | mni_Beng 46 | Maori | mri_Latn 47 | Nuer | nus_Latn 48 | Dari | prs_Arab 49 | Southern Pashto | pbt_Arab 50 | Sicilian | scn_Latn 51 | Shan | shn_Mymr 52 | Sardinian | srd_Latn 53 | Silesian | szl_Latn 54 | Tamasheq (Latin script) | taq_Latn 55 | Tamasheq (Tifinagh script) | taq_Tfng 56 | Central Atlas Tamazight | tzm_Tfng 57 | Venetian | vec_Latn 58 | -------------------------------------------------------------------------------- /ocr/data_collection/file_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | 6 | class FileSplitter: 7 | def __init__(self): 8 | self.parse_args(sys.argv) 9 | 10 | @staticmethod 11 | def run(): 12 | splitter = FileSplitter() 13 | splitter.split() 14 | 15 | def split(self): 16 | file_number = 1 17 | line_number = 1 18 | 19 | print("Splitting", os.path.join(self.working_dir, self.file_base_name + self.file_ext), 20 | "into multiple files with", self.split_size, "lines") 21 | 22 | out_file = self.get_new_file(file_number) 23 | for line in self.in_file: 24 | out_file.write(line) 25 | line_number += 1 26 | if line_number == self.split_size + 1: 27 | out_file.close() 28 | file_number += 1 29 | line_number = 1 30 | out_file = self.get_new_file(file_number) 31 | 32 | out_file.close() 33 | 34 | print("Created %s files." % (str(file_number))) 35 | 36 | def get_new_file(self, file_number): 37 | """return a new file object ready to write to""" 38 | new_file_name = "%s%s%s" % (self.file_base_name, str(file_number), self.file_ext) 39 | new_file_path = os.path.join(self.working_dir, new_file_name) 40 | print("creating file %s" % (new_file_path)) 41 | return open(new_file_path, 'w') 42 | 43 | def parse_args(self, argv): 44 | """parse args and set up instance variables""" 45 | try: 46 | self.split_size = 1000 47 | if len(argv) > 2: 48 | self.split_size = int(argv[2]) 49 | self.file_name = argv[1] 50 | self.in_file = open(self.file_name, "r") 51 | self.working_dir = os.getcwd() 52 | self.file_base_name, self.file_ext = os.path.splitext(self.file_name) 53 | except: 54 | print(self.usage()) 55 | sys.exit(1) 56 | 57 | def usage(self): 58 | return """ 59 | Split a large file into many smaller files with set number of rows. 
60 | Usage: 61 | $ python file_splitter.py [row_count] 62 | row_count is optional (default is 1000) 63 | """ 64 | 65 | 66 | if __name__ == "__main__": 67 | FileSplitter.run() 68 | -------------------------------------------------------------------------------- /ocr/README.md: -------------------------------------------------------------------------------- 1 | # OCR Improves Machine Translation for Low-Resource Languages 2 | 3 | This folder contains the scripts to run the data preparation and evaluation for the following paper. 4 | 5 | ```bibtex 6 | @inproceedings{ignat2022ocr, 7 | author = "Oana Ignat and Jean Maillard and Vishrav Chaudhary and Francisco Guzmán", 8 | title = "OCR Improves Machine Translation for Low-Resource Languages", 9 | booktitle = "Findings of ACL 2022, Long Papers", 10 | year = 2022 11 | } 12 | ``` 13 | 14 | Contents: 15 | * Design and build a benchmark covering 60 languages and a variety of scripts (UDHR data and Flores 101) 16 | * Code to download and process UDHR articles: [`data_collection/download_UDHR_data.py`](data_collection/download_UDHR_data.py) 17 | * Code to augment Flores 101 PDFs (font, opacity, letter-spacing, italic, bold, Gaussian noise, skewing): [`data_collection/augment_data.py`](data_collection/augment_data.py) 18 | 19 | * Evaluate related work under this new benchmark 20 | * Code to run Google Vision API: [`OCR_eval/google_vision_OCR.py`](OCR_eval/google_vision_OCR.py) 21 | * Code to run Tesseract and measure metrics: [`OCR_eval/OCR_eval.py`](OCR_eval/OCR_eval.py) 22 | 23 | * Study the downstream impact of recognition errors in back translation 24 | * Code to extract OCR errors from UDHR and insert them in monolingual data (WikiMatrix and CC100): [`OCR_impact_BT/OCR_error_analysis.py`](OCR_impact_BT/OCR_error_analysis.py) 25 | * Code to run BT and finetune the MM124 model on OCRed monolingual data: [`OCR_impact_BT/translate_mono.sh`](OCR_impact_BT/translate_mono.sh), [`OCR_impact_BT/finetune.sh`](OCR_impact_BT/finetune.sh), [`OCR_impact_BT/evaluate.sh`](OCR_impact_BT/evaluate.sh) 26 | * Code to run BT and finetune the MM124 model on OCRed Nepali books: [`OCR_impact_BT/translate_mono_books.sh`](OCR_impact_BT/translate_mono_books.sh), [`OCR_impact_BT/finetune_eval_books.sh`](OCR_impact_BT/finetune_eval_books.sh) 27 | 28 | Setup: 29 | 30 | 1. Install [Tesseract v4](https://github.com/tesseract-ocr/tesseract). 31 | 2. Install the Python requirements: 32 | 33 | ```bash 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | 3. To use the Google Vision API, [set up the authentication with Google 38 | Cloud](https://cloud.google.com/storage/docs/reference/libraries#setting_up_authentication) 39 | 4. You may need to change the `CHROME_PATH` value in `data_collection/augment_data.py` to the location of Google 40 | Chrome on your computer.
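The `OCR_impact_BT` scripts listed above read their paths from environment variables rather than command-line flags. A minimal sketch of the expected environment and call order, inferred from the checks inside the scripts shown later in this listing (what `finetune.sh` needs beyond these variables is an assumption):

```bash
# translate_mono.sh expects $fairseq to contain scripts/spm_encode.py and the
# flores101_mm100_615M checkpoint; evaluate.sh expects $flores101_dataset to
# contain the devtest/ split of FLORES-101.
export fairseq=/path/to/fairseq
export flores101_dataset=/path/to/flores101_dataset

bash OCR_impact_BT/translate_mono.sh   # back-translate the OCRed monolingual data
bash OCR_impact_BT/finetune.sh         # fine-tune the MM124 model on the BT data
bash OCR_impact_BT/evaluate.sh         # score the generations with sacrebleu
```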
41 | -------------------------------------------------------------------------------- /ocr/Data/language_codes/languages_fonts_codes.csv: -------------------------------------------------------------------------------- 1 | Language,Code,Script,Fonts,Tesseract Code 2 | Amharic,amh,Ge’ez,NotoSansEthiopic-Regular,amh 3 | Armenian,hye,Armenian,NotoSerifArmenian-Regular,hye 4 | Asturian,ast,Latin,TimesNewRoman; Arial,fra 5 | Belarusian,bel,Cyrillic,Arial; Verdana,bel 6 | Bengali,ben,Bengali,Verdana; NotoSansBengali-Regular,ben 7 | Bulgarian,bul,Cyrillic,Arial; Verdana,bul 8 | Burmese,mya,Myanmar,TimesNewRoman; NotoSansMyanmar-Regular,mya 9 | Georgian,kat,Georgian,NotoSerifGeorgian-Regular,kat 10 | Greek,ell,Greek,Arial,ell 11 | Gujarati,guj,Gujarati,NotoSerifGujarati-Regular,guj 12 | Hebrew,heb,Hebrew,NotoSerifHebrew-Regular,heb 13 | Hindi,hin,Devanagari,NotoSansDevanagari-Regular,hin 14 | Japanese,jpn,Han; Hiragana; Katakana,Arial; Sim Sun; Yahei,jpn 15 | Kannada,kan,Telugu-Kannada,NotoSansKannada-Regular,kan 16 | Kazakh,kaz,Cyrillic,Arial; Verdana,kaz 17 | Khmer,khm,Khmer,NotoSansKhmer-Regular,khm 18 | Korean,kor,Hangul,Arial; Noto Sans KR,kor 19 | Kyrgyz,kir,Cyrillic,Arial; Verdana,kir 20 | Lao,lao,Lao,NotoSansLao-Regular; Verdana,lao 21 | Macedonian,mkd,Cyrillic,Arial; Verdana,mkd 22 | Malayalam,mal,Malayalam,Verdana; NotoSansMalayalam-Regular,mal 23 | Marathi,mar,Devanagari,NotoSansDevanagari-Regular,mar 24 | Nepali,npi,Devanagari,NotoSansDevanagari-Regular,nep 25 | Pashto,pbu,Perso-Arabic,Verdana; Calibri; Dubai,pus 26 | Punjabi,pan,Gurmukhi,NotoSansGurmukhi-Regular,pan 27 | Russian,rus,Cyrillic,Arial; Verdana,rus 28 | Serbian,srp,Cyrillic,Arial; Verdana,srp 29 | Tajik,tgk,Cyrillic,Arial; Verdana,tgk 30 | Tamil,tam,Tamil,NotoSansTamil-Regular,tam 31 | Telugu,tel,Telugu-Kannada,NotoSansKannada-Regular,tel 32 | Thai,tha,Thai,NotoSansThai-Regular; Verdana,tha 33 | Ukrainian,ukr,Cyrillic,Arial; Verdana,ukr 34 | Urdu,urd,Perso-Arabic,JameelNooriNastaleeq; NotoNastaliqUrdu-Regular,urd 35 | Vietnamese,vie,Latin,TimesNewRoman; Arial,vie 36 | Turkish,tur,Latin,TimesNewRoman; Arial,tur 37 | Uzbek,uzn,Latin,TimesNewRoman; Arial,uzb 38 | Wolof,wol,Latin,TimesNewRoman; Arial,afr 39 | Zulu,zul,Latin,TimesNewRoman; Arial,afr 40 | Arabic,arb,Arabic,NotoSansArabic-Regular; TraditionalArabic; Arial,ara 41 | Cebuano,ceb,Latin,TimesNewRoman; Arial,ceb 42 | Chinese_Simpl,cmn,Hant,Arial; Sim Sun; Yahei,chi_sim 43 | Fula,fuv,Latin,TimesNewRoman; Arial,afr 44 | Ganda,lug,Latin,TimesNewRoman; Arial,afr 45 | Icelandic,isl,Latin,TimesNewRoman; Arial,isl 46 | Lingala,lin,Latin,TimesNewRoman; Arial,afr 47 | Maori,mri,Latin,TimesNewRoman; Arial,mri 48 | Mongolian,khk,Cyrillic,Arial; Verdana,mon 49 | Nyanja,nya,Latin,TimesNewRoman; Arial,afr 50 | Romanian,ron,Latin,TimesNewRoman; Arial,ron 51 | Sorani_Kurdish,ckb,Arabic,TimesNewRoman; Arial,ara 52 | Shona,sna,Latin,TimesNewRoman; Arial,afr 53 | Umbundu,umb,Latin,TimesNewRoman; Arial,afr 54 | Swahili,swh,Latin,TimesNewRoman; Arial,swa 55 | Somali,som,Latin,TimesNewRoman; Arial,afr 56 | Swedish,swe,Latin,TimesNewRoman; Arial,swe 57 | Polish,pol,Latin,TimesNewRoman; Arial,pol 58 | Slovak,slk,Latin,TimesNewRoman; Arial,slk 59 | Slovenian,slv,Latin,TimesNewRoman; Arial,slv 60 | Oromo,gaz,Latin,TimesNewRoman; Arial,afr 61 | Portuguese_Portugal,por,Latin,TimesNewRoman; Arial,por 62 | -------------------------------------------------------------------------------- /ocr/OCR_impact_BT/translate_mono_books.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z "$fairseq" ]]; then 4 | echo 'Need to specify the env var fairseq.' 5 | exit 1 6 | fi 7 | 8 | source OCR_impact_BT/lang_codes.source 9 | 10 | TRG_LANG_CODE=eng 11 | TRG_MM100_LANG_CODE=en 12 | 13 | for data_type in books_10k books_20k books_30k; do 14 | for src_lang_code in "${!LANG_CODES[@]}"; do 15 | root_output='Data/backtranslation/data_books/'${data_type} 16 | mkdir -p $root_output/SPM/mono/ 17 | echo $root_output 18 | 19 | SRC_MM100_LANG_CODE="${LANG_CODES[${src_lang_code}]}" 20 | if [ ! -f "$root_output/${src_lang_code}_mono.txt" ]; then 21 | echo "$root_output/${src_lang_code}_mono.txt" doesn\'t exist 22 | continue 23 | fi 24 | generation_file=$root_output/generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}/${TRG_LANG_CODE}.txt 25 | echo "${src_lang_code}" "${SRC_MM100_LANG_CODE}" 26 | if [ -s "${generation_file}" ] && [ "$(wc -l <"${generation_file}")" -eq 10000 ]; then 27 | echo "${generation_file} exists and has 10000 lines." 28 | continue 29 | else 30 | echo "$generation_file doesn't exist or hasn't 10000 lines" 31 | fi 32 | python "$fairseq/scripts/spm_encode.py" \ 33 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 34 | --output_format=piece \ 35 | --inputs="$root_output/${src_lang_code}_mono.txt" \ 36 | --outputs="$root_output/SPM/mono/spm.${SRC_MM100_LANG_CODE}-${TRG_MM100_LANG_CODE}.${SRC_MM100_LANG_CODE}" 37 | 38 | python "$fairseq/scripts/spm_encode.py" \ 39 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 40 | --output_format=piece \ 41 | --inputs="$root_output/${src_lang_code}_mono.txt" \ 42 | --outputs="$root_output/SPM/mono/spm.${SRC_MM100_LANG_CODE}-${TRG_MM100_LANG_CODE}.${TRG_MM100_LANG_CODE}" 43 | 44 | fairseq-preprocess \ 45 | --source-lang "${SRC_MM100_LANG_CODE}" --target-lang "${TRG_MM100_LANG_CODE}" \ 46 | --trainpref "$root_output/SPM/mono/spm.${SRC_MM100_LANG_CODE}-${TRG_MM100_LANG_CODE}" \ 47 | --thresholdsrc 0 --thresholdtgt 0 \ 48 | --destdir "$root_output/data_bin_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}" \ 49 | --srcdict "$fairseq/flores101_mm100_615M/dict.txt" --tgtdict "$fairseq/flores101_mm100_615M/dict.txt" 50 | 51 | fairseq-generate \ 52 | "$root_output/data_bin_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}" \ 53 | --batch-size 256 \ 54 | --skip-invalid-size-inputs-valid-test \ 55 | --path "$fairseq/flores101_mm100_615M/model.pt" \ 56 | --fixed-dictionary "$fairseq/flores101_mm100_615M/dict.txt" \ 57 | -s "${SRC_MM100_LANG_CODE}" -t "${TRG_MM100_LANG_CODE}" \ 58 | --remove-bpe 'sentencepiece' \ 59 | --beam 5 \ 60 | --fp16 \ 61 | --task translation_multi_simple_epoch \ 62 | --lang-pairs "$fairseq/flores101_mm100_615M/language_pairs.txt" \ 63 | --decoder-langtok --encoder-langtok src \ 64 | --gen-subset train \ 65 | --dataset-impl mmap \ 66 | --distributed-world-size 1 --distributed-no-spawn \ 67 | --results-path "$root_output/generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}" 68 | 69 | # clean fairseq generated file to only create hypotheses file. 
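# (generate-train.txt interleaves several record types; each hypothesis line has the
#  form "H-<index>\t<score>\t<text>". The pipeline below keeps only the H- lines,
#  strips the "H-" prefix, restores the original sentence order by numeric index,
#  and prints the last tab-separated field, i.e. the hypothesis text itself.)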
70 | grep -P '^H-' "$root_output/generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}/generate-train.txt" | 71 | cut -c 3- | 72 | sort -n -k 1 | 73 | awk -F "\t" '{print $NF}' \ 74 | >"$root_output/generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}/${TRG_LANG_CODE}.txt" 75 | done 76 | done 77 | -------------------------------------------------------------------------------- /ocr/OCR_impact_BT/translate_mono.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z "$fairseq" ]]; then 4 | echo 'Need to specify the env var fairseq.' 5 | exit 1 6 | fi 7 | 8 | source OCR_impact_BT/lang_codes.source 9 | 10 | TRG_LANG_CODE=eng 11 | TRG_MM100_LANG_CODE=en 12 | 13 | SIZE=20k 14 | 15 | for src_lang_code in "${!LANG_CODES[@]}"; do 16 | for error_rate in {1..22..2}; do 17 | for error_type in replace insert delete; do 18 | data_type=${error_rate}/${error_type} 19 | root_output=Data/backtranslation/data_cer_$SIZE/${data_type} 20 | mkdir -p $root_output/SPM/mono/ 21 | echo $root_output 22 | 23 | src_mm100_lang_code="${LANG_CODES[${src_lang_code}]}" 24 | if [ ! -f "$root_output/${src_lang_code}_mono.txt" ]; then 25 | echo "$root_output/${src_lang_code}_mono.txt" doesn\'t exist 26 | continue 27 | fi 28 | generation_file=$root_output/generation_${src_mm100_lang_code}_${TRG_MM100_LANG_CODE}/${TRG_LANG_CODE}.txt 29 | echo "${src_lang_code}" "${src_mm100_lang_code}" 30 | if [ -s "${generation_file}" ] && [ "$(wc -l < "$generation_file")" -eq 20000 ]; then 31 | echo "${generation_file} exists and has 20000 lines." 32 | continue 33 | else 34 | echo "$generation_file doesn't exist or hasn't 20000 lines" 35 | fi 36 | python "$fairseq/scripts/spm_encode.py" \ 37 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 38 | --output_format=piece \ 39 | --inputs="$root_output/${src_lang_code}_mono.txt" \ 40 | --outputs="$root_output/SPM/mono/spm.${src_mm100_lang_code}-${TRG_MM100_LANG_CODE}.${src_mm100_lang_code}" 41 | 42 | python "$fairseq/scripts/spm_encode.py" \ 43 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 44 | --output_format=piece \ 45 | --inputs="$root_output/${src_lang_code}_mono.txt" \ 46 | --outputs="$root_output/SPM/mono/spm.${src_mm100_lang_code}-${TRG_MM100_LANG_CODE}.${TRG_MM100_LANG_CODE}" 47 | 48 | fairseq-preprocess \ 49 | --source-lang "${src_mm100_lang_code}" --target-lang ${TRG_MM100_LANG_CODE} \ 50 | --trainpref "$root_output/SPM/mono/spm.${src_mm100_lang_code}-${TRG_MM100_LANG_CODE}" \ 51 | --thresholdsrc 0 --thresholdtgt 0 \ 52 | --destdir "$root_output/data_bin_${src_mm100_lang_code}_${TRG_MM100_LANG_CODE}" \ 53 | --srcdict "$fairseq/flores101_mm100_615M/dict.txt" --tgtdict "$fairseq/flores101_mm100_615M/dict.txt" 54 | 55 | fairseq-generate \ 56 | "$root_output/data_bin_${src_mm100_lang_code}_${TRG_MM100_LANG_CODE}" \ 57 | --batch-size 64 \ 58 | --skip-invalid-size-inputs-valid-test \ 59 | --path "$fairseq/flores101_mm100_615M/model.pt" \ 60 | --fixed-dictionary "$fairseq/flores101_mm100_615M/dict.txt" \ 61 | -s "${src_mm100_lang_code}" -t ${TRG_MM100_LANG_CODE} \ 62 | --remove-bpe 'sentencepiece' \ 63 | --beam 5 \ 64 | --fp16 \ 65 | --task translation_multi_simple_epoch \ 66 | --lang-pairs "$fairseq/flores101_mm100_615M/language_pairs.txt" \ 67 | --decoder-langtok --encoder-langtok src \ 68 | --gen-subset train \ 69 | --dataset-impl mmap \ 70 | --distributed-world-size 1 --distributed-no-spawn \ 71 | --results-path 
"$root_output/generation_${src_mm100_lang_code}_${TRG_MM100_LANG_CODE}" 72 | 73 | # clean fairseq generated file to only create hypotheses file. 74 | grep -P '^H-' "$root_output/generation_${src_mm100_lang_code}_${TRG_MM100_LANG_CODE}/generate-train.txt" | 75 | cut -c 3- | 76 | sort -n -k 1 | 77 | awk -F "\t" '{print $NF}' \ 78 | >"$root_output/generation_${src_mm100_lang_code}_${TRG_MM100_LANG_CODE}/${TRG_LANG_CODE}.txt" 79 | done 80 | done 81 | done 82 | -------------------------------------------------------------------------------- /previous_releases/floresv1/prepare-sien.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/bin/bash 8 | 9 | SRC=si 10 | TGT=en 11 | 12 | BPESIZE=5000 13 | TRAIN_MINLEN=6 # remove sentences with <6 BPE tokens 14 | TRAIN_MAXLEN=250 # remove sentences with >250 BPE tokens 15 | 16 | ROOT=$(dirname "$0") 17 | SCRIPTS=$ROOT/scripts 18 | DATA=$ROOT/data 19 | TMP=$DATA/wiki_${SRC}_${TGT}_bpe${BPESIZE} 20 | DATABIN=$ROOT/data-bin/wiki_${SRC}_${TGT}_bpe${BPESIZE} 21 | mkdir -p $TMP $DATABIN 22 | 23 | SRC_TOKENIZER="bash $SCRIPTS/indic_norm_tok.sh $SRC" 24 | TGT_TOKENIZER="cat" # learn target-side BPE over untokenized (raw) text 25 | SPM_TRAIN=$SCRIPTS/spm_train.py 26 | SPM_ENCODE=$SCRIPTS/spm_encode.py 27 | 28 | URLS=( 29 | "https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz" 30 | ) 31 | ARCHIVES=( 32 | "wikipedia_en_ne_si_test_sets.tgz" 33 | ) 34 | TRAIN_SETS=( 35 | "all-clean-si/GNOMEKDEUbuntu.en-si" 36 | "all-clean-si/OpenSubtitles2018.en-si" 37 | ) 38 | VALID_SET="wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en" 39 | TEST_SET="wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en" 40 | 41 | if [ ! -d $DATA/all-clean-si ]; then 42 | echo "Data directory not found. Please run 'bash download-data.sh' first..." 43 | exit -1 44 | fi 45 | 46 | # download and extract data 47 | for ((i=0;i<${#URLS[@]};++i)); do 48 | ARCHIVE=$DATA/${ARCHIVES[i]} 49 | if [ -f $ARCHIVE ]; then 50 | echo "$ARCHIVE already exists, skipping download" 51 | else 52 | URL=${URLS[i]} 53 | wget -P $DATA "$URL" 54 | if [ -f $ARCHIVE ]; then 55 | echo "$URL successfully downloaded." 56 | else 57 | echo "$URL not successfully downloaded." 58 | exit -1 59 | fi 60 | fi 61 | FILE=${ARCHIVE: -4} 62 | if [ -e $FILE ]; then 63 | echo "$FILE already exists, skipping extraction" 64 | else 65 | tar -C $DATA -xzvf $ARCHIVE 66 | fi 67 | done 68 | 69 | echo "pre-processing train data..." 70 | bash $SCRIPTS/download_indic.sh 71 | for FILE in "${TRAIN_SETS[@]}" ; do 72 | $SRC_TOKENIZER $DATA/$FILE.$SRC 73 | done > $TMP/train.$SRC 74 | for FILE in "${TRAIN_SETS[@]}"; do 75 | $TGT_TOKENIZER $DATA/$FILE.$TGT 76 | done > $TMP/train.$TGT 77 | 78 | echo "pre-processing dev/test data..." 
79 | $SRC_TOKENIZER $DATA/${VALID_SET}.$SRC > $TMP/valid.$SRC 80 | $TGT_TOKENIZER $DATA/${VALID_SET}.$TGT > $TMP/valid.$TGT 81 | $SRC_TOKENIZER $DATA/${TEST_SET}.$SRC > $TMP/test.$SRC 82 | $TGT_TOKENIZER $DATA/${TEST_SET}.$TGT > $TMP/test.$TGT 83 | 84 | # learn BPE with sentencepiece 85 | python $SPM_TRAIN \ 86 | --input=$TMP/train.$SRC,$TMP/train.$TGT \ 87 | --model_prefix=$DATABIN/sentencepiece.bpe \ 88 | --vocab_size=$BPESIZE \ 89 | --character_coverage=1.0 \ 90 | --model_type=bpe 91 | 92 | # encode train/valid/test 93 | python $SPM_ENCODE \ 94 | --model $DATABIN/sentencepiece.bpe.model \ 95 | --output_format=piece \ 96 | --inputs $TMP/train.$SRC $TMP/train.$TGT \ 97 | --outputs $TMP/train.bpe.$SRC $TMP/train.bpe.$TGT \ 98 | --min-len $TRAIN_MINLEN --max-len $TRAIN_MAXLEN 99 | for SPLIT in "valid" "test"; do \ 100 | python $SPM_ENCODE \ 101 | --model $DATABIN/sentencepiece.bpe.model \ 102 | --output_format=piece \ 103 | --inputs $TMP/$SPLIT.$SRC $TMP/$SPLIT.$TGT \ 104 | --outputs $TMP/$SPLIT.bpe.$SRC $TMP/$SPLIT.bpe.$TGT 105 | done 106 | 107 | # binarize data 108 | fairseq-preprocess \ 109 | --source-lang $SRC --target-lang $TGT \ 110 | --trainpref $TMP/train.bpe --validpref $TMP/valid.bpe --testpref $TMP/test.bpe \ 111 | --destdir $DATABIN \ 112 | --joined-dictionary \ 113 | --workers 4 114 | -------------------------------------------------------------------------------- /ocr/OCR_eval/metrics.py: -------------------------------------------------------------------------------- 1 | """Script to compute character error rate and word error rate between a predicted text and its "gold" target text. 2 | 3 | Usage: 4 | python metrics.py --pred [predicted_filename] --tgt [target_filename] 5 | 6 | Copyright (c) 2021, Shruti Rijhwani 7 | All rights reserved. 
8 | 9 | This source code is licensed under the BSD-style license found in the 10 | LICENSE file in the root directory of 11 | """ 12 | import argparse 13 | import unicodedata 14 | from collections import defaultdict 15 | 16 | import editdistance as ed 17 | 18 | 19 | def check_splitted_words(predicted_text): 20 | new_predicted = [] 21 | pred_spl = predicted_text.split() 22 | i = 0 23 | while i < len(pred_spl) - 1: 24 | if pred_spl[i][-1] == "-": 25 | new_word = pred_spl[i][:-1] + pred_spl[i + 1] 26 | new_predicted.append(new_word) 27 | i = i + 2 28 | else: 29 | new_predicted.append(pred_spl[i]) 30 | i = i + 1 31 | return " ".join(new_predicted) 32 | 33 | 34 | class ErrorMetrics: 35 | def preprocess(self, text): 36 | preprocessed = " ".join(text.strip().split()) 37 | return preprocessed 38 | 39 | def calculate_metrics(self, predicted_text, transcript): 40 | print("Metrics:") 41 | predicted_text = predicted_text.replace("\u200d", "") 42 | predicted_text = predicted_text.replace("\u200c", "") 43 | transcript = transcript.replace("\u200d", "") 44 | transcript = transcript.replace("\u200c", "") 45 | 46 | # need to normalize unicode representation for vietnamese 47 | predicted_text = unicodedata.normalize("NFC", predicted_text) 48 | transcript = unicodedata.normalize("NFC", transcript) 49 | 50 | # check if there is any word split by "-" 51 | if "-" in predicted_text: 52 | predicted_text = check_splitted_words(predicted_text) 53 | 54 | cer = ed.eval(predicted_text, transcript) / float(len(transcript)) 55 | pred_spl = predicted_text.split() 56 | transcript_spl = transcript.split() 57 | 58 | wer = ed.eval(pred_spl, transcript_spl) / float(len(transcript_spl)) 59 | return cer, wer 60 | 61 | 62 | def compute_metrics(pred, tgt): 63 | metrics = ErrorMetrics() 64 | 65 | pred_lines = open(pred, encoding="utf8").readlines() 66 | tgt_lines = open(tgt, encoding="utf8").readlines() 67 | 68 | predicted = [] 69 | transcripts = [] 70 | for pred_line, tgt_line in zip(pred_lines, tgt_lines): 71 | if not tgt_line.strip(): 72 | continue 73 | predicted.append(metrics.preprocess(pred_line)) 74 | transcripts.append(metrics.preprocess(tgt_line)) 75 | 76 | cer, wer = metrics.calculate_metrics("\n".join(predicted), "\n".join(transcripts)) 77 | return cer * 100, wer * 100 78 | 79 | 80 | if __name__ == "__main__": 81 | 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument("--pred", help="Predicted text.") 84 | parser.add_argument("--tgt", help="Target text.") 85 | args = parser.parse_args() 86 | 87 | errors = defaultdict(lambda: 0) 88 | char_counts = defaultdict(lambda: 0) 89 | 90 | metrics = ErrorMetrics() 91 | 92 | pred_lines = open(args.pred, encoding="utf8").readlines() 93 | tgt_lines = open(args.tgt, encoding="utf8").readlines() 94 | 95 | predicted, transcripts = [], [] 96 | for pred_line, tgt_line in zip(pred_lines, tgt_lines): 97 | if not tgt_line.strip(): 98 | continue 99 | predicted.append(metrics.preprocess(pred_line)) 100 | transcripts.append(metrics.preprocess(tgt_line)) 101 | 102 | cer, wer = metrics.calculate_metrics("\n".join(predicted), "\n".join(transcripts)) 103 | print("CER {:.2f}".format(cer * 100)) 104 | print("WER {:.2f}".format(wer * 100)) 105 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/spm_encode.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
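# Example invocation (as used by prepare-sien.sh, paths simplified here): encode both
# sides of a parallel corpus with a trained sentencepiece model, dropping sentence
# pairs where either side falls outside the 6-250 token range:
#   python spm_encode.py --model sentencepiece.bpe.model --output_format=piece \
#       --inputs train.si train.en --outputs train.bpe.si train.bpe.en \
#       --min-len 6 --max-len 250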
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/usr/bin/env python 8 | 9 | from __future__ import absolute_import, division, print_function, unicode_literals 10 | 11 | import argparse 12 | import contextlib 13 | import sys 14 | 15 | import sentencepiece as spm 16 | 17 | 18 | def main(): 19 | parser = argparse.ArgumentParser() 20 | parser.add_argument("--model", required=True, 21 | help="sentencepiece model to use for encoding") 22 | parser.add_argument("--inputs", nargs="+", default=['-'], 23 | help="input files to filter/encode") 24 | parser.add_argument("--outputs", nargs="+", default=['-'], 25 | help="path to save encoded outputs") 26 | parser.add_argument("--output_format", choices=["piece", "id"], default="piece") 27 | parser.add_argument("--min-len", type=int, metavar="N", 28 | help="filter sentence pairs with fewer than N tokens") 29 | parser.add_argument("--max-len", type=int, metavar="N", 30 | help="filter sentence pairs with more than N tokens") 31 | args = parser.parse_args() 32 | 33 | assert len(args.inputs) == len(args.outputs), \ 34 | "number of input and output paths should match" 35 | 36 | sp = spm.SentencePieceProcessor() 37 | sp.Load(args.model) 38 | 39 | if args.output_format == "piece": 40 | def encode(l): 41 | return sp.EncodeAsPieces(l) 42 | elif args.output_format == "id": 43 | def encode(l): 44 | return list(map(str, sp.EncodeAsIds(l))) 45 | else: 46 | raise NotImplementedError 47 | 48 | if args.min_len is not None or args.max_len is not None: 49 | def valid(line): 50 | return ( 51 | (args.min_len is None or len(line) >= args.min_len) 52 | and (args.max_len is None or len(line) <= args.max_len) 53 | ) 54 | else: 55 | def valid(lines): 56 | return True 57 | 58 | with contextlib.ExitStack() as stack: 59 | inputs = [ 60 | stack.enter_context(open(input, "r", encoding="utf-8", newline="\n", errors="ignore")) 61 | if input != "-" else sys.stdin 62 | for input in args.inputs 63 | ] 64 | outputs = [ 65 | stack.enter_context(open(output, "w", encoding="utf-8", newline="\n")) 66 | if output != "-" else sys.stdout 67 | for output in args.outputs 68 | ] 69 | 70 | stats = { 71 | "num_empty": 0, 72 | "num_filtered": 0, 73 | } 74 | 75 | def encode_line(line): 76 | line = line.strip() 77 | if len(line) > 0: 78 | line = encode(line) 79 | if valid(line): 80 | return line 81 | else: 82 | stats["num_filtered"] += 1 83 | else: 84 | stats["num_empty"] += 1 85 | return None 86 | 87 | for i, lines in enumerate(zip(*inputs), start=1): 88 | enc_lines = list(map(encode_line, lines)) 89 | if not any(enc_line is None for enc_line in enc_lines): 90 | for enc_line, output_h in zip(enc_lines, outputs): 91 | print(" ".join(enc_line), file=output_h) 92 | if i % 10000 == 0: 93 | print("processed {} lines".format(i), file=sys.stderr) 94 | 95 | print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr) 96 | print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr) 97 | 98 | 99 | if __name__ == "__main__": 100 | main() 101 | -------------------------------------------------------------------------------- /previous_releases/floresv1/prepare-neen.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | #!/bin/bash 8 | 9 | SRC=ne 10 | TGT=en 11 | 12 | BPESIZE=5000 13 | TRAIN_MINLEN=1 # remove sentences with <1 BPE token 14 | TRAIN_MAXLEN=250 # remove sentences with >250 BPE tokens 15 | 16 | ROOT=$(dirname "$0") 17 | SCRIPTS=$ROOT/scripts 18 | DATA=$ROOT/data 19 | TMP=$DATA/wiki_${SRC}_${TGT}_bpe${BPESIZE} 20 | DATABIN=$ROOT/data-bin/wiki_${SRC}_${TGT}_bpe${BPESIZE} 21 | mkdir -p $TMP $DATABIN 22 | 23 | SRC_TOKENIZER="bash $SCRIPTS/indic_norm_tok.sh $SRC" 24 | TGT_TOKENIZER="cat" # learn target-side BPE over untokenized (raw) text 25 | SPM_TRAIN=$SCRIPTS/spm_train.py 26 | SPM_ENCODE=$SCRIPTS/spm_encode.py 27 | 28 | URLS=( 29 | "https://github.com/facebookresearch/flores/raw/main/floresv1/data/wikipedia_en_ne_si_test_sets.tgz" 30 | ) 31 | ARCHIVES=( 32 | "wikipedia_en_ne_si_test_sets.tgz" 33 | ) 34 | TRAIN_SETS=( 35 | "all-clean-ne/bible_dup.en-ne" 36 | "all-clean-ne/bible.en-ne" 37 | "all-clean-ne/globalvoices.2018q4.ne-en" 38 | "all-clean-ne/GNOMEKDEUbuntu.en-ne" 39 | "all-clean-ne/nepali-penn-treebank" 40 | ) 41 | VALID_SET="wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en" 42 | TEST_SET="wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en" 43 | 44 | if [ ! -d $DATA/all-clean-ne ]; then 45 | echo "Data directory not found. Please run 'bash download-data.sh' first..." 46 | exit -1 47 | fi 48 | 49 | # download and extract data 50 | for ((i=0;i<${#URLS[@]};++i)); do 51 | ARCHIVE=$DATA/${ARCHIVES[i]} 52 | if [ -f $ARCHIVE ]; then 53 | echo "$ARCHIVE already exists, skipping download" 54 | else 55 | URL=${URLS[i]} 56 | wget -P $DATA "$URL" 57 | if [ -f $ARCHIVE ]; then 58 | echo "$URL successfully downloaded." 59 | else 60 | echo "$URL not successfully downloaded." 61 | exit -1 62 | fi 63 | fi 64 | FILE=${ARCHIVE: -4} 65 | if [ -e $FILE ]; then 66 | echo "$FILE already exists, skipping extraction" 67 | else 68 | tar -C $DATA -xzvf $ARCHIVE 69 | fi 70 | done 71 | 72 | echo "pre-processing train data..." 73 | bash $SCRIPTS/download_indic.sh 74 | for FILE in "${TRAIN_SETS[@]}" ; do 75 | $SRC_TOKENIZER $DATA/$FILE.$SRC 76 | done > $TMP/train.$SRC 77 | for FILE in "${TRAIN_SETS[@]}"; do 78 | $TGT_TOKENIZER $DATA/$FILE.$TGT 79 | done > $TMP/train.$TGT 80 | 81 | echo "pre-processing dev/test data..." 
82 | $SRC_TOKENIZER $DATA/${VALID_SET}.$SRC > $TMP/valid.$SRC 83 | $TGT_TOKENIZER $DATA/${VALID_SET}.$TGT > $TMP/valid.$TGT 84 | $SRC_TOKENIZER $DATA/${TEST_SET}.$SRC > $TMP/test.$SRC 85 | $TGT_TOKENIZER $DATA/${TEST_SET}.$TGT > $TMP/test.$TGT 86 | 87 | # learn BPE with sentencepiece 88 | python $SPM_TRAIN \ 89 | --input=$TMP/train.$SRC,$TMP/train.$TGT \ 90 | --model_prefix=$DATABIN/sentencepiece.bpe \ 91 | --vocab_size=$BPESIZE \ 92 | --character_coverage=1.0 \ 93 | --model_type=bpe 94 | 95 | # encode train/valid/test 96 | python $SPM_ENCODE \ 97 | --model $DATABIN/sentencepiece.bpe.model \ 98 | --output_format=piece \ 99 | --inputs $TMP/train.$SRC $TMP/train.$TGT \ 100 | --outputs $TMP/train.bpe.$SRC $TMP/train.bpe.$TGT \ 101 | --min-len $TRAIN_MINLEN --max-len $TRAIN_MAXLEN 102 | for SPLIT in "valid" "test"; do \ 103 | python $SPM_ENCODE \ 104 | --model $DATABIN/sentencepiece.bpe.model \ 105 | --output_format=piece \ 106 | --inputs $TMP/$SPLIT.$SRC $TMP/$SPLIT.$TGT \ 107 | --outputs $TMP/$SPLIT.bpe.$SRC $TMP/$SPLIT.bpe.$TGT 108 | done 109 | 110 | # binarize data 111 | fairseq-preprocess \ 112 | --source-lang $SRC --target-lang $TGT \ 113 | --trainpref $TMP/train.bpe --validpref $TMP/valid.bpe --testpref $TMP/test.bpe \ 114 | --destdir $DATABIN \ 115 | --joined-dictionary \ 116 | --workers 4 117 | -------------------------------------------------------------------------------- /previous_releases/floresv1/prepare-monolingual.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/bin/bash 8 | # Downloads the data and creates data/all-clean.tgz within the current directory 9 | 10 | set -e 11 | set -o pipefail 12 | set -x 13 | 14 | ROOT=$(dirname "$0") 15 | DATA=$ROOT/data/mono 16 | NUM_MONO=5000000 17 | 18 | REMOVE_FILE_PATHS=() 19 | 20 | # Download data 21 | download_data() { 22 | CORPORA=$1 23 | URL=$2 24 | 25 | echo "Downloading $URL" 26 | wget -c $URL -O $CORPORA --no-check-certificate || rm -f $CORPORA 27 | if [ -f $CORPORA ]; then 28 | echo "$URL successfully downloaded." 29 | else 30 | echo "$URL not successfully downloaded." 31 | exit -1 32 | fi 33 | } 34 | 35 | # Prepare parallel data to generate sentence piece model 36 | if ! [[ -f data-bin/wiki_ne_en_bpe5000/sentencepiece.bpe.model ]]; then 37 | bash prepare-neen.sh 38 | fi 39 | if ! [[ -f data-bin/wiki_si_en_bpe5000/sentencepiece.bpe.model ]]; then 40 | bash prepare-sien.sh 41 | fi 42 | 43 | mkdir -p $DATA 44 | 45 | if ! [[ -f $DATA/mono.sample"$NUM_MONO".en ]]; then 46 | download_data wikipedia_en_filtered.gz https://dl.fbaipublicfiles.com/fairseq/data/wikipedia.en_filtered.gz 47 | REMOVE_FILE_PATHS+=( wikipedia_en_filtered.gz ) 48 | gunzip -c wikipedia_en_filtered.gz > $DATA/mono.en 49 | cat $DATA/mono.en | ./scripts/shuf.py --seed 42 -n $NUM_MONO > $DATA/mono.sample"$NUM_MONO".en 50 | fi 51 | 52 | if ! 
[[ -f $DATA/mono.sample"$NUM_MONO".ne ]]; then 53 | download_data wikipedia_ne_filtered.gz https://dl.fbaipublicfiles.com/fairseq/data/wikipedia.ne_filtered.gz 54 | download_data commoncrawl.deduped.ne.xz http://data.statmt.org/wmt19/parallel-corpus-filtering/commoncrawl.deduped.ne.xz 55 | REMOVE_FILE_PATHS+=( wikipedia_ne_filtered.gz commoncrawl.deduped.ne.xz ) 56 | gunzip -c wikipedia_ne_filtered.gz > $DATA/mono.ne 57 | unxz -c commoncrawl.deduped.ne.xz >> $DATA/mono.ne 58 | cat $DATA/mono.ne | ./scripts/shuf.py --seed 43 -n $NUM_MONO > $DATA/mono.sample"$NUM_MONO".ne 59 | fi 60 | 61 | if ! [[ -f $DATA/mono.sample"$NUM_MONO".si ]]; then 62 | download_data wikipedia_si_filtered.gz https://dl.fbaipublicfiles.com/fairseq/data/wikipedia.si_filtered.gz 63 | download_data commoncrawl.deduped.si.xz http://data.statmt.org/wmt19/parallel-corpus-filtering/commoncrawl.deduped.si.xz 64 | REMOVE_FILE_PATHS+=( wikipedia_si_filtered.gz commoncrawl.deduped.si.xz ) 65 | gunzip -c wikipedia_si_filtered.gz > $DATA/mono.si 66 | unxz -c commoncrawl.deduped.si.xz >> $DATA/mono.si 67 | cat $DATA/mono.si | ./scripts/shuf.py --seed 44 -n $NUM_MONO > $DATA/mono.sample"$NUM_MONO".si 68 | fi 69 | 70 | ROOT=$(dirname "$0") 71 | SCRIPTS=$ROOT/scripts 72 | SRC_TOKENIZER="bash $SCRIPTS/indic_norm_tok.sh" 73 | TGT_TOKENIZER="cat" # learn target-side BPE over untokenized (raw) text 74 | SPM_TRAIN=$SCRIPTS/spm_train.py 75 | SPM_ENCODE=$SCRIPTS/spm_encode.py 76 | 77 | echo "pre-processing monolingual data..." 78 | mkdir -p $DATA/neen 79 | bash $SCRIPTS/download_indic.sh 80 | if ! [[ -f $DATA/neen/mono.bpe.ne ]]; then 81 | $SRC_TOKENIZER ne $DATA/mono.sample"$NUM_MONO".ne | python $SPM_ENCODE \ 82 | --model data-bin/wiki_ne_en_bpe5000/sentencepiece.bpe.model \ 83 | --output_format=piece \ 84 | --inputs - \ 85 | --outputs $DATA/neen/mono.bpe.ne 86 | fi 87 | 88 | if ! [[ -f $DATA/neen/mono.bpe.en ]]; then 89 | $TGT_TOKENIZER $DATA/mono.sample"$NUM_MONO".en | python $SPM_ENCODE \ 90 | --model data-bin/wiki_ne_en_bpe5000/sentencepiece.bpe.model \ 91 | --output_format=piece \ 92 | --inputs - \ 93 | --outputs $DATA/neen/mono.bpe.en 94 | fi 95 | 96 | 97 | if ! [[ -f $DATA/sien/mono.bpe.si ]]; then 98 | mkdir -p $DATA/sien 99 | $SRC_TOKENIZER si $DATA/mono.sample"$NUM_MONO".si | python $SPM_ENCODE \ 100 | --model data-bin/wiki_si_en_bpe5000/sentencepiece.bpe.model \ 101 | --output_format=piece \ 102 | --inputs - \ 103 | --outputs $DATA/sien/mono.bpe.si 104 | fi 105 | 106 | if ! 
[[ -f $DATA/sien/mono.bpe.en ]]; then 107 | $TGT_TOKENIZER $DATA/mono.sample"$NUM_MONO".en | python $SPM_ENCODE \ 108 | --model data-bin/wiki_si_en_bpe5000/sentencepiece.bpe.model \ 109 | --output_format=piece \ 110 | --inputs - \ 111 | --outputs $DATA/sien/mono.bpe.en 112 | fi 113 | 114 | # Remove the temporary files 115 | for ((i=0;i<${#REMOVE_FILE_PATHS[@]};++i)); do 116 | rm -rf ${REMOVE_FILE_PATHS[i]} 117 | done 118 | -------------------------------------------------------------------------------- /previous_releases/floresv1/configs/enne.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "sup", 4 | { 5 | "src": "en", 6 | "tgt": "ne", 7 | "train": { 8 | "parameters": [ 9 | "--ddp-backend no_c10d", 10 | "--arch transformer", 11 | "--share-all-embeddings", 12 | "--encoder-layers 5", 13 | "--decoder-layers 5", 14 | "--encoder-embed-dim 512", 15 | "--decoder-embed-dim 512", 16 | "--encoder-ffn-embed-dim 2048", 17 | "--decoder-ffn-embed-dim 2048", 18 | "--encoder-attention-heads 8", 19 | "--decoder-attention-heads 8", 20 | "--encoder-normalize-before", 21 | "--decoder-normalize-before", 22 | "--attention-dropout 0.3", 23 | "--relu-dropout 0.3", 24 | "--optimizer adam", 25 | "--adam-betas \"(0.9, 0.98)\"", 26 | "--lr-scheduler inverse_sqrt", 27 | "--warmup-init-lr 1e-7", 28 | "--warmup-updates 4000", 29 | "--lr 0.003", 30 | "--min-lr 1e-9", 31 | "--clip-norm 0.0", 32 | "--dropout 0.3", 33 | "--criterion label_smoothed_cross_entropy", 34 | "--label-smoothing 0.2", 35 | "--max-tokens 4000", 36 | "--seed 1", 37 | "--log-format json", 38 | "--log-interval 100", 39 | "--max-epoch 100", 40 | "--save-interval 1", 41 | "--no-epoch-checkpoints" 42 | ], 43 | "gpu": 4 44 | }, 45 | "translate": { 46 | "lenpen": 1.5, 47 | "max_token": 30000, 48 | "mono": "data/mono/neen/mono.bpe.en" 49 | } 50 | } 51 | ], 52 | [ 53 | "bt_iter1", 54 | { 55 | "src": "ne", 56 | "tgt": "en", 57 | "train": { 58 | "parameters": [ 59 | "--fp16", 60 | "--ddp-backend no_c10d", 61 | "--arch transformer", 62 | "--share-all-embeddings", 63 | "--encoder-layers 6", 64 | "--decoder-layers 6", 65 | "--encoder-embed-dim 1024", 66 | "--decoder-embed-dim 1024", 67 | "--encoder-ffn-embed-dim 4096", 68 | "--decoder-ffn-embed-dim 4096", 69 | "--encoder-attention-heads 16", 70 | "--decoder-attention-heads 16", 71 | "--encoder-normalize-before", 72 | "--decoder-normalize-before", 73 | "--attention-dropout 0.1", 74 | "--relu-dropout 0.1", 75 | "--optimizer adam", 76 | "--adam-betas \"(0.9, 0.98)\"", 77 | "--lr-scheduler inverse_sqrt", 78 | "--warmup-init-lr 1e-7", 79 | "--warmup-updates 4000", 80 | "--lr 0.001", 81 | "--min-lr 1e-9", 82 | "--clip-norm 0.0", 83 | "--dropout 0.1", 84 | "--criterion label_smoothed_cross_entropy", 85 | "--label-smoothing 0.2", 86 | "--upsample-primary 1", 87 | "--max-tokens 4000", 88 | "--seed 1", 89 | "--log-format json", 90 | "--log-interval 100", 91 | "--max-update 120000", 92 | "--save-interval-updates 2000", 93 | "--keep-interval-updates 1", 94 | "--no-epoch-checkpoints" 95 | ], 96 | "gpu": 8 97 | }, 98 | "translate": { 99 | "lenpen": 0.8, 100 | "max_token": 20000, 101 | "mono": "data/mono/neen/mono.bpe.ne" 102 | } 103 | } 104 | ], 105 | [ 106 | "bt_iter2", 107 | { 108 | "src": "en", 109 | "tgt": "ne", 110 | "train": { 111 | "parameters": [ 112 | "--fp16", 113 | "--ddp-backend no_c10d", 114 | "--arch transformer", 115 | "--share-all-embeddings", 116 | "--encoder-layers 6", 117 | "--decoder-layers 6", 118 | "--encoder-embed-dim 1024", 119 | "--decoder-embed-dim 
1024", 120 | "--encoder-ffn-embed-dim 4096", 121 | "--decoder-ffn-embed-dim 4096", 122 | "--encoder-attention-heads 4", 123 | "--decoder-attention-heads 4", 124 | "--encoder-normalize-before", 125 | "--decoder-normalize-before", 126 | "--attention-dropout 0.2", 127 | "--relu-dropout 0.2", 128 | "--optimizer adam", 129 | "--adam-betas \"(0.9, 0.98)\"", 130 | "--lr-scheduler inverse_sqrt", 131 | "--warmup-init-lr 1e-7", 132 | "--warmup-updates 4000", 133 | "--lr 0.0007", 134 | "--min-lr 1e-9", 135 | "--clip-norm 0.0", 136 | "--dropout 0.2", 137 | "--criterion label_smoothed_cross_entropy", 138 | "--label-smoothing 0.2", 139 | "--upsample-primary 7", 140 | "--max-tokens 4000", 141 | "--seed 1", 142 | "--log-format json", 143 | "--log-interval 100", 144 | "--max-update 90000", 145 | "--save-interval-updates 2000", 146 | "--keep-interval-updates 1", 147 | "--no-epoch-checkpoints" 148 | ], 149 | "gpu": 8 150 | }, 151 | "translate": { 152 | "lenpen": 0.8, 153 | "max_token": 20000, 154 | "mono": "data/mono/neen/mono.bpe.en" 155 | } 156 | } 157 | ] 158 | ] 159 | -------------------------------------------------------------------------------- /previous_releases/floresv1/configs/ensi.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "sup", 4 | { 5 | "src": "en", 6 | "tgt": "si", 7 | "train": { 8 | "parameters": [ 9 | "--ddp-backend no_c10d", 10 | "--arch transformer", 11 | "--share-all-embeddings", 12 | "--encoder-layers 5", 13 | "--decoder-layers 5", 14 | "--encoder-embed-dim 512", 15 | "--decoder-embed-dim 512", 16 | "--encoder-ffn-embed-dim 2048", 17 | "--decoder-ffn-embed-dim 2048", 18 | "--encoder-attention-heads 8", 19 | "--decoder-attention-heads 8", 20 | "--encoder-normalize-before", 21 | "--decoder-normalize-before", 22 | "--attention-dropout 0.3", 23 | "--relu-dropout 0.3", 24 | "--optimizer adam", 25 | "--adam-betas \"(0.9, 0.98)\"", 26 | "--lr-scheduler inverse_sqrt", 27 | "--warmup-init-lr 1e-7", 28 | "--warmup-updates 4000", 29 | "--lr 0.003", 30 | "--min-lr 1e-9", 31 | "--clip-norm 0.0", 32 | "--dropout 0.3", 33 | "--criterion label_smoothed_cross_entropy", 34 | "--label-smoothing 0.2", 35 | "--max-tokens 4000", 36 | "--seed 1", 37 | "--log-format json", 38 | "--log-interval 100", 39 | "--max-epoch 100", 40 | "--save-interval 1", 41 | "--no-epoch-checkpoints" 42 | ], 43 | "gpu": 4 44 | }, 45 | "translate": { 46 | "lenpen": 1.5, 47 | "max_token": 30000, 48 | "mono": "data/mono/sien/mono.bpe.en" 49 | } 50 | } 51 | ], 52 | [ 53 | "bt_iter1", 54 | { 55 | "src": "si", 56 | "tgt": "en", 57 | "train": { 58 | "parameters": [ 59 | "--fp16", 60 | "--ddp-backend no_c10d", 61 | "--arch transformer", 62 | "--share-all-embeddings", 63 | "--encoder-layers 6", 64 | "--decoder-layers 6", 65 | "--encoder-embed-dim 1024", 66 | "--decoder-embed-dim 1024", 67 | "--encoder-ffn-embed-dim 4096", 68 | "--decoder-ffn-embed-dim 4096", 69 | "--encoder-attention-heads 16", 70 | "--decoder-attention-heads 16", 71 | "--encoder-normalize-before", 72 | "--decoder-normalize-before", 73 | "--attention-dropout 0.1", 74 | "--relu-dropout 0.1", 75 | "--optimizer adam", 76 | "--adam-betas \"(0.9, 0.98)\"", 77 | "--lr-scheduler inverse_sqrt", 78 | "--warmup-init-lr 1e-7", 79 | "--warmup-updates 4000", 80 | "--lr 0.001", 81 | "--min-lr 1e-9", 82 | "--clip-norm 0.0", 83 | "--dropout 0.1", 84 | "--criterion label_smoothed_cross_entropy", 85 | "--label-smoothing 0.2", 86 | "--upsample-primary 1", 87 | "--max-tokens 4000", 88 | "--seed 1", 89 | "--log-format json", 90 | 
"--log-interval 100", 91 | "--max-update 120000", 92 | "--save-interval-updates 2000", 93 | "--keep-interval-updates 1", 94 | "--no-epoch-checkpoints" 95 | ], 96 | "gpu": 8 97 | }, 98 | "translate": { 99 | "lenpen": 2.5, 100 | "max_token": 20000, 101 | "mono": "data/mono/sien/mono.bpe.si" 102 | } 103 | } 104 | ], 105 | [ 106 | "bt_iter2", 107 | { 108 | "src": "en", 109 | "tgt": "si", 110 | "train": { 111 | "parameters": [ 112 | "--fp16", 113 | "--ddp-backend no_c10d", 114 | "--arch transformer", 115 | "--share-all-embeddings", 116 | "--encoder-layers 6", 117 | "--decoder-layers 6", 118 | "--encoder-embed-dim 1024", 119 | "--decoder-embed-dim 1024", 120 | "--encoder-ffn-embed-dim 4096", 121 | "--decoder-ffn-embed-dim 4096", 122 | "--encoder-attention-heads 4", 123 | "--decoder-attention-heads 4", 124 | "--encoder-normalize-before", 125 | "--decoder-normalize-before", 126 | "--attention-dropout 0.2", 127 | "--relu-dropout 0.2", 128 | "--optimizer adam", 129 | "--adam-betas \"(0.9, 0.98)\"", 130 | "--lr-scheduler inverse_sqrt", 131 | "--warmup-init-lr 1e-7", 132 | "--warmup-updates 4000", 133 | "--lr 0.0007", 134 | "--min-lr 1e-9", 135 | "--clip-norm 0.0", 136 | "--dropout 0.2", 137 | "--criterion label_smoothed_cross_entropy", 138 | "--label-smoothing 0.2", 139 | "--upsample-primary 7", 140 | "--max-tokens 4000", 141 | "--seed 1", 142 | "--log-format json", 143 | "--log-interval 100", 144 | "--max-update 90000", 145 | "--save-interval-updates 2000", 146 | "--keep-interval-updates 1", 147 | "--no-epoch-checkpoints" 148 | ], 149 | "gpu": 8 150 | }, 151 | "translate": { 152 | "lenpen": 0.8, 153 | "max_token": 20000, 154 | "mono": "data/mono/sien/mono.bpe.en" 155 | } 156 | } 157 | ] 158 | ] 159 | -------------------------------------------------------------------------------- /previous_releases/floresv1/configs/neen.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "sup", 4 | { 5 | "src": "ne", 6 | "tgt": "en", 7 | "train": { 8 | "parameters": [ 9 | "--fp16", 10 | "--ddp-backend no_c10d", 11 | "--arch transformer", 12 | "--share-all-embeddings", 13 | "--encoder-layers 5", 14 | "--decoder-layers 5", 15 | "--encoder-embed-dim 512", 16 | "--decoder-embed-dim 512", 17 | "--encoder-ffn-embed-dim 2048", 18 | "--decoder-ffn-embed-dim 2048", 19 | "--encoder-attention-heads 8", 20 | "--decoder-attention-heads 8", 21 | "--encoder-normalize-before", 22 | "--decoder-normalize-before", 23 | "--attention-dropout 0.3", 24 | "--relu-dropout 0.3", 25 | "--optimizer adam", 26 | "--adam-betas \"(0.9, 0.98)\"", 27 | "--lr-scheduler inverse_sqrt", 28 | "--warmup-init-lr 1e-7", 29 | "--warmup-updates 4000", 30 | "--lr 0.003", 31 | "--min-lr 1e-9", 32 | "--clip-norm 0.0", 33 | "--dropout 0.3", 34 | "--criterion label_smoothed_cross_entropy", 35 | "--label-smoothing 0.2", 36 | "--max-tokens 4000", 37 | "--seed 1", 38 | "--log-format json", 39 | "--log-interval 100", 40 | "--max-epoch 100", 41 | "--save-interval 1", 42 | "--no-epoch-checkpoints" 43 | ], 44 | "gpu": 4 45 | }, 46 | "translate": { 47 | "lenpen": 1.5, 48 | "max_token": 30000, 49 | "mono": "data/mono/neen/mono.bpe.ne" 50 | } 51 | } 52 | ], 53 | [ 54 | "bt_iter1", 55 | { 56 | "src": "en", 57 | "tgt": "ne", 58 | "train": { 59 | "parameters": [ 60 | "--fp16", 61 | "--ddp-backend no_c10d", 62 | "--arch transformer", 63 | "--share-all-embeddings", 64 | "--encoder-layers 6", 65 | "--decoder-layers 6", 66 | "--encoder-embed-dim 1024", 67 | "--decoder-embed-dim 1024", 68 | "--encoder-ffn-embed-dim 4096", 69 | 
"--decoder-ffn-embed-dim 4096", 70 | "--encoder-attention-heads 4", 71 | "--decoder-attention-heads 4", 72 | "--encoder-normalize-before", 73 | "--decoder-normalize-before", 74 | "--attention-dropout 0.2", 75 | "--relu-dropout 0.2", 76 | "--optimizer adam", 77 | "--adam-betas \"(0.9, 0.98)\"", 78 | "--lr-scheduler inverse_sqrt", 79 | "--warmup-init-lr 1e-7", 80 | "--warmup-updates 4000", 81 | "--lr 0.0007", 82 | "--min-lr 1e-9", 83 | "--clip-norm 0.0", 84 | "--dropout 0.2", 85 | "--criterion label_smoothed_cross_entropy", 86 | "--label-smoothing 0.2", 87 | "--upsample-primary 7", 88 | "--max-tokens 4000", 89 | "--seed 1", 90 | "--log-format json", 91 | "--log-interval 100", 92 | "--max-update 90000", 93 | "--save-interval-updates 2000", 94 | "--keep-interval-updates 1", 95 | "--no-epoch-checkpoints" 96 | ], 97 | "gpu": 8 98 | }, 99 | "translate": { 100 | "lenpen": 0.8, 101 | "max_token": 20000, 102 | "mono": "data/mono/neen/mono.bpe.en" 103 | } 104 | } 105 | ], 106 | [ 107 | "bt_iter2", 108 | { 109 | "src": "ne", 110 | "tgt": "en", 111 | "train": { 112 | "parameters": [ 113 | "--fp16", 114 | "--ddp-backend no_c10d", 115 | "--arch transformer", 116 | "--share-all-embeddings", 117 | "--encoder-layers 6", 118 | "--decoder-layers 6", 119 | "--encoder-embed-dim 1024", 120 | "--decoder-embed-dim 1024", 121 | "--encoder-ffn-embed-dim 4096", 122 | "--decoder-ffn-embed-dim 4096", 123 | "--encoder-attention-heads 16", 124 | "--decoder-attention-heads 16", 125 | "--encoder-normalize-before", 126 | "--decoder-normalize-before", 127 | "--attention-dropout 0.1", 128 | "--relu-dropout 0.1", 129 | "--optimizer adam", 130 | "--adam-betas \"(0.9, 0.98)\"", 131 | "--lr-scheduler inverse_sqrt", 132 | "--warmup-init-lr 1e-7", 133 | "--warmup-updates 4000", 134 | "--lr 0.001", 135 | "--min-lr 1e-9", 136 | "--clip-norm 0.0", 137 | "--dropout 0.1", 138 | "--criterion label_smoothed_cross_entropy", 139 | "--label-smoothing 0.2", 140 | "--upsample-primary 1", 141 | "--max-tokens 4000", 142 | "--seed 1", 143 | "--log-format json", 144 | "--log-interval 100", 145 | "--max-update 120000", 146 | "--save-interval-updates 2000", 147 | "--keep-interval-updates 1", 148 | "--no-epoch-checkpoints" 149 | ], 150 | "gpu": 8 151 | }, 152 | "translate": { 153 | "lenpen": 0.8, 154 | "max_token": 20000, 155 | "mono": "data/mono/neen/mono.bpe.ne" 156 | } 157 | } 158 | ] 159 | ] 160 | -------------------------------------------------------------------------------- /previous_releases/floresv1/configs/sien.json: -------------------------------------------------------------------------------- 1 | [ 2 | [ 3 | "sup", 4 | { 5 | "src": "si", 6 | "tgt": "en", 7 | "train": { 8 | "parameters": [ 9 | "--fp16", 10 | "--ddp-backend no_c10d", 11 | "--arch transformer", 12 | "--share-all-embeddings", 13 | "--encoder-layers 5", 14 | "--decoder-layers 5", 15 | "--encoder-embed-dim 512", 16 | "--decoder-embed-dim 512", 17 | "--encoder-ffn-embed-dim 2048", 18 | "--decoder-ffn-embed-dim 2048", 19 | "--encoder-attention-heads 8", 20 | "--decoder-attention-heads 8", 21 | "--encoder-normalize-before", 22 | "--decoder-normalize-before", 23 | "--attention-dropout 0.3", 24 | "--relu-dropout 0.3", 25 | "--optimizer adam", 26 | "--adam-betas \"(0.9, 0.98)\"", 27 | "--lr-scheduler inverse_sqrt", 28 | "--warmup-init-lr 1e-7", 29 | "--warmup-updates 4000", 30 | "--lr 0.003", 31 | "--min-lr 1e-9", 32 | "--clip-norm 0.0", 33 | "--dropout 0.3", 34 | "--criterion label_smoothed_cross_entropy", 35 | "--label-smoothing 0.2", 36 | "--max-tokens 4000", 37 | "--seed 1", 38 
| "--log-format json", 39 | "--log-interval 100", 40 | "--max-epoch 100", 41 | "--save-interval 1", 42 | "--no-epoch-checkpoints" 43 | ], 44 | "gpu": 4 45 | }, 46 | "translate": { 47 | "lenpen": 1.5, 48 | "max_token": 30000, 49 | "mono": "data/mono/sien/mono.bpe.si" 50 | } 51 | } 52 | ], 53 | [ 54 | "bt_iter1", 55 | { 56 | "src": "en", 57 | "tgt": "si", 58 | "train": { 59 | "parameters": [ 60 | "--fp16", 61 | "--ddp-backend no_c10d", 62 | "--arch transformer", 63 | "--share-all-embeddings", 64 | "--encoder-layers 6", 65 | "--decoder-layers 6", 66 | "--encoder-embed-dim 1024", 67 | "--decoder-embed-dim 1024", 68 | "--encoder-ffn-embed-dim 4096", 69 | "--decoder-ffn-embed-dim 4096", 70 | "--encoder-attention-heads 4", 71 | "--decoder-attention-heads 4", 72 | "--encoder-normalize-before", 73 | "--decoder-normalize-before", 74 | "--attention-dropout 0.2", 75 | "--relu-dropout 0.2", 76 | "--optimizer adam", 77 | "--adam-betas \"(0.9, 0.98)\"", 78 | "--lr-scheduler inverse_sqrt", 79 | "--warmup-init-lr 1e-7", 80 | "--warmup-updates 4000", 81 | "--lr 0.0007", 82 | "--min-lr 1e-9", 83 | "--clip-norm 0.0", 84 | "--dropout 0.2", 85 | "--criterion label_smoothed_cross_entropy", 86 | "--label-smoothing 0.2", 87 | "--upsample-primary 7", 88 | "--max-tokens 4000", 89 | "--seed 1", 90 | "--log-format json", 91 | "--log-interval 100", 92 | "--max-update 90000", 93 | "--save-interval-updates 2000", 94 | "--keep-interval-updates 1", 95 | "--no-epoch-checkpoints" 96 | ], 97 | "gpu": 8 98 | }, 99 | "translate": { 100 | "lenpen": 0.6, 101 | "max_token": 16000, 102 | "mono": "data/mono/sien/mono.bpe.en" 103 | } 104 | } 105 | ], 106 | [ 107 | "bt_iter2", 108 | { 109 | "src": "si", 110 | "tgt": "en", 111 | "train": { 112 | "parameters": [ 113 | "--fp16", 114 | "--ddp-backend no_c10d", 115 | "--arch transformer", 116 | "--share-all-embeddings", 117 | "--encoder-layers 6", 118 | "--decoder-layers 6", 119 | "--encoder-embed-dim 1024", 120 | "--decoder-embed-dim 1024", 121 | "--encoder-ffn-embed-dim 4096", 122 | "--decoder-ffn-embed-dim 4096", 123 | "--encoder-attention-heads 16", 124 | "--decoder-attention-heads 16", 125 | "--encoder-normalize-before", 126 | "--decoder-normalize-before", 127 | "--attention-dropout 0.1", 128 | "--relu-dropout 0.1", 129 | "--optimizer adam", 130 | "--adam-betas \"(0.9, 0.98)\"", 131 | "--lr-scheduler inverse_sqrt", 132 | "--warmup-init-lr 1e-7", 133 | "--warmup-updates 4000", 134 | "--lr 0.001", 135 | "--min-lr 1e-9", 136 | "--clip-norm 0.0", 137 | "--dropout 0.1", 138 | "--criterion label_smoothed_cross_entropy", 139 | "--label-smoothing 0.2", 140 | "--upsample-primary 1", 141 | "--max-tokens 4000", 142 | "--seed 1", 143 | "--log-format json", 144 | "--log-interval 100", 145 | "--max-update 120000", 146 | "--save-interval-updates 2000", 147 | "--keep-interval-updates 1", 148 | "--no-epoch-checkpoints" 149 | ], 150 | "gpu": 8 151 | }, 152 | "translate": { 153 | "lenpen": 0.8, 154 | "max_token": 16000, 155 | "mono": "data/mono/sien/mono.bpe.si" 156 | } 157 | } 158 | ] 159 | ] 160 | -------------------------------------------------------------------------------- /ocr/OCR_eval/google_vision_OCR.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import re 6 | from typing import Literal 7 | 8 | from google.cloud import storage, vision 9 | 10 | from data_collection.utils import create_dictionary_lang 11 | 12 | 13 | def async_detect_document( 14 | gcs_source_uri, gcs_destination_uri, 
destination_path, 15 | mime_type: Literal["image/tiff", "application/pdf"] = "image/tiff", batch_size: int = 1 16 | ): 17 | """OCR with PDF/TIFF as source files on GCS 18 | 19 | batch_size: how many pages should be grouped into each json output file. 20 | """ 21 | client = vision.ImageAnnotatorClient() 22 | 23 | feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION) 24 | 25 | gcs_source = vision.GcsSource(uri=gcs_source_uri) 26 | input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type) 27 | 28 | gcs_destination = vision.GcsDestination(uri=gcs_destination_uri) 29 | output_config = vision.OutputConfig( 30 | gcs_destination=gcs_destination, batch_size=batch_size 31 | ) 32 | 33 | async_request = vision.AsyncAnnotateFileRequest( 34 | features=[feature], input_config=input_config, output_config=output_config 35 | ) 36 | 37 | operation = client.async_batch_annotate_files(requests=[async_request]) 38 | 39 | print("Waiting for the operation to finish.") 40 | operation.result(timeout=420) 41 | 42 | # Once the request has completed and the output has been 43 | # written to GCS, we can list all the output files. 44 | storage_client = storage.Client() 45 | 46 | match = re.match(r"gs://([^/]+)/(.+)", gcs_destination_uri) 47 | bucket_name = match.group(1) 48 | prefix = match.group(2) 49 | 50 | bucket = storage_client.get_bucket(bucket_name) 51 | 52 | # List objects with the given prefix. 53 | blob_list = list(bucket.list_blobs(prefix=prefix)) 54 | print("Output files:") 55 | for blob in blob_list: 56 | print(blob.name) 57 | 58 | # Process the first output file from GCS. 59 | # Since we specified batch_size=2, the first response contains 60 | # the first two pages of the input file. 61 | output = blob_list[0] 62 | 63 | json_string = output.download_as_string() 64 | response = json.loads(json_string) 65 | 66 | # The actual response for the first page of the input file. 67 | first_page_response = response["responses"][0] 68 | annotation = first_page_response["fullTextAnnotation"] 69 | 70 | # Here we print the full text from the first page. 
71 |     # The response contains more information:
72 |     # annotation/pages/blocks/paragraphs/words/symbols
73 |     # including confidence scores and bounding boxes
74 | 
75 |     input_txt_ocr = annotation["text"].split("\n")
76 |     new_ocr_txt = " ".join(input_txt_ocr)
77 |     new_ocr_txt = " ".join(new_ocr_txt.split())  # remove double spaces
78 | 
79 |     output_path_ocr = destination_path + gcs_destination_uri.split("/")[-1] + "_sentsplit_synth.txt"
80 |     print(output_path_ocr)
81 |     with open(output_path_ocr, "w", encoding="utf-8") as f:
82 |         f.write(new_ocr_txt)
83 | 
84 | 
85 | def parse_args() -> argparse.Namespace:
86 |     parser = argparse.ArgumentParser()
87 |     parser.add_argument("--dataset", choices=["FLORES", "UDHR"], default="UDHR")
88 |     return parser.parse_args()
89 | 
90 | 
91 | def main() -> None:
92 |     args = parse_args()
93 | 
94 |     dict_lang = create_dictionary_lang()
95 |     file_names = sorted(os.listdir(os.path.join("Data", args.dataset + "_tiff_synth")))
96 |     for file_name in file_names:
97 |         gcs_source_uri = f"gs://ocrdata_flores_udhr/{args.dataset}_tiff_synth/{file_name}"
98 |         print(gcs_source_uri)
99 |         gcs_destination_uri = f"gs://ocrdata_flores_udhr/Output_{args.dataset}_synth/{file_name[:-5]}"
100 |         lang_code = gcs_destination_uri.split("/")[-1].split("_")[0]
101 |         lang_name = dict_lang[lang_code]
102 |         destination_path = "Data/" + args.dataset + "/" + lang_name + "/googlevision/"
103 |         os.makedirs(destination_path, exist_ok=True)
104 |         file_names_done = os.listdir(destination_path)
105 |         file_name = gcs_destination_uri.split("/")[-1]
106 |         new_file_name = "".join(file_name.split("_")[:2])
107 |         print(file_name + "_sentsplit.txt")
108 |         print(new_file_name + "_sentsplit.txt")
109 | 
110 |         if (
111 |             file_name + "_sentsplit_synth.txt" in file_names_done
112 |             or new_file_name + "_sentsplit_synth.txt" in file_names_done
113 |         ):
114 |             print(
115 |                 "Done: "
116 |                 + destination_path
117 |                 + gcs_destination_uri.split("/")[-1]
118 |                 + "_sentsplit_synth.txt"
119 |             )
120 |             continue
121 |         try:
122 |             async_detect_document(
123 |                 gcs_source_uri, gcs_destination_uri, destination_path
124 |             )
125 |         except Exception:  # avoid a bare except, which would also swallow KeyboardInterrupt
126 |             logging.basicConfig(
127 |                 filename="Data/google_ocr.log",
128 |                 filemode="w+",
129 |                 format="%(name)s - %(levelname)s - %(message)s",
130 |             )
131 |             logging.warning("Fail: " + gcs_source_uri)
132 | 
133 | 
134 | if __name__ == "__main__":
135 |     main()
136 | 
--------------------------------------------------------------------------------
/shared_tasks/dynalab/README.md:
--------------------------------------------------------------------------------
1 | # Flores on Dynabench
2 | 
3 | This tutorial should answer the following questions:
4 | 
5 | * How to participate in the FLORES-101 Large-Scale Multilingual Machine Translation Shared Task?
6 | * How to test your model API locally?
7 | * How to evaluate your model locally on the dev/devtest set?
8 | * How to make a submission?
9 | 
10 | 
11 | ## Participating in the FLORES-101 WMT 2021 Shared Task
12 | 
13 | The [Large-Scale Multilingual Machine Translation Shared Task](http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html)
14 | comprises 3 tracks.
15 | 
16 | Small Track #1: 5 Central/East European languages, 30 directions: Croatian, Hungarian, Estonian, Serbian, Macedonian, English
17 | 
18 | Small Track #2: 5 East Asian languages, 30 directions: Javanese, Indonesian, Malay, Tagalog, Tamil, English
19 | 
20 | Large Track: All Languages, to and from English. Full list at the bottom of this page.
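(Counting English, each small track thus covers 6 languages, i.e. 6 × 5 = 30 ordered translation directions.)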
21 | 
22 | To compete in a track you'll need to submit a model to [Dynabench](https://www.dynabench.org/flores).
23 | Dynabench will take care of running the inference of the model on the secret test set and will publish the resulting metrics to the leaderboard.
24 | 
25 | 
26 | ## Evaluate your model locally on the dev/devtest set
27 | 
28 | Each track is accompanied by 3 splits of the dataset:
29 | * a public `dev` set that you're encouraged to use for validation and selecting
30 | hyper-parameters.
31 | * a public `devtest` set which you can use as a test set
32 | * and a secret `test` set which will be used by Dynabench to evaluate your model.
33 | 
34 | The simplest way to get started is to train and evaluate your model using the dev
35 | and devtest datasets, which have been released publicly.
36 | 
37 | The [main README](../flores#evaluation) has more instructions on evaluating your model locally.
38 | 
39 | 
40 | ## Testing your model API locally
41 | 
42 | Dynabench has certain expectations about the API used by your model.
43 | Dynalab provides tools for testing and uploading the model to Dynabench.
44 | The documentation below provides an overview of interacting with Dynalab and should
45 | suffice for testing the model API. Please refer to the [README](https://github.com/facebookresearch/dynalab)
46 | for more details.
47 | 
48 | Dynabench uses [TorchServe](https://pytorch.org/serve/) in the backend.
49 | You'll need to implement a handler that:
50 | * receives several json objects (one per line) representing one batch of translation requests
51 | * extracts the sentences for your model
52 | * computes the translations
53 | * returns the translations as json objects (one per line)
54 | 
55 | We recommend starting from the example [handler](handler.py)
56 | that should work with most [Fairseq](https://github.com/pytorch/fairseq) models.
57 | 
58 | You'll need to modify the Handler class for your model.
59 | We recommend against modifying the other top-level functions.
60 | Note how the `__init__` method loads the local files.
61 | 
62 | Each `sample` passed to `service.preprocess` is a dict with the following keys:
63 | 
64 | ```py
65 | {
66 |     "uid": "some_unique_identifier",
67 |     "sourceLanguage": "eng",
68 |     "targetLanguage": "hrv",
69 |     "sourceText": "Hello world !",
70 | }
71 | ```
72 | 
73 | 
74 | At the end of preprocess you'll need to return a list of samples like this:
75 | 
76 | ```py
77 | {
78 |     "uid": "some_unique_identifier",
79 |     "translatedText": "Your translation",
80 |     "signed": "some_hash",
81 | }
82 | ```
83 | 
84 | Note that the "signed" field will be obtained by calling `self.taskIO.sign_response(response, example)` (a minimal end-to-end sketch of this request/response mapping is shown below).
85 | 
86 | Also note that you can edit the [requirements.txt](./requirements.txt) file,
87 | based on your model's specific dependencies.
88 | 
89 | Once you've implemented the handler you'll need to test it locally.
90 | 
91 | First install `dynalab` using instructions from their [repo](https://github.com/facebookresearch/dynalab#installation).
92 | 
93 | The simplest test is to run `python handler.py`.
94 | You'll need to update the `local_test` function to use the task you want.
95 | Then you can move to running more involved tests using Dynalab.
96 | 
97 | Afterwards, from this directory run:
98 | `dynalab-cli init -n <your_model_name>`
99 | Note that the model name needs to be lower-kebab-case.
100 | 
101 | Choose the track you want to apply to: "flores_small1", "flores_small2" or "flores_full".
102 | Note that the input format is the same for all tracks.
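To make the request/response contract above concrete: one input dict per sample goes in, one signed output dict per sample comes out. The sketch below is only an illustration of that mapping, not the actual [handler.py](handler.py); `translate_batch` is a hypothetical stand-in for your model's inference call, and `sign` stands for `self.taskIO.sign_response(...)`.

```py
# Illustrative sketch only -- see handler.py for the real TorchServe handler.
# `translate_batch` and `sign` are hypothetical placeholders, not Dynalab APIs.
def handle_batch(samples, translate_batch, sign):
    # Extract what the model needs from each request object.
    sources = [s["sourceText"] for s in samples]
    directions = [(s["sourceLanguage"], s["targetLanguage"]) for s in samples]

    # Compute the translations (one per sample, order preserved).
    translations = translate_batch(sources, directions)

    # Build and sign one response object per input sample.
    responses = []
    for sample, translated in zip(samples, translations):
        response = {"uid": sample["uid"], "translatedText": translated}
        response["signed"] = sign(response, sample)
        responses.append(response)
    return responses
```

With the handler behaving like this, you can return to the `dynalab-cli init` flow.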
103 | Then follow the prompt instruction and point to your model path, handler path ... 104 | 105 | Then you can run `dynalab-cli test -n --local` 106 | This will run your model on a sample input, using your current python environment. 107 | For debugging you can run: 108 | `python -m pdb $(which dynalab-cli) test -n --local` 109 | 110 | If this works, you can then proceed to the docker tests. 111 | This will create a full docker image and run the tests inside it. 112 | This example can be used to check if `requirements.txt` 113 | contains all the dependencies needed to run your model. 114 | 115 | `dynalab-cli test -n ` 116 | 117 | ## Making a submission 118 | 119 | To make a submission you'll need to create an account on [Dynabench](https://www.dynabench.org/). 120 | And login locally using `dynalab-cli login`. 121 | 122 | Then you can finally run `dynalab-cli upload -n `. 123 | You'll receive a confirmation email afterwards. 124 | -------------------------------------------------------------------------------- /ocr/data_collection/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | import numpy as np 4 | 5 | # pdf codes might be updated in the meantime in the UDHR website 6 | udhr_lang_code = {"amh": "amh", "hye": "arm", "ast": "aub", "bel": "ruw", "ben": "bng", "bul": "blg", "mya": "bms", 7 | "kat": "geo", "ell": "grk", "guj": "gjr", "heb": "hbr", "hin": "hnd", "jpn": "jpn", "kan": "kjv", 8 | "kaz": "kaz", "khm": "khm", "kor": "kkn", "kir": "kdo", "lao": "nol", "mkd": "mkj", "mal": "mjs", 9 | "mar": "mrt", "npi": "nep", "pbu": "pbu", "pan": "pnj1", "rus": "rus", "srp": "src5", "tgk": "pet", 10 | "tam": "tcv", "tel": "tcw", "tha": "thj", "ukr": "ukr", "urd": "urd", "vie": "vie", "tur": "trk", 11 | "uzn": "uzb", "wol": "wol", "arb": "arz", "ceb": "ceb", "cmn": "chn", "fuv": "fuv", "lug": "lap1", 12 | "isl": "ice", "lin": "lin", "mri": "mbf", "khk": "khk", "nya": "nyj", "ron": "rum", "ckb": "kdb1", 13 | "zul": "zuu", "sna": "shd", "umb": "mnf", "swh": "swa", "som": "som", "swe": "swd", "pol": "pql", 14 | "slk": "slo", "slv": "slv", "gaz": "gax", "por": "por"} 15 | 16 | lang_code_xml = {"ron": "ron_1953", "ell": "ell_monotonic", "npi": "nep", "fuv": "fuv2", "srp": "srp_cyrl", 17 | "uzn": "uzn_latn", "cmn": "cmn_hans", "nya": "nya_chechewa", "por": "por_PT", "gaz": "gax"} 18 | 19 | 20 | def get_languages(): 21 | languages = [] 22 | lang_code = [] 23 | with open("Data/language_codes/languages.csv", newline="", encoding="utf-8") as file: 24 | for row in csv.reader(file): 25 | languages.append(row[0]) 26 | lang_code.append(row[1]) 27 | return languages[1:], lang_code[1:] 28 | 29 | 30 | def create_dictionary_lang(): 31 | return dict(zip(*get_languages())) 32 | 33 | 34 | def merge_dicts(dict1, dict2): 35 | """ Merge dictionaries and keep values of common keys in list""" 36 | dict3 = {**dict1, **dict2} 37 | for key, value in dict3.items(): 38 | if key in dict1 and key in dict2: 39 | dict3[key] = list(set(value + dict1[key])) 40 | return dict3 41 | 42 | 43 | # articles that had an CER > mean + 2 * std -> must be anomalies (not matching pdf & text) 44 | def check_annotations_anomaly(list_error_rates): 45 | error_rates = np.asarray([er for _, er in list_error_rates]) 46 | 47 | mean = error_rates.mean(axis=0) 48 | std = error_rates.std(axis=0) 49 | 50 | list_anomalies = [] 51 | print("mean:", mean, "std:", std) 52 | for index, error_rate in list_error_rates: 53 | if error_rate >= mean + 2 * std: 54 | print("Articles: " + str(index) + " CER: 
" + str(error_rate)) 55 | list_anomalies.append(index) 56 | return list_anomalies 57 | 58 | 59 | # I already computed them running both Google & Tesseract 60 | def return_all_anomalies(): 61 | dict_anomalies_google = {'amh': [], 'hye': [], 'ast': [15, 17, 20], 'bel': [20], 'ben': [11], 'bul': [17, 20], 62 | 'mya': [3, 17], 'kat': [8], 'ell': [15, 17], 'guj': [3, 30], 'heb': [17], 'hin': [9], 63 | 'jpn': [17, 20], 'kan': [9], 'kaz': [15, 20], 'khm': [5, 11], 'kor': [], 'kir': [12, 27], 64 | 'lao': [3, 6], 'mkd': [12], 'mal': [10, 13], 'mar': [9], 'npi': [24, 28], 'pbu': [26], 65 | 'pan': [17], 'rus': [3], 'srp': [15, 17, 20], 'tgk': [17], 'tam': [15, 24], 'tel': [15], 66 | 'tha': [], 'ukr': [6], 'urd': [3], 'vie': [24], 'tur': [15, 20], 'uzn': [5], 'wol': [3], 67 | 'zul': [15, 17, 20], 'arb': [17], 'ceb': [20], 'cmn': [], 'fuv': [15], 'lug': [14, 21], 68 | 'isl': [6, 10], 'lin': [15, 20], 'mri': [4], 'khk': [13, 17], 'nya': [15, 20], 69 | 'ron': [1, 20], 'ckb': [2], 'sna': [15, 17], 'umb': [9, 6, 7, 8, 10, 11, 12, 13, 14], 70 | 'swh': [15, 17, 20], 'som': [15, 17, 20], 'swe': [11], 'pol': [15, 17, 20], 'slk': [14], 71 | 'slv': [17, 30], 'gaz': [15, 20], 'por': [17]} 72 | dict_anomalies_tesseract = {'amh': [13, 20, 27], 'hye': [], 'ast': [15, 17, 20], 'bel': [17, 20], 'ben': [13], 73 | 'bul': [17, 20], 'mya': [], 'kat': [8], 'ell': [15, 17], 'guj': [3, 30], 74 | 'heb': [13, 17], 'hin': [6], 'jpn': [2, 30], 'kan': [9], 'kaz': [15, 17], 75 | 'khm': [5, 11], 'kor': [24], 'kir': [15, 20], 'lao': [6], 'mkd': [17, 20], 76 | 'mal': [10, 21], 'mar': [29], 'npi': [28], 'pbu': [26], 'pan': [7], 'rus': [15, 17, 20], 77 | 'srp': [20], 'tgk': [4], 'tam': [3, 15, 24], 'tel': [15], 'tha': [6, 10], 78 | 'ukr': [17, 20], 'urd': [3, 6, 9], 'vie': [], 'tur': [15, 17, 20], 'uzn': [5], 79 | 'wol': [15, 17, 20], 'zul': [15, 20], 'arb': [17], 'ceb': [17, 20], 'cmn': [22], 80 | 'fuv': [], 'lug': [14, 21], 'isl': [10], 'lin': [15, 20], 'mri': [4], 'khk': [15, 20], 81 | 'nya': [20], 'ron': [15, 20], 'ckb': [3, 6], 'sna': [17], 'umb': [9], 82 | 'swh': [15, 17, 20], 'som': [15], 'swe': [11], 'pol': [15, 17, 20], 'slk': [14], 83 | 'slv': [17, 30], 'gaz': [17, 20], 'por': [6, 13, 24]} 84 | return merge_dicts(dict_anomalies_google, dict_anomalies_tesseract) 85 | 86 | 87 | def sentence_split(input_, output, mode): 88 | with open(input_ + ".txt", encoding="utf8") as file: 89 | input_txt_ocr = file.readlines() 90 | 91 | new_ocr_txt = " ".join(input_txt_ocr) 92 | new_ocr_txt = " ".join(new_ocr_txt.split()) # remove double spaces 93 | 94 | with open(output, mode, encoding="utf-8") as file: 95 | file.write(new_ocr_txt) 96 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 
7 | # 8 | 9 | import argparse 10 | import os 11 | import torch 12 | from subprocess import check_call, check_output 13 | from glob import glob 14 | from tempfile import TemporaryDirectory, NamedTemporaryFile as TempFile 15 | import math 16 | import json 17 | from utils import check_last_line 18 | 19 | 20 | def read_config(config_path): 21 | with open(config_path, 'r') as js: 22 | return json.load(js) 23 | 24 | 25 | def call(cmd, shell=True): 26 | print(cmd) 27 | check_call(cmd, shell=shell) 28 | 29 | def train(src, tgt, train_config, savedir, databin): 30 | # expect to have 'hyperparameters', 'src', 'tgt', 'databin' in train_config 31 | os.makedirs(savedir, exist_ok=True) 32 | 33 | logpath = os.path.join(savedir, 'train.log') 34 | checkpoint = os.path.join(savedir, 'checkpoint_best.pt') 35 | 36 | if check_last_line(logpath, 'done') and os.path.exists(checkpoint): 37 | print(f"Training is finished. Best checkpoint: {checkpoint}") 38 | return 39 | 40 | cuda_visible_devices = list(range(torch.cuda.device_count())) 41 | num_visible_gpu = len(cuda_visible_devices) 42 | num_gpu = min(train_config['gpu'], 2**int(math.log2(num_visible_gpu))) 43 | cuda_devices_clause = f"CUDA_VISIBLE_DEVICES={','.join([str(i) for i in cuda_visible_devices[:num_gpu]])}" 44 | update_freq = train_config['gpu'] / num_gpu 45 | call(f"""{cuda_devices_clause} fairseq-train {databin} \ 46 | --source-lang {src} --target-lang {tgt} \ 47 | --save-dir {savedir} \ 48 | --update-freq {update_freq} \ 49 | {" ".join(train_config['parameters'])} \ 50 | | tee {logpath} 51 | """, shell=True) 52 | 53 | 54 | def eval_bleu(src, tgt, subset, lenpen, databin, checkpoint, output, max_token=20000): 55 | bleuarg = "--sacrebleu" if tgt == "en" else "" 56 | call(f"""fairseq-generate {databin} \ 57 | --source-lang {src} --target-lang {tgt} \ 58 | --path {checkpoint} \ 59 | --max-tokens {max_token} \ 60 | --beam 5 \ 61 | --lenpen {lenpen} \ 62 | --max-len-a 1.8 \ 63 | --max-len-b 10 \ 64 | --gen-subset {subset} \ 65 | --remove-bpe=sentencepiece \ 66 | {bleuarg} > {output} 67 | """) 68 | return check_output(f"tail -n 1 {output}", shell=True).decode('utf-8').strip() 69 | 70 | 71 | def translate(src, tgt, model, lenpen, dest, data, max_token=12000): 72 | script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'translate.py') 73 | check_call(f"""python {script_path} --data {data}\ 74 | --source-lang {src} --target-lang {tgt} \ 75 | --model {model} \ 76 | --beam 5 --lenpen {lenpen} \ 77 | --max-len-a 1.8 \ 78 | --max-len-b 10 \ 79 | --dest {dest} \ 80 | --max-token {max_token} \ 81 | --chunks 100 \ 82 | --backend local 83 | """, shell=True) 84 | 85 | # (src, tgt) is the direction of the databin 86 | def build_bt_databin(src, tgt, train_prefix, para_databin, output_folder): 87 | final_output = os.path.join(f'{output_folder}/data-bin') 88 | if os.path.exists(final_output): 89 | print(f"Databin path {final_output} exists") 90 | return final_output 91 | 92 | train_databin = os.path.join(output_folder, 'train-data-bin') 93 | os.makedirs(train_databin, exist_ok=True) 94 | call(f"ln -fs {train_prefix}.hypo {output_folder}/bt.{src}") 95 | call(f"ln -fs {train_prefix}.src {output_folder}/bt.{tgt}") 96 | 97 | call(f"""fairseq-preprocess \ 98 | --source-lang {src} --target-lang {tgt} \ 99 | --trainpref {output_folder}/bt \ 100 | --destdir {train_databin} \ 101 | --joined-dictionary \ 102 | --srcdict {para_databin}/dict.{src}.txt \ 103 | --workers 40 104 | """) 105 | 106 | os.makedirs(final_output, exist_ok=True) 107 | call(f"ln -fs 
{para_databin}/* {final_output}") 108 | for lang in [src, tgt]: 109 | for suffix in ['idx', 'bin']: 110 | file_suffix = f"{src}-{tgt}.{lang}.{suffix}" 111 | call(f"ln -fs {train_databin}/train.{file_suffix} {output_folder}/data-bin/train1.{file_suffix}") 112 | return final_output 113 | 114 | 115 | def main(): 116 | parser = argparse.ArgumentParser() 117 | parser.add_argument('--config', '-c', required=True, help='pipeline config') 118 | parser.add_argument('--databin', '-d', required=True, help='initial databin') 119 | args = parser.parse_args() 120 | 121 | configs = read_config(args.config) 122 | workdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../experiments') 123 | #cuda_visible_devices=args.cuda_visible_devices or list(range(torch.cuda.device_count())) 124 | 125 | initial_databin = args.databin 126 | for i in range(len(configs)): 127 | (name, config) = configs[i] 128 | src = config['src'] 129 | tgt = config['tgt'] 130 | direction = f"{src}-{tgt}" 131 | print(f"Start {name} iteration, {direction}") 132 | iter_workdir = os.path.join(workdir, name, direction) 133 | # train 134 | model_dir = os.path.join(iter_workdir, 'model') 135 | train(src, tgt, config['train'], model_dir, initial_databin) 136 | checkpoint_path = os.path.join(model_dir, 'checkpoint_best.pt') 137 | # eval 138 | lenpen = config['translate']['lenpen'] 139 | eval_output = os.path.join(model_dir, 'eval.txt') 140 | if check_last_line(eval_output, "BLEU"): 141 | print(check_output(f"tail -n 1 {eval_output}", shell=True).decode('utf-8').strip()) 142 | else: 143 | print(eval_bleu( 144 | config['src'], config['tgt'], 145 | 'test', lenpen, 146 | args.databin, checkpoint_path, 147 | os.path.join(model_dir, 'eval.txt') 148 | )) 149 | # Early exit to skip back-translation for the last iteration 150 | if i == len(configs) - 1: 151 | break 152 | # translate 153 | translate_output = os.path.join(iter_workdir, 'synthetic') 154 | translate(config['src'], config['tgt'], checkpoint_path, lenpen, translate_output, config['translate']['mono'], config['translate']['max_token']) 155 | # generate databin 156 | databin_folder = os.path.join(translate_output, 'bt') 157 | initial_databin = build_bt_databin( 158 | config['tgt'], config['src'], 159 | os.path.join(translate_output, 'generated'), args.databin, databin_folder 160 | ) 161 | 162 | main() 163 | -------------------------------------------------------------------------------- /ocr/data_collection/download_UDHR_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import glob 3 | import json 4 | import os 5 | 6 | import cv2 7 | import numpy as np 8 | import requests 9 | import xmltodict 10 | from PIL import Image 11 | from pdf2image import convert_from_path 12 | 13 | from utils import udhr_lang_code, lang_code_xml 14 | 15 | print("There are", len(set(udhr_lang_code)), "different languages") 16 | 17 | output_folder_pdf = "Data/UDHR/pdfs" 18 | output_folder_img = "Data/UDHR/imgs" 19 | output_folder_txt = "Data/UDHR/txt" 20 | 21 | 22 | def save_txt(lang_code: str, lang_code_xml) -> None: 23 | lang_code_xml = lang_code_xml.get(lang_code, lang_code) 24 | 25 | print("Getting xml UDHR file", lang_code_xml + ".xml") 26 | url = "https://www.unicode.org/udhr/d/udhr_" + lang_code_xml + ".xml" 27 | response = requests.get(url, allow_redirects=True) 28 | 29 | print("Parsing xml UDHR file", lang_code_xml + ".xml") 30 | dict = xmltodict.parse(response.content) 31 | all_articles = dict["udhr"]["article"] 32 | print("There 
are", len(all_articles), "articles.") 33 | dict_paragraphs = {} 34 | for i in range(len(all_articles)): 35 | article_nb = i + 1 36 | dict_paragraphs[article_nb] = [] 37 | title = all_articles[i]['title'] 38 | if 'para' in all_articles[i]: 39 | if type(all_articles[i]['para']) is list: # format is inconsistent 40 | for paragraph in all_articles[i]['para']: 41 | if paragraph is not None: # format issues 42 | if not dict_paragraphs[article_nb]: 43 | dict_paragraphs[article_nb].append(title + " " + paragraph) 44 | else: 45 | dict_paragraphs[article_nb].append(paragraph) 46 | else: 47 | if not dict_paragraphs[article_nb]: 48 | dict_paragraphs[article_nb].append(title + " " + all_articles[i]['para']) 49 | else: 50 | dict_paragraphs[article_nb].append(all_articles[i]['para']) 51 | else: 52 | for paragraph in all_articles[i]['orderedlist']['listitem']: 53 | if paragraph is not None: # format issues 54 | if not dict_paragraphs[article_nb]: 55 | if '@tag' in paragraph: 56 | dict_paragraphs[article_nb].append( 57 | title + " " + paragraph['@tag'] + " " + paragraph['para']) 58 | else: 59 | dict_paragraphs[article_nb].append(title + " " + paragraph['para']) 60 | else: 61 | if '@tag' in paragraph: 62 | dict_paragraphs[article_nb].append(paragraph['@tag'] + " " + paragraph['para']) 63 | else: 64 | dict_paragraphs[article_nb].append(paragraph['para']) 65 | 66 | if len(dict_paragraphs) != 30: 67 | raise ValueError("NOT 30 XML articles!!!") 68 | 69 | print("Saving text data json UDHR file", lang_code + ".json") 70 | output_file = output_folder_txt + lang_code + ".json" 71 | os.makedirs(output_folder_txt, exist_ok=True) 72 | with open(output_file, 'w+', encoding='utf-8') as file: 73 | json.dump(dict_paragraphs, file, ensure_ascii=False) 74 | 75 | 76 | def save_pdfs(lang_code: str) -> None: 77 | print("Saving pdf UDHR file", lang_code + ".pdf") 78 | url = "https://www.ohchr.org/EN/UDHR/Documents/UDHR_Translations/" + udhr_lang_code[lang_code] + ".pdf" 79 | response = requests.get(url, allow_redirects=True) 80 | os.makedirs(output_folder_pdf, exist_ok=True) 81 | with open(os.path.join(output_folder_pdf, lang_code + ".pdf"), 'wb') as file: 82 | file.write(response.content) 83 | 84 | 85 | def convert_pdf_to_png(lang_code: str) -> None: 86 | path_pdf_in = os.path.join(output_folder_pdf, lang_code + ".pdf") 87 | folder_jpg_out = os.path.join(output_folder_img, lang_code, "jpgs") 88 | path_jpg_out = os.path.join(folder_jpg_out, lang_code) 89 | os.makedirs(folder_jpg_out, exist_ok=True) 90 | if lang_code in ['arb', 'ukr']: 91 | for i, image in enumerate(convert_from_path(path_pdf_in)): 92 | image.save(f'{path_jpg_out}-{i}.jpg', 'JPEG') # Save pages as images in the PDF file. 
93 | else: 94 | os.system(f"convert -density 300 {path_pdf_in} -quality 100 {path_jpg_out}.jpg") 95 | 96 | 97 | def join_imgs(lang_code: str) -> None: 98 | list_im = glob.glob(os.path.join(output_folder_img, lang_code, "jpgs") + "/*.jpg") 99 | images = [Image.open(i).convert('L') for i in sorted(list_im)] 100 | # pick the image which is the smallest, and resize the others to match it (can be arbitrary image shape here) 101 | min_shape = sorted([(np.sum(image.size), image.size) for image in images])[0][1] 102 | # for a vertical stacking it is simple: use vstack 103 | imgs_comb = np.vstack([np.asarray(i.resize(min_shape)) for i in images]) 104 | imgs_comb = Image.fromarray(imgs_comb) 105 | imgs_comb.save(output_folder_img + "/" + lang_code + "/" + lang_code + '.png') 106 | 107 | 108 | def split_by_coordinates(lang_code: str) -> None: 109 | with open('Data/language_codes/article_coordinates.json', encoding="utf-8") as file: 110 | coordinates = json.load(file) 111 | 112 | path_out = os.path.join(output_folder_img, lang_code, "articles") 113 | os.makedirs(path_out, exist_ok=True) 114 | 115 | image = cv2.imread(os.path.join(output_folder_img, lang_code, lang_code + '.png')) 116 | for key in coordinates: 117 | lang_code1 = key.split("_")[0] 118 | if lang_code != lang_code1: 119 | continue 120 | name_article = key 121 | (startX, startY), (endX, endY) = coordinates[key] 122 | 123 | cropped_image = image[startY:endY, startX:endX] 124 | cv2.imwrite(path_out + name_article + '.jpg', cropped_image) 125 | 126 | 127 | def convert_udhr_a_b(lang_code: str) -> None: 128 | files_a_b = glob.glob(output_folder_img + lang_code + "/articles/*a.jpg") 129 | for file_a_b in files_a_b: 130 | name = file_a_b.split("/")[-1][:-5] 131 | path_file = "/".join(file_a_b.split("/")[:-1]) 132 | file_name = path_file + "/" + name 133 | os.system('convert -append ' + file_name + '{a,b}.jpg' + " " + file_name + '.jpg') 134 | 135 | 136 | def split_pdf_into_articles(lang_code: str) -> None: 137 | convert_pdf_to_png(lang_code) 138 | join_imgs(lang_code) 139 | split_by_coordinates(lang_code) 140 | convert_udhr_a_b(lang_code) # Concatenate the split articles. 
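# Illustrative usage (not part of the original script): for a single language,
# the full pipeline mirrors what main() below runs for every entry in
# udhr_lang_code, e.g.:
#
#     save_pdfs("hin")
#     save_txt("hin", lang_code_xml)
#     split_pdf_into_articles("hin")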
141 | 142 | 143 | def main() -> None: 144 | for lang_code in udhr_lang_code: 145 | if lang_code == 'isl': 146 | continue 147 | print("-------------", lang_code, "----------------") 148 | save_pdfs(lang_code) 149 | save_txt(lang_code, lang_code_xml) 150 | split_pdf_into_articles(lang_code) 151 | 152 | 153 | if __name__ == "__main__": 154 | main() 155 | -------------------------------------------------------------------------------- /flores_move.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import subprocess 4 | 5 | all_filenames = ["ace_Arab","ace_Latn","acm_Arab","acq_Arab","aeb_Arab","afr_Latn","ajp_Arab","aka_Latn","amh_Ethi","apc_Arab","arb_Arab","arb_Latn","ars_Arab","ary_Arab","arz_Arab","asm_Beng","ast_Latn","awa_Deva","ayr_Latn","azb_Arab","azj_Latn","bak_Cyrl","bam_Latn","ban_Latn","bel_Cyrl","bem_Latn","ben_Beng","bho_Deva","bjn_Arab","bjn_Latn","bod_Tibt","bos_Latn","bug_Latn","bul_Cyrl","cat_Latn","ceb_Latn","ces_Latn","cjk_Latn","ckb_Arab","crh_Latn","cym_Latn","dan_Latn","deu_Latn","dik_Latn","dyu_Latn","dzo_Tibt","ell_Grek","eng_Latn","epo_Latn","est_Latn","eus_Latn","ewe_Latn","fao_Latn","pes_Arab","fij_Latn","fin_Latn","fon_Latn","fra_Latn","fur_Latn","fuv_Latn","gla_Latn","gle_Latn","glg_Latn","grn_Latn","guj_Gujr","hat_Latn","hau_Latn","heb_Hebr","hin_Deva","hne_Deva","hrv_Latn","hun_Latn","hye_Armn","ibo_Latn","ilo_Latn","ind_Latn","isl_Latn","ita_Latn","jav_Latn","jpn_Jpan","kab_Latn","kac_Latn","kam_Latn","kan_Knda","kas_Arab","kas_Deva","kat_Geor","knc_Arab","knc_Latn","kaz_Cyrl","kbp_Latn","kea_Latn","khm_Khmr","kik_Latn","kin_Latn","kir_Cyrl","kmb_Latn","kon_Latn","kor_Hang","kmr_Latn","lao_Laoo","lvs_Latn","lij_Latn","lim_Latn","lin_Latn","lit_Latn","lmo_Latn","ltg_Latn","ltz_Latn","lua_Latn","lug_Latn","luo_Latn","lus_Latn","mag_Deva","mai_Deva","mal_Mlym","mar_Deva","min_Arab","min_Latn","mkd_Cyrl","plt_Latn","mlt_Latn","mni_Beng","khk_Cyrl","mos_Latn","mri_Latn","zsm_Latn","mya_Mymr","nld_Latn","nno_Latn","nob_Latn","npi_Deva","nso_Latn","nus_Latn","nya_Latn","oci_Latn","gaz_Latn","ory_Orya","pag_Latn","pan_Guru","pap_Latn","pol_Latn","por_Latn","prs_Arab","pbt_Arab","quy_Latn","ron_Latn","run_Latn","rus_Cyrl","sag_Latn","san_Deva","sat_Beng","scn_Latn","shn_Mymr","sin_Sinh","slk_Latn","slv_Latn","smo_Latn","sna_Latn","snd_Arab","som_Latn","sot_Latn","spa_Latn","als_Latn","srd_Latn","srp_Cyrl","ssw_Latn","sun_Latn","swe_Latn","swh_Latn","szl_Latn","tam_Taml","tat_Cyrl","tel_Telu","tgk_Cyrl","tgl_Latn","tha_Thai","tir_Ethi","taq_Latn","taq_Tfng","tpi_Latn","tsn_Latn","tso_Latn","tuk_Latn","tum_Latn","tur_Latn","twi_Latn","tzm_Tfng","uig_Arab","ukr_Cyrl","umb_Latn","urd_Arab","uzn_Latn","vec_Latn","vie_Latn","war_Latn","wol_Latn","xho_Latn","ydd_Hebr","yor_Latn","yue_Hant","zho_Hans","zho_Hant", 6 | "zul_Latn"] 7 | 8 | assert(len(all_filenames) == 204) 9 | 10 | filenames = glob.glob("/large_experiments/nllb/mmt/flores200_final/dev/*") 11 | counter = [] 12 | for filename in filenames: 13 | langcode = filename.replace("/large_experiments/nllb/mmt/flores200_final/dev/", "").replace(".dev", "") 14 | if "_" in langcode: 15 | counter.append(langcode) 16 | 17 | missing_codes = [] 18 | counter = set(counter) 19 | for i in all_filenames: 20 | if i not in counter: 21 | missing_codes.append(i) 22 | else: 23 | subprocess.run(f"cp -L /large_experiments/nllb/mmt/flores200_final/dev/{i}.dev /private/home/angelafan/flores_opensource_links/dev/", shell=True) 24 | # print(f"cp 
/large_experiments/nllb/mmt/flores200_final/dev/{i}.dev /private/home/angelafan/flores_opensource_links/dev/") 25 | 26 | 27 | filenames = glob.glob("/large_experiments/nllb/mmt/flores101_beta/dev/*") 28 | 29 | print(missing_codes) 30 | 31 | for filename in filenames: 32 | langcode = filename.replace("/large_experiments/nllb/mmt/flores101_beta/dev/", "").replace(".dev", "") 33 | if langcode in missing_codes: 34 | print(langcode, filename) 35 | subprocess.run(f"cp -L {filename} /private/home/angelafan/flores_opensource_links/dev/", shell=True) 36 | 37 | 38 | # import glob 39 | # import os 40 | # import subprocess 41 | 42 | # all_filenames = ["ace_Arab","ace_Latn","acm_Arab","acq_Arab","aeb_Arab","afr_Latn","ajp_Arab","aka_Latn","amh_Ethi","apc_Arab","arb_Arab","arb_Latn","ars_Arab","ary_Arab","arz_Arab","asm_Beng","ast_Latn","awa_Deva","ayr_Latn","azb_Arab","azj_Latn","bak_Cyrl","bam_Latn","ban_Latn","bel_Cyrl","bem_Latn","ben_Beng","bho_Deva","bjn_Arab","bjn_Latn","bod_Tibt","bos_Latn","bug_Latn","bul_Cyrl","cat_Latn","ceb_Latn","ces_Latn","cjk_Latn","ckb_Arab","crh_Latn","cym_Latn","dan_Latn","deu_Latn","dik_Latn","dyu_Latn","dzo_Tibt","ell_Grek","eng_Latn","epo_Latn","est_Latn","eus_Latn","ewe_Latn","fao_Latn","pes_Arab","fij_Latn","fin_Latn","fon_Latn","fra_Latn","fur_Latn","fuv_Latn","gla_Latn","gle_Latn","glg_Latn","grn_Latn","guj_Gujr","hat_Latn","hau_Latn","heb_Hebr","hin_Deva","hne_Deva","hrv_Latn","hun_Latn","hye_Armn","ibo_Latn","ilo_Latn","ind_Latn","isl_Latn","ita_Latn","jav_Latn","jpn_Jpan","kab_Latn","kac_Latn","kam_Latn","kan_Knda","kas_Arab","kas_Deva","kat_Geor","knc_Arab","knc_Latn","kaz_Cyrl","kbp_Latn","kea_Latn","khm_Khmr","kik_Latn","kin_Latn","kir_Cyrl","kmb_Latn","kon_Latn","kor_Hang","kmr_Latn","lao_Laoo","lvs_Latn","lij_Latn","lim_Latn","lin_Latn","lit_Latn","lmo_Latn","ltg_Latn","ltz_Latn","lua_Latn","lug_Latn","luo_Latn","lus_Latn","mag_Deva","mai_Deva","mal_Mlym","mar_Deva","min_Arab","min_Latn","mkd_Cyrl","plt_Latn","mlt_Latn","mni_Beng","khk_Cyrl","mos_Latn","mri_Latn","zsm_Latn","mya_Mymr","nld_Latn","nno_Latn","nob_Latn","npi_Deva","nso_Latn","nus_Latn","nya_Latn","oci_Latn","gaz_Latn","ory_Orya","pag_Latn","pan_Guru","pap_Latn","pol_Latn","por_Latn","prs_Arab","pbt_Arab","quy_Latn","ron_Latn","run_Latn","rus_Cyrl","sag_Latn","san_Deva","sat_Beng","scn_Latn","shn_Mymr","sin_Sinh","slk_Latn","slv_Latn","smo_Latn","sna_Latn","snd_Arab","som_Latn","sot_Latn","spa_Latn","als_Latn","srd_Latn","srp_Cyrl","ssw_Latn","sun_Latn","swe_Latn","swh_Latn","szl_Latn","tam_Taml","tat_Cyrl","tel_Telu","tgk_Cyrl","tgl_Latn","tha_Thai","tir_Ethi","taq_Latn","taq_Tfng","tpi_Latn","tsn_Latn","tso_Latn","tuk_Latn","tum_Latn","tur_Latn","twi_Latn","tzm_Tfng","uig_Arab","ukr_Cyrl","umb_Latn","urd_Arab","uzn_Latn","vec_Latn","vie_Latn","war_Latn","wol_Latn","xho_Latn","ydd_Hebr","yor_Latn","yue_Hant","zho_Hans","zho_Hant", 43 | # "zul_Latn"] 44 | 45 | # assert(len(all_filenames) == 204) 46 | 47 | # filenames = glob.glob("/large_experiments/nllb/mmt/flores200_final/devtest/*") 48 | # counter = [] 49 | # for filename in filenames: 50 | # langcode = filename.replace("/large_experiments/nllb/mmt/flores200_final/devtest/", "").replace(".devtest", "") 51 | # if "_" in langcode: 52 | # counter.append(langcode) 53 | 54 | # missing_codes = [] 55 | # counter = set(counter) 56 | # for i in all_filenames: 57 | # if i not in counter: 58 | # missing_codes.append(i) 59 | # else: 60 | # subprocess.run(f"cp -L /large_experiments/nllb/mmt/flores200_final/devtest/{i}.devtest 
/private/home/angelafan/flores_opensource_links/devtest/", shell=True) 61 | # # print(f"cp -L /large_experiments/nllb/mmt/flores200_final/devtest/{i}.devtest /private/home/angelafan/flores_opensource_links/dev/") 62 | 63 | 64 | # filenames = glob.glob("/large_experiments/nllb/mmt/flores101_beta/devtest/*") 65 | 66 | # print(missing_codes) 67 | 68 | # for filename in filenames: 69 | # langcode = filename.replace("/large_experiments/nllb/mmt/flores101_beta/devtest/", "").replace(".devtest", "") 70 | # if langcode in missing_codes: 71 | # print(langcode, filename) 72 | # subprocess.run(f"cp -L {filename} /private/home/angelafan/flores_opensource_links/devtest/", shell=True) 73 | # # print(f"cp -L {filename} /private/home/angelafan/flores_opensource_links/devtest/") 74 | -------------------------------------------------------------------------------- /previous_releases/floresv1/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 
5 | --------------------------------------------------------------------------------
6 | 
7 | # Facebook Low Resource MT Benchmark (FLoRes)
8 | FLoRes is a benchmark dataset for machine translation between English and four low resource languages, Nepali, Sinhala, Khmer and Pashto, based on sentences translated from Wikipedia.
9 | The data sets can be downloaded [HERE](https://github.com/facebookresearch/flores/raw/master/data/flores_test_sets.tgz).
10 | 
11 | **New**: two new languages, Khmer and Pashto, have been added to the dataset.
12 | 
13 | This repository contains data and baselines from the paper:
14 | [The FLoRes Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English](https://arxiv.org/abs/1902.01382).
15 | 
16 | ## Baselines
17 | 
18 | The following instructions can be used to reproduce the baseline results from the paper.
19 | 
20 | ### Requirements
21 | 
22 | The baseline uses the
23 | [Indic NLP Library](https://github.com/anoopkunchukuttan/indic_nlp_library) and
24 | [sentencepiece](https://github.com/google/sentencepiece) for preprocessing;
25 | [fairseq](https://github.com/pytorch/fairseq) for model training; and
26 | [sacrebleu](https://github.com/mjpost/sacreBLEU) for scoring.
27 | 
28 | Dependencies can be installed via pip:
29 | ```
30 | $ pip install fairseq sacrebleu sentencepiece
31 | ```
32 | 
33 | The Indic NLP Library will be cloned automatically by the `prepare-{ne,si}en.sh` scripts.
34 | 
35 | ### Download and preprocess data
36 | 
37 | The `download-data.sh` script can be used to download and extract the raw data.
38 | Thereafter the `prepare-neen.sh` and `prepare-sien.sh` scripts can be used to
39 | preprocess the raw data. In particular, they will use the sentencepiece library
40 | to learn a shared BPE vocabulary with 5000 subword units and binarize the data
41 | for training with fairseq.
42 | 
43 | To download and extract the raw data:
44 | ```
45 | $ bash download-data.sh
46 | ```
47 | 
48 | Thereafter, run the following to preprocess the raw data:
49 | ```
50 | $ bash prepare-neen.sh
51 | $ bash prepare-sien.sh
52 | ```
53 | 
54 | ### Train a baseline Transformer model
55 | 
56 | To train a baseline Ne-En model on a single GPU:
57 | ```
58 | $ CUDA_VISIBLE_DEVICES=0 fairseq-train \
59 |     data-bin/wiki_ne_en_bpe5000/ \
60 |     --source-lang ne --target-lang en \
61 |     --arch transformer --share-all-embeddings \
62 |     --encoder-layers 5 --decoder-layers 5 \
63 |     --encoder-embed-dim 512 --decoder-embed-dim 512 \
64 |     --encoder-ffn-embed-dim 2048 --decoder-ffn-embed-dim 2048 \
65 |     --encoder-attention-heads 2 --decoder-attention-heads 2 \
66 |     --encoder-normalize-before --decoder-normalize-before \
67 |     --dropout 0.4 --attention-dropout 0.2 --relu-dropout 0.2 \
68 |     --weight-decay 0.0001 \
69 |     --label-smoothing 0.2 --criterion label_smoothed_cross_entropy \
70 |     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0 \
71 |     --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-7 \
72 |     --lr 1e-3 --min-lr 1e-9 \
73 |     --max-tokens 4000 \
74 |     --update-freq 4 \
75 |     --max-epoch 100 --save-interval 10
76 | ```
77 | 
78 | To train on 4 GPUs, remove the `--update-freq` flag and run `CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train (...)`.
79 | If you have a Volta or newer GPU you can further improve training speed by adding the `--fp16` flag.
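As a rough rule of thumb (assuming fairseq's usual gradient-accumulation behaviour for `--update-freq`), the effective batch size per update is about `--max-tokens × n_GPUs × --update-freq`, so the two setups are comparable: 4000 × 1 × 4 ≈ 16,000 tokens per update on a single GPU versus 4000 × 4 × 1 ≈ 16,000 tokens on four GPUs.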
80 | 81 | This same architecture can be used for En-Ne, Si-En and En-Si: 82 | - For En-Ne, update the training command with: 83 | `fairseq-train data-bin/wiki_ne_en_bpe5000 --source-lang en --target-lang ne` 84 | - For Si-En, update the training command with: 85 | `fairseq-train data-bin/wiki_si_en_bpe5000 --source-lang si --target-lang en` 86 | - For En-Si, update the training command with: 87 | `fairseq-train data-bin/wiki_si_en_bpe5000 --source-lang en --target-lang si` 88 | 89 | ### Compute BLEU using sacrebleu 90 | 91 | Run beam search generation and scoring with sacrebleu: 92 | ``` 93 | $ fairseq-generate \ 94 | data-bin/wiki_ne_en_bpe5000/ \ 95 | --source-lang ne --target-lang en \ 96 | --path checkpoints/checkpoint_best.pt \ 97 | --beam 5 --lenpen 1.2 \ 98 | --gen-subset valid \ 99 | --remove-bpe=sentencepiece \ 100 | --sacrebleu 101 | ``` 102 | 103 | Note that the `--gen-subset valid` set is the FLoRes **dev** set and the `--gen-subset test` set is the FLoRes **devtest** set. 104 | Replace `--gen-subset valid` with `--gen-subset test` above to score the FLoRes **devtest** set, which corresponds to the numbers reported in our paper. 105 | 106 | **Tokenized BLEU for En-Ne and En-Si:** 107 | 108 | For these language pairs we report tokenized BLEU. You can compute tokenized BLEU by removing the `--sacrebleu` flag 109 | from the `fairseq-generate` command: 110 | ``` 111 | $ fairseq-generate \ 112 | data-bin/wiki_ne_en_bpe5000/ \ 113 | --source-lang en --target-lang ne \ 114 | --path checkpoints/checkpoint_best.pt \ 115 | --beam 5 --lenpen 1.2 \ 116 | --gen-subset valid \ 117 | --remove-bpe=sentencepiece 118 | ``` 119 | 120 | ### Train iterative back-translation models 121 | 122 | After running the commands in the *Download and preprocess data* section above, run the following to download and preprocess the monolingual data: 123 | ``` 124 | $ bash prepare-monolingual.sh 125 | ``` 126 | 127 | To train the iterative back-translation for two iterations on Ne-En, run the following: 128 | ``` 129 | $ bash reproduce.sh ne_en 130 | ``` 131 | 132 | The script will train an Ne-En supervised model, translate Nepali monolingual data, train an En-Ne back-translation iteration 1 model, translate English monolingual data back to Nepali, and train an Ne-En back-translation iteration 2 model. All the model training and data generation happen locally. The script uses all the GPUs listed in the `CUDA_VISIBLE_DEVICES` variable unless specific CUDA device ids are passed to `train.py`, and it is designed to adjust the hyper-parameters according to the number of available GPUs. With 8 Tesla V100 GPUs, the full pipeline takes about 25 hours to finish. We expect the final BT iteration 2 Ne-En model to achieve around 15.9 (sacre)BLEU on the devtest set. The script supports the `ne_en`, `en_ne`, `si_en` and `en_si` directions. 133 | 134 | ## Citation 135 | 136 | If you use this data in your work, please cite: 137 | 138 | ```bibtex 139 | @inproceedings{, 140 | title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, 141 | author={Guzm\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, 142 | journal={arXiv preprint arXiv:1902.01382}, 143 | year={2019} 144 | } 145 | ``` 146 | 147 | ## Changelog 148 | - 2020-04-02: Add two new language pairs, Khmer-English and Pashto-English. 149 | - 2019-11-04: Add config to reproduce iterative back-translation result on Sinhala-English and English-Sinhala.
150 | - 2019-10-23: Add script to reproduce iterative back-translation result on Nepali-English and English-Nepali. 151 | - 2019-10-18: Add final test set. 152 | - 2019-05-20: Remove extra carriage return character from Nepali-English parallel dataset. 153 | - 2019-04-18: Specify the linebreak character in the sentencepiece encoding script to fix small portion of misaligned parallel sentences in Nepali-English parallel dataset. 154 | - 2019-03-08: Update tokenizer script to make it compatible with previous version of indic_nlp. 155 | - 2019-02-14: Update dataset preparation script to avoid unexpected extra line being added to each paralel dataset. 156 | 157 | 158 | ## License 159 | The dataset is licenced under CC-BY-SA, see the LICENSE file for details. 160 | -------------------------------------------------------------------------------- /ocr/OCR_impact_BT/finetune_eval_books.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z "$fairseq" || -z "$flores101_dataset" ]]; then 4 | echo 'Need to specify the env vars fairseq and flores101_dataset.' 5 | exit 1 6 | fi 7 | 8 | source OCR_impact_BT/lang_codes.source 9 | 10 | SRC_LANG_CODE=eng 11 | SRC_MM100_LANG_CODE=en 12 | 13 | for data_type in books_10k books_20k books_30k; do 14 | for trg_lang_code in "${!LANG_CODES[@]}"; do 15 | root_output='Data/backtranslation/data_books/'${data_type} 16 | root_checkpoint_out=${root_output}/model_checkpoints 17 | tensorboard_log_dir=${root_output}/logdir 18 | 19 | mkdir -p $root_output/SPM/train/ 20 | mkdir -p $root_output/SPM/test/ 21 | mkdir -p $root_output/SPM/val/ 22 | 23 | trg_mm100_lang_code="${LANG_CODES[${trg_lang_code}]}" 24 | echo "${trg_lang_code}" "${trg_mm100_lang_code}" 25 | if [ ! -s "$root_output/generation_${trg_mm100_lang_code}_${SRC_MM100_LANG_CODE}/${SRC_LANG_CODE}.txt" ]; then 26 | echo "$root_output/generation_${trg_mm100_lang_code}_${SRC_MM100_LANG_CODE}/${SRC_LANG_CODE}.txt" doesn\'t exist 27 | continue 28 | fi 29 | checkpoint_out=${root_checkpoint_out}_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code} 30 | if [ -f "$checkpoint_out/checkpoint6.pt" ]; then 31 | echo "$checkpoint_out/checkpoint6.pt exists" 32 | else 33 | echo "$checkpoint_out/checkpoint6.pt doesn't exist!" 
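        # The block below SPM-encodes the back-translated training pair (the generated English text plus the
        # target-language monolingual text), together with the FLORES-101 dev/devtest splits, then binarizes
        # everything with the released flores101_mm100_615M dictionary and fine-tunes on it for 6 epochs.
        # Note: ${CHECKPOINT_IN} is not defined in this script; it is assumed to hold the checkpoint to
        # fine-tune from (e.g. flores101_mm100_615M/model.pt, as CHECKPOINT_IN_PATH does in finetune.sh).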
34 | 35 | python "$fairseq/scripts/spm_encode.py" \ 36 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 37 | --output_format=piece \ 38 | --inputs="$root_output/generation_${trg_mm100_lang_code}_${SRC_MM100_LANG_CODE}/${SRC_LANG_CODE}.txt" \ 39 | --outputs="$root_output/SPM/train/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}.${SRC_MM100_LANG_CODE}" 40 | 41 | python "$fairseq/scripts/spm_encode.py" \ 42 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 43 | --output_format=piece \ 44 | --inputs="$root_output/${trg_lang_code}_mono.txt" \ 45 | --outputs="$root_output/SPM/train/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}.${trg_mm100_lang_code}" 46 | 47 | python "$fairseq/scripts/spm_encode.py" \ 48 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 49 | --output_format=piece \ 50 | --inputs="$flores101_dataset/dev/${SRC_LANG_CODE}.dev" \ 51 | --outputs="$root_output/SPM/val/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}.${SRC_MM100_LANG_CODE}" 52 | 53 | python "$fairseq/scripts/spm_encode.py" \ 54 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 55 | --output_format=piece \ 56 | --inputs="$flores101_dataset/dev/${trg_lang_code}.dev" \ 57 | --outputs="$root_output/SPM/val/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}.${trg_mm100_lang_code}" 58 | 59 | python "$fairseq/scripts/spm_encode.py" \ 60 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 61 | --output_format=piece \ 62 | --inputs="$flores101_dataset/devtest/${SRC_LANG_CODE}.devtest" \ 63 | --outputs="$root_output/SPM/test/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}.${SRC_MM100_LANG_CODE}" 64 | 65 | python "$fairseq/scripts/spm_encode.py" \ 66 | --model "$fairseq/flores101_mm100_615M/sentencepiece.bpe.model" \ 67 | --output_format=piece \ 68 | --inputs="$flores101_dataset/devtest/${trg_lang_code}.devtest" \ 69 | --outputs="$root_output/SPM/test/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}.${trg_mm100_lang_code}" 70 | 71 | # #### Binarization 72 | fairseq-preprocess \ 73 | --source-lang ${SRC_MM100_LANG_CODE} --target-lang "${trg_mm100_lang_code}" \ 74 | --validpref "$root_output/SPM/val/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}" \ 75 | --trainpref "$root_output/SPM/train/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}" \ 76 | --testpref "$root_output/SPM/test/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}" \ 77 | --thresholdsrc 0 --thresholdtgt 0 \ 78 | --destdir "$root_output/data_bin_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}" \ 79 | --srcdict "$fairseq/flores101_mm100_615M/dict.txt" --tgtdict "$fairseq/flores101_mm100_615M/dict.txt" 80 | 81 | lang_pairs="${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}" 82 | fairseq-train --fp16 \ 83 | --memory-efficient-fp16 \ 84 | "$root_output/data_bin_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}" \ 85 | --finetune-from-model "${CHECKPOINT_IN}" \ 86 | --task 'translation_multi_simple_epoch' \ 87 | --arch transformer_wmt_en_de_big --share-all-embeddings \ 88 | --encoder-layers 12 --decoder-layers 12 \ 89 | --encoder-attention-heads 16 --decoder-attention-heads 16 \ 90 | --encoder-embed-dim 1024 --decoder-embed-dim 1024 \ 91 | --encoder-ffn-embed-dim 4096 --decoder-ffn-embed-dim 4096 \ 92 | --encoder-normalize-before --decoder-normalize-before \ 93 | --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.0 \ 94 | --weight-decay 0.0 \ 95 | --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \ 96 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0 
--adam-eps 1e-08 \ 97 | --lr-scheduler inverse_sqrt --lr 0.0002 --warmup-updates 4000 --warmup-init-lr 1e-7 --max-update 10000000 \ 98 | --max-tokens 3400 \ 99 | --encoder-langtok 'src' \ 100 | --decoder-langtok \ 101 | --lang-pairs "$lang_pairs" \ 102 | --sampling-method 'temperature' --sampling-temperature 5.0 \ 103 | --source-lang ${SRC_MM100_LANG_CODE} --target-lang "${trg_mm100_lang_code}" \ 104 | --update-freq 2 \ 105 | --seed 2 \ 106 | --max-source-positions 1024 --max-target-positions 1024 \ 107 | --max-epoch 6 --save-interval 3 \ 108 | --tensorboard-logdir ${tensorboard_log_dir} \ 109 | --save-dir "${checkpoint_out}" 110 | 111 | python -c "import torch; torch.cuda.empty_cache()" 112 | fi 113 | 114 | if [ -s "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" ]; then 115 | echo "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" exists 116 | else 117 | echo "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" doesn\'t exist 118 | 119 | fairseq-generate \ 120 | "$root_output/data_bin_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}" \ 121 | --batch-size 512 \ 122 | --path "${checkpoint_out}/checkpoint_best.pt" \ 123 | --fixed-dictionary "$fairseq/flores101_mm100_615M/dict.txt" \ 124 | -s ${SRC_MM100_LANG_CODE} -t "${trg_mm100_lang_code}" \ 125 | --remove-bpe 'sentencepiece' \ 126 | --beam 5 \ 127 | --task translation_multi_simple_epoch \ 128 | --lang-pairs "$fairseq/flores101_mm100_615M/language_pairs.txt" \ 129 | --decoder-langtok --encoder-langtok src \ 130 | --gen-subset test \ 131 | --fp16 \ 132 | --dataset-impl mmap \ 133 | --distributed-world-size 1 --distributed-no-spawn \ 134 | --results-path "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code} " 135 | fi 136 | 137 | if [ -s "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/generate-test.txt" ]; then 138 | echo "Evaluation: " $root_output/results.txt 139 | ## clean fairseq generated file to only create hypotheses file. 140 | grep -P '^H-' "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/generate-test.txt" | 141 | cut -c 3- | 142 | sort -n -k 1 | 143 | awk -F "\t" '{print $NF}' \ 144 | >"$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" 145 | ## Evaluate 146 | echo "${data_type} ${trg_lang_code}" >>$root_output/results.txt 147 | sacrebleu \ 148 | "$flores101_dataset/devtest/${trg_lang_code}.devtest" \ 149 | --tokenize spm \ 150 | <"$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" \ 151 | >>$root_output/results.txt 152 | fi 153 | done 154 | done 155 | -------------------------------------------------------------------------------- /ocr/OCR_impact_BT/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [[ -z "$fairseq" || -z "$flores101_dataset" ]]; then 4 | echo 'Need to specify the env vars fairseq and flores101_dataset.' 
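  # fairseq is expected to point to a fairseq checkout with the released flores101_mm100_615M model
  # extracted inside it, and flores101_dataset to the extracted FLORES-101 dataset (the dev/ and
  # devtest/ splits are used for validation and test below).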
5 | exit 1 6 | fi 7 | 8 | CHECKPOINT_IN_PATH=$fairseq/flores101_mm100_615M/model.pt 9 | 10 | source OCR_impact_BT/lang_codes.source 11 | 12 | SRC_LANG_CODE=eng 13 | SRC_MM100_LANG_CODE=en 14 | 15 | SIZE=20k 16 | LOG_FILE="Data/backtranslation/data_cer_$SIZE/logs.txt" 17 | 18 | for trg_lang_code in "${!LANG_CODES[@]}"; do 19 | for error_rate in {1..22..6}; do 20 | for error_type in insert delete replace; do 21 | data_type=${error_rate}/${error_type} 22 | root_output="Data/backtranslation/data_cer_$SIZE/${data_type}" 23 | root_checkpoint_out=${root_output}/model_checkpoints 24 | tensorboard_log_dir=${root_output}/logdir 25 | 26 | mkdir -p $root_output/SPM/train/ 27 | mkdir -p $root_output/SPM/test/ 28 | mkdir -p $root_output/SPM/val/ 29 | 30 | trg_mm100_lang_code="${LANG_CODES[${trg_lang_code}]}" 31 | echo "${trg_lang_code}" "${trg_mm100_lang_code}" 32 | if [ ! -s "$root_output/generation_${trg_mm100_lang_code}_${SRC_MM100_LANG_CODE}/${SRC_LANG_CODE}.txt" ]; then 33 | echo "$root_output/generation_${trg_mm100_lang_code}_${SRC_MM100_LANG_CODE}/${SRC_LANG_CODE}.txt" doesn\'t exist 34 | continue 35 | fi 36 | checkpoint_out=${root_checkpoint_out}_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code} 37 | echo "${checkpoint_out}" 38 | if [ -f "$checkpoint_out/checkpoint6.pt" ]; then 39 | echo "$checkpoint_out/checkpoint6.pt exists" 40 | else 41 | echo "$checkpoint_out/checkpoint6.pt doesn't exist!" 42 | 43 | python "${fairseq}"/scripts/spm_encode.py \ 44 | --model "${fairseq}"/flores101_mm100_615M/sentencepiece.bpe.model \ 45 | --output_format=piece \ 46 | --inputs=$root_output/generation_"${trg_mm100_lang_code}"_${SRC_MM100_LANG_CODE}/${SRC_LANG_CODE}.txt \ 47 | --outputs=$root_output/SPM/train/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}".${SRC_MM100_LANG_CODE} 48 | 49 | python "${fairseq}"/scripts/spm_encode.py \ 50 | --model "${fairseq}"/flores101_mm100_615M/sentencepiece.bpe.model \ 51 | --output_format=piece \ 52 | --inputs=$root_output/"${trg_lang_code}"_mono.txt \ 53 | --outputs=$root_output/SPM/train/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}"."${trg_mm100_lang_code}" 54 | 55 | python "${fairseq}"/scripts/spm_encode.py \ 56 | --model "${fairseq}"/flores101_mm100_615M/sentencepiece.bpe.model \ 57 | --output_format=piece \ 58 | --inputs="${flores101_dataset}"/dev/${SRC_LANG_CODE}.dev \ 59 | --outputs=$root_output/SPM/val/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}".${SRC_MM100_LANG_CODE} 60 | 61 | python "${fairseq}"/scripts/spm_encode.py \ 62 | --model "${fairseq}"/flores101_mm100_615M/sentencepiece.bpe.model \ 63 | --output_format=piece \ 64 | --inputs="${flores101_dataset}"/dev/"${trg_lang_code}".dev \ 65 | --outputs=$root_output/SPM/val/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}"."${trg_mm100_lang_code}" 66 | 67 | python "${fairseq}"/scripts/spm_encode.py \ 68 | --model "${fairseq}"/flores101_mm100_615M/sentencepiece.bpe.model \ 69 | --output_format=piece \ 70 | --inputs="${flores101_dataset}"/devtest/${SRC_LANG_CODE}.devtest \ 71 | --outputs=$root_output/SPM/test/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}".${SRC_MM100_LANG_CODE} 72 | 73 | python "${fairseq}"/scripts/spm_encode.py \ 74 | --model "${fairseq}"/flores101_mm100_615M/sentencepiece.bpe.model \ 75 | --output_format=piece \ 76 | --inputs="${flores101_dataset}"/devtest/"${trg_lang_code}".devtest \ 77 | --outputs="$root_output/SPM/test/spm.${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}.${trg_mm100_lang_code}" 78 | 79 | # #### Binarization 80 | fairseq-preprocess \ 81 | --source-lang 
${SRC_MM100_LANG_CODE} --target-lang "${trg_mm100_lang_code}" \ 82 | --validpref $root_output/SPM/val/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}" \ 83 | --trainpref $root_output/SPM/train/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}" \ 84 | --testpref $root_output/SPM/test/spm.${SRC_MM100_LANG_CODE}-"${trg_mm100_lang_code}" \ 85 | --thresholdsrc 0 --thresholdtgt 0 \ 86 | --destdir $root_output/data_bin_${SRC_MM100_LANG_CODE}_"${trg_mm100_lang_code}" \ 87 | --srcdict "${fairseq}"/flores101_mm100_615M/dict.txt --tgtdict "${fairseq}"/flores101_mm100_615M/dict.txt 88 | 89 | lang_pairs="${SRC_MM100_LANG_CODE}-${trg_mm100_lang_code}" 90 | fairseq-train --fp16 \ 91 | --memory-efficient-fp16 \ 92 | "$root_output/data_bin_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}" \ 93 | --finetune-from-model "${CHECKPOINT_IN_PATH}" \ 94 | --task 'translation_multi_simple_epoch' \ 95 | --arch transformer_wmt_en_de_big --share-all-embeddings \ 96 | --encoder-layers 12 --decoder-layers 12 \ 97 | --encoder-attention-heads 16 --decoder-attention-heads 16 \ 98 | --encoder-embed-dim 1024 --decoder-embed-dim 1024 \ 99 | --encoder-ffn-embed-dim 4096 --decoder-ffn-embed-dim 4096 \ 100 | --encoder-normalize-before --decoder-normalize-before \ 101 | --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.0 \ 102 | --weight-decay 0.0 \ 103 | --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \ 104 | --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0 --adam-eps 1e-08 \ 105 | --lr-scheduler inverse_sqrt --lr 0.0002 --warmup-updates 4000 --warmup-init-lr 1e-7 --max-update 10000000 \ 106 | --max-tokens 3400 \ 107 | --encoder-langtok 'src' \ 108 | --decoder-langtok \ 109 | --lang-pairs "$lang_pairs" \ 110 | --sampling-method 'temperature' --sampling-temperature 5.0 \ 111 | --source-lang ${SRC_MM100_LANG_CODE} --target-lang "${trg_mm100_lang_code}" \ 112 | --update-freq 2 \ 113 | --seed 2 \ 114 | --max-source-positions 1024 --max-target-positions 1024 \ 115 | --max-epoch 6 --save-interval 3 \ 116 | --tensorboard-logdir ${tensorboard_log_dir} \ 117 | --save-dir "${checkpoint_out}" \ 118 | --log-file "${LOG_FILE}" 119 | 120 | python -c "import torch; torch.cuda.empty_cache()" 121 | fi 122 | 123 | if [ -s "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" ]; then 124 | echo "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" exists 125 | else 126 | echo "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" doesn\'t exist 127 | 128 | fairseq-generate \ 129 | "$root_output/data_bin_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}" \ 130 | --batch-size 512 \ 131 | --path "${checkpoint_out}/checkpoint_best.pt" \ 132 | --fixed-dictionary "${fairseq}"/flores101_mm100_615M/dict.txt \ 133 | -s ${SRC_MM100_LANG_CODE} -t "${trg_mm100_lang_code}" \ 134 | --remove-bpe 'sentencepiece' \ 135 | --beam 5 \ 136 | --task translation_multi_simple_epoch \ 137 | --lang-pairs "${fairseq}"/flores101_mm100_615M/language_pairs.txt \ 138 | --decoder-langtok --encoder-langtok src \ 139 | --gen-subset test \ 140 | --fp16 \ 141 | --dataset-impl mmap \ 142 | --distributed-world-size 1 --distributed-no-spawn \ 143 | --results-path $root_output/generation_${SRC_MM100_LANG_CODE}_"${trg_mm100_lang_code}" 144 | fi 145 | 146 | if [ -s $root_output/generation_${SRC_MM100_LANG_CODE}_"${trg_mm100_lang_code}"/generate-test.txt ]; then 147 | ## clean fairseq generated file to only create hypotheses file. 
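      ## fairseq-generate writes each hypothesis as "H-<sentence id>\t<score>\t<detokenized text>";
      ## the pipeline below keeps only those lines, strips the "H-" prefix, restores the original
      ## sentence order, and keeps the last tab-separated field (the hypothesis text) for scoring.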
148 | grep -P '^H-' "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/generate-test.txt" \ 149 | | cut -c 3- \ 150 | | sort -n -k 1 \ 151 | | awk -F "\t" '{print $NF}' \ 152 | > "$root_output/generation_${SRC_MM100_LANG_CODE}_${trg_mm100_lang_code}/${trg_lang_code}.txt" 153 | fi 154 | 155 | done 156 | done 157 | done 158 | -------------------------------------------------------------------------------- /previous_releases/floresv1/scripts/translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # All rights reserved. 4 | # 5 | # This source code is licensed under the license found in the 6 | # LICENSE file in the root directory of this source tree. 7 | # 8 | 9 | import argparse 10 | import os 11 | import torch 12 | from subprocess import check_call, check_output 13 | from glob import glob 14 | from tempfile import NamedTemporaryFile as TempFile 15 | import time 16 | import subprocess 17 | import multiprocessing as mp 18 | from utils import check_last_line, count_line 19 | import tqdm 20 | 21 | 22 | def translate_files_slurm(args, cmds, expected_output_files): 23 | conda_env = '/private/home/pipibjc/.conda/envs/fairseq-20190509' 24 | for cmd in cmds: 25 | with TempFile('w') as script: 26 | sh = f"""#!/bin/bash 27 | source activate {conda_env} 28 | {cmd} 29 | """ 30 | print(sh) 31 | script.write(sh) 32 | script.flush() 33 | cmd = f"sbatch --gres=gpu:1 -c {args.cpu + 2} {args.sbatch_args} --time=15:0:0 {script.name}" 34 | import sys 35 | print(cmd, file=sys.stderr) 36 | check_call(cmd, shell=True) 37 | 38 | # wait for all outputs has finished 39 | num_finished = 0 40 | while num_finished < len(expected_output_files): 41 | num_finished = 0 42 | for output_file in expected_output_files: 43 | num_finished += 1 if check_finished(output_file) else 0 44 | if num_finished < len(expected_output_files): 45 | time.sleep(3 * 60) 46 | print("sleeping for 3m ...") 47 | 48 | 49 | def check_finished(output_file): 50 | return check_last_line(output_file, "finished") 51 | 52 | 53 | def get_output_file(dest_dir, file): 54 | return f"{dest_dir}/{os.path.basename(file)}.log" 55 | 56 | 57 | def translate(arg_list): 58 | (q, cmd) = arg_list 59 | i = q.get() 60 | os.environ['CUDA_VISIBLE_DEVICES']=str(i) 61 | cmd = f"CUDA_VISIBLE_DEVICES={i} {cmd}" 62 | print(f"executing:\n{cmd}") 63 | check_call(cmd, shell=True) 64 | q.put(i) 65 | 66 | 67 | def translate_files_local(args, cmds): 68 | m = mp.Manager() 69 | gpu_queue = m.Queue() 70 | for i in args.cuda_visible_device_ids: 71 | gpu_queue.put(i) 72 | with mp.Pool(processes=len(args.cuda_visible_device_ids)) as pool: 73 | for _ in tqdm.tqdm(pool.imap_unordered(translate, [(gpu_queue, cmd) for cmd in cmds]), total=len(cmds)): 74 | pass 75 | 76 | 77 | def translate_files(args, dest_dir, input_files): 78 | cmd_template = f"""fairseq-interactive \ 79 | {args.databin} \ 80 | --source-lang {args.source_lang} --target-lang {args.target_lang} \ 81 | --path {args.model} \ 82 | --lenpen {args.lenpen} \ 83 | --max-len-a {args.max_len_a} \ 84 | --max-len-b {args.max_len_b} \ 85 | --buffer-size {args.buffer_size} \ 86 | --max-tokens {args.max_tokens} \ 87 | --num-workers {args.cpu} > {{output_file}} && \ 88 | echo "finished" >> {{output_file}} 89 | """ 90 | cmds = [] 91 | expected_output_files = [] 92 | for input_file in input_files: 93 | output_file = get_output_file(dest_dir, input_file) 94 | cmds.append(f"cat {input_file} | " + 
cmd_template.format(output_file=output_file)) 95 | expected_output_files.append(output_file) 96 | if args.backend == 'local': 97 | translate_files_local(args, cmds) 98 | elif args.backend == 'slurm': 99 | translate_files_slurm(args, cmds, expected_output_files) 100 | 101 | 102 | def main(): 103 | parser = argparse.ArgumentParser() 104 | parser.add_argument('--data', '-d', required=True, help='Path to file to translate') 105 | parser.add_argument('--model', '-m', required=True, help='Model checkpoint') 106 | parser.add_argument('--lenpen', default=1.2, type=float, help='Length penalty') 107 | parser.add_argument('--beam', default=5, type=int, help='Beam size') 108 | parser.add_argument('--max-len-a', type=float, default=0, help='max-len-a parameter when back-translating') 109 | parser.add_argument('--max-len-b', type=int, default=200, help='max-len-b parameter when back-translating') 110 | parser.add_argument('--cpu', type=int, default=4, help='Number of CPU for interactive.py') 111 | parser.add_argument('--cuda-visible-device-ids', '-gids', default=None, nargs='*', help='List of cuda visible devices ids, camma separated') 112 | parser.add_argument('--dest', help='Output path for the intermediate and translated file') 113 | parser.add_argument('--max-tokens', type=int, default=12000, help='max tokens') 114 | parser.add_argument('--buffer-size', type=int, default=10000, help='Buffer size') 115 | parser.add_argument('--chunks', type=int, default=100) 116 | parser.add_argument('--source-lang', type=str, default=None, help='Source langauge. Will inference from the model if not set') 117 | parser.add_argument('--target-lang', type=str, default=None, help='Target langauge. Will inference from the model if not set') 118 | parser.add_argument('--databin', type=str, default=None, help='Parallel databin. Will combine with the back-translated databin') 119 | parser.add_argument('--sbatch-args', default='', help='Extra SBATCH arguments') 120 | 121 | parser.add_argument('--backend', type=str, default='local', choices=['local', 'slurm']) 122 | args = parser.parse_args() 123 | 124 | args.cuda_visible_device_ids = args.cuda_visible_device_ids or list(range(torch.cuda.device_count())) 125 | 126 | chkpnt = torch.load(args.model) 127 | model_args = chkpnt['args'] 128 | if args.source_lang is None or args.target_lang is None: 129 | args.source_lang = args.source_lang or model_args.source_lang 130 | args.target_lang = args.target_lang or model_args.target_lang 131 | if args.databin is None: 132 | args.databin = args.databin or model_args.data 133 | 134 | root_dir = os.path.dirname(os.path.realpath(__file__)) 135 | translation_dir = os.path.join(args.dest or root_dir, 'translations', f'{args.source_lang}-{args.target_lang}') 136 | 137 | tempdir = os.path.join(translation_dir, 'splits') 138 | os.makedirs(tempdir, exist_ok=True) 139 | split_files = glob(f'{tempdir}/mono_data*') 140 | 141 | if len(split_files) != args.chunks: 142 | if len(split_files) != 0: 143 | print("number of split files are not the same as chunks. 
removing files and re-split") 144 | [os.remove(os.path.join(tempdir, f)) for f in os.listdir(tempdir)] 145 | print("splitting files ...") 146 | check_call(f'split -n "r/{args.chunks}" -a3 -d {args.data} {tempdir}/mono_data', shell=True) 147 | split_files = glob(f'{tempdir}/mono_data*') 148 | else: 149 | print("has the same number of splitted file and the specified chunks, skip splitting file") 150 | 151 | translated_files = [] 152 | files_to_translate = [] 153 | for file in split_files: 154 | # skip the translation job if it's finished 155 | output_file = get_output_file(translation_dir, file) 156 | translated_files.append(output_file) 157 | if check_finished(output_file): 158 | print(f"{output_file} is translated") 159 | continue 160 | files_to_translate.append(file) 161 | 162 | print(f"{len(files_to_translate)} files to translate") 163 | 164 | translate_files(args, translation_dir, files_to_translate) 165 | 166 | # aggregate translated files 167 | generated_src = f'{args.dest}/generated.src' 168 | generated_tgt = f'{args.dest}/generated.hypo' 169 | if count_line(generated_src) != count_line(generated_tgt) or count_line(generated_src) <= 0: 170 | print(f"aggregating translated {len(translated_files)} files") 171 | with TempFile() as fout: 172 | files = " ".join(translated_files) 173 | check_call(f"cat {files}", shell=True, stdout=fout) 174 | # strip head and make pairs 175 | check_call(f'cat {fout.name} | grep "^S" | cut -f2 > {generated_src}', shell=True) 176 | check_call(f'cat {fout.name} | grep "^H" | cut -f3 > {generated_tgt}', shell=True) 177 | assert count_line(generated_src) == count_line(generated_tgt) 178 | print(f"output generated files to {generated_src}, {generated_tgt}") 179 | 180 | 181 | if __name__ == '__main__': 182 | main() 183 | -------------------------------------------------------------------------------- /toxicity/README.md: -------------------------------------------------------------------------------- 1 | # Toxicity-200 2 | 3 | ** Warning: The files included in this contain toxic language. ** 4 | 5 | This repository contains files that include frequent words and phrases generally considered toxic because they represent: 6 | * Frequently used profanities 7 | * Frequently used insults and hate speech terms, or language used to bully, denigrate, or demean 8 | * Pornographic terms 9 | * Terms for body parts associated with sexual activity 10 | 11 | -------------------------------------------------------------------------------- 12 | 13 | ## Download 14 | 15 | Toxicity-200 can be downloaded [here](https://tinyurl.com/NLLB200TWL) which you can download with the following command: 16 | 17 | ```bash 18 | wget --trust-server-names https://tinyurl.com/NLLB200TWL 19 | ``` 20 | 21 | ## Purpose, Ethical Considerations, and Use of the Lists 22 | The primary purpose of such lists is to help with translation model safety by monitoring for hallucinated toxicity. By *hallucinated toxicity*, we mean the presence of toxic items in the translated text when no such toxic items can be found in the source text. 23 | 24 | The lists were collected via human translation. Any such translation effort inevitably poses risks of bias. The likelihood of getting access to professionals with diverse backgrounds and worldviews is not equal across all supported languages. 
In addition to the work that has already been done to mitigate biases, which can also introduce its own potential biases, the ultimate mitigation strategy can be to provide the community with free access to the lists, and to welcome feedback and contributions from the community in all supported languages. 25 | 26 | The files are in zip format, and unzipping is password protected. To unzip the files after downloading, you may use the following command line: 27 | `unzip --password tL4nLLb [BCP47_code]_twl.zip` 28 | The unzipping of the files implies that you consent to viewing their contents. 29 | 30 | Language codes for all languages can be found in the below table (see **Project Status**). The BCP 47 language codes include an ISO 639-3 base tag to identify the language and ISO 15924 supplemental tag to identify the script (e.g., taq_Tfng for Tamasheq in Tifinagh script). The codes mirror those used for the release of the FLORES-200 data sets. However, in cases where FLORES-200 targets a specific lect, the corresponding lists may not be as restrictive in that they may include items from closely related lects. 31 | 32 | ## Languages in Toxicity-200 33 | The following toxicity lists are currently available in these languages: 34 | 35 | BCP 47 Code | Language 36 | ----------- | ---------------------------------- 37 | ace_Arab | Acehnese (Arabic script) 38 | ace_Latn | Acehnese (Latin script) 39 | acm_Arab | Mesopotamian Arabic 40 | acq_Arab | Ta’izzi-Adeni Arabic 41 | aeb_Arab | Tunisian Arabic 42 | afr_Latn | Afrikaans 43 | ajp_Arab | South Levantine Arabic 44 | aka_Latn | Akan 45 | als_Latn | Tosk Albanian 46 | amh_Ethi | Amharic 47 | apc_Arab | North Levantine Arabic 48 | arb_Arab | Modern Standard Arabic 49 | arb_Latn | Modern Standard Arabic (Romanized) 50 | ars_Arab | Najdi Arabic 51 | ary_Arab | Moroccan Arabic 52 | arz_Arab | Egyptian Arabic 53 | asm_Beng | Assamese 54 | ast_Latn | Asturian 55 | awa_Deva | Awadhi 56 | ayr_Latn | Central Aymara 57 | azb_Arab | South Azerbaijani 58 | azj_Latn | North Azerbaijani 59 | bak_Cyrl | Bashkir 60 | bam_Latn | Bambara 61 | ban_Latn | Balinese 62 | bel_Cyrl | Belarusian 63 | bem_Latn | Bemba 64 | ben_Beng | Bengali 65 | bho_Deva | Bhojpuri 66 | bjn_Arab | Banjar (Arabic script) 67 | bjn_Latn | Banjar (Latin script) 68 | bod_Tibt | Standard Tibetan 69 | bos_Latn | Bosnian 70 | bug_Latn | Buginese 71 | bul_Cyrl | Bulgarian 72 | cat_Latn | Catalan 73 | ceb_Latn | Cebuano 74 | ces_Latn | Czech 75 | cjk_Latn | Chokwe 76 | ckb_Arab | Central Kurdish 77 | crh_Latn | Crimean Tatar 78 | cym_Latn | Welsh 79 | dan_Latn | Danish 80 | deu_Latn | German 81 | dik_Latn | Southwestern Dinka 82 | dyu_Latn | Dyula 83 | dzo_Tibt | Dzongkha 84 | ell_Grek | Greek 85 | eng_Latn | English 86 | epo_Latn | Esperanto 87 | est_Latn | Estonian 88 | eus_Latn | Basque 89 | ewe_Latn | Ewe 90 | fao_Latn | Faroese 91 | fij_Latn | Fijian 92 | fin_Latn | Finnish 93 | fon_Latn | Fon 94 | fra_Latn | French 95 | fur_Latn | Friulian 96 | fuv_Latn | Nigerian Fulfulde 97 | gaz_Latn | West Central Oromo 98 | gla_Latn | Scottish Gaelic 99 | gle_Latn | Irish 100 | glg_Latn | Galician 101 | grn_Latn | Guarani 102 | guj_Gujr | Gujarati 103 | hat_Latn | Haitian Creole 104 | hau_Latn | Hausa 105 | heb_Hebr | Hebrew 106 | hin_Deva | Hindi 107 | hne_Deva | Chhattisgarhi 108 | hrv_Latn | Croatian 109 | hun_Latn | Hungarian 110 | hye_Armn | Armenian 111 | ibo_Latn | Igbo 112 | ilo_Latn | Ilocano 113 | ind_Latn | Indonesian 114 | isl_Latn | Icelandic 115 | ita_Latn | Italian 116 | jav_Latn | Javanese 
117 | jpn_Jpan | Japanese 118 | kab_Latn | Kabyle 119 | kac_Latn | Jingpho 120 | kam_Latn | Kamba 121 | kan_Knda | Kannada 122 | kas_Arab | Kashmiri (Arabic script) 123 | kas_Deva | Kashmiri (Devanagari script) 124 | kat_Geor | Georgian 125 | kaz_Cyrl | Kazakh 126 | kbp_Latn | Kabiyè 127 | kea_Latn | Kabuverdianu 128 | khk_Cyrl | Halh Mongolian 129 | khm_Khmr | Khmer 130 | kik_Latn | Kikuyu 131 | kin_Latn | Kinyarwanda 132 | kir_Cyrl | Kyrgyz 133 | kmb_Latn | Kimbundu 134 | kmr_Latn | Northern Kurdish 135 | knc_Arab | Central Kanuri (Arabic script) 136 | knc_Latn | Central Kanuri (Latin script) 137 | kon_Latn | Kikongo 138 | kor_Hang | Korean 139 | lao_Laoo | Lao 140 | lij_Latn | Ligurian 141 | lim_Latn | Limburgish 142 | lin_Latn | Lingala 143 | lit_Latn | Lithuanian 144 | lmo_Latn | Lombard 145 | ltg_Latn | Latgalian 146 | ltz_Latn | Luxembourgish 147 | lua_Latn | Luba-Kasai 148 | lug_Latn | Ganda 149 | luo_Latn | Luo 150 | lus_Latn | Mizo 151 | lvs_Latn | Standard Latvian 152 | mag_Deva | Magahi 153 | mai_Deva | Maithili 154 | mal_Mlym | Malayalam 155 | mar_Deva | Marathi 156 | min_Arab | Minangkabau (Arabic script) 157 | min_Latn | Minangkabau (Latin script) 158 | mkd_Cyrl | Macedonian 159 | mlt_Latn | Maltese 160 | mni_Beng | Meitei (Bengali script) 161 | mos_Latn | Mossi 162 | mri_Latn | Maori 163 | mya_Mymr | Burmese 164 | nld_Latn | Dutch 165 | nno_Latn | Norwegian Nynorsk 166 | nob_Latn | Norwegian Bokmål 167 | npi_Deva | Nepali 168 | nso_Latn | Northern Sotho 169 | nus_Latn | Nuer 170 | nya_Latn | Nyanja 171 | oci_Latn | Occitan 172 | ory_Orya | Odia 173 | pag_Latn | Pangasinan 174 | pan_Guru | Eastern Panjabi 175 | pap_Latn | Papiamento 176 | pbt_Arab | Southern Pashto 177 | pes_Arab | Western Persian 178 | plt_Latn | Plateau Malagasy 179 | pol_Latn | Polish 180 | por_Latn | Portuguese 181 | prs_Arab | Dari 182 | quy_Latn | Ayacucho Quechua 183 | ron_Latn | Romanian 184 | run_Latn | Rundi 185 | rus_Cyrl | Russian 186 | sag_Latn | Sango 187 | san_Deva | Sanskrit 188 | sat_Olck | Santali 189 | scn_Latn | Sicilian 190 | shn_Mymr | Shan 191 | sin_Sinh | Sinhala 192 | slk_Latn | Slovak 193 | slv_Latn | Slovenian 194 | smo_Latn | Samoan 195 | sna_Latn | Shona 196 | snd_Arab | Sindhi 197 | som_Latn | Somali 198 | sot_Latn | Southern Sotho 199 | spa_Latn | Spanish 200 | srd_Latn | Sardinian 201 | srp_Cyrl | Serbian 202 | ssw_Latn | Swati 203 | sun_Latn | Sundanese 204 | swe_Latn | Swedish 205 | swh_Latn | Swahili 206 | szl_Latn | Silesian 207 | tam_Taml | Tamil 208 | taq_Latn | Tamasheq (Latin script) 209 | taq_Tfng | Tamasheq (Tifinagh script) 210 | tat_Cyrl | Tatar 211 | tel_Telu | Telugu 212 | tgk_Cyrl | Tajik 213 | tgl_Latn | Tagalog 214 | tha_Thai | Thai 215 | tir_Ethi | Tigrinya 216 | tpi_Latn | Tok Pisin 217 | tsn_Latn | Tswana 218 | tso_Latn | Tsonga 219 | tuk_Latn | Turkmen 220 | tum_Latn | Tumbuka 221 | tur_Latn | Turkish 222 | twi_Latn | Twi 223 | tzm_Tfng | Central Atlas Tamazight 224 | uig_Arab | Uyghur 225 | ukr_Cyrl | Ukrainian 226 | umb_Latn | Umbundu 227 | urd_Arab | Urdu 228 | uzn_Latn | Northern Uzbek 229 | vec_Latn | Venetian 230 | vie_Latn | Vietnamese 231 | war_Latn | Waray 232 | wol_Latn | Wolof 233 | xho_Latn | Xhosa 234 | ydd_Hebr | Eastern Yiddish 235 | yor_Latn | Yoruba 236 | yue_Hant | Yue Chinese 237 | zho_Hans | Chinese (Simplified) 238 | zho_Hant | Chinese (Traditional) 239 | zsm_Latn | Standard Malay 240 | zul_Latn | Zulu 241 | 242 | ## Latest Update 243 | Date: 2022-12-14 244 | Files: 245 | BCP 47 Code | Language 246 | ----------- | 
---------------------------------- 247 | est_Latn | Estonian 248 | fra_Latn | French 249 | nld_Latn | Dutch 250 | -------------------------------------------------------------------------------- /previous_releases/flores101/README.md: -------------------------------------------------------------------------------- 1 | ## Download FLORES-101 Dataset 2 | The data can be downloaded from: [Here](https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz). 3 | 4 | A supplement to FLORES-101 was released for the WMT22 Shared Task on [Large-Scale Machine Translation Evaluation for African Languages](https://statmt.org/wmt22/large-scale-multilingual-translation-task.html) and can be downloaded from [here](https://dl.fbaipublicfiles.com/flores101/dataset/flores_wmt22_supplement.tar.gz). 5 | 6 | ## Evaluation 7 | 8 | ### SPM-BLEU 9 | For evaluation, we use SentencePiece BLEU (spBLEU), which applies a SentencePiece (SPM) tokenizer with 256K tokens and then computes the BLEU score on the sentence-piece tokenized text. This requires installing sacrebleu from a specific branch: 10 | ```bash 11 | git clone --single-branch --branch adding_spm_tokenized_bleu https://github.com/ngoyal2707/sacrebleu.git 12 | cd sacrebleu 13 | python setup.py install 14 | ``` 15 | 16 | ### Offline Evaluation 17 | 18 | #### Download FLORES-101 dev and devtest dataset 19 | 20 | ```bash 21 | cd ~/ 22 | wget https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz 23 | tar -xvzf flores101_dataset.tar.gz 24 | ``` 25 | 26 | #### Compute spBLEU 27 | 28 | Instructions for computing spBLEU for detokenized translations generated by a model: 29 | 30 | ```bash 31 | flores101_devtest=flores101_dataset/devtest 32 | 33 | # Path to generated detokenized translations file 34 | translation_file=/path/to/detok_trans.txt 35 | 36 | # Set the target language (for this example, English) 37 | trg_lang=eng 38 | 39 | cat $translation_file | sacrebleu -tok spm $flores101_devtest/${trg_lang}.devtest 40 | ``` 41 | 42 | ### Example walkthrough of Generation and Evaluation using a pre-trained model in fairseq 43 | 44 | The following example walks through generating and evaluating with the released `M2M-124 615M` model on an example language pair, `Nyanja -> Swahili`, on the `FLORES-101` `devtest` set, which achieves `12.4` spBLEU. 45 | 46 | #### Download model, sentencepiece vocab 47 | 48 | ```bash 49 | fairseq=/path/to/fairseq 50 | cd $fairseq 51 | 52 | # Download 615M param model. 
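# (The archive unpacks to flores101_mm100_615M/, which provides the model.pt checkpoint together with
#  the dict.txt, sentencepiece.bpe.model and language_pairs.txt files referenced in the steps below.)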
53 | wget https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz 54 | 55 | # Extract 56 | tar -xvzf flores101_mm100_615M.tar.gz 57 | ``` 58 | 59 | #### Encode using our SentencePiece Model 60 | Note: Install SentencePiece from [here](https://github.com/google/sentencepiece) 61 | 62 | 63 | ```bash 64 | flores101_dataset=/path/to/flores_dataset 65 | fairseq=/path/to/fairseq 66 | cd $fairseq 67 | 68 | # Example lang pair translation: Nyanja -> Swahili 69 | # MM100 code for Nyanja and Swahili: ny, sw 70 | 71 | SRC_LANG_CODE=nya 72 | TRG_LANG_CODE=swh 73 | 74 | SRC_MM100_LANG_CODE=ny 75 | TRG_MM100_LANG_CODE=sw 76 | 77 | python scripts/spm_encode.py \ 78 | --model flores101_mm100_615M/sentencepiece.bpe.model \ 79 | --output_format=piece \ 80 | --inputs=$flores101_dataset/devtest/${SRC_LANG_CODE}.devtest \ 81 | --outputs=spm.${SRC_MM100_LANG_CODE}-${TRG_MM100_LANG_CODE}.${SRC_MM100_LANG_CODE} 82 | 83 | python scripts/spm_encode.py \ 84 | --model flores101_mm100_615M/sentencepiece.bpe.model \ 85 | --output_format=piece \ 86 | --inputs=$flores101_dataset/devtest/${TRG_LANG_CODE}.devtest \ 87 | --outputs=spm.${SRC_MM100_LANG_CODE}-${TRG_MM100_LANG_CODE}.${TRG_MM100_LANG_CODE} 88 | ``` 89 | 90 | #### Binarization 91 | 92 | ```bash 93 | fairseq-preprocess \ 94 | --source-lang ${SRC_MM100_LANG_CODE} --target-lang ${TRG_MM100_LANG_CODE} \ 95 | --testpref spm.${SRC_MM100_LANG_CODE}-${TRG_MM100_LANG_CODE} \ 96 | --thresholdsrc 0 --thresholdtgt 0 \ 97 | --destdir data_bin_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE} \ 98 | --srcdict flores101_mm100_615M/dict.txt --tgtdict flores101_mm100_615M/dict.txt 99 | ``` 100 | 101 | #### Generation 102 | 103 | 104 | ```bash 105 | fairseq-generate \ 106 | data_bin_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE} \ 107 | --batch-size 1 \ 108 | --path flores101_mm100_615M/model.pt \ 109 | --fixed-dictionary flores101_mm100_615M/dict.txt \ 110 | -s ${SRC_MM100_LANG_CODE} -t ${TRG_MM100_LANG_CODE} \ 111 | --remove-bpe 'sentencepiece' \ 112 | --beam 5 \ 113 | --task translation_multi_simple_epoch \ 114 | --lang-pairs flores101_mm100_615M/language_pairs.txt \ 115 | --decoder-langtok --encoder-langtok src \ 116 | --gen-subset test \ 117 | --fp16 \ 118 | --dataset-impl mmap \ 119 | --distributed-world-size 1 --distributed-no-spawn \ 120 | --results-path generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE} 121 | 122 | # clean fairseq generated file to only create hypotheses file. 
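# (fairseq-generate prefixes each hypothesis line with "H-<sentence id>"; the pipeline below keeps those
#  lines, restores the original sentence order, and extracts the text field into sys.txt for scoring.)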
123 | cat generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}/generate-test.txt | grep -P '^H-' | cut -c 3- | sort -n -k 1 | awk -F "\t" '{print $NF}' > generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}/sys.txt 124 | ``` 125 | 126 | #### spBLEU Evaluation 127 | 128 | 129 | ```bash 130 | # Get score 131 | sacrebleu flores101_dataset/devtest/${TRG_LANG_CODE}.devtest < generation_${SRC_MM100_LANG_CODE}_${TRG_MM100_LANG_CODE}/sys.txt --tokenize spm 132 | # Expected Outcome: 133 | # BLEU+case.mixed+numrefs.1+smooth.exp+tok.spm+version.1.5.0 = 12.4 34.9/15.8/8.7/4.9 (BP = 1.000 ratio = 1.007 hyp_len = 37247 ref_len = 36999) 134 | ``` 135 | 136 | ## List of Languages 137 | 138 | Language | FLORES-101 code | MM100 lang code 139 | ---|---|--- 140 | Akrikaans | afr | af 141 | Amharic | amh | am 142 | Arabic | ara | ar 143 | Armenian | hye | hy 144 | Assamese | asm | as 145 | Asturian | ast | ast 146 | Azerbaijani | azj | az 147 | Belarusian | bel | be 148 | Bengali | ben | bn 149 | Bosnian | bos | bs 150 | Bulgarian | bul | bg 151 | Burmese | mya | my 152 | Catalan | cat | ca 153 | Cebuano | ceb | ceb 154 | Chinese Simpl | zho_simpl | zho 155 | Chinese Trad | zho_trad | zho 156 | Croatian | hrv | hr 157 | Czech | ces | cs 158 | Danish | dan | da 159 | Dutch | nld | nl 160 | English | eng | en 161 | Estonian | est | et 162 | Filipino (Tagalog) | tgl | tl 163 | Finnish | fin | fi 164 | French | fra | fr 165 | Fulah | ful | ff 166 | Galician | glg | gl 167 | Ganda | lug | lg 168 | Georgian | kat | ka 169 | German | deu | de 170 | Greek | ell | el 171 | Gujarati | guj | gu 172 | Hausa | hau | ha 173 | Hebrew | heb | he 174 | Hindi | hin | hi 175 | Hungarian | hun | hu 176 | Icelandic | isl | is 177 | Igbo | ibo | ig 178 | Indonesian | ind | id 179 | Irish | gle | ga 180 | Italian | ita | it 181 | Japanese | jpn | ja 182 | Javanese | jav | jv 183 | Kabuverdianu | kea | kea 184 | Kamba | kam | kam 185 | Kannada | kan | kn 186 | Kazakh | kaz | kk 187 | Khmer | khm | km 188 | Korean | kor | ko 189 | Kyrgyz | kir | ky 190 | Lao | lao | lo 191 | Latvian | lav | lv 192 | Lingala | lin | ln 193 | Lithuanian | lit | lt 194 | Luo | luo | luo 195 | Luxembourgish | ltz | lb 196 | Macedonian | mkd | mk 197 | Malay | msa | ms 198 | Malayalam | mal | ml 199 | Maltese | mlt | mt 200 | Maori | mri | mi 201 | Marathi | mar | mr 202 | Mongolian | mon | mn 203 | Nepali | npi | ne 204 | Northern Sotho | nso | ns 205 | Norwegian | nob | no 206 | Nyanja | nya | ny 207 | Occitan | oci | oc 208 | Oriya | ory | or 209 | Oromo | orm | om 210 | Pashto | pus | ps 211 | Persian | fas | fa 212 | Polish | pol | pl 213 | Portuguese (Brazil) | por | pt 214 | Punjabi | pan | pa 215 | Romanian | ron | ro 216 | Russian | rus | ru 217 | Serbian | srp | sr 218 | Shona | sna | sn 219 | Sindhi | snd | sd 220 | Slovak | slk | sk 221 | Slovenian | slv | sl 222 | Somali | som | so 223 | Sorani Kurdish | ckb | ku 224 | Spanish (Latin American) | spa | es 225 | Swahili | swh | sw 226 | Swedish | swe | sv 227 | Tajik | tgk | tg 228 | Tamil | tam | ta 229 | Telugu | tel | te 230 | Thai | tha | th 231 | Turkish | tur | tr 232 | Ukrainian | ukr | uk 233 | Umbundu | umb | umb 234 | Urdu | urd | ur 235 | Uzbek | uzb | uz 236 | Vietnamese | vie | vi 237 | Welsh | cym | cy 238 | Wolof | wol | wo 239 | Xhosa | xho | xh 240 | Yoruba | yor | yo 241 | Zulu | zul | zu 242 | 243 | 244 | ## WMT Task 245 | The FLORES-101 dataset is being used for the WMT2021 Large-Scale Multilingual Machine Translation Shared Task. 
You can learn more about the task [HERE](http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html). We also provide two pretrained models, downloadable from the WMT task page. 246 | 247 | 248 | ## Citation 249 | 250 | If you use this data in your work, please cite: 251 | 252 | ```bibtex 253 | @inproceedings{, 254 | title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, 255 | author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\'{a}n, Francisco and Fan, Angela}, 256 | year={2021} 257 | } 258 | 259 | @inproceedings{, 260 | title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, 261 | author={Guzm\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, 262 | journal={arXiv preprint arXiv:1902.01382}, 263 | year={2019} 264 | } 265 | ``` -------------------------------------------------------------------------------- /previous_releases/floresv1/download-data.sh: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | #!/bin/bash 8 | # Downloads the data and creates data/all-clean.tgz within the current directory 9 | 10 | set -e 11 | set -o pipefail 12 | 13 | SRC=en 14 | SI_TGT=si 15 | NE_TGT=ne 16 | 17 | ROOT=$(dirname "$0") 18 | DATA=$ROOT/data 19 | NE_ROOT=$DATA/all-clean-ne 20 | SI_ROOT=$DATA/all-clean-si 21 | HI_ROOT=$DATA/all-clean-hi 22 | 23 | mkdir -p $DATA $NE_ROOT $SI_ROOT $HI_ROOT 24 | 25 | SI_OPUS_DATASETS=( 26 | "$SI_ROOT/GNOME.en-si" 27 | "$SI_ROOT/Ubuntu.en-si" 28 | "$SI_ROOT/KDE4.en-si" 29 | "$SI_ROOT/OpenSubtitles.en-si" 30 | ) 31 | 32 | SI_OPUS_URLS=( 33 | "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-si.txt.zip" 34 | "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-si.txt.zip" 35 | "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-si.txt.zip" 36 | "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-si.txt.zip" 37 | ) 38 | 39 | NE_OPUS_DATASETS=( 40 | "$NE_ROOT/GNOME.en-ne" 41 | "$NE_ROOT/Ubuntu.en-ne" 42 | "$NE_ROOT/KDE4.en-ne" 43 | ) 44 | 45 | NE_OPUS_URLS=( 46 | "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ne.txt.zip" 47 | "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ne.txt.zip" 48 | "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip" 49 | ) 50 | 51 | REMOVE_FILE_PATHS=() 52 | 53 | # Download data 54 | download_data() { 55 | CORPORA=$1 56 | URL=$2 57 | 58 | if [ -f $CORPORA ]; then 59 | echo "$CORPORA already exists, skipping download" 60 | else 61 | echo "Downloading $URL" 62 | wget $URL -O $CORPORA --no-check-certificate || rm -f $CORPORA 63 | if [ -f $CORPORA ]; then 64 | echo "$URL successfully downloaded." 65 | else 66 | echo "$URL not successfully downloaded." 
67 | rm -f $CORPORA 68 | exit -1 69 | fi 70 | fi 71 | } 72 | 73 | # Example: download_opus_data $LANG_ROOT $TGT 74 | download_opus_data() { 75 | LANG_ROOT=$1 76 | TGT=$2 77 | 78 | if [ "$TGT" = "si" ]; then 79 | URLS=("${SI_OPUS_URLS[@]}") 80 | DATASETS=("${SI_OPUS_DATASETS[@]}") 81 | else 82 | URLS=("${NE_OPUS_URLS[@]}") 83 | DATASETS=("${NE_OPUS_DATASETS[@]}") 84 | fi 85 | 86 | # Download and extract data 87 | for ((i=0;i<${#URLS[@]};++i)); do 88 | URL=${URLS[i]} 89 | CORPORA=${DATASETS[i]} 90 | 91 | download_data $CORPORA $URL 92 | unzip -o $CORPORA -d $LANG_ROOT 93 | REMOVE_FILE_PATHS+=( $CORPORA $CORPORA.xml $CORPORA.ids $LANG_ROOT/README $LANG_ROOT/LICENSE ) 94 | done 95 | 96 | cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC 97 | cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT 98 | 99 | REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC ) 100 | REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT ) 101 | } 102 | 103 | download_opus_data $SI_ROOT $SI_TGT 104 | cp ${SI_OPUS_DATASETS[3]}.$SRC $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SRC 105 | cp ${SI_OPUS_DATASETS[3]}.$SI_TGT $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SI_TGT 106 | REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT ) 107 | 108 | download_opus_data $NE_ROOT $NE_TGT 109 | 110 | 111 | # Download and extract Global Voices data 112 | GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en" 113 | GLOBAL_VOICES_URL="http://www.casmacat.eu/corpus/global-voices/globalvoices.ne-en.xliff.gz" 114 | 115 | download_data $GLOBAL_VOICES.gz $GLOBAL_VOICES_URL 116 | gunzip -Nf $GLOBAL_VOICES.gz 117 | 118 | sed -ne 's?.*<source>\(.*\)</source>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$NE_TGT 119 | sed -ne 's?.*<target[^>]*>\(.*\)</target>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$SRC 120 | 121 | REMOVE_FILE_PATHS+=( $GLOBAL_VOICES ) 122 | 123 | # Download and extract the bible dataset 124 | BIBLE_TOOLS=$ROOT/bible-corpus-tools 125 | XML_BIBLES=$ROOT/XML_Bibles 126 | XML_BIBLES_DUP=$ROOT/XML_Bibles_dup 127 | 128 | if [ ! -e $BIBLE_TOOLS ]; then 129 | echo "Cloning bible-corpus-tools repository..." 
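  # bible-corpus-tools provides the Java CreateMLBooks / CreateVerseAlignedBooks utilities used below
  # to turn the Bible corpus XML files into verse-aligned English-Nepali parallel text.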
130 | git clone https://github.com/christos-c/bible-corpus-tools.git 131 | fi 132 | 133 | mkdir -p $BIBLE_TOOLS/bin $XML_BIBLES $XML_BIBLES_DUP 134 | javac -cp "$BIBLE_TOOLS/lib/*" -d $BIBLE_TOOLS/bin $BIBLE_TOOLS/src/bible/readers/*.java $BIBLE_TOOLS/src/bible/*.java 135 | 136 | download_data bible.tar.gz "https://github.com/christos-c/bible-corpus/archive/v1.2.1.tar.gz" 137 | tar xvzf bible.tar.gz 138 | 139 | cp $ROOT/bible-corpus-1.2.1/bibles/{Greek.xml,English.xml,Nepali.xml} $XML_BIBLES/ 140 | cp $ROOT/bible-corpus-1.2.1/bibles/{Greek.xml,English-WEB.xml,Nepali.xml} $XML_BIBLES_DUP/ 141 | 142 | java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES 143 | java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES_DUP 144 | java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES 145 | java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES_DUP 146 | 147 | cat $XML_BIBLES/aligned/*/English.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$SRC 148 | cat $XML_BIBLES/aligned/*/Nepali.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$NE_TGT 149 | cat $XML_BIBLES_DUP/aligned/*/English-WEB.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$SRC 150 | cat $XML_BIBLES_DUP/aligned/*/Nepali.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$NE_TGT 151 | REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $XML_BIBLES_DUP ) 152 | 153 | 154 | # Download parallel en-hi corpus 155 | download_data $DATA/en-hi.tgz "http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download/parallel.tgz" 156 | #download_data $DATA/en-hi.tgz "https://www.cse.iitb.ac.in/~anoopk/share/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz" 157 | tar xvzf $DATA/en-hi.tgz 158 | cp parallel/* $HI_ROOT/ 159 | REMOVE_FILE_PATHS+=( parallel $DATA/en-hi.tgz ) 160 | 161 | 162 | # Download and extract the Penn Treebank dataset 163 | NE_TAGGED=$ROOT/new_submissions_parallel_corpus_project_Nepal 164 | NE_TAGGED_URL="http://www.cle.org.pk/Downloads/ling_resources/parallelcorpus/NepaliTaggedCorpus.zip" 165 | EN_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.en.patch" 166 | NE_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.ne.patch" 167 | MOSES=$ROOT/mosesdecoder 168 | MOSES_TOK=$MOSES/scripts/tokenizer 169 | EN_PATCH_REGEX="{s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}" 170 | NE_PATCH_REGEX="{s:\p{Cf}::g;s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}" 171 | 172 | download_data $DATA/nepali-penn-treebank.$SRC.patch $EN_TAGGED_PATCH_URL 173 | download_data $DATA/nepali-penn-treebank.$NE_TGT.patch $NE_TAGGED_PATCH_URL 174 | download_data original.zip $NE_TAGGED_URL 175 | unzip -o original.zip -d $ROOT 176 | 177 | cat $NE_TAGGED/00.txt $NE_TAGGED/01.txt $NE_TAGGED/02.txt > $NE_TAGGED/nepali-penn-treebank.$SRC 178 | cat $NE_TAGGED/00ne_revised.txt $NE_TAGGED/01ne_revised.txt $NE_TAGGED/02ne_revised.txt > $NE_TAGGED/nepali-penn-treebank.$NE_TGT 179 | 180 | patch $NE_TAGGED/nepali-penn-treebank.$SRC -i $DATA/nepali-penn-treebank.$SRC.patch -o 
$NE_TAGGED/nepali-penn-treebank-patched.$SRC 181 | patch $NE_TAGGED/nepali-penn-treebank.$NE_TGT -i $DATA/nepali-penn-treebank.$NE_TGT.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT 182 | 183 | if [ ! -e $MOSES ]; then 184 | echo "Cloning moses repository..." 185 | git clone https://github.com/moses-smt/mosesdecoder.git 186 | fi 187 | 188 | cat $NE_TAGGED/nepali-penn-treebank-patched.$SRC | \ 189 | perl -anpe "$EN_PATCH_REGEX" | \ 190 | $MOSES_TOK/tokenizer.perl -l $SRC | \ 191 | $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$SRC 192 | 193 | cat $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT | \ 194 | perl -CIO -anpe "$NE_PATCH_REGEX" | \ 195 | $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$NE_TGT 196 | 197 | 198 | # Download nepali dictionary data 199 | NE_DICT=$NE_ROOT/dictionaries 200 | download_data $NE_DICT "http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz" 201 | tar xvzf $NE_DICT 202 | cp dictionaries/dict.ne $NE_ROOT/dictionary.$NE_TGT-$SRC 203 | REMOVE_FILE_PATHS+=( $NE_DICT dictionaries ) 204 | 205 | 206 | # Download test sets 207 | download_data $DATA/wikipedia_en_ne_si_test_sets.tgz "https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz" 208 | REMOVE_FILE_PATHS+=( $MOSES $NE_TAGGED original.zip $DATA/nepali-penn-treebank.$SRC.patch $DATA/nepali-penn-treebank.$NE_TGT.patch ) 209 | 210 | pushd $DATA/ 211 | tar -vxf wikipedia_en_ne_si_test_sets.tgz 212 | popd 213 | 214 | 215 | # Remove the temporary files 216 | for ((i=0;i<${#REMOVE_FILE_PATHS[@]};++i)); do 217 | rm -rf ${REMOVE_FILE_PATHS[i]} 218 | done 219 | -------------------------------------------------------------------------------- /ocr/OCR_eval/OCR_eval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import glob 4 | import os 5 | from os import path 6 | from typing import Mapping, Any, Tuple, Sequence 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from tqdm.auto import tqdm 11 | 12 | from data_collection.augment_data import run_augmentation 13 | from data_collection.utils import create_dictionary_lang, return_all_anomalies, sentence_split 14 | from metrics import compute_metrics 15 | 16 | 17 | def read_ocr_codes_file() -> Tuple[Sequence[str], Sequence[str]]: 18 | df = pd.read_csv('Data/language_codes/languages_fonts_codes.csv') 19 | lang_codes = df["Code"].tolist() 20 | lang_tesseract_codes = df["Tesseract Code"].replace(np.nan, '').tolist() 21 | return lang_codes, lang_tesseract_codes 22 | 23 | 24 | def run_on_Flores(lang_code: str, lang_name: str, ocr_system: str): 25 | root_lang_name = os.path.join("Data", "FLORES", lang_name) 26 | os.makedirs(os.path.join(root_lang_name, ocr_system), exist_ok=True) 27 | 28 | dict_results = {lang_name: ['', '']} 29 | groundtruth_split_input = root_lang_name + "txt/" + lang_code + "_split.txt" 30 | predicted_split_output = root_lang_name + ocr_system + "/" + lang_code + "_sentsplit.txt" 31 | 32 | print(lang_name + ": " + lang_code) 33 | print("pred: " + predicted_split_output + "; gt: " + groundtruth_split_input) 34 | 35 | if not os.path.isfile(predicted_split_output): 36 | print("No path: " + predicted_split_output) 37 | return 38 | if ocr_system != "googlevision" and os.path.isfile(predicted_split_output): 39 | print("Removing existing file: " + predicted_split_output) 40 | os.remove(predicted_split_output) 41 | 42 | if ocr_system == "tesseract": 43 | print("Running 
Tesseract") 44 | lang_codes, lang_Tesseract_codes = read_ocr_codes_file() 45 | Tesseract_code = lang_Tesseract_codes[lang_codes.index(lang_code)] 46 | print(Tesseract_code) 47 | if not Tesseract_code: 48 | print("No Tesseract training data for " + lang_name) 49 | else: 50 | path_img = root_lang_name + "png/" + lang_code + "/" 51 | all_img_files = sorted([f for f in glob.glob(path_img + "*.png")]) 52 | 53 | for name_img in all_img_files: 54 | name_img = name_img.split("/")[-1][:-4] 55 | image_input = path_img + name_img + ".png" 56 | tesseract_output = root_lang_name + "tesseract/" + name_img + "_tesseract" 57 | os.system( 58 | "tesseract " + str(image_input) + " " + str(tesseract_output) + " -l " + Tesseract_code 59 | ) 60 | sentence_split(input_=tesseract_output, output=predicted_split_output, mode='a') 61 | 62 | elif ocr_system == "googlevision": 63 | print("Running " + ocr_system + ": results already on") 64 | else: 65 | raise ValueError("Wrong ocr_system name: " + ocr_system) 66 | 67 | CER, WER = compute_metrics( 68 | pred=predicted_split_output, tgt=groundtruth_split_input 69 | ) 70 | print("{:.2f}".format(CER) + "," + "{:.2f}".format(WER)) 71 | print(" -------------------- ") 72 | dict_results[lang_name] = ["{:.2f}".format(CER), "{:.2f}".format(WER)] 73 | return dict_results 74 | 75 | 76 | def average_error_rates(lang_name, list_error_rates, dict_results): 77 | CERs = [CER for [_, CER, _] in list_error_rates] 78 | WERs = [WER for [_, _, WER] in list_error_rates] 79 | mean_CER = np.mean(np.array(CERs), axis=0) 80 | mean_WER = np.mean(np.array(WERs), axis=0) 81 | dict_results[lang_name] = ["{:.2f}".format(mean_CER), "{:.2f}".format(mean_WER)] 82 | print("mean CER: " + str(mean_CER) + "; mean WER: " + str(mean_WER)) 83 | return dict_results 84 | 85 | 86 | def run_on_udhr(lang_code: str, lang_name: str, ocr_system: str, dict_all_anomalies): 87 | root_lang_name = os.path.join("Data/UDHR", lang_name) 88 | os.makedirs(root_lang_name, exist_ok=True) 89 | os.makedirs(os.path.join(root_lang_name, ocr_system), exist_ok=True) 90 | dict_results = {lang_name: ['', '']} 91 | 92 | lang_codes, lang_Tesseract_codes = read_ocr_codes_file() 93 | Tesseract_code = lang_Tesseract_codes[lang_codes.index(lang_code)] 94 | if lang_code == 'ckb': 95 | Tesseract_code = 'tur' # FLORES has arabic script for Sorani-Kurdish 96 | print(Tesseract_code) 97 | 98 | list_anomaly_articles = dict_all_anomalies[lang_code] 99 | 100 | if not Tesseract_code: 101 | print("No Tesseract training data for " + lang_name) 102 | else: 103 | list_error_rates = [] 104 | for i in range(1, 31): 105 | if i in list_anomaly_articles: 106 | print("not processing anomaly: article " + str(i)) 107 | continue 108 | print("Article " + str(i)) 109 | name_img = lang_code + str(i) 110 | path_image_input = os.path.join("Data", "UDHR", "annotations", lang_code) 111 | if glob.glob(os.path.join(path_image_input, "*.png")): # input can be png or jpg 112 | image_input = os.path.join(path_image_input, f"{lang_code}_{i}.png") 113 | elif glob.glob(os.path.join(path_image_input, "*.PNG")): # input can be png or jpg 114 | image_input = os.path.join(path_image_input, f"{lang_code}_{i}.PNG") 115 | elif glob.glob(os.path.join(path_image_input, "*.jpg")): 116 | image_input = os.path.join(path_image_input, f"{lang_code}_{i}.jpg") 117 | elif glob.glob(os.path.join(path_image_input, "*.JPG")): 118 | image_input = os.path.join(path_image_input, f"{lang_code}_{i}.JPG") 119 | else: 120 | print("Extension different than png or jpg or img not there") 121 | break 122 | if 
not path.exists(image_input): 123 | print(image_input + " doesn't exist") 124 | continue 125 | 126 | groundtruth_input = os.path.join(root_lang_name, name_img + ".txt") 127 | groundtruth_split_input = os.path.join(root_lang_name, name_img + "_split.txt") 128 | predicted_split_output = os.path.join(root_lang_name, ocr_system, name_img + "_sentsplit.txt") 129 | print("pred: " + predicted_split_output + "; gt: " + groundtruth_split_input) 130 | 131 | # save GT text split into sentences 132 | with open(groundtruth_input, encoding="utf8") as file: 133 | input_txt = file.read().split("\n") 134 | with open(groundtruth_split_input, 'w', encoding="utf-8") as file: 135 | file.write(" ".join(input_txt)) 136 | 137 | if ocr_system == "tesseract": 138 | tesseract_output = os.path.join(root_lang_name, "tesseract", name_img + "_tesseract") 139 | if not os.path.isfile(predicted_split_output): 140 | os.system( 141 | "tesseract " + str(image_input) + " " + str(tesseract_output) + " -l " + Tesseract_code 142 | ) 143 | 144 | sentence_split(input_=tesseract_output, output=predicted_split_output, mode='w') 145 | 146 | CER, WER = compute_metrics(predicted_split_output, groundtruth_split_input) 147 | list_error_rates.append([i, CER, WER]) 148 | print("{:.2f}".format(CER), "{:.2f}".format(WER)) 149 | print(" -------------------- ") 150 | dict_results = average_error_rates(lang_name, list_error_rates, dict_results) 151 | return dict_results 152 | 153 | 154 | def run_ocr_eval(dataset: str, lang_code: str, lang_name: str, ocr_system: str, 155 | dict_all_anomalies) -> Mapping[str, Any]: 156 | if dataset == "FLORES": 157 | return run_on_Flores(lang_code, lang_name, ocr_system) 158 | elif dataset == "UDHR": 159 | return run_on_udhr(lang_code, lang_name, ocr_system, dict_all_anomalies) 160 | else: 161 | raise ValueError(f"Unknown dataset: {dataset}") 162 | 163 | 164 | def run_tess_on_books(lang_code: str = "nep") -> None: 165 | root_lang_name = os.path.join("Data", "crawls", lang_code) 166 | path_img = os.path.join(root_lang_name, "png") 167 | all_img_files = sorted([f for f in glob.iglob(path_img + "*.png")]) 168 | 169 | for path_img_in in all_img_files: 170 | path_tiff_out = os.path.join(root_lang_name, "tiff", 171 | os.path.splitext(os.path.basename(path_img_in))[0] + ".tiff") 172 | os.system("convert -density 300 " + path_img_in + " -quality 100 " + path_tiff_out) 173 | 174 | predicted_split_output = os.path.join(root_lang_name, lang_code + "_tess_sentsplit.txt") 175 | filenames = [] 176 | for name_img in all_img_files: 177 | name_img = os.path.splitext(os.path.basename(name_img))[0] 178 | tesseract_output = os.path.join(root_lang_name, "tesseract", name_img + "_tesseract") 179 | filenames.append(tesseract_output + ".txt") 180 | if tesseract_output + ".txt" not in glob.glob(os.path.join(root_lang_name, "tesseract", "*.txt")): 181 | image_input = os.path.join(path_img, name_img + ".png") 182 | os.system( 183 | "tesseract " + str(image_input) + " " + str(tesseract_output) + " -l " + lang_code 184 | ) 185 | 186 | count_lines = 0 187 | with open(predicted_split_output, 'w', encoding="utf-8") as outfile: 188 | for filename in filenames: 189 | with open(filename, encoding="utf-8") as infile: 190 | for line in infile: 191 | if line.strip() and len(line.split()) > 5: 192 | outfile.write(line) 193 | count_lines += 1 194 | 195 | print(count_lines) 196 | os.system( 197 | "head -10000 Data/crawls/" + lang_code + "/" + lang_code + "_tess_sentsplit.txt > Data/crawls/" + lang_code 198 | + "_tess_10k.txt") 199 | os.system( 200 | 
"head -20000 Data/crawls/" + lang_code + "/" + lang_code + "_tess_sentsplit.txt > Data/crawls/" + lang_code 201 | + "_tess_20k.txt") 202 | os.system( 203 | "head -30000 Data/crawls/" + lang_code + "/" + lang_code + "_tess_sentsplit.txt > Data/crawls/" + lang_code 204 | + "_tess_30k.txt") 205 | 206 | 207 | def parse_args() -> argparse.Namespace: 208 | parser = argparse.ArgumentParser() 209 | parser.add_argument("--dataset", choices=["FLORES", "UDHR"], default="UDHR") 210 | parser.add_argument("--ocr-system", choices=["tesseract", "googlevision"], default="tesseract") 211 | return parser.parse_args() 212 | 213 | 214 | def main() -> None: 215 | args = parse_args() 216 | 217 | lang_code_dict = create_dictionary_lang() 218 | dict_results = {} 219 | os.makedirs(os.path.join('Data', 'Results', args.ocr_system), exist_ok=True) 220 | 221 | run_tess_on_books() 222 | for lang_code, lang_name in tqdm(lang_code_dict.items(), total=len(lang_code_dict)): 223 | dict_results[lang_name] = [] 224 | if args.dataset == "FLORES": 225 | run_augmentation(lang_code) 226 | 227 | print("----------------") 228 | print(lang_name, lang_code) 229 | 230 | dict_results = run_ocr_eval(args.dataset, lang_code, lang_name, args.ocr_system, return_all_anomalies()) 231 | 232 | pd.DataFrame(dict_results).T.reset_index().to_csv( 233 | 'Data/Results/' + args.ocr_system + '/' + args.dataset + '.csv', mode='a', 234 | header=False, index=False) 235 | 236 | 237 | if __name__ == "__main__": 238 | main() 239 | -------------------------------------------------------------------------------- /ocr/data_collection/augment_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import glob 4 | import os 5 | import re 6 | 7 | import cv2 8 | import numpy as np 9 | import pandas as pd 10 | from skimage import transform as tf 11 | 12 | from utils import create_dictionary_lang 13 | 14 | CHROME_PATH = "/Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome" 15 | 16 | lang_code_dict = create_dictionary_lang() 17 | root_dir = "Data/FLORES/" 18 | 19 | # TODO: font weight? 
20 | dict_properties = { 21 | "color": ["black"], 22 | "opacity": ["1", "0.3"], 23 | "font_size": ["20px"], 24 | "letter_spacing": ["normal", "0.2em", "-0.2em"], 25 | "italic": [True, False], 26 | "bold": [True, False], 27 | "gauss": [True, False], 28 | "skew": [True, False] 29 | } 30 | 31 | 32 | def read_fonts_file(): 33 | df = pd.read_csv('Data/misc/languages_fonts_codes.csv') 34 | lang_codes = df["Code"].tolist() 35 | lang_fonts = df["Fonts"].tolist() 36 | return lang_codes, lang_fonts 37 | 38 | 39 | def create_style_file( 40 | font, color, opacity, font_size, letter_spacing, italic, bold, name_style 41 | ): 42 | str_font = "src: url(fonts/" + font + ".ttf);" 43 | full_style = ( 44 | """ 45 | @font-face { 46 | font-family: defined_font; 47 | """ 48 | + str_font 49 | + """ 50 | } 51 | p { 52 | font-family: defined_font; 53 | color: """ 54 | + color 55 | + ";" 56 | + """ 57 | opacity: """ 58 | + opacity 59 | + ";" 60 | + """ 61 | letter-spacing:""" 62 | + letter_spacing 63 | + ";" 64 | + """ 65 | font-size:""" 66 | + font_size 67 | + ";" 68 | ) 69 | if italic: 70 | full_style += """ 71 | font-style: italic;""" 72 | if bold: 73 | full_style += """ 74 | font-weight: bold;""" 75 | 76 | full_style += """ 77 | } 78 | """ 79 | with open("Data/augmentation/styles/" + name_style + ".css", "w+") as f: 80 | f.write(full_style) 81 | 82 | 83 | def create_html_file(root_path, list_sentences, name_html_file, name_style): 84 | str_style = ( 85 | f"""""" 86 | ) 87 | str_html_head = ( 88 | """ 89 | 90 | 91 | 92 | 93 | 94 | 96 | """ 97 | + str_style 98 | + """ 99 | 100 | 101 | """ 102 | ) 103 | # put all text into one paragraph 104 | str_html_text = "
<p>" + "".join(list_sentences) + "</p>
" 105 | str_html_head_close = """ 106 | 107 | 108 | """ 109 | full_text = str_html_head + str_html_text + str_html_head_close 110 | with open(os.path.join(root_path, name_html_file + ".html"), "w") as f: 111 | f.write(full_text) 112 | 113 | 114 | def save_html_to_pdf(root_save_pdfs: str, root_html_url: str, name_html_file: str) -> None: 115 | os.system( 116 | CHROME_PATH 117 | + " --headless --print-to-pdf-no-header --print-to-pdf=" 118 | + root_save_pdfs 119 | + name_html_file 120 | + ".pdf " 121 | + root_html_url 122 | + name_html_file 123 | + ".html" 124 | ) 125 | 126 | 127 | def save_pdf_to_png(lang_name, name_file, name_html_file, root_path: str = "Data/FLORES/"): 128 | # if entire pdf -> need to first split into pages 129 | root_save_pdfs = "Data/augmentation/pdfs/" 130 | os.makedirs(root_path + lang_name + "/png/" + name_file, exist_ok=True) 131 | path_png_out = root_path + lang_name + "/png/" + name_file + "/" + name_html_file 132 | path_pdf_in = root_save_pdfs + name_html_file + ".pdf" 133 | print("Saving pdf to png for " + lang_name) 134 | os.system("convert -density 300 -trim " + path_pdf_in + " -quality 100 " + path_png_out + "%02d.png") 135 | 136 | 137 | def add_gaussian_noise(lang_name, name_file, name_html_file, root_path: str = "Data/FLORES/"): 138 | img = cv2.imread(root_path + lang_name + "/png/" + name_file + "/" + name_html_file + ".png") 139 | # Generate Gaussian noise 140 | gauss = np.random.normal(0, 1, img.size) 141 | gauss = gauss.reshape(img.shape[0], img.shape[1], img.shape[2]).astype("uint8") 142 | # Add the Gaussian noise to the image 143 | img_gauss = cv2.add(img, gauss) 144 | 145 | img_gauss = cv2.cvtColor(img_gauss, cv2.COLOR_BGR2GRAY) 146 | cv2.imwrite( 147 | root_path 148 | + lang_name 149 | + "/png/" 150 | + name_file 151 | + "/" 152 | + name_html_file 153 | + "_gauss" 154 | + ".png", 155 | img_gauss, 156 | ) 157 | 158 | 159 | def add_salt_pepper_noise(lang_name, name_file, name_html_file, amount, s_vs_p, root_path: str = "Data/FLORES/"): 160 | image = cv2.imread( 161 | root_path + lang_name + "/png/" + name_file + "/" + name_html_file + ".png" 162 | ) 163 | img_noise = np.copy(image) 164 | # Salt mode 165 | num_salt = np.ceil(amount * image.size * s_vs_p) 166 | coords = [np.random.randint(0, i - 1, int(num_salt)) for i in image.shape] 167 | img_noise[coords] = 1 168 | 169 | # Pepper mode 170 | num_pepper = np.ceil(amount * image.size * (1. 
- s_vs_p)) 171 | coords = [np.random.randint(0, i - 1, int(num_pepper)) for i in image.shape] 172 | img_noise[coords] = 0 173 | img_noise = cv2.cvtColor(img_noise, cv2.COLOR_BGR2GRAY) 174 | cv2.imwrite( 175 | root_path 176 | + lang_name 177 | + "/png/" 178 | + name_file 179 | + "/" 180 | + name_html_file 181 | + "_salt_pepper" 182 | + ".png", 183 | img_noise, 184 | ) 185 | 186 | 187 | def add_speckle_noise(lang_name, name_file, name_html_file, root_path: str = "Data/FLORES/"): 188 | img = cv2.imread( 189 | root_path + lang_name + "/png/" + name_file + "/" + name_html_file + ".png" 190 | ) 191 | gauss = np.random.normal(0, 1, img.size) 192 | gauss = gauss.reshape(img.shape[0], img.shape[1], img.shape[2]).astype('uint8') 193 | img_noise = img + img * gauss 194 | img_noise = cv2.cvtColor(img_noise, cv2.COLOR_BGR2GRAY) 195 | 196 | cv2.imwrite( 197 | root_path 198 | + lang_name 199 | + "/png/" 200 | + name_file 201 | + "/" 202 | + name_html_file 203 | + "_gauss" 204 | + ".png", 205 | img_noise, 206 | ) 207 | 208 | 209 | def add_skew(lang_name, name_file, name_html_file, root_path: str = "Data/FLORES/"): 210 | img = cv2.imread( 211 | root_path + lang_name + "/png/" + name_file + "/" + name_html_file + ".png" 212 | ) 213 | # Create Affine transform 214 | affine_tf = tf.AffineTransform(shear=0.1) 215 | # Apply transform to image data 216 | img_skew = tf.warp(img, inverse_map=affine_tf) * 255 217 | cv2.imwrite( 218 | root_path 219 | + lang_name 220 | + "/png/" 221 | + name_file 222 | + "/" 223 | + name_html_file 224 | + "_skew" 225 | + ".png", 226 | img_skew, 227 | ) 228 | 229 | 230 | def run_augmentation_udhr(lang_code): 231 | lang_name = lang_code_dict[lang_code] 232 | 233 | lang_codes, lang_fonts = read_fonts_file() 234 | index_lang_code = lang_codes.index(lang_code) 235 | 236 | fonts = lang_fonts[index_lang_code].split("; ") 237 | 238 | color = dict_properties["color"][0] 239 | opacity = dict_properties["opacity"][0] 240 | letter_spacing = dict_properties["letter_spacing"][0] 241 | italic = dict_properties["italic"][1] 242 | bold = dict_properties["bold"][1] 243 | 244 | font = fonts[0] 245 | root = os.path.join('Data/UDHR/', lang_name) 246 | root_path = os.path.join('Data/UDHR/annotations/', lang_code, "pdfs_synth") 247 | os.makedirs(root_path, exist_ok=True) 248 | txt_files = glob.glob(root + '/*[0-9].txt') 249 | name_style = font + "_" + color.replace("#", "") + "_" + opacity + "_" + letter_spacing 250 | if italic: 251 | name_style += "_" + "italic" 252 | if bold: 253 | name_style += "_" + "bold" 254 | 255 | for txt_file in txt_files: 256 | with open(txt_file, encoding="utf8") as file: 257 | list_sentences = file.read() 258 | name_file = txt_file.split("/")[-1][:-4] 259 | nb = str(re.findall(r'\d+', name_file)[0]) 260 | name_html_file = f"{lang_code}_{nb}" 261 | create_html_file(root_path, list_sentences, name_html_file, name_style) 262 | 263 | root_save_pdfs = root_path 264 | root_html_url = root_path 265 | save_html_to_pdf(root_save_pdfs, root_html_url, name_html_file) 266 | print("Saving pdf to png for " + name_html_file) 267 | path_png_out = os.path.join(root_path, name_html_file + ".png") 268 | path_pdf_in = os.path.join(root_path, name_html_file + ".pdf") 269 | os.system("convert -density 300 -trim " + path_pdf_in + " -quality 100 " + path_png_out) 270 | 271 | 272 | def run_augmentation(lang_code): 273 | lang_name = lang_code_dict[lang_code] 274 | root_lang_name = root_dir + lang_name + "/" 275 | 276 | lang_codes, lang_fonts = read_fonts_file() 277 | index_lang_code = 
lang_codes.index(lang_code) 278 | 279 | fonts = lang_fonts[index_lang_code].split("; ") 280 | 281 | color = dict_properties["color"][0] 282 | opacity = dict_properties["opacity"][0] 283 | letter_spacing = dict_properties["letter_spacing"][0] 284 | italic = dict_properties["italic"][1] 285 | bold = dict_properties["bold"][1] 286 | gauss = dict_properties["gauss"][1] 287 | skew = dict_properties["skew"][1] 288 | 289 | font = fonts[0] 290 | name_file = lang_code 291 | with open(os.path.join(root_lang_name, name_file + ".txt"), encoding="utf-8") as file: 292 | list_sentences = file.readlines() 293 | 294 | name_style = font + "_" + color.replace("#", "") + "_" + opacity + "_" + letter_spacing 295 | if italic: 296 | name_style += "_" + "italic" 297 | if bold: 298 | name_style += "_" + "bold" 299 | name_html_file = name_file + "_" + name_style 300 | 301 | root_path = "Data/augmentation/htmls" 302 | create_html_file(root_path, list_sentences, name_html_file, name_style) 303 | 304 | root_save_pdfs = "Data/augmentation/pdfs" 305 | root_html_url = f"file://{os.path.abspath('Data/augmentation/htmls')}" 306 | save_html_to_pdf(root_save_pdfs, root_html_url, name_html_file) 307 | save_pdf_to_png(lang_name, name_file, name_html_file) 308 | if gauss: 309 | add_salt_pepper_noise(lang_name, name_file, name_html_file, amount=0.005, s_vs_p=0.5) 310 | if skew: 311 | add_skew(lang_name, name_file, name_html_file) 312 | 313 | 314 | def parse_args() -> argparse.Namespace: 315 | parser = argparse.ArgumentParser() 316 | parser.add_argument("--dataset", choices=["FLORES", "UDHR"], default="FLORES") 317 | return parser.parse_args() 318 | 319 | 320 | def main() -> None: 321 | args = parse_args() 322 | 323 | for lang_code in lang_code_dict: 324 | if args.dataset == "FLORES": 325 | run_augmentation(lang_code) 326 | elif args.dataset == "UDHR": 327 | run_augmentation_udhr(lang_code) 328 | else: 329 | raise ValueError(f"Unknown dataset: {args.dataset}") 330 | 331 | 332 | if __name__ == "__main__": 333 | main() 334 | -------------------------------------------------------------------------------- /shared_tasks/dynalab/handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
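#
# Dynalab / TorchServe handler for the flores_small1 translation task: handle() below
# deserializes JSON request bodies into samples, batches them, runs the fairseq model
# loaded in Handler.initialize(), and returns responses signed via TaskIO for Dynabench.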
2 | 3 | import json 4 | import logging 5 | import time 6 | import os 7 | from pathlib import Path 8 | 9 | import fairseq.checkpoint_utils 10 | import sentencepiece 11 | import torch 12 | from typing import NamedTuple 13 | from dynalab.handler.base_handler import BaseDynaHandler 14 | from dynalab.tasks.flores_small1 import TaskIO 15 | from fairseq.sequence_generator import SequenceGenerator 16 | from fairseq.tasks.translation import TranslationConfig, TranslationTask 17 | from fairseq.data import data_utils 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(logging.INFO) 21 | 22 | # Tell Torchserve to let use do the deserialization 23 | os.environ["TS_DECODE_INPUT_REQUEST"] = "false" 24 | 25 | 26 | def mapping(languages: str) -> dict: 27 | return dict( 28 | tuple(pair.split(":")) 29 | for pair in languages.strip().replace("\n", "").split(",") 30 | ) 31 | 32 | 33 | ISO2M100 = mapping( 34 | """ 35 | afr:af,amh:am,ara:ar,asm:as,ast:ast,azj:az,bel:be,ben:bn,bos:bs,bul:bg, 36 | cat:ca,ceb:ceb,ces:cs,ckb:ku,cym:cy,dan:da,deu:de,ell:el,eng:en,est:et, 37 | fas:fa,fin:fi,fra:fr,ful:ff,gle:ga,glg:gl,guj:gu,hau:ha,heb:he,hin:hi, 38 | hrv:hr,hun:hu,hye:hy,ibo:ig,ind:id,isl:is,ita:it,jav:jv,jpn:ja,kam:kam, 39 | kan:kn,kat:ka,kaz:kk,kea:kea,khm:km,kir:ky,kor:ko,lao:lo,lav:lv,lin:ln, 40 | lit:lt,ltz:lb,lug:lg,luo:luo,mal:ml,mar:mr,mkd:mk,mlt:mt,mon:mn,mri:mi, 41 | msa:ms,mya:my,nld:nl,nob:no,npi:ne,nso:ns,nya:ny,oci:oc,orm:om,ory:or, 42 | pan:pa,pol:pl,por:pt,pus:ps,ron:ro,rus:ru,slk:sk,slv:sl,sna:sn,snd:sd, 43 | som:so,spa:es,srp:sr,swe:sv,swh:sw,tam:ta,tel:te,tgk:tg,tgl:tl,tha:th, 44 | tur:tr,ukr:uk,umb:umb,urd:ur,uzb:uz,vie:vi,wol:wo,xho:xh,yor:yo,zho_simp:zh, 45 | zho_trad:zh,zul:zu 46 | """ 47 | ) 48 | 49 | 50 | class FakeGenerator: 51 | """Fake sequence generator, that returns the input.""" 52 | 53 | def generate(self, models, sample, prefix_tokens=None): 54 | src_tokens = sample["net_input"]["src_tokens"] 55 | return [[{"tokens": tokens[:-1]}] for tokens in src_tokens] 56 | 57 | 58 | class Handler(BaseDynaHandler): 59 | """Use Fairseq model for translation. 60 | To use this handler, download one of the Flores pretrained model: 61 | 62 | 615M parameters: 63 | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz 64 | 175M parameters: 65 | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz 66 | 67 | and extract the files next to this one. 68 | Notably there should be a "dict.txt" and a "sentencepiece.bpe.model". 69 | """ 70 | 71 | def initialize(self, context): 72 | """ 73 | load model and extra files. 
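        Reads generation settings from model_generation.json, loads dict.txt and
        sentencepiece.bpe.model from the model directory, then restores the fairseq
        checkpoint (or uses FakeGenerator when the config sets "dummy": true).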
74 | """ 75 | logger.info( 76 | f"Will initialize with system_properties: {context.system_properties}" 77 | ) 78 | model_pt_path, model_file_dir, device = self._handler_initialize(context) 79 | config = json.loads( 80 | (Path(model_file_dir) / "model_generation.json").read_text() 81 | ) 82 | self.device = device 83 | 84 | translation_cfg = TranslationConfig() 85 | self.vocab = TranslationTask.load_dictionary("dict.txt") 86 | 87 | self.spm = sentencepiece.SentencePieceProcessor() 88 | self.spm.Load("sentencepiece.bpe.model") 89 | logger.info("Loaded sentencepiece.bpe.model") 90 | 91 | if config.get("dummy", False): 92 | self.sequence_generator = FakeGenerator() 93 | logger.warning("Will use a FakeGenerator model, only testing BPE") 94 | else: 95 | task = TranslationTask(translation_cfg, self.vocab, self.vocab) 96 | [model], cfg = fairseq.checkpoint_utils.load_model_ensemble( 97 | [model_pt_path], task=task 98 | ) 99 | model.eval().to(self.device) 100 | logger.info(f"Loaded model from {model_pt_path} to device {self.device}") 101 | logger.info( 102 | f"Will use the following config: {json.dumps(config, indent=4)}" 103 | ) 104 | self.sequence_generator = SequenceGenerator( 105 | [model], 106 | tgt_dict=self.vocab, 107 | beam_size=config.get("beam_size", 1), 108 | max_len_a=config.get("max_len_a", 1.3), 109 | max_len_b=config.get("max_len_b", 5), 110 | min_len=config.get("min_len", 5), 111 | ) 112 | 113 | self.taskIO = TaskIO() 114 | self.initialized = True 115 | 116 | def lang_token(self, lang: str) -> int: 117 | """Converts the ISO 639-3 language code to MM100 language codes.""" 118 | simple_lang = ISO2M100[lang] 119 | token = self.vocab.index(f"__{simple_lang}__") 120 | assert token != self.vocab.unk(), f"Unknown language '{lang}' ({simple_lang})" 121 | return token 122 | 123 | def tokenize(self, line: str) -> list: 124 | words = self.spm.EncodeAsPieces(line.strip()) 125 | tokens = [self.vocab.index(word) for word in words] 126 | return tokens 127 | 128 | def preprocess_one(self, sample) -> dict: 129 | """ 130 | preprocess data into a format that the model can do inference on 131 | """ 132 | # TODO: this doesn't seem to produce good results. wrong EOS / BOS ? 
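        # Hedged note (not from the original authors): fairseq multilingual checkpoints differ in
        # where the language token sits relative to BOS/EOS depending on how the task was trained
        # (encoder_langtok / decoder_langtok); comparing against fairseq's own generation recipe
        # for this checkpoint would be one way to resolve the TODO above.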
133 | tokens = self.tokenize(sample["sourceText"]) 134 | src_token = self.lang_token(sample["sourceLanguage"]) 135 | tgt_token = self.lang_token(sample["targetLanguage"]) 136 | return { 137 | "src_tokens": [src_token] + tokens + [self.vocab.eos()], 138 | "src_length": len(tokens) + 1, 139 | "tgt_token": tgt_token, 140 | } 141 | return sample 142 | 143 | def preprocess(self, samples) -> dict: 144 | samples = [self.preprocess_one(s) for s in samples] 145 | prefix_tokens = torch.tensor([[s["tgt_token"]] for s in samples]) 146 | src_lengths = torch.tensor([s["src_length"] for s in samples]) 147 | src_tokens = data_utils.collate_tokens( 148 | [torch.tensor(s["src_tokens"]) for s in samples], 149 | self.vocab.pad(), 150 | self.vocab.eos(), 151 | ) 152 | return { 153 | "nsentences": len(samples), 154 | "ntokens": src_lengths.sum().item(), 155 | "net_input": { 156 | "src_tokens": src_tokens.to(self.device), 157 | "src_lengths": src_lengths.to(self.device), 158 | }, 159 | "prefix_tokens": prefix_tokens.to(self.device), 160 | } 161 | 162 | def strip_pad(self, sentence): 163 | assert sentence.ndim == 1 164 | return sentence[sentence.ne(self.vocab.pad())] 165 | 166 | @torch.no_grad() 167 | def inference(self, input_data: dict) -> list: 168 | generated = self.sequence_generator.generate( 169 | models=[], 170 | sample=input_data, 171 | prefix_tokens=input_data["prefix_tokens"], 172 | ) 173 | # `generate` returns a list of samples 174 | # with several hypothesis per sample 175 | # and a dict per hypothesis. 176 | # We also need to strip the language token. 177 | return [hypos[0]["tokens"][1:] for hypos in generated] 178 | 179 | def postprocess(self, inference_output, samples: list) -> list: 180 | """ 181 | post process inference output into a response. 182 | response should be a list of json 183 | the response format will need to pass the validation in 184 | ``` 185 | dynalab.tasks.flores_small1.TaskIO().verify_response(response) 186 | ``` 187 | """ 188 | translations = [ 189 | self.vocab.string(self.strip_pad(sentence), "sentencepiece") 190 | for sentence in inference_output 191 | ] 192 | return [ 193 | # Signing required by dynabench, don't remove. 194 | self.taskIO.sign_response( 195 | {"id": sample["uid"], "translatedText": translation}, 196 | sample, 197 | ) 198 | for translation, sample in zip(translations, samples) 199 | ] 200 | 201 | 202 | _service = Handler() 203 | 204 | 205 | def deserialize(torchserve_data: list) -> list: 206 | samples = [] 207 | for torchserve_sample in torchserve_data: 208 | data = torchserve_sample["body"] 209 | # In case torchserve did the deserialization for us. 
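        # With TS_DECODE_INPUT_REQUEST set to "false" above, the body is normally raw bytes
        # carrying one JSON-encoded sample per line, which the elif branch below decodes.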
210 | if isinstance(data, dict): 211 | samples.append(data) 212 | elif isinstance(data, (bytes, bytearray)): 213 | lines = data.decode("utf-8").splitlines() 214 | for i, l in enumerate(lines): 215 | try: 216 | samples.append(json.loads(l)) 217 | except Exception as e: 218 | logging.error(f"Couldn't deserialize line {i}: {l}") 219 | logging.exception(e) 220 | else: 221 | logging.error(f"Unexpected payload: {data}") 222 | 223 | return samples 224 | 225 | 226 | def handle_mini_batch(service, samples): 227 | n = len(samples) 228 | start_time = time.time() 229 | input_data = service.preprocess(samples) 230 | logger.info( 231 | f"Preprocessed a batch of size {n} ({n/(time.time()-start_time):.2f} samples / s)" 232 | ) 233 | 234 | start_time = time.time() 235 | output = service.inference(input_data) 236 | logger.info( 237 | f"Infered a batch of size {n} ({n/(time.time()-start_time):.2f} samples / s)" 238 | ) 239 | 240 | start_time = time.time() 241 | json_results = service.postprocess(output, samples) 242 | logger.info( 243 | f"Postprocessed a batch of size {n} ({n/(time.time()-start_time):.2f} samples / s)" 244 | ) 245 | return json_results 246 | 247 | 248 | def handle(torchserve_data, context): 249 | if not _service.initialized: 250 | _service.initialize(context) 251 | if torchserve_data is None: 252 | return None 253 | 254 | start_time = time.time() 255 | all_samples = deserialize(torchserve_data) 256 | n = len(all_samples) 257 | logger.info( 258 | f"Deserialized a batch of size {n} ({n/(time.time()-start_time):.2f} samples / s)" 259 | ) 260 | # Adapt this to your model. The GPU has 16Gb of RAM. 261 | batch_size = 128 262 | results = [] 263 | samples = [] 264 | for i, sample in enumerate(all_samples): 265 | samples.append(sample) 266 | if len(samples) < batch_size and i + 1 < n: 267 | continue 268 | 269 | results.extend(handle_mini_batch(_service, samples)) 270 | samples = [] 271 | 272 | assert len(results) 273 | start_time = time.time() 274 | response = "\n".join(json.dumps(r, indent=None, ensure_ascii=False) for r in results) 275 | logger.info( 276 | f"Serialized a batch of size {n} ({n/(time.time()-start_time):.2f} samples / s)" 277 | ) 278 | return [response] 279 | 280 | 281 | def local_test(): 282 | from dynalab.tasks import flores_small1 283 | 284 | bin_data = b"\n".join(json.dumps(d).encode("utf-8") for d in flores_small1.data) 285 | torchserve_data = [{"body": bin_data}] 286 | 287 | manifest = {"model": {"serializedFile": "model.pt"}} 288 | system_properties = {"model_dir": ".", "gpu_id": None} 289 | 290 | class Context(NamedTuple): 291 | system_properties: dict 292 | manifest: dict 293 | 294 | ctx = Context(system_properties, manifest) 295 | batch_responses = handle(torchserve_data, ctx) 296 | print(batch_responses) 297 | 298 | single_responses = [ 299 | handle([{"body": json.dumps(d).encode("utf-8")}], ctx)[0] 300 | for d in flores_small1.data 301 | ] 302 | assert batch_responses == ["\n".join(single_responses)] 303 | 304 | 305 | if __name__ == "__main__": 306 | local_test() 307 | -------------------------------------------------------------------------------- /flores200/README.md: -------------------------------------------------------------------------------- 1 |

2 | <!-- FLORES-200 banner image -->

4 | 5 | -------------------------------------------------------------------------------- 6 | 7 | # The FLORES-200 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation 8 | 9 | The creation of FLORES-200 doubles the existing language coverage of FLORES-101. Given the nature of the new languages, which have less standardization and require more specialized professional translations, the verification process became more complex. This required modifications to the translation workflow. FLORES-200 has several languages which were not translated from English. Specifically, several languages were translated from Spanish, French, Russian and Modern Standard Arabic. Moreover, FLORES-200 also includes two script alternatives for four languages. 10 | 11 | -------------------------------------------------------------------------------- 12 | 13 | ## Composition 14 | FLORES-200 consists of translations from 842 distinct web articles, totaling 15 | 3001 sentences. These sentences are divided into three splits: dev, devtest, and test (hidden). On average, sentences are approximately 21 words long. 16 | 17 | ## Download 18 | 19 | ⚠️ This repository is no longer being updated ⚠️ 20 | 21 | **For newer versions of this dataset**, see and . 22 | 23 | The original version of the dataset can still be downloaded [here](https://tinyurl.com/flores200dataset) and is also available on HuggingFace [here](https://huggingface.co/datasets/facebook/flores). 24 | 25 | ## SPM and Dictionary 26 | 27 | * Dictionary Download [here](https://tinyurl.com/nllb200dictionary) 28 | * SPM Model Download [here](https://tinyurl.com/flores200sacrebleuspm) 29 | 30 | ### Example SentencePiece Usage 31 | Note: Install SentencePiece from [here](https://github.com/google/sentencepiece) 32 | 33 | ```bash 34 | flores_dataset=/path/to/flores_dataset 35 | fairseq=/path/to/fairseq 36 | cd $fairseq 37 | 38 | python scripts/spm_encode.py \ 39 | --model flores_spm_model_here \ 40 | --output_format=piece \ 41 | --inputs=data_input_path_here \ 42 | --outputs=data_output_path_here 43 | ``` 44 | 45 | ## Evaluation 46 | 47 | We primarily evaluate with chrf++: 48 | 49 | ```bash 50 | sacrebleu -m chrf --chrf-word-order 2 {ref_file} < {hyp_file} 51 | ``` 52 | 53 | and also evaluate with spBLEU: 54 | 55 | ```bash 56 | # tokenize with SPM 57 | python scripts/spm_encode.py \ 58 | --model flores_spm_model_here \ 59 | --output_format=piece \ 60 | --inputs={untok_hyp_file} \ 61 | --outputs={hyp_file} 62 | 63 | # calculate with sacrebleu 64 | cat {hyp_file} | sacrebleu {ref_file} 65 | ``` 66 | 67 | ## Languages in FLORES-200 68 | 69 | Language | FLORES-200 code 70 | ---|--- 71 | Acehnese (Arabic script) | ace_Arab 72 | Acehnese (Latin script) | ace_Latn 73 | Mesopotamian Arabic | acm_Arab 74 | Ta’izzi-Adeni Arabic | acq_Arab 75 | Tunisian Arabic | aeb_Arab 76 | Afrikaans | afr_Latn 77 | South Levantine Arabic | ajp_Arab 78 | Akan | aka_Latn 79 | Amharic | amh_Ethi 80 | North Levantine Arabic | apc_Arab 81 | Modern Standard Arabic | arb_Arab 82 | Modern Standard Arabic (Romanized) | arb_Latn 83 | Najdi Arabic | ars_Arab 84 | Moroccan Arabic | ary_Arab 85 | Egyptian Arabic | arz_Arab 86 | Assamese | asm_Beng 87 | Asturian | ast_Latn 88 | Awadhi | awa_Deva 89 | Central Aymara | ayr_Latn 90 | South Azerbaijani | azb_Arab 91 | North Azerbaijani | azj_Latn 92 | Bashkir | bak_Cyrl 93 | Bambara | bam_Latn 94 | Balinese | ban_Latn 95 | Belarusian | bel_Cyrl 96 | Bemba | bem_Latn 97 | Bengali | ben_Beng 98 | Bhojpuri | bho_Deva 99 | Banjar (Arabic 
script) | bjn_Arab 100 | Banjar (Latin script) | bjn_Latn 101 | Standard Tibetan | bod_Tibt 102 | Bosnian | bos_Latn 103 | Buginese | bug_Latn 104 | Bulgarian | bul_Cyrl 105 | Catalan | cat_Latn 106 | Cebuano | ceb_Latn 107 | Czech | ces_Latn 108 | Chokwe | cjk_Latn 109 | Central Kurdish | ckb_Arab 110 | Crimean Tatar | crh_Latn 111 | Welsh | cym_Latn 112 | Danish | dan_Latn 113 | German | deu_Latn 114 | Southwestern Dinka | dik_Latn 115 | Dyula | dyu_Latn 116 | Dzongkha | dzo_Tibt 117 | Greek | ell_Grek 118 | English | eng_Latn 119 | Esperanto | epo_Latn 120 | Estonian | est_Latn 121 | Basque | eus_Latn 122 | Ewe | ewe_Latn 123 | Faroese | fao_Latn 124 | Fijian | fij_Latn 125 | Finnish | fin_Latn 126 | Fon | fon_Latn 127 | French | fra_Latn 128 | Friulian | fur_Latn 129 | Nigerian Fulfulde | fuv_Latn 130 | Scottish Gaelic | gla_Latn 131 | Irish | gle_Latn 132 | Galician | glg_Latn 133 | Guarani | grn_Latn 134 | Gujarati | guj_Gujr 135 | Haitian Creole | hat_Latn 136 | Hausa | hau_Latn 137 | Hebrew | heb_Hebr 138 | Hindi | hin_Deva 139 | Chhattisgarhi | hne_Deva 140 | Croatian | hrv_Latn 141 | Hungarian | hun_Latn 142 | Armenian | hye_Armn 143 | Igbo | ibo_Latn 144 | Ilocano | ilo_Latn 145 | Indonesian | ind_Latn 146 | Icelandic | isl_Latn 147 | Italian | ita_Latn 148 | Javanese | jav_Latn 149 | Japanese | jpn_Jpan 150 | Kabyle | kab_Latn 151 | Jingpho | kac_Latn 152 | Kamba | kam_Latn 153 | Kannada | kan_Knda 154 | Kashmiri (Arabic script) | kas_Arab 155 | Kashmiri (Devanagari script) | kas_Deva 156 | Georgian | kat_Geor 157 | Central Kanuri (Arabic script) | knc_Arab 158 | Central Kanuri (Latin script) | knc_Latn 159 | Kazakh | kaz_Cyrl 160 | Kabiyè | kbp_Latn 161 | Kabuverdianu | kea_Latn 162 | Khmer | khm_Khmr 163 | Kikuyu | kik_Latn 164 | Kinyarwanda | kin_Latn 165 | Kyrgyz | kir_Cyrl 166 | Kimbundu | kmb_Latn 167 | Northern Kurdish | kmr_Latn 168 | Kikongo | kon_Latn 169 | Korean | kor_Hang 170 | Lao | lao_Laoo 171 | Ligurian | lij_Latn 172 | Limburgish | lim_Latn 173 | Lingala | lin_Latn 174 | Lithuanian | lit_Latn 175 | Lombard | lmo_Latn 176 | Latgalian | ltg_Latn 177 | Luxembourgish | ltz_Latn 178 | Luba-Kasai | lua_Latn 179 | Ganda | lug_Latn 180 | Luo | luo_Latn 181 | Mizo | lus_Latn 182 | Standard Latvian | lvs_Latn 183 | Magahi | mag_Deva 184 | Maithili | mai_Deva 185 | Malayalam | mal_Mlym 186 | Marathi | mar_Deva 187 | Minangkabau (Arabic script) | min_Arab 188 | Minangkabau (Latin script) | min_Latn 189 | Macedonian | mkd_Cyrl 190 | Plateau Malagasy | plt_Latn 191 | Maltese | mlt_Latn 192 | Meitei (Bengali script) | mni_Beng 193 | Halh Mongolian | khk_Cyrl 194 | Mossi | mos_Latn 195 | Maori | mri_Latn 196 | Burmese | mya_Mymr 197 | Dutch | nld_Latn 198 | Norwegian Nynorsk | nno_Latn 199 | Norwegian Bokmål | nob_Latn 200 | Nepali | npi_Deva 201 | Northern Sotho | nso_Latn 202 | Nuer | nus_Latn 203 | Nyanja | nya_Latn 204 | Occitan | oci_Latn 205 | West Central Oromo | gaz_Latn 206 | Odia | ory_Orya 207 | Pangasinan | pag_Latn 208 | Eastern Panjabi | pan_Guru 209 | Papiamento | pap_Latn 210 | Western Persian | pes_Arab 211 | Polish | pol_Latn 212 | Portuguese | por_Latn 213 | Dari | prs_Arab 214 | Southern Pashto | pbt_Arab 215 | Ayacucho Quechua | quy_Latn 216 | Romanian | ron_Latn 217 | Rundi | run_Latn 218 | Russian | rus_Cyrl 219 | Sango | sag_Latn 220 | Sanskrit | san_Deva 221 | Santali | sat_Olck 222 | Sicilian | scn_Latn 223 | Shan | shn_Mymr 224 | Sinhala | sin_Sinh 225 | Slovak | slk_Latn 226 | Slovenian | slv_Latn 227 | Samoan | smo_Latn 228 | Shona | sna_Latn 229 
| Sindhi | snd_Arab 230 | Somali | som_Latn 231 | Southern Sotho | sot_Latn 232 | Spanish | spa_Latn 233 | Tosk Albanian | als_Latn 234 | Sardinian | srd_Latn 235 | Serbian | srp_Cyrl 236 | Swati | ssw_Latn 237 | Sundanese | sun_Latn 238 | Swedish | swe_Latn 239 | Swahili | swh_Latn 240 | Silesian | szl_Latn 241 | Tamil | tam_Taml 242 | Tatar | tat_Cyrl 243 | Telugu | tel_Telu 244 | Tajik | tgk_Cyrl 245 | Tagalog | tgl_Latn 246 | Thai | tha_Thai 247 | Tigrinya | tir_Ethi 248 | Tamasheq (Latin script) | taq_Latn 249 | Tamasheq (Tifinagh script) | taq_Tfng 250 | Tok Pisin | tpi_Latn 251 | Tswana | tsn_Latn 252 | Tsonga | tso_Latn 253 | Turkmen | tuk_Latn 254 | Tumbuka | tum_Latn 255 | Turkish | tur_Latn 256 | Twi | twi_Latn 257 | Central Atlas Tamazight | tzm_Tfng 258 | Uyghur | uig_Arab 259 | Ukrainian | ukr_Cyrl 260 | Umbundu | umb_Latn 261 | Urdu | urd_Arab 262 | Northern Uzbek | uzn_Latn 263 | Venetian | vec_Latn 264 | Vietnamese | vie_Latn 265 | Waray | war_Latn 266 | Wolof | wol_Latn 267 | Xhosa | xho_Latn 268 | Eastern Yiddish | ydd_Hebr 269 | Yoruba | yor_Latn 270 | Yue Chinese | yue_Hant 271 | Chinese (Simplified) | zho_Hans 272 | Chinese (Traditional) | zho_Hant 273 | Standard Malay | zsm_Latn 274 | Zulu | zul_Latn 275 | 276 | ## Updates to Previous Languages 277 | Based on feedback and further Q/A, we've improved the quality of several languages: 278 | 279 | * Quechua (quy_Latn) 280 | * Aymara (ayr_Latn) 281 | * Cebuano (ceb_Latn) 282 | * Kimbundu (kmb_Latn) 283 | * Umbundu (umb_Latn) 284 | 285 | As a result, the results between FLORES-101 and FLORES-200 for these languages will differ slightly. 286 | 287 | ### Map between FLORES-101 Language Codes and FLORES-200 Language Codes 288 | 289 | FLORES-200 code | FLORES-101 code 290 | ---|--- 291 | afr_Latn | afr 292 | amh_Ethi | amh 293 | arb_Arab | ara 294 | asm_Beng | asm 295 | ast_Latn | ast 296 | azj_Latn | azj 297 | bel_Cyrl | bel 298 | ben_Beng | ben 299 | bos_Latn | bos 300 | bul_Cyrl | bul 301 | cat_Latn | cat 302 | ceb_Latn | ceb 303 | ces_Latn | ces 304 | ckb_Arab | ckb 305 | cym_Latn | cym 306 | dan_Latn | dan 307 | deu_Latn | deu 308 | ell_Grek | ell 309 | eng_Latn | eng 310 | est_Latn | est 311 | fin_Latn | fin 312 | fra_Latn | fra 313 | fuv_Latn | ful 314 | gle_Latn | gle 315 | glg_Latn | glg 316 | guj_Gujr | guj 317 | hau_Latn | hau 318 | heb_Hebr | heb 319 | hin_Deva | hin 320 | hrv_Latn | hrv 321 | hun_Latn | hun 322 | hye_Armn | hye 323 | ibo_Latn | ibo 324 | ind_Latn | ind 325 | isl_Latn | isl 326 | ita_Latn | ita 327 | jav_Latn | jav 328 | jpn_Jpan | jpn 329 | kam_Latn | kam 330 | kan_Knda | kan 331 | kat_Geor | kat 332 | kaz_Cyrl | kaz 333 | khm_Khmr | khm 334 | kir_Cyrl | kir 335 | kor_Hang | kor 336 | lao_Laoo | lao 337 | lij_Latn | Latvian 338 | lim_Latn | kea 339 | lin_Latn | lin 340 | lit_Latn | lit 341 | ltz_Latn | ltz 342 | lug_Latn | lug 343 | luo_Latn | luo 344 | lvs_Latn | lav 345 | mal_Mlym | mal 346 | mar_Deva | mar 347 | mkd_Cyrl | mkd 348 | mlt_Latn | mlt 349 | khk_Cyrl | mon 350 | mri_Latn | mri 351 | mya_Mymr | mya 352 | nld_Latn | nld 353 | nob_Latn | nob 354 | npi_Deva | npi 355 | nso_Latn | nso 356 | nya_Latn | nya 357 | oci_Latn | oci 358 | gaz_Latn | orm 359 | ory_Orya | ory 360 | pan_Guru | pan 361 | pes_Arab | fas 362 | pol_Latn | pol 363 | por_Latn | por 364 | pbt_Arab | pus 365 | ron_Latn | ron 366 | rus_Cyrl | rus 367 | slk_Latn | slk 368 | sna_Latn | sna 369 | snd_Arab | snd 370 | som_Latn | som 371 | spa_Latn | spa 372 | srp_Cyrl | srp 373 | swe_Latn | swe 374 | swh_Latn | swh 375 | 
tam_Taml | tam 376 | tel_Telu | tel 377 | tgk_Cyrl | tgk 378 | tgl_Latn | tgl 379 | tha_Thai | tha 380 | tur_Latn | tur 381 | ukr_Cyrl | ukr 382 | umb_Latn | umb 383 | urd_Arab | urd 384 | uzn_Latn | uzb 385 | vie_Latn | vie 386 | wol_Latn | wol 387 | xho_Latn | xho 388 | yor_Latn | yor 389 | zho_Hans | zho_simpl 390 | zho_Hant | zho_trad 391 | zsm_Latn | msa 392 | zul_Latn | zul 393 | 394 | ## Previous FLORES Releases 395 | 396 | ### FLORES-101 397 | `FLORES-101` is a Many-to-Many multilingual translation benchmark dataset for 101 languages. 398 | 399 | * **Paper:** [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://ai.facebook.com/research/publications/the-flores-101-evaluation-benchmark-for-low-resource-and-multilingual-machine-translation). 400 | 401 | * Download `FLORES-101` [**dataset**](https://dl.fbaipublicfiles.com/flores101/dataset/flores101_dataset.tar.gz) and the [WMT22 supplement](https://dl.fbaipublicfiles.com/flores101/dataset/flores_wmt22_supplement.tar.gz). 402 | 403 | * Read the [**blogpost**](https://ai.facebook.com/blog/the-flores-101-data-set-helping-build-better-translation-systems-around-the-world) and [**paper**](https://ai.facebook.com/research/publications/the-flores-101-evaluation-benchmark-for-low-resource-and-multilingual-machine-translation). 404 | 405 | * Evaluation server: [dynabench](https://dynabench.org/flores), 406 | [Instructions to submit model](/shared_tasks/dynalab/README.md) 407 | 408 | ### FLORESv1 409 | FLORESv1 included Nepali, Sinhala, Pashto, and Khmer. 410 | 411 | * **Paper:** [The FLoRes Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English](https://arxiv.org/abs/1902.01382) 412 | 413 | * Download `FLORESv1` [**dataset**](/previous_releases/floresv1/) 414 | 415 | ## Citation 416 | 417 | If you use this data in your work, please cite: 418 | 419 | ```bibtex 420 | @article{nllb2022, 421 | author = {NLLB Team, Marta R. Costa-jussà, James Cross, Onur Çelebi, Maha Elbayad, Kenneth Heafield, Kevin Heffernan, Elahe Kalbassi, Janice Lam, Daniel Licht, Jean Maillard, Anna Sun, Skyler Wang, Guillaume Wenzek, Al Youngblood, Bapi Akula, Loic Barrault, Gabriel Mejia Gonzalez, Prangthip Hansanti, John Hoffman, Semarley Jarrett, Kaushik Ram Sadagopan, Dirk Rowe, Shannon Spruit, Chau Tran, Pierre Andrews, Necip Fazil Ayan, Shruti Bhosale, Sergey Edunov, Angela Fan, Cynthia Gao, Vedanuj Goswami, Francisco Guzmán, Philipp Koehn, Alexandre Mourachko, Christophe Ropers, Safiyyah Saleem, Holger Schwenk, Jeff Wang}, 422 | title = {No Language Left Behind: Scaling Human-Centered Machine Translation}, 423 | year = {2022} 424 | } 425 | 426 | @inproceedings{, 427 | title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation}, 428 | author={Goyal, Naman and Gao, Cynthia and Chaudhary, Vishrav and Chen, Peng-Jen and Wenzek, Guillaume and Ju, Da and Krishnan, Sanjana and Ranzato, Marc'Aurelio and Guzm\'{a}n, Francisco and Fan, Angela}, 429 | year={2021} 430 | } 431 | 432 | @inproceedings{, 433 | title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English}, 434 | author={Guzm\'{a}n, Francisco and Chen, Peng-Jen and Ott, Myle and Pino, Juan and Lample, Guillaume and Koehn, Philipp and Chaudhary, Vishrav and Ranzato, Marc'Aurelio}, 435 | journal={arXiv preprint arXiv:1902.01382}, 436 | year={2019} 437 | } 438 | ``` 439 | --------------------------------------------------------------------------------