├── src
│   └── language_model
│       ├── __init__.py
│       ├── modelling
│       │   ├── __init__.py
│       │   └── trainer.py
│       ├── tokenization
│       │   ├── __init__.py
│       │   └── trainer.py
│       ├── pipeline.py
│       └── runner.py
├── requirements.txt
├── .gitattributes
├── requirements.dev.txt
├── setup.py
├── .isort.cfg
├── run.py
├── load.py
├── requirements_installation.sh
├── configs
│   └── ukr
│       ├── train_tokenizer
│       │   └── ukr-roberta-base.py
│       └── train_model
│           └── ukr-roberta-base.py
├── setup.cfg
├── .pre-commit-config.yaml
├── .gitignore
└── README.md

/src/language_model/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/language_model/modelling/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/language_model/tokenization/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
torch==1.4.0
transformers==2.11.0
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
dvc==0.63.*
git+https://github.com/youscan/ds-shared.git@master
ipython
pre-commit
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

setup(
    name="language-modelling",
    version="0.2.0",
    package_dir={"": "src"},
    packages=find_packages("src"),
    include_package_data=True,
)
--------------------------------------------------------------------------------
/src/language_model/pipeline.py:
--------------------------------------------------------------------------------
class ITask(object):
    def execute(self, environment_path: str) -> None:
        raise NotImplementedError()


class TaskRunner(object):
    def run(self) -> None:
        raise NotImplementedError()
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
[settings]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=119
skip_glob=venv/*,stubs/*
known_first_party = language_model
known_third_party = setuptools,tokenizers,transformers
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import argparse

from language_model.runner import SandboxRunner

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default=None, type=str, required=True, help="Configuration file")
    args = parser.parse_args()
    runner = SandboxRunner(config_path=args.task)
    runner.run()
--------------------------------------------------------------------------------
/load.py:
--------------------------------------------------------------------------------
import argparse

from language_model.runner import SandboxRunner

DATA_FOLDER_PATH = "data"


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default=None, type=str, required=True, help="Configuration file")
    args = parser.parse_args()
    runner = SandboxRunner(config_path=args.task, sandbox_root_path=DATA_FOLDER_PATH)
    runner.run()
--------------------------------------------------------------------------------
/requirements_installation.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
pip install -U pip
pip install -r requirements.txt
pip install -r requirements.dev.txt

(
git clone https://github.com/NVIDIA/apex || { echo "Failed to download and install Nvidia apex"; exit 1; }
cd apex && \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
)

pip install -e .

pre-commit install
--------------------------------------------------------------------------------
/configs/ukr/train_tokenizer/ukr-roberta-base.py:
--------------------------------------------------------------------------------
from tokenizers.implementations import ByteLevelBPETokenizer

from language_model.tokenization.trainer import ByteLevelBPETokenizerTrainer

task = ByteLevelBPETokenizerTrainer(
    source_folder_path="data/ukr/data/wiki_oscar_data/",
    tokenizer=ByteLevelBPETokenizer(),
    vocab_size=52000,
    min_frequency=5,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[isort]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=119
skip_glob=venv/*

[flake8]
max-line-length = 119
exclude = data/,.dvc/,venv/,.git,apex/,ys-python-api/
per-file-ignores = __init__.py:F401
ignore = E203,W503

[mypy]
ignore_missing_imports = True
disallow_untyped_calls = True
disallow_untyped_defs = True
disallow_incomplete_defs = True
disallow_any_generics = True
check_untyped_defs = True
disallow_untyped_decorators = True
no_implicit_optional = True
warn_return_any = True

[tool:pytest]
norecursedirs = apex/* configs
filterwarnings =
    ignore:.* is deprecated:DeprecationWarning
--------------------------------------------------------------------------------
/configs/ukr/train_model/ukr-roberta-base.py:
--------------------------------------------------------------------------------
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer

from language_model.modelling.trainer import RobertaForMaskedLMTrainTask

_model_config = RobertaConfig(
    vocab_size=52000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
    intermediate_size=3072,
)

_model = RobertaForMaskedLM(_model_config)

_tokenizer = RobertaTokenizer.from_pretrained("outputs/ukr/train_tokenizer/ukr-roberta-base/tokenizer", max_len=512)

task = RobertaForMaskedLMTrainTask(
    file_path="data/ukr/aggregated_data/ukr-roberta-base/data.txt",
    model=_model,
    tokenizer=_tokenizer,
    batch_size_per_gpu=40,
)
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/asottile/seed-isort-config
    rev: v1.9.1
    hooks:
      - id: seed-isort-config
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v4.3.21
    hooks:
      - id: isort
        args: ["-rc"]
  - repo: https://github.com/psf/black
    rev: 19.3b0
    hooks:
      - id: black
        args: ["--line-length=119"]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: check-json
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
  - repo: https://github.com/pycqa/flake8
    rev: 3.8.2
    hooks:
      - id: flake8
        additional_dependencies: [
          flake8-bugbear==20.1.4,
          flake8-builtins==1.5.3,
          flake8-debugger==3.2.1,
          flake8-isort==3.0.0,
          isort==4.3.21,
        ]
        args: ["--config=setup.cfg"]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.761
    hooks:
      - id: mypy
        args: ["--config=setup.cfg"]
        exclude: configs/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

.idea/
.mypy_cache/
apex/
data/
results/
--------------------------------------------------------------------------------
/src/language_model/tokenization/trainer.py:
--------------------------------------------------------------------------------
import os
from typing import List

from tokenizers import ByteLevelBPETokenizer

from ..pipeline import ITask


class ByteLevelBPETokenizerTrainer(ITask):
    def __init__(
        self,
        source_folder_path: str,
        tokenizer: ByteLevelBPETokenizer,
        vocab_size: int,
        min_frequency: int,
        special_tokens: List[str],
    ) -> None:
        super().__init__()
        self.source_folder_path = source_folder_path
        self.tokenizer = tokenizer
        self.special_tokens = special_tokens
        self.min_frequency = min_frequency
        self.vocab_size = vocab_size

    def execute(self, environment_path: str) -> None:
        files = self.get_all_files_in_folder(self.source_folder_path)

        self.tokenizer.train(
            files=files,
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.special_tokens,
        )

        self.tokenizer.save(os.path.join(environment_path, "tokenizer"))

    @staticmethod
    def get_all_files_in_folder(data_folder_path: str) -> List[str]:
        # Recursively collect every file in the source folder; all of them are fed to the tokenizer trainer.
        data_files_paths = []
        for (dir_path, _, filenames) in os.walk(data_folder_path):
            data_files_paths.extend([os.path.join(dir_path, file_name) for file_name in filenames])
        return data_files_paths
--------------------------------------------------------------------------------
/src/language_model/modelling/trainer.py:
--------------------------------------------------------------------------------
import os

from transformers import (
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

from ..pipeline import ITask


class RobertaForMaskedLMTrainTask(ITask):
    def __init__(
        self,
        file_path: str,
        model: RobertaForMaskedLM,
        tokenizer: RobertaTokenizerFast,
        block_size: int = 128,
        mlm_probability: float = 0.15,
        epochs: int = 1,
        batch_size_per_gpu: int = 64,
        save_steps: int = 10000,
        save_total_limit: int = 2,
    ) -> None:
        super().__init__()
        self.file_path = file_path
        self.model = model
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.mlm_probability = mlm_probability
        self.epochs = epochs
        self.batch_size_per_gpu = batch_size_per_gpu
        self.save_steps = save_steps
        self.save_total_limit = save_total_limit

    def execute(self, environment_path: str) -> None:
        dataset = LineByLineTextDataset(tokenizer=self.tokenizer, file_path=self.file_path, block_size=self.block_size)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=True, mlm_probability=self.mlm_probability
        )

        training_args = TrainingArguments(
            output_dir=os.path.join(environment_path, "temp"),
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_gpu_train_batch_size=self.batch_size_per_gpu,
            save_steps=self.save_steps,
            save_total_limit=self.save_total_limit,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=dataset,
            prediction_loss_only=True,
        )

        trainer.train()

        trainer.save_model(os.path.join(environment_path, "model"))
        self.tokenizer.save_pretrained(os.path.join(environment_path, "tokenizer"))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Ukrainian Roberta

Ukrainian Roberta was trained by the YouScan data science team.

## Pre-training corpora
Below is the list of corpora used, along with the output of the `wc` command (counting lines, words and characters). These corpora were concatenated and tokenized with the HuggingFace Roberta tokenizer; a rough sketch of the aggregation step is shown after the table.

| Corpus | Lines | Words | Characters |
| ------------- |--------------:| -----:| -----:|
| [Ukrainian Wikipedia - May 2020](https://dumps.wikimedia.org/ukwiki/latest/ukwiki-latest-pages-articles.xml.bz2) | 18 001 466 | 201 207 739 | 2 647 891 947 |
| [Ukrainian OSCAR deduplicated dataset](https://oscar-public.huma-num.fr/shuffled/uk_dedup.txt.gz) | 56 560 011 | 2 250 210 650 | 29 705 050 592 |
| Sampled mentions from social networks | 11 245 710 | 128 461 796 | 1 632 567 763 |
| Total | 85 807 187 | 2 579 880 185 | 33 985 510 302 |
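The aggregation script itself is not part of this repository. The sketch below illustrates one way the corpora could be concatenated into a single training file; the input file names are placeholders, and only the output path is taken from `configs/ukr/train_model/ukr-roberta-base.py`.

```python
import os

# Placeholder input files: one plain-text file per corpus listed in the table above.
corpus_files = [
    "data/ukr/wiki.txt",
    "data/ukr/oscar_dedup.txt",
    "data/ukr/social_mentions.txt",
]
# Output path expected by configs/ukr/train_model/ukr-roberta-base.py.
output_path = "data/ukr/aggregated_data/ukr-roberta-base/data.txt"

os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as out_file:
    for corpus_path in corpus_files:
        with open(corpus_path, encoding="utf-8") as in_file:
            for line in in_file:
                out_file.write(line)
```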
## Pre-training details

* Ukrainian Roberta was trained with the code provided in the [HuggingFace tutorial](https://huggingface.co/blog/how-to-train)
* The currently released model follows the roberta-base cased architecture (12-layer, 768-hidden, 12-heads, 125M parameters)
* The model was trained on 4xV100 GPUs (85 hours)
* The training configuration can be found in the config file `configs/ukr/train_model/ukr-roberta-base.py`; a sketch of how it is launched is shown below
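A config file is a plain Python module that defines a `task` object. `SandboxRunner` imports that module, creates a sandbox folder under `outputs/` (the config path with the leading `configs/` stripped), and calls `task.execute(...)` with that folder. The sketch below runs both steps from the repository root (equivalent to invoking `run.py` with the corresponding `--task` paths); it assumes the data paths referenced in the configs already exist.

```python
from language_model.runner import SandboxRunner

# Equivalent to: python run.py --task configs/ukr/train_tokenizer/ukr-roberta-base.py
SandboxRunner(config_path="configs/ukr/train_tokenizer/ukr-roberta-base.py").run()

# Equivalent to: python run.py --task configs/ukr/train_model/ukr-roberta-base.py
# (this config loads the tokenizer produced by the previous step from outputs/)
SandboxRunner(config_path="configs/ukr/train_model/ukr-roberta-base.py").run()
```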
## Evaluation
ukr-roberta-base was tested on internal YouScan tasks, where we observed an improvement of 2 percent (F-score) compared to [mBERT](https://huggingface.co/bert-base-multilingual-cased).

## Usage
Ukrainian Roberta is released via the [HuggingFace Transformers library](https://huggingface.co/transformers/).

```python
from transformers import pipeline, RobertaForMaskedLM, RobertaTokenizer

model = RobertaForMaskedLM.from_pretrained("youscan/ukr-roberta-base")
tokenizer = RobertaTokenizer.from_pretrained("youscan/ukr-roberta-base")

fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
fill_mask("Тарас Шевченко – великий українсьский <mask>.")
# [{'sequence': ' Тарас Шевченко – великий українсьский поет.',
#   'score': 0.48607954382896423,
#   'token': 11426},
#  {'sequence': ' Тарас Шевченко – великий українсьский письменник.',
#   'score': 0.23330871760845184,
#   'token': 10121},
#  {'sequence': ' Тарас Шевченко – великий українсьский художник.',
#   'score': 0.06583040952682495,
#   'token': 12836},
#  {'sequence': ' Тарас Шевченко – великий українсьский князь.',
#   'score': 0.021497823297977448,
#   'token': 17247},
#  {'sequence': ' Тарас Шевченко – великий українсьский народ.',
#   'score': 0.020411811769008636,
#   'token': 1021}]
```

## Author
Vitalii Radchenko - contact me on Twitter [@vitaliradchenko](https://twitter.com/vitaliradchenko)
--------------------------------------------------------------------------------
/src/language_model/runner.py:
--------------------------------------------------------------------------------
import os
from importlib import import_module
from logging import INFO, FileHandler, Formatter, Logger, StreamHandler, getLogger
from typing import Sequence

from .pipeline import ITask, TaskRunner

LOGGING_FORMAT: str = "%(asctime)s : %(levelname)s : %(module)s : %(message)s"
DEFAULT_LOG_DIR: str = "logs"
DEFAULT_LOG_FILE: str = "log.txt"
DEFAULT_CONFIGURATION_DIR: str = "configs"
TASK_FIELD_NAME: str = "task"


class SandboxRunner(TaskRunner):
    def __init__(self, config_path: str, sandbox_root_path: str = "outputs") -> None:
        self.config_path = config_path
        self.sandbox_root_path = sandbox_root_path

    def get_root_folder_path(self) -> str:
        return self.sandbox_root_path

    def run(self) -> None:
        experiment_ids = identifiers_from_config_file(self.config_path)
        module_name = ".".join(experiment_ids)
        module = import_module(module_name)
        task: ITask = getattr(module, TASK_FIELD_NAME)

        pure_experiment_ids = drop_configuration_dir(experiment_ids=experiment_ids)
        experiment_sandbox_path = os.path.join(*pure_experiment_ids)
        sandbox_folder_path = os.path.join(self.get_root_folder_path(), experiment_sandbox_path)

        logger = init_logger(experiment_ids, overwrite=True)
        logger.info(f"Running task from {self.config_path}")
        if not os.path.exists(sandbox_folder_path) or not os.path.isdir(sandbox_folder_path):
            os.makedirs(sandbox_folder_path)
        task.execute(sandbox_folder_path)


def identifiers_from_config_file(filepath: str) -> Sequence[str]:
    path = os.path.normpath(filepath)
    path_components = path.split(os.sep)
    path_components[-1] = os.path.splitext(path_components[-1])[0]
    return path_components


def drop_configuration_dir(experiment_ids: Sequence[str]) -> Sequence[str]:
    # Sequence.index raises ValueError when the value is missing, so guard with a membership check
    # instead of comparing the result to -1.
    if DEFAULT_CONFIGURATION_DIR in experiment_ids[:-1]:
        config_dir_index = experiment_ids[:-1].index(DEFAULT_CONFIGURATION_DIR)
        experiment_ids = experiment_ids[config_dir_index + 1 :]
    return experiment_ids


def init_logger(experiment_identifiers: Sequence[str], overwrite: bool = True, log_to_stderr: bool = False) -> Logger:
    path_components = [DEFAULT_LOG_DIR] + list(experiment_identifiers)
    log_path = os.path.join(*path_components)

    logger = getLogger()
    logger.setLevel(INFO)

    logging_path = os.path.join(log_path)
    if not os.path.exists(logging_path):
        os.makedirs(logging_path)

    formatter = Formatter(LOGGING_FORMAT)

    fh = FileHandler(os.path.join(logging_path, DEFAULT_LOG_FILE), mode="w" if overwrite else "a", encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    if log_to_stderr:
        sh = StreamHandler()
        sh.setFormatter(formatter)
        logger.addHandler(sh)

    return logger
--------------------------------------------------------------------------------