├── src
│   └── language_model
│       ├── __init__.py
│       ├── modelling
│       │   ├── __init__.py
│       │   └── trainer.py
│       ├── tokenization
│       │   ├── __init__.py
│       │   └── trainer.py
│       ├── pipeline.py
│       └── runner.py
├── requirements.txt
├── .gitattributes
├── requirements.dev.txt
├── setup.py
├── .isort.cfg
├── run.py
├── load.py
├── requirements_installation.sh
├── configs
│   └── ukr
│       ├── train_tokenizer
│       │   └── ukr-roberta-base.py
│       └── train_model
│           └── ukr-roberta-base.py
├── setup.cfg
├── .pre-commit-config.yaml
├── .gitignore
└── README.md

/src/language_model/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/language_model/modelling/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/src/language_model/tokenization/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
torch==1.4.0
transformers==2.11.0
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
dvc==0.63.*
git+https://github.com/youscan/ds-shared.git@master
ipython
pre-commit
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

setup(
    name="language-modelling",
    version="0.2.0",
    package_dir={"": "src"},
    packages=find_packages("src"),
    include_package_data=True,
)
--------------------------------------------------------------------------------
/src/language_model/pipeline.py:
--------------------------------------------------------------------------------
class ITask(object):
    def execute(self, environment_path: str) -> None:
        raise NotImplementedError()


class TaskRunner(object):
    def run(self) -> None:
        raise NotImplementedError()
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
[settings]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=119
skip_glob=venv/*,stubs/*
known_first_party = language_model
known_third_party = setuptools,tokenizers,transformers
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import argparse

from language_model.runner import SandboxRunner

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default=None, type=str, required=True, help="Configuration file")
    args = parser.parse_args()
    runner = SandboxRunner(config_path=args.task)
    runner.run()
--------------------------------------------------------------------------------
/load.py:
--------------------------------------------------------------------------------
import argparse

from language_model.runner import SandboxRunner

DATA_FOLDER_PATH = "data"


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default=None, type=str, required=True, help="Configuration file")
    args = parser.parse_args()
    runner = SandboxRunner(config_path=args.task, sandbox_root_path=DATA_FOLDER_PATH)
    runner.run()
--------------------------------------------------------------------------------
/requirements_installation.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
pip install -U pip
pip install -r requirements.txt
pip install -r requirements.dev.txt

(
git clone https://github.com/NVIDIA/apex || { echo "Failed to download and install Nvidia apex"; exit 1; }
cd apex && \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
)

pip install -e .

pre-commit install
--------------------------------------------------------------------------------
/configs/ukr/train_tokenizer/ukr-roberta-base.py:
--------------------------------------------------------------------------------
from tokenizers.implementations import ByteLevelBPETokenizer

from language_model.tokenization.trainer import ByteLevelBPETokenizerTrainer

task = ByteLevelBPETokenizerTrainer(
    source_folder_path="data/ukr/data/wiki_oscar_data/",
    tokenizer=ByteLevelBPETokenizer(),
    vocab_size=52000,
    min_frequency=5,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[isort]
multi_line_output=3
include_trailing_comma=True
force_grid_wrap=0
use_parentheses=True
line_length=119
skip_glob=venv/*

[flake8]
max-line-length = 119
exclude = data/,.dvc/,venv/,.git,apex/,ys-python-api/
per-file-ignores = __init__.py:F401
ignore = E203,W503

[mypy]
ignore_missing_imports = True
disallow_untyped_calls = True
disallow_untyped_defs = True
disallow_incomplete_defs = True
disallow_any_generics = True
check_untyped_defs = True
disallow_untyped_decorators = True
no_implicit_optional = True
warn_return_any = True

[tool:pytest]
norecursedirs = apex/* configs
filterwarnings =
    ignore:.* is deprecated:DeprecationWarning
--------------------------------------------------------------------------------
/configs/ukr/train_model/ukr-roberta-base.py:
--------------------------------------------------------------------------------
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer

from language_model.modelling.trainer import RobertaForMaskedLMTrainTask

_model_config = RobertaConfig(
    vocab_size=52000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1,
    intermediate_size=3072,
)

_model = RobertaForMaskedLM(_model_config)

_tokenizer = RobertaTokenizer.from_pretrained("outputs/ukr/train_tokenizer/ukr-roberta-base/tokenizer", max_len=512)

task = RobertaForMaskedLMTrainTask(
    file_path="data/ukr/aggregated_data/ukr-roberta-base/data.txt",
    model=_model,
    tokenizer=_tokenizer,
    batch_size_per_gpu=40,
)
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/asottile/seed-isort-config
    rev: v1.9.1
    hooks:
      - id: seed-isort-config
  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v4.3.21
    hooks:
      - id: isort
        args: ["-rc"]
  - repo: https://github.com/psf/black
    rev: 19.3b0
    hooks:
      - id: black
        args: ["--line-length=119"]
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.3.0
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: check-json
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
  - repo: https://github.com/pycqa/flake8
    rev: 3.8.2
    hooks:
      - id: flake8
        additional_dependencies: [
          flake8-bugbear==20.1.4,
          flake8-builtins==1.5.3,
          flake8-debugger==3.2.1,
          flake8-isort==3.0.0,
          isort==4.3.21,
        ]
        args: ["--config=setup.cfg"]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.761
    hooks:
      - id: mypy
        args: ["--config=setup.cfg"]
        exclude: configs/
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# IPython Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

.idea/
.mypy_cache/
apex/
data/
results/
--------------------------------------------------------------------------------
/src/language_model/tokenization/trainer.py:
--------------------------------------------------------------------------------
import os
from typing import List

from tokenizers import ByteLevelBPETokenizer

from ..pipeline import ITask


class ByteLevelBPETokenizerTrainer(ITask):
    def __init__(
        self,
        source_folder_path: str,
        tokenizer: ByteLevelBPETokenizer,
        vocab_size: int,
        min_frequency: int,
        special_tokens: List[str],
    ) -> None:
        super().__init__()
        self.source_folder_path = source_folder_path
        self.tokenizer = tokenizer
        self.special_tokens = special_tokens
        self.min_frequency = min_frequency
        self.vocab_size = vocab_size

    def execute(self, environment_path: str) -> None:
        files = self.get_all_files_in_folder(self.source_folder_path)

        self.tokenizer.train(
            files=files,
            vocab_size=self.vocab_size,
            min_frequency=self.min_frequency,
            special_tokens=self.special_tokens,
        )

        self.tokenizer.save(os.path.join(environment_path, "tokenizer"))

    @staticmethod
    def get_all_files_in_folder(data_folder_path: str) -> List[str]:
        # Recursively collect every file in the source folder; all of them are fed to the tokenizer trainer.
        data_files_paths = []
        for (dir_path, _, filenames) in os.walk(data_folder_path):
            data_files_paths.extend([os.path.join(dir_path, file_name) for file_name in filenames])
        return data_files_paths
--------------------------------------------------------------------------------
/src/language_model/modelling/trainer.py:
--------------------------------------------------------------------------------
import os

from transformers import (
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    RobertaForMaskedLM,
    RobertaTokenizerFast,
    Trainer,
    TrainingArguments,
)

from ..pipeline import ITask


class RobertaForMaskedLMTrainTask(ITask):
    def __init__(
        self,
        file_path: str,
        model: RobertaForMaskedLM,
        tokenizer: RobertaTokenizerFast,
        block_size: int = 128,
        mlm_probability: float = 0.15,
        epochs: int = 1,
        batch_size_per_gpu: int = 64,
        save_steps: int = 10000,
        save_total_limit: int = 2,
    ) -> None:
        super().__init__()
        self.file_path = file_path
        self.model = model
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.mlm_probability = mlm_probability
        self.epochs = epochs
        self.batch_size_per_gpu = batch_size_per_gpu
        self.save_steps = save_steps
        self.save_total_limit = save_total_limit

    def execute(self, environment_path: str) -> None:
        dataset = LineByLineTextDataset(tokenizer=self.tokenizer, file_path=self.file_path, block_size=self.block_size)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=True, mlm_probability=self.mlm_probability
        )

        training_args = TrainingArguments(
            output_dir=os.path.join(environment_path, "temp"),
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_gpu_train_batch_size=self.batch_size_per_gpu,
            save_steps=self.save_steps,
            save_total_limit=self.save_total_limit,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=dataset,
            prediction_loss_only=True,
        )

        trainer.train()

        trainer.save_model(os.path.join(environment_path, "model"))
        self.tokenizer.save_pretrained(os.path.join(environment_path, "tokenizer"))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Ukrainian Roberta

Ukrainian Roberta was trained by the YouScan data science team.

## Pre-training corpora
Below is the list of corpora used, along with the output of the `wc` command (counting lines, words and characters). These corpora were concatenated and tokenized with the HuggingFace Roberta tokenizer; a rough sketch of the aggregation step is shown after the table.

| Corpus | Lines | Words | Characters |
| ------------- |--------------:| -----:| -----:|
| [Ukrainian Wikipedia - May 2020](https://dumps.wikimedia.org/ukwiki/latest/ukwiki-latest-pages-articles.xml.bz2) | 18 001 466 | 201 207 739 | 2 647 891 947 |
| [Ukrainian OSCAR deduplicated dataset](https://oscar-public.huma-num.fr/shuffled/uk_dedup.txt.gz) | 56 560 011 | 2 250 210 650 | 29 705 050 592 |
| Sampled mentions from social networks | 11 245 710 | 128 461 796 | 1 632 567 763 |
| Total | 85 807 187 | 2 579 880 185 | 33 985 510 302 |
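The aggregation script itself is not part of this repository. The sketch below illustrates one way the corpora could be concatenated into a single training file; the input file names are placeholders, and only the output path is taken from `configs/ukr/train_model/ukr-roberta-base.py`.

```python
import os

# Placeholder input files: one plain-text file per corpus listed in the table above.
corpus_files = [
    "data/ukr/wiki.txt",
    "data/ukr/oscar_dedup.txt",
    "data/ukr/social_mentions.txt",
]
# Output path expected by configs/ukr/train_model/ukr-roberta-base.py.
output_path = "data/ukr/aggregated_data/ukr-roberta-base/data.txt"

os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as out_file:
    for corpus_path in corpus_files:
        with open(corpus_path, encoding="utf-8") as in_file:
            for line in in_file:
                out_file.write(line)
```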
## Pre-training details

* Ukrainian Roberta was trained with the code provided in the [HuggingFace tutorial](https://huggingface.co/blog/how-to-train)
* The currently released model follows the roberta-base cased architecture (12-layer, 768-hidden, 12-heads, 125M parameters)
* The model was trained on 4xV100 GPUs (85 hours)
* The training configuration can be found in the config file `configs/ukr/train_model/ukr-roberta-base.py`; a sketch of how it is launched is shown below
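A config file is a plain Python module that defines a `task` object. `SandboxRunner` imports that module, creates a sandbox folder under `outputs/` (the config path with the leading `configs/` stripped), and calls `task.execute(...)` with that folder. The sketch below runs both steps from the repository root (equivalent to invoking `run.py` with the corresponding `--task` paths); it assumes the data paths referenced in the configs already exist.

```python
from language_model.runner import SandboxRunner

# Equivalent to: python run.py --task configs/ukr/train_tokenizer/ukr-roberta-base.py
SandboxRunner(config_path="configs/ukr/train_tokenizer/ukr-roberta-base.py").run()

# Equivalent to: python run.py --task configs/ukr/train_model/ukr-roberta-base.py
# (this config loads the tokenizer produced by the previous step from outputs/)
SandboxRunner(config_path="configs/ukr/train_model/ukr-roberta-base.py").run()
```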
## Evaluation
ukr-roberta-base was tested on internal YouScan tasks, where we observed an improvement of 2 percent (F-score) compared to [mBERT](https://huggingface.co/bert-base-multilingual-cased).

## Usage
Ukrainian Roberta is released via the [HuggingFace Transformers library](https://huggingface.co/transformers/).

```python
from transformers import pipeline, RobertaForMaskedLM, RobertaTokenizer

model = RobertaForMaskedLM.from_pretrained("youscan/ukr-roberta-base")
tokenizer = RobertaTokenizer.from_pretrained("youscan/ukr-roberta-base")

fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
fill_mask("Тарас Шевченко – великий українсьский <mask>.")
# [{'sequence': ' Тарас Шевченко – великий українсьский поет.',
#   'score': 0.48607954382896423,
#   'token': 11426},
#  {'sequence': ' Тарас Шевченко – великий українсьский письменник.',
#   'score': 0.23330871760845184,
#   'token': 10121},
#  {'sequence': ' Тарас Шевченко – великий українсьский художник.',
#   'score': 0.06583040952682495,
#   'token': 12836},
#  {'sequence': ' Тарас Шевченко – великий українсьский князь.',
#   'score': 0.021497823297977448,
#   'token': 17247},
#  {'sequence': ' Тарас Шевченко – великий українсьский народ.',
#   'score': 0.020411811769008636,
#   'token': 1021}]
```

## Author
Vitalii Radchenko - contact me on Twitter [@vitaliradchenko](https://twitter.com/vitaliradchenko)
--------------------------------------------------------------------------------
/src/language_model/runner.py:
--------------------------------------------------------------------------------
import os
from importlib import import_module
from logging import INFO, FileHandler, Formatter, Logger, StreamHandler, getLogger
from typing import Sequence

from .pipeline import ITask, TaskRunner

LOGGING_FORMAT: str = "%(asctime)s : %(levelname)s : %(module)s : %(message)s"
DEFAULT_LOG_DIR: str = "logs"
DEFAULT_LOG_FILE: str = "log.txt"
DEFAULT_CONFIGURATION_DIR: str = "configs"
TASK_FIELD_NAME: str = "task"


class SandboxRunner(TaskRunner):
    def __init__(self, config_path: str, sandbox_root_path: str = "outputs") -> None:
        self.config_path = config_path
        self.sandbox_root_path = sandbox_root_path

    def get_root_folder_path(self) -> str:
        return self.sandbox_root_path

    def run(self) -> None:
        experiment_ids = identifiers_from_config_file(self.config_path)
        module_name = ".".join(experiment_ids)
        module = import_module(module_name)
        task: ITask = getattr(module, TASK_FIELD_NAME)

        pure_experiment_ids = drop_configuration_dir(experiment_ids=experiment_ids)
        experiment_sandbox_path = os.path.join(*pure_experiment_ids)
        sandbox_folder_path = os.path.join(self.get_root_folder_path(), experiment_sandbox_path)

        logger = init_logger(experiment_ids, overwrite=True)
        logger.info(f"Running task from {self.config_path}")
        if not os.path.exists(sandbox_folder_path) or not os.path.isdir(sandbox_folder_path):
            os.makedirs(sandbox_folder_path)
        task.execute(sandbox_folder_path)


def identifiers_from_config_file(filepath: str) -> Sequence[str]:
    path = os.path.normpath(filepath)
    path_components = path.split(os.sep)
    path_components[-1] = os.path.splitext(path_components[-1])[0]
    return path_components


def drop_configuration_dir(experiment_ids: Sequence[str]) -> Sequence[str]:
    # Sequence.index raises ValueError when the value is missing, so guard with a membership check
    # instead of comparing the result to -1.
    if DEFAULT_CONFIGURATION_DIR in experiment_ids[:-1]:
        config_dir_index = experiment_ids[:-1].index(DEFAULT_CONFIGURATION_DIR)
        experiment_ids = experiment_ids[config_dir_index + 1 :]
    return experiment_ids


def init_logger(experiment_identifiers: Sequence[str], overwrite: bool = True, log_to_stderr: bool = False) -> Logger:
    path_components = [DEFAULT_LOG_DIR] + list(experiment_identifiers)
    log_path = os.path.join(*path_components)

    logger = getLogger()
    logger.setLevel(INFO)

    logging_path = os.path.join(log_path)
    if not os.path.exists(logging_path):
        os.makedirs(logging_path)

    formatter = Formatter(LOGGING_FORMAT)

    fh = FileHandler(os.path.join(logging_path, DEFAULT_LOG_FILE), mode="w" if overwrite else "a", encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    if log_to_stderr:
        sh = StreamHandler()
        sh.setFormatter(formatter)
        logger.addHandler(sh)

    return logger
--------------------------------------------------------------------------------