├── src
│   └── language_model
│       ├── __init__.py
│       ├── modelling
│       │   ├── __init__.py
│       │   └── trainer.py
│       ├── tokenization
│       │   ├── __init__.py
│       │   └── trainer.py
│       ├── pipeline.py
│       └── runner.py
├── requirements.txt
├── .gitattributes
├── requirements.dev.txt
├── setup.py
├── .isort.cfg
├── run.py
├── load.py
├── requirements_installation.sh
├── configs
│   └── ukr
│       ├── train_tokenizer
│       │   └── ukr-roberta-base.py
│       └── train_model
│           └── ukr-roberta-base.py
├── setup.cfg
├── .pre-commit-config.yaml
├── .gitignore
└── README.md
/src/language_model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/language_model/modelling/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/language_model/tokenization/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.4.0
2 | transformers==2.11.0
3 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
1 | dvc==0.63.*
2 | git+https://github.com/youscan/ds-shared.git@master
3 | ipython
4 | pre-commit
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 | name="language-modelling",
5 | version="0.2.0",
6 | package_dir={"": "src"},
7 | packages=find_packages("src"),
8 | include_package_data=True,
9 | )
10 |
--------------------------------------------------------------------------------
/src/language_model/pipeline.py:
--------------------------------------------------------------------------------
1 | class ITask(object):
2 | def execute(self, environment_path: str) -> None:
3 | raise NotImplementedError()
4 |
5 |
6 | class TaskRunner(object):
7 | def run(self) -> None:
8 | raise NotImplementedError()
9 |
--------------------------------------------------------------------------------
/.isort.cfg:
--------------------------------------------------------------------------------
1 | [settings]
2 | multi_line_output=3
3 | include_trailing_comma=True
4 | force_grid_wrap=0
5 | use_parentheses=True
6 | line_length=119
7 | skip_glob=venv/*,stubs/*
8 | known_first_party = language_model
9 | known_third_party = setuptools,tokenizers,transformers
10 |
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from language_model.runner import SandboxRunner
4 |
5 | if __name__ == "__main__":
6 | parser = argparse.ArgumentParser()
7 | parser.add_argument("--task", default=None, type=str, required=True, help="Configuration file")
8 | args = parser.parse_args()
9 | runner = SandboxRunner(config_path=args.task)
10 | runner.run()
11 |
--------------------------------------------------------------------------------
/load.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from language_model.runner import SandboxRunner
4 |
5 | DATA_FOLDER_PATH = "data"
6 |
7 |
8 | if __name__ == "__main__":
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument("--task", default=None, type=str, required=True, help="Configuration file")
11 | args = parser.parse_args()
12 | runner = SandboxRunner(config_path=args.task, sandbox_root_path=DATA_FOLDER_PATH)
13 | runner.run()
14 |
--------------------------------------------------------------------------------
/requirements_installation.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | pip install -U pip
3 | pip install -r requirements.txt
4 | pip install -r requirements.dev.txt
5 |
6 | (
7 | git clone https://github.com/NVIDIA/apex || { echo "Failed to download and install Nvidia apex"; exit 1; }
8 | cd apex && \
9 | pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
10 | )
11 |
12 | pip install -e .
13 |
14 | pre-commit install
15 |
--------------------------------------------------------------------------------
/configs/ukr/train_tokenizer/ukr-roberta-base.py:
--------------------------------------------------------------------------------
1 | from tokenizers.implementations import ByteLevelBPETokenizer
2 |
3 | from language_model.tokenization.trainer import ByteLevelBPETokenizerTrainer
4 |
5 | task = ByteLevelBPETokenizerTrainer(
6 | source_folder_path="data/ukr/data/wiki_oscar_data/",
7 | tokenizer=ByteLevelBPETokenizer(),
8 | vocab_size=52000,
9 | min_frequency=5,
10 |     special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
11 | )
12 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [isort]
2 | multi_line_output=3
3 | include_trailing_comma=True
4 | force_grid_wrap=0
5 | use_parentheses=True
6 | line_length=119
7 | skip_glob=venv/*
8 |
9 | [flake8]
10 | max-line-length = 119
11 | exclude = data/,.dvc/,venv/,.git,apex/,ys-python-api/
12 | per-file-ignores = __init__.py:F401
13 | ignore = E203,W503
14 |
15 | [mypy]
16 | ignore_missing_imports = True
17 | disallow_untyped_calls = True
18 | disallow_untyped_defs = True
19 | disallow_incomplete_defs = True
20 | disallow_any_generics = True
21 | check_untyped_defs = True
22 | disallow_untyped_decorators = True
23 | no_implicit_optional = True
24 | warn_return_any = True
25 |
26 | [tool:pytest]
27 | norecursedirs = apex/* configs
28 | filterwarnings =
29 | ignore:.* is deprecated:DeprecationWarning
30 |
--------------------------------------------------------------------------------
/configs/ukr/train_model/ukr-roberta-base.py:
--------------------------------------------------------------------------------
1 | from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
2 |
3 | from language_model.modelling.trainer import RobertaForMaskedLMTrainTask
4 |
5 | _model_config = RobertaConfig(
6 | vocab_size=52000,
7 | max_position_embeddings=514,
8 | num_attention_heads=12,
9 | num_hidden_layers=12,
10 | type_vocab_size=1,
11 | intermediate_size=3072,
12 | )
13 |
14 | _model = RobertaForMaskedLM(_model_config)
15 |
16 | _tokenizer = RobertaTokenizer.from_pretrained("outputs/ukr/train_tokenizer/ukr-roberta-base/tokenizer", max_len=512)
17 |
18 | task = RobertaForMaskedLMTrainTask(
19 | file_path="data/ukr/aggregated_data/ukr-roberta-base/data.txt",
20 | model=_model,
21 | tokenizer=_tokenizer,
22 | batch_size_per_gpu=40,
23 | )
24 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/asottile/seed-isort-config
3 | rev: v1.9.1
4 | hooks:
5 | - id: seed-isort-config
6 | - repo: https://github.com/pre-commit/mirrors-isort
7 | rev: v4.3.21
8 | hooks:
9 | - id: isort
10 | args: ["-rc"]
11 | - repo: https://github.com/psf/black
12 | rev: 19.3b0
13 | hooks:
14 | - id: black
15 | args: ["--line-length=119"]
16 | - repo: https://github.com/pre-commit/pre-commit-hooks
17 | rev: v2.3.0
18 | hooks:
19 | - id: trailing-whitespace
20 | - id: check-yaml
21 | - id: check-json
22 | - id: end-of-file-fixer
23 | - id: requirements-txt-fixer
24 | - repo: https://github.com/pycqa/flake8
25 | rev: 3.8.2
26 | hooks:
27 | - id: flake8
28 | additional_dependencies: [
29 | flake8-bugbear==20.1.4,
30 | flake8-builtins==1.5.3,
31 | flake8-debugger==3.2.1,
32 | flake8-isort==3.0.0,
33 | isort==4.3.21,
34 | ]
35 | args: ["--config=setup.cfg"]
36 | - repo: https://github.com/pre-commit/mirrors-mypy
37 | rev: v0.761
38 | hooks:
39 | - id: mypy
40 | args: ["--config=setup.cfg"]
41 | exclude: configs/
42 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | .idea/
92 | .mypy_cache/
93 | apex/
94 | data/
95 | results/
96 |
--------------------------------------------------------------------------------
/src/language_model/tokenization/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import List
3 |
4 | from tokenizers import ByteLevelBPETokenizer
5 |
6 | from ..pipeline import ITask
7 |
8 |
9 | class ByteLevelBPETokenizerTrainer(ITask):
10 | def __init__(
11 | self,
12 | source_folder_path: str,
13 | tokenizer: ByteLevelBPETokenizer,
14 | vocab_size: int,
15 | min_frequency: int,
16 | special_tokens: List[str],
17 | ) -> None:
18 | super().__init__()
19 | self.source_folder_path = source_folder_path
20 | self.tokenizer = tokenizer
21 | self.special_tokens = special_tokens
22 | self.min_frequency = min_frequency
23 | self.vocab_size = vocab_size
24 |
25 | def execute(self, environment_path: str) -> None:
26 | files = self.get_all_files_in_folder(self.source_folder_path)
27 |
28 | self.tokenizer.train(
29 | files=files,
30 | vocab_size=self.vocab_size,
31 | min_frequency=self.min_frequency,
32 | special_tokens=self.special_tokens,
33 | )
34 |
35 | self.tokenizer.save(os.path.join(environment_path, "tokenizer"))
36 |
37 | @staticmethod
38 | def get_all_files_in_folder(data_folder_path: str) -> List[str]:
39 | data_files_paths = []
40 | for (dir_path, _, filenames) in os.walk(data_folder_path):
41 | data_files_paths.extend([os.path.join(dir_path, file_name) for file_name in filenames])
42 | return data_files_paths
43 |
--------------------------------------------------------------------------------
/src/language_model/modelling/trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from transformers import (
4 | DataCollatorForLanguageModeling,
5 | LineByLineTextDataset,
6 | RobertaForMaskedLM,
7 | RobertaTokenizerFast,
8 | Trainer,
9 | TrainingArguments,
10 | )
11 |
12 | from ..pipeline import ITask
13 |
14 |
15 | class RobertaForMaskedLMTrainTask(ITask):
16 | def __init__(
17 | self,
18 | file_path: str,
19 | model: RobertaForMaskedLM,
20 | tokenizer: RobertaTokenizerFast,
21 | block_size: int = 128,
22 | mlm_probability: float = 0.15,
23 | epochs: int = 1,
24 | batch_size_per_gpu: int = 64,
25 | save_steps: int = 10000,
26 | save_total_limit: int = 2,
27 | ) -> None:
28 | super().__init__()
29 | self.file_path = file_path
30 | self.model = model
31 | self.tokenizer = tokenizer
32 | self.block_size = block_size
33 | self.mlm_probability = mlm_probability
34 | self.epochs = epochs
35 | self.batch_size_per_gpu = batch_size_per_gpu
36 | self.save_steps = save_steps
37 | self.save_total_limit = save_total_limit
38 |
39 | def execute(self, environment_path: str) -> None:
40 | dataset = LineByLineTextDataset(tokenizer=self.tokenizer, file_path=self.file_path, block_size=self.block_size)
41 |
42 | data_collator = DataCollatorForLanguageModeling(
43 | tokenizer=self.tokenizer, mlm=True, mlm_probability=self.mlm_probability
44 | )
45 |
46 | training_args = TrainingArguments(
47 | output_dir=os.path.join(environment_path, "temp"),
48 | overwrite_output_dir=True,
49 | num_train_epochs=self.epochs,
50 | per_gpu_train_batch_size=self.batch_size_per_gpu,
51 | save_steps=self.save_steps,
52 | save_total_limit=self.save_total_limit,
53 | )
54 |
55 | trainer = Trainer(
56 | model=self.model,
57 | args=training_args,
58 | data_collator=data_collator,
59 | train_dataset=dataset,
60 | prediction_loss_only=True,
61 | )
62 |
63 | trainer.train()
64 |
65 | trainer.save_model(os.path.join(environment_path, "model"))
66 | self.tokenizer.save_pretrained(os.path.join(environment_path, "tokenizer"))
67 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Ukrainian Roberta
2 |
3 | Ukrainian Roberta was trained by the YouScan data science team.
4 |
5 | ## Pre-training corpora
6 | Below is the list of corpora used, along with the output of the `wc` command (counting lines, words and characters). These corpora were concatenated and tokenized with the HuggingFace RoBERTa tokenizer (see the command after the table).
7 |
8 | | Corpus | Lines | Words | Characters |
9 | | ------------- |--------------:| -----:| -----:|
10 | | [Ukrainian Wikipedia - May 2020](https://dumps.wikimedia.org/ukwiki/latest/ukwiki-latest-pages-articles.xml.bz2) | 18 001 466| 201 207 739 | 2 647 891 947 |
11 | | [Ukrainian OSCAR deduplicated dataset](https://oscar-public.huma-num.fr/shuffled/uk_dedup.txt.gz) | 56 560 011 | 2 250 210 650 | 29 705 050 592 |
12 | | Sampled mentions from social networks | 11 245 710 | 128 461 796 | 1 632 567 763 |
13 | | Total | 85 807 187 | 2 579 880 185 | 33 985 510 302 |
14 |
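The tokenizer for this corpus can be retrained with the config shipped in this repository. A minimal sketch, assuming the aggregated text files sit under `data/ukr/data/wiki_oscar_data/` (the path expected by `configs/ukr/train_tokenizer/ukr-roberta-base.py`):

```bash
# run.py imports the config as a module and executes its `task`;
# the trained vocabulary is written to outputs/ukr/train_tokenizer/ukr-roberta-base/tokenizer
python run.py --task configs/ukr/train_tokenizer/ukr-roberta-base.py
```
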
15 | ## Pre-training details
16 |
17 | * Ukrainian Roberta was trained with the code provided in the [HuggingFace tutorial](https://huggingface.co/blog/how-to-train)
18 | * The currently released model follows the roberta-base-cased architecture (12 layers, 768 hidden units, 12 attention heads, 125M parameters)
19 | * The model was trained on 4xV100 GPUs for 85 hours
20 | * The training configuration can be found in the config file (`configs/ukr/train_model/ukr-roberta-base.py`); see the example command below
21 |
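To reproduce the training run, pass the model config to `run.py` in the same way. A sketch, assuming the concatenated corpus is available at `data/ukr/aggregated_data/ukr-roberta-base/data.txt` (the path expected by the config):

```bash
# checkpoints are written to outputs/ukr/train_model/ukr-roberta-base/temp;
# the final model and tokenizer are saved to .../model and .../tokenizer
python run.py --task configs/ukr/train_model/ukr-roberta-base.py
```
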
22 | ## Evaluation
23 | ukr-roberta-base was tested on internal YouScan tasks, where it showed a 2 percent improvement in F-score compared to [mBERT](https://huggingface.co/bert-base-multilingual-cased).
24 |
25 | ## Usage
26 | Ukrainian Roberta is released via the [HuggingFace Transformers library](https://huggingface.co/transformers/).
27 |
28 | ```python
29 | from transformers import pipeline, RobertaForMaskedLM, RobertaTokenizer
30 |
31 | model = RobertaForMaskedLM.from_pretrained("youscan/ukr-roberta-base")
32 | tokenizer = RobertaTokenizer.from_pretrained("youscan/ukr-roberta-base")
33 |
34 | fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
35 | fill_mask("Тарас Шевченко – великий українсьский <mask>.")
36 | # [{'sequence': '<s> Тарас Шевченко – великий українсьский поет.</s>',
37 | # 'score': 0.48607954382896423,
38 | # 'token': 11426},
39 | # {'sequence': '<s> Тарас Шевченко – великий українсьский письменник.</s>',
40 | # 'score': 0.23330871760845184,
41 | # 'token': 10121},
42 | # {'sequence': '<s> Тарас Шевченко – великий українсьский художник.</s>',
43 | # 'score': 0.06583040952682495,
44 | # 'token': 12836},
45 | # {'sequence': '<s> Тарас Шевченко – великий українсьский князь.</s>',
46 | # 'score': 0.021497823297977448,
47 | # 'token': 17247},
48 | # {'sequence': '<s> Тарас Шевченко – великий українсьский народ.</s>',
49 | # 'score': 0.020411811769008636,
50 | # 'token': 1021}]
51 | ```
52 |
53 | ## Author
54 | Vitalii Radchenko - contact me on Twitter [@vitaliradchenko](https://twitter.com/vitaliradchenko)
55 |
--------------------------------------------------------------------------------
/src/language_model/runner.py:
--------------------------------------------------------------------------------
1 | import os
2 | from importlib import import_module
3 | from logging import INFO, FileHandler, Formatter, Logger, StreamHandler, getLogger
4 | from typing import Sequence
5 |
6 | from .pipeline import ITask, TaskRunner
7 |
8 | LOGGING_FORMAT: str = "%(asctime)s : %(levelname)s : %(module)s : %(message)s"
9 | DEFAULT_LOG_DIR: str = "logs"
10 | DEFAULT_LOG_FILE: str = "log.txt"
11 | DEFAULT_CONFIGURATION_DIR: str = "configs"
12 | TASK_FIELD_NAME: str = "task"
13 |
14 |
15 | class SandboxRunner(TaskRunner):
16 | def __init__(self, config_path: str, sandbox_root_path: str = "outputs") -> None:
17 | self.config_path = config_path
18 | self.sandbox_root_path = sandbox_root_path
19 |
20 | def get_root_folder_path(self) -> str:
21 | return self.sandbox_root_path
22 |
23 | def run(self) -> None:
24 | experiment_ids = identifiers_from_config_file(self.config_path)
25 | module_name = ".".join(experiment_ids)
26 | module = import_module(module_name)
27 | task: ITask = getattr(module, TASK_FIELD_NAME)
28 |
29 | pure_experiment_ids = drop_configuration_dir(experiment_ids=experiment_ids)
30 | experiment_sandbox_path = os.path.join(*pure_experiment_ids)
31 | sandbox_folder_path = os.path.join(self.get_root_folder_path(), experiment_sandbox_path)
32 |
33 | logger = init_logger(experiment_ids, overwrite=True)
34 | logger.info(f"Running task from {self.config_path}")
35 | if not os.path.exists(sandbox_folder_path) or not os.path.isdir(sandbox_folder_path):
36 | os.makedirs(sandbox_folder_path)
37 | task.execute(sandbox_folder_path)
38 |
39 |
40 | def identifiers_from_config_file(filepath: str) -> Sequence[str]:
41 | path = os.path.normpath(filepath)
42 | path_components = path.split(os.sep)
43 | path_components[-1] = os.path.splitext(path_components[-1])[0]
44 | return path_components
45 |
46 |
47 | def drop_configuration_dir(experiment_ids: Sequence[str]) -> Sequence[str]:
48 |     # list.index raises ValueError when the item is absent, so check membership first
49 |     if DEFAULT_CONFIGURATION_DIR in experiment_ids[:-1]:
50 |         experiment_ids = experiment_ids[experiment_ids.index(DEFAULT_CONFIGURATION_DIR) + 1 :]
51 |     return experiment_ids
52 |
53 |
54 | def init_logger(experiment_identifiers: Sequence[str], overwrite: bool = True, log_to_stderr: bool = False) -> Logger:
55 | path_components = [DEFAULT_LOG_DIR] + list(experiment_identifiers)
56 | log_path = os.path.join(*path_components)
57 |
58 | logger = getLogger()
59 | logger.setLevel(INFO)
60 |
61 | logging_path = os.path.join(log_path)
62 | if not os.path.exists(logging_path):
63 | os.makedirs(logging_path)
64 |
65 | formatter = Formatter(LOGGING_FORMAT)
66 |
67 | fh = FileHandler(os.path.join(logging_path, DEFAULT_LOG_FILE), mode="w" if overwrite else "a", encoding="utf-8")
68 | fh.setFormatter(formatter)
69 | logger.addHandler(fh)
70 |
71 | if log_to_stderr:
72 | sh = StreamHandler()
73 | sh.setFormatter(formatter)
74 | logger.addHandler(sh)
75 |
76 | return logger
77 |
--------------------------------------------------------------------------------