├── data ├── readme.md ├── wiki_model │ └── vocab │ │ └── wikipedia_upper_voc_32000_sen10000000.model ├── wiki_books_oscar │ └── vocab │ │ ├── books_wikipedia_v32k_sen10M.spm.bpe.model │ │ ├── books_wikipedia_v50k_sen10M.spm.bpe.model │ │ ├── books_wikipedia_lower_v32k_sen10M.spm.bpe.model │ │ └── books_wikipedia_lower_v50k_sen10M.spm.bpe.model └── wolne_lektury_non_polish_isbn.txt ├── images └── ermlab_software.png ├── libs ├── morfeusz2_1.9.14-18.04_amd64.deb ├── morfeusz2-0.4.0-py3.6-Linux-amd64.egg └── readme.md ├── Pipfile ├── LICENSE ├── gen_lines.py ├── aws_configuration.md ├── .gitignore ├── process_sentences.py ├── fill_mask_task.py ├── polish_roberta_vocab.ipynb ├── README.md ├── polish_process_data.ipynb ├── polish_roberta_training.ipynb ├── playground_taggers.py └── text_utils.py /data/readme.md: -------------------------------------------------------------------------------- 1 | Folder for data. -------------------------------------------------------------------------------- /images/ermlab_software.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/images/ermlab_software.png -------------------------------------------------------------------------------- /libs/morfeusz2_1.9.14-18.04_amd64.deb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/libs/morfeusz2_1.9.14-18.04_amd64.deb -------------------------------------------------------------------------------- /libs/morfeusz2-0.4.0-py3.6-Linux-amd64.egg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/libs/morfeusz2-0.4.0-py3.6-Linux-amd64.egg -------------------------------------------------------------------------------- /data/wiki_model/vocab/wikipedia_upper_voc_32000_sen10000000.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/data/wiki_model/vocab/wikipedia_upper_voc_32000_sen10000000.model -------------------------------------------------------------------------------- /data/wiki_books_oscar/vocab/books_wikipedia_v32k_sen10M.spm.bpe.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/data/wiki_books_oscar/vocab/books_wikipedia_v32k_sen10M.spm.bpe.model -------------------------------------------------------------------------------- /data/wiki_books_oscar/vocab/books_wikipedia_v50k_sen10M.spm.bpe.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/data/wiki_books_oscar/vocab/books_wikipedia_v50k_sen10M.spm.bpe.model -------------------------------------------------------------------------------- /data/wiki_books_oscar/vocab/books_wikipedia_lower_v32k_sen10M.spm.bpe.model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/data/wiki_books_oscar/vocab/books_wikipedia_lower_v32k_sen10M.spm.bpe.model -------------------------------------------------------------------------------- /data/wiki_books_oscar/vocab/books_wikipedia_lower_v50k_sen10M.spm.bpe.model: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Ermlab/PoLitBert/HEAD/data/wiki_books_oscar/vocab/books_wikipedia_lower_v50k_sen10M.spm.bpe.model -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | jupyter = "*" 8 | ipykernel = "*" 9 | pylint = "*" 10 | tensorboardx = "*" 11 | tensorboard = "*" 12 | pandas = "*" 13 | matplotlib = "*" 14 | black = "*" 15 | autopep8 = "*" 16 | torch = "*" 17 | stanza = "*" 18 | spacy = "*" 19 | langdetect = "*" 20 | 21 | [packages] 22 | sentencepiece = "*" 23 | nltk = "*" 24 | tqdm = "*" 25 | fairseq = "*" 26 | langdetect = "*" 27 | polyglot = "*" 28 | pycld2 = "*" 29 | pyicu = "*" 30 | 31 | [requires] 32 | python_version = "3.7" 33 | 34 | [pipenv] 35 | allow_prereleases = true 36 | -------------------------------------------------------------------------------- /libs/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Dodatkowe biblioteki: 4 | 5 | * pl_spacy_model_morfeusz_big-0.1.0.tar.gz 6 | * pl_spacy_model_morfeusz-0.1.0.tar.gz 7 | * pl_spacy_model-0.1.0.tar.gz 8 | 9 | 10 | 11 | ## Morfeusz installation instructions 12 | 13 | 14 | 15 | * http://morfeusz.sgjp.pl/download/ 16 | 17 | 18 | ``` 19 | wget -O - http://download.sgjp.pl/apt/sgjp.gpg.key | sudo apt-key add - 20 | sudo apt-add-repository http://download.sgjp.pl/apt/ubuntu 21 | sudo apt update 22 | 23 | sudo apt install morfeusz2 24 | 25 | wget http://download.sgjp.pl/morfeusz/20200510/Linux/18.04/64/morfeusz2-0.4.0-py3.6-Linux-amd64.egg 26 | easy_install ./morfeusz2-0.4.0-py3.6-Linux-amd64.egg 27 | ``` -------------------------------------------------------------------------------- /data/wolne_lektury_non_polish_isbn.txt: -------------------------------------------------------------------------------- 1 | 978-83-288-3507-8 2 | 978-83-288-3477-4 3 | 978-83-288-3475-0 4 | 978-83-288-3474-3 5 | 978-83-288-3439-2 6 | 978-83-288-3357-9 7 | 978-83-288-3354-8 8 | 978-83-288-3349-4 9 | 978-83-288-3322-7 10 | 978-83-288-3305-0 11 | 978-83-288-3302-9 12 | 978-83-288-3301-2 13 | 978-83-288-3299-2 14 | 978-83-288-3298-5 15 | 978-83-288-3295-4 16 | 978-83-288-4000-3 17 | 978-83-288-3223-7 18 | 978-83-288-3221-3 19 | 978-83-288-3181-0 20 | 978-83-288-3077-6 21 | 978-83-288-3076-9 22 | 978-83-288-3859-8 23 | 978-83-288-3854-3 24 | 978-83-288-3846-8 25 | 978-83-288-3845-1 26 | 978-83-288-3844-4 27 | 978-83-288-3843-7 28 | 978-83-288-3842-0 29 | 978-83-288-3841-3 30 | 978-83-288-3815-4 31 | 978-83-288-3814-7 32 | 978-83-288-4032-4 33 | 978-83-288-3806-2 34 | 978-83-288-3777-5 35 | 978-83-288-3779-9 36 | 978-83-288-3771-3 37 | 978-83-288-3770-6 38 | 978-83-288-3760-7 39 | 978-83-288-3758-4 40 | 978-83-288-3683-9 41 | 978-83-288-3682-2 42 | 978-83-288-3681-5 43 | 978-83-288-4027-0 44 | 978-83-288-4026-3 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Ermlab Software 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /gen_lines.py: -------------------------------------------------------------------------------- 1 | 2 | #%% 3 | import sys 4 | import datetime as dt 5 | import os 6 | from pathlib import Path 7 | import re 8 | from tqdm import tqdm 9 | #from tqdm.notebook import tqdm 10 | import mmap 11 | from text_utils import get_num_lines 12 | 13 | 14 | #%% 15 | 16 | files_to_proces = [ 17 | './data/corpus_wikipedia_2020-02-01.txt', 18 | './data/corpus_books_2020_02_24.txt' 19 | ] 20 | files_to_proces = [ 21 | './data/corpus_wikipedia_2020-02-01.txt', 22 | ] 23 | files_to_proces = [ 24 | './data/corpus_books_2020_02_24_fix.txt' 25 | ] 26 | 27 | for input_file in files_to_proces: 28 | print(input_file) 29 | p = Path(input_file) 30 | output_path = f"{p.with_suffix('')}_lines.txt" 31 | 32 | print(f"in file={input_file}\nout file={output_path}") 33 | 34 | t0=dt.datetime.now() 35 | 36 | total_lines = get_num_lines(input_file) 37 | 38 | text='' 39 | with open(output_path, 'w+') as output_file: 40 | with open(input_file) as f: 41 | i=0 42 | for line in tqdm(f,total=total_lines): 43 | 44 | 45 | #check if line is valid sentence, if not remove it 46 | 47 | 48 | 49 | # get block of text to new line which splits ariticles 50 | text+=line 51 | i+=1 52 | if line.strip() == '' or i%1000==0: 53 | 54 | sentences = sentence_tokenizer.tokenize(text) 55 | 56 | file_content = '' 57 | for sentence in sentences: 58 | file_content += sentence.strip() 59 | file_content+='\n' 60 | output_file.write(file_content) 61 | 62 | output_file.write('\n') 63 | text='' 64 | 65 | 66 | t1=dt.datetime.now() 67 | print(f'Split lines done, takes={t1-t0}') 68 | 69 | -------------------------------------------------------------------------------- /aws_configuration.md: -------------------------------------------------------------------------------- 1 | 2 | ## Base image 3 | 4 | 5 | [Ubuntu 18 Deep learning Base](https://aws.amazon.com/marketplace/pp/Amazon-Web-Services-Deep-Learning-Base-AMI-Amazon-/B077GFM7L7) 6 | 7 | * you do not have to remove conda :) 8 | * install all libraries on your own 9 | 10 | 11 | ## AWS p3 prepare 12 | 13 | Remove conda 14 | 15 | ``` 16 | rm -rf /home/ubuntu/anaconda3 17 | ``` 18 | 19 | install sqllite 20 | 21 | ``` 22 | sudo apt install libsqlite3-dev 23 | sudo apt-get install libffi-dev 24 | ``` 25 | 26 | Set cuda version 27 | 28 | ``` 29 | $ sudo rm /usr/local/cuda 30 | sudo ln -s /usr/local/cuda-10.2 /usr/local/cuda 31 | 32 | 33 | #check if it works 34 | $ cd /usr/local/cuda/samples/1_Utilities/deviceQuery 35 | sudo make 36 | ./deviceQuery 37 | 38 | ``` 39 | 40 | intall pyenv 41 | 42 | ``` 43 | curl https://pyenv.run | bash 44 | ``` 45 | 46 | install python 
3.7.3 47 | 48 | ``` 49 | pyenv install 3.7.3 50 | 51 | python --version 52 | ``` 53 | 54 | Add pyenv to .basrc 55 | ``` 56 | source ~/.bashrc 57 | ``` 58 | 59 | Set global python version to 3.7.3 60 | 61 | ``` 62 | pyenv global 3.7.3 63 | ``` 64 | 65 | 66 | Install pipenv - check 67 | 68 | ``` 69 | pip install --user pipenv 70 | ``` 71 | Add to path .profile 72 | ``` 73 | source ~/.profile 74 | ``` 75 | 76 | 77 | 78 | clone the herbert repo 79 | 80 | ``` 81 | git clone https://github.com/Ermlab/PoLitBert.git 82 | git checkout dev 83 | ``` 84 | 85 | Install dependecies 86 | 87 | ``` 88 | cd PoLitBert 89 | pipenv install 90 | ``` 91 | 92 | Install NVIDIA apex 93 | 94 | ``` 95 | cd herbert 96 | pipenv shell 97 | 98 | cd /libs/ 99 | git clone https://github.com/NVIDIA/apex 100 | cd apex 101 | CUDA_HOME=/usr/local/cuda/ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ 102 | ``` 103 | 104 | run trainning 105 | 106 | ``` 107 | fairseq-train .... 108 | ``` 109 | 110 | run tensorboard 111 | 112 | ``` 113 | tensorboard --logdir $LOGS_DIR 114 | ``` 115 | 116 | 117 | 118 | Tunell ssh to Tensorboard 119 | ``` 120 | ssh -A -t -i ~/.ssh/aws_key ubuntu@ubuntu@ec2-54-154-227-149.eu-west-1.compute.amazonaws.com -L 6008:localhost:6006 121 | ``` 122 | 123 | Copy checkpoint 124 | 125 | ``` 126 | scp -i ~/.ssh/aws_key ubuntu@ec2-54-229-85-20.eu-west-1.compute.amazonaws.com:/mnt/efs/fs1/bert_model/checkpoints/wiki_model/checkpoint127.pt ./ 127 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | */gw_nas/* 132 | **/data/corpus_raw/* 133 | data/wiki_dump/* 134 | -------------------------------------------------------------------------------- /process_sentences.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import text_utils as tu 3 | 4 | from pathlib import Path 5 | from tqdm import tqdm 6 | import datetime as dt 7 | from collections import namedtuple 8 | 9 | import argparse 10 | 11 | 12 | #%% 13 | 14 | 15 | parser = argparse.ArgumentParser( 16 | description='Process raw corpus txt file, split to lines, check is sentence is valid, remove to short and to long sentences and save to file with suffix "_lines"' 17 | ) 18 | parser.add_argument("corpus_file", type=str, help="corpus txt raw input file") 19 | 20 | 21 | parser.add_argument( 22 | "-sp", 23 | "--split_each_line_as_doc", 24 | action="store_true", 25 | help="If true each line from corpus file will be treated as document, new line will be added after last sentence from this line", 26 | default=False, 27 | ) 28 | 29 | parser.add_argument( 30 | "-vs", 31 | "--check_valid_sentence", 32 | action="store_true", 33 | default=False, 34 | help="check if extracted sentence is valid polish sentence, if not do not save it in output file", 35 | ) 36 | 37 | 38 | parser.add_argument( 39 | "-ls", 40 | "--check_lang_sentence", 41 | action="store_true", 42 | default=False, 43 | help="check if extracted sentence is in polish, remove sentences in other lang, do not save it in output file", 44 | ) 45 | 46 | parser.add_argument( 47 | "-ml", 48 | "--max_sentence_length", 49 | type=int, 50 | default=700, 51 | help="remove longer(in chars) sentences", 52 | ) 53 | 54 | parser.add_argument( 55 | "-u", 56 | "--krnnt_pos_url", 57 | type=str, 58 | default="http://localhost:9003", 59 | help="KRNNT pos tagger docker url", 60 | ) 61 | 62 | 63 | args = parser.parse_args() 64 | 65 | 66 | #%% 67 | 68 | corpus_oscar_raw = args.corpus_file 69 | 70 | 71 | p = Path(corpus_oscar_raw) 72 | corpus_oscar_lines = f"{p.with_suffix('')}_lines.txt" 73 | 74 | print(f"Start preparing corpus") 75 | print(f"in file={corpus_oscar_raw}\nout file={corpus_oscar_lines}") 76 | start = dt.datetime.now() 77 | print(f"Start time: {start}") 78 | 79 | 80 | stats, vl, pl = tu.corpus_process_sentence( 81 | corpus_oscar_raw, 82 | corpus_oscar_lines, 83 | split_each_line_as_doc=args.split_each_line_as_doc, 84 | check_valid_sentence=args.check_valid_sentence, 85 | check_lang_sentence=args.check_lang_sentence, 86 | max_sentence_length=args.max_sentence_length, 87 | krnnt_url=args.krnnt_pos_url, 88 | ) 89 | 90 | end = dt.datetime.now() 91 | print(f"Finish. 
End time: {end} Start time: {start} took={end-start}") 92 | 93 | from pprint import pprint 94 | 95 | print(f"Cleaning stats") 96 | pprint(stats) 97 | 98 | #%% 99 | -------------------------------------------------------------------------------- /fill_mask_task.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import os 3 | 4 | from fairseq.models.roberta import RobertaModel, RobertaHubInterface 5 | from fairseq import hub_utils 6 | 7 | 8 | # OPI model 9 | root_path = "./data/external_models/opi/" 10 | model_path = os.path.join(root_path) 11 | checkpoint_file = "checkpoint_best.pt" 12 | 13 | vocab_model_file="sentencepiece.model" 14 | vocab_path = os.path.join(root_path,vocab_model_file) 15 | 16 | 17 | #%% Ermlab model 18 | root_path = "./data/wiki_model/" 19 | model_path = os.path.join(root_path,"checkpoints/") 20 | checkpoint_file = "checkpoint77.pt" 21 | checkpoint_file = "checkpoint94.pt" 22 | checkpoint_file = "checkpoint127.pt" 23 | checkpoint_file = "checkpoint_best.pt" 24 | 25 | 26 | vocab_model_file="wikipedia_upper_voc_32000_sen10000000.model" 27 | vocab_path = os.path.join(root_path, "vocab", vocab_model_file) 28 | #%% 29 | 30 | loaded = hub_utils.from_pretrained( 31 | model_name_or_path=model_path, 32 | checkpoint_file=checkpoint_file, 33 | data_name_or_path='./', 34 | bpe="sentencepiece", 35 | sentencepiece_vocab=vocab_path, 36 | load_checkpoint_heads=True, 37 | archive_map=RobertaModel.hub_models(), 38 | cpu=True 39 | ) 40 | roberta = RobertaHubInterface(loaded['args'], loaded['task'], loaded['models'][0]) 41 | roberta.eval() 42 | 43 | #%% 44 | 45 | def print_mask(s, predicted): 46 | 47 | print(s) 48 | for p in predicted: 49 | print(f'\t{p[2]} - {p[0]} - confidence {p[1]}') 50 | 51 | sentences = [ 52 | 'Bolesław Bierut objął rządy w roku.', #1948 53 | 'Największym we współczesnym świecie jest głód.', 54 | 'Wikipedia powstała jako projekt uzupełniający dla , darmowej encyklopedii internetowej', #Nupedii 55 | 'W Olsztynie pracował Mikołaj Kopernik, ten który ziemię a wstrzymał słońce.', 56 | 'Krzysztof do sklepu i zrobił zakupy na śniadanie.', 57 | 'Anna do sklepu i zrobiła zakupy na śniadanie.', 58 | 'Idąć do szkoły, potrącony przez rowerzystę.', 59 | 'Nie lubił zupy , ale musiał ją zjeść ', 60 | 'Nagle pojawili się jego , z którymi nie rozmawiał już od dłuższego czasu', 61 | 'Na śniadanie zjadł kanapkę z i sałatą', 62 | ' linie lotnicze wstrzymały loty do Rosji', 63 | 'Nic nie powiedział, wstał i wyszedł z domu ' 64 | 65 | 66 | ] 67 | 68 | for s in sentences: 69 | topk_tokens = roberta.fill_mask(s, topk=5) 70 | print_mask(s, topk_tokens) 71 | 72 | 73 | # %% 74 | from transformers import * 75 | model = BertForMaskedLM.from_pretrained("dkleczek/bert-base-polish-uncased-v1") 76 | tokenizer = BertTokenizer.from_pretrained("dkleczek/bert-base-polish-uncased-v1") 77 | nlp = pipeline('fill-mask', model=model, tokenizer=tokenizer) 78 | 79 | #%% 80 | 81 | for pred in nlp(f"Adam Mickiewicz wielkim polskim {nlp.tokenizer.mask_token} był."): 82 | print(pred) 83 | 84 | for s in sentences: 85 | s = s.replace('', '[MASK]') 86 | pred = nlp(s) 87 | print(f'{s} \n') 88 | for p in pred: 89 | print(f'{p}') 90 | 91 | # %% 92 | -------------------------------------------------------------------------------- /polish_roberta_vocab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PoLitBert - Polish RoBERT'a model \n", 8 
| "\n", 9 | "## Preparation of vocabulary and encoding the data\n", 10 | "\n", 11 | "Used corpuses:\n", 12 | "* Wikipedia, Link: \n", 13 | "* Oscar\n", 14 | "* Polish Books" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Usefull resources\n", 22 | "* https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.pretraining.md\n", 23 | "* https://github.com/musixmatchresearch/umberto/issues/2" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "outputs": [], 30 | "source": [ 31 | "import csv\n", 32 | "import sys\n", 33 | "import datetime as dt\n", 34 | "import os\n", 35 | "from pathlib import Path\n", 36 | "import re\n", 37 | "\n", 38 | "from tqdm import tqdm\n", 39 | "\n", 40 | "import mmap" 41 | ], 42 | "metadata": { 43 | "collapsed": false, 44 | "pycharm": { 45 | "name": "#%%\n" 46 | } 47 | } 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Create vocabulary\n", 54 | "\n", 55 | "### Prepare data for vocab\n", 56 | "\n", 57 | "Separate text file for training vocabulary has been created with one sentence per line.\n", 58 | "We used polish sentence tokenizer with [additional abbreviations](https://gist.github.com/ksopyla/f05fe2f48bbc9de895368b8a7863b5c3)\n", 59 | "typical for the Polish language.\n", 60 | "Sentencepiece model is capable of handling around 12.000.000 sentences, so larger files are not necessary." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "### Train the BPE vocabulary model\n", 68 | "\n", 69 | "We used the [SentencePiece](https://github.com/google/sentencepiece) segmentation model trained from raw\n", 70 | "sentences with fixed final vocabulary size - 32K and 50K unique tokens.\n", 71 | "\n", 72 | "Training and segmentation can be done in two ways:\n", 73 | "- as a python module,\n", 74 | "- as a command-line tool.\n", 75 | "\n", 76 | "To use it as a command-line it should be installed from source, which is described in the\n", 77 | "[build the C++ version from source](https://github.com/google/sentencepiece#c-from-source) section of the documentation.\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "#### Training SentencePiece vocab using command line\n", 85 | "\n", 86 | "* 32k vocab:\n", 87 | "```\n", 88 | "spm_train \\\n", 89 | " --input=./data/corpus_raw/corpus_books_wiki_12M_lines.txt \\\n", 90 | " --max_sentence_length=4192\\\n", 91 | " --model_prefix=./data/vocab/books_wikipedia_v32k_sen10M.spm.bpe \\\n", 92 | " --vocab_size=32000 \\\n", 93 | " --model_type=bpe \\\n", 94 | " --shuffle_input_sentence=true \\\n", 95 | " --input_sentence_size=10000000 \\\n", 96 | " --bos_id=0 --eos_id=1 --pad_id=2 --unk_id=3\n", 97 | "```\n", 98 | "\n", 99 | "* 50k vocab:\n", 100 | "```\n", 101 | "spm_train \\\n", 102 | " --input=./data/corpus_raw/corpus_books_wiki_12M_lines.txt \\\n", 103 | " --max_sentence_length=4192\\\n", 104 | " --model_prefix=./data/vocab/books_wikipedia_v50k_sen10M.spm.bpe \\\n", 105 | " --vocab_size=50000 \\\n", 106 | " --model_type=bpe \\\n", 107 | " --shuffle_input_sentence=true \\\n", 108 | " --input_sentence_size=10000000 \\\n", 109 | " --bos_id=0 --eos_id=1 --pad_id=2 --unk_id=3\n", 110 | "```" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": { 116 | "pycharm": { 117 | "name": "#%% md\n" 118 | } 119 | }, 120 | "source": [ 121 | "#### Training SentencePiece vocab with Python module\n", 122 | "\n", 123 | 
"Below, for reference, an example of how to prepare a SP model if Python script is preferred." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "outputs": [], 130 | "source": [ 131 | "import sentencepiece as spm\n", 132 | "\n", 133 | "vocab_size = 32000\n", 134 | "model_type = \"bpe\" \n", 135 | "iss = 10_000_000\n", 136 | "\n", 137 | "data_file = './data/corpus_raw/corpus_books_wiki_12M_lines.txt'\n", 138 | "\n", 139 | "tok_model = f\"books_wikipedia_v32k_sen10M\"\n", 140 | "tok_model = os.path.abspath(f\"./data/vocab/{tok_model}\")\n", 141 | "\n", 142 | "piece_options = ' --bos_id=0 --eos_id=1 --pad_id=2 --unk_id=3 --shuffle_input_sentence=true'\n", 143 | "\n", 144 | "cmd = f\"--input={data_file} --model_prefix={tok_model} --num_threads=4 --vocab_size={vocab_size} --input_sentence_size={iss}\" + piece_options\n", 145 | "print(cmd)\n", 146 | "\n", 147 | "start = dt.datetime.now()\n", 148 | "print(start)\n", 149 | "spm.SentencePieceTrainer.train(cmd)\n", 150 | "end = dt.datetime.now()\n", 151 | "\n", 152 | "print(f\"Created vocab of {vocab_size} tokens from {data_file}, took {end-start}.\")" 153 | ], 154 | "metadata": { 155 | "collapsed": false, 156 | "pycharm": { 157 | "name": "#%%\n" 158 | } 159 | } 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": "32000\n['▁Będąc', '▁młodym', '▁programi', 'stą', '▁(', 'ho', 'ho', '),', '▁czy', 't', 'ałem', '▁\"', 'D', 'zia', 'dy', '\"', '▁w', '▁1983', 'r', '.']\n" 170 | } 171 | ], 172 | "source": [ 173 | "# Example segmentation usage:\n", 174 | "\n", 175 | "# make segmenter instance and load the model file (m.model)\n", 176 | "sp = spm.SentencePieceProcessor()\n", 177 | "sp.load(f\"{tok_model}.model\")\n", 178 | "\n", 179 | "# verify vocab size\n", 180 | "print(sp.get_piece_size())\n", 181 | "\n", 182 | "# encode: text => id\n", 183 | "text = \"\"\"Będąc młodym programistą (hoho), czytałem \"Dziady\" w 1983r.\"\"\"\n", 184 | "print(sp.encode_as_pieces(text))" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "### Fairseq vocab\n", 192 | "\n", 193 | "Usage of sentencepiece the model's with fairseq requires changing the separator used in the dictionary.\n", 194 | "All _\\t_ characters should be replaced with _whitespace_ in the vocab file." 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "for vocab_size in (\"32k\", \"50k\"):\n", 204 | " vocab_file = f\"./data/vocab/books_wikipedia_v{vocab_size}_sen10M.spm.bpe.vocab\"\n", 205 | "\n", 206 | " p = Path(vocab_file)\n", 207 | "\n", 208 | " output_path = f\"{p.with_suffix('')}_fair.vocab\"\n", 209 | " with open(output_path, 'w+') as output_file:\n", 210 | " with open(vocab_file) as f:\n", 211 | "\n", 212 | " text = f.read().replace('\\t', ' ')\n", 213 | " output_file.write(text)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Encode data with sentence piece model\n", 221 | "\n", 222 | "Encoding prepared training and test datasets with SentencePiece tokenizer. 
Both, for 32k and 50k vocabularies.\n", 223 | "\n", 224 | "* 32k vocab:\n", 225 | "\n", 226 | "```\n", 227 | "DATA_PATH=./data/wiki_books_oscar/\n", 228 | "VOCAB_SIZE=32k\n", 229 | "\n", 230 | "for SPLIT in test train ; do \\\n", 231 | " spm_encode \\\n", 232 | " --model=./data/vocab/books_wikipedia_v${VOCAB_SIZE}_sen10M.spm.bpe.model \\\n", 233 | " --extra_options=bos:eos \\\n", 234 | " --output_format=piece \\\n", 235 | " < ${DATA_PATH}corpus_wiki_books_oscar_${SPLIT}.txt \\\n", 236 | " > ${DATA_PATH}corpus_wiki_books_oscar_${SPLIT}_${VOCAB_SIZE}.txt.bpe\n", 237 | "done\n", 238 | "```\n", 239 | "\n", 240 | "* 50k vocab:\n", 241 | "\n", 242 | "```\n", 243 | "DATA_PATH=./data/wiki_books_oscar/\n", 244 | "VOCAB_SIZE=50k\n", 245 | "\n", 246 | "for SPLIT in test train ; do \\\n", 247 | " spm_encode \\\n", 248 | " --model=./data/vocab/books_wikipedia_v${VOCAB_SIZE}_sen10M.spm.bpe.model \\\n", 249 | " --extra_options=bos:eos \\\n", 250 | " --output_format=piece \\\n", 251 | " < ${DATA_PATH}corpus_wiki_books_oscar_${SPLIT}.txt \\\n", 252 | " > ${DATA_PATH}corpus_wiki_books_oscar_${SPLIT}_${VOCAB_SIZE}.txt.bpe\n", 253 | "done\n", 254 | "```" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": { 260 | "pycharm": { 261 | "name": "#%% md\n" 262 | } 263 | }, 264 | "source": [ 265 | "## Data binarization with Fairseq\n", 266 | "\n", 267 | "### Fairseq-preprocessing bpe encoded and splited data\n", 268 | "\n", 269 | "* Data processed with 32k vocab:\n", 270 | "\n", 271 | "```\n", 272 | "DATA_PATH=./data/wiki_books_oscar/\n", 273 | "VOCAB_SIZE=32k\n", 274 | "\n", 275 | "fairseq-preprocess \\\n", 276 | " --only-source \\\n", 277 | " --srcdict ./vocab/books_wikipedia_v${VOCAB_SIZE}_sen10M.spm.bpe_fair.vocab \\\n", 278 | " --trainpref ${DATA_PATH}corpus_wiki_books_oscar_train_vocab${VOCAB_SIZE}.txt.bpe \\\n", 279 | " --validpref ${DATA_PATH}corpus_wiki_books_oscar_test_vocab${VOCAB_SIZE}.txt.bpe \\\n", 280 | " --destdir ${DATA_PATH}vocab${VOCAB_SIZE} \\\n", 281 | " --workers 8\n", 282 | "```\n", 283 | "\n", 284 | "* Data processed with 50k vocab:\n", 285 | "\n", 286 | "```\n", 287 | "DATA_PATH=./data/wiki_books_oscar/\n", 288 | "VOCAB_SIZE=50k\n", 289 | "\n", 290 | "fairseq-preprocess \\\n", 291 | " --only-source \\\n", 292 | " --srcdict ./vocab/books_wikipedia_v${VOCAB_SIZE}_sen10M.spm.bpe_fair.vocab \\\n", 293 | " --trainpref ${DATA_PATH}corpus_wiki_books_oscar_train_vocab${VOCAB_SIZE}.txt.bpe \\\n", 294 | " --validpref ${DATA_PATH}corpus_wiki_books_oscar_test_vocab${VOCAB_SIZE}.txt.bpe \\\n", 295 | " --destdir ${DATA_PATH}vocab${VOCAB_SIZE} \\\n", 296 | " --workers 8\n", 297 | "```" 298 | ] 299 | } 300 | ], 301 | "metadata": { 302 | "language_info": { 303 | "name": "python", 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "version": "3.7.3-final" 309 | }, 310 | "orig_nbformat": 2, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "npconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": 3, 317 | "kernelspec": { 318 | "name": "python37364bitherbertpipenvf409fddaf3f446fd8dcf7490c441f6bd", 319 | "display_name": "Python 3.7.3 64-bit ('herbert': pipenv)" 320 | } 321 | }, 322 | "nbformat": 4, 323 | "nbformat_minor": 2 324 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PoLitBert - Polish RoBERTa model 2 | 3 | Polish RoBERTa model trained on 
Polish Wikipedia, Polish literature and the Oscar corpus. 4 | Our major assumption is that good-quality text will give a good model. 5 | 6 | We believe in open science and knowledge sharing, thus we decided to share complete code, params, experiment details and tensorboards. 7 | 8 | ## Table of Contents 9 | - [Experiments setup and goals](#experiments-setup-and-goals) 10 | - [Data](#data) 11 | - [Data processing for training](#data-processing-for-training) 12 | - [Training, testing dataset stats](#training-testing-dataset-stats) 13 | - [Training Polish RoBERTa protocol with Fairseq](#training-polish-roberta-protocol-with-fairseq) 14 | - [Pretrained models and vocabs](#pretrained-models-and-vocabs) 15 | - [KLEJ evaluation](#klej-evaluation) 16 | - [Details of models training](#details-of-models-training) 17 | - [Used libraries](#used-libraries) 18 | - [Acknowledgements](#acknowledgements) 19 | - [About Ermlab Software](#about-ermlab-software) 20 | 21 | ## Experiments setup and goals 22 | 23 | During the experiments, we want to examine: 24 | 25 | * the impact of different learning-rate schedulers on training speed and accuracy; tested: 26 | * linear schedule with warmup 27 | * cyclic schedules: cosine, triangular 28 | * the impact of training time on final accuracy 29 | 30 | 31 | ## Data 32 | 33 | * Polish Wikipedia dump 03.2020 - archive link https://dumps.wikimedia.org/plwiki/20200301 (no longer available) 34 | * Polish private book corpus (6 GB) 35 | * Cleaned [Polish Oscar corpus](https://traces1.inria.fr/oscar/files/Compressed/pl_dedup.txt.gz) (non-Polish sentences removed, only valid sentences kept, etc.) ([Cleaned Polish Oscar details](https://github.com/Ermlab/PoLitBert/blob/master/polish_process_data.ipynb)) 36 | 37 | 38 | ### Data processing for training 39 | 40 | Our main assumption is that good-quality text should produce a good language model. 41 | So far the most popular Polish dataset has been the Polish Wikipedia dump; however, this text is characterized by formal language. 42 | The second source of text is the Polish part of the Oscar corpus - text crawled from the Polish internet. A closer investigation showed that it contains many foreign sentences (in Russian, English, German, etc.), sentences that are too short, and ungrammatical sentences (e.g. bare word enumerations). 43 | 44 | We prepared a few cleaning heuristics: 45 | 46 | * remove sentences shorter than a minimum length 47 | * remove non-Polish sentences 48 | * remove ungrammatical sentences (without verbs or with too many nouns) 49 | * perform sentence tokenization and save each sentence on a new line, with a blank line added after each document 50 | 51 | The data was cleaned with the [process_sentences.py](process_sentences.py) script; the whole process is presented in the [polish_process_data.ipynb](polish_process_data.ipynb) notebook.
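For orientation, here is a minimal sketch of how a single raw corpus file can be cleaned with `corpus_process_sentence` from [text_utils.py](text_utils.py) (the same call that [process_sentences.py](process_sentences.py) wraps). The paths below are placeholders, and the grammar check assumes a dockerized KRNNT tagger listening on the script's default URL:

```python
import text_utils as tu

# Placeholder paths - any raw corpus part with one document per line will do.
raw_corpus = "./data/corpus_raw/corpus_oscar_part.txt"
cleaned_corpus = "./data/corpus_raw/corpus_oscar_part_lines.txt"

# Same flags that process_sentences.py / polish_process_data.ipynb use for the Oscar parts;
# the call also returns two helper values (vl, pl), kept here exactly as in the repo scripts.
stats, vl, pl = tu.corpus_process_sentence(
    raw_corpus,
    cleaned_corpus,
    split_each_line_as_doc=True,   # treat every input line (one Oscar document) as a document
    check_valid_sentence=True,     # drop ungrammatical sentences (KRNNT-based check)
    check_lang_sentence=True,      # drop non-Polish sentences
    max_sentence_length=700,       # drop sentences longer than 700 characters
    krnnt_url="http://localhost:9003",
)
print(stats)                       # cleaning statistics, as printed by process_sentences.py
```

The cleaned corpus files used for PoLitBert are also available for download: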
52 | 53 | * Polish Wikipedia dump (03.2020) 54 | * [corpus_wikipedia_2020-03-01_all_lines.zip (0.58 GB)](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-wikipedia/corpus_wikipedia_2020-03-01_all_lines.zip) 55 | * Cleaned Polish Oscar corpus 56 | * [corpus_oscar_2020-04-10_32M_lines.zip (3.35 GB)](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_32M_lines.zip) 57 | * [corpus_oscar_2020-04-10_64M_lines.zip (3.45 GB)](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_64M_lines.zip) 58 | * [corpus_oscar_2020-04-10_96M_lines.zip (3.49 GB)](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_96M_lines.zip) 59 | * [corpus_oscar_2020-04-10_128M_lines.zip (3.53 GB)](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_128M_lines.zip) 60 | * [corpus_oscar_2020-04-10_128M_above_lines.zip (1.93 GB)](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/corpus-oscar/corpus_oscar_2020-04-10_128M_above_lines.zip) 61 | 62 | 63 | Summary of Cleaned Polish Oscar corpus 64 | 65 | 66 | | File | All lines | All sentences | Invalid length sent. | Non-polish sent. | Ungrammatical sent. | Valid sentences | 67 | |-------|-------------|-----------------|----------------------|-------------------|----------------------|---------------| 68 | | corpus_oscar_2020-04-10_32M_lines.txt | 32 000 506 | 94 332 394 | 1 796 371 | 296 093 | 8 100 750 | 84 139 180 | 69 | | corpus_oscar_2020-04-10_64M_lines.txt | 32 000 560 | 96 614 563 | 1 777 586 | 491 789 | 7 869 507 | 86 475 681 | 70 | | corpus_oscar_2020-04-10_96M_lines.txt | 32 001 738 | 96 457 553 | 1 796 083 | 302 598 | 7 908 090 | 86 450 782 | 71 | | corpus_oscar_2020-04-10_128M_lines.txt| 32 002 212 | 97 761 040 | 1 919 071 | 305 924 | 7 891 846 | 87 644 199 | 72 | | corpus_oscar_2020-04-10_128M_above_lines.txt|17 519 467| 53 446 884 |  1 090 714 | 212 657 | 4 343 296 | 47 800 217 | 73 | 74 | 75 | 76 | ### Training, testing dataset stats 77 | 78 | 79 | 80 | | Train Corpus | Lines | Words | Characters | 81 | |----------------------------|-------------|---------------|----------------| 82 | | Polish Wikipedia (2020-03) | 11 748 343 | 181 560 313 | 1 309 416 493 | 83 | | Books | 81 140 395 | 829 404 801 | 5 386 053 287 | 84 | | Oscar (32M part, cleared) | 112 466 497 | 1 198 735 834 | 8 454 177 161 | 85 | | Total | 205 355 235 | 2 209 700 948 | 15 149 646 941 | 86 | 87 | 88 | For testing we take ~10% of each corpus 89 | 90 | | Test Corpus | Lines | Words | Characters | 91 | |----------------------------|------------|-------------|---------------| 92 | | Polish Wikipedia (2020-03) | 1 305 207 | 21 333 280 | 155 403 453 | 93 | | Books | 9 007 716 | 93 141 853 | 610 111 989 | 94 | | Oscar (32M part, cleared) | 14 515 735 | 157 303 490 | 1 104 855 397 | 95 | | Total | 24 828 658 | 271 778 623 | 1 870 370 839 | 96 | 97 | 98 | 99 | ## Training Polish RoBERTA protocol with Fairseq 100 | 101 | 102 | General recipe of the final data preparation and model training process: 103 | 1. Prepare huge text file _data.txt_ e.g. Wikipedia text, where each sentence is in a new line and each article is separated by two new lines. 104 | 1. Take 10-15M lines and prepare another file for sentencepiece (vocabulary builder) - again, each sentence is in one line. 105 | 1. Train sentencepiece vocabulary and save it in fairseq format _vocab.fairseq.txt_. 106 | 1. Encode _data.txt_ with trained sentencepiece model to _data.sp.txt_. 107 | 1. 
Preprocess _data.sp.txt_ with [fairseq-preprocess](https://fairseq.readthedocs.io/en/latest/command_line_tools.html#fairseq-preprocess). 108 | 1. Run training. 109 | 110 | Detailed data preparation steps for fairseq (vocab gen and binarization) are available in separate notebook [polish_roberta_vocab.ipynb](polish_roberta_vocab.ipynb). 111 | 112 | Commands needed to reproduce fairseq models with various training protocols may be found in [polish_roberta_training.ipynb](polish_roberta_training.ipynb). 113 | 114 | ## Pretrained models and vocabs 115 | 116 | * [PoLitBert_v32k_linear_50k](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/models/PoLitBert_v32k_linear_50k.zip) 117 | * [PoLitBert_v32k_tri_50k](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/models/PoLitBert_v32k_tri_50k.zip) 118 | * [PoLitBert_v32k_cos1_2_50k](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/models/PoLitBert_v32k_cos1_2_50k.zip) 119 | * [PoLitBert_v32k_tri_125k](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/models/PoLitBert_v32k_tri_125k.zip) 120 | * [PoLitBert_32k_cos1_5](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/models/PoLitBert_v32k_cos1_5_50k.zip) 121 | * [PoLitBert_v32k_cos1_5_50k](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/models/PoLitBert_v32k_linear_125k.zip) 122 | * [PoLitBert_v50k_linear_50k](https://minio.clarin-pl.eu/ermlab/public/PoLitBert/models/PoLitBert_v50k_linear_50k.zip) 123 | 124 | 125 | ### KLEJ evaluation 126 | 127 | All models were evaluated at 26.07.2020 with 9 [KLEJ benchmark](https://klejbenchmark.com/) tasks . 128 | Below results were achieved with use of fine-tuning scripts from 129 | [Polish RoBERTa](https://github.com/sdadas/polish-roberta#evaluation) without any further tweaks. which suggests that 130 | the potential of the models may not been fully utilized yet. 131 | 132 | 133 | | Model | NKJP-NER | CDSC-E | CDSC-R | CBD | PolEmo2.0-IN | PolEmo2.0-OUT | DYK | PSC | AR | Avg | 134 | |--------------------------------------|:--------:|:------:|:------:|:----:|:------------:|:-------------:|:----:|:----:|:----:|:-----:| 135 | | PoLitBert_v32k_linear_50k | 92.3 | 91.5 | 92.2 | 64 | 89.8 | 76.1 | 60.2 | 97.9 | 87.6 | 83.51 | 136 | | PoLitBert_v32k_linear_50k_2ep | 91.9 | 91.8 | 90.9 | 64.6 | 89.1 | 75.9 | 59.8 | 97.9 | 87.9 | 83.31 | 137 | | PoLitBert_v32k_tri_125k | 93.6 | 91.7 | 91.8 | 62.4 | 90.3 | 75.7 | 59 | 97.4 | 87.2 | 83.23 | 138 | | PoLitBert_v32k_linear_125k_2ep | 94.3 | 92.1 | 92.8 | 64 | 90.6 | 79.1 | 51.7 | 94.1 | 88.7 | 83.04 | 139 | | PoLitBert_v32k_tri_50k | 93.9 | 91.7 | 92.1 | 57.6 | 88.8 | 77.9 | 56.6 | 96.5 | 87.7 | 82.53 | 140 | | PoLitBert_v32k_linear_125k | 94 | 91.3 | 91.8 | 61.1 | 90.4 | 78.1 | 50.8 | 95.8 | 88.2 | 82.39 | 141 | | PoLitBert_v50k_linear_50k | 92.8 | 92.3 | 91.7 | 57.7 | 90.3 | 80.6 | 42.2 | 97.4 | 88.5 | 81.50 | 142 | | PoLitBert_v32k_cos1_2_50k | 92.5 | 91.6 | 90.7 | 60.1 | 89.5 | 73.5 | 49.1 | 95.2 | 87.5 | 81.08 | 143 | | PoLitBert_v32k_cos1_5_50k | 93.2 | 90.7 | 89.5 | 51.7 | 89.5 | 74.3 | 49.1 | 97.1 | 87.5 | 80.29 | 144 | 145 | A comparison with other developed models is available in the continuously updated [leaderboard](https://klejbenchmark.com/leaderboard/) of evaluation tasks. 146 | 147 | 148 | 149 | ### Details of models training 150 | 151 | 152 | We believe in open science and knowledge sharing, thus we decided to share complete code, params, experiment details and tensorboards. 
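A quick way to sanity-check any of the checkpoints listed above (or an intermediate checkpoint produced during training) is to load it with fairseq and run the same masked-token probe as [fill_mask_task.py](fill_mask_task.py). Below is a minimal sketch; the unpacked directory name and the vocab filename are assumptions - point them at the actual checkpoint and the matching sentencepiece model from the archive you downloaded:

```python
import os

from fairseq import hub_utils
from fairseq.models.roberta import RobertaModel, RobertaHubInterface

# Assumed local layout after unpacking one of the released archives - adjust as needed.
model_dir = "./PoLitBert_v32k_linear_50k/"
checkpoint_file = "checkpoint_best.pt"
spm_vocab = os.path.join(model_dir, "books_wikipedia_v32k_sen10M.spm.bpe.model")

loaded = hub_utils.from_pretrained(
    model_name_or_path=model_dir,
    checkpoint_file=checkpoint_file,
    data_name_or_path="./",
    bpe="sentencepiece",
    sentencepiece_vocab=spm_vocab,
    load_checkpoint_heads=True,
    archive_map=RobertaModel.hub_models(),
    cpu=True,
)
roberta = RobertaHubInterface(loaded["args"], loaded["task"], loaded["models"][0])
roberta.eval()

# fill_mask returns (filled sentence, score, predicted token) tuples,
# in the same order used by fill_mask_task.py.
sentence = "Wikipedia powstała jako projekt uzupełniający dla <mask>, darmowej encyklopedii internetowej"
for filled, score, token in roberta.fill_mask(sentence, topk=5):
    print(f"{token}\t{score:.3f}\t{filled}")
```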
153 | 154 | Link to [PoLitBert research log]( 155 | https://docs.google.com/spreadsheets/d/1ebVH-otNJM0nCslY0I9aaCytXpwCihGTUDiAmMLz-zc/edit?usp=sharing) (same as below). 156 | 157 | 158 | | Experiment | Model name | Vocab size | Scheduler | BSZ | WPB | Steps | Train tokens | Train loss | Valid loss | Best (test) loss | 159 | |----------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------|------------|------------------------------|-------|----------|---------|--------------|------------|------------|------------------| 160 | | [#1](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-1---linear-decay,-50k-updates) | PoLitBert_v32k_linear_50k ([tensorboard](https://tensorboard.dev/experiment/KfLdZq1gTYy8pPtKrVuoHw/#scalars)) | 32k | linear decay | 8 192 | 4,07E+06 | 50 000 | 2,03E+11 | 1,502 | 1,460 | 1,422 | 161 | | [#2](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-2---cyclic-triangular,-50k-updates) | PoLitBert_v32k_tri_50k ([tensorboard](https://tensorboard.dev/experiment/eGmn2nsgQEqqaNvbY3b1kQ/#scalars)) | 32k | triangular | 8 192 | 4,07E+06 | 50 000 | 2,03E+11 | 1,473 | 1,436 | 1,402 | 162 | | [#3](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-3---cyclic-cosine,-50k-updates) | PoLitBert_v32k_cos1_50k ([tensorboard](https://tensorboard.dev/experiment/Vg9bGil3QC2fKgnyp7eKRg/)) | 32k | cosine mul=1 | 8 192 | 4,07E+06 | 23 030 | 9,37E+10 | 10,930 | 11,000 | 1,832 | 163 | | [#4](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-4---cyclic-cosine,-50k-updates) | PoLitBert_v32k_cos1_2_50k ([tensorboard](https://tensorboard.dev/experiment/ElKrpymrQXKETX4Ea9lLSQ/#scalars)) | 32k | cosine mul=1 peak=0.0005 | 8 192 | 4,07E+06 | 50 000 | 2,03E+11 | 1,684 | 1,633 | 1,595 | 164 | | [#5](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiments-5,-6,-7---cyclic-cosine,-50k-updates) | PoLitBert_v32k_cos1_3_50k ([tensorboard]()) | 32k | cosine mul=2 | 8 192 | 4,07E+06 | 3 735 | 1,52E+10 | 10,930 | | | 165 | | [#6](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiments-5,-6,-7---cyclic-cosine,-50k-updates) | PoLitBert_v32k_cos1_4_50k ([tensorboard](https://tensorboard.dev/experiment/fNXEfyauRvSRkxri064RNA/)) | 32k | cosine mul=2 grad-clip=0.9 | 8 192 | 4,07E+06 | 4 954 | 2,02E+10 | 10,910 | 10,940 | 2,470 | 166 | | [#8](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-8---cyclic-triangular,-125k-updates) | PoLitBert_v32k_tri_125k ([tensorboard](https://tensorboard.dev/experiment/OfVtgeQLRlq6iMtDgdAPGA/#scalars)) | 32k | triangular | 8 192 | 4,07E+06 | 125 000 | 5,09E+11 | 1,435 | 1,313 | 1,363 | 167 | | [#9](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-9---cyclic-cosine,-125k-updates) | PoLitBert_v32k_cos1_5_50k ([tensorboard](https://tensorboard.dev/experiment/6ocg02CyQvCpq60gWSzDXQ/#scalars)) | 32k | cosine, mul=2, grad-clip=0.9 | 8 192 | 4,07E+06 | 125 000 | 5,09E+11 | 1,502 | 1,358 | 1,426 | 168 | | 
[#10](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-10---linear,-125k-updates) | PoLitBert_v32k_linear_125k ([tensorboard](https://tensorboard.dev/experiment/55MrDxXxS2mP8uGyZk5WPg/#scalars)) | 32k | linear decay | 8 192 | 4,07E+06 | 125 000 | 5,09E+11 | 1,322 | 1,218 | 1,268 | 169 | | [#11](https://nbviewer.jupyter.org/github/Ermlab/PoLitBert/blob/dev/polish_roberta_training.ipynb#Experiment-11---vocab50k,-linear,-50k-updates) | PoLitBert_v50k_linear_50k ([tensorboard](https://tensorboard.dev/experiment/nkYJ7jp1RR2fLCqbGE7Kfw/#scalars)) | 50k | linear decay | 8 192 | 4,07E+06 | 50 000 | 2,04E+11 | 1,546 | 1,439 | 1,480 | 170 | 171 | 172 | ## Used libraries 173 | 174 | 175 | * [KRNNT - Polish morphological tagger.](https://github.com/kwrobel-nlp/krnnt) - we use dockerized version 176 | * langdetect - for detecting sentence language 177 | * polyglot - for detecting sentence language 178 | * sentencepiece 179 | * [Fairseq v0.9](https://github.com/pytorch/fairseq) 180 | 181 | 182 | ### Instalation dependecies and problems 183 | 184 | * langdetect needs additional package 185 | * install sudo apt-get install libicu-dev 186 | * sentencepiece was installed from source code 187 | 188 | 189 | ## Acknowledgements 190 | 191 | This is the joint work of companies [Ermlab Software](https://ermlab.com/?utm_source=github&utm_medium=readme&utm_campaign=politbert) and [Literacka](https://literacka.com.pl/?utm_source=github&utm_medium=readme&utm_campaign=politbert) 192 | 193 | 194 | Part of the work was financed from the grant of [The Polish National Centre for Research and Development](https://www.gov.pl/web/ncbr-en) no. POIR.01.01.01-00-1213/19, the beneficiary of which was Literacka. Project title "Asystent wydawniczy - oprogramowanie do analizy treści, wykorzystujące algorytmy sztucznej inteligencji w celu zautomatyzowania procesu wydawniczego i predykcji sukcesów rynkowych publikacji." 195 | 196 | We would like to express ours gratitude to NVidia Inception Programme and Amazon AWS for providing the free GPU credits - thank you! 197 | 198 | 199 | ### Authors: 200 | 201 | * [Krzysztof Sopyła](https://www.linkedin.com/in/krzysztofsopyla/) 202 | * [Łukasz Sawaniewski](https://www.linkedin.com/in/sawaniewski/) 203 | 204 | 205 | ### Also appreciate the help from 206 | 207 | - [simonefrancia](https://github.com/simonefrancia) from Musixmatch for his [detailed explanations how they trained RoBERTa](https://github.com/musixmatchresearch/umberto/issues/2) Italian model [Umberto ](https://github.com/musixmatchresearch/umberto) 208 | 209 | 210 | ## About Ermlab Software 211 | 212 | __Ermlab__ - Polish machine learning company 213 | 214 | :owl: [Website](https://ermlab.com/?utm_source=github&utm_medium=readme&utm_campaign=politbert) | :octocat: [Repository](https://github.com/ermlab) 215 | 216 | . 
217 | 218 | -------------------------------------------------------------------------------- /polish_process_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "source": [ 6 | "# Prepare data for trainning Polish Roberta model\n", 7 | "\n", 8 | "Get raw text from different sources and concat in one big data file.\n", 9 | "\n", 10 | "Usefull shell commands:\n", 11 | "\n", 12 | "Move files to another directory, where isbn's are in the file list (wolne lektury non polish)\n", 13 | "\n", 14 | "```sh\n", 15 | "cat wolne_lektury_non_polish_isbn.txt | xargs -I{} sh -c \"mv *'{}'* ./non_polish/;\"\n", 16 | "```\n", 17 | "\n", 18 | "\n", 19 | "Cat all text files and instert new line between each text\n", 20 | "\n", 21 | "```sh\n", 22 | "find *content.txt | xargs -I{} sh -c \"cat '{}'; echo ''\" > corpus_[type]_[date].txt\n", 23 | "```\n", 24 | "\n", 25 | "Take 11768022 first lines form splited wikipedia file\n", 26 | "\n", 27 | "```sh\n", 28 | "head -11768022 corpus_wiki_2020-02-13.txt > corpus_wiki_2020-02-13_sample.txt\n", 29 | "```" 30 | ], 31 | "metadata": { 32 | "collapsed": false 33 | } 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "source": [ 38 | "```Python\n", 39 | "import json\n", 40 | "from pathlib import Path\n", 41 | "from glob import glob\n", 42 | "import os\n", 43 | "from concurrent.futures import ProcessPoolExecutor\n", 44 | "from itertools import chain\n", 45 | "import nltk\n", 46 | "import re\n", 47 | "from tqdm import tqdm\n", 48 | "\n", 49 | "import text_utils as tu\n", 50 | "```" 51 | ], 52 | "metadata": { 53 | "collapsed": false 54 | } 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "source": [ 59 | "## Prepare wikipedia data\n", 60 | "\n", 61 | "\n", 62 | "Download wikipedia data and extract it with wikiextractor\n", 63 | "\n", 64 | "Download data from https://dumps.wikimedia.org/plwiki/20200301/\n", 65 | "\n", 66 | "Download data\n", 67 | "\n", 68 | "```sh\n", 69 | "mkdir wiki_dump\n", 70 | "cd wiki_dump\n", 71 | "wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream1.xml-p1p169750.bz2\n", 72 | "wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream2.xml-p169751p510662.bz2\n", 73 | "wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream3.xml-p510663p1056310.bz2\n", 74 | "wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream4.xml-p1056311p1831508.bz2\n", 75 | "wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream5.xml-p1831509p3070393.bz2\n", 76 | "wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream6.xml-p3070394p4570393.bz2\n", 77 | "wget https://dumps.wikimedia.org/plwiki/20200301/plwiki-20200301-pages-articles-multistream6.xml-p4570394p4727706.bz2\n", 78 | "```\n", 79 | "\n", 80 | "Split data into train and eval (validation) sets\n", 81 | "\n", 82 | "File _train.txt_ with trainning files part names\n", 83 | "\n", 84 | "```sh\n", 85 | "# train.txt\n", 86 | "plwiki-20200301-pages-articles-multistream1.xml-p1p169750\n", 87 | "plwiki-20200301-pages-articles-multistream2.xml-p169751p510662\n", 88 | "plwiki-20200301-pages-articles-multistream3.xml-p510663p1056310\n", 89 | "plwiki-20200301-pages-articles-multistream4.xml-p1056311p1831508\n", 90 | "plwiki-20200301-pages-articles-multistream5.xml-p1831509p3070393\n", 91 | 
"plwiki-20200301-pages-articles-multistream6.xml-p3070394p4570393\n", 92 | "```\n", 93 | "\n", 94 | "File _eval.txt_ with validation file part name\n", 95 | "\n", 96 | "```sh\n", 97 | "# eval.txt\n", 98 | "plwiki-20200301-pages-articles-multistream6.xml-p4570394p4727706\n", 99 | "```\n", 100 | "\n", 101 | "Extract text and save it to json format\n", 102 | "\n", 103 | "```sh\n", 104 | "cd data/wiki_dump\n", 105 | "\n", 106 | "cat train.txt | xargs -I@ python ../../libs/wikiextractor/WikiExtractor.py @ --bytes=100M --json --output=\"./train/@\"\n", 107 | "\n", 108 | "cat eval.txt | xargs -I@ python ../../libs/wikiextractor/WikiExtractor.py @ --bytes=100M --json --output=\"./eval/@\"\n", 109 | "```" 110 | ], 111 | "metadata": { 112 | "collapsed": false 113 | } 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "source": [ 118 | "Get files with content (depend of corpus you need: train, eval, all), process each json file and save in txt, separete articles by new line.\n", 119 | "\n", 120 | "Read files, process json and save into one big txt file with documents splited by two new lines\n" 121 | ], 122 | "metadata": { 123 | "collapsed": false 124 | } 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "source": [ 129 | "```Python\n", 130 | "# type='train'\n", 131 | "# type='eval'\n", 132 | "type='all'\n", 133 | "wiki_dump_folder = f\"./data/wiki_dump/*/**\" if type=='all' else f\"./data/wiki_dump/{type}/**\"\n", 134 | "courpus_raw_path=f'./data/corpus_raw/corpus_wikipedia_2020-03-01_{type}.txt'\n", 135 | "wiki_json_files = [f for f in glob(wiki_dump_folder, recursive=True) if os.path.isfile(f)]\n", 136 | "\n", 137 | "print(courpus_raw_path)\n", 138 | "\n", 139 | "def process_wiki_line(line, min_len=0):\n", 140 | " '''Extract text only, normalize spacing and remove short documetns'''\n", 141 | " try:\n", 142 | " doc = json.loads(line)\n", 143 | " txt = re.sub(\"\\s+\", \" \", doc[\"text\"])\n", 144 | " if len(txt)< min_len:\n", 145 | " return '' #return empty if shorter then min_len\n", 146 | " return txt\n", 147 | " except:\n", 148 | " # print(f\"Could not parse line \\n{line}\\n\")\n", 149 | " return ''\n", 150 | "\n", 151 | "with open(courpus_raw_path, 'w+') as output_file:\n", 152 | " print(courpus_raw_path)\n", 153 | " for json_line in tqdm(wiki_json_files):\n", 154 | "\n", 155 | " tot_len = tu.get_num_lines(json_line)\n", 156 | " print(f'process - {json_line} lines={tot_len}')\n", 157 | "\n", 158 | " with open(json_line) as f:\n", 159 | " text=''\n", 160 | "\n", 161 | " for line in tqdm(f,total=tu.get_num_lines(json_line)):\n", 162 | " text=process_wiki_line(line, min_len=450)\n", 163 | " # print(text[0:20])\n", 164 | " if text.strip()!='':\n", 165 | " output_file.write(text)\n", 166 | " # put new line of the end of the article\n", 167 | " output_file.write('\\n\\n')\n", 168 | "```" 169 | ], 170 | "metadata": { 171 | "collapsed": false 172 | } 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "source": [ 177 | "### Wikipedia process txt file\n", 178 | "\n", 179 | "Do sentence tokenization and save each sentence in new line, add blank line between wiki aritcles. Do not check if sentence is polish and valid!" 
180 | ], 181 | "metadata": { 182 | "collapsed": false 183 | } 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "source": [ 188 | "```Python\n", 189 | "p = Path(courpus_raw_path) #'./data/corpus_raw/corpus_wikipedia_2020-03-01_{all,train,eval}.txt'\n", 190 | "corpus_line_path = f\"{p.with_suffix('')}_lines.txt\"\n", 191 | "\n", 192 | "print(f\"in file={courpus_raw_path}\\nout file={corpus_line_path}\")\n", 193 | "\n", 194 | "\n", 195 | "stats, vl, pl= tu.corpus_process_sentence(courpus_raw_path,\n", 196 | " corpus_line_path,\n", 197 | " split_each_line_as_doc = False,\n", 198 | " check_valid_sentence= False,\n", 199 | " check_lang_sentence=False)\n", 200 | "```" 201 | ], 202 | "metadata": { 203 | "collapsed": false 204 | } 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "source": [ 209 | "## Prepare book corpus data\n", 210 | "\n", 211 | "Read book dataset and normalize line splitiing. The textfile has '\\n' in middle of the sentence. It is not necessary if you have proper file.\n", 212 | "\n", 213 | "Input: concatenated book textfile\n", 214 | "Output: file with removed new lines in the middle of the sentence.\n", 215 | "\n", 216 | "Run once!!" 217 | ], 218 | "metadata": { 219 | "collapsed": false 220 | } 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "source": [ 225 | "Clean book corpus, remove unnecessary new lines in the middle of sentences" 226 | ], 227 | "metadata": { 228 | "collapsed": false 229 | } 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "source": [ 234 | "```Python\n", 235 | "# input_path\n", 236 | "corpus_book_raw='./data/corpus_raw/corpus_books_2020_02_24.txt'\n", 237 | "\n", 238 | "p = Path(corpus_book_raw)\n", 239 | "\n", 240 | "# output_path\n", 241 | "corpus_book_fix = f\"{p.with_suffix('')}_fix.txt\"\n", 242 | "```" 243 | ], 244 | "metadata": { 245 | "collapsed": false 246 | } 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "source": [ 251 | "```Python\n", 252 | "# remove line breaks in the middle of the sentence\n", 253 | "reg = re.compile('(?<=[A-Za-ząćęłńóśźż,—-])(?0:\n", 297 | " save_buffer2file(output_file, text)\n", 298 | "\n", 299 | "\n", 300 | "t1=dt.datetime.now()\n", 301 | "print(f'Done. Takes={t1-t0}')\n", 302 | "```" 303 | ], 304 | "metadata": { 305 | "collapsed": false 306 | } 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "source": [ 311 | "### Book corpus process txt file\n", 312 | "\n", 313 | "Do sentence tokenization and save each sentence in new line, add blank line between wiki aritcles.\n", 314 | "\n", 315 | "Do not check if sentence is polish and valid!" 
316 | ], 317 | "metadata": { 318 | "collapsed": false 319 | } 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "source": [ 324 | "```Python\n", 325 | "p = Path(corpus_book_fix)\n", 326 | "corpus_book_lines = f\"{p.with_suffix('')}_lines.txt\"\n", 327 | "\n", 328 | "print(f\"in file={corpus_book_fix}\\nout file={corpus_book_lines}\")\n", 329 | "\n", 330 | "\n", 331 | "stats, vl, pl =tu.corpus_process_sentence(corpus_book_fix,\n", 332 | " corpus_book_lines,\n", 333 | " split_each_line_as_doc = False,\n", 334 | " check_valid_sentence= False,\n", 335 | " check_lang_sentence=False)\n", 336 | "```" 337 | ], 338 | "metadata": { 339 | "collapsed": false 340 | } 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "source": [ 345 | "## Prepare Oscar train\n", 346 | "\n", 347 | "Download dataset:\n", 348 | "[Polish part Oscar corpus](https://traces1.inria.fr/oscar/files/Compressed/pl_dedup.txt.gz) (pl_dedup.txt.gz ~19GB)\n", 349 | "\n", 350 | "```sh\n", 351 | "mv pl_dedup.txt.gz oscar_pl_dedup.txt.gz\n", 352 | "gunzip -k oscar_pl_dedup.txt.gz\n", 353 | "cd oscar_pl_dedup\n", 354 | "```\n", 355 | "\n", 356 | "Take 32M first lines\n", 357 | "\n", 358 | "```sh\n", 359 | "head -n 32MB pl_dedup.txt > corpus_oscar_2020-04-10_32M.txt\n", 360 | "\n", 361 | "#split file into parts for 4M lines\n", 362 | "\n", 363 | "split -l 4000000 -d --additional-suffix _.txt corpus_oscar_2020-04-10_32M.txt corpus_oscar_2020-04-10_32M_\n", 364 | "\n", 365 | "```" 366 | ], 367 | "metadata": { 368 | "collapsed": false 369 | } 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "source": [ 374 | "For test use smaller file" 375 | ], 376 | "metadata": { 377 | "collapsed": false 378 | } 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "source": [ 383 | "```Python\n", 384 | "corpus_oscar_raw = \"./data/corpus_raw/corpus_oscar_100k.txt\"\n", 385 | "\n", 386 | "p = Path(corpus_oscar_raw)\n", 387 | "corpus_oscar_lines = f\"{p.with_suffix('')}_lines.txt\"\n", 388 | "\n", 389 | "print(f\"in file={corpus_oscar_raw}\\nout file={corpus_oscar_lines}\")\n", 390 | "\n", 391 | "stats, vl, pl = tu.corpus_process_sentence(\n", 392 | " corpus_oscar_raw,\n", 393 | " corpus_oscar_lines,\n", 394 | " split_each_line_as_doc=True,\n", 395 | " check_valid_sentence=True,\n", 396 | " check_lang_sentence=True,\n", 397 | " max_sentence_length=700,\n", 398 | ")\n", 399 | "```" 400 | ], 401 | "metadata": { 402 | "collapsed": false 403 | } 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "source": [ 408 | "# Prepare training, test datasets\n", 409 | "\n", 410 | "Split **corpus_wikipedia_2020-03-01_all_lines.txt** 1.5GB\n", 411 | "\n", 412 | "* train - 0 - 11748343 (~90%)\n", 413 | "* test - 11748344 - 13053550\n", 414 | "\n", 415 | "```sh\n", 416 | "head -11748343 corpus_wikipedia_2020-03-01_all_lines.txt > corpus_wikipedia_2020-03-01_all_lines_train.txt\n", 417 | "tail -$((13053550-11748343)) corpus_wikipedia_2020-03-01_all_lines.txt > corpus_wikipedia_2020-03-01_all_lines_test.txt\n", 418 | "```\n", 419 | "\n", 420 | "\n", 421 | "Split **corpus_books_2020_02_24_fix_lines.txt** 6.5GB\n", 422 | "\n", 423 | "* train - 0 - 81140395 (~90%)\n", 424 | "* test - 81140395 - 90148111\n", 425 | "\n", 426 | "```sh\n", 427 | "head -81140395 corpus_books_2020_02_24_fix_lines.txt > corpus_books_2020_02_24_fix_lines_train.txt\n", 428 | "tail -$((90148111-81140395)) corpus_books_2020_02_24_fix_lines.txt > corpus_books_2020_02_24_fix_lines_test.txt\n", 429 | "```\n", 430 | "\n", 431 | "\n", 432 | "**Oscar Train**\n", 433 | "\n", 434 | "Train 1.1GB x8 ~ 
8.9GB **corpus_oscar_2020-04-10_32M_0?__lines.txt**\n", 435 | "\n", 436 | "The below files were\n", 437 | "\n", 438 | "corpus_oscar_2020-04-10_32M_00__lines.txt\n", 439 | "\n", 440 | "corpus_oscar_2020-04-10_32M_01__lines.txt\n", 441 | "\n", 442 | "corpus_oscar_2020-04-10_32M_02__lines.txt\n", 443 | "\n", 444 | "corpus_oscar_2020-04-10_32M_03__lines.txt\n", 445 | "\n", 446 | "corpus_oscar_2020-04-10_32M_04__lines.txt\n", 447 | "\n", 448 | "corpus_oscar_2020-04-10_32M_05__lines.txt\n", 449 | "\n", 450 | "corpus_oscar_2020-04-10_32M_06__lines.txt\n", 451 | "\n", 452 | "corpus_oscar_2020-04-10_32M_07__lines.txt\n", 453 | "\n", 454 | "```sh\n", 455 | "cat corpus_oscar_2020-04-10_32M_0?__lines.txt > corpus_oscar_2020-04-10_32M_lines_train.txt\n", 456 | "```\n", 457 | "\n", 458 | "pl_dedup.txt - 145518911 lines\n", 459 | "\n", 460 | "**Oscar Test**\n", 461 | "\n", 462 | "Use corpus_oscar_2020-04-10_last_4M_lines.txt (~1.3GB)\n" 463 | ], 464 | "metadata": { 465 | "collapsed": false 466 | } 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "source": [ 471 | "## Make train and test corpus set\n", 472 | "\n", 473 | "### Train\n", 474 | "\n", 475 | "```sh\n", 476 | "cat corpus_wikipedia_2020-03-01_all_lines_train.txt \\\n", 477 | " corpus_books_2020_02_24_fix_lines_train.txt \\\n", 478 | " corpus_oscar_2020-04-10_32M_lines_train.txt > corpus_wiki_books_oscar_train.txt\n", 479 | "```\n", 480 | "\n", 481 | "### Test\n", 482 | "\n", 483 | "```sh\n", 484 | "cat corpus_wikipedia_2020-03-01_all_lines_test.txt \\\n", 485 | " corpus_books_2020_02_24_fix_lines_test.txt \\\n", 486 | " corpus_oscar_2020-04-10_last_4M_lines.txt > corpus_wiki_books_oscar_test.txt\n", 487 | "```\n" 488 | ], 489 | "metadata": { 490 | "collapsed": false 491 | } 492 | } 493 | ], 494 | "metadata": { 495 | "kernelspec": { 496 | "display_name": "Python 3", 497 | "language": "python", 498 | "name": "python3" 499 | }, 500 | "language_info": { 501 | "codemirror_mode": { 502 | "name": "ipython", 503 | "version": 2 504 | }, 505 | "file_extension": ".py", 506 | "mimetype": "text/x-python", 507 | "name": "python", 508 | "nbconvert_exporter": "python", 509 | "pygments_lexer": "ipython2", 510 | "version": "2.7.6" 511 | } 512 | }, 513 | "nbformat": 4, 514 | "nbformat_minor": 0 515 | } -------------------------------------------------------------------------------- /polish_roberta_training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# PoLitBert - Polish RoBERT'a model \n", 8 | "\n", 9 | "## Model training experiments' protocols.\n", 10 | "\n", 11 | "Training environment details:\n", 12 | "* Pytorch 1.5\n", 13 | "* Apex \n", 14 | "* CUDA 10.2\n", 15 | "* fairseq 0.9\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "source": [ 21 | "Experiments were additionally compared in a separate [research log](https://docs.google.com/spreadsheets/d/1ebVH-otNJM0nCslY0I9aaCytXpwCihGTUDiAmMLz-zc/edit?usp=sharing)\n", 22 | "\n", 23 | "* Experiment 1 - linear decay, 50k updates, short training\n", 24 | " * linear schedule peek_lr=5e-4 updates=50000, bsz=8192, test convergence speed of linear schedule, try to find optimal speed\n", 25 | "* Experiment 2 - cyclic triangular, 50k updates (ok)\n", 26 | " * cyclic triangular schedule updates=50000, bsz=8192, cyclic step=5000 test convergence speed of linear schedule, try high peak learning rate with shrinking after provided steps (PEAK_LR=0.001,LR_SHRINK=0.8, 
STEP_SIZE=2500)\n", 27 | "* Experiment 3 - cyclic cosine, 50k updates (loss explode)\n", 28 | " * cosine cyclic schedule, try high peak learning rate with shrinking (PEAK_LR=0.001,LR_SHRINK=0.8, STEP_SIZE=2500)\n", 29 | "* Experiment 4 - cyclic cosine, 50k updates (loss explode)\n", 30 | " * cyclic cosine schedule, updates=50000, bsz=8192, cyclic step=2500- test convergence speed of linear schedule, try to find optimal speed, should be similar to triangular schedule in 5000 steps goes up and down with lr. After 23k steps experiment was stopped, loss jumped and plateau\n", 31 | "\n", 32 | "* Experiments 5, 6, 7 - cyclic cosine, 50k updates\n", 33 | " * try cyclic cosine with smaller STEP_SIZE but after each pass we double the STEP_SIZE (--t-mult 2) STEP_SIZE=1000, WARMUP_UPDATES=1000, PEAK_LR=0.001, LR_SHRINK=0.8 --t-mult 2\n", 34 | "\n", 35 | "* Experiment 8 - cyclic triangular, 125k updates, long training\n", 36 | " * TODO: EXPERIMENT_DESCRIPTION\n", 37 | "* Experiment 9 - cyclic cosine, 125k updates, long training\n", 38 | " * similar to Experiment 5, but trained longer\n", 39 | "* Experiment 10 - linear, 125k updates, long training\n", 40 | " * as experiment 1, long training\n", 41 | "* Experiment 11 - vocab50k, linear, 50k updates\n", 42 | " * as experiment 1 but larger vocab\n", 43 | "\n", 44 | "Further 50k vocab training were stopped because of lack of funds :) \n", 45 | "\n", 46 | "---" 47 | ], 48 | "metadata": { 49 | "collapsed": false 50 | } 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "source": [ 55 | "### Experiment 1 - linear decay, 50k updates\n", 56 | "\n", 57 | "Vocab: 32k tokens
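A quick orientation on scale (a back-of-the-envelope calculation added here, not part of the original protocol): the effective batch size quoted below for every experiment corresponds to at most about 4.2M tokens per optimizer update, since samples are packed up to 512 tokens.

```Python
# Effective batch size and an upper bound on the token budget, using the settings
# shared by all experiments below (MAX_SENTENCES=16, UPDATE_FREQ=64, 8 GPUs, 512 tokens).
MAX_SENTENCES, UPDATE_FREQ, NUM_GPU, TOKENS_PER_SAMPLE = 16, 64, 8, 512

effective_batch = MAX_SENTENCES * UPDATE_FREQ * NUM_GPU       # 8192 sequences per update
tokens_per_update = effective_batch * TOKENS_PER_SAMPLE       # 4_194_304 tokens (upper bound)
tokens_seen_50k = tokens_per_update * 50_000                  # ~2.1e11 tokens over 50k updates

print(effective_batch, tokens_per_update, f"{tokens_seen_50k:.1e}")
```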
\n", 58 | "Train on: AWS p3.16xlarge\n", 59 | "\n", 60 | "Efective batch size = MAX_SENTENCES\\*UPDATE_FREQ\\*num_gpu = 16\\*64\\*8 = 8192\n", 61 | "\n", 62 | "First experiment with linear scheduler, run for 50k updates.\n", 63 | "\n", 64 | "```\n", 65 | "TOTAL_UPDATES=50000 # Total number of training steps\n", 66 | "WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates\n", 67 | "PEAK_LR=0.0005 # Peak learning rate, adjust as needed\n", 68 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 69 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 70 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 71 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 72 | "\n", 73 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 74 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_linear_50k/\n", 75 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_linear_50k/logs/\n", 76 | "\n", 77 | "fairseq-train --fp16 $DATA_DIR \\\n", 78 | " --task masked_lm --criterion masked_lm \\\n", 79 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 80 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \\\n", 81 | " --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \\\n", 82 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 83 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 84 | " --max-update $TOTAL_UPDATES --log-format simple --log-interval 1 --skip-invalid-size-inputs-valid-test \\\n", 85 | " --save-dir $SAVE_DIR --tensorboard-logdir $LOGS_DIR --keep-last-epochs 10 \\\n", 86 | " --ddp-backend=no_c10d\n", 87 | "\n", 88 | "```\n", 89 | "\n", 90 | "```\n", 91 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 92 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #1\" \\\n", 93 | " --description \"- linear decay, 50k updates, vocab32k, --save-dir ${SAVE_DIR}\"\n", 94 | "\n", 95 | "```" 96 | ], 97 | "metadata": { 98 | "collapsed": false, 99 | "pycharm": { 100 | "name": "#%% md\n" 101 | } 102 | } 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "### Experiment 2 - cyclic triangular, 50k updates\n", 109 | "\n", 110 | "Vocab: 32k tokens
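To make the cyclic triangular schedule of this experiment easier to picture, here is a schematic stand-alone calculation (plain Python written for this note; it follows the description and settings listed below, not fairseq's internal triangular scheduler, so treat it as an illustration only):

```Python
# Schematic triangular LR with shrink: within each STEP_SIZE-update cycle the LR rises
# from the base LR to the peak LR and falls back, and after every cycle both bounds are
# multiplied by LR_SHRINK (the effect of --lr-shrink together with --shrink-min).
STEP_SIZE, BASE_LR, PEAK_LR, LR_SHRINK = 5000, 1e-4, 1e-3, 0.8

def triangular_lr(update):
    cycle, t = divmod(update, STEP_SIZE)
    shrink = LR_SHRINK ** cycle
    base, peak = BASE_LR * shrink, PEAK_LR * shrink
    half = STEP_SIZE / 2
    frac = t / half if t <= half else (STEP_SIZE - t) / half   # 0 -> 1 -> 0 inside a cycle
    return base + (peak - base) * frac

for u in (0, 2500, 5000, 7500, 10000, 25000, 49999):
    print(f"update {u:>6}: lr = {triangular_lr(u):.6f}")
```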
\n", 111 | "Train on: AWS p3.16xlarge\n", 112 | "\n", 113 | "Efective batch size = MAX_SENTENCES\\*UPDATE_FREQ\\*num_gpu = 16\\*64\\*8 = 8192\n", 114 | "\n", 115 | "Cyclic triangular schedule, 5000 steps for rise to peek lr and fall to base lr,\n", 116 | "after each 5k steps shrink peak and base lr.\n", 117 | "\n", 118 | "```\n", 119 | "TOTAL_UPDATES=50000 # Total number of training steps\n", 120 | "STEP_SIZE=5000\n", 121 | "BASE_LR=0.0001\n", 122 | "PEAK_LR=0.001 # Peak learning rate, adjust as needed\n", 123 | "LR_SHRINK=0.8 # max peak shirinking factor\n", 124 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 125 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 126 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 127 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 128 | "\n", 129 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 130 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_tri_50k/\n", 131 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_tri_50k/logs/\n", 132 | "\n", 133 | "fairseq-train --fp16 $DATA_DIR \\\n", 134 | " --task masked_lm --criterion masked_lm \\\n", 135 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 136 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \\\n", 137 | " --lr-scheduler triangular --lr $BASE_LR --max-lr $PEAK_LR \\\n", 138 | " --lr-period-updates $STEP_SIZE --lr-shrink $LR_SHRINK --shrink-min \\\n", 139 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 140 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 141 | " --max-update $TOTAL_UPDATES --skip-invalid-size-inputs-valid-test \\\n", 142 | " --tensorboard-logdir $LOGS_DIR --log-format simple --log-interval 1 --save-dir $SAVE_DIR \\\n", 143 | " --ddp-backend=no_c10d\n", 144 | "\n", 145 | "```\n", 146 | "\n", 147 | "```\n", 148 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 149 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #2\" \\\n", 150 | " --description \"- cyclic triangular, 50k updates, vocab32k, --save-dir ${SAVE_DIR}\"\n", 151 | "\n", 152 | "```" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "source": [ 158 | "### Experiment 3 - cyclic cosine, 50k updates\n", 159 | "\n", 160 | "Vocab: upper 32k tokens
\n", 161 | "Train on: AWS p3.16xlarge\n", 162 | "\n", 163 | "Efective batch size = MAX_SENTENCES \\* UPDATE_FREQ \\* num_gpu = 16\\*64\\*8 = 8192\n", 164 | "\n", 165 | "Cyclic cosine schedule, 5000 steps for rise to peek lr and fall to base lr, after each 5k steps shrink\n", 166 | "peak and base lr.\n", 167 | "\n", 168 | "```\n", 169 | "TOTAL_UPDATES=50000 # Total number of training steps\n", 170 | "STEP_SIZE=2500\n", 171 | "WARMUP_UPDATES=2500 # same as triangular\n", 172 | "BASE_LR=0.0001\n", 173 | "PEAK_LR=0.001 # Peak learning rate, adjust as needed\n", 174 | "LR_SHRINK=0.8 #\n", 175 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 176 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 177 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 178 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 179 | "\n", 180 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 181 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_cos1_50k/\n", 182 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_cos1_50k/logs/\n", 183 | "\n", 184 | "\n", 185 | "fairseq-train --fp16 $DATA_DIR \\\n", 186 | " --task masked_lm --criterion masked_lm \\\n", 187 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 188 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \\\n", 189 | " --lr-scheduler cosine --lr $BASE_LR --max-lr $PEAK_LR \\\n", 190 | " --warmup-updates $WARMUP_UPDATES \\\n", 191 | " --lr-period-updates $STEP_SIZE --t-mult 1 --lr-shrink $LR_SHRINK \\\n", 192 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 193 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 194 | " --max-update $TOTAL_UPDATES --skip-invalid-size-inputs-valid-test \\\n", 195 | " --tensorboard-logdir $LOGS_DIR --log-format simple --log-interval 1 --save-dir $SAVE_DIR \\\n", 196 | " --ddp-backend=no_c10d\n", 197 | "\n", 198 | "```\n", 199 | "\n", 200 | "```\n", 201 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 202 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #3\" \\\n", 203 | " --description \"- cyclic cosine, 50k updates, vocab32k, step=2500 --save-dir ${SAVE_DIR}\"\n", 204 | "\n", 205 | "```\n" 206 | ], 207 | "metadata": { 208 | "collapsed": false, 209 | "pycharm": { 210 | "name": "#%% md\n" 211 | } 212 | } 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Experiment 4 - cyclic cosine, 50k updates\n", 219 | "\n", 220 | "Vocab: 32k tokens
\n", 221 | "Train on: AWS p3.16xlarge\n", 222 | "\n", 223 | "Effective batch size = MAX_SENTENCES\\*UPDATE_FREQ\\*num_gpu = 16\\*64\\*8 = 8192\n", 224 | "\n", 225 | "Cyclic cosine schedule, 5000 steps for rise to peek lr and fall to base lr,\n", 226 | "after each 5k steps shrink peak and base lr.\n", 227 | "\n", 228 | "```\n", 229 | "TOTAL_UPDATES=50000 # Total number of training steps\n", 230 | "STEP_SIZE=2500\n", 231 | "WARMUP_UPDATES=2500 # Same as triangular\n", 232 | "BASE_LR=0.0001\n", 233 | "PEAK_LR=0.0005 # Peak learning rate, adjust as needed\n", 234 | "LR_SHRINK=0.8\n", 235 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 236 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 237 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 238 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 239 | "\n", 240 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 241 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_cos1_2_50k/\n", 242 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_cos1_2_50k/logs/\n", 243 | "\n", 244 | "fairseq-train --fp16 $DATA_DIR \\\n", 245 | " --task masked_lm --criterion masked_lm \\\n", 246 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 247 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \\\n", 248 | " --lr-scheduler cosine --lr $BASE_LR --max-lr $PEAK_LR \\\n", 249 | " --warmup-updates $WARMUP_UPDATES \\\n", 250 | " --lr-period-updates $STEP_SIZE --t-mult 1 --lr-shrink $LR_SHRINK \\\n", 251 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 252 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 253 | " --max-update $TOTAL_UPDATES --skip-invalid-size-inputs-valid-test \\\n", 254 | " --tensorboard-logdir $LOGS_DIR --log-format simple --log-interval 1 --save-dir $SAVE_DIR \\\n", 255 | " --ddp-backend=no_c10d --num-workers 2\n", 256 | "\n", 257 | "```\n", 258 | "\n", 259 | "```\n", 260 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 261 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #4\" \\\n", 262 | " --description \"- cyclic cosine, 50k updates, vocab32k, step=2500 half lr=0.0005, --save-dir ${SAVE_DIR}\"\n", 263 | "\n", 264 | "```" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "source": [ 270 | "### Experiments 5, 6, 7 - cyclic cosine, 50k updates\n", 271 | "\n", 272 | "Vocab: upper 32k tokens
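Because --t-mult 2 (used in this run, see the settings below) doubles the cosine period after every restart, only a handful of restarts fit into the 50k-update budget. A rough, schematic listing of where they land (this ignores the exact way fairseq accounts the warmup phase into the first period, so the numbers are indicative only):

```Python
# Restart points for a cosine schedule whose period starts at 1000 updates and doubles
# after every cycle (--lr-period-updates 1000, --t-mult 2), with the peak LR multiplied
# by --lr-shrink 0.8 at each restart.
WARMUP, STEP_SIZE, T_MULT, PEAK_LR, LR_SHRINK, TOTAL = 1000, 1000, 2, 1e-3, 0.8, 50_000

update, period, peak, cycle = WARMUP, STEP_SIZE, PEAK_LR, 0
while update < TOTAL:
    print(f"cycle {cycle}: starts at update {update:>6}, length {period:>6}, peak lr {peak:.2e}")
    update += period
    period *= T_MULT
    peak *= LR_SHRINK
    cycle += 1
```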
\n", 273 | "Train on: AWS p3.16xlarge\n", 274 | "\n", 275 | "Efective batch size = MAX_SENTENCES\\*UPDATE_FREQ\\*num_gpu = 16\\*64\\*8 = 8192\n", 276 | "\n", 277 | "Cyclic cosine schedule, 1000 steps for rise to peek lr and fall to base lr.\n", 278 | "\n", 279 | "```\n", 280 | "TOTAL_UPDATES=50000 # Total number of training steps\n", 281 | "STEP_SIZE=1000\n", 282 | "WARMUP_UPDATES=1000\n", 283 | "BASE_LR=0.0001\n", 284 | "PEAK_LR=0.001 # Peak learning rate, adjust as needed\n", 285 | "LR_SHRINK=0.8\n", 286 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 287 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 288 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 289 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 290 | "\n", 291 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 292 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_cos1_4_50k/\n", 293 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_cos1_4_50k/logs/\n", 294 | "\n", 295 | "\n", 296 | "fairseq-train --fp16 $DATA_DIR \\\n", 297 | " --task masked_lm --criterion masked_lm \\\n", 298 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 299 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.9 \\\n", 300 | " --lr-scheduler cosine --lr $BASE_LR --max-lr $PEAK_LR \\\n", 301 | " --warmup-updates $WARMUP_UPDATES \\\n", 302 | " --lr-period-updates $STEP_SIZE --t-mult 2 --lr-shrink $LR_SHRINK \\\n", 303 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 304 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 305 | " --max-update $TOTAL_UPDATES --skip-invalid-size-inputs-valid-test \\\n", 306 | " --tensorboard-logdir $LOGS_DIR --log-format simple --log-interval 1 --save-dir $SAVE_DIR \\\n", 307 | " --ddp-backend=no_c10d --num-workers 2\n", 308 | "\n", 309 | "```\n", 310 | "\n", 311 | "```\n", 312 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 313 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #6\" \\\n", 314 | " --description \"- cyclic cosine, 50k updates, step=1000, t-mult=2, clip-norm=0.9, --save-dir ${SAVE_DIR}\"\n", 315 | "\n", 316 | "```" 317 | ], 318 | "metadata": { 319 | "collapsed": false, 320 | "pycharm": { 321 | "name": "#%% md\n" 322 | } 323 | } 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### Experiment 8 - cyclic triangular, 125k updates\n", 330 | "\n", 331 | "Vocab: 32k tokens
\n", 332 | "Train on: AWS p3.16xlarge\n", 333 | "\n", 334 | "Efective batch size = MAX_SENTENCES\\*UPDATE_FREQ\\*num_gpu = 16\\*64\\*8 = 8192\n", 335 | "\n", 336 | "```\n", 337 | "TOTAL_UPDATES=125000 # Total number of training steps\n", 338 | "STEP_SIZE=5000\n", 339 | "BASE_LR=0.0001\n", 340 | "PEAK_LR=0.001 # Peak learning rate, adjust as needed\n", 341 | "LR_SHRINK=0.8\n", 342 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 343 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 344 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 345 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 346 | "\n", 347 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 348 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_tri_125k/\n", 349 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_tri_125k/logs/\n", 350 | "\n", 351 | "fairseq-train --fp16 $DATA_DIR \\\n", 352 | " --task masked_lm --criterion masked_lm \\\n", 353 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 354 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 \\\n", 355 | " --lr-scheduler triangular --lr $BASE_LR --max-lr $PEAK_LR \\\n", 356 | " --lr-period-updates $STEP_SIZE --lr-shrink $LR_SHRINK --shrink-min \\\n", 357 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 358 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 359 | " --max-update $TOTAL_UPDATES --skip-invalid-size-inputs-valid-test \\\n", 360 | " --tensorboard-logdir $LOGS_DIR --log-format simple --log-interval 1 --save-dir $SAVE_DIR \\\n", 361 | " --ddp-backend=no_c10d\n", 362 | "\n", 363 | "```\n", 364 | "\n", 365 | "```\n", 366 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 367 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #8\" \\\n", 368 | " --description \"- cyclic triangular, 125k updates, vocab32k, --save-dir ${SAVE_DIR}\"\n", 369 | "\n", 370 | "```" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "### Experiment 9 - cyclic cosine, 125k updates\n", 378 | "\n", 379 | "Vocab: upper 32k tokens
\n", 380 | "Train on: AWS p3.16xlarge\n", 381 | "\n", 382 | "```\n", 383 | "TOTAL_UPDATES=125000 # Total number of training steps\n", 384 | "STEP_SIZE=1000\n", 385 | "WARMUP_UPDATES=5000 \n", 386 | "BASE_LR=0.00001\n", 387 | "PEAK_LR=0.0007 # Peak learning rate, adjust as needed\n", 388 | "LR_SHRINK=0.7\n", 389 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 390 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 391 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 392 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 393 | "\n", 394 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 395 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_cos1_5_50k/\n", 396 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_cos1_5_50k/logs/\n", 397 | "\n", 398 | "fairseq-train --fp16 $DATA_DIR \\\n", 399 | " --task masked_lm --criterion masked_lm \\\n", 400 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 401 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.9 \\\n", 402 | " --lr-scheduler cosine --lr $BASE_LR --max-lr $PEAK_LR \\\n", 403 | " --warmup-updates $WARMUP_UPDATES \\\n", 404 | " --lr-period-updates $STEP_SIZE --t-mult 2 --lr-shrink $LR_SHRINK \\\n", 405 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 406 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 407 | " --max-update $TOTAL_UPDATES --skip-invalid-size-inputs-valid-test \\\n", 408 | " --tensorboard-logdir $LOGS_DIR --log-format simple --log-interval 1 --save-dir $SAVE_DIR \\\n", 409 | " --ddp-backend=no_c10d --num-workers 2\n", 410 | "\n", 411 | "```\n", 412 | "\n", 413 | "```\n", 414 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 415 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #9\" \\\n", 416 | " --description \"- cyclic cosine, 125k updates, vocab32k, t-mult=2 clip-norm=0.9, --save-dir ${SAVE_DIR}\"\n", 417 | "\n", 418 | "```" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "### Experiment 10 - linear, 125k updates\n", 426 | "\n", 427 | "Vocab: 32k tokens
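One configuration change relative to the early runs is worth flagging: experiments 1-4 and 8 pass --clip-norm 0.0, which fairseq effectively treats as no gradient clipping, while this run (like experiments 5-7, 9 and 11) uses --clip-norm 0.9. The flag corresponds to standard global gradient-norm clipping, roughly as sketched here in plain PyTorch (an illustration, not the fairseq internals):

```Python
import torch

def clip_gradients(parameters, clip_norm: float):
    # With clip_norm <= 0 gradients are left untouched; otherwise the global L2 norm of
    # all gradients is rescaled so that it does not exceed clip_norm before the update.
    if clip_norm > 0:
        return torch.nn.utils.clip_grad_norm_(parameters, max_norm=clip_norm)
    return None
```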
\n", 428 | "Train on: AWS p3.16xlarge\n", 429 | "\n", 430 | "Efective batch size = MAX_SENTENCES\\*UPDATE_FREQ\\*num_gpu = 16\\*64\\*8 = 8192\n", 431 | "\n", 432 | "```\n", 433 | "TOTAL_UPDATES=125000 # Total number of training steps\n", 434 | "WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates\n", 435 | "PEAK_LR=0.001 # Peak learning rate, adjust as needed\n", 436 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 437 | "MAX_POSITIONS=512 # Num. positional embeddings (usually same as above)\n", 438 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 439 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 440 | "\n", 441 | "DATA_DIR=./data/wiki_books_oscar/vocab32k/\n", 442 | "SAVE_DIR=./checkpoints/PoLitBert_v32k_linear_125k/\n", 443 | "LOGS_DIR=./checkpoints/PoLitBert_v32k_linear_125k/logs/\n", 444 | "\n", 445 | "fairseq-train --fp16 $DATA_DIR \\\n", 446 | " --task masked_lm --criterion masked_lm \\\n", 447 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 448 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.9 \\\n", 449 | " --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \\\n", 450 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 451 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 452 | " --max-update $TOTAL_UPDATES --log-format simple --log-interval 1 --skip-invalid-size-inputs-valid-test \\\n", 453 | " --save-dir $SAVE_DIR --tensorboard-logdir $LOGS_DIR \\\n", 454 | " --ddp-backend=no_c10d\n", 455 | "\n", 456 | "```\n", 457 | "\n", 458 | "```\n", 459 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 460 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #10 \" \\\n", 461 | " --description \"- linear, 125k updates, vocab32k, clip-norm=0.9, --save-dir ${SAVE_DIR}\"\n", 462 | "\n", 463 | "```" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": { 469 | "pycharm": { 470 | "name": "#%% md\n" 471 | } 472 | }, 473 | "source": [ 474 | "### Experiment 11 - vocab50k, linear, 50k updates\n", 475 | "\n", 476 | "Vocab: 50k tokens\n", 477 | "Train on: AWS p3.16xlarge\n", 478 | "\n", 479 | "Efective batch size = MAX_SENTENCES\\*UPDATE_FREQ\\*num_gpu = 16\\*64\\*8 = 8192\n", 480 | "\n", 481 | "```\n", 482 | "TOTAL_UPDATES=50000 # Total number of training steps\n", 483 | "WARMUP_UPDATES=10000 # Warmup the learning rate over this many updates\n", 484 | "PEAK_LR=0.001 # Peak learning rate, adjust as needed\n", 485 | "TOKENS_PER_SAMPLE=512 # Max sequence length\n", 486 | "MAX_POSITIONS=512 # Num. 
positional embeddings (usually same as above)\n", 487 | "MAX_SENTENCES=16 # Number of sequences per batch (batch size)\n", 488 | "UPDATE_FREQ=64 # Increase the batch size 16x\n", 489 | "\n", 490 | "DATA_DIR=./data/wiki_books_oscar/vocab50k/\n", 491 | "SAVE_DIR=./checkpoints/PoLitBert_v50k_linear_50k/\n", 492 | "LOGS_DIR=./checkpoints/PoLitBert_v50k_linear_50k/logs/\n", 493 | "\n", 494 | "fairseq-train --fp16 $DATA_DIR \\\n", 495 | " --task masked_lm --criterion masked_lm \\\n", 496 | " --arch roberta_base --sample-break-mode complete --tokens-per-sample $TOKENS_PER_SAMPLE \\\n", 497 | " --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.9 \\\n", 498 | " --lr-scheduler polynomial_decay --lr $PEAK_LR --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \\\n", 499 | " --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \\\n", 500 | " --max-sentences $MAX_SENTENCES --update-freq $UPDATE_FREQ \\\n", 501 | " --max-update $TOTAL_UPDATES --log-format simple --log-interval 1 --skip-invalid-size-inputs-valid-test \\\n", 502 | " --save-dir $SAVE_DIR --tensorboard-logdir $LOGS_DIR \\\n", 503 | " --ddp-backend=no_c10d\n", 504 | "\n", 505 | "```\n", 506 | "\n", 507 | "```\n", 508 | "tensorboard dev upload --logdir $LOGS_DIR \\\n", 509 | " --name \"PoLitBert - Polish RoBERT'a model, exp. #11\" \\\n", 510 | " --description \"- linear, 50k updates, vocab50k, clip-norm=0.9, --save-dir ${SAVE_DIR}\"\n", 511 | "\n", 512 | "```\n", 513 | "\n" 514 | ] 515 | } 516 | ], 517 | "metadata": { 518 | "language_info": { 519 | "name": "python", 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "version": "3.7.3-final" 525 | }, 526 | "orig_nbformat": 2, 527 | "file_extension": ".py", 528 | "mimetype": "text/x-python", 529 | "name": "python", 530 | "npconvert_exporter": "python", 531 | "pygments_lexer": "ipython3", 532 | "version": 3, 533 | "kernelspec": { 534 | "name": "python37364bitherbertpipenvf409fddaf3f446fd8dcf7490c441f6bd", 535 | "display_name": "Python 3.7.3 64-bit ('herbert': pipenv)" 536 | } 537 | }, 538 | "nbformat": 4, 539 | "nbformat_minor": 2 540 | } -------------------------------------------------------------------------------- /playground_taggers.py: -------------------------------------------------------------------------------- 1 | #%% 2 | import nltk 3 | import text_utils as tu 4 | 5 | import sys 6 | import datetime as dt 7 | import os 8 | from pathlib import Path 9 | from tqdm import tqdm 10 | 11 | from collections import namedtuple, Counter 12 | import morfeusz2 13 | import stanza 14 | 15 | 16 | # import nltk 17 | # #nltk.download('punkt') 18 | # nltk.download() 19 | # only rus i en 20 | 21 | # for txt in sentences: 22 | # # txt = sentences[1] 23 | # tokens = nltk.word_tokenize(txt, "polish") 24 | # print(nltk.pos_tag(tokens, lang="pl")) 25 | 26 | InterpTuple = namedtuple( 27 | "InterpTuple", field_names=["word", "lemat", "tags", "info", "style"] 28 | ) 29 | 30 | sentences = [ 31 | "Krzysiek pije piwo o północy", 32 | "Krzysiek wypije piwo o północy", 33 | "Krzysiek będzie pił napój o północy", 34 | "Krzysiek będzie pić piwo o północy", 35 | "Pić to trzeba umieć", 36 | "Wolno mi wypić jedno piwo", 37 | "Winnam wypić piwo", 38 | "Za oknem słońce", 39 | "Będzie w domu", 40 | "Opublikowano w niedzielę", 41 | "sprawę zgłoszono do Sądu", 42 | "wniesiono wymaganą opłatę", 43 | "Picie piwa i jeżdżenie na rowerze to fajna rozrywka", 44 | "Obwieszczenie o wydaniu decyzji o ustaleniu lokalizacji inwestycji celu publicznego dla 
zamierzenia realizowanego w obrębach Serock i Brzeźno (gmina Pruszcz) oraz w obrębach Wudzyn i Wudzynek (gmina Dobrcz)", 45 | "Abstrakcje Kwiaty Dla dzieci Sport Owoce Człowiek Pojazdy Kuchnia Zwierzęta Martwa Natura Inne Architektura Widoki Drzewa Gory Prostokąty Kwadraty Panoramy Panoramy Slim Tryptyki Mid Tryptyki Tryptyki Wide 4 elementowe regular 4 elementowe 5 elementowe 7 elementowe Tryptyki High 9 elementowe Rosliny Bestsellery Ręcznie malowane!", 46 | "Wybierz temat Zamówienie kwerendy archiwalnej Zapytanie o papierową dokumentację projektową archiwaliów audiowizualnych Zapytanie o dostępne formaty i jakość kopii cyfrowych Zapytanie o dostępne wersje językowe materiałów audiowizualnych Sugestia zmiany w opisie materiału Inne", 47 | "w górach nad jeziorem w lesie nad morzem nad rzeką ---------------------------------------------- Karpaty Wzniesienia Łódzkie Trójmiasto Podkarpacie Równina Radomska Beskid Sądecki Bieszczady Gorce Pieniny Podhale Tatry Tatry Wschodnie Lubelszczyzna Polesie Lubelskie Pogórze Beskidzkie Wyżyna Lubelska Roztocze Tatry Zachodnie Pogórze Śląskie Pogórze Wielickie Małopolska Wyżyna Małopolska Niecka Nidziańska Pogórze Wiśnickie Kotlina Sandomierska Mazowsze Nizina Mazowiecka Puszcza Kampinoska Okolice Warszawy Pogórze Rożnowskie Pogórze Ciężkowickie Wyżyna Krakowsko-Częstochowska Pogórze Strzyżowskie Pogórze Dynowskie Pogórze Przemyskie Jura Krakowsko-Częstochowska Obniżenie Gorlickie Kotlina Jasielsko-Krośnieńska Kotlina Oświęcimska Pogórze Jasielskie Pogórze Bukowskie Płaskowyż Chyrowski Puszcza Niepołomicka Pomorze Brama Krakowska Pomorze Wschodnie Wybrzeże Bałtyckie Pojezierze Kujawskie Wyżyna Przedborska Wyżyna Kielecka Puszcza Sandomierska Dolina Nidy Góry Świętokrzyskie Puszcza Świętorzyska Pomorze Zachodnie Tatry Wysokie Pobrzeże Bałtyckie Roztocze Wschodnie Puszcza Solska Roztocze Zachodnie Roztocze Środkowe Wzniesienia Górowskie Podgórze Bocheńskie Płaskowyż Tarnowski Równina Ornecka Karpaty Zachodnie Karpaty Wschodnie Nizina Sępopolska Pobrzeże Gdańskie Półwysep Helski Mierzeja Helska Mierzeja Wiślana Beskidy Pobrzeże Kaszubskie Kotlina Rabczańska Kotlina Żywiecka Beskid Mały Beskid Makowski Beskid Wyspowy Beskid Żywiecki Kotlina Sądecka Beskid Śląski Równina Warmińska Nizina Nadwiślańska Wybrzeże Staropruskie Wysoczyzna Elbląska Żuławy Wiślane Wybrzeże Koszalińskie Wybrzeże Szczecińskie Nizina Szczecińska Wybrzeże Słowińskie Równina Białogardzka Równina Słupska Wysoczyzna Damnicka Wysoczyzna Żarnowiecka Pradolina Łeby i Redy Uznam i Wolin Wybrzeże Trzebiatowskie Równina Wkrzańska Dolina Dolnej Odry Równina Goleniowska Wzniesienia Szczecińskie Wzgórza Bukowe Równina Wełtyńska Równina Pyrzycko-Stargardzka Równina Nowogardzka Równina Gryficka Mazury Warmia Pojezierze Mazurskie Suwalszczyzna Puszcza Romincka Pojezierze Zachodniosuwalskie Pojezierze Wschodniosuwalskie Pojezierze Olsztyńskie Pojezierze Mrągowskie Kraina Wielkich Jezior Kraina Węgorapy Wzgórza Szeskie Pojezierze Ełckie Równina Mazurska Wielkopolska Nizina Wielkopolska Pojezierze Wielkopolskie Ziemia Lubuska Ziemia Kłodzka Puszcza Notecka Pojezierze Poznańskie Poznański Przełom Warty Pojezierze Chodzieskie Pojezierze Gnieźnieńskie Równina Inowrocławska Równina Wrzesińska Kujawy Podlasie Nizina Podlaska Puszcza Knyszyńska Puszcza Białowieska Puszcza Augustowska Śląsk Nizina Śląska Wyżyna Śląska Pradolina Warciańsko-Odrzańska Bory Stobrawskie Nizina Śląsko-Łużycka Sudety Wysoczyzna Rościsławska Pradolina Wrocławska Równina Wrocławska Dolina Nysy Kłodzkiej Równina Niemodlińska Równina 
Oleśnicka Równina Opolska Płaskowyż Głubczycki Kotlina Raciborska Bory Dolnośląskie Równina Szprotawska Wysoczyzna Lubińska Równina Legnicka Równina Chojnowska Dolina Środkowej Odry Kotlina Kargowska Kotlina Śremska Przedgórze Sudeckie Pogórze Zachodniosudeckie Sudety Zachodnie Sudety Środkowe Sudety Wschodnie Góry Złote Masyw Śnieżnika Góry Opawskie Brama Lubawska Góry Wałbrzyskie Góry Kamienne Góry Sowie Wyżyna Miechowska Góry Bardzkie Obniżenie Noworudzkie Obniżenie Scinawki Góry Stołowe Pogórze Orlickie Góry Orlickie Góry Bystrzyckie Kotlina Kłodzka Góry Izerskie Góry Kaczawskie Kotlina Jeleniogórska Karkonosze Rudawy Janowickie Pogórze Izerskie Pogórze Kaczawskie Pogórze Wałbrzyskie Pojezierze Pomorskie Pojezierze Zachodniopomorskie Pojezierze Wschodniopomorskie Pojezierze Południowopomorskie Dolina Dolnej Wisły Pojezierze Iławskie Pojezierze Lubuskie Pojezierze Leszczyńskie Lubuski Przełom Odry Pojezierze Łagowskie Równina Torzymska Dolina Kwidzyńska Kotlina Grudziądzka Dolina Fordońska Pojezierze Kaszubskie Pojezierze Starogardzkie Pojezierze Myśliborskie Pojezierze Choszczeńskie Pojezierze Ińskie Wysoczyzna Łobeska Pojezierze Drawskie Wysoczyzna Polanowska Pojezierze Bytowskie Równina Gorzowska Pojezierze Dobiegniewskie Równina Drawska Pojezierze Wałeckie Równina Wałecka Pojezierze Szczecińskie Równina Charzykowska Dolina Gwdy Pojezierze Krajeńskie Bory Tucholskie Dolina Brdy Wysoczyzna Świecka Pojezierze Chełmińskie Pojezierze Brodnickie Dolina Drwęcy Pojezierze Dobrzyńskie Kotlina Gorzowska Kotlina Toruńska Kotlina Płocka Dolina Noteci Kotlina Milicka Beskid Niski Dolina Baryczy Kaszuby Dolny Śląsk Zalew Wiślany Wysoczyzna Rawska Wyżyna Woźnicko-Wieluńska Puszcza Wkrzańska Puszcza Goleniowska Równina Łęczyńsko-Włodawska Puszcza Bukowa Puszcza Drawska Puszcza Gorzowska Puszcza Lubuska Puszcza Karpacka Puszcza Kozienicka Puszcza Pilicka Puszcza Biała Puszcza Bydgoska Puszcza Kurpiowska Puszcza Piska Puszcza Borecka Puszcza Nidzicka Kotlina Szczercowska", 48 | "UCHWAŁA NR VI/34/15 RADY GMINY BARTNICZKA z dnia 12 czerwca 2015 r. 
w sprawie określenia zasad udzielania oraz rozliczania dotacji celowych z budżetu Gminy Bartniczka na dofinansowanie realizacji przydomowych", 49 | "Wydarzenia i nowości Konstrukcje Publikacje Producenci Dachy skośne Publikacje Producenci Dachy płaskie Publikacje Producenci Pokrycia dachowe Pokrycia ceramiczne Publikacje Producenci Pokrycia cementowe Publikacje Producenci Pokrycia blaszane Publikacje Producenci Papy Publikacje Producenci Gonty bitumiczne Publikacje Producenci Łupek Publikacje Producenci Płyty dachowe Publikacje Producenci Inne Publikacje Producenci Dachy zielone Publikacje Producenci Dachy odwrócone Publikacje Producenci Okno w dachu Publikacje Producenci Folie dachowe Publikacje Producenci Ocieplenia dachów skośnych Publikacje Producenci Ocieplenia dachów płaskich Publikacje Producenci Akcesoria dachowe Publikacje Producenci Kominy Publikacje Producenci Rynny i odwodnienia Publikacje Producenci Ochrona odgromowa Publikacje Producenci Renowacja Publikacje Producenci Chemia budowlana Publikacje Producenci Maszyny i narzędzia Publikacje Producenci Obróbki blacharskie Publikacje Producenci Poddasza Publikacje Producenci Wentylacja dachów Publikacje Producenci Dom energooszczędny Publikacje Producenci Proekologiczne budowanie Publikacje Producenci Instrukcje Poradnik Publikacje Producenci Dylematy Inne TV Dachy Forum szkół Dla dekarzy Z życia PSD Szkolenia Budownictwo w statystykach BHP na budowie Rzeczoznawcy Organizacje branżowe Targi Wydawnictwa Konkursy i szkolenia Kontakt", 50 | "Woda i wilgoć, które mogą przenikać do wnętrza przegród nawet w obliczu niewielkich opadów deszczu i śniegu, stanowią jedno z największych zagrożeń stabilności i wytrzymałości konstrukcji dachów płaskich.", 51 | "Problem objawia się najczęściej w sezonie jesienno-zimowym i dotyczy głównie obiektów wielkopowierzchniowych, takich jak magazyny, hale produkcyjne czy centra logistyczne.", 52 | "O czym powinni pamiętać inżynierowie, projektując bezpieczny dach płaski?", 53 | "Dach z zerowym kątem nachylenia to gwarancja problemów eksploatacyjnych: mechanicznej degradacji materiału izolacyjnego, korozji stalowych blach i łączników mechanicznych, zmniejszonej efektywności energetycznej obiektu.", 54 | "Dlatego też warunkiem koniecznym jest uwzględnienie odpowiednich spadków dachu.", 55 | "W Polsce, w sezonie jesienno-zimowym okres zalegania śniegu może sięgać nawet kilku miesięcy.", 56 | "Woda, która przez ten czas nie jest dostatecznie szybko usuwana mechanicznie lub poprzez odparowanie, stanowi prawdziwą próbę zarówno dla szczelności, jak i wytrzymałości mechanicznej konstrukcji.", 57 | "Jak podkreśla Adam Buszko, ekspert firmy Paroc, nawet nieduże z pozoru błędy mogą o sobie szybko przypomnieć w postaci poważnych przecieków.", 58 | "– Nagromadzona wilgoć może dochodzić nawet do 10-20 milimetrów na metr kwadratowy, co odpowiada 10-20% objętości izolacji w zależności od jej grubości – wyjaśnia.", 59 | "Ryzyko problemów wzrasta zwłaszcza w okresie niskich temperatur, kiedy woda penetruje wszelkie szczeliny i ewentualne rozwarstwienia.", 60 | "– Cykle zamarzania i rozmarzania mogą prowadzić do powstawania nieszczelności w warstwie hydroizolacji oraz na jej połączeniach z innymi konstrukcjami – na przykład ścianami elewacji – dodaje ekspert Paroc.", 61 | "W przypadku dachów płaskich planowane spadki powinny wynosić minimum 2-3°.", 62 | "W wyjątkowych sytuacjach, gdy ze względów konstrukcyjnych spadki muszą wynosić mniej niż 2° (np. 
w zlewniach pogłębionych), należy podjąć odpowiednie działania w celu ograniczenia ryzyka wystąpienia zatoisk wody.", 63 | "W przypadku montażu sztywnych płyt styropianowych, nawet przy słabym wietrze wzrasta ryzyko albo niedogrzania połączenia na zakładach, albo w drugą stronę – do stopienia styropianu.", 64 | "Problem ten dość mocno rzutuje na zachowanie się wody na gotowym dachu, a często zależy tylko w minimalnym stopniu od umiejętności oraz doświadczenia wykonawcy.", 65 | "W przypadku zimnego zgrzewu, im mniejsze zachowamy spadki, tym większe ryzyko penetracji szpar pomiędzy warstwami papy przez wodę.", 66 | "Jeśli dojdzie zaś do wytopienia styropianu, w warstwie pokrycia wytworzą się zagłębienia, w których stać będzie woda, a papa na zakładach podlegać będzie intensywniejszym cyklom naprężeń.", 67 | "Z powyższych względów najlepiej stosować izolacje niepalne, takie jak wełna kamienna, w przypadku której nie występuje ryzyko stopienia materiału.", 68 | "Ze względu na naturalną paroprzepuszczalność oraz odporność na wnikanie wilgoci, produkty z wełny kamiennej sprawdzają się zarówno w przypadku konstrukcji dachów płaskich wentylowanych, jak i niewentylowanych.", 69 | "W pierwszym przypadku stosować można izolacyjne płyty dwuwarstwowe z wierzchnią warstwą utwardzoną lub dwie płyty: jedną twardą – jako płytę wierzchnią, drugą miękką i lżejszą – jako spodnią.", 70 | "Podobne rozwiązanie sprawdza się także w przypadku konstrukcji wentylowanych.", 71 | "Gotowe rozwiązania w tej dziedzinie dostarczają producenci izolacji.", 72 | "– W systemie PAROC Air spodnia płyta izolacyjna wyposażona jest w system rowków, które umożliwiają sprawny transport pary wodnej w kierunku wylotów – tłumaczy Adam Buszko.", 73 | "– Wierzchni arkusz z wełny kamiennej został zaś opracowany tak, by zapewniać trwałe, twarde i ognioodporne podłoże dla większości typów płaskich pokryć dachowych, a także dla izolacji warstwy nośnej w miejscach remontów – dodaje.", 74 | "Odpowiednio dobrana izolacja, w połączeniu właściwym rozmieszczeniem spadków i odwodnień, pozwala na sprawne odprowadzenie wilgoci z konstrukcji przez cały okres jej eksploatacji.", 75 | "Za optymalne przyjmuje się, że osuszanie połaci dachu na poziomie 0,5 kg wody/m2 na dobę, co skutecznie eliminuje zagrożenie gromadzenia się wilgoci – również na etapie budowy.", 76 | "Przedsiębiorstwo Badawczo-Wdrożeniowe Acrylmed dr Ludwika Własińska Sp. z o.o. 63-100 Śrem, ul. Mickiewicza 33", 77 | "Sąd Rejonowy Poznań-Nowe Miasto i Wilda w Poznaniu, IX Wydział Gospodarczy Krajowego Rejestru Sądowego", 78 | "Na naszych stronach internetowych stosujemy pliki cookies.", 79 | "Korzystając z naszych serwisów internetowych bez zmiany ustawień przeglądarki wyrażasz zgodę na stosowanie plików cookies zgodnie z Polityką prywatności.", 80 | "Na mocy uchwały Rady Powiatu Głogowskiego z dnia 28 marca 2007 r. 
Nr VI/54/2007 został utworzony zespół publicznych placówek kształcenia ustawicznego i praktycznego o nazwie Głogowskie Centrum Edukacji Zawodowej w Głogowie przy ulicy Piaskowej 1.", 81 | "Centrum Kształcenia Ustawicznego w Głogowie - ustawiczne kształcenie, dokształcanie i doskonalenie osób dorosłych.", 82 | "- dostrzeganie swojej niepowtarzalności, a także niepowtarzalności innych, szanowanie jej.", 83 | "- poznanie różnych środków transportu: lądowego, wodnego, powietrznego.", 84 | "- rozpoznawanie drzew po ich liściach i owocach, zbieranie owoców drzew, wzbogacanie nimi kącika przyrody: wykorzystywanie owoców w działalności plastycznej, technicznej, matematycznej, muzycznej oraz w inny, niestandardowy sposób.", 85 | "Czytanie bajki Kubuś Puchatek przez Wójta Gminy Wiśniew p. Krzysztofa Kryszczuka.", 86 | "PLAN PRACY DYDAKTYCZNEJ NA WRZESIEŃ 2016 W GRUPIE 6-latków Tygryski Tydzień I Przedszkole drugi dom Tydzień III Uliczne sygnały Tydzień II Przedszkole drugi dom Tydzień IV Jesień w lesie Treści programowe", 87 | "Dell OptiPlex 9020 Konfiguracja i funkcje komputera Informacja o ostrzeżeniach PRZESTROGA: Napis OSTRZEŻENIE informuje o sytuacjach, w których występuje ryzyko uszkodzenia sprzętu, obrażeń ciała lub śmierci.", 88 | "Zasadnicze cele konkursu to m.in. kształtowanie postaw patriotycznych młodzieży poprzez propagowanie i pogłębianie wiedzy o organizacji i działalności Służby Zwycięstwu Polski-Związku Walki Zbrojnej – Armii Krajowej oraz formacji poakowskich na terenie zamojskiego Inspektoratu Armii Krajowej, kultywowanie wartości, ideałów i postaw żołnierzy Polskiego Państwa Podziemnego, zainspirowanie i zachęcanie młodzieży do podjęcia samodzielnych badań nad historią swojej rodziny, środowiska związanego z miejscem zamieszkania", 89 | "Z kolei w imieniu organizatora konkursu głos zabrał Prezes ŚZŻAK Okręg Zamość Poseł Sławomir Zawiślak, który przedstawiając efekty tegorocznej, już IX edycji konkursu podziękował za pomoc w jego organizacji Patronom Honorowym, Dyrektorom szkół, nauczycielom, członkom związku w tym Weteranom AK – wszystkim tym, którzy od lat popierają inicjatywę Związku i przyczyniają się do uświetnienia konkursu.", 90 | "Na czwartkowym (13.06) kongresie pojawili się Marta Niewczas, podkarpacki pełnomocnik Europy Plus, poseł Janusz Palikot, europoseł Marek Siwiec, Robert Smucz, przewodniczący zarządu okręgu rzeszowsko-tarnobrzeskiego RP i wiceszef Europy Plus na Podkarpaciu oraz jeden z liderów stowarzyszenia Ordynacka Robert Kwiatkowski.", 91 | "ochrona ppoż-znaki uzupełniające prom.elektromag.-znaki ostrzeg.", 92 | "subst.chem.-znaki kateg.niebezp.", 93 | "taśma odradzająca z folii PE telefony alarmowe-tablice urz.elektryczne-znaki informac.", 94 | "urz.elektryczne-znaki ostrzeg.", 95 | "urz.elektryczne-znaki zakazu pozostałe znaki i tablice", 96 | "Wyświetl posty z ostatnich: Wszystkie Posty1 Dzień7 Dni2 Tygodnie1 Miesiąc3 Miesiące6 Miesięcy1 Rok Najpierw StarszeNajpierw Nowsze Zobacz poprzedni temat : Zobacz następny temat", 97 | "Na skróty:o nasatakiofftopz poza sojuszufunna wasze zyczenie zakladamy tematynie moge sie zarejestrowaccarna listaregulaminpakty", 98 | "3.1) Uprawnienia do wykonywania określonej działalności lub czynności, jeżeli przepisy prawa nakładają obowiązek ich posiadania Zamawiający uzna warunek za spełniony, jeżeli Wykonawca wykaże, iż posiada zezwolenie na wykonywanie działalności ubezpieczeniowej, o którym mowa w Ustawie z dnia 22 maja 2003 r. o działalności ubezpieczeniowej (tekst jednolity Dz. U. z 2013 r. poz. 950 z późn. 
zm.), a w przypadku gdy rozpoczął on działalność przed wejściem w życie Ustawy z dnia 28 lipca 1990 r. o działalności ubezpieczeniowej (Dz. U. Nr 59, poz. 344 ze zm.) zaświadczenie Ministra Finansów o posiadaniu zgody na wykonywanie działalności ubezpieczeniowej III.3.2) Wiedza i doświadczenie III.3.3) Potencjał techniczny III.3.4) Osoby zdolne do wykonania zamówienia", 99 | "Volvo Ocean Race (Wcześniej Whitbread Round the World) jest dla żeglarzy tym, czym dla wielbicieli motosportu 24-godzinny wyścig LeMans, a dla alpinistów zimowe ataki szczytowe.", 100 | "W 1973 r. na starcie Whitbread Round the World załogę Copernicusa stanowili: kapitan Zygfryd Perlicki, Zbigniew Puchalski, Bogdan Bogdziński, Ryszard Mackiewicz i Bronisław Tarnacki.", 101 | "Możliwość rozliczenia mieszkania w cenie i dopłaty reszty kwoty,", 102 | "Obecna na rynku od roku 1976, amerykańska firma ASP (Armament Systems and Procedures, Inc.) to prekursor, a obecnie też lider produkcji najwyższej jakości akcesoriów dla służb mundurowych.", 103 | "przyjmowania oświadczeń odstąpienia od zawartych umów sprzedaży na odległość, zgodnie z postanowieniami niniejszego Regulaminu oraz przepisami Rozdziału 4 ustawy z dnia 30 maja 2014 r. o prawach konsumenta, co stanowi prawnie uzasadniony interes Sprzedawcy (podstawa prawna przetwarzania danych: art. 6 ust. 1 lit. f RODO),", 104 | "Studenci kierunku Architektura Krajobrazu otrzymują przygotowanie z zakresu nauk przyrodniczych, rolniczych, technicznych i sztuk pięknych oraz umiejętności wykorzystania jej w pracy zawodowej z zachowaniem zasad prawnych i estetycznych.", 105 | "Poza tym wydział ten zajmuje się sprawami związanymi z realizacją zadań inwestycyjnych i remontowych, przygotowaniem i prowadzeniem procedur przetargowych na realizacją inwestycji i remontów, podejmowaniem działań w celu pozyskiwania zewnętrznych źródeł finansowania działalności inwestycyjnej, w tym opracowywaniem stosownych wniosków oraz koordynowaniem spraw związanych ze sprawozdawczością i rozliczaniem inwestycji.", 106 | "Biuro Promocji, Informacji i Rozwoju Powiatu prowadzi sprawy związane z promocją i rozwojem powiatu, współpracą z mediami, planowaniem strategicznym oraz koordynowaniem działań związanych ze współpracą zagraniczną.", 107 | "Do głównych zadań Biura Kadr i Płac należy opracowywanie zasad polityki kadrowej i zarządzanie kadrami oraz sporządzanie wykazów etatów i planów rozmieszczenia pracowników.", 108 | 109 | 110 | 111 | ] 112 | 113 | #%% test with morfeusz2 114 | 115 | morf_sent = tu.MorfeuszAnalyzer() 116 | 117 | morf_stanza = tu.StanzaAnalyzer() 118 | 119 | morf_krnnt = tu.KRNNTAnalyzer() 120 | 121 | for t, s in enumerate(sentences): 122 | 123 | is_valid1 = morf_sent.sentence_valid(s) 124 | 125 | is_valid2 = morf_stanza.sentence_valid(s) 126 | 127 | is_valid3 = morf_krnnt.sentence_valid(s) 128 | 129 | print(f"#####\n{s}\n morfeusz={is_valid1} stanza={is_valid2} krnnt={is_valid3}") 130 | 131 | 132 | 133 | #%% pos with flair 134 | from flair.data import Sentence 135 | from flair.models import SequenceTagger 136 | 137 | 138 | tagger = SequenceTagger.load("pos-multi") 139 | 140 | #%% 141 | 142 | 143 | sentence = sentences[0] 144 | print(f"\n>>>{sentence}") 145 | sent = Sentence(sentence) 146 | tagger.predict(sent) 147 | print(f"\n{sent.to_tagged_string()}") 148 | for t in sent.tokens: 149 | print(f"{t}- {t.get_tag('upos').value} {t.get_tag('upos').score}") 150 | 151 | 152 | conv_flair_get_pos = lambda x: x.get_tag("upos").value 153 | flair_ud_pos = list(map(conv_flair_get_pos, sent.tokens)) 
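# Summarise the universal POS (UPOS) tags predicted by the multilingual flair tagger:
# the Counter below gives a histogram of how often each tag occurs in the sentence.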
154 | stats_flair_pos = Counter(flair_ud_pos) 155 | 156 | print(stats_flair_pos) 157 | 158 | 159 | # %% sentence taggers 160 | 161 | # docker run -p 9003:9003 -it djstrong/krnnt:1.0.0 162 | 163 | 164 | import itertools 165 | 166 | flatten = itertools.chain.from_iterable 167 | # map words in sentence to list of pos 168 | conv_stanza_pos = lambda x: [w.pos for w in x.words] 169 | conv_stanza_xpos = lambda x: [w.xpos for w in x.words] 170 | 171 | 172 | stanza.download("pl") 173 | nlp = stanza.Pipeline( 174 | "pl", processors="tokenize,pos,lemma", verbose=False 175 | ) # initialize neural pipeline 176 | 177 | import requests, json 178 | 179 | url = "http://localhost:9003/?output_format=jsonl" 180 | # url = "http://localhost:9003/?input_format=lines&output_format=tsv" 181 | 182 | 183 | # get only tag 184 | conv_main_nkjp = lambda x: x[2].split(":")[0] 185 | conv_main_ud = lambda x: tu.get_main_ud_pos(x[2]) 186 | 187 | for s in sentences[0:]: 188 | 189 | print(f"\n>>>{s}\n sent len={len(s)}") 190 | #run krnnt tagger 191 | x = requests.post(url, data=s.encode("utf-8")) 192 | # print(x.status_code) 193 | # print(x.text) 194 | 195 | resp = x.json() 196 | list_nkjp_pos = list(map(conv_main_nkjp, resp[0])) 197 | krnnt_pos = list(map(conv_main_ud, resp[0])) 198 | 199 | stats_nkjp = Counter(list_nkjp_pos) 200 | stats_krnnt_ud = Counter(krnnt_pos) 201 | # print(f"NKJP tags stats={stats_nkjp}") 202 | print(f"krnnt UD tags stats={stats_krnnt_ud}") 203 | print(f'ud sequence={",".join(krnnt_pos)}') 204 | 205 | # run stanza tagger 206 | doc = nlp(s) # run annotation over a sentence 207 | 208 | # flatten if found many sentences 209 | stanza_pos = list(flatten(map(conv_stanza_pos, doc.sentences))) 210 | stanza_xpos = list(flatten(map(conv_stanza_xpos, doc.sentences))) 211 | stats_stanza_pos = Counter(stanza_pos) 212 | stats_stanza_xpos = Counter(stanza_xpos) 213 | print(f"stanza UD tags stats={stats_stanza_pos}") 214 | #print(f"stanza NKJP tags stats={stats_stanza_xpos}") 215 | print(f'stanza ud sequence={",".join(stanza_pos)}') 216 | #print(f'stanza NKJP sequence={",".join(stanza_xpos)}') 217 | 218 | 219 | # %% 220 | 221 | import spacy 222 | 223 | # nlp = spacy.load('pl_spacy_model') 224 | nlp = spacy.load("pl_spacy_model_morfeusz") 225 | 226 | # List the tokens including their lemmas and POS tags 227 | doc = nlp("Granice mojego języka oznaczają granice mojego świata") # ~Wittgenstein 228 | for token in doc: 229 | print(token.text, token.lemma_, token.tag_) 230 | 231 | 232 | # %% 233 | 234 | 235 | import spacy 236 | 237 | sp_nlp = spacy.load("pl_spacy_model_morfeusz", force=True) 238 | 239 | #%% 240 | 241 | desc = [ 242 | "Akcja powieści toczy się w Warszawie i Tworkach w czasach okupacji. Bohaterowie to młodzi kochankowie - Żydzi, którzy próbnują uciec przed historią i skryć się ze swoją miłością w zakładzie dla obłąkanych. To, co w powieści najważniejsze jednak, to nie fabuła, a raczej opis nastrojów bohaterów, oscylujących między melancholią a rozpaczą, nowatorskie podejście do tematu wojny, który staje się poniekąd drugorzędny w stosunku do prywatnych odczuć opisywanych postaci i wysublimowany styl.\\nNajlepsza i najczęściej nagradzana powieść Marka Bieńczyka! Książka wyróżniona Paszportem „Polityki” i Nagrodą im. Władysława Reymonta oraz nominacją do Nagrody Literackiej Nike!\\nNajlepsza i najczęściej nagradzana powieść  laureata Literackiej Nagrody Nike 2012.\nAkcja tej powieści rozgrywa się w roku 1943 w podwarszawskich Tworkach. 
Jej bohaterowie to grupka dwudziestolatków, dziewcząt i chłopców, Polaków i Żydów, którzy pracują w pozostającym pod niemieckim zarządem szpitalu psychiatrycznym. Okazuje się, że jedynym normalnym miejscem w nienormalnym świecie jest zakład dla obłąkanych. Znajdują tu azyl, dający nadzieję na w miarę spokojne i godne przetrwanie okupacji. W chwilach wolnych od zajęć młodzi bawią się, flirtują, spacerują po malowniczej okolicy, deklamują wiersze. Wydaje się, że piekło okupacji hitlerowskiej ich nie dotyczy. Ta beztroska nijak się ma do ponurych czasów, w których przyszło im żyć, a których grozę w pełni zdaje się początkowo odczuwać chyba tylko czytelnik Lecz ni stąd ni zowąd, po cichu i niezauważenie, znikają z kart powieści jej kolejni bohaterowie. Pozostają po nich tylko pożegnalne listy.", 243 | "Ta księga miała być zakazana\nHiszpania końca XVIII wieku. Dwóch akademików – bibliotekarz i admirał – wyrusza z Madrytu do Paryża z tajną misją. Ich cel: zdobyć egzemplarz słynnej francuskiej Encyklopedii. To dzieło, dla wielu wyklęte i heretyckie, potępione przez Kościół, zawiera wiedzę zarezerwowaną dla nielicznych i podważa dotychczasowy porządek świata. Awanturnicza eskapada pełna pojedynków, intryg i spisków zmienia się w walkę światła z ciemnością.\nKlimat osiemnastowiecznego Madrytu i Paryża, zakazana księga, nadciągająca rewolucja, historia przenikająca się ze współczesnością – Arturo Pérez-Reverte, autor bestsellerów, członek Hiszpańskiej Akademii Królewskiej, od pierwszej strony uwodzi czytelnika, zadając mu szelmowskie pytanie: „Gdzie kończy się fikcja, a zaczyna prawda?”.\n\nArturo Pérez-Reverte – akademik, który oczarował miliony. Jeden z najgłośniejszych pisarzy współczesnej literatury hiszpańskiej. Autor bestsellerowych powieści, m.in. Klubu Dumas (zekranizowanej przez Romana Polańskiego), Szachownicy flamandzkiej, Mężczyzny, który tańczył tango. Jego książki przełożono na niemal 30 języków.", 244 | 'POŻEGNANIE Z MIASTEM ANIOŁÓW\nTeresa Loudenberry, czyli "Toots", wykazuje wrodzony talent do wynajdywania przygód nawet jeżeli wcale ich nie szuka. Jednak od czasu, gdy Sophie namówiła przyjaciółki na regularne seansy spirytystyczne, życie w Los Angeles przybrało nieco zbyt dramatyczny obrót nawet jak na gusta Toots. Kiedy Ida otrzymuje wiadomość z zaświatów sugerującą, że jej zmarły mąż mógł paść ofiarą morderstwa, wystraszona Toots uznaje, iż to dobry moment, by cztery matki chrzestne opuściły Los Angeles i zamieniły je na jej dom w Charleston.\nTymczasem Mavis zachowuje się podejrzanie, wysyłając sterty paczek i nie chcąc wyjawić, co się za tym kryje. A przecież po tylu latach powinna już dobrze wiedzieć, że Ida, Toots i Sophie nigdy nie pozwolą, żeby jakaś tajemnica pozostała nierozwiązana, tak samo jak nie zignorują przyjaciółki w potrzebie. 
Zaś kiedy matki chrzestne odkrywają, że zabójca męża Idy dybie także na nią samą, biorą się za to, w czym są najlepsze - naradzają się, obmyślają śmiały plan i udowadniają światu, że nikt nie zdoła sprostać tym czterem niezwykłym przyjaciółkom.', 245 | ] 246 | 247 | 248 | stanza.download("pl") 249 | st_nlp = stanza.Pipeline( 250 | "pl", processors="tokenize,pos,lemma", verbose=False 251 | ) # initialize neural pipeline 252 | 253 | import requests, json 254 | 255 | url = "http://localhost:9003/?output_format=jsonl" 256 | # url = "http://localhost:9003/?input_format=lines&output_format=tsv" 257 | 258 | 259 | for s in desc[0:1]: 260 | 261 | s = s.replace("\n", " ").replace("\r", " ") 262 | print(f"\n>>>{s}\n") 263 | 264 | print("\nkrnnt model---------\n") 265 | 266 | start = dt.datetime.now() 267 | x = requests.post(url, data=s.encode("utf-8")) 268 | resp = x.json() 269 | end = dt.datetime.now() - start 270 | 271 | krnnt_tokens = [] 272 | for sent in resp: 273 | single_sent = "" 274 | for tok, lemm, pos in sent: 275 | single_sent += f"{tok}({lemm}) " 276 | krnnt_tokens.append((tok, lemm)) 277 | print(f"{single_sent}\n") 278 | 279 | print(f"\nkrnnt model takes={end}---------\n") 280 | 281 | print("\nstanza model----------\n") 282 | start = dt.datetime.now() 283 | st_doc = st_nlp(s) 284 | end = dt.datetime.now() - start 285 | 286 | 287 | stanza_tokens = [] 288 | for sent in st_doc.sentences: 289 | single_sent = "" 290 | for word in sent.words: 291 | single_sent += f"{word.text}({word.lemma}) " 292 | stanza_tokens.append((word.text, word.lemma)) 293 | print(f"{single_sent}\n") 294 | stanza_tokens.append((word.text, word.lemma)) 295 | print(f"\nstanza model takes={end}---------\n") 296 | 297 | print("\nspacy_pl model----------\n") 298 | start = dt.datetime.now() 299 | sp_doc = sp_nlp(s) 300 | end = dt.datetime.now() - start 301 | 302 | single_sent = "" 303 | 304 | spacy_tokens = [] 305 | for token in sp_doc: 306 | single_sent += f"{token.text}({token.lemma_}) " 307 | spacy_tokens.append((token.text, token.lemma_)) 308 | print(f"{single_sent}\n") 309 | 310 | print(f"\n spacy_pl model takes={end}---------\n") 311 | 312 | 313 | # %% 314 | -------------------------------------------------------------------------------- /text_utils.py: -------------------------------------------------------------------------------- 1 | """Text utils 2 | 3 | """ 4 | import json 5 | import requests 6 | from enum import Enum 7 | import stanza 8 | from abc import ABC, abstractmethod 9 | import logging 10 | import sys 11 | import datetime as dt 12 | import os 13 | import mmap 14 | from pathlib import Path 15 | from tqdm import tqdm 16 | 17 | from collections import namedtuple, Counter 18 | import itertools 19 | 20 | 21 | # import morfeusz2 22 | import nltk 23 | 24 | from langdetect import detect_langs 25 | 26 | from polyglot.detect import Detector 27 | 28 | 29 | import warnings 30 | warnings.filterwarnings('ignore') 31 | 32 | def get_num_lines(file_path): 33 | fp = open(file_path, "r+") 34 | buf = mmap.mmap(fp.fileno(), 0) 35 | lines = 0 36 | while buf.readline(): 37 | lines += 1 38 | return lines 39 | 40 | 41 | logging.basicConfig(level=logging.ERROR) 42 | # disable logging from polyglot 43 | # Detector is not able to detect the language reliably. 
44 | logger_poly = logging.getLogger("polyglot.detect.base:Detector") 45 | 46 | logger_poly.setLevel(level=logging.CRITICAL) 47 | logger_poly.propagate = False 48 | logger_poly.disabled = True 49 | 50 | 51 | def check_polish_sentence(sentence): 52 | """Returns true if sentence is written in polish 53 | Uses langdetect library and polyglot. 54 | 55 | """ 56 | 57 | # prevent error "input contains invalid UTF-8 around byte" 58 | text_to_detect = "".join(x for x in sentence if x.isprintable()) 59 | try: 60 | langs = detect_langs(text_to_detect) 61 | except Exception as e: 62 | # print(f"<{sentence}> - exp={e}") 63 | # if we cant detect the language, return True and try to deal with it later in the pipeline 64 | return True 65 | 66 | # if contains 'pl' with probability grateher then 0.4 67 | langdet_pl = any([(l.lang == "pl" and l.prob > 0.4) for l in langs]) 68 | 69 | detector = Detector(text_to_detect, quiet=True) 70 | poly_pl = detector.language.code == "pl" and detector.language.confidence > 40 71 | 72 | return langdet_pl or poly_pl 73 | 74 | 75 | def create_nltk_sentence_tokenizer(): 76 | """ 77 | 78 | find in vim with pattern: /\<\w\{2,3\}\>\.\n 79 | """ 80 | 81 | extra_abbreviations = [ 82 | "ps", 83 | "inc", 84 | "corp", 85 | "ltd", 86 | "Co", 87 | "pkt", 88 | "Dz.Ap", 89 | "Jr", 90 | "jr", 91 | "sp.k", 92 | "sp", 93 | # "Sp", 94 | "poj", 95 | "pseud", 96 | "krypt", 97 | "ws", 98 | "itd", 99 | "np", 100 | "sanskryt", 101 | "nr", 102 | "gł", 103 | "Takht", 104 | "tzw", 105 | "tzn", 106 | "t.zw", 107 | "ewan", 108 | "tyt", 109 | "fig", 110 | "oryg", 111 | "t.j", 112 | "vs", 113 | "l.mn", 114 | "l.poj", 115 | "ul", 116 | "al", 117 | "Al", 118 | "el", 119 | "tel", 120 | "wew", # wewnętrzny 121 | "bud", 122 | "pok", 123 | "wł", 124 | "sam", # samochód 125 | "sa", # spółka sa. 126 | "wit", # witaminy 127 | "mat", # materiały 128 | "kat", # kategorii 129 | "wg", # według 130 | "btw", # 131 | "itp", # 132 | "wz", # w związku 133 | "gosp", # 134 | "dział", # 135 | "hurt", # 136 | "mech", # 137 | "wyj", # wyj 138 | "pt", # pod tytułem 139 | "zew", # zewnętrzny 140 | ] 141 | 142 | position_abbrev = [ 143 | "Ks", 144 | "Abp", 145 | "abp", 146 | "bp", 147 | "dr", 148 | "kard", 149 | "mgr", 150 | "prof", 151 | "zwycz", 152 | "hab", 153 | "arch", 154 | "arch.kraj", 155 | "B.Sc", 156 | "Ph.D", 157 | "lek", 158 | "med", 159 | "n.med", 160 | "bł", 161 | "św", 162 | "hr", 163 | "dziek", 164 | ] 165 | 166 | roman_abbrev = ( 167 | [] 168 | ) # ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XII','XIV','XV','XVI', 'XVII', 'XVIII','XIX', 'XX', 'XXI' ] 169 | 170 | quantity_abbrev = [ 171 | "mln", 172 | "obr./min", 173 | "km/godz", 174 | "godz", 175 | "egz", 176 | "ha", 177 | "j.m", 178 | "cal", 179 | "obj", 180 | "alk", 181 | "wag", 182 | "obr", # obroty 183 | "wk", 184 | "mm", 185 | "MB", # mega bajty 186 | "Mb", # mega bity 187 | "jedn", # jednostkowe 188 | "op", 189 | "szt", # sztuk 190 | ] # not added: tys. 
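# Note: all of the abbreviation groups in this function are merged into
# `extra_abbreviations` below and registered with the NLTK Punkt tokenizer
# (abbrev_types), so a trailing dot after any of them is not treated as a sentence end.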
191 | 192 | actions_abbrev = [ 193 | "tłum", 194 | "tlum", 195 | "zob", 196 | "wym", 197 | "w/wym", 198 | "pot", 199 | "ww", 200 | "ogł", 201 | "wyd", 202 | "min", 203 | "m.i", 204 | "m.in", 205 | "in", 206 | "im", 207 | "muz", 208 | "tj", 209 | "dot", 210 | "wsp", 211 | "właść", 212 | "właśc", 213 | "przedr", 214 | "czyt", 215 | "proj", 216 | "dosł", 217 | "hist", 218 | "daw", 219 | "zwł", 220 | "zaw", 221 | "późn", 222 | "spr", 223 | "jw", 224 | "odp", # odpowiedź 225 | "symb", # symbol 226 | "klaw", # klawiaturowe 227 | ] 228 | 229 | place_abbrev = [ 230 | "śl", 231 | "płd", 232 | "geogr", 233 | "zs", 234 | "pom", # pomorskie 235 | "kuj-pom", # kujawsko pomorskie 236 | ] 237 | 238 | lang_abbrev = [ 239 | "jęz", 240 | "fr", 241 | "franc", 242 | "ukr", 243 | "ang", 244 | "gr", 245 | "hebr", 246 | "czes", 247 | "pol", 248 | "niem", 249 | "arab", 250 | "egip", 251 | "hiszp", 252 | "jap", 253 | "chin", 254 | "kor", 255 | "tyb", 256 | "wiet", 257 | "sum", 258 | "chor", 259 | "słow", 260 | "węg", 261 | "ros", 262 | "boś", 263 | "szw", 264 | ] 265 | 266 | administration = [ 267 | "dz.urz", # dziennik urzędowy 268 | "póź.zm", 269 | "rej", # rejestr, rejestracyjny dowód 270 | "sygn", # sygnatura 271 | "Dz.U", # dziennik ustaw 272 | "woj", # województow 273 | "ozn", # 274 | "ust", # ustawa 275 | "ref", # ref 276 | "dz", 277 | "akt", # akta 278 | ] 279 | 280 | time = [ 281 | "tyg", # tygodniu 282 | ] 283 | 284 | military_abbrev = [ 285 | "kpt", 286 | "kpr", 287 | "obs", 288 | "pil", 289 | "mjr", 290 | "płk", 291 | "dypl", 292 | "pp", 293 | "gw", 294 | "dyw", 295 | "bryg", # brygady 296 | "ppłk", 297 | "mar", 298 | "marsz", 299 | "rez", 300 | "ppor", 301 | "DPanc", 302 | "BPanc", 303 | "DKaw", 304 | "p.uł", 305 | "sierż", 306 | "post", 307 | "asp", 308 | "szt", # sztabowy 309 | "podinsp", 310 | "kom", # komendant, tel. komórka 311 | "nadkom", 312 | ] 313 | 314 | extra_abbreviations = ( 315 | extra_abbreviations 316 | + position_abbrev 317 | + quantity_abbrev 318 | + place_abbrev 319 | + actions_abbrev 320 | + lang_abbrev 321 | + administration 322 | + time 323 | + military_abbrev 324 | ) 325 | 326 | # create tokenizer with update abrev 327 | sentence_tokenizer = nltk.data.load("tokenizers/punkt/polish.pickle") 328 | # update abbrev 329 | sentence_tokenizer._params.abbrev_types.update(extra_abbreviations) 330 | 331 | return sentence_tokenizer 332 | 333 | 334 | # def sent_tokenizer(sentence): 335 | 336 | 337 | # return 338 | 339 | 340 | def corpus_process_sentence( 341 | corpus_input_file, 342 | courpus_output_file, 343 | split_each_line_as_doc, 344 | check_valid_sentence=False, 345 | check_lang_sentence=False, 346 | max_sentence_length=5000, 347 | krnnt_url="http://localhost:9003", 348 | ): 349 | """ 350 | Read corpus_input_file and save each sentence to new line of output file. 
Do some checks: 351 | if sentence is not to long, to short, if is written in polish, and if is valid polish sentence 352 | conaining verb 353 | 354 | :param corpus_input_file: path to corpus txt file 355 | :param output_file: path to output txt file where each sentence is in seperate line 356 | :param split_each_line_as_doc: determine if treat each line as separate document, if so add additional blank line after the end of the string, if set to false each document has already new blank line at the end 357 | :param check_valid_sentence: keep only valid sentence, those with verb, omit non valid sentences 358 | :param max_sentence_length: max sentence length in characters, omit sentece longer then 1500, high probability of non valid sentence 359 | """ 360 | 361 | total_lines = get_num_lines(corpus_input_file) 362 | sentence_tokenizer = create_nltk_sentence_tokenizer() 363 | 364 | # morf_sent = MorfeuszAnalyzer() 365 | # morf_sent = StanzaAnalyzer() 366 | morf_sent = KRNNTAnalyzer(krnnt_url) 367 | 368 | # statistics 369 | invalid_length_sentences = 0 370 | non_valid_sentences = 0 371 | all_sentences = 0 372 | non_polish = 0 373 | 374 | non_valid_sentences_list = [] 375 | non_polish_list = [] 376 | 377 | with open(courpus_output_file, "w+") as output_file: 378 | with open(corpus_input_file) as f: 379 | i = 0 380 | text = "" 381 | for line in tqdm(f, total=total_lines): 382 | 383 | # get block of text to new line which splits articles 384 | text += line 385 | 386 | i += 1 387 | if split_each_line_as_doc or line.strip() == "" or i % 100 == 0: 388 | # if split_each_line_as_doc is set then add new line after each line, if not then read file up to empty line (or max 100 lines) 389 | sentences = sentence_tokenizer.tokenize(text) 390 | 391 | file_content = "" 392 | for sentence in sentences: 393 | 394 | sentence = sentence.strip() 395 | sentence_length = len(sentence) 396 | 397 | all_sentences += 1 398 | 399 | if sentence_length < 4 or sentence_length > max_sentence_length: 400 | # omit to long and too short sentences 401 | invalid_length_sentences += 1 402 | continue 403 | 404 | if ( 405 | check_lang_sentence 406 | and sentence_length > 40 407 | and not check_polish_sentence(sentence) 408 | ): 409 | non_polish += 1 410 | non_polish_list.append(sentence) 411 | continue 412 | 413 | if ( 414 | check_valid_sentence 415 | and sentence_length > 60 416 | and not morf_sent.sentence_valid(sentence) 417 | ): 418 | 419 | # omit sentence if is not valid, we do not check short sentences 420 | non_valid_sentences += 1 421 | non_valid_sentences_list.append(sentence) 422 | 423 | continue 424 | 425 | file_content += sentence 426 | file_content += "\n" 427 | 428 | if file_content != "": 429 | 430 | output_file.write(file_content) 431 | output_file.write("\n") 432 | 433 | text = "" 434 | 435 | stats = { 436 | "lines": i, 437 | "all_sentences": all_sentences, 438 | "non_valid_sentences": non_valid_sentences, 439 | "invalid_length_sentences": invalid_length_sentences, 440 | "non_polish": non_polish, 441 | } 442 | 443 | return stats, non_valid_sentences_list, non_polish_list 444 | 445 | 446 | class MorfAnalyzer(ABC): 447 | def __init__(self): 448 | super(MorfAnalyzer, self).__init__() 449 | 450 | @abstractmethod 451 | def analyse(self, sentence): 452 | """Analyse the sentence and return morf tags""" 453 | 454 | @abstractmethod 455 | def sentence_valid(self, sentence): 456 | """Check if the passed txt is valid sentence, should contain min. 
one verb in proper form""" 457 | pass 458 | 459 | 460 | InterpTuple = namedtuple( 461 | "InterpTuple", field_names=["word", "lemat", "tags", "info", "style"] 462 | ) 463 | 464 | 465 | class MorfeuszAnalyzer(MorfAnalyzer): 466 | def __init__(self): 467 | super(MorfeuszAnalyzer, self).__init__() 468 | 469 | self._morfeusz = morfeusz2.Morfeusz(separate_numbering=True) 470 | self._verb_pattern = set( 471 | ["fin", "praet", "inf", "pred", "impt", "imps", "bedzie"] 472 | ) # 'ger', ppas 473 | 474 | def analyse(self, sentence): 475 | """Analyse the sentence and return morfeusz2 morf tags 476 | """ 477 | analysis = self._morfeusz.analyse(sentence) 478 | return analysis 479 | 480 | def sentence_valid(self, sentence): 481 | """Check if the passed txt is valid sentence, should contain min. one verb in proper form""" 482 | 483 | analysis = self.analyse(sentence) 484 | 485 | # evaluation is done lazy 486 | return any( 487 | InterpTuple(*a[2]).tags.split(":")[0] in self._verb_pattern 488 | for a in analysis 489 | ) 490 | 491 | 492 | class StanzaAnalyzer(MorfAnalyzer): 493 | def __init__(self): 494 | super(StanzaAnalyzer, self).__init__() 495 | 496 | stanza.download("pl") 497 | self._nlp_pipeline = stanza.Pipeline( 498 | "pl", processors="tokenize,pos,lemma", verbose=True, use_gpu=True 499 | ) # initialize neural pipeline 500 | 501 | self._conv_stanza_pos = lambda x: [w.pos for w in x.words] 502 | 503 | def analyse(self, sentence): 504 | """Analyse the sentence and return stanza pos tags 505 | """ 506 | return self._nlp_pipeline(sentence) 507 | 508 | def sentence_valid(self, sentence): 509 | """Check if the passed txt is valid sentence, should contain min. one verb in proper form""" 510 | 511 | doc = self.analyse(sentence) 512 | 513 | flatten = itertools.chain.from_iterable 514 | 515 | # get flatten list of tokens from all sentences tokenized by stanza 516 | # our sentence tokenization is different from stanza, very often our tokenized 517 | # sentence is treated as 2 or 3 sentences by stanza 518 | # map sentence word to pos tags and flatten all list 519 | stanza_pos = list(flatten(map(self._conv_stanza_pos, doc.sentences))) 520 | stats_stanza_pos = Counter(stanza_pos) 521 | 522 | # prosta heurystyka na bazie obserwacji 523 | # musi być min. 
1 VERB 524 | # 1 VERB - max_NOUN 7-10 NOUN 525 | # 2 VERB - max_noun+2 526 | # 3 verb - max_noun+4 itp 527 | 528 | verbs = stats_stanza_pos["VERB"] 529 | nouns = ( 530 | stats_stanza_pos["NOUN"] 531 | + stats_stanza_pos["PROPN"] 532 | + stats_stanza_pos["DET"] 533 | ) 534 | aux = stats_stanza_pos["AUX"] 535 | 536 | # aux can be treated in some sentences as sentence builder 537 | verbs = verbs + aux 538 | 539 | # max number of nouns coresponding to first verb 540 | max_noun = 12 541 | # additional nouns to additional verbs 542 | nouns_per_verb = 2 543 | 544 | if verbs < 1: 545 | # if sentence does not contain any verb then is not valid 546 | return False 547 | elif nouns <= max_noun + (verbs - 1) * nouns_per_verb: 548 | return True 549 | else: 550 | return False 551 | 552 | 553 | class KRNNTAnalyzer(MorfAnalyzer): 554 | """KRNNT POS analyzer 555 | 556 | Note: KRNNT service/server must be started 557 | docker run -p 9003:9003 -it djstrong/krnnt:1.0.0 558 | 559 | """ 560 | def __init__(self, url="http://localhost:9003"): 561 | super(KRNNTAnalyzer, self).__init__() 562 | 563 | 564 | self._url = f"{url}/?output_format=jsonl" 565 | 566 | self._conv_main_nkjp = lambda x: x[2].split(":")[0] 567 | self._conv_main_ud = lambda x: get_main_ud_pos(x[2]) 568 | 569 | def analyse(self, sentence): 570 | """Analyse the sentence and return nkjp pos tags 571 | """ 572 | try: 573 | x = requests.post(self._url, data=sentence.encode("utf-8")) 574 | resp = x.json() 575 | 576 | except json.decoder.JSONDecodeError: 577 | return None 578 | 579 | return resp 580 | 581 | def sentence_valid(self, sentence): 582 | """Check if the passed txt is valid sentence, should contain min. one verb in proper form""" 583 | 584 | resp = self.analyse(sentence) 585 | 586 | if resp is None: 587 | return False 588 | 589 | krnnt_pos = list(map(self._conv_main_ud, resp[0])) 590 | 591 | stats_krnnt_ud = Counter(krnnt_pos) 592 | 593 | # prosta heurystyka na bazie obserwacji 594 | # musi być min. 1 VERB 595 | # 1 VERB - max_NOUN 7-10 NOUN 596 | # 2 VERB - max_noun+2 597 | # 3 verb - max_noun+4 itp 598 | 599 | verbs = stats_krnnt_ud["VERB"] 600 | 601 | # nouns + unknown words + "uch, ech, psst itp" 602 | nouns = stats_krnnt_ud["NOUN"] + stats_krnnt_ud["X"] + stats_krnnt_ud["INTJ"] 603 | aux = stats_krnnt_ud["AUX"] 604 | 605 | # aux can be treated in some sentences as sentence builder 606 | verbs = verbs + aux 607 | 608 | # max number of nouns coresponding to first verb 609 | max_noun = 12 610 | # additional nouns to additional verbs 611 | nouns_per_verb = 2 612 | 613 | if verbs < 1: 614 | # if sentence does not contain any verb then is not valid 615 | return False 616 | elif nouns <= max_noun + (verbs - 1) * nouns_per_verb: 617 | return True 618 | else: 619 | return False 620 | 621 | 622 | # mapping from nkjp to ud tag set 623 | # https://gitlab.com/piotr.pezik/apt_pl/-/blob/master/translation.py 624 | 625 | # Licensed under the Apache License, Version 2.0 (the 'License'); 626 | # you may not use this file except in compliance with the License. 627 | # You may obtain a copy of the License at 628 | 629 | # http://www.apache.org/licenses/LICENSE-2.0 630 | 631 | # Unless required by applicable law or agreed to in writing, software 632 | # distributed under the License is distributed on an 'AS IS' BASIS, 633 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 634 | # See the License for the specific language governing permissions and 635 | # limitations under the License. 
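# Minimal usage sketch for the analyzers defined above, kept as comments so that
# importing this module stays side-effect free. It assumes the KRNNT docker service
# from the KRNNTAnalyzer docstring is already running on localhost:9003; the sample
# sentences and file paths are illustrative only:
#
#     analyzer = KRNNTAnalyzer("http://localhost:9003")
#     analyzer.sentence_valid("Ala ma kota i codziennie czyta książki.")  # expected: True (contains verbs)
#     analyzer.sentence_valid("Rok 1410, Grunwald, lipiec, bitwa.")       # expected: False (no verb)
#
#     stats, invalid_sentences, non_polish = corpus_process_sentence(
#         "data/corpus_raw.txt",
#         "data/corpus_sentences.txt",
#         split_each_line_as_doc=True,
#         check_valid_sentence=True,
#         check_lang_sentence=True,
#     )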
636 | 637 | D_FIELD = Enum( 638 | "D_FIELD", "flexemes cats special_lemmas special_words default POS FEATURES" 639 | ) 640 | 641 | 642 | nkjp_to_ud_dict = { 643 | D_FIELD.flexemes.name: { 644 | "ger": { 645 | D_FIELD.default.name: { 646 | D_FIELD.POS.name: "NOUN", 647 | D_FIELD.FEATURES.name: [("VerbForm", "Ger")], 648 | } 649 | }, 650 | "subst": { 651 | D_FIELD.special_lemmas.name: [ 652 | ( 653 | ["kto", "co"], # Kto i co są w subst-ach 654 | { 655 | D_FIELD.POS.name: "PRON", 656 | D_FIELD.FEATURES.name: [("PronType", "Int,Rel")], 657 | }, 658 | ), 659 | ( 660 | ["coś", "ktoś", "cokolwiek", "ktokolwiek"], 661 | { 662 | D_FIELD.POS.name: "PRON", 663 | D_FIELD.FEATURES.name: [("PronType", "Ind")], 664 | }, 665 | ), 666 | ( 667 | ["nikt", "nic"], # nikt i nic to subst 668 | { 669 | D_FIELD.POS.name: "PRON", 670 | D_FIELD.FEATURES.name: [("PronType", "Neg")], 671 | }, 672 | ), 673 | ( 674 | ["wszystko", "wszyscy"], 675 | { 676 | D_FIELD.POS.name: "PRON", 677 | D_FIELD.FEATURES.name: [("PronType", "Tot")], 678 | }, 679 | ), 680 | ( 681 | ["to"], 682 | { 683 | D_FIELD.POS.name: "PRON", 684 | D_FIELD.FEATURES.name: [("PronType", "Dem")], 685 | }, 686 | ), 687 | ], 688 | D_FIELD.default.name: { 689 | D_FIELD.POS.name: "NOUN", 690 | # D_FIELD.FEATURES.name: 691 | }, 692 | }, 693 | "pred": { 694 | D_FIELD.default.name: { 695 | D_FIELD.POS.name: "VERB", 696 | # D_FIELD.FEATURES.name: 697 | } 698 | }, 699 | "comp": { 700 | D_FIELD.default.name: { 701 | D_FIELD.POS.name: "SCONJ", 702 | # D_FIELD.FEATURES.name: 703 | } 704 | }, 705 | "interp": { 706 | D_FIELD.default.name: { 707 | D_FIELD.POS.name: "PUNCT" 708 | # D_FIELD.FEATURES.name: 709 | } 710 | }, 711 | "conj": { 712 | D_FIELD.default.name: { 713 | D_FIELD.POS.name: "CONJ", 714 | # D_FIELD.FEATURES.name: 715 | } 716 | }, 717 | "adv": { 718 | D_FIELD.default.name: { 719 | D_FIELD.POS.name: "ADV", 720 | # D_FIELD.FEATURES.name: 721 | } 722 | }, 723 | "aglt": { 724 | D_FIELD.special_lemmas.name: [ 725 | ( 726 | ["być"], # Kto i co są w subst-ach 727 | { 728 | D_FIELD.POS.name: "AUX", 729 | D_FIELD.FEATURES.name: [ 730 | ("Mood", "Ind"), 731 | ("Tense", "Pres"), 732 | ("VerbForm", "Fin"), 733 | ], 734 | }, 735 | ) 736 | ], 737 | D_FIELD.default.name: { 738 | D_FIELD.POS.name: "AUX", 739 | D_FIELD.FEATURES.name: [ 740 | ("PronType", "Prs"), 741 | ("Reflex", "Yes"), 742 | ("Mood", "Ind"), 743 | ("Tense", "Pres"), 744 | ("VerbForm", "Fin"), 745 | ], 746 | }, 747 | }, 748 | "bedzie": { 749 | D_FIELD.default.name: { 750 | D_FIELD.POS.name: "AUX", 751 | # D_FIELD.FEATURES.name: 752 | }, 753 | D_FIELD.special_words.name: [ 754 | ( 755 | ["będą", "będzie", "będę", "będziemy", "będziesz"], 756 | { 757 | D_FIELD.POS.name: "AUX", 758 | D_FIELD.FEATURES.name: [ 759 | ("Tense", "Fut"), 760 | ("Mood", "Ind"), 761 | ("VerbForm", "Fin"), 762 | ], 763 | }, 764 | ) 765 | ], 766 | }, 767 | "burk": { 768 | D_FIELD.default.name: { 769 | D_FIELD.POS.name: "NOUN", 770 | # D_FIELD.FEATURES.name: 771 | } 772 | }, 773 | "depr": { 774 | D_FIELD.default.name: { 775 | D_FIELD.POS.name: "NOUN", 776 | # D_FIELD.FEATURES.name: 777 | } 778 | }, 779 | "ign": { 780 | D_FIELD.default.name: { 781 | D_FIELD.POS.name: "X", 782 | # D_FIELD.FEATURES.name: 783 | } 784 | }, 785 | "dig": { 786 | D_FIELD.default.name: { 787 | D_FIELD.POS.name: "NUM", 788 | # D_FIELD.FEATURES.name: 789 | } 790 | }, 791 | "romandig": { 792 | D_FIELD.default.name: { 793 | D_FIELD.POS.name: "NUM", 794 | # D_FIELD.FEATURES.name: 795 | } 796 | }, 797 | "siebie": { 798 | D_FIELD.default.name: { 799 | 
D_FIELD.POS.name: "PRON", 800 | D_FIELD.FEATURES.name: [("PronType", "Prs"), ("Reflex", "Yes")], 801 | } 802 | }, 803 | "numcol": { 804 | D_FIELD.default.name: { 805 | D_FIELD.POS.name: "NUM", 806 | D_FIELD.FEATURES.name: [("NumType", "Sets")], 807 | } 808 | }, 809 | "winien": { 810 | D_FIELD.default.name: {D_FIELD.POS.name: "ADJ"}, 811 | D_FIELD.special_lemmas.name: [(["powinien"], {D_FIELD.POS.name: "ADJ",})], 812 | }, 813 | # 'adj':{ 814 | # D_FIELD.special_lemmas.name: [ 815 | # ( 816 | # ['jaki', 'jakiś', 'żaden', 'wszystek', 'niejaki', 'który', 'taki', 'niektóry', 'którykolwiek', 'któryś', 817 | # 'ten', 'jakikolwiek', 'tamten', 'każdy', 'wszelki', 'ów'], 818 | # { 819 | # D_FIELD.POS.name: 'DET', 820 | # D_FIELD.FEATURES.name: [('PronType', 'Ind')] 821 | # }) 822 | # ], 823 | # D_FIELD.default.name:{ 824 | # D_FIELD.POS.name:'ADJ' 825 | # } 826 | # }, 827 | "xxx": {D_FIELD.default.name: {D_FIELD.POS.name: "X"}}, 828 | "interj": {D_FIELD.default.name: {D_FIELD.POS.name: "INTJ"}}, 829 | "adj": { 830 | D_FIELD.special_lemmas.name: [ 831 | ( 832 | ["wszystek", "wszyscy", "każdy", "wszelki"], 833 | { 834 | D_FIELD.POS.name: "DET", 835 | D_FIELD.FEATURES.name: [("PronType", "Tot")], 836 | }, 837 | ), 838 | ( 839 | ["jaki", "który"], # Kto i co są w subst-ach 840 | { 841 | D_FIELD.POS.name: "DET", 842 | D_FIELD.FEATURES.name: [("PronType", "Int,Rel")], 843 | }, 844 | ), 845 | ( 846 | ["to", "ten", "taki", "tamten", "ów"], 847 | { 848 | D_FIELD.POS.name: "DET", 849 | D_FIELD.FEATURES.name: [("PronType", "Dem")], 850 | }, 851 | ), 852 | ( 853 | [ 854 | "jakiś", 855 | "kilka", 856 | "kilkadziesiąt", 857 | "kilkaset", 858 | "niektóry", 859 | "któryś", 860 | "jakikolwiek", 861 | "niejaki", 862 | "którykolwiek", 863 | ], # kilkanaście jest w num, coś/ktoś w subst 864 | { 865 | D_FIELD.POS.name: "DET", 866 | D_FIELD.FEATURES.name: [("PronType", "Ind")], 867 | }, 868 | ), 869 | ( 870 | ["żaden"], # nikt i nic to subst 871 | { 872 | D_FIELD.POS.name: "DET", 873 | D_FIELD.FEATURES.name: [("PronType", "Neg")], 874 | }, 875 | ), 876 | ], 877 | D_FIELD.default.name: {D_FIELD.POS.name: "ADJ"}, 878 | }, 879 | "adjc": { 880 | D_FIELD.default.name: {D_FIELD.POS.name: "ADJ"}, 881 | D_FIELD.special_words.name: [ 882 | ( 883 | [ 884 | "winien", 885 | "gotów", 886 | "pewien", 887 | "ciekaw", 888 | "wart", 889 | "pełen", 890 | "świadom", 891 | "pewnien", 892 | "godzien", 893 | "łaskaw", 894 | "znan", 895 | "rad", 896 | "wesół", 897 | "zdrów", 898 | ], 899 | {D_FIELD.POS.name: "ADJ"}, 900 | ) 901 | ], 902 | }, 903 | "qub": { 904 | D_FIELD.special_lemmas.name: [ 905 | ( 906 | ["się"], 907 | { 908 | D_FIELD.POS.name: "PRON", 909 | D_FIELD.FEATURES.name: [("PronType", "Prs"), ("Reflex", "Yes")], 910 | }, 911 | ) 912 | ], 913 | D_FIELD.special_words.name: [ 914 | ( 915 | ["sie", "sia"], 916 | { 917 | D_FIELD.POS.name: "PRON", 918 | D_FIELD.FEATURES.name: [ 919 | ("PronType", "Prs"), 920 | ("Reflex", "Yes"), 921 | ("Typo", "Yes"), 922 | ], 923 | }, 924 | ), 925 | ( 926 | ["by"], 927 | { 928 | D_FIELD.POS.name: "AUX", 929 | D_FIELD.FEATURES.name: [ 930 | ("VerbForm", "Fin"), 931 | ("Mood", "Cnd"), 932 | ("Aspect", "Imp"), 933 | ], 934 | }, 935 | ), 936 | ], 937 | D_FIELD.default.name: {D_FIELD.POS.name: "PART"}, 938 | }, 939 | "adja": { 940 | D_FIELD.default.name: { 941 | D_FIELD.POS.name: "ADJ", 942 | D_FIELD.FEATURES.name: [("Hyph", "Yes")], 943 | } 944 | }, 945 | "prep": { 946 | D_FIELD.default.name: { 947 | D_FIELD.POS.name: "ADP", 948 | D_FIELD.FEATURES.name: [("AdpType", "Prep")], 949 | } 950 | }, 951 | "praet": 
{ 952 | D_FIELD.default.name: { 953 | D_FIELD.POS.name: "VERB", 954 | D_FIELD.FEATURES.name: [ 955 | ("Tense", "Past"), 956 | ("VerbForm", "Part"), 957 | ("Voice", "Act"), 958 | ], 959 | } 960 | }, 961 | "pact": { 962 | D_FIELD.default.name: { 963 | D_FIELD.POS.name: "VERB", 964 | D_FIELD.FEATURES.name: [ 965 | ("VerbForm", "Part"), 966 | ("Voice", "Act"), 967 | ("Tense", "Pres"), 968 | ], 969 | } 970 | }, 971 | "pant": { 972 | D_FIELD.default.name: { 973 | D_FIELD.POS.name: "VERB", 974 | D_FIELD.FEATURES.name: [("Tense", "Past"), ("VerbForm", "Trans")], 975 | } 976 | }, 977 | "pcon": { 978 | D_FIELD.default.name: { 979 | D_FIELD.POS.name: "VERB", 980 | D_FIELD.FEATURES.name: [("Tense", "Pres"), ("VerbForm", "Trans")], 981 | } 982 | }, 983 | "ppas": { 984 | D_FIELD.default.name: { 985 | D_FIELD.POS.name: "VERB", 986 | D_FIELD.FEATURES.name: [("VerbForm", "Part"), ("Voice", "Pass")], 987 | } 988 | }, 989 | "num": { 990 | D_FIELD.special_lemmas.name: [ 991 | ( 992 | ["kilkanaście", "kilka", "kilkadziesiąt", "kilkaset"], 993 | { 994 | D_FIELD.POS.name: "DET", 995 | D_FIELD.FEATURES.name: [ 996 | ("PronType", "Ind"), 997 | ("NumType", "Card"), 998 | ], 999 | }, 1000 | ) 1001 | ], 1002 | D_FIELD.default.name: { 1003 | D_FIELD.POS.name: "NUM", 1004 | D_FIELD.FEATURES.name: [ 1005 | # ('NumType', 'Sets') 1006 | ], 1007 | }, 1008 | }, 1009 | "brev": { 1010 | D_FIELD.default.name: { 1011 | D_FIELD.POS.name: "X", 1012 | # D_FIELD.FEATURES.name: [ 1013 | # ('Abbr', 'Yes') 1014 | # ] 1015 | } 1016 | }, 1017 | "adjp": { 1018 | D_FIELD.default.name: { 1019 | D_FIELD.POS.name: "ADJ", 1020 | D_FIELD.FEATURES.name: [("PrepCase", "Pre")], 1021 | } 1022 | }, 1023 | "fin": { 1024 | D_FIELD.default.name: { 1025 | D_FIELD.POS.name: "VERB", 1026 | D_FIELD.FEATURES.name: { 1027 | ("VerbForm", "Fin"), 1028 | ("Tense", "Pres"), 1029 | ("Mood", "Ind"), 1030 | }, 1031 | } 1032 | }, 1033 | "ppron12": { 1034 | D_FIELD.default.name: { 1035 | D_FIELD.POS.name: "PRON", 1036 | D_FIELD.FEATURES.name: {("PronType", "Prs")}, 1037 | } 1038 | }, 1039 | "ppron3": { 1040 | D_FIELD.default.name: { 1041 | D_FIELD.POS.name: "PRON", 1042 | D_FIELD.FEATURES.name: {("PronType", "Prs")}, 1043 | } 1044 | }, 1045 | "inf": { 1046 | D_FIELD.default.name: { 1047 | D_FIELD.POS.name: "VERB", 1048 | D_FIELD.FEATURES.name: {("VerbForm", "Inf")}, 1049 | } 1050 | }, 1051 | # 'num':{ 1052 | # #D_FIELD.special_lemmas.name :None, 1053 | # #D_FIELD.special_words.name :None, 1054 | # D_FIELD.default.name:{ 1055 | # D_FIELD.POS.name:'NUM' 1056 | # #,D_FIELD.FEATURES.name: 1057 | # } 1058 | # }, 1059 | "impt": { 1060 | D_FIELD.default.name: { 1061 | D_FIELD.POS.name: "VERB", 1062 | D_FIELD.FEATURES.name: [("Mood", "Imp"), ("VerbForm", "Fin")], 1063 | } 1064 | }, 1065 | "imps": { 1066 | D_FIELD.default.name: { 1067 | D_FIELD.POS.name: "VERB", 1068 | D_FIELD.FEATURES.name: [ 1069 | ("Case", "Nom"), 1070 | ("Gender", "Neut"), 1071 | ("Negative", "Pos"), 1072 | ("Number", "Sing"), 1073 | ("VerbForm", "Part"), 1074 | ("Voice", "Pass"), 1075 | ], 1076 | } 1077 | }, 1078 | }, 1079 | D_FIELD.cats.name: { 1080 | "pl": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Number", "Plur")}}}, 1081 | "pun": {D_FIELD.default.name: {D_FIELD.FEATURES.name: [("Abbr", "Yes")]}}, 1082 | "npun": {D_FIELD.default.name: {D_FIELD.FEATURES.name: [("Abbr", "Yes")]}}, 1083 | "acc": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Case", "Acc")}}}, 1084 | "nakc": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Variant", "Short")}}}, 1085 | "voc": {D_FIELD.default.name: 
{D_FIELD.FEATURES.name: {("Case", "Voc")}}}, 1086 | # 'ppron12': { 1087 | # D_FIELD.default.name: { 1088 | # D_FIELD.FEATURES.name: { 1089 | # ('PronType', 'Prs') 1090 | # } 1091 | # } 1092 | # }, 1093 | "m1": { 1094 | D_FIELD.default.name: { 1095 | D_FIELD.FEATURES.name: {("Animacy", "Hum"), ("Gender", "Masc")} 1096 | } 1097 | }, 1098 | "m2": { 1099 | D_FIELD.default.name: { 1100 | D_FIELD.FEATURES.name: {("Animacy", "Anim"), ("Gender", "Masc")} 1101 | } 1102 | }, 1103 | "m3": { 1104 | D_FIELD.default.name: { 1105 | D_FIELD.FEATURES.name: {("Animacy", "Inan"), ("Gender", "Masc")} 1106 | } 1107 | }, 1108 | "rec": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {}}}, 1109 | "nagl": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {}}}, 1110 | "agl": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {}}}, 1111 | "congr": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {}}}, 1112 | "praep": {D_FIELD.default.name: {D_FIELD.FEATURES.name: [("PrepCase", "Pre")]}}, 1113 | "_": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {}}}, 1114 | "aff": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Polarity", "Pos")}}}, 1115 | "com": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Degree", "Cmp")}}}, 1116 | # 'com': { 1117 | # D_FIELD.default.name: { 1118 | # D_FIELD.FEATURES.name: {} 1119 | # } 1120 | # }, 1121 | "perf": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Aspect", "Perf")}}}, 1122 | "imperf": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Aspect", "Imp")}}}, 1123 | "sg": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Number", "Sing")}}}, 1124 | "gen": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Case", "Gen")}}}, 1125 | "nom": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Case", "Nom")}}}, 1126 | "pos": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Degree", "Pos")}}}, 1127 | "akc": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Variant", "Long")}}}, 1128 | "f": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Gender", "Fem")}}}, 1129 | "dat": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Case", "Dat")}}}, 1130 | "inst": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Case", "Ins")}}}, 1131 | "loc": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Case", "Loc")}}}, 1132 | "neg": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Polarity", "Neg")}}}, 1133 | "npraep": { 1134 | D_FIELD.default.name: {D_FIELD.FEATURES.name: {("PrepCase", "Npr")}} 1135 | }, 1136 | "nwok": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Variant", "Short")}}}, 1137 | "wok": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Variant", "Long")}}}, 1138 | "vok": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Variant", "Long")}}}, 1139 | "xxx": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Foreign", "Yes")}}}, 1140 | "pri": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Person", "1")}}}, 1141 | "sec": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Person", "2")}}}, 1142 | "ter": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Person", "3")}}}, 1143 | "sup": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Degree", "Sup")}}}, 1144 | "n": {D_FIELD.default.name: {D_FIELD.FEATURES.name: {("Gender", "Neut")}}}, 1145 | }, 1146 | } 1147 | 1148 | 1149 | def get_main_ud_pos(nkjp_tag): 1150 | main_nkjp_tag = nkjp_tag.split(":")[0] 1151 | try: 1152 | return nkjp_to_ud_dict[D_FIELD.flexemes.name][main_nkjp_tag].get( 1153 | D_FIELD.default.name 1154 | )[D_FIELD.POS.name] 1155 | except: 1156 | return main_nkjp_tag 1157 | 
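# Small self-check of the NKJP -> UD mapping above; the tags are typical NKJP main tags
# and the expected values follow nkjp_to_ud_dict. It runs only when this module is
# executed directly, so importing text_utils is unaffected.
if __name__ == "__main__":
    for nkjp_tag in ["subst:sg:nom:m1", "praet:sg:m1:perf", "fin:sg:ter:imperf", "interp", "unknown:tag"]:
        print(f"{nkjp_tag} -> {get_main_ud_pos(nkjp_tag)}")
    # expected output: NOUN, VERB, VERB, PUNCT and the unmapped main tag "unknown"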
--------------------------------------------------------------------------------