├── .dockerignore ├── .github └── workflows │ ├── pypi.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── docker ├── slovnet-morph-bert │ ├── Dockerfile │ ├── Makefile │ └── app.py ├── slovnet-morph │ ├── exec │ │ ├── Dockerfile │ │ ├── Makefile │ │ └── app.py │ └── torch │ │ ├── Dockerfile │ │ ├── Makefile │ │ └── app.py ├── slovnet-ner-bert │ ├── Dockerfile │ ├── Makefile │ └── app.py ├── slovnet-ner │ ├── exec │ │ ├── Dockerfile │ │ ├── Makefile │ │ └── app.py │ └── torch │ │ ├── Dockerfile │ │ ├── Makefile │ │ └── app.py ├── slovnet-syntax-bert │ ├── Dockerfile │ ├── Makefile │ └── app.py └── slovnet-syntax │ ├── exec │ ├── Dockerfile │ ├── Makefile │ └── app.py │ └── torch │ ├── Dockerfile │ ├── Makefile │ └── app.py ├── requirements ├── app.txt ├── ci.txt ├── dev.txt └── gpu.txt ├── scripts ├── 01_bert_news │ ├── data.ipynb │ ├── main.ipynb │ └── main.py ├── 02_bert_ner │ ├── data.ipynb │ ├── grid.ipynb │ ├── infer.ipynb │ ├── main.ipynb │ └── main.py ├── 03_bert_morph │ ├── data.ipynb │ ├── grid.ipynb │ ├── main.ipynb │ └── main.py ├── 04_bert_syntax │ ├── data.ipynb │ ├── grid.ipynb │ ├── main.ipynb │ └── main.py ├── 05_ner │ ├── data.ipynb │ ├── grid.ipynb │ ├── main.ipynb │ ├── main.py │ └── pack.ipynb ├── 06_morph │ ├── data.ipynb │ ├── grid.ipynb │ ├── main.ipynb │ ├── main.py │ └── pack.ipynb ├── 07_syntax │ ├── data.ipynb │ ├── grid.ipynb │ ├── main.ipynb │ ├── main.py │ └── pack.ipynb ├── README.md └── slovnet.json ├── setup.cfg ├── setup.py ├── slovnet ├── __init__.py ├── api.py ├── batch.py ├── bert.py ├── bio.py ├── board.py ├── chop.py ├── conll.py ├── const.py ├── encoders │ ├── __init__.py │ ├── bert.py │ ├── buffer.py │ ├── common.py │ ├── syntax.py │ └── tag.py ├── exec │ ├── __init__.py │ ├── encoders.py │ ├── infer.py │ ├── mask.py │ ├── model.py │ ├── pack.py │ └── pad.py ├── infer │ ├── __init__.py │ ├── base.py │ ├── bert.py │ ├── syntax.py │ └── tag.py ├── io.py ├── log.py ├── loss.py ├── markup.py ├── mask.py ├── model │ ├── __init__.py │ ├── base.py │ ├── bert.py │ ├── cnn.py │ ├── crf.py │ ├── emb.py │ ├── exec.py │ ├── state.py │ ├── syntax.py │ └── tag.py ├── pad.py ├── record.py ├── s3.py ├── score.py ├── sent.py ├── shape.py ├── span.py ├── tar.py ├── token.py ├── visitor.py └── vocab.py └── tests ├── test_api.py ├── test_bio.py └── test_shape.py /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !README.md 3 | !setup.py 4 | !requirements 5 | !slovnet 6 | !docker -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish PyPi 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Python 15 | uses: actions/setup-python@v4 16 | with: 17 | python-version: '3.10' 18 | - name: Install dependencies 19 | run: pip install wheel 20 | - name: Build package 21 | run: python setup.py sdist bdist_wheel 22 | - name: Publish PyPI 23 | uses: pypa/gh-action-pypi-publish@release/v1 24 | with: 25 | password: ${{ secrets.PYPI_API_TOKEN }} 26 | 27 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: 
ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ['3.8', '3.9', '3.10', '3.11'] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | 21 | - name: Install dependencies 22 | run: | 23 | pip install -r requirements/ci.txt 24 | pip install -e . 25 | 26 | - name: Test 27 | run: make test 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .cache/ 3 | .coverage 4 | .vscode 5 | coverage.xml 6 | *.egg-info/ 7 | *.pyc 8 | __pycache__/ 9 | .ipynb_checkpoints/ 10 | .pytest_cache/ 11 | build/ 12 | dist/ 13 | notes/ 14 | data/ 15 | bert/ 16 | model/ 17 | navec/ 18 | rubert/ 19 | events.out.tfevents.* 20 | slovnet_ner_news_v1.tar 21 | slovnet_ner_custom_tags.tar 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include * * 2 | prune .git 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | test: 3 | flake8 slovnet tests 4 | pytest -vv tests 5 | -------------------------------------------------------------------------------- /docker/slovnet-morph-bert/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime 2 | 3 | RUN S3=https://storage.yandexcloud.net/natasha-slovnet \ 4 | && curl -O $S3/01_bert_news/rubert/vocab.txt \ 5 | && curl -O $S3/03_bert_morph/model/tags_vocab.txt \ 6 | && curl -O $S3/01_bert_news/model/emb.pt \ 7 | && curl -O $S3/03_bert_morph/model/encoder.pt \ 8 | && curl -O $S3/03_bert_morph/model/morph.pt 9 | 10 | COPY requirements/app.txt requirements.txt 11 | RUN pip install -r requirements.txt 12 | 13 | COPY . . 14 | RUN pip install -e . 
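# app.py below reads the file names (WORDS_VOCAB, TAGS_VOCAB, EMB, ENCODER, MORPH) and the
# runtime settings (DEVICE, SEQ_LEN, BATCH_SIZE, HOST, PORT, MAX_SIZE) from environment
# variables; the files fetched by the curl step above match its defaults.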
15 | 16 | CMD python docker/slovnet-morph-bert/app.py -------------------------------------------------------------------------------- /docker/slovnet-morph-bert/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-morph-bert 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-morph-bert/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e DEVICE=cpu \ 17 | -e BATCH_SIZE=4 \ 18 | $(IMAGE) 19 | 20 | -------------------------------------------------------------------------------- /docker/slovnet-morph-bert/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | import torch 14 | torch.set_grad_enabled(False) 15 | 16 | from slovnet.const import CUDA0 17 | from slovnet.vocab import ( 18 | BERTVocab, 19 | Vocab 20 | ) 21 | from slovnet.model.bert import ( 22 | RuBERTConfig, 23 | BERTEmbedding, 24 | BERTEncoder, 25 | BERTMorphHead, 26 | BERTMorph 27 | ) 28 | from slovnet.encoders.bert import BERTInferEncoder 29 | from slovnet.infer.bert import BERTMorphInfer, BERTTagDecoder 30 | 31 | 32 | WORDS_VOCAB = getenv('WORDS_VOCAB', 'vocab.txt') 33 | TAGS_VOCAB = getenv('TAGS_VOCAB', 'tags_vocab.txt') 34 | EMB = getenv('EMB', 'emb.pt') 35 | ENCODER = getenv('ENCODER', 'encoder.pt') 36 | MORPH = getenv('MORPH', 'morph.pt') 37 | 38 | DEVICE = getenv('DEVICE', CUDA0) 39 | SEQ_LEN = int(getenv('SEQ_LEN', 256)) 40 | BATCH_SIZE = int(getenv('BATCH_SIZE', 64)) 41 | 42 | HOST = getenv('HOST', '0.0.0.0') 43 | PORT = int(getenv('PORT', 8080)) 44 | MB = 1024 * 1024 45 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 46 | 47 | 48 | log('Load vocabs: %r, %r' % (WORDS_VOCAB, TAGS_VOCAB)) 49 | words_vocab = BERTVocab.load(WORDS_VOCAB) 50 | tags_vocab = Vocab.load(TAGS_VOCAB) 51 | 52 | config = RuBERTConfig() 53 | emb = BERTEmbedding.from_config(config) 54 | encoder = BERTEncoder.from_config(config) 55 | morph = BERTMorphHead(config.emb_dim, len(tags_vocab)) 56 | model = BERTMorph(emb, encoder, morph) 57 | model.eval() 58 | 59 | log('Load emb: %r' % EMB) 60 | model.emb.load(EMB) 61 | log('Load encoder: %r' % ENCODER) 62 | model.encoder.load(ENCODER) 63 | log('Load morph: %r' % MORPH) 64 | model.head.load(MORPH) 65 | log('Device: %r' % DEVICE) 66 | model = model.to(DEVICE) 67 | 68 | log('Seq len: %r' % SEQ_LEN) 69 | log('Batch size: %r' % BATCH_SIZE) 70 | encoder = BERTInferEncoder( 71 | words_vocab, 72 | seq_len=SEQ_LEN, batch_size=BATCH_SIZE 73 | ) 74 | decoder = BERTTagDecoder(tags_vocab) 75 | infer = BERTMorphInfer(model, encoder, decoder) 76 | 77 | 78 | async def handle(request): 79 | chunk = await request.json() 80 | log('Post chunk size: %r' % len(chunk)) 81 | markups = list(infer(chunk)) 82 | 83 | tokens = sum(len(_.tokens) for _ in markups) 84 | log('Infer tokens: %r', tokens) 85 | 86 | data = [_.as_json for _ in markups] 87 | return web.json_response(data) 88 | 89 | 90 | log('Max size: %r' % (MAX_SIZE // MB)) 91 | app = web.Application(client_max_size=MAX_SIZE) 92 | app.add_routes([web.post('/', handle)]) 93 | 94 | web.run_app(app, host=HOST, port=PORT) 95 | -------------------------------------------------------------------------------- 
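The slovnet-morph-bert service defined above accepts a POST request whose body is a JSON list and replies with the inferred markups serialized through as_json. Below is a minimal client sketch, assuming the container is running locally via the Makefile's run target (port 8080); the payload shape (pre-tokenized sentences, one list of words per item) is an assumption for illustration only, the exact input format is defined by BERTInferEncoder.

    import json
    from urllib.request import Request, urlopen

    # One item per sentence; pre-tokenized words are assumed here, the encoder
    # in slovnet.encoders.bert defines the real expected shape.
    chunk = [['Опять', 'выпал', 'снег', '.']]

    request = Request(
        'http://localhost:8080/',  # port published by `make run`
        data=json.dumps(chunk).encode('utf8'),
        headers={'Content-Type': 'application/json'}
    )
    with urlopen(request) as response:
        markups = json.load(response)  # one markup dict per input item

    for markup in markups:
        print(markup)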
/docker/slovnet-morph/exec/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | RUN curl https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar -o navec.tar \ 4 | && curl https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_morph_news_v1.tar -o pack.tar 5 | 6 | RUN pip install aiohttp==3.6.1 7 | 8 | COPY requirements/main.txt requirements.txt 9 | RUN pip install -r requirements.txt 10 | 11 | COPY . . 12 | RUN pip install -e . 13 | 14 | CMD python docker/slovnet-morph/exec/app.py 15 | -------------------------------------------------------------------------------- /docker/slovnet-morph/exec/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-morph 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-morph/exec/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e BATCH_SIZE=4 \ 17 | $(IMAGE) 18 | 19 | -------------------------------------------------------------------------------- /docker/slovnet-morph/exec/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | from navec import Navec 14 | from slovnet import Morph 15 | 16 | 17 | NAVEC = getenv('NAVEC', 'navec.tar') 18 | PACK = getenv('PACK', 'pack.tar') 19 | BATCH_SIZE = int(getenv('BATCH_SIZE', 8)) 20 | 21 | HOST = getenv('HOST', '0.0.0.0') 22 | PORT = int(getenv('PORT', 8080)) 23 | MB = 1024 * 1024 24 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 25 | 26 | 27 | log('Load navec: %r' % NAVEC) 28 | navec = Navec.load(NAVEC) 29 | 30 | log('Load pack: %r' % PACK) 31 | log('Batch size: %r' % BATCH_SIZE) 32 | morph = Morph.load(PACK) 33 | morph.navec(navec) 34 | 35 | 36 | async def handle(request): 37 | chunk = await request.json() 38 | log('Post chunk size: %r' % len(chunk)) 39 | markups = list(morph.map(chunk)) 40 | 41 | tags = sum(len(_.tags) for _ in markups) 42 | log('Infer tags: %r', tags) 43 | 44 | data = [_.as_json for _ in markups] 45 | return web.json_response(data) 46 | 47 | 48 | log('Max size: %r' % (MAX_SIZE // MB)) 49 | app = web.Application(client_max_size=MAX_SIZE) 50 | app.add_routes([web.post('/', handle)]) 51 | 52 | web.run_app(app, host=HOST, port=PORT) 53 | -------------------------------------------------------------------------------- /docker/slovnet-morph/torch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime 2 | 3 | RUN S3=https://storage.yandexcloud.net/ \ 4 | && curl -O $S3/natasha-slovnet/06_morph/model/shape.pt \ 5 | && curl -O $S3/natasha-slovnet/06_morph/model/encoder.pt \ 6 | && curl -O $S3/natasha-slovnet/06_morph/model/morph.pt \ 7 | && curl -O $S3/natasha-slovnet/06_morph/model/tags_vocab.txt \ 8 | && curl -L $S3/natasha-navec/navec_news_v1_1B_250K_300d_100q.tar > navec.tar 9 | 10 | COPY requirements/app.txt requirements.txt 11 | RUN pip install -r requirements.txt 12 | 13 | COPY . . 14 | RUN pip install -e . 
15 | 16 | CMD python docker/slovnet-morph/torch/app.py 17 | -------------------------------------------------------------------------------- /docker/slovnet-morph/torch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-morph 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-morph/torch/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e DEVICE=cpu \ 17 | -e BATCH_SIZE=4 \ 18 | $(IMAGE) 19 | 20 | -------------------------------------------------------------------------------- /docker/slovnet-morph/torch/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | import torch 14 | torch.set_grad_enabled(False) 15 | 16 | from navec import Navec 17 | 18 | from slovnet.const import PAD, CUDA0 19 | from slovnet.shape import SHAPES 20 | from slovnet.vocab import Vocab 21 | from slovnet.model.emb import ( 22 | NavecEmbedding, 23 | Embedding 24 | ) 25 | from slovnet.model.tag import ( 26 | TagEmbedding, 27 | TagEncoder, 28 | MorphHead, 29 | Morph 30 | ) 31 | from slovnet.encoders.tag import TagInferEncoder 32 | from slovnet.infer.tag import MorphInfer, TagDecoder 33 | 34 | 35 | NAVEC = getenv('NAVEC', 'navec.tar') 36 | SHAPE = getenv('SHAPE', 'shape.pt') 37 | ENCODER = getenv('ENCODER', 'encoder.pt') 38 | MORPH = getenv('MORPH', 'morph.pt') 39 | TAGS_VOCAB = getenv('TAGS_VOCAB', 'tags_vocab.txt') 40 | 41 | SHAPE_DIM = 30 42 | LAYER_DIMS = [256, 128, 64] 43 | KERNEL_SIZE = 3 44 | 45 | DEVICE = getenv('DEVICE', CUDA0) 46 | BATCH_SIZE = int(getenv('BATCH_SIZE', 64)) 47 | 48 | HOST = getenv('HOST', '0.0.0.0') 49 | PORT = int(getenv('PORT', 8080)) 50 | MB = 1024 * 1024 51 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 52 | 53 | 54 | log('Load navec: %r' % NAVEC) 55 | navec = Navec.load(NAVEC) 56 | 57 | words_vocab = Vocab(navec.vocab.words) 58 | shapes_vocab = Vocab([PAD] + SHAPES) 59 | tags_vocab = Vocab.load(TAGS_VOCAB) 60 | 61 | word = NavecEmbedding(navec) 62 | shape = Embedding( 63 | vocab_size=len(shapes_vocab), 64 | dim=SHAPE_DIM, 65 | pad_id=shapes_vocab.pad_id 66 | ) 67 | emb = TagEmbedding(word, shape) 68 | encoder = TagEncoder( 69 | input_dim=emb.dim, 70 | layer_dims=LAYER_DIMS, 71 | kernel_size=KERNEL_SIZE, 72 | ) 73 | morph = MorphHead(encoder.dim, len(tags_vocab)) 74 | model = Morph(emb, encoder, morph) 75 | model.eval() 76 | 77 | log('Load shape: %r' % SHAPE) 78 | model.emb.shape.load(SHAPE) 79 | log('Load encoder: %r' % ENCODER) 80 | model.encoder.load(ENCODER) 81 | log('Load morph: %r' % MORPH) 82 | model.head.load(MORPH) 83 | log('Device: %r' % DEVICE) 84 | model = model.to(DEVICE) 85 | 86 | log('Batch size: %r' % BATCH_SIZE) 87 | encoder = TagInferEncoder( 88 | words_vocab, shapes_vocab, 89 | batch_size=BATCH_SIZE 90 | ) 91 | decoder = TagDecoder(tags_vocab) 92 | infer = MorphInfer(model, encoder, decoder) 93 | 94 | 95 | async def handle(request): 96 | chunk = await request.json() 97 | log('Post chunk size: %r' % len(chunk)) 98 | markups = list(infer(chunk)) 99 | 100 | tags = sum(len(_.tags) for _ in markups) 101 | log('Infer tags: %r', tags) 102 | 103 | data = [_.as_json for _ in markups] 104 | return web.json_response(data) 105 | 106 | 107 
| log('Max size: %r' % (MAX_SIZE // MB)) 108 | app = web.Application(client_max_size=MAX_SIZE) 109 | app.add_routes([web.post('/', handle)]) 110 | 111 | web.run_app(app, host=HOST, port=PORT) 112 | -------------------------------------------------------------------------------- /docker/slovnet-ner-bert/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime 2 | 3 | RUN S3=https://storage.yandexcloud.net/natasha-slovnet \ 4 | && curl -O $S3/01_bert_news/rubert/vocab.txt \ 5 | && curl -O $S3/01_bert_news/model/emb.pt \ 6 | && curl -O $S3/02_bert_ner/model/encoder.pt \ 7 | && curl -O $S3/02_bert_ner/model/ner.pt 8 | 9 | COPY requirements/app.txt requirements.txt 10 | RUN pip install -r requirements.txt 11 | 12 | COPY . . 13 | RUN pip install -e . 14 | 15 | CMD python docker/slovnet-ner-bert/app.py 16 | -------------------------------------------------------------------------------- /docker/slovnet-ner-bert/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-ner-bert 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-ner-bert/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e DEVICE=cpu \ 17 | -e BATCH_SIZE=4 \ 18 | $(IMAGE) 19 | 20 | -------------------------------------------------------------------------------- /docker/slovnet-ner-bert/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | import torch 14 | torch.set_grad_enabled(False) 15 | 16 | from slovnet.const import PER, LOC, ORG, CUDA0 17 | from slovnet.vocab import ( 18 | BERTVocab, 19 | BIOTagsVocab 20 | ) 21 | from slovnet.model.bert import ( 22 | RuBERTConfig, 23 | BERTEmbedding, 24 | BERTEncoder, 25 | BERTNERHead, 26 | BERTNER 27 | ) 28 | from slovnet.encoders.bert import BERTInferEncoder 29 | from slovnet.infer.bert import BERTNERInfer, BERTTagDecoder 30 | 31 | 32 | WORDS_VOCAB = getenv('WORDS_VOCAB', 'vocab.txt') 33 | EMB = getenv('EMB', 'emb.pt') 34 | ENCODER = getenv('ENCODER', 'encoder.pt') 35 | NER = getenv('NER', 'ner.pt') 36 | 37 | DEVICE = getenv('DEVICE', CUDA0) 38 | SEQ_LEN = int(getenv('SEQ_LEN', 256)) 39 | BATCH_SIZE = int(getenv('BATCH_SIZE', 64)) 40 | 41 | HOST = getenv('HOST', '0.0.0.0') 42 | PORT = int(getenv('PORT', 8080)) 43 | MB = 1024 * 1024 44 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 45 | 46 | 47 | log('Load words vocab: %r' % WORDS_VOCAB) 48 | words_vocab = BERTVocab.load(WORDS_VOCAB) 49 | tags_vocab = BIOTagsVocab([PER, LOC, ORG]) 50 | 51 | config = RuBERTConfig() 52 | emb = BERTEmbedding.from_config(config) 53 | encoder = BERTEncoder.from_config(config) 54 | ner = BERTNERHead(config.emb_dim, len(tags_vocab)) 55 | model = BERTNER(emb, encoder, ner) 56 | model.eval() 57 | 58 | log('Load emb: %r' % EMB) 59 | model.emb.load(EMB) 60 | log('Load encoder: %r' % ENCODER) 61 | model.encoder.load(ENCODER) 62 | log('Load ner: %r' % NER) 63 | model.head.load(NER) 64 | log('Device: %r' % DEVICE) 65 | model = model.to(DEVICE) 66 | 67 | log('Seq len: %r' % SEQ_LEN) 68 | log('Batch size: %r' % BATCH_SIZE) 69 | encoder = BERTInferEncoder( 70 | words_vocab, 71 | seq_len=SEQ_LEN, 
batch_size=BATCH_SIZE 72 | ) 73 | decoder = BERTTagDecoder(tags_vocab) 74 | infer = BERTNERInfer(model, encoder, decoder) 75 | 76 | 77 | async def handle(request): 78 | chunk = await request.json() 79 | log('Post chunk size: %r' % len(chunk)) 80 | markups = list(infer(chunk)) 81 | 82 | spans = sum(len(_.spans) for _ in markups) 83 | log('Infer spans: %r', spans) 84 | 85 | data = [_.as_json for _ in markups] 86 | return web.json_response(data) 87 | 88 | 89 | log('Max size: %r' % (MAX_SIZE // MB)) 90 | app = web.Application(client_max_size=MAX_SIZE) 91 | app.add_routes([web.post('/', handle)]) 92 | 93 | web.run_app(app, host=HOST, port=PORT) 94 | -------------------------------------------------------------------------------- /docker/slovnet-ner/exec/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | RUN curl https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar -o navec.tar \ 4 | && curl https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_ner_news_v1.tar -o pack.tar 5 | 6 | RUN pip install aiohttp==3.6.1 7 | 8 | COPY requirements/main.txt requirements.txt 9 | RUN pip install -r requirements.txt 10 | 11 | COPY . . 12 | RUN pip install -e . 13 | 14 | CMD python docker/slovnet-ner/exec/app.py 15 | -------------------------------------------------------------------------------- /docker/slovnet-ner/exec/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-ner 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-ner/exec/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e BATCH_SIZE=4 \ 17 | $(IMAGE) 18 | 19 | -------------------------------------------------------------------------------- /docker/slovnet-ner/exec/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | from navec import Navec 14 | from slovnet import NER 15 | 16 | 17 | NAVEC = getenv('NAVEC', 'navec.tar') 18 | PACK = getenv('PACK', 'pack.tar') 19 | BATCH_SIZE = int(getenv('BATCH_SIZE', 8)) 20 | 21 | HOST = getenv('HOST', '0.0.0.0') 22 | PORT = int(getenv('PORT', 8080)) 23 | MB = 1024 * 1024 24 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 25 | 26 | log('Load navec: %r' % NAVEC) 27 | navec = Navec.load(NAVEC) 28 | 29 | log('Load pack: %r' % PACK) 30 | log('Batch size: %r' % BATCH_SIZE) 31 | ner = NER.load(PACK) 32 | ner.navec(navec) 33 | 34 | 35 | async def handle(request): 36 | chunk = await request.json() 37 | log('Post chunk size: %r' % len(chunk)) 38 | markups = list(ner.map(chunk)) 39 | 40 | spans = sum(len(_.spans) for _ in markups) 41 | log('Infer spans: %r', spans) 42 | 43 | data = [_.as_json for _ in markups] 44 | return web.json_response(data) 45 | 46 | 47 | log('Max size: %r' % (MAX_SIZE // MB)) 48 | app = web.Application(client_max_size=MAX_SIZE) 49 | app.add_routes([web.post('/', handle)]) 50 | 51 | web.run_app(app, host=HOST, port=PORT) 52 | -------------------------------------------------------------------------------- /docker/slovnet-ner/torch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime 2 | 3 | RUN S3=https://storage.yandexcloud.net/ \ 4 | && curl -O $S3/natasha-slovnet/05_ner/model/shape.pt \ 5 | && curl -O $S3/natasha-slovnet/05_ner/model/encoder.pt \ 6 | && curl -O $S3/natasha-slovnet/05_ner/model/ner.pt \ 7 | && curl -L $S3/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar > navec.tar 8 | 9 | COPY requirements/app.txt requirements.txt 10 | RUN pip install -r requirements.txt 11 | 12 | COPY . . 13 | RUN pip install -e . 14 | 15 | CMD python docker/slovnet-ner/torch/app.py 16 | -------------------------------------------------------------------------------- /docker/slovnet-ner/torch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-ner 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-ner/torch/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e DEVICE=cpu \ 17 | -e BATCH_SIZE=4 \ 18 | $(IMAGE) 19 | 20 | -------------------------------------------------------------------------------- /docker/slovnet-ner/torch/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | import torch 14 | torch.set_grad_enabled(False) 15 | 16 | from navec import Navec 17 | 18 | from slovnet.const import ( 19 | PER, LOC, ORG, 20 | PAD, 21 | CUDA0 22 | ) 23 | from slovnet.shape import SHAPES 24 | from slovnet.vocab import ( 25 | Vocab, 26 | BIOTagsVocab 27 | ) 28 | from slovnet.model.emb import ( 29 | NavecEmbedding, 30 | Embedding 31 | ) 32 | from slovnet.model.tag import ( 33 | TagEmbedding, 34 | TagEncoder, 35 | NERHead, 36 | NER as NERModel 37 | ) 38 | from slovnet.encoders.tag import TagInferEncoder 39 | from slovnet.infer.tag import NERInfer, TagDecoder 40 | 41 | 42 | NAVEC = getenv('NAVEC', 'navec.tar') 43 | SHAPE = getenv('SHAPE', 'shape.pt') 44 | ENCODER = getenv('ENCODER', 'encoder.pt') 45 | NER = getenv('NER', 'ner.pt') 46 | 47 | SHAPE_DIM = 30 48 | LAYER_DIMS = [256, 128, 64] 49 | KERNEL_SIZE = 3 50 | 51 | DEVICE = getenv('DEVICE', CUDA0) 52 | BATCH_SIZE = int(getenv('BATCH_SIZE', 64)) 53 | 54 | HOST = getenv('HOST', '0.0.0.0') 55 | PORT = int(getenv('PORT', 8080)) 56 | MB = 1024 * 1024 57 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 58 | 59 | 60 | log('Load navec: %r' % NAVEC) 61 | navec = Navec.load(NAVEC) 62 | 63 | words_vocab = Vocab(navec.vocab.words) 64 | shapes_vocab = Vocab([PAD] + SHAPES) 65 | tags_vocab = BIOTagsVocab([PER, LOC, ORG]) 66 | 67 | word = NavecEmbedding(navec) 68 | shape = Embedding( 69 | vocab_size=len(shapes_vocab), 70 | dim=SHAPE_DIM, 71 | pad_id=shapes_vocab.pad_id 72 | ) 73 | emb = TagEmbedding(word, shape) 74 | encoder = TagEncoder( 75 | input_dim=emb.dim, 76 | layer_dims=LAYER_DIMS, 77 | kernel_size=KERNEL_SIZE, 78 | ) 79 | ner = NERHead(encoder.dim, len(tags_vocab)) 80 | model = NERModel(emb, encoder, ner) 81 | model.eval() 82 | 83 | log('Load shape: %r' % SHAPE) 84 | model.emb.shape.load(SHAPE) 85 | log('Load encoder: %r' % ENCODER) 86 | model.encoder.load(ENCODER) 87 | log('Load ner: %r' % NER) 88 | model.head.load(NER) 89 | log('Device: %r' % DEVICE) 90 | model = model.to(DEVICE) 91 | 92 | log('Batch size: %r' % BATCH_SIZE) 93 | encoder = 
TagInferEncoder( 94 | words_vocab, shapes_vocab, 95 | batch_size=BATCH_SIZE 96 | ) 97 | decoder = TagDecoder(tags_vocab) 98 | infer = NERInfer(model, encoder, decoder) 99 | 100 | 101 | async def handle(request): 102 | chunk = await request.json() 103 | log('Post chunk size: %r' % len(chunk)) 104 | markups = list(infer(chunk)) 105 | 106 | spans = sum(len(_.spans) for _ in markups) 107 | log('Infer spans: %r', spans) 108 | 109 | data = [_.as_json for _ in markups] 110 | return web.json_response(data) 111 | 112 | 113 | log('Max size: %r' % (MAX_SIZE // MB)) 114 | app = web.Application(client_max_size=MAX_SIZE) 115 | app.add_routes([web.post('/', handle)]) 116 | 117 | web.run_app(app, host=HOST, port=PORT) 118 | -------------------------------------------------------------------------------- /docker/slovnet-syntax-bert/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime 2 | 3 | RUN S3=https://storage.yandexcloud.net/natasha-slovnet \ 4 | && curl -O $S3/01_bert_news/rubert/vocab.txt \ 5 | && curl -O $S3/04_bert_syntax/model/rels_vocab.txt \ 6 | && curl -O $S3/01_bert_news/model/emb.pt \ 7 | && curl -O $S3/04_bert_syntax/model/encoder.pt \ 8 | && curl -O $S3/04_bert_syntax/model/head.pt \ 9 | && curl -O $S3/04_bert_syntax/model/rel.pt 10 | 11 | COPY requirements/app.txt requirements.txt 12 | RUN pip install -r requirements.txt 13 | 14 | COPY . . 15 | RUN pip install -e . 16 | 17 | CMD python docker/slovnet-syntax-bert/app.py -------------------------------------------------------------------------------- /docker/slovnet-syntax-bert/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-syntax-bert 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-syntax-bert/Dockerfile . 
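# The build runs from the repo root ($(ROOT)) so that the Dockerfile's COPY steps can reach
# requirements/ and the slovnet package itself.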
9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e DEVICE=cpu \ 17 | -e BATCH_SIZE=4 \ 18 | $(IMAGE) 19 | 20 | -------------------------------------------------------------------------------- /docker/slovnet-syntax-bert/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | import torch 14 | torch.set_grad_enabled(False) 15 | 16 | from slovnet.const import CUDA0 17 | from slovnet.vocab import ( 18 | BERTVocab, 19 | Vocab 20 | ) 21 | from slovnet.model.bert import ( 22 | RuBERTConfig, 23 | BERTEmbedding, 24 | BERTEncoder, 25 | BERTSyntaxHead, 26 | BERTSyntaxRel, 27 | BERTSyntax 28 | ) 29 | from slovnet.encoders.bert import BERTInferEncoder 30 | from slovnet.infer.bert import BERTSyntaxInfer, BERTSyntaxDecoder 31 | 32 | 33 | WORDS_VOCAB = getenv('WORDS_VOCAB', 'vocab.txt') 34 | RELS_VOCAB = getenv('RELS_VOCAB', 'rels_vocab.txt') 35 | EMB = getenv('EMB', 'emb.pt') 36 | ENCODER = getenv('ENCODER', 'encoder.pt') 37 | HEAD = getenv('HEAD', 'head.pt') 38 | REL = getenv('REL', 'rel.pt') 39 | 40 | DEVICE = getenv('DEVICE', CUDA0) 41 | SEQ_LEN = int(getenv('SEQ_LEN', 512)) 42 | BATCH_SIZE = int(getenv('BATCH_SIZE', 32)) 43 | 44 | HOST = getenv('HOST', '0.0.0.0') 45 | PORT = int(getenv('PORT', 8080)) 46 | MB = 1024 * 1024 47 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 48 | 49 | 50 | log('Load vocabs: %r, %r' % (WORDS_VOCAB, RELS_VOCAB)) 51 | words_vocab = BERTVocab.load(WORDS_VOCAB) 52 | rels_vocab = Vocab.load(RELS_VOCAB) 53 | 54 | config = RuBERTConfig() 55 | emb = BERTEmbedding.from_config(config) 56 | encoder = BERTEncoder.from_config(config) 57 | head = BERTSyntaxHead( 58 | input_dim=config.emb_dim, 59 | hidden_dim=config.emb_dim // 2, 60 | ) 61 | rel = BERTSyntaxRel( 62 | input_dim=config.emb_dim, 63 | hidden_dim=config.emb_dim // 2, 64 | rel_dim=len(rels_vocab) 65 | ) 66 | model = BERTSyntax(emb, encoder, head, rel) 67 | model.eval() 68 | 69 | log('Load emb: %r' % EMB) 70 | model.emb.load(EMB) 71 | log('Load encoder: %r' % ENCODER) 72 | model.encoder.load(ENCODER) 73 | log('Load head, rel: %r, %r' % (HEAD, REL)) 74 | model.head.load(HEAD) 75 | model.rel.load(REL) 76 | log('Device: %r' % DEVICE) 77 | model = model.to(DEVICE) 78 | 79 | log('Seq len: %r' % SEQ_LEN) 80 | log('Batch size: %r' % BATCH_SIZE) 81 | encoder = BERTInferEncoder( 82 | words_vocab, 83 | seq_len=SEQ_LEN, batch_size=BATCH_SIZE 84 | ) 85 | decoder = BERTSyntaxDecoder(rels_vocab) 86 | infer = BERTSyntaxInfer(model, encoder, decoder) 87 | 88 | 89 | async def handle(request): 90 | chunk = await request.json() 91 | log('Post chunk size: %r' % len(chunk)) 92 | markups = list(infer(chunk)) 93 | 94 | tokens = sum(len(_.tokens) for _ in markups) 95 | log('Infer tokens: %r', tokens) 96 | 97 | data = [_.as_json for _ in markups] 98 | return web.json_response(data) 99 | 100 | 101 | log('Max size: %r' % (MAX_SIZE // MB)) 102 | app = web.Application(client_max_size=MAX_SIZE) 103 | app.add_routes([web.post('/', handle)]) 104 | 105 | web.run_app(app, host=HOST, port=PORT) 106 | -------------------------------------------------------------------------------- /docker/slovnet-syntax/exec/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | RUN curl 
https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar -o navec.tar \ 4 | && curl https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_syntax_news_v1.tar -o pack.tar 5 | 6 | RUN pip install aiohttp==3.6.1 7 | 8 | COPY requirements/main.txt requirements.txt 9 | RUN pip install -r requirements.txt 10 | 11 | COPY . . 12 | RUN pip install -e . 13 | 14 | CMD python docker/slovnet-syntax/exec/app.py 15 | -------------------------------------------------------------------------------- /docker/slovnet-syntax/exec/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-syntax 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-syntax/exec/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e BATCH_SIZE=4 \ 17 | $(IMAGE) 18 | 19 | -------------------------------------------------------------------------------- /docker/slovnet-syntax/exec/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | from navec import Navec 14 | from slovnet import Syntax 15 | 16 | 17 | NAVEC = getenv('NAVEC', 'navec.tar') 18 | PACK = getenv('PACK', 'pack.tar') 19 | BATCH_SIZE = int(getenv('BATCH_SIZE', 8)) 20 | 21 | HOST = getenv('HOST', '0.0.0.0') 22 | PORT = int(getenv('PORT', 8080)) 23 | MB = 1024 * 1024 24 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 25 | 26 | log('Load navec: %r' % NAVEC) 27 | navec = Navec.load(NAVEC) 28 | 29 | log('Load pack: %r' % PACK) 30 | log('Batch size: %r' % BATCH_SIZE) 31 | syntax = Syntax.load(PACK) 32 | syntax.navec(navec) 33 | 34 | 35 | async def handle(request): 36 | chunk = await request.json() 37 | log('Post chunk size: %r' % len(chunk)) 38 | markups = list(syntax.map(chunk)) 39 | 40 | tokens = sum(len(_.tokens) for _ in markups) 41 | log('Infer tokens: %r', tokens) 42 | 43 | data = [_.as_json for _ in markups] 44 | return web.json_response(data) 45 | 46 | 47 | log('Max size: %r' % (MAX_SIZE // MB)) 48 | app = web.Application(client_max_size=MAX_SIZE) 49 | app.add_routes([web.post('/', handle)]) 50 | 51 | web.run_app(app, host=HOST, port=PORT) 52 | -------------------------------------------------------------------------------- /docker/slovnet-syntax/torch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime 2 | 3 | RUN S3=https://storage.yandexcloud.net/ \ 4 | && curl -O $S3/natasha-slovnet/07_syntax/model/shape.pt \ 5 | && curl -O $S3/natasha-slovnet/07_syntax/model/encoder.pt \ 6 | && curl -O $S3/natasha-slovnet/07_syntax/model/head.pt \ 7 | && curl -O $S3/natasha-slovnet/07_syntax/model/rel.pt \ 8 | && curl -O $S3/natasha-slovnet/07_syntax/model/rels_vocab.txt \ 9 | && curl -L $S3/natasha-navec/navec_news_v1_1B_250K_300d_100q.tar > navec.tar 10 | 11 | COPY requirements/app.txt requirements.txt 12 | RUN pip install -r requirements.txt 13 | 14 | COPY . . 15 | RUN pip install -e . 
16 | 17 | CMD python docker/slovnet-syntax/torch/app.py 18 | -------------------------------------------------------------------------------- /docker/slovnet-syntax/torch/Makefile: -------------------------------------------------------------------------------- 1 | 2 | ROOT = ~/proj/slovnet 3 | IMAGE = natasha/slovnet-syntax 4 | 5 | image: 6 | cd $(ROOT); docker build \ 7 | -t $(IMAGE) \ 8 | -f docker/slovnet-syntax/torch/Dockerfile . 9 | 10 | push: 11 | docker push $(IMAGE) 12 | 13 | run: 14 | docker run -it --rm \ 15 | -p 8080:8080 \ 16 | -e DEVICE=cpu \ 17 | -e BATCH_SIZE=4 \ 18 | $(IMAGE) 19 | 20 | -------------------------------------------------------------------------------- /docker/slovnet-syntax/torch/app.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv 3 | 4 | import logging 5 | logging.basicConfig( 6 | level=logging.INFO, 7 | format='%(asctime)-15s %(message)s' 8 | ) 9 | log = logging.info 10 | 11 | from aiohttp import web 12 | 13 | import torch 14 | torch.set_grad_enabled(False) 15 | 16 | from navec import Navec 17 | 18 | from slovnet.const import PAD, CUDA0 19 | from slovnet.shape import SHAPES 20 | from slovnet.vocab import Vocab 21 | from slovnet.model.emb import ( 22 | NavecEmbedding, 23 | Embedding 24 | ) 25 | from slovnet.model.syntax import ( 26 | SyntaxEmbedding, 27 | SyntaxEncoder, 28 | SyntaxHead, 29 | SyntaxRel, 30 | Syntax 31 | ) 32 | from slovnet.encoders.syntax import SyntaxInferEncoder 33 | from slovnet.infer.syntax import SyntaxInfer, SyntaxDecoder 34 | 35 | 36 | NAVEC = getenv('NAVEC', 'navec.tar') 37 | SHAPE = getenv('SHAPE', 'shape.pt') 38 | ENCODER = getenv('ENCODER', 'encoder.pt') 39 | HEAD = getenv('HEAD', 'head.pt') 40 | REL = getenv('REL', 'rel.pt') 41 | RELS_VOCAB = getenv('RELS_VOCAB', 'rels_vocab.txt') 42 | 43 | SHAPE_DIM = 30 44 | LAYER_DIMS = [256, 128, 64] 45 | KERNEL_SIZE = 3 46 | 47 | DEVICE = getenv('DEVICE', CUDA0) 48 | BATCH_SIZE = int(getenv('BATCH_SIZE', 64)) 49 | 50 | HOST = getenv('HOST', '0.0.0.0') 51 | PORT = int(getenv('PORT', 8080)) 52 | MB = 1024 * 1024 53 | MAX_SIZE = int(getenv('MAX_SIZE', 100 * MB)) 54 | 55 | 56 | log('Load navec: %r' % NAVEC) 57 | navec = Navec.load(NAVEC) 58 | 59 | words_vocab = Vocab(navec.vocab.words) 60 | shapes_vocab = Vocab([PAD] + SHAPES) 61 | rels_vocab = Vocab.load(RELS_VOCAB) 62 | 63 | word = NavecEmbedding(navec) 64 | shape = Embedding( 65 | vocab_size=len(shapes_vocab), 66 | dim=SHAPE_DIM, 67 | pad_id=shapes_vocab.pad_id 68 | ) 69 | emb = SyntaxEmbedding(word, shape) 70 | encoder = SyntaxEncoder( 71 | input_dim=emb.dim, 72 | layer_dims=LAYER_DIMS, 73 | kernel_size=KERNEL_SIZE, 74 | ) 75 | head = SyntaxHead( 76 | input_dim=encoder.dim, 77 | hidden_dim=encoder.dim // 2, 78 | ) 79 | rel = SyntaxRel( 80 | input_dim=encoder.dim, 81 | hidden_dim=encoder.dim // 2, 82 | rel_dim=len(rels_vocab) 83 | ) 84 | model = Syntax(emb, encoder, head, rel) 85 | model.eval() 86 | 87 | log('Load shape: %r' % SHAPE) 88 | model.emb.shape.load(SHAPE) 89 | log('Load encoder: %r' % ENCODER) 90 | model.encoder.load(ENCODER) 91 | log('Load head: %r' % HEAD) 92 | model.head.load(HEAD) 93 | log('Load rel: %r' % REL) 94 | model.rel.load(REL) 95 | log('Device: %r' % DEVICE) 96 | model = model.to(DEVICE) 97 | 98 | log('Batch size: %r' % BATCH_SIZE) 99 | encoder = SyntaxInferEncoder( 100 | words_vocab, shapes_vocab, 101 | batch_size=BATCH_SIZE 102 | ) 103 | decoder = SyntaxDecoder(rels_vocab) 104 | infer = SyntaxInfer(model, encoder, decoder) 105 | 106 | 107 | async def 
handle(request): 108 | chunk = await request.json() 109 | log('Post chunk size: %r' % len(chunk)) 110 | markups = list(infer(chunk)) 111 | 112 | tokens = sum(len(_.tokens) for _ in markups) 113 | log('Infer tokens: %r', tokens) 114 | 115 | data = [_.as_json for _ in markups] 116 | return web.json_response(data) 117 | 118 | 119 | log('Max size: %r' % (MAX_SIZE // MB)) 120 | app = web.Application(client_max_size=MAX_SIZE) 121 | app.add_routes([web.post('/', handle)]) 122 | 123 | web.run_app(app, host=HOST, port=PORT) 124 | -------------------------------------------------------------------------------- /requirements/app.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.6.1 2 | torch==1.4.0 3 | navec==0.9.0 4 | razdel==0.5.0 5 | -------------------------------------------------------------------------------- /requirements/ci.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24.4 2 | razdel==0.5.0 3 | navec==0.9.0 4 | 5 | pytest==7.2.1 6 | flake8==5.0.4 7 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | flake8 3 | -------------------------------------------------------------------------------- /requirements/gpu.txt: -------------------------------------------------------------------------------- 1 | torch==1.4.0 # 1.5 compiled with 10.2? 2 | boto3 3 | tensorboard 4 | tqdm 5 | -------------------------------------------------------------------------------- /scripts/01_bert_news/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {RAW_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "!wget https://storage.yandexcloud.net/natasha-corus/taiga/Fontanka.tar.gz -P {RAW_DIR}\n", 24 | "!wget https://storage.yandexcloud.net/natasha-corus/ods/gazeta_v1.csv.zip -P {RAW_DIR}\n", 25 | "!wget https://storage.yandexcloud.net/natasha-corus/ods/interfax_v1.csv.zip -P {RAW_DIR}\n", 26 | "!wget https://storage.yandexcloud.net/natasha-corus/lenta-ru-news.csv.gz -P {RAW_DIR}\n", 27 | "!wget https://storage.yandexcloud.net/natasha-corus/buriy/news-articles-2014.tar.bz2 -P {RAW_DIR}\n", 28 | "!wget https://storage.yandexcloud.net/natasha-corus/buriy/news-articles-2015-part1.tar.bz2 -P {RAW_DIR}\n", 29 | "!wget https://storage.yandexcloud.net/natasha-corus/buriy/news-articles-2015-part2.tar.bz2 -P {RAW_DIR}" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "LOADS = {\n", 39 | " 'gazeta_v1.csv.zip': load_ods_gazeta,\n", 40 | " 'interfax_v1.csv.zip': load_ods_interfax,\n", 41 | " 'Fontanka.tar.gz': load_taiga_fontanka,\n", 42 | " 'lenta-ru-news.csv.gz': load_lenta,\n", 43 | " 'news-articles-2015-part1.tar.bz2': load_buriy_news,\n", 44 | " 'news-articles-2015-part2.tar.bz2': load_buriy_news,\n", 45 | " 'news-articles-2014.tar.bz2': load_buriy_news,\n", 46 | "}\n", 47 | "\n", 48 | "\n", 49 | "lines = [] # Requires 15Gb RAM\n", 50 | "for name in listdir(RAW_DIR):\n", 51 | " path = 
join_path(RAW_DIR, name)\n", 52 | " records = LOADS[name](path)\n", 53 | " for record in log_progress(records, desc=name):\n", 54 | " line = re.sub('\\s+', ' ', record.text) # news article -> single line\n", 55 | " lines.append(line)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "seed(1)\n", 65 | "shuffle(lines)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "cap = 1000\n", 75 | "dump_lines(lines[:cap], TEST)\n", 76 | "dump_lines(log_progress(lines[cap:]), TRAIN)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "s3.upload(TEST, S3_TEST)\n", 86 | "s3.upload(TRAIN, S3_TRAIN)" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 3", 93 | "language": "python", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.6.9" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /scripts/01_bert_news/main.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {DATA_DIR} {RUBERT_DIR} {MODEL_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 7, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "if not exists(TEST):\n", 24 | " s3.download(S3_TEST, TEST)\n", 25 | " s3.download(S3_TRAIN, TRAIN)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "if not exists(RUBERT_VOCAB):\n", 35 | " s3.download(S3_RUBERT_VOCAB, RUBERT_VOCAB)\n", 36 | " s3.download(S3_RUBERT_EMB, RUBERT_EMB)\n", 37 | " s3.download(S3_RUBERT_ENCODER, RUBERT_ENCODER)\n", 38 | " s3.download(S3_RUBERT_MLM, RUBERT_MLM)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "vocab = BERTVocab.load(RUBERT_VOCAB)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "config = RuBERTConfig()\n", 57 | "emb = BERTEmbedding.from_config()\n", 58 | "encoder = BERTEncoder.from_config()\n", 59 | "head = BERTMLMHead(config.emb_dim, config.vocab_size)\n", 60 | "model = BERTMLM(emb, encoder, head)\n", 61 | "\n", 62 | " # fix pos emb, train on short seqs\n", 63 | "emb.position.weight.requires_grad = False\n", 64 | "\n", 65 | "model.emb.load(RUBERT_EMB)\n", 66 | "model.encoder.load(RUBERT_ENCODER)\n", 67 | "model.head.load(RUBERT_MLM)\n", 68 | "model = model.to(DEVICE)\n", 69 | "\n", 70 | "criterion = masked_flatten_cross_entropy" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "torch.manual_seed(1)\n", 80 | 
"seed(1)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "encode = BERTMLMTrainEncoder(\n", 90 | " vocab,\n", 91 | " seq_len=128,\n", 92 | " batch_size=32,\n", 93 | " shuffle_size=10000\n", 94 | ")\n", 95 | "\n", 96 | "lines = load_lines(TEST)\n", 97 | "batches = encode(lines)\n", 98 | "test_batches = [_.to(DEVICE) for _ in batches]\n", 99 | "\n", 100 | "lines = load_lines(TRAIN)\n", 101 | "batches = encode(lines)\n", 102 | "train_batches = (_.to(DEVICE) for _ in batches)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "board = TensorBoard(BOARD_NAME, RUNS_DIR)\n", 112 | "train_board = board.section(TRAIN_BOARD)\n", 113 | "test_board = board.section(TEST_BOARD)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "optimizer = optim.Adam(model.parameters(), lr=0.0001)\n", 123 | "model, optimizer = amp.initialize(model, optimizer, opt_level=O2)\n", 124 | "scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.999)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "train_meter = MLMScoreMeter()\n", 134 | "test_meter = MLMScoreMeter()\n", 135 | "\n", 136 | "accum_steps = 64 # 2K batch\n", 137 | "log_steps = 256\n", 138 | "eval_steps = 512\n", 139 | "save_steps = eval_steps * 10\n", 140 | "\n", 141 | "model.train()\n", 142 | "optimizer.zero_grad()\n", 143 | "\n", 144 | "for step, batch in log_progress(enumerate(train_batches)):\n", 145 | " batch = process_batch(model, criterion, batch)\n", 146 | " batch.loss /= accum_steps\n", 147 | " \n", 148 | " with amp.scale_loss(batch.loss, optimizer) as scaled:\n", 149 | " scaled.backward()\n", 150 | "\n", 151 | " score = score_mlm_batch(batch, ks=())\n", 152 | " train_meter.add(score)\n", 153 | "\n", 154 | " if every(step, log_steps):\n", 155 | " train_meter.write(train_board)\n", 156 | " train_meter.reset()\n", 157 | "\n", 158 | " if every(step, accum_steps):\n", 159 | " optimizer.step()\n", 160 | " scheduler.step()\n", 161 | " optimizer.zero_grad()\n", 162 | "\n", 163 | " if every(step, eval_steps):\n", 164 | " batches = infer_batches(model, criterion, test_batches)\n", 165 | " scores = score_mlm_batches(batches)\n", 166 | " test_meter.extend(scores)\n", 167 | " test_meter.write(test_board)\n", 168 | " test_meter.reset()\n", 169 | " \n", 170 | " if every(step, save_steps):\n", 171 | " model.emb.dump(MODEL_EMB)\n", 172 | " model.encoder.dump(MODEL_ENCODER)\n", 173 | " model.mlm.dump(MODEL_MLM)\n", 174 | " \n", 175 | " s3.upload(MODEL_EMB, S3_MODEL_EMB)\n", 176 | " s3.upload(MODEL_ENCODER, S3_MODEL_ENCODER)\n", 177 | " s3.upload(MODEL_MLM, S3_MODEL_MLM)\n", 178 | " \n", 179 | " board.step()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [] 188 | } 189 | ], 190 | "metadata": { 191 | "kernelspec": { 192 | "display_name": "Python 3", 193 | "language": "python", 194 | "name": "python3" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython3", 206 | 
"version": "3.6.9" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 2 211 | } 212 | -------------------------------------------------------------------------------- /scripts/01_bert_news/main.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from os import listdir, makedirs 4 | from os.path import exists, join, expanduser 5 | from random import seed, shuffle 6 | 7 | from tqdm.notebook import tqdm as log_progress 8 | 9 | import torch 10 | from torch import optim 11 | 12 | from apex import amp 13 | O2 = 'O2' 14 | 15 | from corus import ( 16 | load_buriy_news, 17 | load_taiga_fontanka, 18 | load_ods_gazeta, 19 | load_ods_interfax, 20 | load_lenta 21 | ) 22 | 23 | from slovnet.io import ( 24 | load_lines, 25 | dump_lines 26 | ) 27 | from slovnet.s3 import S3 28 | from slovnet.board import TensorBoard 29 | from slovnet.const import CUDA0 30 | 31 | from slovnet.model.bert import ( 32 | RuBERTConfig, 33 | BERTEmbedding, 34 | BERTEncoder, 35 | BERTMLMHead, 36 | BERTMLM 37 | ) 38 | from slovnet.vocab import BERTVocab 39 | from slovnet.encoders.bert import BERTMLMTrainEncoder 40 | from slovnet.score import ( 41 | MLMScoreMeter, 42 | score_mlm_batch, 43 | score_mlm_batches 44 | ) 45 | from slovnet.loss import masked_flatten_cross_entropy 46 | 47 | 48 | DATA_DIR = 'data' 49 | MODEL_DIR = 'model' 50 | RUBERT_DIR = 'rubert' 51 | RAW_DIR = join(DATA_DIR, 'raw') 52 | 53 | TRAIN = join(DATA_DIR, 'train.txt') 54 | TEST = join(DATA_DIR, 'test.txt') 55 | 56 | S3_DIR = '01_bert_news' 57 | S3_TRAIN = join(S3_DIR, TRAIN) 58 | S3_TEST = join(S3_DIR, TEST) 59 | 60 | VOCAB = 'vocab.txt' 61 | EMB = 'emb.pt' 62 | ENCODER = 'encoder.pt' 63 | MLM = 'mlm.pt' 64 | 65 | RUBERT_VOCAB = join(RUBERT_DIR, VOCAB) 66 | RUBERT_EMB = join(RUBERT_DIR, EMB) 67 | RUBERT_ENCODER = join(RUBERT_DIR, ENCODER) 68 | RUBERT_MLM = join(RUBERT_DIR, MLM) 69 | 70 | S3_RUBERT_VOCAB = join(S3_DIR, RUBERT_VOCAB) 71 | S3_RUBERT_EMB = join(S3_DIR, RUBERT_EMB) 72 | S3_RUBERT_ENCODER = join(S3_DIR, RUBERT_ENCODER) 73 | S3_RUBERT_MLM = join(S3_DIR, RUBERT_MLM) 74 | 75 | MODEL_EMB = join(MODEL_DIR, EMB) 76 | MODEL_ENCODER = join(MODEL_DIR, ENCODER) 77 | MODEL_MLM = join(MODEL_DIR, MLM) 78 | 79 | S3_MODEL_EMB = join(S3_DIR, MODEL_EMB) 80 | S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER) 81 | S3_MODEL_MLM = join(S3_DIR, MODEL_MLM) 82 | 83 | BOARD_NAME = '01_bert_news' 84 | RUNS_DIR = 'runs' 85 | 86 | TRAIN_BOARD = '01_train' 87 | TEST_BOARD = '02_test' 88 | 89 | DEVICE = CUDA0 90 | 91 | 92 | def every(step, period): 93 | return step > 0 and step % period == 0 94 | 95 | 96 | def process_batch(model, criterion, batch): 97 | pred = model(batch.input) 98 | loss = criterion(pred, batch.target.value, batch.target.mask) 99 | return batch.processed(loss, pred) 100 | 101 | 102 | def infer_batches(model, criterion, batches): 103 | training = model.training 104 | model.eval() 105 | with torch.no_grad(): 106 | for batch in batches: 107 | yield process_batch(model, criterion, batch) 108 | model.train(training) 109 | -------------------------------------------------------------------------------- /scripts/02_bert_ner/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {RAW_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | 
"cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !wget https://storage.yandexcloud.net/natasha-corus/collection5.zip -P {RAW_DIR}\n", 24 | "# !wget https://storage.yandexcloud.net/natasha-corus/factRuEval-2016-master.zip -P {RAW_DIR}\n", 25 | "# !unzip {RAW_DIR}/collection5.zip -d {RAW_DIR}\n", 26 | "# !unzip {RAW_DIR}/factRuEval-2016-master.zip -d {RAW_DIR}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "settings = [\n", 36 | " (CORUS_NE5, NE5, S3_NE5, load_ne5),\n", 37 | " (CORUS_FACTRU, FACTRU, S3_FACTRU, load_factru)\n", 38 | "]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "for source, target, _, load in settings:\n", 48 | " records = load(source)\n", 49 | " records = log_progress(records, desc=load.__name__)\n", 50 | " records = (_.adapted for _ in records)\n", 51 | " items = (_.as_json for _ in records)\n", 52 | " lines = format_jl(items)\n", 53 | " dump_gz_lines(lines, target)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "for _, source, target, _ in log_progress(settings):\n", 63 | " s3.upload(source, target)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [] 72 | } 73 | ], 74 | "metadata": { 75 | "kernelspec": { 76 | "display_name": "Python 3", 77 | "language": "python", 78 | "name": "python3" 79 | }, 80 | "language_info": { 81 | "codemirror_mode": { 82 | "name": "ipython", 83 | "version": 3 84 | }, 85 | "file_extension": ".py", 86 | "mimetype": "text/x-python", 87 | "name": "python", 88 | "nbconvert_exporter": "python", 89 | "pygments_lexer": "ipython3", 90 | "version": "3.6.9" 91 | } 92 | }, 93 | "nbformat": 4, 94 | "nbformat_minor": 2 95 | } 96 | -------------------------------------------------------------------------------- /scripts/02_bert_ner/grid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %run -n main.py\n", 10 | "# seed(2)\n", 11 | "# env = dict(environ)\n", 12 | "# for index in range(100):\n", 13 | "# seed_ = str(randint(0, 100))\n", 14 | "# bert_lr = '%.7f' % (10 ** uniform(-6, -3))\n", 15 | "# lr = '%.5f' % (10 ** uniform(-4, -2))\n", 16 | "# lr_gamma = '%.2f' % uniform(0.1, 0.9)\n", 17 | "# board_name = f'{index:03d} {seed_} {bert_lr} {lr} {lr_gamma}'\n", 18 | "# env.update(\n", 19 | "# seed=seed_,\n", 20 | "# bert_lr=bert_lr,\n", 21 | "# lr=lr,\n", 22 | "# lr_gamma=lr_gamma,\n", 23 | "# board_name=board_name\n", 24 | "# )\n", 25 | "# run(\n", 26 | "# args=[\n", 27 | "# 'jupyter', 'nbconvert',\n", 28 | "# '--ExecutePreprocessor.timeout=6000',\n", 29 | "# '--to=notebook',\n", 30 | "# '--execute', 'main.ipynb'\n", 31 | "# ],\n", 32 | "# env=env\n", 33 | "# )" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python 3", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | 
"file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.6.9" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 4 65 | } 66 | -------------------------------------------------------------------------------- /scripts/02_bert_ner/infer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "s3 = S3()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "if not exists(BERT_VOCAB):\n", 23 | " s3.download(S3_BERT_VOCAB, BERT_VOCAB)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "torch.set_grad_enabled(False)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 30, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "words_vocab = BERTVocab.load(BERT_VOCAB)\n", 42 | "tags_vocab = BIOTagsVocab(TAGS)\n", 43 | "\n", 44 | "config = RuBERTConfig()\n", 45 | "emb = BERTEmbedding.from_config(config)\n", 46 | "encoder = BERTEncoder.from_config(config)\n", 47 | "ner = BERTNERHead(config.emb_dim, len(tags_vocab))\n", 48 | "model = BERTNER(emb, encoder, ner)\n", 49 | "model.eval()\n", 50 | "\n", 51 | "model.emb.load(BERT_EMB) # default downloaded embeddings\n", 52 | "model.encoder.load(MODEL_ENCODER) # custom pretrained model encoder\n", 53 | "model.head.load(MODEL_NER) # custom pretrained model head\n", 54 | "model = model.to(DEVICE)\n", 55 | "\n", 56 | "encoder = BERTInferEncoder(\n", 57 | " words_vocab,\n", 58 | " seq_len=SEQ_LEN, batch_size=BATCH_SIZE\n", 59 | ")\n", 60 | "\n", 61 | "decoder = BERTTagDecoder(tags_vocab)\n", 62 | "infer = BERTNERInfer(model, encoder, decoder)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# inference\n", 72 | "\n", 73 | "default = ['Европейский союз добавил в санкционный список девять политических деятелей из самопровозглашенных республик Донбасса — Донецкой народной республики (ДНР) и Луганской народной республики (ЛНР) — в связи с прошедшими там выборами. Об этом говорится в документе, опубликованном в официальном журнале Евросоюза. В новом списке фигурирует Леонид Пасечник, который по итогам выборов стал главой ЛНР. Помимо него там присутствуют Владимир Бидевка и Денис Мирошниченко, председатели законодательных органов ДНР и ЛНР, а также Ольга Позднякова и Елена Кравченко, председатели ЦИК обеих республик. Выборы прошли в непризнанных республиках Донбасса 11 ноября. На них удержали лидерство действующие руководители и партии — Денис Пушилин и «Донецкая республика» в ДНР и Леонид Пасечник с движением «Мир Луганщине» в ЛНР. Президент Франции Эмманюэль Макрон и канцлер ФРГ Ангела Меркель после встречи с украинским лидером Петром Порошенко осудили проведение выборов, заявив, что они нелегитимны и «подрывают территориальную целостность и суверенитет Украины». 
Позже к осуждению присоединились США с обещаниями новых санкций для России.',]\n", 74 | "custom = ['Ваш произвольный текст']\n", 75 | "\n", 76 | "chunk = custom if CUSTOM_TUNING else default\n", 77 | "\n", 78 | "markups = list(infer(chunk)) # chunk is about 1000 chars? - https://github.com/natasha/naeval/blob/52c4a508bf212b95d4e610cfe1b5e23b8ca94d2f/naeval/ner/models/slovnet.py#L16\n", 79 | "\n", 80 | "spans = sum(len(_.spans) for _ in markups)\n", 81 | "\n", 82 | "data = [_.as_json for _ in markups]\n", 83 | "\n", 84 | "print(markups)\n", 85 | "\n", 86 | "from ipymarkup import show_span_ascii_markup as show_markup\n", 87 | "show_markup(markups[0].text, markups[0].spans)" 88 | ] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "env", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]" 108 | }, 109 | "vscode": { 110 | "interpreter": { 111 | "hash": "fbd3b3cf5ce5dbcb71d33f7b8a90c542bb07cc48175c202e830100849640f809" 112 | } 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 4 117 | } 118 | -------------------------------------------------------------------------------- /scripts/02_bert_ner/main.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv, environ 3 | from os.path import exists, join, expanduser 4 | from random import seed, sample, randint, uniform 5 | from subprocess import run 6 | 7 | from tqdm.notebook import tqdm as log_progress 8 | 9 | import torch 10 | from torch import optim 11 | 12 | from naeval.ner.datasets import ( 13 | load_factru, 14 | load_ne5, 15 | ) 16 | 17 | from slovnet.s3 import S3 18 | from slovnet.io import ( 19 | format_jl, 20 | parse_jl, 21 | 22 | load_gz_lines, 23 | dump_gz_lines 24 | ) 25 | from slovnet.board import ( 26 | TensorBoard, 27 | LogBoard, 28 | MultiBoard 29 | ) 30 | from slovnet.const import ( 31 | TRAIN, TEST, 32 | PER, LOC, ORG, 33 | CUDA0, 34 | ) 35 | from slovnet.token import tokenize 36 | 37 | from slovnet.model.bert import ( 38 | RuBERTConfig, 39 | BERTEmbedding, 40 | BERTEncoder, 41 | BERTNERHead, 42 | BERTNER 43 | ) 44 | from slovnet.markup import ( 45 | SpanMarkup, 46 | show_span_markup 47 | ) 48 | from slovnet.vocab import BERTVocab, BIOTagsVocab 49 | from slovnet.encoders.bert import BERTNERTrainEncoder, BERTInferEncoder 50 | from slovnet.score import ( 51 | NERBatchScore, 52 | NERScoreMeter, 53 | score_ner_batch 54 | ) 55 | from slovnet.mask import ( 56 | Masked, 57 | split_masked, 58 | pad_masked 59 | ) 60 | 61 | from slovnet.infer.bert import BERTNERInfer, BERTTagDecoder 62 | 63 | DATA_DIR = 'data' 64 | MODEL_DIR = 'model' 65 | BERT_DIR = 'bert' 66 | 67 | RAW_DIR = join(DATA_DIR, 'raw') 68 | 69 | CORUS_NE5 = join(RAW_DIR, 'Collection5') 70 | CORUS_FACTRU = join(RAW_DIR, 'factRuEval-2016-master') 71 | 72 | NE5 = join(DATA_DIR, 'ne5.jl.gz') 73 | FACTRU = join(DATA_DIR, 'factru.jl.gz') 74 | 75 | S3_DIR = '02_bert_ner' 76 | S3_NE5 = join(S3_DIR, NE5) 77 | S3_FACTRU = join(S3_DIR, FACTRU) 78 | 79 | VOCAB = 'vocab.txt' 80 | EMB = 'emb.pt' 81 | ENCODER = 'encoder.pt' 82 | NER = 'ner.pt' 83 | 84 | BERT_VOCAB = join(BERT_DIR, VOCAB) 85 | BERT_EMB = join(BERT_DIR, EMB) 86 | BERT_ENCODER = join(BERT_DIR, ENCODER) 
87 | 88 | S3_RUBERT_DIR = '01_bert_news/rubert' 89 | S3_MLM_DIR = '01_bert_news/model' 90 | S3_BERT_VOCAB = join(S3_RUBERT_DIR, VOCAB) 91 | S3_BERT_EMB = join(S3_MLM_DIR, EMB) 92 | S3_BERT_ENCODER = join(S3_MLM_DIR, ENCODER) 93 | 94 | MODEL_ENCODER = join(MODEL_DIR, ENCODER) 95 | MODEL_NER = join(MODEL_DIR, NER) 96 | 97 | S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER) 98 | S3_MODEL_NER = join(S3_DIR, MODEL_NER) 99 | 100 | BOARD_NAME = getenv('board_name', '02_bert_ner') 101 | RUNS_DIR = 'runs' 102 | 103 | TRAIN_BOARD = '01_train' 104 | TEST_BOARD = '02_test' 105 | 106 | SEED = int(getenv('seed', 72)) 107 | DEVICE = getenv('device', CUDA0) 108 | BERT_LR = float(getenv('bert_lr', 0.000045)) 109 | LR = float(getenv('lr', 0.0075)) 110 | LR_GAMMA = float(getenv('lr_gamma', 0.45)) 111 | EPOCHS = int(getenv('epochs', 5)) 112 | 113 | SEQ_LEN = int(getenv('SEQ_LEN', 256)) 114 | BATCH_SIZE = int(getenv('BATCH_SIZE', 64)) 115 | 116 | ##################### 117 | # 118 | # CUSTOM TAGS TUNING 119 | # 120 | ############### START 121 | 122 | CUSTOM_TUNING = False # Set this flag to true in order to use your custom dataset and tags 123 | CUSTOM_TEXTS = join(DATA_DIR, 'custom-dataset.jl.gz') # Put your own data into the data dir 124 | TAGS = ['CUSTOM-TAG'] if CUSTOM_TUNING else [PER, LOC, ORG] # List all your custom tags 125 | 126 | ################# END 127 | 128 | 129 | def process_batch(model, criterion, batch): 130 | input, target = batch 131 | 132 | pred = model(input.value) 133 | pred = pad_masked(pred, input.mask) 134 | mask = pad_masked(input.mask, input.mask) 135 | 136 | loss = criterion(pred, target.value, target.mask) 137 | 138 | pred = Masked(pred, mask) 139 | return batch.processed(loss, pred) 140 | -------------------------------------------------------------------------------- /scripts/03_bert_morph/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {RAW_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !wget https://storage.yandexcloud.net/natasha-corus/GramEval2020-master.zip -P {RAW_DIR}\n", 24 | "# !unzip {RAW_DIR}/GramEval2020-master.zip -d {RAW_DIR}" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "for target in [NEWS, FICTION]:\n", 34 | " paths = (\n", 35 | " join(GRAMRU_DIR, _)\n", 36 | " for _ in GRAMRU_FILES[target]\n", 37 | " )\n", 38 | " records = (\n", 39 | " record\n", 40 | " for path in paths\n", 41 | " for record in load_dataset(path)\n", 42 | " )\n", 43 | " records = log_progress(records, desc=target)\n", 44 | " items = (_.as_json for _ in records)\n", 45 | " lines = format_jl(items)\n", 46 | " dump_gz_lines(lines, target)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "s3.upload(NEWS, S3_NEWS)\n", 56 | "s3.upload(FICTION, S3_FICTION)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": 
"python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.6.9" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 2 88 | } 89 | -------------------------------------------------------------------------------- /scripts/03_bert_morph/grid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %run -n main.py\n", 10 | "# seed(1)\n", 11 | "# env = dict(environ)\n", 12 | "# for index in range(100):\n", 13 | "# seed_ = str(randint(0, 100))\n", 14 | "# bert_lr = '%.7f' % (10 ** uniform(-6, -3))\n", 15 | "# lr = '%.5f' % (10 ** uniform(-4, -2))\n", 16 | "# lr_gamma = '%.2f' % uniform(0.1, 0.9)\n", 17 | "# board_name = f'{index:03d} {seed_} {bert_lr} {lr} {lr_gamma}'\n", 18 | "# env.update(\n", 19 | "# seed=seed_,\n", 20 | "# bert_lr=bert_lr,\n", 21 | "# lr=lr,\n", 22 | "# lr_gamma=lr_gamma,\n", 23 | "# board_name=board_name\n", 24 | "# )\n", 25 | "# run(\n", 26 | "# args=[\n", 27 | "# 'jupyter', 'nbconvert',\n", 28 | "# '--ExecutePreprocessor.timeout=6000',\n", 29 | "# '--to=notebook',\n", 30 | "# '--execute', 'main.ipynb'\n", 31 | "# ],\n", 32 | "# env=env\n", 33 | "# )" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python 3", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.6.9" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 4 65 | } 66 | -------------------------------------------------------------------------------- /scripts/03_bert_morph/main.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv, environ 3 | from os.path import exists, join 4 | from random import seed, sample, randint, uniform 5 | from itertools import islice as head 6 | from subprocess import run 7 | 8 | from tqdm.notebook import tqdm as log_progress 9 | 10 | import torch 11 | from torch import optim 12 | 13 | from naeval.morph.datasets import load_dataset 14 | 15 | from slovnet.s3 import S3 16 | from slovnet.io import ( 17 | format_jl, 18 | parse_jl, 19 | 20 | load_gz_lines, 21 | dump_gz_lines 22 | ) 23 | from slovnet.board import ( 24 | TensorBoard, 25 | LogBoard, 26 | MultiBoard 27 | ) 28 | from slovnet.const import ( 29 | TRAIN, TEST, 30 | PAD, CUDA0, 31 | ) 32 | 33 | from slovnet.model.bert import ( 34 | RuBERTConfig, 35 | BERTEmbedding, 36 | BERTEncoder, 37 | BERTMorphHead, 38 | BERTMorph 39 | ) 40 | from slovnet.markup import MorphMarkup 41 | from slovnet.vocab import BERTVocab, Vocab 42 | from slovnet.encoders.bert import BERTMorphTrainEncoder 43 | from slovnet.loss import masked_flatten_cross_entropy 44 | from slovnet.batch import ProcessedBatch 45 | from slovnet.score import ( 46 | MorphScoreMeter, 47 | score_morph_batch 48 | ) 49 | from slovnet.mask import ( 50 | Masked, 51 | 
split_masked, 52 | pad_masked 53 | ) 54 | 55 | 56 | DATA_DIR = 'data' 57 | MODEL_DIR = 'model' 58 | BERT_DIR = 'bert' 59 | RAW_DIR = join(DATA_DIR, 'raw') 60 | 61 | NEWS = join(DATA_DIR, 'news.jl.gz') 62 | FICTION = join(DATA_DIR, 'fiction.jl.gz') 63 | GRAMRU_DIR = join(RAW_DIR, 'GramEval2020-master') 64 | GRAMRU_FILES = { 65 | NEWS: [ 66 | 'dataOpenTest/GramEval2020-RuEval2017-Lenta-news-dev.conllu', 67 | 'dataTrain/MorphoRuEval2017-Lenta-train.conllu', 68 | ], 69 | FICTION: [ 70 | 'dataOpenTest/GramEval2020-SynTagRus-dev.conllu', 71 | 'dataTrain/GramEval2020-SynTagRus-train-v2.conllu', 72 | 'dataTrain/MorphoRuEval2017-JZ-gold.conllu' 73 | ], 74 | } 75 | 76 | S3_DIR = '03_bert_morph' 77 | S3_NEWS = join(S3_DIR, NEWS) 78 | S3_FICTION = join(S3_DIR, FICTION) 79 | 80 | VOCAB = 'vocab.txt' 81 | EMB = 'emb.pt' 82 | ENCODER = 'encoder.pt' 83 | MORPH = 'morph.pt' 84 | 85 | BERT_VOCAB = join(BERT_DIR, VOCAB) 86 | BERT_EMB = join(BERT_DIR, EMB) 87 | BERT_ENCODER = join(BERT_DIR, ENCODER) 88 | 89 | S3_RUBERT_DIR = '01_bert_news/rubert' 90 | S3_MLM_DIR = '01_bert_news/model' 91 | S3_BERT_VOCAB = join(S3_RUBERT_DIR, VOCAB) 92 | S3_BERT_EMB = join(S3_MLM_DIR, EMB) 93 | S3_BERT_ENCODER = join(S3_MLM_DIR, ENCODER) 94 | 95 | TAGS_VOCAB = join(MODEL_DIR, 'tags_vocab.txt') 96 | MODEL_ENCODER = join(MODEL_DIR, ENCODER) 97 | MODEL_MORPH = join(MODEL_DIR, MORPH) 98 | 99 | S3_TAGS_VOCAB = join(S3_DIR, TAGS_VOCAB) 100 | S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER) 101 | S3_MODEL_MORPH = join(S3_DIR, MODEL_MORPH) 102 | 103 | BOARD_NAME = getenv('board_name', '03_bert_morph_02') 104 | RUNS_DIR = 'runs' 105 | 106 | TRAIN_BOARD = '01_train' 107 | TEST_BOARD = '02_test' 108 | 109 | SEED = int(getenv('seed', 1)) 110 | DEVICE = getenv('device', CUDA0) 111 | BERT_LR = float(getenv('bert_lr', 0.0002)) 112 | LR = float(getenv('lr', 0.001)) 113 | LR_GAMMA = float(getenv('lr_gamma', 0.8)) 114 | EPOCHS = int(getenv('epochs', 5)) 115 | 116 | 117 | def process_batch(model, criterion, batch): 118 | input, target = batch 119 | 120 | pred = model(input.value) 121 | pred = pad_masked(pred, input.mask) 122 | mask = pad_masked(input.mask, input.mask) 123 | 124 | loss = criterion(pred, target.value, target.mask) 125 | 126 | pred = model.morph.decode(pred) 127 | pred = pred[mask] 128 | 129 | # unmask pred, target to fit in score batch 130 | target = target.value[target.mask] 131 | 132 | return ProcessedBatch(input, target, loss, pred) 133 | 134 | -------------------------------------------------------------------------------- /scripts/04_bert_syntax/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {RAW_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !wget https://storage.yandexcloud.net/natasha-corus/GramEval2020-master.zip -P {RAW_DIR}\n", 24 | "# !unzip {RAW_DIR}/GramEval2020-master.zip -d {RAW_DIR}" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "for target in [NEWS, FICTION]:\n", 34 | " paths = (\n", 35 | " join(GRAMRU_DIR, _)\n", 36 | " for _ in GRAMRU_FILES[target]\n", 37 | " )\n", 38 | " records = (\n", 39 | " record\n", 40 | " for 
path in paths\n", 41 | " for record in load_dataset(path)\n", 42 | " )\n", 43 | " records = log_progress(records, desc=target)\n", 44 | " items = (_.as_json for _ in records)\n", 45 | " lines = format_jl(items)\n", 46 | " dump_gz_lines(lines, target)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "s3.upload(NEWS, S3_NEWS)\n", 56 | "s3.upload(FICTION, S3_FICTION)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [] 65 | } 66 | ], 67 | "metadata": { 68 | "kernelspec": { 69 | "display_name": "Python 3", 70 | "language": "python", 71 | "name": "python3" 72 | }, 73 | "language_info": { 74 | "codemirror_mode": { 75 | "name": "ipython", 76 | "version": 3 77 | }, 78 | "file_extension": ".py", 79 | "mimetype": "text/x-python", 80 | "name": "python", 81 | "nbconvert_exporter": "python", 82 | "pygments_lexer": "ipython3", 83 | "version": "3.6.9" 84 | } 85 | }, 86 | "nbformat": 4, 87 | "nbformat_minor": 2 88 | } 89 | -------------------------------------------------------------------------------- /scripts/04_bert_syntax/grid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %run -n main.py\n", 10 | "# seed(1)\n", 11 | "# env = dict(environ)\n", 12 | "# for index in range(100):\n", 13 | "# seed_ = str(randint(0, 100))\n", 14 | "# bert_lr = '%.7f' % (10 ** uniform(-6, -3))\n", 15 | "# lr = '%.5f' % (10 ** uniform(-4, -2))\n", 16 | "# lr_gamma = '%.2f' % uniform(0.1, 0.9)\n", 17 | "# board_name = f'{index:03d} {seed_} {bert_lr} {lr} {lr_gamma}'\n", 18 | "# env.update(\n", 19 | "# seed=seed_,\n", 20 | "# bert_lr=bert_lr,\n", 21 | "# lr=lr,\n", 22 | "# lr_gamma=lr_gamma,\n", 23 | "# board_name=board_name\n", 24 | "# )\n", 25 | "# run(\n", 26 | "# args=[\n", 27 | "# 'jupyter', 'nbconvert',\n", 28 | "# '--ExecutePreprocessor.timeout=6000',\n", 29 | "# '--to=notebook',\n", 30 | "# '--execute', 'main.ipynb'\n", 31 | "# ],\n", 32 | "# env=env\n", 33 | "# )" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [] 42 | } 43 | ], 44 | "metadata": { 45 | "kernelspec": { 46 | "display_name": "Python 3", 47 | "language": "python", 48 | "name": "python3" 49 | }, 50 | "language_info": { 51 | "codemirror_mode": { 52 | "name": "ipython", 53 | "version": 3 54 | }, 55 | "file_extension": ".py", 56 | "mimetype": "text/x-python", 57 | "name": "python", 58 | "nbconvert_exporter": "python", 59 | "pygments_lexer": "ipython3", 60 | "version": "3.6.9" 61 | } 62 | }, 63 | "nbformat": 4, 64 | "nbformat_minor": 4 65 | } 66 | -------------------------------------------------------------------------------- /scripts/04_bert_syntax/main.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv, environ 3 | from os.path import exists, join 4 | from itertools import chain, islice as head 5 | from random import seed, sample, randint, uniform 6 | from subprocess import run 7 | 8 | from tqdm.notebook import tqdm as log_progress 9 | 10 | import torch 11 | from torch import optim 12 | 13 | from naeval.syntax.datasets import load_dataset 14 | 15 | from slovnet.s3 import S3 16 | from slovnet.io import ( 17 | format_jl, 18 | parse_jl, 19 | 20 | load_gz_lines, 
21 | dump_gz_lines 22 | ) 23 | from slovnet.board import ( 24 | TensorBoard, 25 | LogBoard, 26 | MultiBoard 27 | ) 28 | from slovnet.const import ( 29 | TRAIN, TEST, 30 | PAD, CUDA0, 31 | ) 32 | 33 | from slovnet.model.bert import ( 34 | RuBERTConfig, 35 | BERTEmbedding, 36 | BERTEncoder, 37 | BERTSyntaxHead, 38 | BERTSyntaxRel, 39 | BERTSyntax 40 | ) 41 | from slovnet.markup import SyntaxMarkup 42 | from slovnet.vocab import BERTVocab, Vocab 43 | from slovnet.encoders.bert import BERTSyntaxTrainEncoder 44 | from slovnet.loss import masked_flatten_cross_entropy 45 | from slovnet.score import ( 46 | SyntaxScoreMeter, 47 | score_syntax_batch 48 | ) 49 | from slovnet.mask import ( 50 | Masked, 51 | split_masked, 52 | pad_masked 53 | ) 54 | 55 | 56 | DATA_DIR = 'data' 57 | MODEL_DIR = 'model' 58 | BERT_DIR = 'bert' 59 | RAW_DIR = join(DATA_DIR, 'raw') 60 | 61 | NEWS = join(DATA_DIR, 'news.jl.gz') 62 | FICTION = join(DATA_DIR, 'fiction.jl.gz') 63 | GRAMRU_DIR = join(RAW_DIR, 'GramEval2020-master') 64 | GRAMRU_FILES = { 65 | NEWS: [ 66 | 'dataOpenTest/GramEval2020-RuEval2017-Lenta-news-dev.conllu', 67 | 'dataTrain/MorphoRuEval2017-Lenta-train.conllu', 68 | ], 69 | FICTION: [ 70 | 'dataOpenTest/GramEval2020-SynTagRus-dev.conllu', 71 | 'dataTrain/GramEval2020-SynTagRus-train-v2.conllu', 72 | 'dataTrain/MorphoRuEval2017-JZ-gold.conllu' 73 | ], 74 | } 75 | 76 | S3_DIR = '04_bert_syntax' 77 | S3_NEWS = join(S3_DIR, NEWS) 78 | S3_FICTION = join(S3_DIR, FICTION) 79 | 80 | VOCAB = 'vocab.txt' 81 | EMB = 'emb.pt' 82 | ENCODER = 'encoder.pt' 83 | HEAD = 'head.pt' 84 | REL = 'rel.pt' 85 | 86 | BERT_VOCAB = join(BERT_DIR, VOCAB) 87 | BERT_EMB = join(BERT_DIR, EMB) 88 | BERT_ENCODER = join(BERT_DIR, ENCODER) 89 | 90 | S3_RUBERT_DIR = '01_bert_news/rubert' 91 | S3_MLM_DIR = '01_bert_news/model' 92 | S3_BERT_VOCAB = join(S3_RUBERT_DIR, VOCAB) 93 | S3_BERT_EMB = join(S3_MLM_DIR, EMB) 94 | S3_BERT_ENCODER = join(S3_MLM_DIR, ENCODER) 95 | 96 | RELS_VOCAB = join(MODEL_DIR, 'rels_vocab.txt') 97 | MODEL_ENCODER = join(MODEL_DIR, ENCODER) 98 | MODEL_HEAD = join(MODEL_DIR, HEAD) 99 | MODEL_REL = join(MODEL_DIR, REL) 100 | 101 | S3_RELS_VOCAB = join(S3_DIR, RELS_VOCAB) 102 | S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER) 103 | S3_MODEL_HEAD = join(S3_DIR, MODEL_HEAD) 104 | S3_MODEL_REL = join(S3_DIR, MODEL_REL) 105 | 106 | BOARD_NAME = getenv('board_name', '04_bert_syntax_01') 107 | RUNS_DIR = 'runs' 108 | 109 | TRAIN_BOARD = '01_train' 110 | TEST_BOARD = '02_test' 111 | 112 | SEED = int(getenv('seed', 50)) 113 | DEVICE = getenv('device', CUDA0) 114 | BERT_LR = float(getenv('bert_lr', 0.000058)) 115 | LR = float(getenv('lr', 0.00012)) 116 | LR_GAMMA = float(getenv('lr_gamma', 0.29)) 117 | EPOCHS = int(getenv('epochs', 2)) 118 | 119 | 120 | def process_batch(model, criterion, batch): 121 | input, target = batch 122 | 123 | pred = model( 124 | input.word_id, input.word_mask, input.pad_mask, 125 | target.mask, target.head_id 126 | ) 127 | 128 | loss = ( 129 | criterion(pred.head_id, target.head_id, target.mask) 130 | + criterion(pred.rel_id, target.rel_id, target.mask) 131 | ) 132 | 133 | pred.head_id = model.head.decode(pred.head_id, target.mask) 134 | pred.rel_id = model.rel.decode(pred.rel_id, target.mask) 135 | 136 | return batch.processed(loss, pred) 137 | -------------------------------------------------------------------------------- /scripts/05_ner/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 
null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {RAW_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz -P {RAW_DIR}" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "records = load_nerus(RAW_NERUS)\n", 33 | "records = log_progress(records, total=NERUS_TOTAL)\n", 34 | "\n", 35 | "markups = (adapt_markup(_.ner) for _ in records)\n", 36 | "items = (_.as_json for _ in markups)\n", 37 | "lines = list(format_jl(items))" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "seed(1)\n", 47 | "shuffle(lines)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "dump_gz_lines(log_progress(lines), NERUS)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "s3.upload(NERUS, S3_NERUS)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.9" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /scripts/05_ner/grid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %run -n main.py\n", 10 | "# seed(1)\n", 11 | "# env = dict(environ)\n", 12 | "# for index in range(100):\n", 13 | "# seed_ = str(randint(0, 100))\n", 14 | "# lr = '%.5f' % (10 ** uniform(-4, -1))\n", 15 | "# lr_gamma = '%.2f' % uniform(0.1, 0.9)\n", 16 | "# board_name = f'{index:03d} {seed_} {lr} {lr_gamma}'\n", 17 | "# env.update(\n", 18 | "# seed=seed_,\n", 19 | "# lr=lr,\n", 20 | "# lr_gamma=lr_gamma,\n", 21 | "# board_name=board_name\n", 22 | "# )\n", 23 | "# run(\n", 24 | "# args=[\n", 25 | "# 'jupyter', 'nbconvert',\n", 26 | "# '--ExecutePreprocessor.timeout=6000',\n", 27 | "# '--to=notebook',\n", 28 | "# '--execute', 'main.ipynb'\n", 29 | "# ],\n", 30 | "# env=env\n", 31 | "# )" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# %run -n main.py\n", 41 | "# seed(1)\n", 42 | "# env = dict(environ)\n", 43 | "# for layers_num in [2, 3, 4]:\n", 44 | "# for layer_dim in [16, 64, 256]:\n", 45 | "# board_name = f'{layers_num} {layer_dim}'\n", 46 | "# env.update(\n", 47 | "# layers_num=str(layers_num),\n", 48 | "# layer_dim=str(layer_dim),\n", 49 | "# board_name=board_name\n", 50 | "# )\n", 51 | "# run(\n", 52 | "# 
args=[\n", 53 | "# 'jupyter', 'nbconvert',\n", 54 | "# '--ExecutePreprocessor.timeout=6000',\n", 55 | "# '--to=notebook',\n", 56 | "# '--execute', 'main.ipynb'\n", 57 | "# ],\n", 58 | "# env=env\n", 59 | "# )" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 3", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.6.9" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 4 91 | } 92 | -------------------------------------------------------------------------------- /scripts/05_ner/main.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv, environ 3 | from os.path import exists, join, expanduser 4 | from random import seed, shuffle, sample, randint, uniform 5 | from itertools import islice as head 6 | from subprocess import run 7 | 8 | from tqdm.notebook import tqdm as log_progress 9 | 10 | import torch 11 | from torch import optim 12 | 13 | from nerus import load_nerus 14 | 15 | from navec import Navec 16 | 17 | from slovnet.s3 import S3 18 | from slovnet.io import ( 19 | format_jl, 20 | parse_jl, 21 | 22 | load_gz_lines, 23 | dump_gz_lines 24 | ) 25 | from slovnet.board import ( 26 | TensorBoard, 27 | LogBoard, 28 | MultiBoard 29 | ) 30 | from slovnet.const import ( 31 | TRAIN, TEST, 32 | PER, LOC, ORG, 33 | WORD, SHAPE, TAG, 34 | CUDA0, 35 | PAD 36 | ) 37 | from slovnet.token import tokenize 38 | from slovnet.shape import SHAPES 39 | 40 | from slovnet.span import Span 41 | from slovnet.markup import ( 42 | SpanMarkup, 43 | show_span_markup 44 | ) 45 | from slovnet.model.emb import ( 46 | Embedding, 47 | NavecEmbedding 48 | ) 49 | from slovnet.model.tag import ( 50 | TagEmbedding, 51 | TagEncoder, 52 | NERHead, 53 | NER 54 | ) 55 | from slovnet.vocab import ( 56 | Vocab, 57 | BIOTagsVocab 58 | ) 59 | from slovnet.encoders.tag import TagTrainEncoder 60 | from slovnet.score import ( 61 | NERBatchScore, 62 | NERScoreMeter, 63 | score_ner_batch 64 | ) 65 | 66 | from slovnet.exec.pack import ( 67 | Meta, 68 | DumpPack 69 | ) 70 | from slovnet import api 71 | 72 | 73 | DATA_DIR = 'data' 74 | MODEL_DIR = 'model' 75 | NAVEC_DIR = 'navec' 76 | RAW_DIR = join(DATA_DIR, 'raw') 77 | S3_DIR = '05_ner' 78 | 79 | RAW_NERUS = join(RAW_DIR, 'nerus_lenta.conllu.gz') 80 | NERUS_TOTAL = 739346 81 | 82 | NERUS = join(DATA_DIR, 'nerus.jl.gz') 83 | S3_NERUS = join(S3_DIR, NERUS) 84 | 85 | NAVEC_URL = 'https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar' 86 | NAVEC = join(NAVEC_DIR, 'navec_news_v1_1B_250K_300d_100q.tar') 87 | 88 | MODEL_SHAPE = join(MODEL_DIR, 'shape.pt') 89 | MODEL_ENCODER = join(MODEL_DIR, 'encoder.pt') 90 | MODEL_NER = join(MODEL_DIR, 'ner.pt') 91 | 92 | S3_MODEL_SHAPE = join(S3_DIR, MODEL_SHAPE) 93 | S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER) 94 | S3_MODEL_NER = join(S3_DIR, MODEL_NER) 95 | 96 | BOARD_NAME = getenv('board_name', '05_ner') 97 | RUNS_DIR = 'runs' 98 | 99 | TRAIN_BOARD = '01_train' 100 | TEST_BOARD = '02_test' 101 | 102 | SEED = int(getenv('seed', 17)) 103 | DEVICE = getenv('device', CUDA0) 104 | 105 | SHAPE_DIM = 
int(getenv('shape_dim', 30)) 106 | LAYERS_NUM = int(getenv('layers_num', 3)) 107 | LAYER_DIM = int(getenv('layer_dim', 64)) 108 | KERNEL_SIZE = int(getenv('kernel_size', 3)) 109 | 110 | LR = float(getenv('lr', 0.005)) 111 | LR_GAMMA = float(getenv('lr_gamma', 0.75)) 112 | EPOCHS = int(getenv('epochs', 3)) 113 | 114 | LAYER_DIMS = [ 115 | LAYER_DIM * 2**_ 116 | for _ in reversed(range(LAYERS_NUM)) 117 | ] 118 | 119 | ##################### 120 | # 121 | # CUSTOM TAGS TUNING 122 | # 123 | ############### START 124 | 125 | CUSTOM_TUNING = True # Set this flag to true in order to use your custom dataset and tags 126 | CUSTOM_TEXTS = join(DATA_DIR, 'big-synthetic-dataset.jl.gz') # Put your own data into the data dir 127 | TAGS = ['CUSTOM-TAG'] if CUSTOM_TUNING else [PER, LOC, ORG] # List all your custom tags 128 | ID = 'slovnet_ner_custom_tags' if CUSTOM_TUNING else 'slovnet_ner_news_v1' 129 | 130 | ################# END 131 | 132 | PACK = ID + '.tar' 133 | S3_PACK = join('packs', PACK) 134 | 135 | def adapt_markup(record): 136 | return SpanMarkup( 137 | record.text, 138 | [Span(_.start, _.stop, _.type) for _ in record.spans] 139 | ) 140 | 141 | def process_batch(model, criterion, batch): 142 | input, target = batch 143 | 144 | pred = model(input.word_id, input.shape_id) 145 | loss = criterion(pred, target) 146 | 147 | return batch.processed(loss, pred) 148 | -------------------------------------------------------------------------------- /scripts/05_ner/pack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {NAVEC_DIR} {MODEL_DIR} {PACK_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "if not exists(NAVEC):\n", 24 | " !wget {NAVEC_URL} -O {NAVEC}\n", 25 | " s3.download(S3_MODEL_SHAPE, MODEL_SHAPE)\n", 26 | " s3.download(S3_MODEL_ENCODER, MODEL_ENCODER)\n", 27 | " s3.download(S3_MODEL_NER, MODEL_NER)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "navec = Navec.load(NAVEC)\n", 37 | "\n", 38 | "words_vocab = Vocab(navec.vocab.words)\n", 39 | "shapes_vocab = Vocab([PAD] + SHAPES)\n", 40 | "tags_vocab = BIOTagsVocab(TAGS)\n", 41 | "\n", 42 | "word = NavecEmbedding(navec)\n", 43 | "shape = Embedding(\n", 44 | " vocab_size=len(shapes_vocab),\n", 45 | " dim=SHAPE_DIM,\n", 46 | " pad_id=shapes_vocab.pad_id\n", 47 | ")\n", 48 | "emb = TagEmbedding(word, shape)\n", 49 | "encoder = TagEncoder(\n", 50 | " input_dim=emb.dim,\n", 51 | " layer_dims=LAYER_DIMS,\n", 52 | " kernel_size=KERNEL_SIZE,\n", 53 | ")\n", 54 | "ner = NERHead(encoder.dim, len(tags_vocab))\n", 55 | "model = NER(emb, encoder, ner)\n", 56 | "model.eval()\n", 57 | "\n", 58 | "model.emb.shape.load(MODEL_SHAPE)\n", 59 | "model.encoder.load(MODEL_ENCODER)\n", 60 | "model.head.load(MODEL_NER)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "model = model.to_exec()\n", 70 | "model = model.strip_navec()\n", 71 | "arrays, model = model.separate_arrays()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": 
[ 80 | "with DumpPack(PACK) as pack:\n", 81 | " meta = Meta(ID)\n", 82 | " pack.dump_meta(meta)\n", 83 | " \n", 84 | " pack.dump_model(model)\n", 85 | " pack.dump_arrays(arrays)\n", 86 | "\n", 87 | " pack.dump_vocab(words_vocab, WORD)\n", 88 | " pack.dump_vocab(shapes_vocab, SHAPE)\n", 89 | " pack.dump_vocab(tags_vocab, TAG)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "ner = api.NER.load(PACK)\n", 99 | "ner.navec(navec)\n", 100 | "\n", 101 | "default = ['«Коронамобиль» Ангелы Меркель сняли на видео']\n", 102 | "custom = ['Ваш произвольный текст']\n", 103 | "\n", 104 | "chunk = custom if CUSTOM_TUNING else default\n", 105 | "\n", 106 | "markup = ner(chunk)\n", 107 | "\n", 108 | "show_span_markup(markup)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "#s3.upload(PACK, S3_PACK)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3.10.6 ('env': venv)", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.10.6" 145 | }, 146 | "vscode": { 147 | "interpreter": { 148 | "hash": "fbd3b3cf5ce5dbcb71d33f7b8a90c542bb07cc48175c202e830100849640f809" 149 | } 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 2 154 | } 155 | -------------------------------------------------------------------------------- /scripts/06_morph/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {RAW_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz -P {RAW_DIR}" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "records = load_nerus(RAW_NERUS)\n", 33 | "records = log_progress(records, total=NERUS_TOTAL)\n", 34 | "\n", 35 | "sents = (\n", 36 | " sent\n", 37 | " for record in records\n", 38 | " for sent in record.sents\n", 39 | ")\n", 40 | "markups = (adapt_markup(_.morph) for _ in sents)\n", 41 | "items = (_.as_json for _ in markups)\n", 42 | "lines = list(format_jl(items))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "seed(1)\n", 52 | "shuffle(lines)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "dump_gz_lines(log_progress(lines), NERUS)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "s3.upload(NERUS, 
S3_NERUS)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.6.9" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 2 102 | } 103 | -------------------------------------------------------------------------------- /scripts/06_morph/grid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %run -n main.py\n", 10 | "# seed(1)\n", 11 | "# env = dict(environ)\n", 12 | "# for index in range(100):\n", 13 | "# seed_ = str(randint(0, 100))\n", 14 | "# lr = '%.5f' % (10 ** uniform(-4, -1))\n", 15 | "# lr_gamma = '%.2f' % uniform(0.1, 0.9)\n", 16 | "# board_name = f'{index:03d} {seed_} {lr} {lr_gamma}'\n", 17 | "# env.update(\n", 18 | "# seed=seed_,\n", 19 | "# lr=lr,\n", 20 | "# lr_gamma=lr_gamma,\n", 21 | "# board_name=board_name\n", 22 | "# )\n", 23 | "# run(\n", 24 | "# args=[\n", 25 | "# 'jupyter', 'nbconvert',\n", 26 | "# '--ExecutePreprocessor.timeout=6000',\n", 27 | "# '--to=notebook',\n", 28 | "# '--execute', 'main.ipynb'\n", 29 | "# ],\n", 30 | "# env=env\n", 31 | "# )" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# %run -n main.py\n", 41 | "# seed(1)\n", 42 | "# env = dict(environ)\n", 43 | "# for layers_num in [2, 3, 4]:\n", 44 | "# for layer_dim in [16, 64, 256]:\n", 45 | "# board_name = f'{layers_num} {layer_dim}'\n", 46 | "# env.update(\n", 47 | "# layers_num=str(layers_num),\n", 48 | "# layer_dim=str(layer_dim),\n", 49 | "# board_name=board_name\n", 50 | "# )\n", 51 | "# run(\n", 52 | "# args=[\n", 53 | "# 'jupyter', 'nbconvert',\n", 54 | "# '--ExecutePreprocessor.timeout=6000',\n", 55 | "# '--to=notebook',\n", 56 | "# '--execute', 'main.ipynb'\n", 57 | "# ],\n", 58 | "# env=env\n", 59 | "# )" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 3", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.6.9" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 4 91 | } 92 | -------------------------------------------------------------------------------- /scripts/06_morph/main.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv, environ 3 | from os.path import exists, join, expanduser 4 | from random import seed, shuffle, sample, randint, uniform 5 | from itertools import islice as head 6 | from subprocess import run 7 | 8 | from tqdm.notebook import tqdm as log_progress 9 | 10 | import torch 11 | 
from torch import optim 12 | 13 | from nerus import load_nerus 14 | 15 | from navec import Navec 16 | 17 | from slovnet.s3 import S3 18 | from slovnet.io import ( 19 | format_jl, 20 | parse_jl, 21 | 22 | load_gz_lines, 23 | dump_gz_lines 24 | ) 25 | from slovnet.board import ( 26 | TensorBoard, 27 | LogBoard, 28 | MultiBoard 29 | ) 30 | from slovnet.const import ( 31 | TRAIN, TEST, 32 | CUDA0, PAD, 33 | WORD, SHAPE, TAG 34 | ) 35 | from slovnet.token import tokenize 36 | from slovnet.shape import SHAPES 37 | 38 | from slovnet.markup import ( 39 | MorphToken, 40 | MorphMarkup, 41 | show_morph_markup 42 | ) 43 | from slovnet.model.emb import ( 44 | Embedding, 45 | NavecEmbedding 46 | ) 47 | from slovnet.model.tag import ( 48 | TagEmbedding, 49 | TagEncoder, 50 | MorphHead, 51 | Morph 52 | ) 53 | from slovnet.loss import flatten_cross_entropy 54 | from slovnet.vocab import Vocab 55 | from slovnet.encoders.tag import TagTrainEncoder 56 | from slovnet.score import ( 57 | MorphBatchScore, 58 | MorphScoreMeter, 59 | score_morph_batch 60 | ) 61 | 62 | from slovnet.exec.pack import ( 63 | Meta, 64 | DumpPack 65 | ) 66 | from slovnet import api 67 | 68 | 69 | DATA_DIR = 'data' 70 | MODEL_DIR = 'model' 71 | NAVEC_DIR = 'navec' 72 | RAW_DIR = join(DATA_DIR, 'raw') 73 | S3_DIR = '06_morph' 74 | 75 | RAW_NERUS = join(RAW_DIR, 'nerus_lenta.conllu.gz') 76 | NERUS_TOTAL = 739346 77 | 78 | NERUS = join(DATA_DIR, 'nerus.jl.gz') 79 | S3_NERUS = join(S3_DIR, NERUS) 80 | 81 | NAVEC_URL = 'https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar' 82 | NAVEC = join(NAVEC_DIR, 'navec_news_v1_1B_250K_300d_100q.tar') 83 | 84 | TAGS_VOCAB = join(MODEL_DIR, 'tags_vocab.txt') 85 | MODEL_SHAPE = join(MODEL_DIR, 'shape.pt') 86 | MODEL_ENCODER = join(MODEL_DIR, 'encoder.pt') 87 | MODEL_MORPH = join(MODEL_DIR, 'morph.pt') 88 | 89 | S3_TAGS_VOCAB = join(S3_DIR, TAGS_VOCAB) 90 | S3_MODEL_SHAPE = join(S3_DIR, MODEL_SHAPE) 91 | S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER) 92 | S3_MODEL_MORPH = join(S3_DIR, MODEL_MORPH) 93 | 94 | ID = 'slovnet_morph_news_v1' 95 | PACK = ID + '.tar' 96 | S3_PACK = join('packs', PACK) 97 | 98 | BOARD_NAME = getenv('board_name', '06_morph') 99 | RUNS_DIR = 'runs' 100 | 101 | TRAIN_BOARD = '01_train' 102 | TEST_BOARD = '02_test' 103 | 104 | SEED = int(getenv('seed', 65)) 105 | DEVICE = getenv('device', CUDA0) 106 | 107 | SHAPE_DIM = int(getenv('shape_dim', 30)) 108 | LAYERS_NUM = int(getenv('layers_num', 3)) 109 | LAYER_DIM = int(getenv('layer_dim', 64)) 110 | KERNEL_SIZE = int(getenv('kernel_size', 3)) 111 | 112 | LR = float(getenv('lr', 0.0033)) 113 | LR_GAMMA = float(getenv('lr_gamma', 0.9)) 114 | EPOCHS = int(getenv('epochs', 5)) 115 | 116 | LAYER_DIMS = [ 117 | LAYER_DIM * 2**_ 118 | for _ in reversed(range(LAYERS_NUM)) 119 | ] 120 | 121 | 122 | def adapt_markup(record): 123 | return MorphMarkup([ 124 | MorphToken(_.text, _.pos, _.feats) 125 | for _ in record.tokens 126 | ]) 127 | 128 | 129 | def process_batch(model, criterion, batch): 130 | input, target = batch 131 | 132 | pred = model(input.word_id, input.shape_id) 133 | loss = criterion(pred, target) 134 | 135 | pred = model.morph.decode(pred) 136 | 137 | return batch.processed(loss, pred) 138 | -------------------------------------------------------------------------------- /scripts/06_morph/pack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | 
"source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {NAVEC_DIR} {MODEL_DIR} {PACK_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "if not exists(NAVEC):\n", 24 | " !wget {NAVEC_URL} -O {NAVEC}\n", 25 | " s3.download(S3_TAGS_VOCAB, TAGS_VOCAB)\n", 26 | " s3.download(S3_MODEL_SHAPE, MODEL_SHAPE)\n", 27 | " s3.download(S3_MODEL_ENCODER, MODEL_ENCODER)\n", 28 | " s3.download(S3_MODEL_MORPH, MODEL_MORPH)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "navec = Navec.load(NAVEC)\n", 38 | "\n", 39 | "words_vocab = Vocab(navec.vocab.words)\n", 40 | "shapes_vocab = Vocab([PAD] + SHAPES)\n", 41 | "tags_vocab = Vocab.load(TAGS_VOCAB)\n", 42 | "\n", 43 | "word = NavecEmbedding(navec)\n", 44 | "shape = Embedding(\n", 45 | " vocab_size=len(shapes_vocab),\n", 46 | " dim=SHAPE_DIM,\n", 47 | " pad_id=shapes_vocab.pad_id\n", 48 | ")\n", 49 | "emb = TagEmbedding(word, shape)\n", 50 | "encoder = TagEncoder(\n", 51 | " input_dim=emb.dim,\n", 52 | " layer_dims=LAYER_DIMS,\n", 53 | " kernel_size=KERNEL_SIZE,\n", 54 | ")\n", 55 | "morph = MorphHead(encoder.dim, len(tags_vocab))\n", 56 | "model = Morph(emb, encoder, morph)\n", 57 | "model.eval()\n", 58 | "\n", 59 | "model.emb.shape.load(MODEL_SHAPE)\n", 60 | "model.encoder.load(MODEL_ENCODER)\n", 61 | "model.head.load(MODEL_MORPH)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "model = model.to_exec()\n", 71 | "model = model.strip_navec()\n", 72 | "arrays, model = model.separate_arrays()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "with DumpPack(PACK) as pack:\n", 82 | " meta = Meta(ID)\n", 83 | " pack.dump_meta(meta)\n", 84 | " \n", 85 | " pack.dump_model(model)\n", 86 | " pack.dump_arrays(arrays)\n", 87 | "\n", 88 | " pack.dump_vocab(words_vocab, WORD)\n", 89 | " pack.dump_vocab(shapes_vocab, SHAPE)\n", 90 | " pack.dump_vocab(tags_vocab, TAG)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "morph = api.Morph.load(PACK)\n", 100 | "morph.navec(navec)\n", 101 | "words = 'Ежедневно очаги коронавирусной инфекции'.split()\n", 102 | "markup = morph(words)\n", 103 | "show_morph_markup(markup)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "s3.upload(PACK, S3_PACK)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [] 121 | } 122 | ], 123 | "metadata": { 124 | "kernelspec": { 125 | "display_name": "Python 3", 126 | "language": "python", 127 | "name": "python3" 128 | }, 129 | "language_info": { 130 | "codemirror_mode": { 131 | "name": "ipython", 132 | "version": 3 133 | }, 134 | "file_extension": ".py", 135 | "mimetype": "text/x-python", 136 | "name": "python", 137 | "nbconvert_exporter": "python", 138 | "pygments_lexer": "ipython3", 139 | "version": "3.6.9" 140 | } 141 | }, 142 | "nbformat": 4, 143 | "nbformat_minor": 2 144 | } 145 | -------------------------------------------------------------------------------- 
/scripts/07_syntax/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {RAW_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz -P {RAW_DIR}" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "records = load_nerus(RAW_NERUS)\n", 33 | "records = log_progress(records, total=NERUS_TOTAL)\n", 34 | "\n", 35 | "sents = (\n", 36 | " sent\n", 37 | " for record in records\n", 38 | " for sent in record.sents\n", 39 | ")\n", 40 | "markups = (adapt_markup(_.syntax) for _ in sents)\n", 41 | "items = (_.as_json for _ in markups)\n", 42 | "lines = list(format_jl(items))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "seed(1)\n", 52 | "shuffle(lines)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# dump_gz_lines(log_progress(lines), NERUS)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# s3.upload(NERUS, S3_NERUS)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.6.9" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 2 102 | } 103 | -------------------------------------------------------------------------------- /scripts/07_syntax/grid.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %run -n main.py\n", 10 | "# seed(1)\n", 11 | "# env = dict(environ)\n", 12 | "# for index in range(100):\n", 13 | "# seed_ = str(randint(0, 100))\n", 14 | "# lr = '%.5f' % (10 ** uniform(-4, -1))\n", 15 | "# lr_gamma = '%.2f' % uniform(0.1, 0.9)\n", 16 | "# board_name = f'{index:03d} {seed_} {lr} {lr_gamma}'\n", 17 | "# env.update(\n", 18 | "# seed=seed_,\n", 19 | "# lr=lr,\n", 20 | "# lr_gamma=lr_gamma,\n", 21 | "# board_name=board_name\n", 22 | "# )\n", 23 | "# run(\n", 24 | "# args=[\n", 25 | "# 'jupyter', 'nbconvert',\n", 26 | "# '--ExecutePreprocessor.timeout=6000',\n", 27 | "# '--to=notebook',\n", 28 | "# '--execute', 'main.ipynb'\n", 29 | "# ],\n", 30 | "# env=env\n", 31 | "# )" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# %run -n main.py\n", 41 | "# seed(1)\n", 42 | "# env = 
dict(environ)\n", 43 | "# for layers_num in [2, 3, 4]:\n", 44 | "# for layer_dim in [16, 64, 256]:\n", 45 | "# board_name = f'{layers_num} {layer_dim}'\n", 46 | "# env.update(\n", 47 | "# layers_num=str(layers_num),\n", 48 | "# layer_dim=str(layer_dim),\n", 49 | "# board_name=board_name\n", 50 | "# )\n", 51 | "# run(\n", 52 | "# args=[\n", 53 | "# 'jupyter', 'nbconvert',\n", 54 | "# '--ExecutePreprocessor.timeout=6000',\n", 55 | "# '--to=notebook',\n", 56 | "# '--execute', 'main.ipynb'\n", 57 | "# ],\n", 58 | "# env=env\n", 59 | "# )" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 3", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.6.9" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 4 91 | } 92 | -------------------------------------------------------------------------------- /scripts/07_syntax/main.py: -------------------------------------------------------------------------------- 1 | 2 | from os import getenv, environ 3 | from os.path import exists, join, expanduser 4 | from random import seed, shuffle, sample, randint, uniform 5 | from subprocess import run 6 | 7 | from tqdm.notebook import tqdm as log_progress 8 | 9 | import torch 10 | from torch import optim 11 | 12 | from nerus import load_nerus 13 | 14 | from navec import Navec 15 | 16 | from slovnet.s3 import S3 17 | from slovnet.io import ( 18 | format_jl, 19 | parse_jl, 20 | 21 | load_gz_lines, 22 | dump_gz_lines 23 | ) 24 | from slovnet.board import ( 25 | TensorBoard, 26 | LogBoard, 27 | MultiBoard 28 | ) 29 | from slovnet.const import ( 30 | TRAIN, TEST, 31 | CUDA0, PAD, 32 | WORD, SHAPE, REL 33 | ) 34 | from slovnet.token import tokenize 35 | from slovnet.shape import SHAPES 36 | 37 | from slovnet.markup import ( 38 | SyntaxToken, 39 | SyntaxMarkup, 40 | show_syntax_markup 41 | ) 42 | from slovnet.model.emb import ( 43 | Embedding, 44 | NavecEmbedding 45 | ) 46 | from slovnet.model.syntax import ( 47 | SyntaxEmbedding, 48 | SyntaxEncoder, 49 | SyntaxHead, 50 | SyntaxRel, 51 | Syntax 52 | ) 53 | from slovnet.loss import masked_flatten_cross_entropy 54 | from slovnet.vocab import Vocab 55 | from slovnet.encoders.syntax import SyntaxTrainEncoder 56 | from slovnet.score import ( 57 | SyntaxBatchScore, 58 | SyntaxScoreMeter, 59 | score_syntax_batch 60 | ) 61 | 62 | from slovnet.exec.pack import ( 63 | Meta, 64 | DumpPack 65 | ) 66 | from slovnet import api 67 | 68 | 69 | DATA_DIR = 'data' 70 | MODEL_DIR = 'model' 71 | NAVEC_DIR = 'navec' 72 | RAW_DIR = join(DATA_DIR, 'raw') 73 | S3_DIR = '07_syntax' 74 | 75 | RAW_NERUS = join(RAW_DIR, 'nerus_lenta.conllu.gz') 76 | NERUS_TOTAL = 739346 77 | 78 | NERUS = join(DATA_DIR, 'nerus.jl.gz') 79 | S3_NERUS = join(S3_DIR, NERUS) 80 | 81 | NAVEC_URL = 'https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar' 82 | NAVEC = join(NAVEC_DIR, 'navec_news_v1_1B_250K_300d_100q.tar') 83 | 84 | RELS_VOCAB = join(MODEL_DIR, 'rels_vocab.txt') 85 | MODEL_SHAPE = join(MODEL_DIR, 'shape.pt') 86 | MODEL_ENCODER = join(MODEL_DIR, 'encoder.pt') 87 | MODEL_HEAD = join(MODEL_DIR, 'head.pt') 88 | MODEL_REL = 
join(MODEL_DIR, 'rel.pt') 89 | 90 | S3_RELS_VOCAB = join(S3_DIR, RELS_VOCAB) 91 | S3_MODEL_SHAPE = join(S3_DIR, MODEL_SHAPE) 92 | S3_MODEL_ENCODER = join(S3_DIR, MODEL_ENCODER) 93 | S3_MODEL_HEAD = join(S3_DIR, MODEL_HEAD) 94 | S3_MODEL_REL = join(S3_DIR, MODEL_REL) 95 | 96 | ID = 'slovnet_syntax_news_v1' 97 | PACK = ID + '.tar' 98 | S3_PACK = join('packs', PACK) 99 | 100 | BOARD_NAME = getenv('board_name', '07_syntax') 101 | RUNS_DIR = 'runs' 102 | 103 | TRAIN_BOARD = '01_train' 104 | TEST_BOARD = '02_test' 105 | 106 | SEED = int(getenv('seed', 17)) 107 | DEVICE = getenv('device', CUDA0) 108 | 109 | SHAPE_DIM = int(getenv('shape_dim', 30)) 110 | LAYERS_NUM = int(getenv('layers_num', 3)) 111 | LAYER_DIM = int(getenv('layer_dim', 64)) 112 | KERNEL_SIZE = int(getenv('kernel_size', 3)) 113 | 114 | LR = float(getenv('lr', 0.0051)) 115 | LR_GAMMA = float(getenv('lr_gamma', 0.74)) 116 | EPOCHS = int(getenv('epochs', 3)) 117 | 118 | LAYER_DIMS = [ 119 | LAYER_DIM * 2**_ 120 | for _ in reversed(range(LAYERS_NUM)) 121 | ] 122 | 123 | 124 | def adapt_markup(record): 125 | return SyntaxMarkup([ 126 | SyntaxToken(_.id, _.text, _.head_id, _.rel) 127 | for _ in record.tokens 128 | ]) 129 | 130 | 131 | def process_batch(model, criterion, batch): 132 | input, target = batch 133 | 134 | pred = model( 135 | input.word_id, input.shape_id, input.pad_mask, 136 | target.mask, target.head_id 137 | ) 138 | loss = ( 139 | criterion(pred.head_id, target.head_id, target.mask) 140 | + criterion(pred.rel_id, target.rel_id, target.mask) 141 | ) 142 | 143 | pred.head_id = model.head.decode(pred.head_id, target.mask) 144 | pred.rel_id = model.rel.decode(pred.rel_id, target.mask) 145 | 146 | return batch.processed(loss, pred) 147 | -------------------------------------------------------------------------------- /scripts/07_syntax/pack.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%run main.py\n", 10 | "%load_ext autoreload\n", 11 | "%autoreload 2\n", 12 | "\n", 13 | "!mkdir -p {NAVEC_DIR} {MODEL_DIR} {PACK_DIR}\n", 14 | "s3 = S3()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "if not exists(NAVEC):\n", 24 | " !wget {NAVEC_URL} -O {NAVEC}\n", 25 | " s3.download(S3_MODEL_SHAPE, MODEL_SHAPE)\n", 26 | " s3.download(S3_MODEL_ENCODER, MODEL_ENCODER)\n", 27 | " s3.download(S3_MODEL_HEAD, MODEL_HEAD)\n", 28 | " s3.download(S3_MODEL_REL, MODEL_REL)\n", 29 | " s3.download(S3_RELS_VOCAB, RELS_VOCAB)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "navec = Navec.load(NAVEC)\n", 39 | "\n", 40 | "words_vocab = Vocab(navec.vocab.words)\n", 41 | "shapes_vocab = Vocab([PAD] + SHAPES)\n", 42 | "rels_vocab = Vocab.load(RELS_VOCAB)\n", 43 | "\n", 44 | "word = NavecEmbedding(navec)\n", 45 | "shape = Embedding(\n", 46 | " vocab_size=len(shapes_vocab),\n", 47 | " dim=SHAPE_DIM,\n", 48 | " pad_id=shapes_vocab.pad_id\n", 49 | ")\n", 50 | "emb = SyntaxEmbedding(word, shape)\n", 51 | "encoder = SyntaxEncoder(\n", 52 | " input_dim=emb.dim,\n", 53 | " layer_dims=LAYER_DIMS,\n", 54 | " kernel_size=KERNEL_SIZE,\n", 55 | ")\n", 56 | "head = SyntaxHead(\n", 57 | " input_dim=encoder.dim,\n", 58 | " hidden_dim=encoder.dim // 2,\n", 59 | ")\n", 60 | "rel = SyntaxRel(\n", 61 | " 
input_dim=encoder.dim,\n", 62 | " hidden_dim=encoder.dim // 2,\n", 63 | " rel_dim=len(rels_vocab)\n", 64 | ")\n", 65 | "model = Syntax(emb, encoder, head, rel)\n", 66 | "model.eval()\n", 67 | "\n", 68 | "model.emb.shape.load(MODEL_SHAPE)\n", 69 | "model.encoder.load(MODEL_ENCODER)\n", 70 | "model.head.load(MODEL_HEAD)\n", 71 | "model.rel.load(MODEL_REL)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "model = model.to_exec()\n", 81 | "model = model.strip_navec()\n", 82 | "arrays, model = model.separate_arrays()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "with DumpPack(PACK) as pack:\n", 92 | " meta = Meta(ID)\n", 93 | " pack.dump_meta(meta)\n", 94 | " \n", 95 | " pack.dump_model(model)\n", 96 | " pack.dump_arrays(arrays)\n", 97 | "\n", 98 | " pack.dump_vocab(words_vocab, WORD)\n", 99 | " pack.dump_vocab(shapes_vocab, SHAPE)\n", 100 | " pack.dump_vocab(rels_vocab, REL)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "syntax = api.Syntax.load(PACK)\n", 110 | "syntax.navec(navec)\n", 111 | "words = 'Опубликованы новые данные по заражению коронавирусом в Москве'.split()\n", 112 | "markup = syntax(words)\n", 113 | "show_syntax_markup(markup)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "s3.upload(PACK, S3_PACK)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.6.9" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 2 154 | } 155 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # How to run notebooks in the /slovnet/scripts: 2 | 3 | 1) Navigate to the repository root folder 4 | 5 | 2) Install the required dependencies by running these commands: 6 | 7 | ```bash 8 | pip3 install -r requirements/dev.txt 9 | pip3 install -e . 
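# NOTE: the next two lines are an added hint, not part of the original README —
# once the dependencies are installed, the notebooks can be opened by launching Jupyter from the repo root, e.g.:
# jupyter notebook scripts/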
10 | 11 | ``` 12 | 13 | ---- 14 | # How to train the NER model on your custom texts and tags: 15 | 16 | 17 | 18 | ## Step 1: train a big BERT NER model 19 | 20 | 1) #### Specify three variables in the [slovnet/scripts/02_bert_ner/main.py](https://github.com/natasha/slovnet/blob/master/scripts/02_bert_ner/main.py) 21 | 22 | If you are going to train the model on your own custom texts and tags, then: 23 | - first, set the **CUSTOM_TUNING** flag to **True** 24 | - then specify the list of your custom tags in the **TAGS** variable 25 | - and also specify the name of the file with your custom dataset in the **CUSTOM_TEXTS** variable (**custom-dataset.jl.gz** by default) 26 | 27 | 2) #### By running the cells in the [slovnet/scripts/02_bert_ner/main.ipynb](https://github.com/natasha/slovnet/blob/master/scripts/02_bert_ner/main.ipynb) 28 | 29 | - Download the default datasets **factru.jl.gz** and **ne5.jl.gz** in order to understand the input file format 30 | - Annotate your texts (about 1K-2K texts, ~1000 symbols each) with any NER annotator, for example [this online tool](https://paramonych.github.io/ner-annotator-online) 31 | - Combine the annotated texts into a **".jl"** file and name it according to the value specified in the **CUSTOM_TEXTS** variable from the step above 32 | - Zip your **".jl"** file 33 | - Put your zipped **".jl.gz"** file into the same **02_bert_ner/data/** directory 34 | - Download the **encoder** (*encoder.pt*), **tokens** (*vocab.txt*) and **embeddings** (*emb.pt*) pretrained on hundreds of thousands of articles into the **02_bert_ner/bert/** directory 35 | - Configure and train the model 36 | - Finally, dump the resulting model into the **02_bert_ner/model/** directory as **encoder.pt** and **ner.pt** files 37 | 38 | 39 | 40 | ## Step 2: mark up a big dataset with the BERT NER model from step one 41 | 42 | 1) #### Prepare a big dataset containing about 500K-1000K texts, ~1000 symbols each 43 | 2) #### By running the cells in the [slovnet/scripts/02_bert_ner/infer.ipynb](https://github.com/natasha/slovnet/blob/master/scripts/02_bert_ner/infer.ipynb) 44 | 45 | - Download (if needed) the **tokens** (*vocab.txt*) pretrained on hundreds of thousands of articles into the **02_bert_ner/bert/** directory 46 | - Configure the model pretrained in the previous step 47 | - Iterate through the list of your texts and feed them to the inference in small chunks (~1000 symbols each) 48 | 49 | 50 | 51 | ## Step 3: train a small NER model on the big synthetic markup from step two 52 | 53 | 1) #### Specify four variables in the [slovnet/scripts/05_ner/main.py](https://github.com/natasha/slovnet/blob/master/scripts/05_ner/main.py) 54 | 55 | If you are going to train the model on your own custom texts and tags, then: 56 | - first, set the **CUSTOM_TUNING** flag to **True** 57 | - then specify the list of your custom tags in the **TAGS** variable (**Important!** These must be the *same TAGS as in step one*) 58 | - then specify the name of the file with your big synthetic custom dataset (from the previous step) in the **CUSTOM_TEXTS** variable (**big-synthetic-dataset.jl.gz** by default) 59 | - and finally specify your resulting package name by setting the **ID** variable (**slovnet_ner_custom_tags** by default) 60 | 61 | 2) #### By running the cells in the [slovnet/scripts/05_ner/main.ipynb](https://github.com/natasha/slovnet/blob/master/scripts/05_ner/main.ipynb) 62 | 63 | - Download the default big synthetic dataset **nerus.jl.gz** in order to understand the input file format 64 | - Put your zipped big 
synthetic dataset from the previous step, as a **".jl.gz"** file, into the same **05_ner/data/** directory 65 | - Download the **navec embeddings** (*navec_news_v1_1B_250K_300d_100q.tar*) pretrained on hundreds of thousands of articles into the **05_ner/navec/** directory 66 | - Configure and train the model 67 | - Finally, dump the resulting model into the **05_ner/model/** directory as **encoder.pt**, **ner.pt** and **shape.pt** files 68 | 69 | 70 | 71 | ## Step 4: pack and test the small NER model 72 | 73 | #### By running the cells in the [slovnet/scripts/05_ner/pack.ipynb](https://github.com/natasha/slovnet/blob/master/scripts/05_ner/pack.ipynb) 74 | 75 | - Download (if needed) the **navec embeddings** (*navec_news_v1_1B_250K_300d_100q.tar*) pretrained on hundreds of thousands of articles into the **05_ner/navec/** directory 76 | - Configure the model pretrained in the previous step 77 | - Prepare and dump the resulting package - it will be **05_ner/slovnet_ner_custom_tags.tar** by default 78 | - Load the package, pass the embeddings into it and then test it on your own text -------------------------------------------------------------------------------- /scripts/slovnet.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3_key_id":"YCAJER9umFAFwOsa2lxWTEsCe", 3 | "s3_key":"YCOL0pFUXfEdeumVr-GSM1g2co_sN4-BV09AtVzI" 4 | } -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | python_files = test_*.py test.py 3 | 4 | [flake8] 5 | # E501 line too long 6 | # W503 line break before binary op 7 | extend-ignore = E501,W503 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup, find_packages 3 | 4 | 5 | with open('README.md') as file: 6 | description = file.read() 7 | 8 | 9 | setup( 10 | name='slovnet', 11 | version='0.6.0', 12 | 13 | description='Deep-learning based NLP modeling for Russian language', 14 | long_description=description, 15 | long_description_content_type='text/markdown', 16 | 17 | url='https://github.com/natasha/slovnet', 18 | author='Alexander Kukushkin', 19 | author_email='alex@alexkuk.ru', 20 | license='MIT', 21 | 22 | classifiers=[ 23 | 'License :: OSI Approved :: MIT License', 24 | 'Programming Language :: Python :: 3', 25 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 26 | ], 27 | keywords='nlp, deeplearning, russian', 28 | 29 | packages=find_packages( 30 | exclude=['tests'] 31 | ), 32 | install_requires=[ 33 | 'numpy', 34 | 'razdel', 35 | 'navec', 36 | ] 37 | ) 38 | -------------------------------------------------------------------------------- /slovnet/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .api import NER, Morph, Syntax # noqa 3 | -------------------------------------------------------------------------------- /slovnet/api.py: -------------------------------------------------------------------------------- 1 | 2 | from .record import Record 3 | from .const import WORD, SHAPE, TAG, REL 4 | from .chop import chop 5 | 6 | from .exec.pack import Pack 7 | from .exec.model import ( 8 | Morph as MorphModel, 9 | NER as NERModel, 10 | Syntax as SyntaxModel 11 | ) 12 | from .exec.encoders import ( 13 | TagEncoder, 14 | SyntaxEncoder 15 | ) 16 | from .exec.infer 
import ( 17 | TagDecoder, 18 | MorphInfer, 19 | NERInfer, 20 | 21 | SyntaxDecoder, 22 | SyntaxInfer 23 | ) 24 | 25 | 26 | class API(Record): 27 | __attributes__ = ['infer', 'batch_size'] 28 | 29 | def navec(self, navec): 30 | self.infer.model = self.infer.model.inject_navec(navec) 31 | return self 32 | 33 | def map(self, items): 34 | for chunk in chop(items, self.batch_size): 35 | yield from self.infer(chunk) 36 | 37 | def __call__(self, item): 38 | return next(self.map([item])) 39 | 40 | 41 | class NER(API): 42 | @classmethod 43 | def load(cls, path, batch_size=8): 44 | with Pack(path) as pack: 45 | meta = pack.load_meta() 46 | meta.check_protocol() 47 | 48 | model = pack.load_model(NERModel) 49 | arrays = dict(pack.load_arrays(model.weights)) 50 | 51 | words_vocab = pack.load_vocab(WORD) 52 | shapes_vocab = pack.load_vocab(SHAPE) 53 | tags_vocab = pack.load_vocab(TAG) 54 | 55 | model = model.inject_arrays(arrays) 56 | encoder = TagEncoder( 57 | words_vocab, shapes_vocab, 58 | batch_size 59 | ) 60 | decoder = TagDecoder(tags_vocab) 61 | infer = NERInfer(model, encoder, decoder) 62 | 63 | return cls(infer, batch_size) 64 | 65 | 66 | class Morph(API): 67 | @classmethod 68 | def load(cls, path, batch_size=8): 69 | with Pack(path) as pack: 70 | meta = pack.load_meta() 71 | meta.check_protocol() 72 | 73 | model = pack.load_model(MorphModel) 74 | arrays = dict(pack.load_arrays(model.weights)) 75 | 76 | words_vocab = pack.load_vocab(WORD) 77 | shapes_vocab = pack.load_vocab(SHAPE) 78 | tags_vocab = pack.load_vocab(TAG) 79 | 80 | model = model.inject_arrays(arrays) 81 | encoder = TagEncoder( 82 | words_vocab, shapes_vocab, 83 | batch_size 84 | ) 85 | decoder = TagDecoder(tags_vocab) 86 | infer = MorphInfer(model, encoder, decoder) 87 | 88 | return cls(infer, batch_size) 89 | 90 | 91 | class Syntax(API): 92 | @classmethod 93 | def load(cls, path, batch_size=8): 94 | with Pack(path) as pack: 95 | meta = pack.load_meta() 96 | meta.check_protocol() 97 | 98 | model = pack.load_model(SyntaxModel) 99 | arrays = dict(pack.load_arrays(model.weights)) 100 | 101 | words_vocab = pack.load_vocab(WORD) 102 | shapes_vocab = pack.load_vocab(SHAPE) 103 | rels_vocab = pack.load_vocab(REL) 104 | 105 | model = model.inject_arrays(arrays) 106 | encoder = SyntaxEncoder( 107 | words_vocab, shapes_vocab, 108 | batch_size 109 | ) 110 | decoder = SyntaxDecoder(rels_vocab) 111 | infer = SyntaxInfer(model, encoder, decoder) 112 | 113 | return cls(infer, batch_size) 114 | -------------------------------------------------------------------------------- /slovnet/batch.py: -------------------------------------------------------------------------------- 1 | 2 | from .record import Record 3 | 4 | 5 | class Batch(Record): 6 | __attributes__ = ['input', 'target'] 7 | 8 | def processed(self, loss, pred): 9 | return ProcessedBatch( 10 | self.input, self.target, 11 | loss, pred 12 | ) 13 | 14 | 15 | class ProcessedBatch(Record): 16 | __attributes__ = ['input', 'target', 'loss', 'pred'] 17 | -------------------------------------------------------------------------------- /slovnet/bert.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | 5 | def bert_chunks(text): 6 | # diff with bert tokenizer 28 / 10000 ~0.3% 7 | # школа №3 -> школа, №3 8 | # @diet_prada -> @, diet, _, prada 9 | return re.findall(r'\w+|[^\w\s]', text) 10 | 11 | 12 | def wordpiece(text, vocab, prefix='##'): 13 | start = 0 14 | stop = size = len(text) 15 | subs = [] 16 | while start < size: 17 | sub = 
text[start:stop] 18 | if start > 0: 19 | sub = prefix + sub 20 | if sub in vocab.item_ids: 21 | subs.append(sub) 22 | start = stop 23 | stop = size 24 | else: 25 | stop -= 1 26 | if stop < start: 27 | return 28 | return subs 29 | 30 | 31 | def safe_wordpiece(text, vocab): 32 | subs = wordpiece(text, vocab) 33 | if not subs: 34 | return [text] 35 | return subs 36 | 37 | 38 | def bert_subs(text, vocab): 39 | return [ 40 | sub 41 | for chunk in bert_chunks(text) 42 | for sub in safe_wordpiece(chunk, vocab) 43 | ] 44 | -------------------------------------------------------------------------------- /slovnet/bio.py: -------------------------------------------------------------------------------- 1 | 2 | from .span import Span 3 | from .const import B, I, O 4 | 5 | 6 | def parse_bio(tag): 7 | if '-' in tag: 8 | part, type = tag.split('-', 1) 9 | else: 10 | part = tag 11 | type = None 12 | return part, type 13 | 14 | 15 | def format_bio(part, type): 16 | if not type: 17 | return part 18 | return '%s-%s' % (part, type) 19 | 20 | 21 | ########## 22 | # 23 | # IO 24 | # 25 | ######### 26 | 27 | # assert tokens and spans are sorted 28 | # assert spans do not overlap 29 | # assert span bounds align with token bounds 30 | 31 | 32 | def append_ellipsis(items, ellipsis=None): 33 | for item in items: 34 | yield item 35 | yield ellipsis 36 | 37 | 38 | def spans_io(tokens, spans): 39 | spans = append_ellipsis(spans) 40 | span = next(spans) 41 | for token in tokens: 42 | part = O 43 | type = None 44 | if span: 45 | if token.start >= span.start: 46 | part = I 47 | type = span.type 48 | if token.stop >= span.stop: 49 | span = next(spans) 50 | yield format_bio(part, type) 51 | 52 | 53 | def io_spans(tokens, tags): 54 | previous = None 55 | start = None 56 | stop = None 57 | for token, tag in zip(tokens, tags): 58 | part, type = parse_bio(tag) 59 | # wikiner splits on I-PER B-PER for example 60 | if previous != type or part == B: 61 | if not previous and type: 62 | # O I 63 | start = token.start 64 | elif previous and type: 65 | # I-A I-B 66 | yield Span(start, stop, previous) 67 | start = token.start 68 | elif previous and not type: 69 | # I O 70 | yield Span(start, stop, previous) 71 | previous = None 72 | previous = type 73 | stop = token.stop 74 | if previous: 75 | yield Span(start, stop, previous) 76 | 77 | 78 | ####### 79 | # 80 | # BIO 81 | # 82 | ######### 83 | 84 | 85 | def spans_bio(tokens, spans): 86 | spans = append_ellipsis(spans) 87 | span = next(spans) 88 | for token in tokens: 89 | part = O 90 | type = None 91 | if span: 92 | if token.start >= span.start: 93 | type = span.type 94 | if token.start == span.start: 95 | part = B 96 | else: 97 | part = I 98 | if token.stop >= span.stop: 99 | span = next(spans) 100 | yield format_bio(part, type) 101 | 102 | 103 | def bio_spans(tokens, tags): 104 | previous = None 105 | start = None 106 | stop = None 107 | for token, tag in zip(tokens, tags): 108 | part, type = parse_bio(tag) 109 | if part == O: 110 | if previous: 111 | yield Span(start, stop, previous) 112 | previous = None 113 | elif part == B: 114 | if previous: 115 | yield Span(start, stop, previous) 116 | previous = type 117 | start = token.start 118 | stop = token.stop 119 | elif part == I: 120 | stop = token.stop 121 | if previous: 122 | yield Span(start, stop, previous) 123 | 124 | 125 | ######### 126 | # 127 | # CONVERT 128 | # 129 | ######### 130 | 131 | 132 | def bio_io(tags): 133 | for tag in tags: 134 | part, type = parse_bio(tag) 135 | if part == B: 136 | part = I 137 | yield 
format_bio(part, type) 138 | 139 | 140 | ######## 141 | # 142 | # SELECT 143 | # 144 | ###### 145 | 146 | 147 | def select_type_tags(tags, selected): 148 | for tag in tags: 149 | part, type = parse_bio(tag) 150 | if type != selected: 151 | part = O 152 | type = None 153 | yield format_bio(part, type) 154 | -------------------------------------------------------------------------------- /slovnet/board.py: -------------------------------------------------------------------------------- 1 | 2 | from os.path import join as join_path 3 | 4 | from torch.utils.tensorboard import SummaryWriter 5 | 6 | from .record import Record 7 | from .log import log 8 | 9 | 10 | class Board(Record): 11 | __attributes__ = ['steps'] 12 | 13 | def __init__(self, steps=0): 14 | self.steps = steps 15 | 16 | def section(self, name): 17 | return BoardSection(name, self) 18 | 19 | def step(self): 20 | self.steps += 1 21 | 22 | 23 | class TensorBoard(Board): 24 | __attributes__ = ['dir', 'root', 'steps'] 25 | 26 | def __init__(self, dir, root, steps=0, flush_secs=1): 27 | self.dir = dir 28 | self.root = root 29 | self.writer = SummaryWriter( 30 | join_path(root, dir), 31 | flush_secs=flush_secs 32 | ) 33 | super(TensorBoard, self).__init__(steps) 34 | 35 | def add_scalar(self, key, value): 36 | self.writer.add_scalar(key, value, self.steps) 37 | 38 | 39 | class LogBoard(Board): 40 | def add_scalar(self, key, value): 41 | log('{:>4} {:.4f} {}'.format(self.steps, value, key)) 42 | 43 | 44 | class MultiBoard(Board): 45 | __attributes__ = ['boards'] 46 | 47 | def __init__(self, boards): 48 | self.boards = boards 49 | 50 | def step(self): 51 | for board in self.boards: 52 | board.step() 53 | 54 | def add_scalar(self, key, value): 55 | for board in self.boards: 56 | board.add_scalar(key, value) 57 | 58 | 59 | class BoardSection(Record): 60 | __attributes__ = ['name', 'board'] 61 | 62 | def prefixed(self, key): 63 | return '%s/%s' % (self.name, key) 64 | 65 | def add_scalar(self, key, value): 66 | self.board.add_scalar(self.prefixed(key), value) 67 | -------------------------------------------------------------------------------- /slovnet/chop.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def chop(items, size): 4 | buffer = [] 5 | for item in items: 6 | buffer.append(item) 7 | if len(buffer) >= size: 8 | yield buffer 9 | buffer = [] 10 | if buffer: 11 | yield buffer 12 | 13 | 14 | def chop_drop(items, size): 15 | chunks = chop(items, size) 16 | for chunk in chunks: 17 | if len(chunk) < size: 18 | continue 19 | yield chunk 20 | 21 | 22 | def chop_weighted(items, size, weight): 23 | buffer = [] 24 | accum = 0 25 | for item in items: 26 | value = weight(item) 27 | if accum + value > size: 28 | yield buffer 29 | buffer = [] 30 | accum = 0 31 | buffer.append(item) 32 | accum += value 33 | if buffer: 34 | yield buffer 35 | -------------------------------------------------------------------------------- /slovnet/conll.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def format_conll_tag(pos, feats): 4 | if not feats: 5 | return pos 6 | 7 | feats = '|'.join( 8 | '%s=%s' % (_, feats[_]) 9 | for _ in sorted(feats) 10 | ) 11 | return '%s|%s' % (pos, feats) 12 | 13 | 14 | def parse_conll_tag(tag): 15 | if '|' not in tag: 16 | return tag, {} 17 | 18 | pos, feats = tag.split('|', 1) 19 | feats = dict( 20 | _.split('=', 1) 21 | for _ in feats.split('|') 22 | ) 23 | return pos, feats 24 | 
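A minimal usage sketch for the two CoNLL tag helpers above (the POS and feature values here are illustrative, not taken from the repo):

```python
from slovnet.conll import format_conll_tag, parse_conll_tag

# pos + feats dict -> single tag string; feats are joined sorted by key
tag = format_conll_tag('NOUN', {'Number': 'Sing', 'Case': 'Nom'})
assert tag == 'NOUN|Case=Nom|Number=Sing'

# and back: the first '|' separates pos from the 'key=value' feature pairs
pos, feats = parse_conll_tag(tag)
assert pos == 'NOUN' and feats == {'Case': 'Nom', 'Number': 'Sing'}
```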
-------------------------------------------------------------------------------- /slovnet/const.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from os.path import exists, abspath 4 | 5 | from .io import load_json 6 | 7 | TRAIN = 'train' 8 | DEV = 'dev' 9 | TEST = 'test' 10 | 11 | ######### 12 | # DEVICE 13 | ######### 14 | 15 | CUDA0 = 'cuda:0' 16 | CUDA1 = 'cuda:1' 17 | CUDA2 = 'cuda:2' 18 | CUDA3 = 'cuda:3' 19 | CPU = 'cpu' 20 | 21 | ######### 22 | # VOCAB 23 | ######### 24 | 25 | UNK = '<unk>' 26 | PAD = '<pad>' 27 | CLS = '<cls>' 28 | SEP = '<sep>' 29 | MASK = '<mask>' 30 | 31 | WORD = 'word' 32 | SHAPE = 'shape' 33 | TAG = 'tag' 34 | REL = 'rel' 35 | 36 | ######### 37 | # BIO 38 | ######### 39 | 40 | B = 'B' 41 | I = 'I' # noqa E741 42 | O = 'O' # noqa 43 | 44 | PER = 'PER' 45 | LOC = 'LOC' 46 | ORG = 'ORG' 47 | 48 | ######### 49 | # CONFIG 50 | ######### 51 | 52 | config = {} 53 | path = abspath('../slovnet.json') 54 | 55 | if exists(path): 56 | config = load_json(path) 57 | 58 | ######### 59 | # S3 60 | ######### 61 | 62 | S3_KEY_ID = config.get('s3_key_id') 63 | S3_KEY = config.get('s3_key') 64 | S3_BUCKET = 'natasha-slovnet' 65 | S3_REGION = 'us-east-1' 66 | S3_ENDPOINT = 'https://storage.yandexcloud.net' 67 | -------------------------------------------------------------------------------- /slovnet/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/slovnet/4b46dc497b586e106258819c23be28c277efac97/slovnet/encoders/__init__.py -------------------------------------------------------------------------------- /slovnet/encoders/buffer.py: -------------------------------------------------------------------------------- 1 | 2 | from random import shuffle 3 | 4 | 5 | class Buffer: 6 | def __init__(self, size): 7 | self.size = size 8 | self.reset() 9 | 10 | def append(self, item): 11 | self.buffer.append(item) 12 | 13 | def reset(self): 14 | self.buffer = [] 15 | 16 | @property 17 | def is_full(self): 18 | return len(self.buffer) >= self.size 19 | 20 | def __call__(self, items): 21 | for item in items: 22 | self.append(item) 23 | if self.is_full: 24 | for item in self.flush(): 25 | yield item 26 | for item in self.flush(): 27 | yield item 28 | 29 | 30 | class ShuffleBuffer(Buffer): 31 | def flush(self): 32 | shuffle(self.buffer) 33 | for item in self.buffer: 34 | yield item 35 | self.reset() 36 | 37 | 38 | class SortBuffer(Buffer): 39 | def __init__(self, size, key): 40 | self.key = key 41 | Buffer.__init__(self, size) 42 | 43 | def flush(self): 44 | self.buffer.sort(key=self.key) 45 | for item in self.buffer: 46 | yield item 47 | self.reset() 48 | -------------------------------------------------------------------------------- /slovnet/encoders/common.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from slovnet.record import Record 5 | from slovnet.pad import pad_sequence 6 | from slovnet.chop import chop 7 | from slovnet.shape import word_shape 8 | 9 | 10 | class WordShapeInferInput(Record): 11 | __attributes__ = ['word_id', 'shape_id', 'pad_mask'] 12 | 13 | 14 | class WordShapeInferEncoder: 15 | def __init__(self, words_vocab, shapes_vocab, 16 | batch_size=8): 17 | self.words_vocab = words_vocab 18 | self.shapes_vocab = shapes_vocab 19 | 20 | self.batch_size = batch_size 21 | 22 | def item(self, words): 23 | word_ids, shape_ids = [], [] 24 | for word in words: 25 | shape = word_shape(word) 26 | word_id = 
self.words_vocab.encode(word.lower()) 27 | shape_id = self.shapes_vocab.encode(shape) 28 | word_ids.append(word_id) 29 | shape_ids.append(shape_id) 30 | return word_ids, shape_ids 31 | 32 | def input(self, items): 33 | word_id, shape_id = [], [] 34 | for word_ids, shape_ids in items: 35 | word_id.append(torch.tensor(word_ids, dtype=torch.long)) 36 | shape_id.append(torch.tensor(shape_ids, dtype=torch.long)) 37 | word_id = pad_sequence(word_id, self.words_vocab.pad_id) 38 | shape_id = pad_sequence(shape_id, self.shapes_vocab.pad_id) 39 | pad_mask = word_id == self.words_vocab.pad_id 40 | return WordShapeInferInput(word_id, shape_id, pad_mask) 41 | 42 | def __call__(self, items): 43 | items = (self.item(_) for _ in items) 44 | chunks = chop(items, self.batch_size) 45 | for chunk in chunks: 46 | yield self.input(chunk) 47 | -------------------------------------------------------------------------------- /slovnet/encoders/syntax.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from slovnet.record import Record 5 | from slovnet.pad import pad_sequence 6 | from slovnet.chop import chop 7 | from slovnet.shape import word_shape 8 | from slovnet.batch import Batch 9 | 10 | from .buffer import SortBuffer 11 | from .common import WordShapeInferEncoder 12 | 13 | 14 | ROOT_ID = '0' 15 | 16 | 17 | class SyntaxTrainItem(Record): 18 | __attributes__ = ['word_ids', 'shape_ids', 'head_ids', 'rel_ids'] 19 | 20 | 21 | class SyntaxInput(Record): 22 | __attributes__ = ['word_id', 'shape_id', 'pad_mask'] 23 | 24 | 25 | class SyntaxTarget(Record): 26 | __attributes__ = ['head_id', 'rel_id', 'mask'] 27 | 28 | 29 | class SyntaxTrainEncoder: 30 | def __init__(self, words_vocab, shapes_vocab, rels_vocab, 31 | batch_size=8, sort_size=1): 32 | self.words_vocab = words_vocab 33 | self.shapes_vocab = shapes_vocab 34 | self.rels_vocab = rels_vocab 35 | 36 | self.batch_size = batch_size 37 | self.sort = SortBuffer(sort_size, key=lambda _: len(_.tokens)) 38 | 39 | def item(self, markup): 40 | word_ids, shape_ids, head_ids, rel_ids = [], [], [], [] 41 | ids = {ROOT_ID: 0} 42 | 43 | for index, token in enumerate(markup.tokens, 1): 44 | ids[token.id] = index 45 | head_ids.append(token.head_id) 46 | 47 | rel_id = self.rels_vocab.encode(token.rel) 48 | rel_ids.append(rel_id) 49 | 50 | shape = word_shape(token.text) 51 | shape_id = self.shapes_vocab.encode(shape) 52 | shape_ids.append(shape_id) 53 | 54 | word = token.text.lower() 55 | word_id = self.words_vocab.encode(word) 56 | word_ids.append(word_id) 57 | 58 | head_ids = [ids[_] for _ in head_ids] 59 | return SyntaxTrainItem(word_ids, shape_ids, head_ids, rel_ids) 60 | 61 | def batch(self, chunk): 62 | word_id, shape_id, head_id, rel_id = [], [], [], [] 63 | for item in chunk: 64 | word_id.append(torch.tensor(item.word_ids, dtype=torch.long)) 65 | shape_id.append(torch.tensor(item.shape_ids, dtype=torch.long)) 66 | head_id.append(torch.tensor(item.head_ids, dtype=torch.long)) 67 | rel_id.append(torch.tensor(item.rel_ids, dtype=torch.long)) 68 | 69 | word_id = pad_sequence(word_id, fill=self.words_vocab.pad_id) 70 | shape_id = pad_sequence(shape_id, fill=self.shapes_vocab.pad_id) 71 | pad_mask = word_id == self.words_vocab.pad_id 72 | input = SyntaxInput(word_id, shape_id, pad_mask) 73 | 74 | head_id = pad_sequence(head_id) 75 | rel_id = pad_sequence(rel_id, fill=self.rels_vocab.pad_id) 76 | mask = rel_id != self.rels_vocab.pad_id 77 | target = SyntaxTarget(head_id, rel_id, mask) 78 | 79 | return Batch(input, 
target) 80 | 81 | def __call__(self, markups): 82 | markups = self.sort(markups) 83 | items = (self.item(_) for _ in markups) 84 | chunks = chop(items, self.batch_size) 85 | for chunk in chunks: 86 | yield self.batch(chunk) 87 | 88 | 89 | class SyntaxInferEncoder(WordShapeInferEncoder): 90 | pass 91 | -------------------------------------------------------------------------------- /slovnet/encoders/tag.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from slovnet.record import Record 5 | from slovnet.chop import chop, chop_drop 6 | from slovnet.shape import word_shape 7 | from slovnet.batch import Batch 8 | 9 | from .buffer import ShuffleBuffer 10 | from .common import WordShapeInferEncoder 11 | 12 | 13 | class TagTrainInput(Record): 14 | __attributes__ = ['word_id', 'shape_id'] 15 | 16 | 17 | class TagTrainEncoder: 18 | def __init__(self, words_vocab, shapes_vocab, tags_vocab, 19 | seq_len=512, batch_size=8, shuffle_size=1): 20 | self.words_vocab = words_vocab 21 | self.shapes_vocab = shapes_vocab 22 | self.tags_vocab = tags_vocab 23 | 24 | self.seq_len = seq_len 25 | self.batch_size = batch_size 26 | 27 | self.shuffle = ShuffleBuffer(shuffle_size) 28 | 29 | def items(self, markups): 30 | for markup in markups: 31 | for token in markup.tokens: 32 | shape = word_shape(token.text) 33 | word = token.text.lower() 34 | yield ( 35 | self.words_vocab.encode(word), 36 | self.shapes_vocab.encode(shape), 37 | self.tags_vocab.encode(token.tag) 38 | ) 39 | 40 | def batch(self, chunk): 41 | chunk = torch.tensor(chunk, dtype=torch.long) # batch x seq x (word, shp, tag) 42 | word_id, shape_id, tag_id = chunk.unbind(-1) 43 | 44 | input = TagTrainInput(word_id, shape_id) 45 | return Batch(input, tag_id) 46 | 47 | def __call__(self, markups): 48 | items = self.items(markups) 49 | seqs = chop_drop(items, self.seq_len) 50 | seqs = self.shuffle(seqs) 51 | chunks = chop(seqs, self.batch_size) 52 | for chunk in chunks: 53 | yield self.batch(chunk) 54 | 55 | 56 | class TagInferEncoder(WordShapeInferEncoder): 57 | pass 58 | -------------------------------------------------------------------------------- /slovnet/exec/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/slovnet/4b46dc497b586e106258819c23be28c277efac97/slovnet/exec/__init__.py -------------------------------------------------------------------------------- /slovnet/exec/encoders.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | from slovnet.record import Record 5 | from slovnet.chop import chop 6 | from slovnet.shape import word_shape 7 | 8 | from .pad import pad_sequence 9 | 10 | 11 | class WordShapeInput(Record): 12 | __attributes__ = ['word_id', 'shape_id', 'pad_mask'] 13 | 14 | 15 | class WordShapeEncoder(Record): 16 | __attributes__ = ['words_vocab', 'shapes_vocab', 'batch_size'] 17 | 18 | def __init__(self, words_vocab, shapes_vocab, 19 | batch_size=8): 20 | self.words_vocab = words_vocab 21 | self.shapes_vocab = shapes_vocab 22 | 23 | self.batch_size = batch_size 24 | 25 | def item(self, words): 26 | word_ids, shape_ids = [], [] 27 | for word in words: 28 | shape = word_shape(word) 29 | word_id = self.words_vocab.encode(word.lower()) 30 | shape_id = self.shapes_vocab.encode(shape) 31 | word_ids.append(word_id) 32 | shape_ids.append(shape_id) 33 | return word_ids, shape_ids 34 | 35 | def input(self, items): 36 | word_id, 
shape_id = [], [] 37 | for word_ids, shape_ids in items: 38 | word_id.append(np.array(word_ids)) 39 | shape_id.append(np.array(shape_ids)) 40 | word_id = pad_sequence(word_id, self.words_vocab.pad_id) 41 | shape_id = pad_sequence(shape_id, self.shapes_vocab.pad_id) 42 | pad_mask = word_id == self.words_vocab.pad_id 43 | return WordShapeInput(word_id, shape_id, pad_mask) 44 | 45 | def __call__(self, items): 46 | items = (self.item(_) for _ in items) 47 | chunks = chop(items, self.batch_size) 48 | for chunk in chunks: 49 | yield self.input(chunk) 50 | 51 | 52 | class TagEncoder(WordShapeEncoder): 53 | pass 54 | 55 | 56 | class SyntaxEncoder(WordShapeEncoder): 57 | pass 58 | -------------------------------------------------------------------------------- /slovnet/exec/infer.py: -------------------------------------------------------------------------------- 1 | 2 | from slovnet.record import Record 3 | from slovnet.token import tokenize 4 | from slovnet.markup import ( 5 | BIOMarkup, 6 | MorphMarkup, 7 | SyntaxMarkup 8 | ) 9 | 10 | from .mask import split_masked 11 | 12 | 13 | class Infer(Record): 14 | __attributes__ = ['model', 'encoder', 'decoder'] 15 | 16 | 17 | ###### 18 | # 19 | # TAG 20 | # 21 | ##### 22 | 23 | 24 | class TagDecoder(Record): 25 | __attributes__ = ['tags_vocab'] 26 | 27 | def __call__(self, preds): 28 | for pred in preds: 29 | yield [self.tags_vocab.decode(_) for _ in pred] 30 | 31 | 32 | def text_words(text): 33 | return [_.text for _ in tokenize(text)] 34 | 35 | 36 | class NERInfer(Infer): 37 | def process(self, inputs): 38 | for input in inputs: 39 | pred = self.model(input.word_id, input.shape_id, input.pad_mask) 40 | yield from self.model.head.crf.decode(pred, ~input.pad_mask) 41 | 42 | def __call__(self, texts): 43 | items = [text_words(_) for _ in texts] 44 | inputs = self.encoder(items) 45 | preds = self.process(inputs) 46 | preds = self.decoder(preds) 47 | 48 | for text, item, pred in zip(texts, items, preds): 49 | tuples = zip(item, pred) 50 | markup = BIOMarkup.from_tuples(tuples) 51 | yield markup.to_span(text) 52 | 53 | 54 | class MorphInfer(Infer): 55 | def process(self, inputs): 56 | for input in inputs: 57 | pred = self.model(input.word_id, input.shape_id, input.pad_mask) 58 | pred = self.model.head.decode(pred) 59 | yield from split_masked(pred, ~input.pad_mask) 60 | 61 | def __call__(self, items): 62 | inputs = self.encoder(items) 63 | preds = self.process(inputs) 64 | preds = self.decoder(preds) 65 | 66 | for item, pred in zip(items, preds): 67 | tuples = zip(item, pred) 68 | yield MorphMarkup.from_tuples(tuples) 69 | 70 | 71 | ######## 72 | # 73 | # SYNTAX 74 | # 75 | ###### 76 | 77 | 78 | class SyntaxDecoder(Record): 79 | __attributes__ = ['rels_vocab'] 80 | 81 | def __call__(self, preds): 82 | for pred in preds: 83 | head_ids, rel_ids = pred 84 | ids = [str(_ + 1) for _ in range(len(head_ids))] 85 | head_ids = [str(_) for _ in head_ids.tolist()] 86 | rels = [self.rels_vocab.decode(_) for _ in rel_ids] 87 | yield ids, head_ids, rels 88 | 89 | 90 | class SyntaxInfer(Infer): 91 | def process(self, inputs): 92 | for input in inputs: 93 | pred = self.model(input.word_id, input.shape_id, input.pad_mask) 94 | mask = ~input.pad_mask 95 | 96 | head_id = self.model.head.decode(pred.head_id, mask) 97 | head_id = split_masked(head_id, mask) 98 | 99 | rel_id = self.model.rel.decode(pred.rel_id, mask) 100 | rel_id = split_masked(rel_id, mask) 101 | 102 | yield from zip(head_id, rel_id) 103 | 104 | def __call__(self, items): 105 | inputs = self.encoder(items) 106 
| preds = self.process(inputs) 107 | preds = self.decoder(preds) 108 | 109 | for item, pred in zip(items, preds): 110 | ids, head_ids, rels = pred 111 | tuples = zip(ids, item, head_ids, rels) 112 | yield SyntaxMarkup.from_tuples(tuples) 113 | -------------------------------------------------------------------------------- /slovnet/exec/mask.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def split_masked(input, mask): 4 | sizes = mask.sum(-1) 5 | for index, size in enumerate(sizes): 6 | yield input[index, :size] 7 | 8 | 9 | def fill_masked(input, mask, fill=0): 10 | return fill * mask + input * ~mask 11 | -------------------------------------------------------------------------------- /slovnet/exec/pack.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | from gzip import ( 4 | compress, 5 | decompress 6 | ) 7 | 8 | import numpy as np 9 | 10 | from slovnet.record import Record 11 | from slovnet.tar import Tar, DumpTar 12 | from slovnet.vocab import Vocab 13 | 14 | 15 | PROTOCOL = 1 16 | 17 | META = 'meta.json' 18 | MODEL = 'model.json' 19 | 20 | 21 | class Meta(Record): 22 | __attributes__ = ['id', 'protocol'] 23 | 24 | def __init__(self, id, protocol=PROTOCOL): 25 | self.id = id 26 | self.protocol = protocol 27 | 28 | def check_protocol(self): 29 | if self.protocol != PROTOCOL: 30 | raise ValueError('Expected protocol=%r, got %r' % (PROTOCOL, self.protocol)) 31 | 32 | 33 | ####### 34 | # 35 | # ARRAY 36 | # 37 | ####### 38 | 39 | 40 | def array_name(id): 41 | return 'arrays/%d.bin' % id 42 | 43 | 44 | def array_bytes(array): 45 | return array.tobytes() 46 | 47 | 48 | def bytes_array(bytes, shape, dtype): 49 | return np.frombuffer(bytes, dtype).reshape(shape) 50 | 51 | 52 | ###### 53 | # 54 | # VOCAB 55 | # 56 | ####### 57 | 58 | 59 | def vocab_name(id): 60 | return 'vocabs/%s.gz' % id 61 | 62 | 63 | def vocab_bytes(vocab): 64 | content = '\n'.join(vocab.items) 65 | bytes = content.encode('utf8') 66 | return compress(bytes) 67 | 68 | 69 | def bytes_vocab(bytes): 70 | content = decompress(bytes).decode('utf8') 71 | items = content.splitlines() 72 | return Vocab(items) 73 | 74 | 75 | ###### 76 | # 77 | # PACK 78 | # 79 | ######## 80 | 81 | 82 | def json_bytes(data): 83 | content = json.dumps(data, ensure_ascii=False, indent=2) 84 | return content.encode('utf8') 85 | 86 | 87 | def bytes_json(bytes): 88 | return json.loads(bytes.decode('utf8')) 89 | 90 | 91 | class Pack(Tar): 92 | def load_record(self, name, Record): 93 | bytes = self.read(name) 94 | data = bytes_json(bytes) 95 | return Record.from_json(data) 96 | 97 | def load_meta(self): 98 | return self.load_record(META, Meta) 99 | 100 | def load_model(self, Model): 101 | return self.load_record(MODEL, Model) 102 | 103 | def load_arrays(self, weights): 104 | for weight in weights: 105 | if not weight.is_id: 106 | continue 107 | 108 | shape, dtype, id = weight 109 | name = array_name(id) 110 | bytes = self.read(name) 111 | yield id, bytes_array(bytes, shape, dtype) 112 | 113 | def load_vocab(self, id): 114 | name = vocab_name(id) 115 | bytes = self.read(name) 116 | return bytes_vocab(bytes) 117 | 118 | 119 | class DumpPack(DumpTar): 120 | def dump_record(self, record, name): 121 | bytes = json_bytes(record.as_json) 122 | self.write(bytes, name) 123 | 124 | def dump_meta(self, meta): 125 | self.dump_record(meta, META) 126 | 127 | def dump_model(self, model): 128 | self.dump_record(model, MODEL) 129 | 130 | def dump_arrays(self, arrays): 
131 | for id, array in arrays.items(): 132 | name = array_name(id) 133 | bytes = array_bytes(array) 134 | self.write(bytes, name) 135 | 136 | def dump_vocab(self, vocab, id): 137 | name = vocab_name(id) 138 | bytes = vocab_bytes(vocab) 139 | self.write(bytes, name) 140 | -------------------------------------------------------------------------------- /slovnet/exec/pad.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | def pad_sequence(sequences, fill=0): 6 | # assert all sequences are 1d 7 | size = max(_.size for _ in sequences) 8 | array = np.full((len(sequences), size), fill) 9 | for index, sequence in enumerate(sequences): 10 | array[index, :sequence.size] = sequence 11 | return array 12 | -------------------------------------------------------------------------------- /slovnet/infer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/slovnet/4b46dc497b586e106258819c23be28c277efac97/slovnet/infer/__init__.py -------------------------------------------------------------------------------- /slovnet/infer/base.py: -------------------------------------------------------------------------------- 1 | 2 | class Infer: 3 | def __init__(self, model, encoder, decoder): 4 | self.model = model 5 | self.encoder = encoder 6 | self.decoder = decoder 7 | -------------------------------------------------------------------------------- /slovnet/infer/bert.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import groupby 3 | 4 | from slovnet.record import Record 5 | from slovnet.token import tokenize 6 | from slovnet.bert import bert_subs 7 | from slovnet.chop import chop_weighted 8 | from slovnet.mask import pad_masked, split_masked 9 | from slovnet.markup import ( 10 | BIOMarkup, 11 | MorphMarkup, 12 | SyntaxMarkup 13 | ) 14 | 15 | from .base import Infer 16 | from .tag import TagDecoder 17 | from .syntax import SyntaxDecoder 18 | 19 | 20 | ########## 21 | # 22 | # SEGMENT 23 | # 24 | ######### 25 | 26 | 27 | class SubsToken(Record): 28 | __attributes__ = ['text', 'subs'] 29 | 30 | 31 | class BERTInferItem(Record): 32 | __attributes__ = ['id', 'tokens', 'pred'] 33 | 34 | def __init__(self, id, tokens, pred=None): 35 | self.id = id 36 | self.tokens = tokens 37 | self.pred = pred 38 | 39 | @property 40 | def words(self): 41 | return [_.text for _ in self.tokens] 42 | 43 | 44 | def substoken(text, vocab): 45 | subs = bert_subs(text, vocab) 46 | return SubsToken(text, subs) 47 | 48 | 49 | def text_items(texts, vocab): 50 | for id, text in enumerate(texts): 51 | tokens = [ 52 | substoken(_.text, vocab) 53 | for _ in tokenize(text) 54 | ] 55 | yield BERTInferItem(id, tokens) 56 | 57 | 58 | def word_items(items, vocab): 59 | for id, words in enumerate(items): 60 | tokens = [ 61 | substoken(_, vocab) 62 | for _ in words 63 | ] 64 | yield BERTInferItem(id, tokens) 65 | 66 | 67 | def segment_items(items, seq_len): 68 | for item in items: 69 | chunks = chop_weighted( 70 | item.tokens, 71 | seq_len, 72 | weight=lambda _: len(_.subs) 73 | ) 74 | for chunk in chunks: 75 | yield BERTInferItem(item.id, chunk) 76 | 77 | 78 | def flatten(seqs): 79 | return [ 80 | item 81 | for seq in seqs 82 | for item in seq 83 | ] 84 | 85 | 86 | def join_items(items): 87 | for id, group in groupby(items, key=lambda _: _.id): 88 | group = list(group) 89 | tokens = flatten(_.tokens for _ in group) 90 | pred = flatten(_.pred for 
_ in group) 91 | yield BERTInferItem(id, tokens, pred) 92 | 93 | 94 | ####### 95 | # 96 | # DECODE 97 | # 98 | ####### 99 | 100 | 101 | class BERTTagDecoder(TagDecoder): 102 | pass 103 | 104 | 105 | class BERTSyntaxDecoder(SyntaxDecoder): 106 | pass 107 | 108 | 109 | ####### 110 | # 111 | # INFER 112 | # 113 | ###### 114 | 115 | 116 | class BERTNERInfer(Infer): 117 | def process(self, inputs): 118 | for input in inputs: 119 | input = input.to(self.model.device) 120 | pred = self.model(input.word_id, input.pad_mask) 121 | pred = pad_masked(pred, input.word_mask) 122 | mask = pad_masked(input.word_mask, input.word_mask) 123 | yield from self.model.head.crf.decode(pred, mask) 124 | 125 | def __call__(self, chunk): 126 | items = text_items(chunk, self.encoder.words_vocab) 127 | # consider , spec tokens 128 | items = list(segment_items(items, self.encoder.seq_len - 2)) 129 | inputs = self.encoder(items) 130 | preds = self.process(inputs) 131 | preds = self.decoder(preds) 132 | 133 | for item, pred in zip(items, preds): 134 | item.pred = pred 135 | 136 | items = join_items(items) 137 | 138 | for text, item in zip(chunk, items): 139 | tuples = zip(item.words, item.pred) 140 | markup = BIOMarkup.from_tuples(tuples) 141 | yield markup.to_span(text) 142 | 143 | 144 | class BERTMorphInfer(Infer): 145 | def process(self, inputs): 146 | for input in inputs: 147 | input = input.to(self.model.device) 148 | pred = self.model(input.word_id, input.pad_mask) 149 | pred = self.model.head.decode(pred) 150 | yield from split_masked(pred, input.word_mask) 151 | 152 | def __call__(self, chunk): 153 | items = word_items(chunk, self.encoder.words_vocab) 154 | items = list(segment_items(items, self.encoder.seq_len - 2)) 155 | inputs = self.encoder(items) 156 | preds = self.process(inputs) 157 | preds = self.decoder(preds) 158 | 159 | for item, pred in zip(items, preds): 160 | item.pred = pred 161 | 162 | items = join_items(items) 163 | for item in items: 164 | tuples = zip(item.words, item.pred) 165 | yield MorphMarkup.from_tuples(tuples) 166 | 167 | 168 | class BERTSyntaxInfer(Infer): 169 | def process(self, inputs): 170 | for input in inputs: 171 | input = input.to(self.model.device) 172 | pred = self.model(input.word_id, input.word_mask, input.pad_mask) 173 | head_id = self.model.head.decode(pred.head_id) 174 | rel_id = self.model.rel.decode(pred.rel_id) 175 | yield from zip(head_id, rel_id) 176 | 177 | def __call__(self, chunk): 178 | items = list(word_items(chunk, self.encoder.words_vocab)) 179 | inputs = self.encoder(items) 180 | preds = self.process(inputs) 181 | preds = self.decoder(preds) 182 | 183 | for item, pred in zip(items, preds): 184 | item.pred = pred 185 | 186 | items = join_items(items) 187 | for item in items: 188 | ids, head_ids, rels = item.pred 189 | tuples = zip(ids, item.words, head_ids, rels) 190 | yield SyntaxMarkup.from_tuples(tuples) 191 | -------------------------------------------------------------------------------- /slovnet/infer/syntax.py: -------------------------------------------------------------------------------- 1 | 2 | from slovnet.markup import SyntaxMarkup 3 | from slovnet.mask import split_masked 4 | 5 | from .base import Infer 6 | 7 | 8 | class SyntaxDecoder: 9 | def __init__(self, rels_vocab): 10 | self.rels_vocab = rels_vocab 11 | 12 | def __call__(self, preds): 13 | for pred in preds: 14 | head_ids, rel_ids = pred 15 | ids = [str(_ + 1) for _ in range(len(head_ids))] 16 | head_ids = [str(_) for _ in head_ids.tolist()] 17 | rels = [self.rels_vocab.decode(_) for _ in 
rel_ids] 18 | yield ids, head_ids, rels 19 | 20 | 21 | class SyntaxInfer(Infer): 22 | def process(self, inputs): 23 | for input in inputs: 24 | input = input.to(self.model.device) 25 | 26 | pred = self.model(input.word_id, input.shape_id, input.pad_mask) 27 | mask = ~input.pad_mask 28 | 29 | head_id = self.model.head.decode(pred.head_id, mask) 30 | head_id = split_masked(head_id, mask) 31 | 32 | rel_id = self.model.rel.decode(pred.rel_id, mask) 33 | rel_id = split_masked(rel_id, mask) 34 | 35 | yield from zip(head_id, rel_id) 36 | 37 | def __call__(self, items): 38 | inputs = self.encoder(items) 39 | preds = self.process(inputs) 40 | preds = self.decoder(preds) 41 | 42 | for item, pred in zip(items, preds): 43 | ids, head_ids, rels = pred 44 | tuples = zip(ids, item, head_ids, rels) 45 | yield SyntaxMarkup.from_tuples(tuples) 46 | -------------------------------------------------------------------------------- /slovnet/infer/tag.py: -------------------------------------------------------------------------------- 1 | 2 | from slovnet.mask import split_masked 3 | from slovnet.token import tokenize 4 | from slovnet.markup import ( 5 | BIOMarkup, 6 | MorphMarkup 7 | ) 8 | 9 | from .base import Infer 10 | 11 | 12 | class TagDecoder: 13 | def __init__(self, tags_vocab): 14 | self.tags_vocab = tags_vocab 15 | 16 | def __call__(self, preds): 17 | for pred in preds: 18 | yield [self.tags_vocab.decode(_) for _ in pred] 19 | 20 | 21 | def text_words(text): 22 | return [_.text for _ in tokenize(text)] 23 | 24 | 25 | class NERInfer(Infer): 26 | def process(self, inputs): 27 | for input in inputs: 28 | input = input.to(self.model.device) 29 | pred = self.model(input.word_id, input.shape_id, input.pad_mask) 30 | yield from self.model.ner.crf.decode(pred, ~input.pad_mask) 31 | 32 | def __call__(self, texts): 33 | items = [text_words(_) for _ in texts] 34 | inputs = self.encoder(items) 35 | preds = self.process(inputs) 36 | preds = self.decoder(preds) 37 | 38 | for text, item, pred in zip(texts, items, preds): 39 | tuples = zip(item, pred) 40 | markup = BIOMarkup.from_tuples(tuples) 41 | yield markup.to_span(text) 42 | 43 | 44 | class MorphInfer(Infer): 45 | def process(self, inputs): 46 | for input in inputs: 47 | input = input.to(self.model.device) 48 | pred = self.model(input.word_id, input.shape_id, input.pad_mask) 49 | pred = self.model.morph.decode(pred) 50 | yield from split_masked(pred, ~input.pad_mask) 51 | 52 | def __call__(self, items): 53 | inputs = self.encoder(items) 54 | preds = self.process(inputs) 55 | preds = self.decoder(preds) 56 | 57 | for item, pred in zip(items, preds): 58 | tuples = zip(item, pred) 59 | yield MorphMarkup.from_tuples(tuples) 60 | -------------------------------------------------------------------------------- /slovnet/io.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import gzip 4 | 5 | 6 | def load_lines(path, encoding='utf8'): 7 | with open(path, encoding=encoding) as file: 8 | for line in file: 9 | yield line.rstrip('\n') 10 | 11 | 12 | def dump_lines(lines, path, encoding='utf8'): 13 | with open(path, 'w', encoding=encoding) as file: 14 | for line in lines: 15 | file.write(line + '\n') 16 | 17 | 18 | def load_gz_lines(path, encoding='utf8'): 19 | with gzip.open(path) as file: 20 | for line in file: 21 | yield line.decode(encoding).rstrip() 22 | 23 | 24 | def dump_gz_lines(lines, path): 25 | with gzip.open(path, 'wt') as file: 26 | for line in lines: 27 | file.write(line + '\n') 28 | 29 | 30 | def 
load_json(path, encoding='utf8'): 31 | with open(path, encoding=encoding) as file: 32 | return json.load(file) 33 | 34 | 35 | def format_jl(items): 36 | for item in items: 37 | yield json.dumps(item, ensure_ascii=False) 38 | 39 | 40 | def parse_jl(lines): 41 | for line in lines: 42 | yield json.loads(line) 43 | -------------------------------------------------------------------------------- /slovnet/log.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | from datetime import datetime 4 | 5 | 6 | def log(format, *args): 7 | message = format % args 8 | now = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 9 | print( 10 | '[%s] %s' % (now, message), 11 | file=sys.stderr 12 | ) 13 | -------------------------------------------------------------------------------- /slovnet/loss.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.nn import functional as F 3 | 4 | from .mask import fill_masked 5 | 6 | 7 | def flatten_cross_entropy(pred, target, ignore_id=None): 8 | target = target.flatten() 9 | pred = pred.view(len(target), -1) 10 | return F.cross_entropy(pred, target, ignore_index=ignore_id) 11 | 12 | 13 | def masked_flatten_cross_entropy(pred, target, mask, ignore_id=-100): 14 | target = fill_masked(target, ~mask, ignore_id) 15 | return flatten_cross_entropy(pred, target, ignore_id) 16 | -------------------------------------------------------------------------------- /slovnet/markup.py: -------------------------------------------------------------------------------- 1 | 2 | try: 3 | from ipymarkup import ( 4 | show_span_box_markup, 5 | show_dep_markup 6 | ) 7 | except ImportError: 8 | pass 9 | 10 | from .record import Record 11 | from .bio import ( 12 | spans_bio, 13 | bio_spans 14 | ) 15 | from .token import find_tokens 16 | from .sent import sentenize 17 | from .span import ( 18 | Span, 19 | envelop_spans, 20 | offset_spans 21 | ) 22 | from .conll import ( 23 | format_conll_tag, 24 | parse_conll_tag 25 | ) 26 | 27 | 28 | ######## 29 | # 30 | # SPAN 31 | # 32 | ####### 33 | 34 | 35 | class SpanMarkup(Record): 36 | __attributes__ = ['text', 'spans'] 37 | __annotations__ = { 38 | 'spans': [Span] 39 | } 40 | 41 | @property 42 | def sents(self): 43 | for sent in sentenize(self.text): 44 | spans = envelop_spans(sent, self.spans) 45 | spans = offset_spans(spans, -sent.start) 46 | yield SpanMarkup(sent.text, list(spans)) 47 | 48 | def to_bio(self, tokens): 49 | tags = spans_bio(tokens, self.spans) 50 | words = [_.text for _ in tokens] 51 | return BIOMarkup.from_tuples(zip(words, tags)) 52 | 53 | 54 | def show_span_markup(markup): 55 | show_span_box_markup(markup.text, markup.spans) 56 | 57 | 58 | ######## 59 | # 60 | # TAG 61 | # 62 | ###### 63 | 64 | 65 | class TagToken(Record): 66 | __attributes__ = ['text', 'tag'] 67 | 68 | 69 | class TagMarkup(Record): 70 | __attributes__ = ['tokens'] 71 | __annotations__ = { 72 | 'tokens': [TagToken] 73 | } 74 | 75 | @property 76 | def words(self): 77 | return [_.text for _ in self.tokens] 78 | 79 | @property 80 | def tags(self): 81 | return [_.tag for _ in self.tokens] 82 | 83 | @classmethod 84 | def from_tuples(cls, tuples): 85 | return cls([ 86 | TagToken(word, tag) 87 | for word, tag in tuples 88 | ]) 89 | 90 | 91 | class BIOMarkup(TagMarkup): 92 | def to_span(self, text): 93 | tokens = find_tokens(text, self.words) 94 | spans = list(bio_spans(tokens, self.tags)) 95 | return SpanMarkup(text, spans) 96 | 97 | 98 | ######## 99 | # 100 | # MORPH 101 | # 102 | 
######## 103 | 104 | 105 | class MorphToken(TagToken): 106 | __attributes__ = ['text', 'pos', 'feats'] 107 | 108 | @property 109 | def tag(self): 110 | return format_conll_tag(self.pos, self.feats) 111 | 112 | 113 | class MorphMarkup(TagMarkup): 114 | __attributes__ = ['tokens'] 115 | __annotations__ = { 116 | 'tokens': [MorphToken] 117 | } 118 | 119 | @classmethod 120 | def from_tuples(cls, tuples): 121 | tokens = [] 122 | for word, tag in tuples: 123 | pos, feats = parse_conll_tag(tag) 124 | tokens.append(MorphToken(word, pos, feats)) 125 | return cls(tokens) 126 | 127 | 128 | def format_morph_markup(markup, size=20): 129 | for word, tag in zip(markup.words, markup.tags): 130 | word = word.rjust(size) 131 | yield '%s %s' % (word, tag) 132 | 133 | 134 | def show_morph_markup(markup): 135 | for line in format_morph_markup(markup): 136 | print(line) 137 | 138 | 139 | def format_morph_markup_diff(a, b, size=20): 140 | for word, a_token, b_token in zip(a.words, a.tokens, b.tokens): 141 | word = word.rjust(size) 142 | a_tag = format_conll_tag(a_token.pos, a_token.feats) 143 | yield '%s %s' % (word, a_tag) 144 | if a_token != b_token: 145 | word = ' ' * size 146 | b_tag = format_conll_tag(b_token.pos, b_token.feats) 147 | yield '%s ! %s' % (word, b_tag) 148 | 149 | 150 | def show_morph_markup_diff(a, b): 151 | for line in format_morph_markup_diff(a, b): 152 | print(line) 153 | 154 | 155 | ####### 156 | # 157 | # SYNTAX 158 | # 159 | ####### 160 | 161 | 162 | class SyntaxToken(TagToken): 163 | __attributes__ = ['id', 'text', 'head_id', 'rel'] 164 | 165 | 166 | class SyntaxMarkup(TagMarkup): 167 | __attributes__ = ['tokens'] 168 | __annotations__ = { 169 | 'tokens': [SyntaxToken] 170 | } 171 | 172 | @classmethod 173 | def from_tuples(cls, tuples): 174 | return cls([ 175 | SyntaxToken(id, text, head_id, rel) 176 | for id, text, head_id, rel in tuples 177 | ]) 178 | 179 | 180 | def syntax_markup_deps(tokens): 181 | for token in tokens: 182 | id = int(token.id) 183 | head_id = int(token.head_id) 184 | # skip root=0, skip loop 185 | # ipymarkup crashes 186 | if head_id == 0 or head_id == id: 187 | continue 188 | 189 | rel = token.rel 190 | id = id - 1 191 | head_id = head_id - 1 192 | yield head_id, id, rel 193 | 194 | 195 | def show_syntax_markup(markup): 196 | deps = syntax_markup_deps(markup.tokens) 197 | show_dep_markup(markup.words, deps) 198 | -------------------------------------------------------------------------------- /slovnet/mask.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from .record import Record 5 | from .pad import pad_sequence 6 | 7 | 8 | class Masked(Record): 9 | __attributes__ = ['value', 'mask'] 10 | 11 | 12 | def mask_like(input): 13 | return torch.ones_like(input, dtype=torch.bool) 14 | 15 | 16 | def split_masked(input, mask): 17 | sizes = mask.sum(dim=-1).tolist() 18 | return input[mask].split(sizes) 19 | 20 | 21 | def pad_masked(input, mask, fill=0): 22 | seqs = split_masked(input, mask) 23 | return pad_sequence(seqs, fill) 24 | 25 | 26 | def fill_masked(input, mask, fill=0): 27 | return fill * mask + input * ~mask 28 | -------------------------------------------------------------------------------- /slovnet/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/slovnet/4b46dc497b586e106258819c23be28c277efac97/slovnet/model/__init__.py -------------------------------------------------------------------------------- 
/slovnet/model/base.py: -------------------------------------------------------------------------------- 1 | 2 | from torch import nn 3 | 4 | from .state import StateMixin 5 | from .exec import ExecMixin 6 | 7 | 8 | class Module(nn.Module, StateMixin, ExecMixin): 9 | @property 10 | def device(self): 11 | for parameter in self.parameters(): 12 | return parameter.device 13 | -------------------------------------------------------------------------------- /slovnet/model/bert.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from slovnet.record import Record 7 | from slovnet.mask import pad_masked 8 | 9 | from .base import Module 10 | from .tag import ( 11 | NERHead, 12 | MorphHead 13 | ) 14 | from .syntax import ( 15 | SyntaxHead, 16 | SyntaxRel, 17 | SyntaxPred 18 | ) 19 | 20 | 21 | class BERTConfig(Record): 22 | __attributes__ = [ 23 | 'vocab_size', 'seq_len', 'emb_dim', 24 | 'layers_num', 'heads_num', 'hidden_dim', 25 | 'dropout', 'norm_eps' 26 | ] 27 | 28 | 29 | class RuBERTConfig(BERTConfig): 30 | def __init__(self, 31 | vocab_size=50106, 32 | seq_len=512, 33 | emb_dim=768, 34 | layers_num=12, 35 | heads_num=12, 36 | hidden_dim=3072, 37 | dropout=0.1, 38 | norm_eps=1e-12): 39 | super(RuBERTConfig, self).__init__( 40 | vocab_size, seq_len, emb_dim, 41 | layers_num, heads_num, hidden_dim, 42 | dropout, norm_eps 43 | ) 44 | 45 | 46 | class BERTEmbedding(Module): 47 | def __init__(self, vocab_size, seq_len, emb_dim, dropout=0.1, norm_eps=1e-12): 48 | super(BERTEmbedding, self).__init__() 49 | self.word = nn.Embedding(vocab_size, emb_dim) 50 | self.position = nn.Embedding(seq_len, emb_dim) 51 | self.norm = nn.LayerNorm(emb_dim, eps=norm_eps) 52 | self.drop = nn.Dropout(dropout) 53 | 54 | @classmethod 55 | def from_config(cls, config): 56 | return cls( 57 | config.vocab_size, config.seq_len, config.emb_dim, 58 | config.dropout, config.norm_eps 59 | ) 60 | 61 | def forward(self, input): 62 | batch_size, seq_len = input.shape 63 | position = torch.arange(seq_len).expand_as(input).to(input.device) 64 | 65 | emb = self.word(input) + self.position(position) 66 | emb = self.norm(emb) 67 | return self.drop(emb) 68 | 69 | 70 | def BERTLayer(emb_dim, heads_num, hidden_dim, dropout=0.1, norm_eps=1e-12): 71 | layer = nn.TransformerEncoderLayer( 72 | d_model=emb_dim, 73 | nhead=heads_num, 74 | dim_feedforward=hidden_dim, 75 | dropout=dropout, 76 | activation='gelu' 77 | ) 78 | layer.norm1.eps = norm_eps 79 | layer.norm2.eps = norm_eps 80 | return layer 81 | 82 | 83 | class BERTEncoder(Module): 84 | def __init__(self, layers_num, emb_dim, heads_num, hidden_dim, 85 | dropout=0.1, norm_eps=1e-12): 86 | super(BERTEncoder, self).__init__() 87 | self.layers = nn.ModuleList([ 88 | BERTLayer( 89 | emb_dim, heads_num, hidden_dim, 90 | dropout, norm_eps 91 | ) 92 | for _ in range(layers_num) 93 | ]) 94 | 95 | @classmethod 96 | def from_config(cls, config): 97 | return cls( 98 | config.layers_num, config.emb_dim, config.heads_num, config.hidden_dim, 99 | config.dropout, config.norm_eps 100 | ) 101 | 102 | def forward(self, input, pad_mask=None): 103 | input = input.transpose(0, 1) # torch expects seq x batch x emb 104 | for layer in self.layers: 105 | input = layer(input, src_key_padding_mask=pad_mask) 106 | return input.transpose(0, 1) # restore 107 | 108 | 109 | ######### 110 | # 111 | # MLM 112 | # 113 | ######### 114 | 115 | 116 | class BERTMLMHead(Module): 117 | def __init__(self, 
emb_dim, vocab_size, norm_eps=1e-12): 118 | super(BERTMLMHead, self).__init__() 119 | self.linear1 = nn.Linear(emb_dim, emb_dim) 120 | self.norm = nn.LayerNorm(emb_dim, eps=norm_eps) 121 | self.linear2 = nn.Linear(emb_dim, vocab_size) 122 | 123 | def forward(self, input): 124 | x = self.linear1(input) 125 | x = F.gelu(x) 126 | x = self.norm(x) 127 | return self.linear2(x) 128 | 129 | 130 | class BERTMLM(Module): 131 | def __init__(self, emb, encoder, head): 132 | super(BERTMLM, self).__init__() 133 | self.emb = emb 134 | self.encoder = encoder 135 | self.head = head 136 | 137 | def forward(self, input): 138 | x = self.emb(input) 139 | x = self.encoder(x) 140 | return self.head(x) 141 | 142 | 143 | ######### 144 | # 145 | # TAG 146 | # 147 | ###### 148 | 149 | 150 | class BERTNERHead(NERHead): 151 | pass 152 | 153 | 154 | class BERTMorphHead(MorphHead): 155 | pass 156 | 157 | 158 | class BERTTag(Module): 159 | def __init__(self, emb, encoder, head): 160 | super(BERTTag, self).__init__() 161 | self.emb = emb 162 | self.encoder = encoder 163 | self.head = head 164 | 165 | def forward(self, input, pad_mask=None): 166 | x = self.emb(input) 167 | x = self.encoder(x, pad_mask) 168 | return self.head(x) 169 | 170 | 171 | class BERTNER(BERTTag): 172 | pass 173 | 174 | 175 | class BERTMorph(BERTTag): 176 | pass 177 | 178 | 179 | ####### 180 | # 181 | # SYNTAX 182 | # 183 | ####### 184 | 185 | 186 | class BERTSyntaxHead(SyntaxHead): 187 | pass 188 | 189 | 190 | class BERTSyntaxRel(SyntaxRel): 191 | pass 192 | 193 | 194 | class BERTSyntax(Module): 195 | def __init__(self, emb, encoder, head, rel): 196 | super(BERTSyntax, self).__init__() 197 | self.emb = emb 198 | self.encoder = encoder 199 | self.head = head 200 | self.rel = rel 201 | 202 | def forward(self, input, word_mask, pad_mask, 203 | target_mask, target_head_id=None): 204 | x = self.emb(input) 205 | x = self.encoder(x, pad_mask) 206 | x = pad_masked(x, word_mask) 207 | 208 | head_id = self.head(x) 209 | if target_head_id is None: 210 | target_head_id = self.head.decode(head_id, target_mask) 211 | 212 | return SyntaxPred( 213 | head_id=head_id, 214 | rel_id=self.rel(x, target_head_id) 215 | ) 216 | -------------------------------------------------------------------------------- /slovnet/model/cnn.py: -------------------------------------------------------------------------------- 1 | 2 | from torch import nn 3 | 4 | from .base import Module 5 | 6 | 7 | class CNNEncoderLayer(Module): 8 | def __init__(self, in_dim, out_dim, kernel_size): 9 | super(CNNEncoderLayer, self).__init__() 10 | 11 | padding = (kernel_size - 1) // 2 12 | self.conv = nn.Conv1d( 13 | in_dim, out_dim, kernel_size, 14 | padding=padding 15 | ) 16 | self.relu = nn.ReLU() 17 | self.norm = nn.BatchNorm1d(out_dim) 18 | 19 | def __call__(self, input): 20 | x = self.conv(input) 21 | x = self.relu(x) 22 | return self.norm(x) 23 | 24 | 25 | def gen_cnn_encoder_layers(input_dim, layer_dims, kernel_size): 26 | dims = [input_dim] + layer_dims 27 | for index in range(1, len(dims)): 28 | in_dim = dims[index - 1] 29 | out_dim = dims[index] 30 | yield CNNEncoderLayer(in_dim, out_dim, kernel_size) 31 | 32 | 33 | class CNNEncoder(Module): 34 | def __init__(self, input_dim, layer_dims, kernel_size): 35 | super(CNNEncoder, self).__init__() 36 | 37 | layers = gen_cnn_encoder_layers(input_dim, layer_dims, kernel_size) 38 | self.layers = nn.ModuleList(layers) 39 | self.dim = layer_dims[-1] 40 | 41 | def forward(self, input, mask=None): # batch x seq x emb 42 | input = input.transpose(2, 1) # batch 
x emb x seq 43 | 44 | if mask is not None: 45 | mask = mask.unsqueeze(1) # batch x 1 x seq 46 | 47 | for layer in self.layers: 48 | input = layer(input) # batch x dim x seq 49 | 50 | if mask is not None: 51 | input[mask.expand_as(input)] = 0 52 | 53 | return input.transpose(2, 1) # batch x seq x dim 54 | -------------------------------------------------------------------------------- /slovnet/model/crf.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | 5 | from slovnet.mask import mask_like 6 | 7 | from .base import Module 8 | 9 | 10 | class CRF(Module): 11 | # https://github.com/kmkurn/pytorch-crf/blob/master/torchcrf/__init__.py 12 | 13 | def __init__(self, tags_num): 14 | super(CRF, self).__init__() 15 | self.tags_num = tags_num 16 | self.transitions = nn.Parameter(torch.empty(tags_num, tags_num)) 17 | self.reset_parameters() 18 | 19 | def reset_parameters(self): 20 | nn.init.uniform_(self.transitions, -0.1, 0.1) 21 | 22 | def extra_repr(self): 23 | return 'tags_num=%d' % self.tags_num 24 | 25 | def forward(self, emissions, tags, mask=None): 26 | if mask is None: 27 | mask = mask_like(tags) 28 | 29 | emissions = emissions.transpose(1, 0) # seq x batch x tags 30 | tags = tags.transpose(1, 0) # seq x batch 31 | mask = mask.transpose(1, 0) 32 | 33 | log_likelihood = ( 34 | self.score(emissions, tags, mask) 35 | - self.normalization(emissions, mask) 36 | ) # batch 37 | return -torch.mean(log_likelihood) # 1 38 | 39 | def score(self, emissions, tags, mask): 40 | seq_len, batch_size = tags.shape 41 | batch_range = torch.arange(batch_size) 42 | score = emissions[0, batch_range, tags[0]] # batch 43 | for index in range(1, seq_len): 44 | score += ( 45 | self.transitions[tags[index - 1], tags[index]] 46 | + emissions[index, batch_range, tags[index]] 47 | ) * mask[index] 48 | return score 49 | 50 | def normalization(self, emissions, mask): 51 | seq_len, batch_size, tags_num = emissions.shape 52 | score = emissions[0] 53 | for index in range(1, seq_len): 54 | score_ = score.view(batch_size, tags_num, 1) 55 | emissions_ = emissions[index].view(batch_size, 1, tags_num) 56 | score_ = score_ + self.transitions + emissions_ # batch x tags x tags 57 | score_ = torch.logsumexp(score_, dim=-2) # batch x tags 58 | mask_ = mask[index].view(batch_size, 1) 59 | score = torch.where(mask_, score_, score) 60 | return torch.logsumexp(score, dim=-1) # batch 61 | 62 | def decode(self, emissions, mask=None): 63 | batch_size, seq_len, tags_num = emissions.shape 64 | if mask is None: 65 | mask = torch.ones( 66 | (batch_size, seq_len), 67 | dtype=torch.bool, 68 | device=emissions.device 69 | ) 70 | 71 | emissions = emissions.transpose(1, 0) 72 | mask = mask.transpose(1, 0) 73 | 74 | history = [] 75 | score = emissions[0] 76 | for index in range(1, seq_len): 77 | score_ = score.view(batch_size, tags_num, 1) 78 | emissions_ = emissions[index].view(batch_size, 1, tags_num) 79 | score_ = score_ + self.transitions + emissions_ # batch x tags x tags 80 | score_, indexes = torch.max(score_, dim=-2) # batch x tags 81 | mask_ = mask[index].view(batch_size, 1) 82 | score = torch.where(mask_, score_, score) 83 | history.append(indexes) 84 | 85 | sizes = mask.sum(0) - 1 86 | batch = [] 87 | for index in range(batch_size): 88 | best = score[index].argmax() 89 | tags = [best] 90 | size = sizes[index] 91 | for indexes in reversed(history[:size]): 92 | best = indexes[index][best] 93 | tags.append(best) 94 | tags.reverse() 95 | 
batch.append(torch.tensor(tags)) 96 | return batch 97 | -------------------------------------------------------------------------------- /slovnet/model/emb.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | 5 | from .base import Module 6 | 7 | 8 | class Embedding(nn.Embedding, Module): 9 | def __init__(self, vocab_size, dim, pad_id): 10 | super(Embedding, self).__init__(vocab_size, dim, pad_id) 11 | self.dim = dim 12 | 13 | 14 | class PQEmbedding(Module): 15 | def __init__(self, indexes, codes): 16 | super(PQEmbedding, self).__init__() 17 | 18 | qdim, centroids, subdim = codes.shape 19 | self.subdim = subdim 20 | self.dim = qdim * subdim 21 | 22 | codes = codes.transpose(1, 0) # for gather, centroids x qdim x subdim 23 | self.codes = nn.Parameter(codes, requires_grad=False) 24 | self.indexes = nn.Parameter(indexes, requires_grad=False) 25 | 26 | def extra_repr(self): 27 | return 'indexes=[...], codes=[...]' 28 | 29 | def forward(self, input): 30 | shape = input.shape 31 | input = input.flatten() # reshape in return 32 | 33 | indexes = self.indexes[input] 34 | vectors, qdim = indexes.shape 35 | 36 | indexes = indexes.view(vectors, qdim, 1) 37 | indexes = indexes.expand(vectors, qdim, self.subdim) 38 | output = self.codes.gather(0, indexes.long()) # vectors x qdim x subdim 39 | return output.view(*shape, self.dim) 40 | 41 | 42 | class NavecEmbedding(PQEmbedding): 43 | def __init__(self, navec): 44 | self.id = navec.meta.id 45 | super(NavecEmbedding, self).__init__( 46 | torch.from_numpy(navec.pq.indexes), 47 | torch.from_numpy(navec.pq.codes) 48 | ) 49 | 50 | def extra_repr(self): 51 | return 'id=%r, indexes=[...], codes=[...]' % self.id 52 | 53 | 54 | class WordShapeEmbedding(Module): 55 | def __init__(self, word, shape): 56 | super(WordShapeEmbedding, self).__init__() 57 | self.word = word 58 | self.shape = shape 59 | self.dim = word.dim + shape.dim 60 | 61 | def forward(self, word_id, shape_id): 62 | word = self.word(word_id) 63 | shape = self.shape(shape_id) 64 | return torch.cat([word, shape], dim=-1) 65 | -------------------------------------------------------------------------------- /slovnet/model/exec.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from slovnet.visitor import Visitor 5 | from slovnet.exec import model as exec 6 | 7 | 8 | class ExecVisitor(Visitor): 9 | def visit_Parameter(self, item): 10 | return self.visit(item.data) 11 | 12 | def visit_Tensor(self, item): 13 | array = item.detach().numpy() 14 | return exec.Weight( 15 | array.shape, 16 | array.dtype.name, 17 | array 18 | ) 19 | 20 | def visit_Linear(self, item): 21 | # in torch linear is xA^T + b 22 | weight = item.weight.transpose(1, 0) 23 | return exec.Linear( 24 | self.visit(weight), 25 | self.visit(item.bias) 26 | ) 27 | 28 | def visit_Conv1d(self, item): 29 | padding, = item.padding # tuple -> int 30 | return exec.Conv1d( 31 | self.visit(item.weight), 32 | self.visit(item.bias), 33 | padding 34 | ) 35 | 36 | def visit_ReLU(self, item): 37 | return exec.ReLU() 38 | 39 | def visit_BatchNorm1d(self, item): 40 | running_std = torch.sqrt(item.running_var + item.eps) 41 | return exec.BatchNorm1d( 42 | self.visit(item.weight), 43 | self.visit(item.bias), 44 | self.visit(item.running_mean), 45 | self.visit(running_std), 46 | ) 47 | 48 | def visit_Embedding(self, item): 49 | return exec.Embedding( 50 | self.visit(item.weight) 51 | ) 52 | 53 | def 
visit_NavecEmbedding(self, item): 54 | # recover initial qdim x centroids x chunk 55 | codes = item.codes.transpose(1, 0) 56 | return exec.NavecEmbedding( 57 | item.id, 58 | self.visit(item.indexes), 59 | self.visit(codes) 60 | ) 61 | 62 | def visit_WordShapeEmbedding(self, item): 63 | return exec.WordShapeEmbedding( 64 | self.visit(item.word), 65 | self.visit(item.shape) 66 | ) 67 | 68 | def visit_CNNEncoderLayer(self, item): 69 | return exec.CNNEncoderLayer( 70 | self.visit(item.conv), 71 | self.visit(item.relu), 72 | self.visit(item.norm) 73 | ) 74 | 75 | def visit_CNNEncoder(self, item): 76 | return exec.CNNEncoder([ 77 | self.visit(_) 78 | for _ in item.layers 79 | ]) 80 | 81 | def visit_NERHead(self, item): 82 | return exec.NERHead( 83 | self.visit(item.proj), 84 | self.visit(item.crf) 85 | ) 86 | 87 | def visit_MorphHead(self, item): 88 | return exec.MorphHead( 89 | self.visit(item.proj) 90 | ) 91 | 92 | def visit_Tag(self, item): 93 | from slovnet.model.tag import NERHead, MorphHead 94 | 95 | cls = type(item.head) 96 | if cls is NERHead: 97 | Tag = exec.NER 98 | elif cls is MorphHead: 99 | Tag = exec.Morph 100 | 101 | return Tag( 102 | self.visit(item.emb), 103 | self.visit(item.encoder), 104 | self.visit(item.head) 105 | ) 106 | 107 | def visit_FF(self, item): 108 | return exec.FF( 109 | self.visit(item.proj), 110 | self.visit(item.relu) 111 | ) 112 | 113 | def visit_SyntaxHead(self, item): 114 | return exec.SyntaxHead( 115 | self.visit(item.head), 116 | self.visit(item.tail), 117 | self.visit(item.root), 118 | self.visit(item.kernel) 119 | ) 120 | 121 | def visit_SyntaxRel(self, item): 122 | return exec.SyntaxRel( 123 | self.visit(item.head), 124 | self.visit(item.tail), 125 | self.visit(item.root), 126 | self.visit(item.kernel) 127 | ) 128 | 129 | def visit_Syntax(self, item): 130 | return exec.Syntax( 131 | self.visit(item.emb), 132 | self.visit(item.encoder), 133 | self.visit(item.head), 134 | self.visit(item.rel) 135 | ) 136 | 137 | def visit_CRF(self, item): 138 | return exec.CRF( 139 | self.visit(item.transitions) 140 | ) 141 | 142 | 143 | class ExecMixin: 144 | # super stange error if as_exec property 145 | # torch Module does some magic 146 | def to_exec(self): 147 | visitor = ExecVisitor() 148 | return visitor(self) 149 | -------------------------------------------------------------------------------- /slovnet/model/state.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from slovnet.const import CPU 5 | 6 | 7 | def load_model(model, path): 8 | model.load_state_dict(torch.load(path, map_location=CPU)) 9 | 10 | 11 | def dump_model(model, path): 12 | torch.save(model.state_dict(), path) 13 | 14 | 15 | class StateMixin: 16 | def load(self, path): 17 | load_model(self, path) 18 | return self 19 | 20 | def dump(self, path): 21 | dump_model(self, path) 22 | -------------------------------------------------------------------------------- /slovnet/model/syntax.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | from torch import nn 4 | from torch.nn import functional as F 5 | 6 | from slovnet.record import Record 7 | from slovnet.mask import fill_masked 8 | 9 | from .base import Module 10 | from .cnn import CNNEncoder 11 | from .emb import WordShapeEmbedding 12 | 13 | 14 | class FF(Module): 15 | def __init__(self, input_dim, hidden_dim, dropout): 16 | super(FF, self).__init__() 17 | self.proj = nn.Linear(input_dim, hidden_dim) 18 | self.relu = nn.ReLU() 19 | 
self.drop = nn.Dropout(dropout) 20 | 21 | def forward(self, input): 22 | x = self.proj(input) 23 | x = self.relu(x) 24 | return self.drop(x) 25 | 26 | 27 | def append_root(input, root): 28 | batch_size, seq_len, emb_dim = input.shape 29 | root = root.repeat(batch_size).view(batch_size, 1, emb_dim) 30 | return torch.cat((root, input), dim=1) 31 | 32 | 33 | def strip_root(input): 34 | input = input[:, 1:, :] 35 | return input.contiguous() 36 | 37 | 38 | def append_root_mask(mask): 39 | return F.pad(mask, (1, 0), 'constant', True) 40 | 41 | 42 | def matmul_mask(mask): 43 | # 1 1 1 0 0 44 | # -> 45 | # 1 1 1 0 0 46 | # 1 1 1 0 0 47 | # 1 1 1 0 0 48 | # 0 0 0 0 0 49 | # 0 0 0 0 0 50 | 51 | mask = mask.float() # matmul not supported for bool 52 | mask = mask.unsqueeze(-2) # batch x 1 x seq 53 | mask = mask.transpose(-2, -1).matmul(mask) 54 | return mask.bool() 55 | 56 | 57 | class SyntaxHead(Module): 58 | def __init__(self, input_dim, hidden_dim, dropout=0.1): 59 | super(SyntaxHead, self).__init__() 60 | self.head = FF(input_dim, hidden_dim, dropout) 61 | self.tail = FF(input_dim, hidden_dim, dropout) 62 | 63 | self.root = nn.Parameter(torch.empty(input_dim)) 64 | self.kernel = nn.Parameter(torch.empty(hidden_dim, hidden_dim)) 65 | 66 | self.reset_parameters() 67 | 68 | def reset_parameters(self): 69 | nn.init.uniform_(self.root) 70 | nn.init.eye_(self.kernel) 71 | 72 | def decode(self, pred, mask): 73 | # multiple roots 74 | # loops, nonprojective 75 | # ~10% sents 76 | 77 | mask = append_root_mask(mask) 78 | mask = matmul_mask(mask) 79 | mask = strip_root(mask) 80 | 81 | pred = fill_masked(pred, ~mask, pred.min()) 82 | return pred.argmax(-1) 83 | 84 | def forward(self, input): 85 | input = append_root(input, self.root) 86 | head = self.head(input) 87 | tail = self.tail(input) 88 | 89 | x = head.matmul(self.kernel) 90 | x = x.matmul(tail.transpose(-2, -1)) 91 | return strip_root(x) 92 | 93 | 94 | def gather_head(input, root, index): 95 | batch_size, seq_len, emb_dim = input.shape 96 | input = append_root(input, root) # batch x seq + 1 x emb 97 | 98 | # for root select root 99 | zero = torch.zeros(batch_size, 1, dtype=torch.long, device=input.device) 100 | index = torch.cat((zero, index), dim=-1) # batch x seq + 1 x emb 101 | 102 | # prep for gather 103 | index = index.view(batch_size, seq_len + 1, 1) 104 | index = index.expand(batch_size, seq_len + 1, emb_dim) 105 | 106 | input = torch.gather(input, dim=-2, index=index) 107 | return strip_root(input) # batch x seq x emb 108 | 109 | 110 | class SyntaxRel(Module): 111 | def __init__(self, input_dim, hidden_dim, rel_dim, dropout=0.1): 112 | super(SyntaxRel, self).__init__() 113 | self.input_dim = input_dim 114 | self.hidden_dim = hidden_dim 115 | self.rel_dim = rel_dim 116 | 117 | self.head = FF(input_dim, hidden_dim, dropout) 118 | self.tail = FF(input_dim, hidden_dim, dropout) 119 | 120 | self.root = nn.Parameter(torch.empty(input_dim)) 121 | self.kernel = nn.Parameter(torch.empty(hidden_dim, hidden_dim * rel_dim)) 122 | 123 | self.reset_parameters() 124 | 125 | def reset_parameters(self): 126 | nn.init.uniform_(self.root) 127 | nn.init.xavier_uniform_(self.kernel) 128 | 129 | def decode(self, pred, mask): 130 | mask = mask.unsqueeze(-1) # batch x seq x 1 131 | mask = mask.expand_as(pred) 132 | 133 | pred = fill_masked(pred, ~mask, pred.min()) 134 | return pred.argmax(-1) 135 | 136 | def forward(self, input, head_id): 137 | head = self.head(gather_head(input, self.root, head_id)) 138 | tail = self.tail(input) 139 | 140 | batch_size, seq_len, _ = 
input.shape 141 | x = head.matmul(self.kernel) # batch x seq x hidden * rel 142 | x = x.view(batch_size, seq_len, self.rel_dim, self.hidden_dim) 143 | x = x.matmul(tail.view(batch_size, seq_len, self.hidden_dim, 1)) 144 | return x.view(batch_size, seq_len, self.rel_dim) 145 | 146 | 147 | class SyntaxPred(Record): 148 | __attributes__ = ['head_id', 'rel_id'] 149 | 150 | 151 | class SyntaxEmbedding(WordShapeEmbedding): 152 | pass 153 | 154 | 155 | class SyntaxEncoder(CNNEncoder): 156 | pass 157 | 158 | 159 | class Syntax(Module): 160 | def __init__(self, emb, encoder, head, rel): 161 | super(Syntax, self).__init__() 162 | self.emb = emb 163 | self.encoder = encoder 164 | self.head = head 165 | self.rel = rel 166 | 167 | def forward(self, word_id, shape_id, pad_mask, target_head_id=None): 168 | x = self.emb(word_id, shape_id) 169 | x = self.encoder(x, pad_mask) 170 | 171 | head_id = self.head(x) 172 | if target_head_id is None: 173 | target_head_id = self.head.decode(head_id, ~pad_mask) 174 | 175 | return SyntaxPred( 176 | head_id=head_id, 177 | rel_id=self.rel(x, target_head_id) 178 | ) 179 | -------------------------------------------------------------------------------- /slovnet/model/tag.py: -------------------------------------------------------------------------------- 1 | 2 | from torch import nn 3 | 4 | from .base import Module 5 | from .crf import CRF 6 | from .emb import WordShapeEmbedding 7 | from .cnn import CNNEncoder 8 | 9 | 10 | class NERHead(Module): 11 | def __init__(self, emb_dim, tags_num): 12 | super(NERHead, self).__init__() 13 | self.emb_dim = emb_dim 14 | self.tags_num = tags_num 15 | 16 | self.proj = nn.Linear(emb_dim, tags_num) 17 | self.crf = CRF(tags_num) 18 | 19 | def forward(self, input): 20 | return self.proj(input) 21 | 22 | 23 | class MorphHead(Module): 24 | def __init__(self, emb_dim, tags_num): 25 | super(MorphHead, self).__init__() 26 | self.emb_dim = emb_dim 27 | self.tags_num = tags_num 28 | 29 | self.proj = nn.Linear(emb_dim, tags_num) 30 | 31 | def decode(self, pred): 32 | return pred.argmax(-1) 33 | 34 | def forward(self, input): 35 | return self.proj(input) 36 | 37 | 38 | class TagEmbedding(WordShapeEmbedding): 39 | pass 40 | 41 | 42 | class TagEncoder(CNNEncoder): 43 | pass 44 | 45 | 46 | class Tag(Module): 47 | def __init__(self, emb, encoder, head): 48 | super(Tag, self).__init__() 49 | self.emb = emb 50 | self.encoder = encoder 51 | self.head = head 52 | 53 | def forward(self, word_id, shape_id, pad_mask=None): 54 | x = self.emb(word_id, shape_id) 55 | x = self.encoder(x, pad_mask) 56 | return self.head(x) 57 | 58 | 59 | class NER(Tag): 60 | pass 61 | 62 | 63 | class Morph(Tag): 64 | pass 65 | -------------------------------------------------------------------------------- /slovnet/pad.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.nn.utils.rnn import pad_sequence as pad_sequence_ 3 | 4 | 5 | def pad_sequence(seqs, fill=0): 6 | return pad_sequence_( 7 | seqs, 8 | batch_first=True, 9 | padding_value=fill 10 | ) 11 | -------------------------------------------------------------------------------- /slovnet/record.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import OrderedDict 3 | 4 | 5 | def parse_annotation(annotation): 6 | type = annotation or str 7 | 8 | repeatable = False 9 | if isinstance(annotation, list): # [Fact] 10 | repeatable = True 11 | type = annotation[0] 12 | 13 | is_record = issubclass(type, Record) 14 | 15 | return 
type, repeatable, is_record 16 | 17 | 18 | class Record(object): 19 | __attributes__ = [] 20 | __annotations__ = {} 21 | 22 | def __init__(self, *args, **kwargs): 23 | for key, value in zip(self.__attributes__, args): 24 | self.__dict__[key] = value 25 | self.__dict__.update(kwargs) 26 | 27 | def __eq__(self, other): 28 | return ( 29 | type(self) == type(other) 30 | and all( 31 | (getattr(self, _) == getattr(other, _)) 32 | for _ in self.__attributes__ 33 | ) 34 | ) 35 | 36 | def __ne__(self, other): 37 | return not self == other 38 | 39 | def __iter__(self): 40 | return (getattr(self, _) for _ in self.__attributes__) 41 | 42 | def __hash__(self): 43 | return hash(tuple(self)) 44 | 45 | def __repr__(self): 46 | name = self.__class__.__name__ 47 | args = ', '.join( 48 | '{key}={value!r}'.format( 49 | key=_, 50 | value=getattr(self, _) 51 | ) 52 | for _ in self.__attributes__ 53 | ) 54 | return '{name}({args})'.format( 55 | name=name, 56 | args=args 57 | ) 58 | 59 | def _repr_pretty_(self, printer, cycle): 60 | name = self.__class__.__name__ 61 | if cycle: 62 | printer.text('{name}(...)'.format(name=name)) 63 | else: 64 | printer.text('{name}('.format(name=name)) 65 | keys = self.__attributes__ 66 | size = len(keys) 67 | if size: 68 | with printer.indent(4): 69 | printer.break_() 70 | for index, key in enumerate(keys): 71 | printer.text(key + '=') 72 | value = getattr(self, key) 73 | printer.pretty(value) 74 | if index < size - 1: 75 | printer.text(',') 76 | printer.break_() 77 | printer.break_() 78 | printer.text(')') 79 | 80 | @property 81 | def as_json(self): 82 | data = OrderedDict() 83 | for key in self.__attributes__: 84 | annotation = self.__annotations__.get(key) 85 | _, repeatable, is_record = parse_annotation(annotation) 86 | 87 | value = getattr(self, key) 88 | if value is None: 89 | continue 90 | 91 | if repeatable and is_record: 92 | value = [_.as_json for _ in value] 93 | elif is_record: 94 | value = value.as_json 95 | 96 | data[key] = value 97 | return data 98 | 99 | @classmethod 100 | def from_json(cls, data): 101 | args = [] 102 | for key in cls.__attributes__: 103 | annotation = cls.__annotations__.get(key) 104 | type, repeatable, is_record = parse_annotation(annotation) 105 | value = data.get(key) 106 | if value is None and repeatable: 107 | value = [] 108 | elif value is not None: 109 | if repeatable and is_record: 110 | value = [type.from_json(_) for _ in value] 111 | elif is_record: 112 | value = type.from_json(value) 113 | args.append(value) 114 | return cls(*args) 115 | 116 | def to(self, device): 117 | cls = type(self) 118 | args = (_.to(device) for _ in self) 119 | return cls(*args) 120 | 121 | def copy(self): 122 | return type(self)(*self) 123 | 124 | def replace(self, **kwargs): 125 | other = self.copy() 126 | for key, value in kwargs.items(): 127 | setattr(other, key, value) 128 | return other 129 | -------------------------------------------------------------------------------- /slovnet/s3.py: -------------------------------------------------------------------------------- 1 | 2 | import boto3 3 | 4 | from .record import Record 5 | from .const import ( 6 | S3_KEY_ID, 7 | S3_KEY, 8 | S3_BUCKET, 9 | S3_ENDPOINT, 10 | S3_REGION 11 | ) 12 | 13 | 14 | class S3(Record): 15 | __attributes__ = ['key_id', 'key', 'bucket', 'endpoint', 'region'] 16 | 17 | def __init__(self, key_id=S3_KEY_ID, key=S3_KEY, bucket=S3_BUCKET, 18 | endpoint=S3_ENDPOINT, region=S3_REGION): 19 | self.key_id = key_id 20 | self.key = key 21 | self.bucket = bucket 22 | self.endpoint = endpoint 23 | 
self.region = region 24 | 25 | self.client = boto3.client( 26 | 's3', 27 | aws_access_key_id=key_id, 28 | aws_secret_access_key=key, 29 | region_name=region, 30 | endpoint_url=endpoint, 31 | ) 32 | 33 | def upload(self, path, key): 34 | self.client.upload_file(path, self.bucket, key) 35 | 36 | def download(self, key, path): 37 | self.client.download_file(self.bucket, key, path) 38 | -------------------------------------------------------------------------------- /slovnet/sent.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | from razdel import sentenize as sentenize_ 5 | 6 | from .record import Record 7 | 8 | 9 | class Sent(Record): 10 | __attributes__ = ['start', 'stop', 'text'] 11 | 12 | 13 | def split_lines(text): 14 | for match in re.finditer(r'([^\r\n]+)', text): 15 | start = match.start() 16 | stop = match.end() 17 | line = match.group(1) 18 | yield Sent(start, stop, line) 19 | 20 | 21 | def sentenize(text): 22 | for line in split_lines(text): 23 | for sent in sentenize_(line.text): 24 | if not sent.text: # '\n\t\n' for example 25 | continue 26 | yield Sent( 27 | sent.start + line.start, 28 | sent.stop + line.start, 29 | sent.text 30 | ) 31 | -------------------------------------------------------------------------------- /slovnet/shape.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | RU = 'RU' 5 | EN = 'EN' 6 | NUM = 'NUM' 7 | PUNCT = 'PUNCT' 8 | OTHER = 'OTHER' 9 | 10 | PUNCTS = ( 11 | '!#$%&()[]\\/*+,.:;<=>?@^_{|}~' # string.punctuation 12 | '-‐−‒⁃–—―' # https://habr.com/ru/post/20588/ 13 | '`"\'«»„“ʼʻ”' 14 | '№…' 15 | ) 16 | TYPE = re.compile( 17 | r''' 18 | (?P[а-яё]+) 19 | |(?P[a-z]+) 20 | |(?P[+-]?\d+) 21 | |(?P[%s]+) 22 | ''' % re.escape(PUNCTS), 23 | re.X | re.IGNORECASE 24 | ) 25 | 26 | X = 'X' 27 | x = 'x' 28 | XX = 'XX' 29 | xx = 'xx' 30 | Xx = 'Xx' 31 | Xx_Xx = 'Xx-Xx' 32 | 33 | 34 | def is_title(word): 35 | return len(word) > 1 and word[0].isupper() and word[1:].islower() 36 | 37 | 38 | def is_dash_title(word): 39 | if '-' in word: 40 | left, right = word.split('-', 1) 41 | return is_title(left) and is_title(right) 42 | 43 | 44 | def word_outline(word): 45 | if len(word) == 1: 46 | if word.isupper(): 47 | return X 48 | else: 49 | return x 50 | else: 51 | if word.isupper(): 52 | return XX 53 | elif word.islower(): 54 | return xx 55 | elif is_title(word): 56 | return Xx 57 | elif is_dash_title(word): 58 | return Xx_Xx 59 | else: 60 | return OTHER 61 | 62 | 63 | def word_type(word): 64 | # СИЗО-6 -> RU 65 | # 2011-2020 -> NUM 66 | match = TYPE.match(word) 67 | if match: 68 | return match.lastgroup 69 | return OTHER 70 | 71 | 72 | def format_shape(type, value): 73 | return '%s_%s' % (type, value) 74 | 75 | 76 | def word_shape(word): 77 | type = word_type(word) 78 | if type in (RU, EN): 79 | return format_shape(type, word_outline(word)) 80 | elif type == PUNCT: 81 | if len(word) > 1 or word not in PUNCTS: 82 | # ..., ?!, **** 83 | word = OTHER 84 | return format_shape(PUNCT, word) 85 | elif type in (NUM, OTHER): 86 | return type 87 | 88 | 89 | OUTLINES = [X, x, XX, xx, Xx, Xx_Xx, OTHER] 90 | SHAPES = ( 91 | [format_shape(RU, _) for _ in OUTLINES] 92 | + [format_shape(EN, _) for _ in OUTLINES] 93 | + [format_shape(PUNCT, _) for _ in PUNCTS] 94 | + [format_shape(PUNCT, OTHER), NUM, OTHER] 95 | ) 96 | -------------------------------------------------------------------------------- /slovnet/span.py: 
-------------------------------------------------------------------------------- 1 | 2 | from .record import Record 3 | 4 | 5 | class Span(Record): 6 | __attributes__ = ['start', 'stop', 'type'] 7 | 8 | def __init__(self, start, stop, type=None): 9 | self.start = start 10 | self.stop = stop 11 | self.type = type 12 | 13 | def offset(self, delta): 14 | return Span( 15 | self.start + delta, 16 | self.stop + delta, 17 | self.type 18 | ) 19 | 20 | 21 | def offset_spans(spans, delta): 22 | for span in spans: 23 | yield span.offset(delta) 24 | 25 | 26 | def envelop_span(envelope, span): 27 | return envelope.start <= span.start and span.stop <= envelope.stop 28 | 29 | 30 | def envelop_spans(envelope, spans): 31 | for span in spans: 32 | if envelop_span(envelope, span): 33 | yield span 34 | 35 | 36 | def select_type_spans(spans, types): 37 | for span in spans: 38 | if span.type in types: 39 | yield span 40 | -------------------------------------------------------------------------------- /slovnet/tar.py: -------------------------------------------------------------------------------- 1 | 2 | import tarfile 3 | from io import BytesIO 4 | 5 | from .record import Record 6 | 7 | 8 | class Tar(Record): 9 | __attributes__ = ['path'] 10 | 11 | mode = 'r' 12 | 13 | def __enter__(self): 14 | self.tar = tarfile.open(self.path, self.mode) 15 | return self 16 | 17 | def __exit__(self, *args): 18 | self.tar.close() 19 | 20 | def open(self, name): 21 | member = self.tar.getmember(name) 22 | return self.tar.extractfile(member) 23 | 24 | def read(self, name): 25 | return self.open(name).read() 26 | 27 | def list(self, prefix=None): 28 | for member in self.tar: 29 | name = member.name 30 | if not prefix or name.startswith(prefix): 31 | yield name 32 | 33 | 34 | class DumpTar(Tar): 35 | mode = 'w' 36 | 37 | def write(self, bytes, name): 38 | file = BytesIO(bytes) 39 | info = tarfile.TarInfo(name) 40 | info.size = len(bytes) 41 | self.tar.addfile(tarinfo=info, fileobj=file) 42 | -------------------------------------------------------------------------------- /slovnet/token.py: -------------------------------------------------------------------------------- 1 | 2 | from razdel import tokenize as tokenize_ 3 | 4 | from .record import Record 5 | 6 | 7 | class Token(Record): 8 | __attributes__ = ['start', 'stop', 'text'] 9 | 10 | 11 | def tokenize(text): 12 | for token in tokenize_(text): 13 | yield Token( 14 | token.start, 15 | token.stop, 16 | token.text 17 | ) 18 | 19 | 20 | def find_tokens(text, chunks): 21 | offset = 0 22 | for chunk in chunks: 23 | start = text.find(chunk, offset) 24 | stop = start + len(chunk) 25 | yield Token(start, stop, chunk) 26 | offset = stop 27 | -------------------------------------------------------------------------------- /slovnet/visitor.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Visitor(object): 4 | def resolve_method(self, item): 5 | for cls in item.__class__.__mro__: 6 | name = 'visit_' + cls.__name__ 7 | method = getattr(self, name, None) 8 | if method: 9 | return method 10 | raise ValueError('no method for {type!r}'.format( 11 | type=type(item) 12 | )) 13 | 14 | def visit(self, item): 15 | return self.resolve_method(item)(item) 16 | 17 | def __call__(self, item): 18 | return self.visit(item) 19 | -------------------------------------------------------------------------------- /slovnet/vocab.py: -------------------------------------------------------------------------------- 1 | 2 | from .record import Record 3 | from .io 
import ( 4 | load_lines, 5 | dump_lines 6 | ) 7 | from .const import ( 8 | B, I, O, 9 | 10 | UNK, PAD, 11 | CLS, SEP, 12 | MASK, 13 | ) 14 | from .bio import format_bio 15 | 16 | 17 | class Vocab(Record): 18 | __attributes__ = ['items'] 19 | 20 | def __init__(self, items): 21 | self.items = items 22 | self.item_ids = { 23 | item: id 24 | for id, item in enumerate(self.items) 25 | } 26 | self.unk_id = self.item_ids.get(UNK) 27 | self.pad_id = self.item_ids.get(PAD) 28 | 29 | def encode(self, item): 30 | return self.item_ids.get(item, self.unk_id) 31 | 32 | def decode(self, id): 33 | return self.items[id] 34 | 35 | def __len__(self): 36 | return len(self.items) 37 | 38 | def __repr__(self): 39 | return '%s(items=[...])' % self.__class__.__name__ 40 | 41 | def _repr_pretty_(self, printer, cycle): 42 | printer.text(repr(self)) 43 | 44 | @classmethod 45 | def load(cls, path): 46 | items = list(load_lines(path)) 47 | return cls(items) 48 | 49 | def dump(self, path): 50 | dump_lines(self.items, path) 51 | 52 | 53 | class BERTVocab(Vocab): 54 | def __init__(self, items): 55 | super(BERTVocab, self).__init__(items) 56 | self.sep_id = self.item_ids[SEP] 57 | self.cls_id = self.item_ids[CLS] 58 | self.mask_id = self.item_ids[MASK] 59 | 60 | 61 | class BIOTagsVocab(Vocab): 62 | def __init__(self, types): 63 | self.types = types 64 | 65 | items = [PAD, O] 66 | for type in types: 67 | for part in [B, I]: 68 | items.append(format_bio(part, type)) 69 | 70 | super(BIOTagsVocab, self).__init__(items) 71 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | from os.path import join, dirname, basename, exists 5 | from os import makedirs 6 | from urllib.request import urlopen 7 | from shutil import copyfileobj 8 | 9 | from navec import Navec 10 | from slovnet import NER, Morph, Syntax 11 | 12 | 13 | DATA_DIR = join(dirname(__file__), '../data/test') 14 | 15 | 16 | def download(url, dir=DATA_DIR): 17 | path = join(dir, basename(url)) 18 | if exists(path): 19 | return path 20 | 21 | if not exists(dir): 22 | makedirs(dir) 23 | 24 | with urlopen(url) as source: 25 | with open(path, 'wb') as target: 26 | copyfileobj(source, target) 27 | 28 | return path 29 | 30 | 31 | @pytest.fixture(scope='module') 32 | def navec(): 33 | path = download('https://storage.yandexcloud.net/natasha-navec/packs/navec_news_v1_1B_250K_300d_100q.tar') 34 | return Navec.load(path) 35 | 36 | 37 | @pytest.fixture(scope='module') 38 | def ner(navec): 39 | path = download('https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_ner_news_v1.tar') 40 | return NER.load(path).navec(navec) 41 | 42 | 43 | @pytest.fixture(scope='module') 44 | def morph(navec): 45 | path = download('https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_morph_news_v1.tar') 46 | return Morph.load(path).navec(navec) 47 | 48 | 49 | @pytest.fixture(scope='module') 50 | def syntax(navec): 51 | path = download('https://storage.yandexcloud.net/natasha-slovnet/packs/slovnet_syntax_news_v1.tar') 52 | return Syntax.load(path).navec(navec) 53 | 54 | 55 | def test_ner(ner): 56 | text = 'На них удержали лидерство действующие руководители и партии — Денис Пушилин и «Донецкая республика» в ДНР и Леонид Пасечник с движением «Мир Луганщине» в ЛНР.' 
57 | 58 | markup = ner(text) 59 | 60 | pred = [] 61 | for span in markup.spans: 62 | chunk = markup.text[span.start:span.stop] 63 | pred.append([span.type, chunk]) 64 | 65 | assert pred == [ 66 | ['PER', 'Денис Пушилин'], 67 | ['ORG', 'Донецкая республика'], 68 | ['LOC', 'ДНР'], 69 | ['PER', 'Леонид Пасечник'], 70 | ['ORG', 'Мир Луганщине'], 71 | ['LOC', 'ЛНР'] 72 | ] 73 | 74 | 75 | def test_morph(morph): 76 | words = ['Об', 'этом', 'говорится', 'в', 'документе', ',', 'опубликованном', 'в', 'официальном', 'журнале', 'Евросоюза', '.'] 77 | 78 | markup = morph(words) 79 | 80 | pred = [ 81 | [_.text, _.tag] 82 | for _ in markup.tokens 83 | ] 84 | assert pred == [ 85 | ['Об', 'ADP'], 86 | ['этом', 'PRON|Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing'], 87 | ['говорится', 'VERB|Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Pass'], 88 | ['в', 'ADP'], 89 | ['документе', 'NOUN|Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing'], 90 | [',', 'PUNCT'], 91 | ['опубликованном', 'VERB|Aspect=Perf|Case=Loc|Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass'], 92 | ['в', 'ADP'], 93 | ['официальном', 'ADJ|Case=Loc|Degree=Pos|Gender=Masc|Number=Sing'], 94 | ['журнале', 'NOUN|Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing'], 95 | ['Евросоюза', 'PROPN|Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing'], 96 | ['.', 'PUNCT'] 97 | ] 98 | 99 | 100 | def test_syntax(syntax): 101 | words = ['Опубликованы', 'новые', 'данные', 'по', 'заражению', 'коронавирусом', 'в', 'Москве'] 102 | 103 | markup = syntax(words) 104 | 105 | ids = {_.id: _ for _ in markup.tokens} 106 | pred = [] 107 | for token in markup.tokens: 108 | head = ids.get(token.head_id) 109 | if head: 110 | pred.append([token.text, head.rel, head.text]) 111 | else: 112 | pred.append(token.text) 113 | 114 | assert pred == [ 115 | 'Опубликованы', 116 | ['новые', 'nsubj:pass', 'данные'], 117 | ['данные', 'root', 'Опубликованы'], 118 | ['по', 'nmod', 'заражению'], 119 | ['заражению', 'nsubj:pass', 'данные'], 120 | ['коронавирусом', 'nmod', 'заражению'], 121 | ['в', 'obl', 'Москве'], 122 | ['Москве', 'nmod', 'коронавирусом'] 123 | ] 124 | -------------------------------------------------------------------------------- /tests/test_bio.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | from slovnet.token import tokenize 5 | from slovnet.const import ( 6 | B, I, O, 7 | PER, LOC 8 | ) 9 | from slovnet.bio import ( 10 | format_bio, 11 | 12 | io_spans, 13 | spans_io, 14 | 15 | bio_spans, 16 | spans_bio, 17 | 18 | bio_io, 19 | select_type_tags 20 | ) 21 | 22 | 23 | T1, T2, T3, T4, T5 = tokenize('1 2 3 4 5') 24 | 25 | B_PER = format_bio(B, PER) 26 | I_PER = format_bio(I, PER) 27 | B_LOC = format_bio(B, LOC) 28 | I_LOC = format_bio(I, LOC) 29 | 30 | 31 | TESTS = [ 32 | [ 33 | [T1, T2, T3], 34 | [O, O, O], 35 | ], 36 | [ 37 | [], 38 | [], 39 | ] 40 | ] 41 | 42 | IO_TESTS = [ 43 | [ 44 | [T1, T2, T3], 45 | [I_PER, O, O] 46 | ], 47 | [ 48 | [T1, T2, T3], 49 | [I_PER, I_PER, O] 50 | ], 51 | [ 52 | [T1, T2, T3], 53 | [I_PER, I_LOC, O] 54 | ], 55 | [ 56 | [T1, T2], 57 | [I_PER, I_PER] 58 | ], 59 | ] 60 | 61 | BIO_TESTS = [ 62 | [ 63 | [T1, T2, T3], 64 | [B_PER, O, O], 65 | ], 66 | [ 67 | [T1, T2, T3], 68 | [B_PER, I_PER, O], 69 | ], 70 | [ 71 | [T1, T2], 72 | [B_PER, I_PER], 73 | ], 74 | [ 75 | [T1, T2, T3], 76 | [B_PER, B_LOC, O], 77 | ], 78 | [ 79 | [T1, T2, T3], 80 | [B_PER, B_PER, O], 81 | ], 82 | ] 83 | 84 | CONVERT_TESTS = [ 85 | [ 86 | [B_PER, I_PER], 87 | [I] 88 | ] 89 | ] 90 | 91 | 
92 | @pytest.mark.parametrize('test', TESTS + IO_TESTS) 93 | def test_io(test): 94 | tokens, tags = test 95 | spans = io_spans(tokens, tags) 96 | guess = spans_io(tokens, spans) 97 | assert tags == list(guess) 98 | 99 | 100 | @pytest.mark.parametrize('test', TESTS + BIO_TESTS) 101 | def test_bio(test): 102 | tokens, tags = test 103 | spans = bio_spans(tokens, tags) 104 | guess = spans_bio(tokens, spans) 105 | assert tags == list(guess) 106 | 107 | 108 | def test_convert(): 109 | guess = bio_io([B_PER, I_PER, I_LOC]) 110 | etalon = [I_PER, I_PER, I_LOC] 111 | assert etalon == list(guess) 112 | 113 | 114 | def test_select(): 115 | guess = select_type_tags([B_PER, I_LOC], PER) 116 | etalon = [B_PER, O] 117 | assert etalon == list(guess) 118 | -------------------------------------------------------------------------------- /tests/test_shape.py: -------------------------------------------------------------------------------- 1 | 2 | import pytest 3 | 4 | from slovnet.shape import ( 5 | X, x, xx, XX, Xx, Xx_Xx, OTHER, 6 | RU, EN, NUM, PUNCT, 7 | 8 | word_shape, 9 | format_shape as s 10 | ) 11 | from slovnet.token import tokenize 12 | 13 | 14 | TESTS = [ 15 | [ 16 | 'В', 17 | [s(RU, X)], 18 | ], 19 | [ 20 | 'ИЛ-2', 21 | [s(RU, XX)], 22 | ], 23 | [ 24 | '105г.', 25 | [NUM, s(RU, x), s(PUNCT, '.')] 26 | ], 27 | [ 28 | 'Pal-Yz', 29 | [s(EN, Xx_Xx)] 30 | ], 31 | [ 32 | 'и Я-ДаА', 33 | [s(RU, x), s(RU, OTHER)] 34 | ], 35 | [ 36 | 'Прибыл на I@', 37 | [s(RU, Xx), s(RU, xx), s(EN, X), s(PUNCT, '@')] 38 | ], 39 | [ 40 | 'и -‐', 41 | [s(RU, x), s(PUNCT, OTHER)] 42 | ] 43 | ] 44 | 45 | 46 | @pytest.mark.parametrize('test', TESTS) 47 | def test_shape(test): 48 | text, etalon = test 49 | tokens = tokenize(text) 50 | guess = [word_shape(_.text) for _ in tokens] 51 | assert guess == etalon 52 | --------------------------------------------------------------------------------
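# --- illustrative sketch, not part of the repository ---
# How the pieces above are meant to fit together: assemble a small tag
# model from slovnet/model and export it to the numpy runtime in
# slovnet/exec via ExecMixin.to_exec(). All dimensions below are
# arbitrary placeholders chosen only for demonstration; the exec-side
# classes live in slovnet/exec/model.py, which is referenced by
# ExecVisitor but not reproduced in this listing.
from slovnet.model.emb import Embedding
from slovnet.model.tag import TagEmbedding, TagEncoder, NERHead, NER

word = Embedding(vocab_size=1000, dim=100, pad_id=0)
shape = Embedding(vocab_size=32, dim=10, pad_id=0)
emb = TagEmbedding(word, shape)  # concatenates word + shape embeddings
encoder = TagEncoder(input_dim=emb.dim, layer_dims=[64, 64], kernel_size=3)
head = NERHead(encoder.dim, tags_num=8)  # e.g. PAD, O plus B/I for PER, LOC, ORG
model = NER(emb, encoder, head)

# ExecVisitor walks the torch modules and mirrors them with the
# numpy-backed equivalents from slovnet.exec.model.
exec_model = model.to_exec()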