├── corus
│   ├── third
│   │   └── __init__.py
│   ├── __init__.py
│   ├── sources
│   │   ├── gramru.py
│   │   ├── taiga
│   │   │   ├── __init__.py
│   │   │   ├── subtitles.py
│   │   │   ├── kp.py
│   │   │   ├── interfax.py
│   │   │   ├── lenta.py
│   │   │   ├── magazines.py
│   │   │   ├── nplus1.py
│   │   │   ├── arzamas.py
│   │   │   ├── social.py
│   │   │   ├── proza.py
│   │   │   ├── fontanka.py
│   │   │   └── common.py
│   │   ├── simlex.py
│   │   ├── librusec.py
│   │   ├── __init__.py
│   │   ├── wikiner.py
│   │   ├── lenta.py
│   │   ├── gareev.py
│   │   ├── wiki.py
│   │   ├── russe.py
│   │   ├── rudrec.py
│   │   ├── ne5.py
│   │   ├── persons.py
│   │   ├── ria.py
│   │   ├── buriy.py
│   │   ├── toloka.py
│   │   ├── morphoru.py
│   │   ├── mokoron.py
│   │   ├── ud.py
│   │   ├── ods.py
│   │   ├── omnia.py
│   │   ├── corpora.py
│   │   ├── bsnlp.py
│   │   ├── factru.py
│   │   └── meta.py
│   ├── path.py
│   ├── record.py
│   ├── zip.py
│   ├── io.py
│   └── readme.py
├── requirements
│   ├── dev.txt
│   └── ci.txt
├── data
│   ├── ria.json.gz
│   ├── ods
│   │   ├── iz.csv.gz
│   │   ├── ria.csv.gz
│   │   ├── rt.csv.gz
│   │   ├── gazeta.csv.gz
│   │   ├── meduza.csv.gz
│   │   ├── interfax.csv.gz
│   │   └── tass-001.csv.gz
│   ├── Persons-1000.zip
│   ├── ru_om1000a.x1_.xz
│   ├── taiga
│   │   ├── KP.tar.gz
│   │   ├── Lenta.tar.gz
│   │   ├── NPlus1.tar.gz
│   │   ├── proza_ru.zip
│   │   ├── social.tar.gz
│   │   ├── stihi_ru.zip
│   │   ├── Arzamas.tar.gz
│   │   ├── Fontanka.tar.gz
│   │   ├── Interfax.tar.gz
│   │   ├── Magazines.tar.gz
│   │   └── Subtitles.tar.gz
│   ├── buriy
│   │   ├── lenta.tar.bz2
│   │   ├── webhose-2016.tar.bz2
│   │   ├── news-articles-2014.tar.bz2
│   │   ├── news-articles-2015-part1.tar.bz2
│   │   └── news-articles-2015-part2.tar.bz2
│   ├── aij-wikiner-ru-wp3.bz2
│   ├── lenta-ru-news.csv.bz2
│   ├── lenta-ru-news.csv.gz
│   ├── librusec_fb2.plain.gz
│   ├── annot.opcorpora.xml.byfile.zip
│   ├── ruwiki-latest-pages-articles.xml.bz2
│   ├── russe
│   │   └── sem
│   │       ├── ae2.csv
│   │       ├── rt.csv
│   │       ├── hj.csv
│   │       └── ae-train.csv
│   ├── simlex
│   │   └── ru_simlex965_tagged.tsv
│   ├── factRuEval-2016-master
│   │   └── devset
│   │       ├── book_58.coref
│   │       ├── book_58.objects
│   │       ├── book_58.txt
│   │       ├── book_58.facts
│   │       ├── book_58.tokens
│   │       └── book_58.spans
│   ├── toloka
│   │   ├── ruadrect
│   │   │   └── task2_ru_test.tsv
│   │   └── lrwc-1.1-aggregated.tsv
│   ├── rus-ner-news-corpus.iob
│   │   └── biztass-1.txt.iob
│   ├── Collection5
│   │   ├── 001.ann
│   │   └── 001.txt
│   ├── morphoru
│   │   ├── gikrya_new_test.out
│   │   ├── unamb_sent_14_6.conllu
│   │   └── RNCgoldInUD_Morpho.conll
│   ├── rudrec
│   │   └── rudrec_annotated.json
│   ├── bsnlp
│   │   └── test_pl_cs_ru_bg
│   │       ├── annotated
│   │       │   └── nord_stream
│   │       │       └── ru
│   │       │           ├── Nord_Stream_2_extra.xml_file_1.out
│   │       │           └── Nord_Stream_2_extra.xml_file_7.out
│   │       └── raw
│   │           └── nord_stream
│   │               └── ru
│   │                   ├── Nord_Stream_2_extra.xml_file_7.txt
│   │                   └── Nord_Stream_2_extra.xml_file_1.txt
│   ├── ud
│   │   ├── ru_taiga-ud-dev.conllu
│   │   ├── ru_gsd-ud-dev.conllu
│   │   ├── ru_syntagrus-ud-dev.conllu
│   │   └── ru_pud-ud-test.conllu
│   ├── gramru
│   │   └── GramEval_private_test.conllu
│   ├── mokoron
│   │   └── db.sql
│   └── sample.ipynb
├── setup.cfg
├── .gitignore
├── Makefile
├── .github
│   └── workflows
│       ├── pypi.yml
│       └── test.yml
├── setup.py
├── LICENSE
└── README.md
/corus/third/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/corus/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .sources import * # noqa
3 |
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | flake8
2 | ipykernel
3 | nbconvert
4 |
--------------------------------------------------------------------------------
/requirements/ci.txt:
--------------------------------------------------------------------------------
1 | flake8==5.0.4
2 | jupyter==1.0.0
3 | nbconvert==7.2.8
4 |
--------------------------------------------------------------------------------
/data/ria.json.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ria.json.gz
--------------------------------------------------------------------------------
/data/ods/iz.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/iz.csv.gz
--------------------------------------------------------------------------------
/data/ods/ria.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/ria.csv.gz
--------------------------------------------------------------------------------
/data/ods/rt.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/rt.csv.gz
--------------------------------------------------------------------------------
/data/Persons-1000.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/Persons-1000.zip
--------------------------------------------------------------------------------
/data/ods/gazeta.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/gazeta.csv.gz
--------------------------------------------------------------------------------
/data/ods/meduza.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/meduza.csv.gz
--------------------------------------------------------------------------------
/data/ru_om1000a.x1_.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ru_om1000a.x1_.xz
--------------------------------------------------------------------------------
/data/taiga/KP.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/KP.tar.gz
--------------------------------------------------------------------------------
/data/buriy/lenta.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/lenta.tar.bz2
--------------------------------------------------------------------------------
/data/ods/interfax.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/interfax.csv.gz
--------------------------------------------------------------------------------
/data/ods/tass-001.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/tass-001.csv.gz
--------------------------------------------------------------------------------
/data/taiga/Lenta.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Lenta.tar.gz
--------------------------------------------------------------------------------
/data/taiga/NPlus1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/NPlus1.tar.gz
--------------------------------------------------------------------------------
/data/taiga/proza_ru.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/proza_ru.zip
--------------------------------------------------------------------------------
/data/taiga/social.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/social.tar.gz
--------------------------------------------------------------------------------
/data/taiga/stihi_ru.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/stihi_ru.zip
--------------------------------------------------------------------------------
/data/aij-wikiner-ru-wp3.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/aij-wikiner-ru-wp3.bz2
--------------------------------------------------------------------------------
/data/lenta-ru-news.csv.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/lenta-ru-news.csv.bz2
--------------------------------------------------------------------------------
/data/lenta-ru-news.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/lenta-ru-news.csv.gz
--------------------------------------------------------------------------------
/data/librusec_fb2.plain.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/librusec_fb2.plain.gz
--------------------------------------------------------------------------------
/data/taiga/Arzamas.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Arzamas.tar.gz
--------------------------------------------------------------------------------
/data/taiga/Fontanka.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Fontanka.tar.gz
--------------------------------------------------------------------------------
/data/taiga/Interfax.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Interfax.tar.gz
--------------------------------------------------------------------------------
/data/taiga/Magazines.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Magazines.tar.gz
--------------------------------------------------------------------------------
/data/taiga/Subtitles.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Subtitles.tar.gz
--------------------------------------------------------------------------------
/data/buriy/webhose-2016.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/webhose-2016.tar.bz2
--------------------------------------------------------------------------------
/corus/sources/gramru.py:
--------------------------------------------------------------------------------
1 |
2 | from .ud import load_ud
3 |
4 |
5 | def load_gramru(path):
6 | return load_ud(path)
7 |
--------------------------------------------------------------------------------
/data/annot.opcorpora.xml.byfile.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/annot.opcorpora.xml.byfile.zip
--------------------------------------------------------------------------------
/data/buriy/news-articles-2014.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/news-articles-2014.tar.bz2
--------------------------------------------------------------------------------
/data/ruwiki-latest-pages-articles.xml.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/ruwiki-latest-pages-articles.xml.bz2
--------------------------------------------------------------------------------
/data/buriy/news-articles-2015-part1.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/news-articles-2015-part1.tar.bz2
--------------------------------------------------------------------------------
/data/buriy/news-articles-2015-part2.tar.bz2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/news-articles-2015-part2.tar.bz2
--------------------------------------------------------------------------------
/data/russe/sem/ae2.csv:
--------------------------------------------------------------------------------
1 | word1,word2,sim
2 | абажур,торшер,1
3 | абажур,люстра,1
4 | абажур,лампа,1
5 | абажур,свет,1
6 | абажур,ночник,1
7 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 |
2 | [flake8]
3 | # E501 line too long
4 | # W503 line break before binary op
5 | extend-ignore = E501,W503
6 | exclude = corus/third
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .cache
2 | .coverage
3 | coverage.xml
4 | .pytest_cache
5 | .ipynb_checkpoints
6 | .DS_Store
7 | *.pyc
8 | *.egg-info
9 | build
10 | dist
11 | notes
12 |
--------------------------------------------------------------------------------
/data/russe/sem/rt.csv:
--------------------------------------------------------------------------------
1 | word1,word2,sim
2 | аберрация,год,0
3 | аберрация,человек,0
4 | аберрация,заблуждение,1
5 | абзац,отрывок,1
6 | абзац,время,0
7 | абзац,район,0
8 | абиссиния,население,0
9 |
--------------------------------------------------------------------------------
/data/russe/sem/hj.csv:
--------------------------------------------------------------------------------
1 | word1,word2,sim
2 | автомобиль,машина,0.958333
3 | маг,волшебник,0.958333
4 | доллар,бакс,0.952381
5 | мальчик,парень,0.952381
6 | машина,автомобиль,0.952381
7 | кладбище,погост,0.916667
8 |
--------------------------------------------------------------------------------
/corus/path.py:
--------------------------------------------------------------------------------
1 |
2 | from os import listdir as list_dir # noqa
3 | from os.path import join as join_path # noqa
4 | from os.path import basename as get_filename # noqa
5 | from os.path import splitext as split_ext # noqa
6 |
--------------------------------------------------------------------------------
/data/russe/sem/ae-train.csv:
--------------------------------------------------------------------------------
1 | word1,word2,related,sim
2 | автомат,калашникова,assoc,1
3 | автомат,пулемет,assoc,1
4 | автомат,пистолет,assoc,1
5 | автомат,война,assoc,1
6 | автомат,газ. вода,assoc,1
7 | автомат,год,random,0
8 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 |
2 | lint:
3 | flake8 corus
4 |
5 | exec-docs:
6 | python -m nbconvert \
7 | --ExecutePreprocessor.kernel_name=python3 \
8 | --ClearMetadataPreprocessor.enabled=True \
9 | --execute --to notebook --inplace \
10 | docs.ipynb
11 |
--------------------------------------------------------------------------------
/data/simlex/ru_simlex965_tagged.tsv:
--------------------------------------------------------------------------------
1 | # Word1 Word2 Average Score
2 | авария_NOUN бедствие_NOUN 6.15
3 | август_NOUN месяц_NOUN 2.85
4 | авиация_NOUN полет_NOUN 6.77
5 | эксцентричный_ADJ странный_ADJ 6.31
6 | эластичный_ADJ гибкий_ADJ 7.92
7 | элегантность_NOUN стиль_NOUN 6.46
8 |
--------------------------------------------------------------------------------
/data/factRuEval-2016-master/devset/book_58.coref:
--------------------------------------------------------------------------------
1 | 2 16967 16972
2 | name Италия
3 |
4 | 3 16968 16970 16974
5 | name Грузия
6 |
7 | 4 16975
8 | name МИД Грузии
9 |
10 | 5 16969
11 | firstname Виторио
12 | lastname Сандали
13 |
14 | 6 16971
15 | firstname Александр
16 | lastname Налбандов
17 |
18 |
--------------------------------------------------------------------------------
/data/toloka/ruadrect/task2_ru_test.tsv:
--------------------------------------------------------------------------------
1 | tweet_id tweet label
2 | 892079521922416641 @A_Kapustin запретить на хрен.. недосмотр однако.. только прозак, только хардкор 0
3 | 1089927935031676929 не тратьте деньги на образование, тратьте на транквилизаторы: какая разница какие у тебя оценки когда ты залипла под ксанаксом? 0
4 |
--------------------------------------------------------------------------------
/data/factRuEval-2016-master/devset/book_58.objects:
--------------------------------------------------------------------------------
1 | 16972 LocOrg 32962 # Италии
2 | 16975 Org 32963 32965 # миде Грузии
3 | 16974 LocOrg 32965 # Грузии
4 | 16967 LocOrg 32951 # Италии
5 | 16968 LocOrg 32952 # Грузии
6 | 16969 Person 32953 32954 # Виторио Сандали
7 | 16970 LocOrg 32955 # Грузии
8 | 16971 Person 32956 32957 # Александром Налбандовым
9 |
--------------------------------------------------------------------------------
/data/factRuEval-2016-master/devset/book_58.txt:
--------------------------------------------------------------------------------
1 | Встреча с послом Италии в миде Грузии
2 |
3 | По инициативе итальянской стороны чрезвычайный и полномочный посол Италии в Грузии Виторио Сандали встретился с заместителем министра иностранных дел Грузии Александром Налбандовым. Предметом обсуждения стали вопросы сотрудничества в международных организациях.
4 |
--------------------------------------------------------------------------------
/corus/sources/taiga/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .arzamas import * # noqa
3 | from .fontanka import * # noqa
4 | from .interfax import * # noqa
5 | from .kp import * # noqa
6 | from .lenta import * # noqa
7 | from .magazines import * # noqa
8 | from .nplus1 import * # noqa
9 | from .subtitles import * # noqa
10 | from .social import * # noqa
11 | from .proza import * # noqa
12 |
--------------------------------------------------------------------------------
/data/factRuEval-2016-master/devset/book_58.facts:
--------------------------------------------------------------------------------
1 | 58-0 Meeting
2 | Participant obj5 Сандали Виторио
3 | Participant obj6 Налбандов Александр
4 |
5 | 58-1 Occupation
6 | Who obj5 Сандали Виторио
7 | Where obj2 Италия
8 | Position span32958 чрезвычайный и полномочный посол | span64007 чрезвычайный и полномочный посол Италии в Грузии
9 |
10 | 58-2 Occupation
11 | Who obj6 Налбандов Александр
12 | Position span32959 заместителем министра иностранных дел
13 | Where obj3 Грузия
14 |
--------------------------------------------------------------------------------
/data/toloka/lrwc-1.1-aggregated.tsv:
--------------------------------------------------------------------------------
1 | INPUT:hyponym INPUT:hypernym INPUT:genitive OUTPUT:judgement CONFIDENCE:judgement
2 | автомобиль автомашина автомашины true 99.75%
3 | автомобиль автомототранспорт автомототранспорта true 99.96%
4 | автомобиль автомототранспортный автомототранспортного true 99.99%
5 | автомобиль автомототранспортное_средство автомототранспортного_средства true 99.99%
6 | автомобиль внедорожник внедорожника false 61.28%
7 | автомобиль железный_конь железного_коня false 77.76%
8 |
--------------------------------------------------------------------------------
/data/rus-ner-news-corpus.iob/biztass-1.txt.iob:
--------------------------------------------------------------------------------
1 | МОСКВА O
2 | , O
3 | 21 O
4 | июня O
5 | . O
6 | / O
7 | БИЗНЕС-ТАСС B-ORG
8 | / O
9 | . O
10 | Группа O
11 | НЛМК B-ORG
12 | заняла O
13 | второе O
14 | место O
15 | в O
16 | рейтинге O
17 | 35 O
18 | наиболее O
19 | конкурентоспособных O
20 | сталелитейных O
21 | компаний O
22 | мира O
23 | . O
24 | Рейтинг O
25 | составлялся O
26 | World B-ORG
27 | Steel I-ORG
28 | Dynamics I-ORG
29 | , O
30 | ведущей O
31 | международной O
32 | исследовательской O
33 | компанией O
34 | , O
35 | на O
36 | основе O
37 | оценки O
38 | 23 O
39 |
--------------------------------------------------------------------------------
/data/Collection5/001.ann:
--------------------------------------------------------------------------------
1 | T1 GEOPOLIT 0 6 Россия
2 | T2 GEOPOLIT 50 53 США
3 | T3 GEOPOLIT 57 63 Грузию
4 | T4 LOC 87 93 МОСКВА
5 | T5 MEDIA 103 114 РИА Новости
6 | T6 GEOPOLIT 116 122 Россия
7 | T7 GEOPOLIT 141 144 США
8 | T8 GEOPOLIT 161 168 Тбилиси
9 | T9 GEOPOLIT 301 307 России
10 | T10 PER 308 324 Григорий Карасин
11 | T11 GEOPOLIT 383 386 США
12 | T12 PER 387 402 Дэниэлом Фридом
13 | T13 GEOPOLIT 505 517 Южной Осетии
14 | T14 GEOPOLIT 703 709 Россия
15 | T15 GEOPOLIT 723 730 Тбилиси
16 | T16 GEOPOLIT 815 825 Вашингтона
17 | T17 ORG 838 841 МИД
18 | T18 GEOPOLIT 842 848 России
19 |
--------------------------------------------------------------------------------
/data/morphoru/gikrya_new_test.out:
--------------------------------------------------------------------------------
1 | 1 А а CONJ _
2 | 2 потом потом ADV Degree=Pos
3 | 3 опять опять ADV Degree=Pos
4 | 4 появлялись появляться VERB Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid
5 | 5 эсэсовцы эсэсовец NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur
6 | 6 . . PUNCT _
7 |
8 | 1 Вокруг вокруг ADP _
9 | 2 него он PRON Case=Gen|Gender=Masc|Number=Sing|Person=3
10 | 3 вспыхнул вспыхнуть VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act
11 | 4 зеленый зелёный ADJ Case=Nom|Degree=Pos|Gender=Masc|Number=Sing
12 | 5 свет свет NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
13 |
--------------------------------------------------------------------------------
/.github/workflows/pypi.yml:
--------------------------------------------------------------------------------
1 | name: Publish PyPI
2 |
3 | on:
4 | push:
5 | tags:
6 | - v*
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - uses: actions/checkout@v3
14 |
15 | - name: Set up Python
16 | uses: actions/setup-python@v4
17 | with:
18 | python-version: '3.10'
19 |
20 | - name: Install dependencies
21 | run: pip install wheel
22 |
23 | - name: Build package
24 | run: python setup.py sdist bdist_wheel
25 |
26 | - name: Publish PyPI
27 | uses: pypa/gh-action-pypi-publish@release/v1
28 | with:
29 | password: ${{ secrets.PYPI_API_TOKEN }}
30 |
31 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 | strategy:
10 | matrix:
11 | python-version: ['3.8', '3.9', '3.10', '3.11']
12 |
13 | steps:
14 | - uses: actions/checkout@v3
15 |
16 | - name: Set up Python
17 | uses: actions/setup-python@v4
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 |
21 | - name: Install dependencies
22 | run: |
23 | pip install -r requirements/ci.txt
24 | pip install -e .
25 |
26 | - name: Test
27 | run: |
28 | make lint
29 | make exec-docs
30 |
--------------------------------------------------------------------------------
/corus/sources/simlex.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import (
4 | load_lines,
5 | parse_tsv,
6 | skip_header
7 | )
8 |
9 |
10 | class SimlexRecord(Record):
11 | __attributes__ = ['word1', 'word2', 'score']
12 |
13 | def __init__(self, word1, word2, score):
14 | self.word1 = word1
15 | self.word2 = word2
16 | self.score = score
17 |
18 |
19 | def parse_simlex(lines):
20 | skip_header(lines)
21 | records = parse_tsv(lines)
22 | for word1, word2, score in records:
23 | score = float(score)
24 | yield SimlexRecord(word1, word2, score)
25 |
26 |
27 | def load_simlex(path):
28 | lines = load_lines(path)
29 | return parse_simlex(lines)
30 |
--------------------------------------------------------------------------------
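A minimal usage sketch for load_simlex, assuming the tagged SimLex file shipped under data/simlex/ above (the loader is re-exported from the package root via corus/sources/__init__.py):

from corus import load_simlex

records = load_simlex('data/simlex/ru_simlex965_tagged.tsv')
for record in records:
    # SimlexRecord(word1='авария_NOUN', word2='бедствие_NOUN', score=6.15)
    print(record.word1, record.word2, record.score)
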
/setup.py:
--------------------------------------------------------------------------------
1 |
2 | from setuptools import setup, find_packages
3 |
4 |
5 | with open('README.md') as file:
6 | description = file.read()
7 |
8 |
9 | setup(
10 | name='corus',
11 | version='0.10.0',
12 | description='Links to Russian corpora, functions for loading and parsing',
13 | long_description=description,
14 | long_description_content_type='text/markdown',
15 | url='https://github.com/natasha/corus',
16 | author='Alexander Kukushkin',
17 | author_email='alex@alexkuk.ru',
18 | license='MIT',
19 | classifiers=[
20 | 'License :: OSI Approved :: MIT License',
21 | 'Programming Language :: Python :: 3',
22 | ],
23 | keywords='corpora, russian, nlp, datasets',
24 | install_requires=[],
25 | packages=find_packages(),
26 | )
27 |
28 |
--------------------------------------------------------------------------------
/data/Collection5/001.txt:
--------------------------------------------------------------------------------
1 | Россия рассчитывает на конструктивное воздействие США на Грузию
2 |
3 | 04/08/2008 12:08
4 |
5 | МОСКВА, 4 авг - РИА Новости. Россия рассчитывает, что США воздействуют на Тбилиси в связи с обострением ситуации в зоне грузино-осетинского конфликта. Об этом статс-секретарь - заместитель министра иностранных дел России Григорий Карасин заявил в телефонном разговоре с заместителем госсекретаря США Дэниэлом Фридом.
6 |
7 | "С российской стороны выражена глубокая озабоченность в связи с новым витком напряженности вокруг Южной Осетии, противозаконными действиями грузинской стороны по наращиванию своих вооруженных сил в регионе, бесконтрольным строительством фортификационных сооружений", - говорится в сообщении.
8 |
9 | "Россия уже призвала Тбилиси к ответственной линии и рассчитывает также на конструктивное воздействие со стороны Вашингтона", - сообщил МИД России.
--------------------------------------------------------------------------------
/data/rudrec/rudrec_annotated.json:
--------------------------------------------------------------------------------
1 | {"file_name": "172744.tsv", "text": "нам прописали, так мой ребенок сыпью покрылся, глаза опухли, сверху и снизу на веках высыпала сыпь, ( 8 месяцев сыну)А от виферона такого не было... У кого ещё такие побочки, отзовитесь!1 Чем спасались?\n", "entities": [{"start": 122, "entity_type": "Drugform", "end": 130, "entity_id": "*[0]_se", "entity_text": "виферона", "concept_id": "C0021735", "concept_name": NaN}, {"start": 31, "entity_type": "ADR", "end": 45, "entity_id": "*[1]", "entity_text": "сыпью покрылся", "concept_id": "C0015230", "concept_name": NaN}, {"start": 47, "entity_type": "ADR", "end": 59, "entity_id": "*[2]", "entity_text": "глаза опухли", "concept_id": "C4760994", "concept_name": NaN}, {"start": 76, "entity_type": "ADR", "end": 98, "entity_id": "*[3]", "entity_text": "на веках высыпала сыпь", "concept_id": "C0015230", "concept_name": NaN}], "sentence_id": 0}
--------------------------------------------------------------------------------
/data/bsnlp/test_pl_cs_ru_bg/annotated/nord_stream/ru/Nord_Stream_2_extra.xml_file_1.out:
--------------------------------------------------------------------------------
1 | ru-ryanair-new-extra-1
2 | "Газпрому" "Газпром" ORG ORG-Gazprom
3 | Nord stream-2 Nord stream-2 PRO PRO-Nord-Stream-2
4 | Андрей Коболев Андрей Коболев PER PER-Andrey-Kobolev
5 | Брюсселе Брюссель LOC GPE-Brussles
6 | ЕС ЕС ORG ORG-European-Union
7 | Европу Европа LOC LOC-Europe
8 | Климкин Климкин PER PER-Pavel-Klimkin
9 | Линасом Линкявичюсом Линас Линкявичюс PER PER-Linas-Linkavichus
10 | Литвы Литва LOC GPE-Lithuania
11 | МИД Литвы МИД Литвы ORG ORG-Foreign-Office-Lithuania
12 | МИД МИД ORG ORG-Foreign-Office
13 | НАК "Нафтогаз Украины" НАК "Нафтогаз Украины" ORG ORG-Naftogaz
14 | Павел Климкин Павел Климкин PER PER-Pavel-Klimkin
15 | Россией Россия LOC GPE-Russia
16 | России Россия LOC GPE-Russia
17 | Украина Украина LOC GPE-Ukraine
18 | Украину Украина LOC GPE-Ukraine
19 | Украины Украина LOC GPE-Ukraine
20 |
--------------------------------------------------------------------------------
/corus/sources/librusec.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | from corus.record import Record
5 | from corus.io import load_gz_lines
6 |
7 |
8 | class LibrusecRecord(Record):
9 | __attributes__ = ['id', 'text']
10 |
11 | def __init__(self, id, text):
12 | self.id = id
13 | self.text = text
14 |
15 |
16 | def flush(id, buffer):
17 | return LibrusecRecord(id, '\n'.join(buffer))
18 |
19 |
20 | def parse_librusec(lines):
21 | id = None
22 | buffer = []
23 | for line in lines:
24 | match = re.match(r'^(\d+)\.fb2', line)
25 | if match:
26 | if id:
27 | yield flush(id, buffer)
28 | buffer = []
29 | id = match.group(1)
30 | line = line[match.end() + 1:] # extra space
31 | buffer.append(line)
32 | yield flush(id, buffer)
33 |
34 |
35 | def load_librusec(path):
36 | lines = load_gz_lines(path)
37 | return parse_librusec(lines)
38 |
--------------------------------------------------------------------------------
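A usage sketch for load_librusec, assuming the gzipped plain-text dump referenced under data/librusec_fb2.plain.gz above; records are yielded lazily, one book per record, so the dump is never read fully into memory:

from itertools import islice

from corus import load_librusec

records = load_librusec('data/librusec_fb2.plain.gz')
for record in islice(records, 2):
    # LibrusecRecord(id='...', text='...')
    print(record.id, len(record.text))
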
/data/bsnlp/test_pl_cs_ru_bg/annotated/nord_stream/ru/Nord_Stream_2_extra.xml_file_7.out:
--------------------------------------------------------------------------------
1 | ru-ryanair-new-extra-7
2 | Nord Stream AG Nord Stream AG ORG ORG-Nord-Stream-AG
3 | Берлина Берлин LOC GPE-Berlin
4 | Вашингтона Вашингтон LOC GPE-WashingtonDC
5 | Герхарда Шредера Герхард Шредер PER PER-Gerhard-Fritz-Kurt-Schröder
6 | Европарламент Европарламент ORG ORG-European-Parlament
7 | Европе Европа LOC LOC-Europe
8 | Европы Европа LOC LOC-Europe
9 | МИД Германии МИД Германии ORG ORG-Ministry-of-Foreign-Affairs-Germany
10 | МИД ФРГ МИД ФРГ ORG ORG-Federal-Foreign-Office-Germany
11 | Роснефти Роснефть ORG ORG-Rosneft
12 | США США LOC GPE-USA
13 | Северного потока-2 Северный поток-2 PRO PRO-Nord-Stream-2
14 | Северный поток-2 Северный поток-2 PRO PRO-Nord-Stream-2
15 | Северный поток Северный поток PRO PRO-Nord-Stream-1
16 | Украина Украина LOC GPE-Ukraine
17 | ФРГ ФРГ LOC GPE-Germany
18 | Хайко Маас Хайко Маас PER PER-Heiko-Maas
19 | Шредер Шредер PER PER-Gerhard-Fritz-Kurt-Schröder
20 |
--------------------------------------------------------------------------------
/data/factRuEval-2016-master/devset/book_58.tokens:
--------------------------------------------------------------------------------
1 | 89968 0 7 Встреча
2 | 89969 8 1 с
3 | 89970 10 6 послом
4 | 89971 17 6 Италии
5 | 89972 24 1 в
6 | 89973 26 4 миде
7 | 89974 31 6 Грузии
8 |
9 | 89975 39 2 По
10 | 89976 42 10 инициативе
11 | 89977 53 11 итальянской
12 | 89978 65 7 стороны
13 | 89979 73 12 чрезвычайный
14 | 89980 86 1 и
15 | 89981 88 11 полномочный
16 | 89982 100 5 посол
17 | 89983 106 6 Италии
18 | 89984 113 1 в
19 | 89985 115 6 Грузии
20 | 89986 122 7 Виторио
21 | 89987 130 7 Сандали
22 | 89988 138 10 встретился
23 | 89989 149 1 с
24 | 89990 151 12 заместителем
25 | 89991 164 8 министра
26 | 89992 173 11 иностранных
27 | 89993 185 3 дел
28 | 89994 189 6 Грузии
29 | 89995 196 11 Александром
30 | 89996 208 11 Налбандовым
31 | 89997 219 1 .
32 |
33 | 89998 221 9 Предметом
34 | 89999 231 10 обсуждения
35 | 90000 242 5 стали
36 | 90001 248 7 вопросы
37 | 90002 256 14 сотрудничества
38 | 90003 271 1 в
39 | 90004 273 13 международных
40 | 90005 287 12 организациях
41 | 90006 299 1 .
42 |
43 |
--------------------------------------------------------------------------------
/data/morphoru/unamb_sent_14_6.conllu:
--------------------------------------------------------------------------------
1 | 1 « « PUNCT _ _ _ _ _ _
2 | 2 Школа ШКОЛА NOUN _ Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing _ _ _ _
3 | 3 злословия ЗЛОСЛОВИЕ NOUN _ Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing _ _ _ _
4 | 4 » » PUNCT _ _ _ _ _ _
5 | 5 учит УЧИТЬ VERB _ Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Notpast|VerbForm=Fin _ _ _ _
6 | 6 прикусить ПРИКУСИТЬ VERB _ Aspect=Perf|VerbForm=Inf _ _ _ _
7 | 7 язык ЯЗЫК NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing _ _ _ _
8 |
9 | 1 Сохранится СОХРАНИТЬСЯ VERB _ Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Tense=Notpast|VerbForm=Fin _ _ _ _
10 | 2 ли ЛИ PART _ _ _ _ _ _
11 | 3 градус ГРАДУС NOUN _ Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing _ _ _ _
12 | 4 дискуссии ДИСКУССИЯ NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing _ _ _ _
13 | 5 в В ADP _ _ _ _ _ _
14 | 6 новом НОВЫЙ ADJ _ Case=Loc|Gender=Masc|Number=Sing _ _ _ _
15 | 7 сезоне СЕЗОН NOUN _ Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing _ _ _ _
16 | 8 ? ? PUNCT _ _ _ _ _ _
17 |
--------------------------------------------------------------------------------
/corus/sources/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from .factru import load_factru # noqa
3 | from .gareev import load_gareev # noqa
4 | from .lenta import load_lenta, load_lenta2 # noqa
5 | from .librusec import load_librusec # noqa
6 | from .ne5 import load_ne5 # noqa
7 | from .wikiner import load_wikiner # noqa
8 | from .bsnlp import load_bsnlp # noqa
9 | from .persons import load_persons # noqa
10 | from .taiga import * # noqa
11 | from .buriy import * # noqa
12 | from .mokoron import * # noqa
13 | from .wiki import load_wiki # noqa
14 | from .ods import * # noqa
15 | from .ria import * # noqa
16 | from .ud import * # noqa
17 | from .morphoru import * # noqa
18 | from .gramru import load_gramru # noqa
19 | from .corpora import load_corpora # noqa
20 | from .russe import * # noqa
21 | from .toloka import load_toloka_lrwc # noqa
22 | from .simlex import load_simlex # noqa
23 | from .omnia import load_omnia # noqa
24 | from .toloka import load_ruadrect # noqa
25 | from .rudrec import load_rudrec # noqa
26 |
--------------------------------------------------------------------------------
/data/ud/ru_taiga-ud-dev.conllu:
--------------------------------------------------------------------------------
1 | # newpar
2 | # sent_id = instagram-16
3 | # speaker = screened-18
4 | # genre = social
5 | # text = @screened-88 ✅взабраться на статую Христа - только что!😄
6 | 1 @screened-88 @screened-88 X _ Foreign=Yes 3 vocative _ _
7 | 2 ✅ ✅ PUNCT _ _ 3 punct _ SpaceAfter=No
8 | 3 взабраться взобраться VERB _ Aspect=Perf|VerbForm=Inf|Voice=Mid 0 root _ _
9 | 4 на на ADP _ _ 5 case _ _
10 | 5 статую статуя NOUN _ Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing 3 obl _ _
11 | 6 Христа Христос PROPN _ Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing 5 nmod _ _
12 | 7 - - PUNCT _ _ 9 punct _ _
13 | 8 только только PART _ _ 9 advmod _ _
14 | 9 что что PRON _ Case=Nom 3 parataxis _ SpaceAfter=No
15 | 10 ! ! PUNCT _ _ 3 punct _ SpaceAfter=No
16 | 11 😄 😄 SYM _ _ 3 discourse _ _
17 |
18 | # newpar
19 | # sent_id = instagram-17
20 | # speaker = screened-18
21 | # genre = social
22 | # text = @screened-58 😊спасибо
23 | 1 @screened-58 @screened-58 X _ Foreign=Yes 3 vocative _ _
24 | 2 😊 😊 SYM _ _ 3 discourse _ SpaceAfter=No
25 | 3 спасибо спасибо INTJ _ _ 0 root _ _
26 |
--------------------------------------------------------------------------------
/corus/sources/wikiner.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import load_bz2_lines
4 |
5 |
6 | class WikinerToken(Record):
7 | __attributes__ = ['text', 'pos', 'tag']
8 |
9 | def __init__(self, text, pos, tag):
10 | self.text = text
11 | self.pos = pos
12 | self.tag = tag
13 |
14 |
15 | class WikinerMarkup(Record):
16 | __attributes__ = ['tokens']
17 |
18 | def __init__(self, tokens):
19 | self.tokens = tokens
20 |
21 |
22 | def parse_wikiner(line):
23 | if not line:
24 | # skip empty lines
25 | return
26 |
27 | # На|PR|O севере|S|O граничит|V|O с|PR|O Латвией|S|I-LOC
28 | tokens = []
29 | for part in line.split():
30 | text, pos, tag = part.split('|', 2)
31 | token = WikinerToken(text, pos, tag)
32 | tokens.append(token)
33 |
34 | return WikinerMarkup(tokens)
35 |
36 |
37 | def load_wikiner(path):
38 | lines = load_bz2_lines(path)
39 | for line in lines:
40 | record = parse_wikiner(line)
41 | if record:
42 | yield record
43 |
--------------------------------------------------------------------------------
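A sketch of load_wikiner against the WikiNER dump listed under data/aij-wikiner-ru-wp3.bz2; each record is one sentence of pipe-delimited tokens:

from corus import load_wikiner

markups = load_wikiner('data/aij-wikiner-ru-wp3.bz2')
markup = next(markups)
for token in markup.tokens:
    # WikinerToken(text='Латвией', pos='S', tag='I-LOC')
    print(token.text, token.pos, token.tag)
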
/data/factRuEval-2016-master/devset/book_58.spans:
--------------------------------------------------------------------------------
1 | 32962 loc_name 17 6 89971 1 # 89971 Италии
2 | 32963 org_name 26 4 89973 1 # 89973 миде
3 | 32965 loc_name 31 6 89974 1 # 89974 Грузии
4 | 32966 job 10 6 89970 1 # 89970 послом
5 | 64002 job 10 13 89970 2 # 89970 89971 послом Италии
6 | 32951 loc_name 106 6 89983 1 # 89983 Италии
7 | 32952 loc_name 115 6 89985 1 # 89985 Грузии
8 | 32953 name 122 7 89986 1 # 89986 Виторио
9 | 32954 surname 130 7 89987 1 # 89987 Сандали
10 | 32955 loc_name 189 6 89994 1 # 89994 Грузии
11 | 32956 name 196 11 89995 1 # 89995 Александром
12 | 32957 surname 208 11 89996 1 # 89996 Налбандовым
13 | 32958 job 73 32 89979 4 # 89979 89980 89981 89982 чрезвычайный и полномочный посол
14 | 32959 job 151 37 89990 4 # 89990 89991 89992 89993 заместителем министра иностранных дел
15 | 32960 job 164 24 89991 3 # 89991 89992 89993 министра иностранных дел
16 | 32961 job 100 5 89982 1 # 89982 посол
17 | 64007 job 73 48 89979 7 # 89979 89980 89981 89982 89983 89984 89985 чрезвычайный и полномочный посол Италии в Грузии
18 | 64013 job 100 21 89982 4 # 89982 89983 89984 89985 посол Италии в Грузии
19 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/corus/sources/lenta.py:
--------------------------------------------------------------------------------
1 |
2 | from datetime import datetime
3 |
4 | from corus.record import Record
5 | from corus.io import (
6 | load_gz_lines,
7 | load_bz2_lines,
8 | parse_csv,
9 | skip_header
10 | )
11 |
12 |
13 | class LentaRecord(Record):
14 | __attributes__ = ['url', 'title', 'text', 'topic', 'tags', 'date']
15 |
16 | def __init__(self, url, title, text, topic, tags, date=None):
17 | self.url = url
18 | self.title = title
19 | self.text = text
20 | self.topic = topic
21 | self.tags = tags
22 | self.date = date
23 |
24 |
25 | def parse_lenta(lines):
26 | rows = parse_csv(lines)
27 | skip_header(rows)
28 | for cells in rows:
29 | yield LentaRecord(*cells)
30 |
31 |
32 | def parse_lenta2(lines):
33 | for record in parse_lenta(lines):
34 | record.date = datetime.strptime(record.date, '%Y/%m/%d')
35 | yield record
36 |
37 |
38 | def load_lenta(path):
39 | lines = load_gz_lines(path)
40 | return parse_lenta(lines)
41 |
42 |
43 | def load_lenta2(path):
44 | lines = load_bz2_lines(path)
45 | return parse_lenta2(lines)
46 |
--------------------------------------------------------------------------------
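Usage sketch: load_lenta reads the gzipped v1 dump (data/lenta-ru-news.csv.gz above), while load_lenta2 reads the bz2 v2 dump (data/lenta-ru-news.csv.bz2) and additionally parses the date column into a datetime:

from itertools import islice

from corus import load_lenta

records = load_lenta('data/lenta-ru-news.csv.gz')
for record in islice(records, 3):
    # LentaRecord(url=..., title=..., text=..., topic=..., tags=..., date=None)
    print(record.url, record.title)
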
/data/morphoru/RNCgoldInUD_Morpho.conll:
--------------------------------------------------------------------------------
1 | ==> blogs.xhtml <==
2 | ==newfile==
3 | Кстати кстати H _ _
4 | о о ADP _ _
5 | вопросе вопрос NOUN Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing _
6 | " " PUNCT _ _
7 | Пушкин Пушкин NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing NameType=Sur
8 | и и CONJ _ _
9 | святитель святитель NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing _
10 | Филарет Филарет NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing NameType=Giv
11 | , , PUNCT _ _
12 | митрополит митрополит NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing _
13 | Московский московский ADJ Case=Nom|Gender=Masc|Number=Sing|Variant=Full _
14 | "... "... PUNCT _ _
15 | ты ты PRON Case=Nom|Number=Sing|Person=2 _
16 | надумал надумать VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act Subcat=Tran|Aspect=Perf
17 | , , PUNCT _ _
18 | что что PRON Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing _
19 | можно можно ADV Degree=Pos Predic=Yes
20 | сказать сказать VERB VerbForm=Inf|Voice=Act Subcat=Tran|Aspect=Perf
21 | ? ? PUNCT _ _
22 |
23 | Да да PART _ _
24 | ! ! PUNCT _ _
25 |
26 | И и CONJ _ _
27 | чтооо что PRON Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing Typo=Yes
28 | же же PART _ _
29 | ? ? PUNCT _ _
--------------------------------------------------------------------------------
/corus/sources/gareev.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | from corus.path import (
5 | list_dir,
6 | join_path
7 | )
8 | from corus.io import load_lines
9 | from corus.record import Record
10 |
11 |
12 | class GareevToken(Record):
13 | __attributes__ = ['text', 'tag']
14 |
15 | def __init__(self, text, tag):
16 | self.text = text
17 | self.tag = tag
18 |
19 |
20 | class GareevRecord(Record):
21 | __attributes__ = ['tokens']
22 |
23 | def __init__(self, tokens):
24 | self.tokens = tokens
25 |
26 |
27 | def parse_conll(lines):
28 | for line in lines:
29 | text, tag = line.split('\t', 1)
30 | yield GareevToken(text, tag)
31 |
32 |
33 | def parse_gareev(lines):
34 | tokens = list(parse_conll(lines))
35 | return GareevRecord(tokens)
36 |
37 |
38 | def load_id(id, dir):
39 | path = join_path(dir, '%s.txt.iob' % id)
40 | lines = load_lines(path)
41 | return parse_gareev(lines)
42 |
43 |
44 | def list_ids(dir):
45 | for filename in list_dir(dir):
46 | match = re.match(r'^(.+).txt.iob', filename)
47 | if match:
48 | yield match.group(1)
49 |
50 |
51 | def load_gareev(dir):
52 | for id in list_ids(dir):
53 | yield load_id(id, dir)
54 |
--------------------------------------------------------------------------------
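load_gareev takes a directory of *.txt.iob files such as data/rus-ner-news-corpus.iob above and yields one record per file; a minimal sketch:

from corus import load_gareev

records = load_gareev('data/rus-ner-news-corpus.iob')
record = next(records)
for token in record.tokens:
    # GareevToken(text='БИЗНЕС-ТАСС', tag='B-ORG')
    print(token.text, token.tag)
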
/corus/sources/wiki.py:
--------------------------------------------------------------------------------
1 |
2 | from io import StringIO
3 | import json
4 |
5 | from corus.record import Record
6 | from corus.io import load_bz2_lines
7 | from corus.third.WikiExtractor import (
8 | options,
9 | pages_from,
10 | Extractor
11 | )
12 |
13 |
14 | options.write_json = True
15 |
16 |
17 | class WikiRecord(Record):
18 | __attributes__ = ['id', 'url', 'title', 'text']
19 |
20 | def __init__(self, id, url, title, text):
21 | self.id = id
22 | self.url = url
23 | self.title = title
24 | self.text = text
25 |
26 | @classmethod
27 | def from_json(cls, data):
28 | return cls(
29 | id=data['id'],
30 | url=data['url'],
31 | title=data['title'],
32 | text=data['text']
33 | )
34 |
35 |
36 | class Extractor_(Extractor):
37 | def extract_(self):
38 | output = StringIO()
39 | self.extract(output)
40 | return json.loads(output.getvalue())
41 |
42 |
43 | def load_wiki(path):
44 | lines = load_bz2_lines(path)
45 | records = pages_from(lines)
46 | for record in records:
47 | id, revision, title, _, _, page = record
48 | extractor = Extractor_(id, revision, title, page)
49 | data = extractor.extract_()
50 | yield WikiRecord.from_json(data)
51 |
--------------------------------------------------------------------------------
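load_wiki streams pages from a MediaWiki XML dump and depends on the WikiExtractor module vendored under corus/third, whose source is omitted from this listing. A sketch assuming the ruwiki dump referenced under data/:

from itertools import islice

from corus import load_wiki

records = load_wiki('data/ruwiki-latest-pages-articles.xml.bz2')
for record in islice(records, 1):
    # WikiRecord(id=..., url=..., title=..., text=...)
    print(record.title, len(record.text))
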
/data/bsnlp/test_pl_cs_ru_bg/raw/nord_stream/ru/Nord_Stream_2_extra.xml_file_7.txt:
--------------------------------------------------------------------------------
1 | ru-ryanair-new-extra-7
2 | ru
3 | 2019-01-10
4 | https://www.rbc.ru/rbcfreenews/5c379d0b9a79470901bdea3b
5 | МИД ФРГ заявил о невозможности решать в США энергетические вопросы Европы
6 |
7 | Глава МИД Германии Хайко Маас назвал неприемлемым введение санкций США против газопровода Северный поток-2, пишет Вопросы европейской энергетической политики нужно решать в Европе, а не в США. Наложение односторонних санкций против Северного потока-2 точно не тот путь, сказал он.
8 |
9 | В середине декабря палата представителей конгресса США приняла резолюцию против Северного потока-2. В документе газопровод называют радикальным шагом назад для энергетической безопасности Европы. Тогда же Европарламент принял свою резолюцию с призывом прекратить реализацию данного проекта. Европейские депутаты отметили, что в сетях энергоснабжения региона решающую роль играет Украина. По мнению экс-канцлера ФРГ Герхарда Шредера, давление на Северный поток-2 в США оказывают из-за планов Вашингтона стать поставщиком газа для Берлина. Шредер с сентября 2017 года входит в совет директоров Роснефти и является его председателем. После ухода с поста канцлера, Шредер возглавлял наблюдательный совет, а позже комитет акционеров компании Nord Stream AG, созданной для управления магистральным газопроводом Северный поток.
10 |
--------------------------------------------------------------------------------
/corus/sources/russe.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import (
4 | load_lines,
5 | parse_csv,
6 | dict_csv
7 | )
8 |
9 |
10 | class RusseSemRecord(Record):
11 | __attributes__ = ['word1', 'word2', 'sim']
12 |
13 | def __init__(self, word1, word2, sim):
14 | self.word1 = word1
15 | self.word2 = word2
16 | self.sim = sim
17 |
18 |
19 | # word1,word2,related,sim
20 | # автомат,калашникова,assoc,1
21 | # автомат,пулемет,assoc,1
22 | # автомат,пистолет,assoc,1
23 | # автомат,война,assoc,1
24 | # автомат,газ. вода,assoc,1
25 | # автомат,год,random,0
26 | # автомат,человек,random,0
27 | # автомат,время,random,0
28 | # автомат,район,random,0
29 |
30 |
31 | def parse_russe(lines):
32 | records = parse_csv(lines)
33 | items = dict_csv(records)
34 | for item in items:
35 | word1 = item['word1']
36 | word2 = item['word2']
37 | sim = float(item['sim'])
38 | yield RusseSemRecord(word1, word2, sim)
39 |
40 |
41 | def load_russe(path):
42 | lines = load_lines(path)
43 | return parse_russe(lines)
44 |
45 |
46 | def load_russe_hj(path):
47 | return load_russe(path)
48 |
49 |
50 | def load_russe_rt(path):
51 | return load_russe(path)
52 |
53 |
54 | def load_russe_ae(path):
55 | return load_russe(path)
56 |
57 |
58 | __all__ = [
59 | 'load_russe_hj',
60 | 'load_russe_rt',
61 | 'load_russe_ae',
62 | ]
63 |
--------------------------------------------------------------------------------
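The three RUSSE loaders share one parser and differ only in which file they are pointed at (data/russe/sem/hj.csv, rt.csv or the ae files above); a usage sketch:

from corus import load_russe_hj

records = load_russe_hj('data/russe/sem/hj.csv')
for record in records:
    # RusseSemRecord(word1='автомобиль', word2='машина', sim=0.958333)
    print(record.word1, record.word2, record.sim)
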
/corus/sources/taiga/subtitles.py:
--------------------------------------------------------------------------------
1 |
2 | from .common import (
3 | Meta,
4 | load_tar_metas,
5 | load_tar_texts,
6 | parse_filename_id,
7 | merge_metas
8 | )
9 |
10 |
11 | # [{'filepath': 'Heroes - 3x12 - Our Father.HDTV.LOL.en.txt',
12 | # 'id': '8940',
13 | # 'languages': 'en',
14 | # 'title': 'Heroes - 3x12 - Our Father.HDTV.LOL.en.srt'},
15 | # {'filepath': 'Friends - 3x17 - The One Without The Ski Trip.ru.txt',
16 | # 'id': '7553',
17 | # 'languages': 'ru',
18 | # 'title': 'Friends - 3x17 - The One Without The Ski Trip.ru.srt'},
19 |
20 |
21 | def parse_metas(items):
22 | for item in items:
23 | id = parse_filename_id(item['filepath'])
24 | lang = item['languages']
25 | title = item['title']
26 | yield Meta(
27 | id=id,
28 | lang=lang,
29 | title=title
30 | )
31 |
32 |
33 | def load_taiga_subtitles_metas(path, offset=0, count=1):
34 | items = load_tar_metas(path, '*/metatable.csv', offset, count)
35 | return parse_metas(items)
36 |
37 |
38 | # home/tsha/Subtitles/texts/12 Monkeys/12 Monkeys - 1x01 - Splinter.HDTV.KILLERS.en.txt
39 | # home/tsha/Subtitles/texts/12 Monkeys/12 Monkeys - 1x01 - Splinter.HDTV.KILLERS.ru.txt
40 |
41 |
42 | def load_taiga_subtitles(path, metas=None, offset=2113024, count=19011):
43 | records = load_tar_texts(path, '*/texts/*.txt', offset, count)
44 | return merge_metas(records, metas)
45 |
--------------------------------------------------------------------------------
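A sketch of the Taiga subtitles loaders, assuming the tarball under data/taiga/Subtitles.tar.gz above. The offset/count defaults point at where the metadata table and the texts/ members sit inside the original archive; merge_metas (defined in common.py, not shown in this listing) attaches the metadata to the text records:

from corus import load_taiga_subtitles_metas, load_taiga_subtitles

path = 'data/taiga/Subtitles.tar.gz'
metas = load_taiga_subtitles_metas(path)
records = load_taiga_subtitles(path, metas=metas)
print(next(iter(records)))
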
/corus/sources/rudrec.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import (
4 | parse_jsonl,
5 | load_lines
6 | )
7 |
8 |
9 | class RuDReCRecord(Record):
10 | __attributes__ = ['file_name', 'text', 'sentence_id', 'entities']
11 |
12 | def __init__(self, file_name, text, sentence_id, entities):
13 | self.file_name = file_name
14 | self.text = text
15 | self.sentence_id = sentence_id
16 | self.entities = entities
17 |
18 |
19 | class RuDReCEntity(Record):
20 | __attributes__ = [
21 | 'entity_id', 'entity_text', 'entity_type',
22 | 'start', 'end', 'concept_id', 'concept_name'
23 | ]
24 |
25 | def __init__(self, entity_id, entity_text, entity_type, start, end, concept_id, concept_name):
26 | self.entity_id = entity_id
27 | self.entity_text = entity_text
28 | self.entity_type = entity_type
29 | self.start = start
30 | self.end = end
31 | self.concept_id = concept_id
32 | self.concept_name = concept_name
33 |
34 |
35 | def parse_entities(items):
36 | for item in items:
37 | yield RuDReCEntity(
38 | item['entity_id'],
39 | item['entity_text'],
40 | item['entity_type'],
41 | item['start'],
42 | item['end'],
43 | item.get('concept_id'),
44 | item.get('concept_name')
45 | )
46 |
47 |
48 | def parse_rudrec(items):
49 | for item in items:
50 | entities = list(parse_entities(item['entities']))
51 | yield RuDReCRecord(
52 | item['file_name'],
53 | item['text'],
54 | item['sentence_id'],
55 | entities
56 | )
57 |
58 |
59 | def load_rudrec(path):
60 | lines = load_lines(path)
61 | items = parse_jsonl(lines)
62 | return parse_rudrec(items)
63 |
--------------------------------------------------------------------------------
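load_rudrec reads the JSON-lines file under data/rudrec/rudrec_annotated.json above, one annotated sentence per line; a minimal sketch:

from corus import load_rudrec

records = load_rudrec('data/rudrec/rudrec_annotated.json')
record = next(records)
for entity in record.entities:
    # e.g. 'Drugform' 'виферона' for the first entity of the sample line
    print(entity.entity_type, record.text[entity.start:entity.end])
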
/corus/record.py:
--------------------------------------------------------------------------------
1 |
2 | class Record(object):
3 | __attributes__ = []
4 |
5 | def __eq__(self, other):
6 | return (
7 | type(self) == type(other)
8 | and all(
9 | (getattr(self, _) == getattr(other, _))
10 | for _ in self.__attributes__
11 | )
12 | )
13 |
14 | def __ne__(self, other):
15 | return not self == other
16 |
17 | def __iter__(self):
18 | return (getattr(self, _) for _ in self.__attributes__)
19 |
20 | def __hash__(self):
21 | return hash(tuple(self))
22 |
23 | def __repr__(self):
24 | name = self.__class__.__name__
25 | args = ', '.join(
26 | '{key}={value!r}'.format(
27 | key=_,
28 | value=getattr(self, _)
29 | )
30 | for _ in self.__attributes__
31 | )
32 | return '{name}({args})'.format(
33 | name=name,
34 | args=args
35 | )
36 |
37 | def _repr_pretty_(self, printer, cycle):
38 | name = self.__class__.__name__
39 | if cycle:
40 | printer.text('{name}(...)'.format(name=name))
41 | else:
42 | printer.text('{name}('.format(name=name))
43 | keys = self.__attributes__
44 | size = len(keys)
45 | if size:
46 | with printer.indent(4):
47 | printer.break_()
48 | for index, key in enumerate(keys):
49 | printer.text(key + '=')
50 | value = getattr(self, key)
51 | printer.pretty(value)
52 | if index < size - 1:
53 | printer.text(',')
54 | printer.break_()
55 | printer.break_()
56 | printer.text(')')
57 |
--------------------------------------------------------------------------------
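Record is the base class behind every loader's record type: equality, hashing, iteration and repr are all driven by __attributes__. A minimal sketch with a hypothetical subclass (Pair is illustrative, not part of the library):

from corus.record import Record


class Pair(Record):
    __attributes__ = ['word1', 'word2']

    def __init__(self, word1, word2):
        self.word1 = word1
        self.word2 = word2


a = Pair('маг', 'волшебник')
b = Pair('маг', 'волшебник')
assert a == b                # attribute-wise __eq__
assert hash(a) == hash(b)    # __hash__ hashes tuple(self)
word1, word2 = a             # __iter__ yields attributes in order
print(a)                     # Pair(word1='маг', word2='волшебник')
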
/corus/sources/ne5.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | from corus.path import (
5 | list_dir,
6 | join_path
7 | )
8 | from corus.record import Record
9 | from corus.io import load_lines
10 |
11 |
12 | class Ne5Span(Record):
13 | __attributes__ = ['index', 'type', 'start', 'stop', 'text']
14 |
15 | def __init__(self, index, type, start, stop, text):
16 | self.index = index
17 | self.type = type
18 | self.start = start
19 | self.stop = stop
20 | self.text = text
21 |
22 |
23 | class Ne5Markup(Record):
24 | __attributes__ = ['id', 'text', 'spans']
25 |
26 | def __init__(self, id, text, spans):
27 | self.id = id
28 | self.text = text
29 | self.spans = spans
30 |
31 |
32 | def list_ids(dir):
33 | for filename in list_dir(dir):
34 | match = re.match(r'^(.+).txt$', filename)
35 | if match:
36 | yield match.group(1)
37 |
38 |
39 | def txt_path(id, dir):
40 | return join_path(dir, '%s.txt' % id)
41 |
42 |
43 | def ann_path(id, dir):
44 | return join_path(dir, '%s.ann' % id)
45 |
46 |
47 | def parse_spans(lines):
48 | # brat format http://brat.nlplab.org/standoff.html
49 | for line in lines:
50 | index, type, start, stop, text = line.split(None, 4)
51 | start = int(start)
52 | stop = int(stop)
53 | yield Ne5Span(index, type, start, stop, text)
54 |
55 |
56 | def load_text(path):
57 | # do not convert \r\n to \n
58 | with open(path, newline='') as file:
59 | return file.read()
60 |
61 |
62 | def load_id(id, dir):
63 | path = txt_path(id, dir)
64 | text = load_text(path)
65 | path = ann_path(id, dir)
66 | lines = load_lines(path)
67 | spans = list(parse_spans(lines))
68 | return Ne5Markup(id, text, spans)
69 |
70 |
71 | def load_ne5(dir):
72 | for id in list_ids(dir):
73 | yield load_id(id, dir)
74 |
--------------------------------------------------------------------------------
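load_ne5 walks a directory of brat-style .txt/.ann pairs such as data/Collection5 above; a usage sketch:

from corus import load_ne5

markups = load_ne5('data/Collection5')
markup = next(markups)
for span in markup.spans:
    # Ne5Span(index='T1', type='GEOPOLIT', start=0, stop=6, text='Россия')
    print(span.type, markup.text[span.start:span.stop])
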
/data/gramru/GramEval_private_test.conllu:
--------------------------------------------------------------------------------
1 | 1 А а CCONJ _ _ 5 cc _ _
2 | 2 потом потом ADV _ Degree=Pos 5 advmod _ _
3 | 3 мы мы PRON _ Case=Nom|Number=Plur|Person=1 5 nsubj _ _
4 | 4 все весь DET _ Case=Nom|Number=Plur 3 det _ _
5 | 5 погрузились погружаться VERB _ Aspect=Perf|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid 0 root _ _
6 | 6 в в ADP _ _ 7 case _ _
7 | 7 автобус автобус NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 5 obl _ _
8 | 8 и и CCONJ _ _ 9 cc _ _
9 | 9 поехали поехать VERB _ Aspect=Perf|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Act 5 conj _ _
10 | 10 их они PRON _ Case=Acc|Number=Plur|Person=3 9 obj _ _
11 | 11 в в ADP _ _ 12 case _ _
12 | 12 аэропорт аэропорт NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 13 obl _ _
13 | 13 провожать провожать VERB _ Aspect=Imp|VerbForm=Inf|Voice=Act 9 xcomp _ _
14 | 14 . . PUNCT _ _ 13 punct _ _
15 |
16 | 1 Маменька маменька PROPN _ Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing 2 nsubj _ _
17 | 2 сбежала сбежать VERB _ Aspect=Perf|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ _
18 | 3 ? ? PUNCT _ _ 2 punct _ _
19 |
20 | 1 Писано писать VERB _ Aspect=Perf|Gender=Neut|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass 0 root _ _
21 | 2 въ в PROPN _ Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 1 nsubj:pass _ _
22 | 3 нашемъ нашемъ PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 2 nmod _ _
23 | 4 строеніи строеніь NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 3 nmod _ _
24 | 5 въ в PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 4 nmod _ _
25 | 6 Воскресенскомъ Воскресенскомъ PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 5 nmod _ _
26 | 7 монастырѣ монастырѣ NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 2 nmod _ _
27 | 8 въ в PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 7 nmod _ _
28 | 9 Новомъ Новомъ PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 8 nmod _ _
29 |
--------------------------------------------------------------------------------
/corus/sources/persons.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from corus.record import Record
4 | from corus.io import (
5 | list_zip,
6 | load_zip_texts,
7 | parse_xml,
8 | )
9 |
10 |
11 | TEXT = 'text.txt'
12 | ANNO = 'anno.markup.xml'
13 |
14 |
15 | class PersonsSpan(Record):
16 | __attributes__ = ['id', 'start', 'stop', 'value']
17 |
18 | def __init__(self, id, start, stop, value):
19 | self.id = id
20 | self.start = start
21 | self.stop = stop
22 | self.value = value
23 |
24 |
25 | class PersonsMarkup(Record):
26 | __attributes__ = ['text', 'spans']
27 |
28 | def __init__(self, text, spans):
29 | self.text = text
30 | self.spans = spans
31 |
32 |
33 | def list_ids(path):
34 | for name in list_zip(path):
35 | match = re.match(r'^Persons-1000/collection/([^/]+)/text\.txt$', name)
36 | if match:
37 | yield match.group(1)
38 |
39 |
40 | def part_names(ids, part):
41 | for id in ids:
42 | yield 'Persons-1000/collection/%s/%s' % (id, part)
43 |
44 |
45 | def parse_anno(text):
46 | xml = parse_xml(text)
47 | for entry in xml.findall('entry'):
48 | id = int(entry.find('id').text)
49 | start = int(entry.find('offset').text)
50 | size = int(entry.find('length').text)
51 | stop = start + size
52 | attribute = entry.find('attribute')
53 | value = attribute.find('value').text
54 | yield PersonsSpan(id, start, stop, value)
55 |
56 |
57 | def load_ids(ids, path):
58 | names = part_names(ids, TEXT)
59 | texts = load_zip_texts(path, names, 'cp1251')
60 |
61 | names = part_names(ids, ANNO)
62 | annos = load_zip_texts(path, names, 'utf-8')
63 | for text, anno in zip(texts, annos):
64 | spans = list(parse_anno(anno))
65 | yield PersonsMarkup(text, spans)
66 |
67 |
68 | def load_persons(path):
69 | ids = list(list_ids(path))
70 | return load_ids(ids, path)
71 |
--------------------------------------------------------------------------------
/corus/sources/taiga/kp.py:
--------------------------------------------------------------------------------
1 |
2 | from datetime import datetime
3 |
4 | from .common import (
5 | Author,
6 | Meta,
7 | load_tar_metas,
8 | load_tar_texts,
9 | merge_metas
10 | )
11 |
12 |
13 | # {'author': 'Мария ГОШИНА',
14 | # 'authorreaders': '',
15 | # 'authortexts': '',
16 | # 'date': '2017-01-20',
17 | # 'magazine': '',
18 | # 'segment': 'KP',
19 | # 'source': 'http://www.kp.ru/online/news/2632060/',
20 | # 'tags': '',
21 | # 'textdiff': '',
22 | # 'textid': '10@2632060',
23 | # 'textname': 'В Саратове спасатели помогли родственникам попасть в квартиру пенсионерки',
24 | # 'textregion': 'www.saratov.kp.ru',
25 | # 'textrubric': 'Общество>Общество',
26 | # 'time': '09:27:00+03:00'},
27 |
28 |
29 | def parse_metas(items):
30 | for item in items:
31 | id = item['textid']
32 | timestamp = item['date'] + item['time'][:8]
33 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d%H:%M:%S')
34 |
35 | name = item['author'] or None
36 | author = Author(name=name)
37 |
38 | rubric = item['textrubric']
39 | title = item['textname']
40 | url = item['source']
41 | yield Meta(
42 | id=id,
43 | timestamp=timestamp,
44 | rubric=rubric,
45 | author=author,
46 | title=title,
47 | url=url
48 | )
49 |
50 |
51 | def load_taiga_kp_metas(path, offset=0, count=1):
52 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count)
53 | return parse_metas(items)
54 |
55 |
56 | # home/tsha/KP/texts/10@2598286.txt
57 | # home/tsha/KP/texts/10@2598287.txt
58 | # home/tsha/KP/texts/10@2598289.txt
59 |
60 |
61 | def load_taiga_kp(path, metas=None, offset=13042176, count=45503):
62 | records = load_tar_texts(path, '*/texts/*.txt', offset, count)
63 | return merge_metas(records, metas)
64 |
65 |
66 | __all__ = [
67 | 'load_taiga_kp_metas',
68 | 'load_taiga_kp'
69 | ]
70 |
71 | # Usage sketch (not part of the original module): data/taiga/KP.tar.gz is the
72 | # sample archive shipped in this repo, and the same metas-then-texts pattern
73 | # applies to the other taiga loaders.
74 | #
75 | #   path = 'data/taiga/KP.tar.gz'
76 | #   metas = load_taiga_kp_metas(path)
77 | #   records = load_taiga_kp(path, metas)
78 |
--------------------------------------------------------------------------------
/corus/sources/taiga/interfax.py:
--------------------------------------------------------------------------------
1 |
2 | from datetime import datetime
3 |
4 | from .common import (
5 | Meta,
6 | load_tar_metas,
7 | load_tar_texts,
8 | merge_metas
9 | )
10 |
11 |
12 | # {'author': '',
13 | # 'authorreaders': '',
14 | # 'authortexts': '',
15 | # 'date': '2013-02-24',
16 | # 'magazine': '',
17 | # 'segment': 'Interfax',
18 | # 'source': 'http://www.interfax.ru/russia/292151',
19 | # 'tags': 'Кубань',
20 | # 'textdiff': '',
21 | # 'textid': 'russia292151',
22 | # 'textname': '60 тысяч жителей Туапсинского района остались без электричества',
23 | # 'textregion': '',
24 | # 'textrubric': 'В России',
25 | # 'time': '16:10'},
26 |
27 |
28 | def parse_metas(items):
29 | for item in items:
30 | id = item['textid']
31 |
32 | timestamp = item['date'] + item['time']
33 | try:
34 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d%H:%M')
35 | except ValueError:
36 | # rare, date='' time='2011-09-12'
37 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d')
38 |
39 | title = item['textname']
40 | tags = item['tags']
41 | rubric = item.get('textrubric')
42 | url = item['source']
43 | yield Meta(
44 | id=id,
45 | timestamp=timestamp,
46 | title=title,
47 | rubric=rubric,
48 | tags=tags,
49 | url=url
50 | )
51 |
52 |
53 | def load_taiga_interfax_metas(path, offset=0, count=1):
54 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count)
55 | return parse_metas(items)
56 |
57 |
58 | # home/tsha/Interfax/texts/business225067.txt
59 | # home/tsha/Interfax/texts/business225113.txt
60 | # home/tsha/Interfax/texts/business225178.txt
61 |
62 |
63 | def load_taiga_interfax(path, metas=None, offset=11447296, count=46429):
64 | records = load_tar_texts(path, '*/texts/*.txt', offset, count)
65 | return merge_metas(records, metas)
66 |
67 |
68 | __all__ = [
69 | 'load_taiga_interfax_metas',
70 | 'load_taiga_interfax'
71 | ]
72 |
--------------------------------------------------------------------------------
/corus/sources/ria.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | from corus.record import Record
5 | from corus.io import (
6 | load_gz_lines,
7 | parse_jsonl
8 | )
9 |
10 |
11 | class RiaRawRecord(Record):
12 | __attributes__ = ['title', 'text']
13 |
14 | def __init__(self, title, text):
15 | self.title = title
16 | self.text = text
17 |
18 |
19 | class RiaRecord(Record):
20 | __attributes__ = ['title', 'prefix', 'text']
21 |
22 | def __init__(self, title, prefix, text):
23 | self.title = title
24 | self.prefix = prefix
25 | self.text = text
26 |
27 |
28 | def parse_ria_raw(lines):
29 | records = parse_jsonl(lines)
30 | for record in records:
31 | yield RiaRawRecord(
32 | record['title'],
33 | record['text']
34 | )
35 |
36 |
37 | def load_ria_raw(path):
38 | lines = load_gz_lines(path)
39 | return parse_ria_raw(lines)
40 |
41 |
42 | def untag(text):
43 | return re.sub(r'<[^>]+>', '', text)
44 |
45 |
46 | def unescape(text):
47 | text = text.replace('&lt;', '<')
48 | text = text.replace('&gt;', '>')
49 | text = text.replace('&amp;', '&')
50 | text = text.replace('&ndash;', '-')
51 | text = text.replace('&nbsp;', ' ')
52 | return text
53 |
54 |
55 | def first_sent(text):
56 | # москва, 31 янв - риа новости.
57 | # фарнборо (великобритания), 21 июл - риа новости, александр смотров.
58 | index = text.find('. ') # len('. ')
59 | if index > 0:
60 | index += 2
61 | sent, suffix = text[:index], text[index:]
62 | if 'риа новости' in sent and len(sent) < 70:
63 | sent = sent.strip()
64 | return sent, suffix
65 | return None, text
66 |
67 |
68 | def parse_ria(records):
69 | for record in records:
70 | text = record.text
71 | text = untag(text)
72 | text = unescape(text)
73 | prefix, text = first_sent(text)
74 | yield RiaRecord(
75 | record.title,
76 | prefix,
77 | text
78 | )
79 |
80 |
81 | def load_ria(path):
82 | records = load_ria_raw(path)
83 | return parse_ria(records)
84 |
85 |
86 | __all__ = [
87 | 'load_ria_raw',
88 | 'load_ria'
89 | ]
90 |
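91 | # Usage sketch (not part of the original module): data/ria.json.gz is the sample
92 | # archive shipped in this repo; prefix holds the agency lead detected by
93 | # first_sent, e.g. 'москва, 31 янв - риа новости.', or None.
94 | #
95 | #   for record in load_ria('data/ria.json.gz'):
96 | #       print(record.prefix, record.title)
97 |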
--------------------------------------------------------------------------------
/corus/sources/buriy.py:
--------------------------------------------------------------------------------
1 |
2 | import tarfile
3 | from io import TextIOWrapper
4 | from datetime import datetime
5 |
6 | from corus.record import Record
7 | from corus.io import (
8 | parse_csv,
9 | skip_header,
10 | )
11 |
12 |
13 | class BuriyRecord(Record):
14 | __attributes__ = ['timestamp', 'url', 'edition', 'topics', 'title', 'text']
15 |
16 | def __init__(self, timestamp, url, edition, topics, title, text):
17 | self.timestamp = timestamp
18 | self.url = url
19 | self.edition = edition
20 | self.topics = topics
21 | self.title = title
22 | self.text = text
23 |
24 |
25 | def load_tar(path, encoding='utf8'):
26 | with tarfile.open(path) as tar:
27 | for member in tar:
28 | if not member.isfile():
29 | continue
30 | file = tar.extractfile(member)
31 | yield TextIOWrapper(file, encoding)
32 |
33 |
34 | def parse_timestamp(timestamp):
35 | for pattern in ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d']:
36 | try:
37 | return datetime.strptime(timestamp, pattern)
38 | except ValueError:
39 | continue
40 |
41 |
42 | def maybe_none(value, none=('',)):
43 | if value in none:
44 | return
45 | return value
46 |
47 |
48 | def parse_buriy(lines, max_text=10000000):
49 | rows = parse_csv(lines, max_field=max_text)
50 | skip_header(rows)
51 | for row in rows:
52 | timestamp, url, edition, topics, title, text = row
53 | timestamp = parse_timestamp(timestamp)
54 | edition = maybe_none(edition, ('', '-'))
55 | topics = maybe_none(topics)
56 | yield BuriyRecord(
57 | timestamp=timestamp,
58 | url=url,
59 | edition=edition,
60 | topics=topics,
61 | title=title,
62 | text=text
63 | )
64 |
65 |
66 | def load_buriy(path):
67 | for lines in load_tar(path):
68 | for record in parse_buriy(lines):
69 | yield record
70 |
71 |
72 | def load_buriy_news(path):
73 | return load_buriy(path)
74 |
75 |
76 | def load_buriy_webhose(path):
77 | return load_buriy(path)
78 |
79 |
80 | __all__ = [
81 | 'load_buriy_news',
82 | 'load_buriy_webhose'
83 | ]
84 |
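85 | # Usage sketch (not part of the original module): the path is one of the archives
86 | # listed under data/buriy/ in this repo.
87 | #
88 | #   for record in load_buriy_news('data/buriy/news-articles-2014.tar.bz2'):
89 | #       print(record.timestamp, record.edition, record.title)
90 |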
--------------------------------------------------------------------------------
/corus/sources/taiga/lenta.py:
--------------------------------------------------------------------------------
1 |
2 | from datetime import datetime
3 |
4 | from .common import (
5 | Meta,
6 | load_tar_metas,
7 | load_tar_texts,
8 | patch_month,
9 | merge_metas
10 | )
11 |
12 |
13 | # {'author': '',
14 | # 'authorreaders': '',
15 | # 'authortexts': '',
16 | # 'date': '8 марта 2011',
17 | # 'magazine': '',
18 | # 'segment': 'Lenta',
19 | # 'source': 'https://lenta.ru/news/2011/03/08/hobgoblin/',
20 | # 'tags': '',
21 | # 'textdiff': '',
22 | # 'textid': '20110308hobgoblin',
23 | # 'textname': 'HBO запустит сериал о волшебной войне с Гитлером',
24 | # 'textregion': '',
25 | # 'textrubric': 'Культура',
26 | # 'time': '14:33'},
27 |
28 |
29 | LENTA_MONTHS = {
30 | 'января': 'Jan',
31 | 'февраля': 'Feb',
32 | 'марта': 'Mar',
33 | 'апреля': 'Apr',
34 | 'мая': 'May',
35 | 'июня': 'Jun',
36 | 'июля': 'Jul',
37 | 'августа': 'Aug',
38 | 'сентября': 'Sep',
39 | 'октября': 'Oct',
40 | 'ноября': 'Nov',
41 | 'декабря': 'Dec',
42 | }
43 |
44 |
45 | def parse_metas(items):
46 | for item in items:
47 | id = item['textid']
48 |
49 | date, time, timestamp = item['date'], item['time'], None
50 | if date and time:
51 | timestamp = patch_month(date, LENTA_MONTHS) + time
52 | timestamp = datetime.strptime(timestamp, '%d %b %Y%H:%M')
53 |
54 | title = item['textname']
55 | rubric = item['textrubric']
56 | url = item['source'] or None
57 | yield Meta(
58 | id=id,
59 | timestamp=timestamp,
60 | title=title,
61 | rubric=rubric,
62 | url=url
63 | )
64 |
65 |
66 | def load_taiga_lenta_metas(path, offset=0, count=1):
67 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count)
68 | return parse_metas(items)
69 |
70 |
71 | # home/tsha/Lenta/texts/20100101three.txt
72 | # home/tsha/Lenta/texts/20100101tomsk.txt
73 | # home/tsha/Lenta/texts/20100101urus.txt
74 |
75 |
76 | def load_taiga_lenta(path, metas=None, offset=12800000, count=36446):
77 | records = load_tar_texts(path, '*/texts/*.txt', offset, count)
78 | return merge_metas(records, metas)
79 |
80 |
81 | __all__ = [
82 | 'load_taiga_lenta_metas',
83 | 'load_taiga_lenta'
84 | ]
85 |
--------------------------------------------------------------------------------
/data/bsnlp/test_pl_cs_ru_bg/raw/nord_stream/ru/Nord_Stream_2_extra.xml_file_1.txt:
--------------------------------------------------------------------------------
1 | ru-ryanair-new-extra-1
2 | ru
3 | 2019-01-10
4 | https://www.epravda.com.ua/rus/news/2019/01/10/644185/
5 | Климкин рассказал, чего ожидает от газовых переговоров с Россией в Брюсселе
6 |
7 | Украина во время второго раунда трехсторонних переговоров в Брюсселе будет настаивать на том, что транзит газа должен происходить в соответствии с нормами европейского законодательства. Об этом заявил министр иностранных дел Украины Павел Климкин на совместном брифинге с главой МИД Литвы Линасом Линкявичюсом в четверг, 10 января, передает
8 |
9 | "Наша позиция относительно будущего транзита, а также по европейской энергетической безопасности предельно четкая: мы готовы выполнить все соответствующие европейские регуляторные нормы для нашей газотранспортной системы. Мы считаем, что будущий транзит должен базироваться на прозрачных и эффективных регуляторах ЕС", - сказал Климкин. "Это означает, например, что газ будет покупаться на восточной границе Украины, что транзитный тариф будет рассчитываться по европейской методологии. То есть общий смысл нашей позиции - транзит должно происходить в соответствии с европейским законодательством ", - отметил глава МИД.
10 |
11 | Он подчеркнул, что общей позицией Украины и Литвы является то, что требования европейского законодательства должны быть распространены и на Nord stream-2, что фундаментально повысит европейскую энергетическую безопасность против России. Подытоживая, Климкин подчеркнул, что Украина готова к конструктивному диалогу о будущем транзита газа в Европу, но он должен базироваться на очень четких условиях, самое главное из которых - соответствие нормам европейского законодательства для обеспечения четких, эффективных и прозрачных предпосылок транзита. "Поэтому во время переговоров в Брюсселе 21 января мы будем исходить из этих очень простых, но очень важных требований", - отметил Климкин. Ранее глава НАК "Нафтогаз Украины" Андрей Коболев заявил о готовности Украины, при согласовании этого вопроса правительством, обсудить возможность просмотра или отказа от второго транзитного иска к российскому "Газпрому" на сумму более 12 млрд долларов в привязке с заключением нового долгосрочного контракта. Действующий контракт по транзиту российского газа через Украину заканчивается в конце 2019 года.
12 |
--------------------------------------------------------------------------------
/corus/sources/taiga/magazines.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from datetime import datetime
4 |
5 | from .common import (
6 | Meta,
7 | load_tar_metas,
8 | load_tar_texts,
9 | merge_metas,
10 | )
11 |
12 |
13 | # {'author': '',
14 | # 'authorreaders': '',
15 | # 'authortexts': '',
16 | # 'date': '2007, 10',
17 | # 'magazine': 'Знамя',
18 | # 'segment': 'Журнальный зал',
19 | # 'source': 'http://magazines.russ.ru/znamia/2007/10/tu26.html',
20 | # 'tags': '',
21 | # 'textdiff': '',
22 | # 'textid': '50005',
23 | # 'textname': 'Михаил Копелиович. Рецензия – любовь моя',
24 | # 'textregion': '',
25 | # 'textrubric': 'article',
26 | # 'time': ''},
27 |
28 |
29 | def parse_metas(items):
30 | for item in items:
31 | id = item.get('textid')
32 | if not id:
33 | continue
34 |
35 | timestamp = item.get('date')
36 | if timestamp:
37 | try:
38 | timestamp = datetime.strptime(timestamp, '%Y, %m')
39 | except ValueError:
40 | # rare 2002, 7-8
41 | pass
42 |
43 | title = item['textname'] or None
44 | rubric = item.get('textrubric') or None
45 |
46 | url = None
47 | if 'source' in item:
48 | url = item['source']
49 | match = re.search(r'russ\.ru/([^/]+)', url)
50 | label = match.group(1)
51 | id = label + '_' + id
52 |
53 | yield Meta(
54 | id=id,
55 | timestamp=timestamp,
56 | title=title,
57 | rubric=rubric,
58 | url=url
59 | )
60 |
61 |
62 | def load_taiga_magazines_metas(path, offset=0, count=36):
63 | items = load_tar_metas(path, '*/corpus_*_metadata.csv', offset, count)
64 | return parse_metas(items)
65 |
66 |
67 | # home/tsha/Magazines/texts/corpus_arion_10658.txt
68 | # home/tsha/Magazines/texts/corpus_arion_10659.txt
69 |
70 |
71 | def parse_magazines_id(name):
72 | match = re.search(r'corpus_([\d\w_]+)\.txt', name)
73 | return match.group(1)
74 |
75 |
76 | def load_taiga_magazines(path, metas=None, offset=7292416, count=39890):
77 | records = load_tar_texts(path, '*/texts/*.txt', offset, count)
78 | return merge_metas(records, metas)
79 |
80 |
81 | __all__ = [
82 | 'load_taiga_magazines_metas',
83 | 'load_taiga_magazines'
84 | ]
85 |
--------------------------------------------------------------------------------
/corus/sources/toloka.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import (
4 | load_lines,
5 | parse_tsv,
6 | skip_header,
7 | )
8 |
9 |
10 | class LRWCRecord(Record):
11 | __attributes__ = ['hyponym', 'hypernym', 'genitive', 'judgement', 'confidence']
12 |
13 | def __init__(self, hyponym, hypernym, genitive, judgement, confidence):
14 | self.hyponym = hyponym
15 | self.hypernym = hypernym
16 | self.genitive = genitive
17 | self.judgement = judgement
18 | self.confidence = confidence
19 |
20 |
21 | # INPUT:hyponym INPUT:hypernym INPUT:genitive OUTPUT:judgement CONFIDENCE:judgement
22 | # автомобиль автомашина автомашины true 99.75%
23 | # автомобиль автомототранспорт автомототранспорта true 99.96%
24 | # автомобиль автомототранспортный автомототранспортного true 99.99%
25 |
26 |
27 | def parse_judgement(value):
28 | if value == 'true':
29 | return 1.0
30 | elif value == 'false':
31 | return 0.0
32 |
33 |
34 | def parse_confidence(value):
35 | return float(value[:-1])
36 |
37 |
38 | def parse_toloka_lrwc(lines):
39 | skip_header(lines)
40 | records = parse_tsv(lines)
41 | for record in records:
42 | hyponym, hypernym, genitive, judgement, confidence = record
43 | judgement = parse_judgement(judgement)
44 | confidence = parse_confidence(confidence)
45 | yield LRWCRecord(hyponym, hypernym, genitive, judgement, confidence)
46 |
47 |
48 | def load_toloka_lrwc(path):
49 | lines = load_lines(path)
50 | return parse_toloka_lrwc(lines)
51 |
52 |
53 | class RuADReCTRecord(Record):
54 | __attributes__ = ['tweet_id', 'tweet', 'label']
55 |
56 | def __init__(self, tweet_id, tweet, label):
57 | self.tweet_id = tweet_id
58 | self.tweet = tweet
59 | self.label = label
60 |
61 | # – tweet_id: unique id of the message in the twitter system;
62 | # – tweet: text of the message (tweet);
63 | # – label: tweet class, 1 - contains a mention of a side effect, 0 - does not
64 |
65 |
66 | def parse_ruadrect(lines):
67 | rows = parse_tsv(lines)
68 | skip_header(rows)
69 | for cells in rows:
70 | yield RuADReCTRecord(*cells)
71 |
72 |
73 | def load_ruadrect(path):
74 | lines = load_lines(path)
75 | return parse_ruadrect(lines)
76 |
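77 | # Usage sketch (not part of the original module): the paths point at the sample
78 | # files under data/toloka/ in this repo.
79 | #
80 | #   records = load_toloka_lrwc('data/toloka/lrwc-1.1-aggregated.tsv')
81 | #   records = load_ruadrect('data/toloka/ruadrect/task2_ru_test.tsv')
82 |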
--------------------------------------------------------------------------------
/corus/sources/taiga/nplus1.py:
--------------------------------------------------------------------------------
1 |
2 | from datetime import datetime
3 |
4 | from .common import (
5 | Author,
6 | Meta,
7 | load_tar_metas,
8 | load_tar_texts,
9 | patch_month,
10 | merge_metas,
11 | )
12 |
13 |
14 | # {'author': 'Владимир Королев',
15 | # 'authorreaders': '',
16 | # 'authortexts': '',
17 | # 'date': '21 Янв. 2017',
18 | # 'magazine': '',
19 | # 'segment': 'nplus1',
20 | # 'source': 'https://nplus1.ru/news/2017/01/21/Asphaltene-3d',
21 | # 'tags': '',
22 | # 'textdiff': '5.2',
23 | # 'textid': '20170121Asphaltene-3d',
24 | # 'textname': '«Архипелаги» асфальтенов ощупали в 3D',
25 | # 'textregion': '',
26 | # 'textrubric': 'Наука',
27 | # 'time': '17:34'},
28 |
29 |
30 | NPLUS1_MONTHS = {
31 | 'Янв.': 'Jan',
32 | 'Фев.': 'Feb',
33 | 'Март': 'Mar',
34 | 'Апр.': 'Apr',
35 | 'Май': 'May',
36 | 'Июнь': 'Jun',
37 | 'Июль': 'Jul',
38 | 'Авг.': 'Aug',
39 | 'Сен.': 'Sep',
40 | 'Окт.': 'Oct',
41 | 'Нояб.': 'Nov',
42 | 'Дек.': 'Dec',
43 | }
44 |
45 |
46 | def parse_metas(items):
47 | for item in items:
48 | id = item['textid']
49 |
50 | timestamp, date, time = None, item['date'], item['time']
51 | if date and time:
52 | timestamp = patch_month(date, NPLUS1_MONTHS) + time
53 | timestamp = datetime.strptime(timestamp, '%d %b %Y%H:%M')
54 |
55 | name = item['author'] or None
56 | author = Author(name=name)
57 |
58 | title = item['textname']
59 | rubric = item['textrubric'] or None
60 | url = item['source']
61 | yield Meta(
62 | id=id,
63 | timestamp=timestamp,
64 | author=author,
65 | title=title,
66 | rubric=rubric,
67 | url=url
68 | )
69 |
70 |
71 | def load_taiga_nplus1_metas(path, offset=0, count=1):
72 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count)
73 | return parse_metas(items)
74 |
75 |
76 | # home/tsha/NPlus1/texts/20150320drone.txt
77 | # home/tsha/NPlus1/texts/20150320nitrogen.txt
78 | # home/tsha/NPlus1/texts/20150320silica.txt
79 |
80 |
81 | def load_taiga_nplus1(path, metas=None, offset=1919488, count=7696):
82 | records = load_tar_texts(path, '*/texts/*.txt', offset, count)
83 | return merge_metas(records, metas)
84 |
85 |
86 | __all__ = [
87 | 'load_taiga_nplus1_metas',
88 | 'load_taiga_nplus1'
89 | ]
90 |
--------------------------------------------------------------------------------
/corus/sources/taiga/arzamas.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from datetime import datetime
4 |
5 | from .common import (
6 | Author,
7 | Meta,
8 | load_tar_metas,
9 | load_tar_texts,
10 | merge_metas,
11 | )
12 |
13 |
14 | # {'About_author': '',
15 | # 'Author_profession': 'Кандидат исторических наук. Креативный директор Фонда Егора Гайдара. Один из\xa0участников сетевого проекта «Прожито», создающего полный электронный корпус дневников советского времени.',
16 | # 'URL': 'http://arzamas.academy/mag/427-chapter7Историк советской литературы и\xa0культуры',
17 | # 'author': 'Илья Венявкин',
18 | # 'id': '427',
19 | # 'source': 'Arzamas',
20 | # 'tags': "['Документ', 'СССР']",
21 | # 'textdate': '27.04.2017',
22 | # 'theme': "['Литература', 'История']",
23 | # 'title': 'Советский писатель внутри Большого террора. Глава 7 • '}
24 |
25 |
26 | def parse_metas(items):
27 | for item in items:
28 | id = item['id']
29 | timestamp = datetime.strptime(item['textdate'], '%d.%m.%Y')
30 | tags = eval(item['tags'])
31 | themes = eval(item['theme'])
32 | name = item['author'] or None
33 | profession = item['Author_profession'] or None
34 | about = item['About_author'] or None
35 | author = Author(
36 | name=name,
37 | profession=profession,
38 | about=about
39 | )
40 | title = item['title'].strip(u'• ')
41 | url = item['URL']
42 | yield Meta(
43 | id=id,
44 | timestamp=timestamp,
45 | tags=tags,
46 | themes=themes,
47 | author=author,
48 | title=title,
49 | url=url
50 | )
51 |
52 |
53 | def load_taiga_arzamas_metas(path, offset=0, count=1):
54 | items = load_tar_metas(path, '*/metatable.csv', offset, count)
55 | return parse_metas(items)
56 |
57 |
58 | # home/tsha/Arzamas/texts/arzamas_449.txt
59 | # home/tsha/Arzamas/texts/arzamas_450.txt
60 | # home/tsha/Arzamas/texts/arzamas_452.txt
61 |
62 |
63 | def parse_id(name):
64 | match = re.search(r'arzamas_(\d+)\.txt', name)
65 | return match.group(1)
66 |
67 |
68 | def load_taiga_arzamas(path, metas=None, offset=144896, count=311):
69 | records = load_tar_texts(path, '*/texts/*.txt', offset, count, parse_id)
70 | return merge_metas(records, metas)
71 |
72 |
73 | __all__ = [
74 | 'load_taiga_arzamas_metas',
75 | 'load_taiga_arzamas'
76 | ]
77 |
--------------------------------------------------------------------------------
/data/mokoron/db.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.13 Distrib 5.6.12, for osx10.6 (x86_64)
2 | --
3 | -- Host: localhost Database: neu
4 | -- ------------------------------------------------------
5 | -- Server version 5.6.12
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
17 |
18 | --
19 | -- Table structure for table `sentiment`
20 | --
21 |
22 | DROP TABLE IF EXISTS `sentiment`;
23 | /*!40101 SET @saved_cs_client = @@character_set_client */;
24 | /*!40101 SET character_set_client = utf8 */;
25 | CREATE TABLE `sentiment` (
26 | `id` bigint(32) NOT NULL AUTO_INCREMENT,
27 | `tdate` varchar(128) DEFAULT NULL,
28 | `tname` varchar(128) DEFAULT NULL,
29 | `ttext` varchar(256) DEFAULT NULL,
30 | `ttype` int(10) DEFAULT '0',
31 | `trep` int(10) DEFAULT '0',
32 | `trtw` int(10) DEFAULT '0',
33 | `tfav` int(10) DEFAULT '0',
34 | `tstcount` int(10) DEFAULT '0',
35 | `tfoll` int(10) DEFAULT '0',
36 | `tfrien` int(10) DEFAULT '0',
37 | `listcount` int(10) DEFAULT '0',
38 | PRIMARY KEY (`id`)
39 | ) ENGINE=MyISAM AUTO_INCREMENT=441644379397451777 DEFAULT CHARSET=utf8;
40 | /*!40101 SET character_set_client = @saved_cs_client */;
41 |
42 | --
43 | -- Dumping data for table `sentiment`
44 | --
45 |
46 | LOCK TABLES `sentiment` WRITE;
47 | /*!40000 ALTER TABLE `sentiment` DISABLE KEYS */;
48 | INSERT INTO `sentiment` VALUES (408906695721877504,'1386325928','Va5ilina','Пропавшая в Хабаровске школьница почти сутки провела в яме у коллектор',2,0,0,0,183,95,158,0),(408906695700520960,'1386325928','i_wont_judge_ya','ЛЕНТА, Я СЕГОДНЯ ПОЛГОДА ДИРЕКШИОНЕЕЕЕР! С:\nХОТЯ ВСЕ РАВНО НИКТО НЕ ПОЗДРАВИТ ЛОЛ',2,0,0,0,19809,804,257,11)
49 | INSERT INTO `sentiment` VALUES (410005806927847424,'1386587976','Victorika_nya','Открытые аудиозаписи нужны, чтобы прийти в гости и включить их ^.^',2,0,0,0,426,12,20,0),(408906695663161344,'1386325928','victorypanasenk','Царствие Божие внутрь вас есть.',2,0,0,0,1080,986,412,0)
50 |
--------------------------------------------------------------------------------
/corus/zip.py:
--------------------------------------------------------------------------------
1 |
2 | from collections import namedtuple
3 |
4 | import zlib
5 | from io import BytesIO
6 | from struct import (
7 | calcsize,
8 | unpack
9 | )
10 |
11 |
12 | def open_zip(path):
13 | return open(path, 'rb')
14 |
15 |
16 | # File: APPNOTE.TXT - .ZIP File Format Specification
17 | # https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
18 |
19 | # 4.3.7 Local file header:
20 | # local file header signature 4 bytes (0x04034b50)
21 | # version needed to extract 2 bytes
22 | # general purpose bit flag 2 bytes
23 | # compression method 2 bytes
24 | # last mod file time 2 bytes
25 | # last mod file date 2 bytes
26 | # crc-32 4 bytes
27 | # compressed size 4 bytes
28 | # uncompressed size 4 bytes
29 | # file name length 2 bytes
30 | # extra field length 2 bytes
31 |
32 | # file name (variable size)
33 | # extra field (variable size)
34 |
35 |
36 | HEADER_FORMAT = '<4s5HL2L2H'
37 | HEADER_SIGNATURE = b'PK\x03\x04'
38 |
39 | NO_COMPRESSION = 0
40 | DEFLATED = 8
41 |
42 |
43 | ZipHeader = namedtuple(
44 | 'ZipHeader',
45 | ['signature', 'extract_by', 'flags', 'compression',
46 | 'time', 'date', 'crc', 'compressed', 'uncompressed',
47 | 'name', 'extra']
48 | )
49 |
50 |
51 | def decode_name(name):
52 | # flags == 0 (see assert_zip_header), so the UTF-8 name flag (bit 11) is unset and cp437 applies
53 | return name.decode('cp437')
54 |
55 |
56 | def read_zip_header(file):
57 | size = calcsize(HEADER_FORMAT)
58 | buffer = file.read(size)
59 | if len(buffer) < size:
60 | return
61 |
62 | data = unpack(HEADER_FORMAT, buffer)
63 | header = ZipHeader._make(data)
64 | if not is_zip_header(header):
65 | return
66 |
67 | assert_zip_header(header)
68 | name = file.read(header.name)
69 | header = header._replace(name=decode_name(name))
70 | file.read(header.extra) # skip extra
71 | return header
72 |
73 |
74 | def is_zip_header(record):
75 | return record.signature == HEADER_SIGNATURE
76 |
77 |
78 | def assert_zip_header(record):
79 | assert record.flags == 0, record.flags
80 | assert record.compression in (NO_COMPRESSION, DEFLATED), record.compression
81 |
82 |
83 | def read_zip_data(file, header):
84 | data = file.read(header.compressed)
85 | if header.compression == DEFLATED:
86 | data = zlib.decompress(data, -15)
87 | # TODO Maybe do buffered reading to save memory
88 | return BytesIO(data)
89 |
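90 | # Reading-loop sketch (not part of the original module): local file headers are
91 | # read sequentially until the signature check fails at the central directory;
92 | # 'archive.zip' is a hypothetical path.
93 | #
94 | #   file = open_zip('archive.zip')
95 | #   while True:
96 | #       header = read_zip_header(file)
97 | #       if header is None:
98 | #           break
99 | #       data = read_zip_data(file, header)  # BytesIO with the member contents
100 | #       print(header.name, len(data.getvalue()))
101 |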
--------------------------------------------------------------------------------
/data/ud/ru_gsd-ud-dev.conllu:
--------------------------------------------------------------------------------
1 | # sent_id = dev-s3
2 | # text = Он и являлся'' полным властелином всей Ахсауской местности'' и родоначальником Телакуровых, построивших здесь свой замок.
3 | 1 Он он PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3 3 nsubj _ _
4 | 2 и и PART UH _ 3 advmod _ _
5 | 3 являлся являться VERB VBC Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid 0 root _ SpaceAfter=No
6 | 4 '' '' PUNCT `` _ 6 punct _ _
7 | 5 полным полный ADJ JJL Case=Ins|Degree=Pos|Gender=Masc|Number=Sing 6 amod _ _
8 | 6 властелином властелин NOUN NN Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing 3 xcomp _ _
9 | 7 всей весь DET DT Case=Gen|Gender=Fem|Number=Sing 9 det _ _
10 | 8 Ахсауской ахсауский ADJ JJL Case=Gen|Degree=Pos|Gender=Fem|Number=Sing 9 amod _ _
11 | 9 местности местность NOUN NN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 6 nmod _ SpaceAfter=No
12 | 10 '' '' PUNCT '' _ 6 punct _ _
13 | 11 и и CCONJ CC _ 12 cc _ _
14 | 12 родоначальником родоначальник NOUN NN Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing 6 conj _ _
15 | 13 Телакуровых Телакуров PROPN NNP Animacy=Anim|Case=Gen|Gender=Masc|Number=Plur 12 nmod _ SpaceAfter=No
16 | 14 , , PUNCT , _ 15 punct _ _
17 | 15 построивших построить VERB VBNL Animacy=Anim|Aspect=Perf|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act 13 acl _ _
18 | 16 здесь здесь ADV RB Degree=Pos 15 advmod _ _
19 | 17 свой свой DET PRP$ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 18 det _ _
20 | 18 замок замок NOUN NN Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 15 obj _ SpaceAfter=No
21 | 19 . . PUNCT . _ 6 punct _ _
22 |
23 | # sent_id = dev-s4
24 | # text = Сержант посоветовал Баклсу пойти на работу водителем машины скорой помощи.
25 | 1 Сержант сержант NOUN NN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing 2 nsubj _ _
26 | 2 посоветовал посоветовать VERB VBC Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ _
27 | 3 Баклсу Баклс PROPN NNP Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing 2 iobj _ _
28 | 4 пойти пойти VERB VB Aspect=Perf|VerbForm=Inf|Voice=Act 2 xcomp _ _
29 | 5 на на ADP IN _ 6 case _ _
30 | 6 работу работа NOUN NN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing 4 obl _ _
31 | 7 водителем водитель NOUN NN Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing 4 xcomp _ _
32 | 8 машины машина NOUN NN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 7 nmod _ _
33 | 9 скорой скорый ADJ JJL Case=Gen|Degree=Pos|Gender=Fem|Number=Sing 10 amod _ _
34 | 10 помощи помощь NOUN NN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 8 nmod _ SpaceAfter=No
35 | 11 . . PUNCT . _ 2 punct _ _
36 |
--------------------------------------------------------------------------------
/corus/sources/morphoru.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import load_lines
4 |
5 | from .ud import group_sents, parse_row, parse_feats
6 |
7 |
8 | class MorphoSent(Record):
9 | __attributes__ = ['tokens', 'attrs']
10 |
11 | def __init__(self, tokens, attrs=()):
12 | self.tokens = tokens
13 | self.attrs = attrs
14 |
15 |
16 | class MorphoToken(Record):
17 | __attributes__ = ['text', 'lemma', 'pos', 'feats', 'feats2']
18 |
19 | def __init__(self, text, lemma, pos, feats, feats2=None):
20 | self.text = text
21 | self.lemma = lemma
22 | self.pos = pos
23 | self.feats = feats
24 | self.feats2 = feats2
25 |
26 |
27 | def parse_morphoru(lines, parse_sent):
28 | for group in group_sents(lines):
29 | tokens = list(parse_sent(group))
30 | yield MorphoSent(tokens)
31 |
32 |
33 | def parse_morphoru_gicrya_sent(lines):
34 | for line in lines:
35 | _, text, lemma, pos, feats = parse_row(line)
36 | feats = dict(parse_feats(feats))
37 | yield MorphoToken(text, lemma, pos, feats)
38 |
39 |
40 | def parse_morphoru_corpora_sent(lines):
41 | for line in lines:
42 | parts = parse_row(line)
43 | _, text, lemma, pos, _, feats = parts[:6]
44 | feats = dict(parse_feats(feats))
45 | yield MorphoToken(text, lemma, pos, feats)
46 |
47 |
48 | def parse_morphoru_rnc(lines):
49 | # ==> blogs.xhtml <==
50 | # ==newfile==
51 | # Кстати кстати H _ _
52 | # о о ADP _ _
53 |
54 | for group in group_sents(lines):
55 | attrs, tokens = [], []
56 | for line in group:
57 | if line.startswith('=='):
58 | attrs.append(line)
59 | else:
60 | _, text, lemma, pos, feats, feats2 = parse_row(line)
61 | feats = dict(parse_feats(feats))
62 | feats2 = dict(parse_feats(feats2))
63 | token = MorphoToken(text, lemma, pos, feats, feats2)
64 | tokens.append(token)
65 | yield MorphoSent(tokens, attrs)
66 |
67 |
68 | def load_morphoru_gicrya(path):
69 | lines = load_lines(path)
70 | return parse_morphoru(lines, parse_morphoru_gicrya_sent)
71 |
72 |
73 | def load_morphoru_rnc(path):
74 | lines = load_lines(path)
75 | return parse_morphoru_rnc(lines)
76 |
77 |
78 | def load_morphoru_corpora(path):
79 | lines = load_lines(path)
80 | return parse_morphoru(lines, parse_morphoru_corpora_sent)
81 |
82 |
83 | __all__ = [
84 | 'load_morphoru_gicrya',
85 | 'load_morphoru_rnc',
86 | 'load_morphoru_corpora'
87 | ]
88 |
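89 | # Usage sketch (not part of the original module): the paths point at the sample
90 | # files under data/morphoru/; which file goes with which loader is assumed from
91 | # the file names.
92 | #
93 | #   sents = load_morphoru_gicrya('data/morphoru/gikrya_new_test.out')
94 | #   sents = load_morphoru_rnc('data/morphoru/RNCgoldInUD_Morpho.conll')
95 | #   sents = load_morphoru_corpora('data/morphoru/unamb_sent_14_6.conllu')
96 |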
--------------------------------------------------------------------------------
/data/ud/ru_syntagrus-ud-dev.conllu:
--------------------------------------------------------------------------------
1 | # sent_id = 2013Algoritm.xml_16
2 | # text = Различные определения алгоритма в явной или неявной форме содержат следующий ряд общих требований:
3 | 1 Различные различный ADJ _ Case=Nom|Degree=Pos|Number=Plur 2 amod 2:amod _
4 | 2 определения определение NOUN _ Animacy=Inan|Case=Nom|Gender=Neut|Number=Plur 9 nsubj 9:nsubj _
5 | 3 алгоритма алгоритм NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 2 nmod 2:nmod _
6 | 4 в в ADP _ _ 8 case 8:case _
7 | 5 явной явный ADJ _ Case=Loc|Degree=Pos|Gender=Fem|Number=Sing 8 amod 8:amod _
8 | 6 или или CCONJ _ _ 7 cc 7:cc _
9 | 7 неявной неявный ADJ _ Case=Loc|Degree=Pos|Gender=Fem|Number=Sing 5 conj 5:conj _
10 | 8 форме форма NOUN _ Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing 9 obl 9:obl _
11 | 9 содержат содержать VERB _ Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root 0:root _
12 | 10 следующий следующий ADJ _ Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing 11 amod 11:amod _
13 | 11 ряд ряд NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 9 obj 9:obj _
14 | 12 общих общий ADJ _ Case=Gen|Degree=Pos|Number=Plur 13 amod 13:amod _
15 | 13 требований требование NOUN _ Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur 11 nmod 11:nmod SpaceAfter=No
16 | 14 : : PUNCT _ _ 9 punct 9:punct _
17 |
18 | # sent_id = 2013Algoritm.xml_17
19 | # text = - Дискретность - алгоритм должен представлять процесс решения задачи как последовательное выполнение некоторых простых шагов.
20 | 1 - - PUNCT _ _ 2 punct 2:punct _
21 | 2 Дискретность дискретность NOUN _ Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing 0 root 0:root _
22 | 3 - - PUNCT _ _ 5 punct 5:punct _
23 | 4 алгоритм алгоритм NOUN _ Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 5 nsubj 5:nsubj _
24 | 5 должен должен ADJ _ Degree=Pos|Gender=Masc|Number=Sing|Variant=Short 2 parataxis 2:parataxis _
25 | 6 представлять представлять VERB _ Aspect=Imp|VerbForm=Inf|Voice=Act 5 xcomp 5:xcomp _
26 | 7 процесс процесс NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 6 obj 6:obj _
27 | 8 решения решение NOUN _ Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing 7 nmod 7:nmod _
28 | 9 задачи задача NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 8 nmod 8:nmod _
29 | 10 как как SCONJ _ _ 12 mark 12:mark _
30 | 11 последовательное последовательный ADJ _ Case=Acc|Degree=Pos|Gender=Neut|Number=Sing 12 amod 12:amod _
31 | 12 выполнение выполнение NOUN _ Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing 6 advcl 6:advcl _
32 | 13 некоторых некоторый DET _ Case=Gen|Number=Plur 15 det 15:det _
33 | 14 простых простой ADJ _ Case=Gen|Degree=Pos|Number=Plur 15 amod 15:amod _
34 | 15 шагов шаг NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur 12 nmod 12:nmod SpaceAfter=No
35 | 16 . . PUNCT _ _ 2 punct 2:punct _
36 |
--------------------------------------------------------------------------------
/data/ud/ru_pud-ud-test.conllu:
--------------------------------------------------------------------------------
1 | # newdoc id = n01010
2 | # sent_id = n01010042
3 | # text = «Был момент, — сказал господин Панвалкар, — когда он чувствовал, что они должны покинуть здание».
4 | # english_text = There was a time, Mr Panvalkar said, when he felt that they should leave the building.
5 | 1 « « PUNCT `` _ 2 punct _ SpaceAfter=No
6 | 2 Был быть AUX VBC Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ _
7 | 3 момент момент NOUN NN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 2 nsubj _ SpaceAfter=No
8 | 4 , , PUNCT , _ 6 punct _ _
9 | 5 — — PUNCT - _ 6 punct _ OrigForm=--
10 | 6 сказал сказать VERB VBC Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 2 parataxis _ _
11 | 7 господин господин NOUN NN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing 6 nsubj _ _
12 | 8 Панвалкар Панвалкар PROPN NNP Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing 7 flat:name _ SpaceAfter=No
13 | 9 , , PUNCT , _ 6 punct _ _
14 | 10 — — PUNCT - _ 6 punct _ OrigForm=--
15 | 11 когда когда SCONJ IN _ 13 mark _ _
16 | 12 он он PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3 13 nsubj _ _
17 | 13 чувствовал чувствовать VERB VBC Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 3 advcl _ SpaceAfter=No
18 | 14 , , PUNCT , _ 17 punct _ _
19 | 15 что что SCONJ IN _ 17 mark _ _
20 | 16 они они PRON PRP Case=Nom|Number=Plur|Person=3 17 nsubj _ _
21 | 17 должны должен ADJ JJH Degree=Pos|Number=Plur|Variant=Short 13 ccomp _ _
22 | 18 покинуть покинуть VERB VB Aspect=Perf|VerbForm=Inf|Voice=Act 17 xcomp _ _
23 | 19 здание здание NOUN NN Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing 18 obj _ SpaceAfter=No
24 | 20 » » PUNCT ' _ 2 punct _ SpaceAfter=No
25 | 21 . . PUNCT . _ 2 punct _ _
26 |
27 | # newdoc id = n01011
28 | # sent_id = n01011004
29 | # text = Ей также предъявлено обвинение в покушении на убийство ее двухлетней дочери.
30 | # english_text = She has also been charged with trying to kill her two-year-old daughter.
31 | 1 Ей она PRON PRP Case=Dat|Gender=Fem|Number=Sing|Person=3 3 iobj _ _
32 | 2 также также ADV RB Degree=Pos 3 advmod _ _
33 | 3 предъявлено предъявить VERB VBNH Aspect=Perf|Gender=Neut|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass 0 root _ _
34 | 4 обвинение обвинение NOUN NN Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing 3 nsubj:pass _ _
35 | 5 в в ADP IN _ 6 case _ _
36 | 6 покушении покушение NOUN NN Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing 4 nmod _ _
37 | 7 на на ADP IN _ 8 case _ _
38 | 8 убийство убийство NOUN NN Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing 6 nmod _ _
39 | 9 ее ее DET PRP$ _ 11 det _ _
40 | 10 двухлетней двухлетний ADJ JJ Case=Gen|Degree=Pos|Gender=Fem|Number=Sing 11 amod _ _
41 | 11 дочери дочь NOUN NN Animacy=Anim|Case=Gen|Gender=Fem|Number=Sing 8 nmod _ SpaceAfter=No
42 | 12 . . PUNCT . _ 3 punct _ _
43 |
--------------------------------------------------------------------------------
/corus/sources/taiga/social.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from itertools import islice as head
4 |
5 | from corus.record import Record
6 | from corus.io import match_names
7 |
8 | from .common import (
9 | load_tar,
10 | parse_filename_id
11 | )
12 |
13 |
14 | FB = 'fb'
15 | LJ = 'lj'
16 | TWITTER = 'twitter'
17 | VK = 'vk'
18 | NETWORKS = {
19 | 'fbtexts': FB,
20 | 'LiveJournalPostsandcommentsGICR': LJ,
21 | 'twtexts': TWITTER,
22 | 'vktexts': VK
23 | }
24 |
25 |
26 | class TaigaSocialRecord(Record):
27 | __attributes__ = ['id', 'network', 'text']
28 |
29 | def __init__(self, id, network, text):
30 | self.id = id
31 | self.network = network
32 | self.text = text
33 |
34 |
35 | def parse_lines(file, encoding='utf8'):
36 | for line in file:
37 | line = line.decode(encoding)
38 | yield line.lstrip('\ufeff').rstrip('\r\n')
39 |
40 |
41 | def parse_lj(file):
42 | for text in parse_lines(file):
43 | yield TaigaSocialRecord(
44 | id=None,
45 | network=LJ,
46 | text=text
47 | )
48 |
49 |
50 | # DataBaseItem: 6_5756d99b5dd2dc3dac164155
51 | # Кстати, как неожиданно КПРФ
52 | # DataBaseItem: 6_5756d9a85dd2dc3dac1645ae
53 | # [id12890229|Евгений], можно и по-другому сказать: "убогая клоунада" КПРФ - это
54 |
55 |
56 | def flush(network, id, buffer):
57 | text = '\n'.join(buffer)
58 | return TaigaSocialRecord(
59 | id=id,
60 | network=network,
61 | text=text
62 | )
63 |
64 |
65 | def parse_social_(file, network):
66 | lines = parse_lines(file)
67 | previous = None
68 | buffer = []
69 | for line in lines:
70 | match = re.match(r'^DataBaseItem: (.+)$', line)
71 | if match:
72 | if previous:
73 | yield flush(network, previous, buffer)
74 | buffer = []
75 | previous = match.group(1)
76 | else:
77 | buffer.append(line)
78 | if previous:
79 | yield flush(network, previous, buffer)
80 | buffer = []
81 |
82 |
83 | def parse_social(file, network):
84 | if network == LJ:
85 | return parse_lj(file)
86 | else:
87 | return parse_social_(file, network)
88 |
89 |
90 | def load_taiga_social(path, offset=3985892864, count=4):
91 | records = load_tar(path, offset=offset)
92 | records = match_names(records, '*/texts/*.txt')
93 | records = head(records, count)
94 | for record in records:
95 | network = parse_filename_id(record.name)
96 | network = NETWORKS[network]
97 | for record in parse_social(record.file, network):
98 | yield record
99 |
100 |
101 | __all__ = [
102 | 'load_taiga_social'
103 | ]
104 |
--------------------------------------------------------------------------------
/corus/sources/taiga/proza.py:
--------------------------------------------------------------------------------
1 |
2 | from datetime import datetime
3 |
4 | from .common import (
5 | Author,
6 | Meta,
7 | load_zip_metas,
8 | load_zip_texts,
9 | merge_metas
10 | )
11 |
12 |
13 | # {'URL': 'http://www.stihi.ru/2015/12/31/9302',
14 | # 'author': 'Макс Майер-Младший',
15 | # 'author_readers': '26',
16 | # 'author_texts': '2085',
17 | # 'authorlink': 'http://www.stihi.ru/avtor/380979994453',
18 | # 'date': '31.12.2015',
19 | # 'genre': 'лирика',
20 | # 'path': '/home/tsha/stihi_ru/texts/2015/12/20151231001.txt',
21 | # 'textid': '20151231001',
22 | # 'time': '23:56',
23 | # 'title': 'Ти знов являЕшся менi у снi',
24 | # 'topic': 'любовная лирика'}
25 |
26 |
27 | def parse_metas(items):
28 | for item in items:
29 | id = item['textid']
30 |
31 | timestamp = item['date'] + item['time']
32 | timestamp = datetime.strptime(timestamp, '%d.%m.%Y%H:%M')
33 |
34 | name = item['author']
35 | readers = item['author_readers'] or None
36 | if readers:
37 | readers = int(readers)
38 | texts = item['author_texts'] or None
39 | if texts:
40 | texts = int(texts)
41 | url = item['authorlink']
42 | author = Author(
43 | name=name,
44 | readers=readers,
45 | texts=texts,
46 | url=url
47 | )
48 |
49 | genre = item['genre']
50 | topic = item['topic']
51 | title = item['title']
52 | url = item['URL']
53 | yield Meta(
54 | id=id,
55 | timestamp=timestamp,
56 | author=author,
57 | genre=genre,
58 | topic=topic,
59 | title=title,
60 | url=url
61 | )
62 |
63 |
64 | def load_taiga_proza_metas(path, offset=0, count=2017 - 2005 + 1):
65 | items = load_zip_metas(path, '*/metatable_texts.txt', offset, count)
66 | return parse_metas(items)
67 |
68 |
69 | def load_taiga_stihi_metas(path, offset=0, count=2017 - 2015 + 1):
70 | items = load_zip_metas(path, '*/metatable_texts.txt', offset, count)
71 | return parse_metas(items)
72 |
73 |
74 | # proza_ru/home/tsha/proza_ru/tagged_texts/2015/12/20151231005.txt
75 | # proza_ru/home/tsha/proza_ru/texts/2015/12/20151231005.txt
76 |
77 |
78 | def load_taiga_proza(path, metas=None, offset=51432715409, count=1732589):
79 | records = load_zip_texts(path, '*/texts/*.txt', offset, count)
80 | return merge_metas(records, metas)
81 |
82 |
83 | def load_taiga_stihi(path, metas=None, offset=22304202421, count=9157973):
84 | records = load_zip_texts(path, '*/texts/*.txt', offset, count)
85 | return merge_metas(records, metas)
86 |
87 |
88 | __all__ = [
89 | 'load_taiga_proza_metas',
90 | 'load_taiga_proza',
91 | 'load_taiga_stihi_metas',
92 | 'load_taiga_stihi',
93 | ]
94 |
--------------------------------------------------------------------------------
/corus/sources/taiga/fontanka.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from datetime import datetime
4 |
5 | from .common import (
6 | Meta,
7 | load_tar_metas,
8 | load_tar_texts,
9 | merge_metas
10 | )
11 |
12 |
13 | # {'author': '',
14 | # 'authorreaders': '',
15 | # 'authortexts': '',
16 | # 'date': '04.04.2015',
17 | # 'magazine': '',
18 | # 'segment': 'Fontanka',
19 | # 'tags': 'Санкт-Петербург, Петербург, СПб, фонтанка, фонтанка.ру, АЖУР, Агентство Журналистских расследований, СМИ, новости, новости Петербурга, политика, экономика, криминал, Фонтанка, информация, события, город, культура, политика, бизнес, общество, происшествия, спорт, свободное время, авто, недвижимость, зарубежная недвижимость, Охта центр, финансы, туризм, работа, особое мнениеhttp://www.fontanka.ru/2015/04/04/068/',
20 | # 'textdiff': '',
21 | # 'textid': '20150404068',
22 | # 'textname': 'Минобороны: Россия не отстает от США в разработке лазерного оружия',
23 | # 'textregion': '',
24 | # 'textrubric': 'Технологии',
25 | # 'time': '20:59'},
26 |
27 |
28 | def parse_metas(items):
29 | for item in items:
30 | id = item['textid']
31 | # {'segment': 'Fontanka', 'textname': '"', 'textid': '20100205145'}
32 | timestamp, tags, url, rubric, title = None, (), None, None, None
33 |
34 | if 'date' in item and 'time' in item:
35 | timestamp = item['date'] + item['time']
36 | if timestamp:
37 | timestamp = datetime.strptime(timestamp, '%d.%m.%Y%H:%M')
38 |
39 | if 'tags' in item:
40 | tags = item['tags']
41 | match = re.search(r'(http://.+)$', tags)
42 | if match:
43 | url = match.group(1)
44 | tags = re.split(r',\s+', tags[:match.start()])
45 |
46 | rubric = item.get('textrubric')
47 | title = item.get('textname')
48 | yield Meta(
49 | id=id,
50 | timestamp=timestamp,
51 | tags=tags,
52 | rubric=rubric,
53 | title=title,
54 | url=url
55 | )
56 |
57 |
58 | def load_taiga_fontanka_metas(path, offset=0, count=2017 - 2005 + 1):
59 | items = load_tar_metas(path, '*/metatable_*.csv', offset, count)
60 | return parse_metas(items)
61 |
62 |
63 | # home/tsha/Fontanka/texts/2007/fontanka_20070101001.txt
64 | # home/tsha/Fontanka/texts/2007/fontanka_20070101002.txt
65 | # home/tsha/Fontanka/texts/2007/fontanka_20070101004.txt
66 | # home/tsha/Fontanka/texts/2007/fontanka_20070101003.txt
67 |
68 |
69 | def parse_id(name):
70 | match = re.search(r'fontanka_(\d+)\.txt', name)
71 | return match.group(1)
72 |
73 |
74 | def load_taiga_fontanka(path, metas=None, offset=306359296, count=342683):
75 | records = load_tar_texts(path, '*/texts/*.txt', offset, count, parse_id)
76 | return merge_metas(records, metas)
77 |
78 |
79 | __all__ = [
80 | 'load_taiga_fontanka_metas',
81 | 'load_taiga_fontanka'
82 | ]
83 |
--------------------------------------------------------------------------------
/corus/io.py:
--------------------------------------------------------------------------------
1 |
2 | import gzip
3 | import bz2
4 | from zipfile import ZipFile
5 |
6 | import csv
7 | import json
8 |
9 | import xml.etree.ElementTree as ET
10 |
11 | from fnmatch import fnmatch as match_pattern
12 |
13 |
14 | #######
15 | #
16 | # UTILS
17 | #
18 | #######
19 |
20 |
21 | def match_names(records, pattern):
22 | for record in records:
23 | if match_pattern(record.name, pattern):
24 | yield record
25 |
26 |
27 | #######
28 | #
29 | # TEXT
30 | #
31 | ########
32 |
33 |
34 | def rstrip(text):
35 | return text.rstrip('\r\n')
36 |
37 |
38 | def load_text(path):
39 | with open(path) as file:
40 | return file.read()
41 |
42 |
43 | def dump_text(text, path):
44 | with open(path, 'w') as file:
45 | file.write(text)
46 |
47 |
48 | def load_lines(path, encoding="utf-8"):
49 | with open(path, encoding=encoding) as file:
50 | for line in file:
51 | yield rstrip(line)
52 |
53 |
54 | #####
55 | #
56 | # XML
57 | #
58 | ######
59 |
60 |
61 | def parse_xml(text):
62 | return ET.fromstring(text)
63 |
64 |
65 | #########
66 | #
67 | # GZ, BZ, XZ
68 | #
69 | #####
70 |
71 |
72 | def load_z_lines(path, open, encoding='utf8'):
73 | with open(path, mode='rt', encoding=encoding) as file:
74 | for line in file:
75 | yield rstrip(line)
76 |
77 |
78 | def load_gz_lines(path):
79 | return load_z_lines(path, gzip.open)
80 |
81 |
82 | def load_bz2_lines(path):
83 | return load_z_lines(path, bz2.open)
84 |
85 |
86 | def load_xz_lines(path):
87 | # Python may be built without lzma support
88 | # https://github.com/pandas-dev/pandas/issues/27532
89 | import lzma
90 |
91 | return load_z_lines(path, lzma.open)
92 |
93 |
94 | #######
95 | #
96 | # ZIP
97 | #
98 | ########
99 |
100 |
101 | def list_zip(path):
102 | with ZipFile(path) as zip:
103 | return zip.namelist()
104 |
105 |
106 | def load_zip_lines(path, name, encoding='utf8'):
107 | with ZipFile(path) as zip:
108 | with zip.open(name) as file:
109 | for line in file:
110 | yield rstrip(line.decode(encoding))
111 |
112 |
113 | def load_zip_texts(path, names, encoding='utf8'):
114 | with ZipFile(path) as zip:
115 | for name in names:
116 | with zip.open(name) as file:
117 | yield file.read().decode(encoding)
118 |
119 |
120 | ########
121 | #
122 | # CSV
123 | #
124 | #######
125 |
126 |
127 | def parse_csv(lines, delimiter=',', max_field=None):
128 | if max_field:
129 | csv.field_size_limit(max_field)
130 | return csv.reader(lines, delimiter=delimiter)
131 |
132 |
133 | def parse_tsv(lines):
134 | return parse_csv(lines, delimiter='\t')
135 |
136 |
137 | def skip_header(rows):
138 | return next(rows)
139 |
140 |
141 | def dict_csv(rows):
142 | header = next(rows)
143 | for row in rows:
144 | yield dict(zip(header, row))
145 |
146 |
147 | #########
148 | #
149 | # JSONL
150 | #
151 | #######
152 |
153 |
154 | def parse_jsonl(lines):
155 | for line in lines:
156 | yield json.loads(line)
157 |
--------------------------------------------------------------------------------
/corus/sources/mokoron.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from datetime import datetime
4 |
5 | from corus.record import Record
6 | from corus.io import load_lines
7 |
8 |
9 | # – id: unique id of the message in the twitter system;
10 | # – tdate: publication date of the message (tweet);
11 | # – tname: name of the user who posted the message;
12 | # – ttext: text of the message (tweet);
13 | # – ttype: field that will later hold the tweet class (positive, negative, neutral);
14 | # – trep: number of replies to the message. The twitter API currently does not return this information;
15 | # – tfav: how many times the message was added to favourites by other users;
16 | # – tstcount: total number of the user's messages on twitter;
17 | # – tfoll: number of the user's followers (the people who read the user);
18 | # – tfrien: number of the user's friends (the people whom the user reads);
19 | # – listcount: number of subscription lists the twitter user has been added to.
20 |
21 |
22 | class MokoronRecord(Record):
23 | __attributes__ = [
24 | 'id', 'timestamp', 'user', 'text', 'sentiment',
25 | 'replies', 'retweets', 'favourites', 'posts',
26 | 'followers', 'friends', 'lists'
27 | ]
28 |
29 | def __init__(self, id, timestamp, user, text, sentiment,
30 | replies, retweets, favourites, posts, followers, friends, lists):
31 | self.id = id
32 | self.timestamp = timestamp
33 | self.user = user
34 | self.text = text
35 | self.sentiment = sentiment
36 | self.replies = replies
37 | self.retweets = retweets
38 | self.favourites = favourites
39 | self.posts = posts
40 | self.followers = followers
41 | self.friends = friends
42 | self.lists = lists
43 |
44 | @classmethod
45 | def from_match(cls, match):
46 | dict = match.groupdict()
47 | for key in ['id', 'sentiment', 'replies', 'retweets',
48 | 'favourites', 'posts', 'followers', 'friends', 'lists']:
49 | dict[key] = int(dict[key])
50 | dict['timestamp'] = datetime.utcfromtimestamp(float(dict['timestamp']))
51 | return cls(**dict)
52 |
53 |
54 | # INSERT INTO `sentiment` VALUES (408906695721877504,'1386325928','Va5ilina','Пропавшая в Хабаровске школьница почти сутки провела в яме у коллектор',2,0,0,0,183,95,158,0),(408906695700520960,'1386325928','i_wont_judge_ya','ЛЕНТА, Я СЕГОДНЯ ПОЛГОДА ДИРЕКШИОНЕЕЕЕР! С:\nХОТЯ ВСЕ РАВНО НИКТО НЕ ПОЗДРАВИТ ЛОЛ',2,0,0,0,19809,804,257,11),
55 |
56 |
57 | INSERT = 'INSERT INTO `sentiment` VALUES'
58 | RECORD = re.compile(r'''
59 | \(
60 | (?P<id>\d+),
61 | '(?P<timestamp>\d+)',
62 | '(?P<user>.+?)',
63 | '(?P<text>.+?)',
64 | (?P<sentiment>\d+),
65 | (?P<replies>\d+),
66 | (?P<retweets>\d+),
67 | (?P<favourites>\d+),
68 | (?P<posts>\d+),
69 | (?P<followers>\d+),
70 | (?P<friends>\d+),
71 | (?P<lists>\d+)
72 | \)
73 | ''', re.X)
74 |
75 |
76 | def load_mokoron(path):
77 | for line in load_lines(path):
78 | if line.startswith(INSERT):
79 | for match in RECORD.finditer(line):
80 | yield MokoronRecord.from_match(match)
81 |
82 |
83 | __all__ = [
84 | 'load_mokoron'
85 | ]
86 |
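87 | # Usage sketch (not part of the original module): data/mokoron/db.sql is the
88 | # sample MySQL dump shipped in this repo; records are pulled from the INSERT
89 | # statements by the RECORD regex above.
90 | #
91 | #   for record in load_mokoron('data/mokoron/db.sql'):
92 | #       print(record.user, record.sentiment, record.text)
93 |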
--------------------------------------------------------------------------------
/corus/sources/ud.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import load_lines
4 |
5 |
6 | class UDSent(Record):
7 | __attributes__ = ['id', 'text', 'attrs', 'tokens']
8 |
9 | def __init__(self, id, text, attrs, tokens):
10 | self.id = id
11 | self.text = text
12 | self.attrs = attrs
13 | self.tokens = tokens
14 |
15 |
16 | class UDToken(Record):
17 | __attributes__ = ['id', 'text', 'lemma', 'pos', 'feats', 'head_id', 'rel']
18 |
19 | def __init__(self, id, text, lemma, pos, feats, head_id, rel):
20 | self.id = id
21 | self.text = text
22 | self.lemma = lemma
23 | self.pos = pos
24 | self.feats = feats
25 | self.head_id = head_id
26 | self.rel = rel
27 |
28 |
29 | def group_sents(lines):
30 | buffer = []
31 | for line in lines:
32 | if not line:
33 | yield buffer
34 | buffer = []
35 | else:
36 | buffer.append(line)
37 | if buffer:
38 | yield buffer
39 |
40 |
41 | def parse_feats(tags):
42 | if not tags:
43 | return
44 |
45 | for pair in tags.split('|'):
46 | key, value = pair.split('=', 1)
47 | yield key, value
48 |
49 |
50 | def _none(value):
51 | if value == '_':
52 | return
53 | return value
54 |
55 |
56 | def parse_row(line):
57 | return [_none(_) for _ in line.split('\t')]
58 |
59 |
60 | def parse_attr(line):
61 | # newdoc
62 | # title = instagram-2019
63 | # newpar
64 | # sent_id = instagram-1
65 | # speaker = screened-18
66 |
67 | line = line.lstrip('# ')
68 | if ' = ' in line:
69 | return line.split(' = ', 1)
70 | else:
71 | return line, None
72 |
73 |
74 | def parse_token(line):
75 | id, text, lemma, pos, _, feats, head_id, rel, _, _ = parse_row(line)
76 | feats = dict(parse_feats(feats))
77 | return UDToken(id, text, lemma, pos, feats, head_id, rel)
78 |
79 |
80 | def parse_ud(lines):
81 | # newdoc id = n01001
82 | # sent_id = n01001011
83 | # text = «Если передача цифровых технологий сегодня
84 | # 1 « « PUNCT `` _ 19 punct _ SpaceA
85 | # 2 Если если SCONJ IN _ 9 mark _ _
86 | # 3 передача передача NOUN NN Animacy=Inan|Case=N
87 |
88 | for group in group_sents(lines):
89 | attrs = {}
90 | tokens = []
91 | for line in group:
92 | if line.startswith('#'):
93 | key, value = parse_attr(line)
94 | attrs[key] = value
95 | else:
96 | token = parse_token(line)
97 | tokens.append(token)
98 |
99 | id = attrs.pop('sent_id', None)
100 | text = attrs.pop('text', None)
101 | yield UDSent(id, text, attrs, tokens)
102 |
103 |
104 | def load_ud(path):
105 | lines = load_lines(path)
106 | return parse_ud(lines)
107 |
108 |
109 | def load_ud_gsd(path):
110 | return load_ud(path)
111 |
112 |
113 | def load_ud_taiga(path):
114 | return load_ud(path)
115 |
116 |
117 | def load_ud_pud(path):
118 | return load_ud(path)
119 |
120 |
121 | def load_ud_syntag(path):
122 | return load_ud(path)
123 |
124 |
125 | __all__ = [
126 | 'load_ud_gsd',
127 | 'load_ud_taiga',
128 | 'load_ud_pud',
129 | 'load_ud_syntag',
130 | ]
131 |
--------------------------------------------------------------------------------
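A minimal usage sketch for the UD loaders, assuming ru_gsd-ud-dev.conllu has been fetched per the Universal Dependencies instructions in corus/sources/meta.py; all four load_ud_* wrappers share the same CoNLL-U parser:

from corus import load_ud_gsd

for sent in load_ud_gsd('ru_gsd-ud-dev.conllu'):
    print(sent.id, sent.text)
    for token in sent.tokens:
        print(token.id, token.text, token.lemma, token.pos, token.head_id, token.rel)
    break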
/corus/sources/ods.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | from datetime import datetime
4 |
5 | from corus.record import Record
6 | from corus.io import (
7 | load_gz_lines,
8 | parse_csv,
9 | skip_header
10 | )
11 |
12 |
13 | class NewsRecord(Record):
14 | __attributes__ = [
15 | 'timestamp', 'url', 'edition', 'topics',
16 | 'authors', 'title', 'text', 'stats'
17 | ]
18 |
19 | def __init__(self, timestamp, url, edition, topics, authors, title, text, stats):
20 | self.timestamp = timestamp
21 | self.url = url
22 | self.edition = edition
23 | self.topics = topics
24 | self.authors = authors
25 | self.title = title
26 | self.text = text
27 | self.stats = stats
28 |
29 |
30 | class Stats(Record):
31 | __attributes__ = [
32 | 'fb', 'vk', 'ok', 'twitter', 'lj', 'tg',
33 | 'likes', 'views', 'comments'
34 | ]
35 |
36 | def __init__(self, fb, vk, ok, twitter, lj, tg, likes, views, comments):
37 | self.fb = fb
38 | self.vk = vk
39 | self.ok = ok
40 | self.twitter = twitter
41 | self.lj = lj
42 | self.tg = tg
43 | self.likes = likes
44 | self.views = views
45 | self.comments = comments
46 |
47 |
48 | def none_row(row):
49 | for cell in row:
50 | if not cell or cell == '-':
51 | cell = None
52 | yield cell
53 |
54 |
55 | def maybe_int(value):
56 | if value:
57 | return int(value)
58 | return
59 |
60 |
61 | def parse_news(lines):
62 | # tass raises "field larger than field limit"
63 | rows = parse_csv(lines, max_field=100000000)
64 | skip_header(rows)
65 | for row in rows:
66 | (timestamp, url, edition, topics, authors, title, text,
67 | fb, vk, ok, twitter, lj, tg, likes, views, comments) = none_row(row)
68 |
69 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
70 |
71 | if authors:
72 | authors = authors.split(',')
73 |
74 | # empty texts in meduza
75 | text = text or ''
76 |
77 | stats = Stats(
78 | maybe_int(fb),
79 | maybe_int(vk),
80 | maybe_int(ok),
81 | maybe_int(twitter),
82 | maybe_int(lj),
83 | maybe_int(tg),
84 | maybe_int(likes),
85 | maybe_int(views),
86 | maybe_int(comments)
87 | )
88 | yield NewsRecord(
89 | timestamp, url, edition, topics, authors,
90 | title, text, stats
91 | )
92 |
93 |
94 | def load_news(path):
95 | lines = load_gz_lines(path)
96 | return parse_news(lines)
97 |
98 |
99 | def load_ods_interfax(path):
100 | return load_news(path)
101 |
102 |
103 | def load_ods_gazeta(path):
104 | return load_news(path)
105 |
106 |
107 | def load_ods_izvestia(path):
108 | return load_news(path)
109 |
110 |
111 | def load_ods_meduza(path):
112 | return load_news(path)
113 |
114 |
115 | def load_ods_ria(path):
116 | return load_news(path)
117 |
118 |
119 | def load_ods_rt(path):
120 | return load_news(path)
121 |
122 |
123 | def load_ods_tass(path):
124 | return load_news(path)
125 |
126 |
127 | __all__ = [
128 | 'load_ods_interfax',
129 | 'load_ods_gazeta',
130 | 'load_ods_izvestia',
131 | 'load_ods_meduza',
132 | 'load_ods_ria',
133 | 'load_ods_rt',
134 | 'load_ods_tass',
135 | ]
136 |
--------------------------------------------------------------------------------
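A minimal usage sketch for the ODS loaders, assuming interfax.csv.gz has been downloaded per the #proj_news_viz instructions in corus/sources/meta.py; every load_ods_* function yields the same NewsRecord stream:

from corus import load_ods_interfax

for record in load_ods_interfax('interfax.csv.gz'):
    print(record.timestamp, record.edition, record.title)
    print(record.stats.fb, record.stats.views, record.stats.comments)
    break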
/corus/sources/omnia.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | from corus.record import Record
5 | from corus.io import load_xz_lines
6 |
7 |
8 | class OmniaDoc(Record):
9 | __attributes__ = ['id', 'attrs', 'pars']
10 |
11 | def __init__(self, id, attrs, pars):
12 | self.id = id
13 | self.attrs = attrs
14 | self.pars = pars
15 |
16 |
17 | class OmniaPar(Record):
18 | __attributes__ = ['sents']
19 |
20 | def __init__(self, sents):
21 | self.sents = sents
22 |
23 |
24 | class OmniaSent(Record):
25 | __attributes__ = ['tokens']
26 |
27 | def __init__(self, tokens):
28 | self.tokens = tokens
29 |
30 |
31 | class OmniaToken(Record):
32 | __attributes__ = ['text', 'lemma', 'atag', 'tag', 'ztag', 'g']
33 |
34 | def __init__(self, text, lemma, atag, tag, ztag, g):
35 | self.text = text
36 | self.lemma = lemma
37 | self.atag = atag
38 | self.tag = tag
39 | self.ztag = ztag
40 | self.g = g
41 |
42 |
43 | DID = 'did'
44 | G_TAG = '<g/>'
45 | S_END = '</s>'
46 | P_END = '</p>'
47 | DOC_END = '</doc>'
48 |
49 |
50 | def take_until(stream, value):
51 | for item in stream:
52 | if item == value:
53 | break
54 | yield item
55 |
56 |
57 | def group_bounds(stream, end):
58 | for _ in stream:
59 | yield take_until(stream, end)
60 |
61 |
62 | def group_doc_bounds(stream):
63 | for header in stream:
64 | group = take_until(stream, DOC_END)
65 | yield header, group
66 |
67 |
68 | def group_pairs(stream):
69 | previous = None
70 | for item in stream:
71 | if previous:
72 | yield previous, item
73 | previous = item
74 | if previous:
75 | yield previous, None
76 |
77 |
78 | def parse_tokens(lines):
79 | pairs = group_pairs(lines)
80 | for line, next in pairs:
81 | if line == G_TAG:
82 | continue
83 |
84 | parts = line.split('\t')
85 | if len(parts) != 5:
86 | # наблюдать наблюдать Vb Vmn----a-e 1
87 | # интерес интерес Nn Ncmsan 1
88 | # Э Э Zz - 1
89 | #
90 | # <дуарда>
91 | # Г Г Zz - 1
92 | #
93 | # <еоргиевича>
94 | # к к Pp Sp-d 1
95 | # попыткам попытка Nn Ncfpdn 1
96 |
97 | # weird tag lines
98 | # <нрзб> <НРЗБ>
99 | # <дуарда>
100 | # <еоргиевича>
101 |
102 | # just skip them
103 | continue
104 |
105 | # Refs on atag and tag:
106 | # http://unesco.uniba.sk/aranea_about/aut.html
107 | # http://nl.ijs.si/ME/V4/msd/html/msd-ru.html
108 | text, lemma, atag, tag, ztag = parts
109 | g = next == G_TAG
110 |
111 | yield OmniaToken(text, lemma, atag, tag, ztag, g)
112 |
113 |
114 | def parse_sents(lines):
115 | groups = group_bounds(lines, S_END)
116 | for group in groups:
117 | tokens = list(parse_tokens(group))
118 | yield OmniaSent(tokens)
119 |
120 |
121 | def parse_pars(lines):
122 | groups = group_bounds(lines, P_END)
123 | for group in groups:
124 | sents = list(parse_sents(group))
125 | yield OmniaPar(sents)
126 |
127 |
128 | def parse_tag_attrs(tag):
129 | matches = re.finditer(r'([^= ]+)="([^"]+)"', tag)
130 | for match in matches:
131 | yield match.groups()
132 |
133 |
134 | def parse_doc_header(header):
135 | attrs = dict(parse_tag_attrs(header))
136 | id = attrs.pop(DID)
137 | return id, attrs
138 |
139 |
140 | def parse_docs(lines):
141 | groups = group_doc_bounds(lines)
142 | for header, group in groups:
143 | id, attrs = parse_doc_header(header)
144 | pars = list(parse_pars(group))
145 | yield OmniaDoc(id, attrs, pars)
146 |
147 |
148 | def load_omnia(path):
149 | lines = load_xz_lines(path)
150 | yield from parse_docs(lines)
151 |
--------------------------------------------------------------------------------
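A minimal usage sketch for load_omnia, assuming the manually downloaded Omnia Russica dump is available locally (the filename below is hypothetical):

from corus import load_omnia

docs = load_omnia('omnia.vert.xz')  # hypothetical local filename
doc = next(docs)
print(doc.id, doc.attrs)
for token in doc.pars[0].sents[0].tokens:
    print(token.text, token.lemma, token.atag, token.tag)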
/corus/sources/corpora.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 | from corus.io import (
4 | list_zip,
5 | load_zip_texts,
6 | parse_xml
7 | )
8 |
9 |
10 | class CorporaText(Record):
11 | __attributes__ = ['id', 'parent_id', 'name', 'tags', 'pars']
12 |
13 | def __init__(self, id, parent_id, name, tags, pars):
14 | self.id = id
15 | self.parent_id = parent_id
16 | self.name = name
17 | self.tags = tags
18 | self.pars = pars
19 |
20 |
21 | class CorporaPar(Record):
22 | __attributes__ = ['id', 'sents']
23 |
24 | def __init__(self, id, sents):
25 | self.id = id
26 | self.sents = sents
27 |
28 |
29 | class CorporaSent(Record):
30 | __attributes__ = ['id', 'text', 'tokens']
31 |
32 | def __init__(self, id, text, tokens):
33 | self.id = id
34 | self.text = text
35 | self.tokens = tokens
36 |
37 |
38 | class CorporaToken(Record):
39 | __attributes__ = ['id', 'rev_id', 'text', 'forms']
40 |
41 | def __init__(self, id, rev_id, text, forms):
42 | self.id = id
43 | self.rev_id = rev_id
44 | self.text = text
45 | self.forms = forms
46 |
47 |
48 | class CorporaForm(Record):
49 | __attributes__ = ['id', 'text', 'grams']
50 |
51 | def __init__(self, id, text, grams):
52 | self.id = id
53 | self.text = text
54 | self.grams = grams
55 |
56 |
57 | # <?xml version="1.0" encoding="UTF-8"?>
58 | # <text id="..." parent="..." name="...">
59 | # <tags>
60 | # <tag>url:http://www.chaskor.ru/news/tak_kto_komu_dolzhen_18043</tag>
61 | # <tag>Год:2010</tag>
62 | # <tag>Дата:19/06</tag>
63 | # <tag>Тема:ЧасКор:Экономика</tag>
64 | # <tag>Тема:ЧасКор:Экономика/Сырье</tag>
65 | # </tags>
66 | # <paragraphs>
67 | # <paragraph id="...">
68 | # <sentence id="...">
69 | # <source>Так кто кому должен?</source>
70 | # <tokens>
71 | # <token id="..." text="Так">
72 | # <tfr rev_id="..." t="Так">
73 | # <v>
74 | # <l id="..." t="так">
75 | # <g v="..."/>
76 | # </l>
77 | # </v>
78 | # </tfr>
79 | # </token>
80 | # ...
81 | # </tokens>
82 | # </sentence>
83 |
84 |
85 | def parse_grams(xml):
86 | for item in xml:
87 | yield item.get('v')
88 |
89 |
90 | def parse_forms(xml):
91 | for item in xml:
92 | lemma = item.find('l')
93 | id = lemma.get('id')
94 | text = lemma.get('t')
95 | grams = list(parse_grams(lemma))
96 | yield CorporaForm(id, text, grams)
97 |
98 |
99 | def parse_tokens(xml):
100 | for token in xml:
101 | id = token.get('id')
102 | text = token.get('text')
103 | forms = token.find('tfr')
104 | rev_id = forms.get('rev_id')
105 | forms = list(parse_forms(forms))
106 | yield CorporaToken(id, rev_id, text, forms)
107 |
108 |
109 | def parse_sents(xml):
110 | for sent in xml:
111 | id = sent.get('id')
112 | source, tokens = sent
113 | text = source.text
114 | tokens = list(parse_tokens(tokens))
115 | yield CorporaSent(id, text, tokens)
116 |
117 |
118 | def parse_pars(xml):
119 | for par in xml:
120 | id = par.get('id')
121 | sents = list(parse_sents(par))
122 | yield CorporaPar(id, sents)
123 |
124 |
125 | def parse_tags(xml):
126 | for tag in xml:
127 | yield tag.text
128 |
129 |
130 | def parse_text(xml):
131 | id = xml.get('id')
132 | parent_id = xml.get('parent')
133 | name = xml.get('name')
134 | tags, pars = xml
135 | tags = list(parse_tags(tags))
136 | pars = list(parse_pars(pars))
137 | return CorporaText(id, parent_id, name, tags, pars)
138 |
139 |
140 | def load_corpora(path):
141 | names = list_zip(path)
142 | texts = load_zip_texts(path, names)
143 | for text in texts:
144 | xml = parse_xml(text)
145 | yield parse_text(xml)
146 |
--------------------------------------------------------------------------------
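A minimal usage sketch for load_corpora, assuming annot.opcorpora.xml.zip has been downloaded per the OpenCorpora instruction in corus/sources/meta.py:

from corus import load_corpora

for text in load_corpora('annot.opcorpora.xml.zip'):
    print(text.id, text.name, text.tags)
    for par in text.pars:
        for sent in par.sents:
            print(sent.text)
    break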
/corus/readme.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 |
4 | from .sources.meta import is_group
5 | from .io import (
6 | load_text,
7 | dump_text
8 | )
9 |
10 |
11 | COMMANDS = ('wget', 'unzip', 'unrar', 'rm', 'mv', 'tar')
12 |
13 | KB = 1024
14 | MB = 1024 * KB
15 | GB = 1024 * MB
16 |
17 | LABELS = {
18 | KB: 'Kb',
19 | MB: 'Mb',
20 | GB: 'Gb'
21 | }
22 |
23 |
24 | def is_command(step, commands=COMMANDS):
25 | return step.startswith(commands)
26 |
27 |
28 | def format_bytes(value):
29 | value /= KB
30 | unit = KB
31 | for _ in range(2):
32 | if value < KB:
33 | break
34 | value /= KB
35 | unit *= KB
36 | return '%.2f %s' % (value, LABELS[unit])
37 |
38 |
39 | def format_count(value):
40 | # https://stackoverflow.com/questions/16670125/python-format-string-thousand-separator-with-spaces/
41 | return format(value, ',').replace(',', ' ')
42 |
43 |
44 | def unfold_metas(items):
45 | for item in items:
46 | if is_group(item):
47 | yield True, item
48 | for meta in item.metas:
49 | yield False, meta
50 | else:
51 | yield False, item
52 |
53 |
54 | def format_metas_(metas, nbviewer=None):
55 |     yield '<table>'
56 |     yield '<tr>'
57 |     yield '<th>Dataset</th>'
58 |     yield '<th>API <code>from corus import</code></th>'
59 |     yield '<th>Tags</th>'
60 |     yield '<th>Texts</th>'
61 |     yield '<th>Uncompressed</th>'
62 |     yield '<th>Description</th>'
63 |     yield '</tr>'
64 |     for group, meta in unfold_metas(metas):
65 |         yield '<tr>'
66 |
67 |         yield '<td>'
68 |         if meta.url:
69 |             yield '<a href="%s">%s</a>' % (meta.url, meta.title)
70 |         else:
71 |             yield meta.title
72 |         yield '</td>'
73 |
74 |         if not group:
75 |             yield '<td>'
76 |             for index, function in enumerate(meta.functions):
77 |                 if index > 0:
78 |                     yield '<br/>'
79 |                 name = function.__name__
80 |                 yield '<a name="%s"></a>' % name
81 |                 if nbviewer:
82 |                     yield (
83 |                         '<a href="{nbviewer}#{name}">{name}</a>'.format(
84 |                             nbviewer=nbviewer,
85 |                             name=name
86 |                         )
87 |                     )
88 |                     yield '<a href="#%s">#</a>' % name
89 |                 else:
90 |                     yield '<code>{name}</code>'.format(name=name)
91 |             yield '</td>'
92 |
93 |         yield '<td>'
94 |         if meta.tags:
95 |             for tag in meta.tags:
96 |                 yield '%s' % tag
97 |         yield '</td>'
98 |
99 |         yield '<td>'
100 |         if meta.stats and meta.stats.count:
101 |             yield format_count(meta.stats.count)
102 |         yield '</td>'
103 |
104 |         yield '<td>'
105 |         if meta.stats and meta.stats.bytes:
106 |             yield format_bytes(meta.stats.bytes)
107 |         yield '</td>'
108 |
109 |         if group:
110 |             yield '<td colspan="2">'
111 |         else:
112 |             yield '<td>'
113 |         if meta.description:
114 |             yield meta.description
115 |         if meta.instruction:
116 |             yield '<br/>'
117 |             yield '<br/>'
118 |
119 |             for index, step in enumerate(meta.instruction):
120 |                 if index > 0:
121 |                     yield '<br/>'
122 |                 if is_command(step):
123 |                     yield '<code>%s</code>' % step
124 |                 else:
125 |                     yield step
126 |             yield '<br/>'
127 |
128 |         yield '</td>'
129 |         yield '</tr>'
130 |
131 |
132 | def format_metas(metas, url=None):
133 | return '\n'.join(format_metas_(metas, url))
134 |
135 |
136 | def show_html(html):
137 | from IPython.display import display, HTML
138 |
139 | display(HTML(html))
140 |
141 |
142 | def patch_readme(html, path):
143 | text = load_text(path)
144 | text = re.sub(
145 |         r'<!--- metas --->(.+)<!--- metas --->',
146 |         '<!--- metas --->\n' + html + '\n<!--- metas --->',
147 | text,
148 | flags=re.S
149 | )
150 | dump_text(text, path)
151 |
--------------------------------------------------------------------------------
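A minimal sketch of how the README table is regenerated, assuming the script runs from the repository root and that README.md contains the marker comments patch_readme substitutes between:

from corus.readme import format_metas, patch_readme
from corus.sources.meta import METAS

html = format_metas(METAS)
patch_readme(html, 'README.md')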
/corus/sources/bsnlp.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from datetime import datetime
4 | from os import walk as walk_
5 | from os.path import (
6 | split as split_path,
7 | splitext as split_ext,
8 | join as join_path
9 | )
10 |
11 | from corus.record import Record
12 | from corus.io import (
13 | load_text,
14 | load_lines
15 | )
16 |
17 |
18 | RU = 'ru'
19 | BG = 'bg'
20 | CS = 'cs'
21 | PL = 'pl'
22 | LANGS = [RU, BG, CS, PL]
23 |
24 | ANNOTATED = 'annotated'
25 | RAW = 'raw'
26 |
27 | TXT = '.txt'
28 | OUT = '.out'
29 |
30 |
31 | class BsnlpId(Record):
32 | __attributes__ = ['lang', 'type', 'name', 'path']
33 |
34 | def __init__(self, lang, type, name, path):
35 | self.lang = lang
36 | self.type = type
37 | self.name = name
38 | self.path = path
39 |
40 |
41 | class BsnlpRaw(Record):
42 | __attributes__ = ['id', 'name', 'lang', 'date', 'url', 'text']
43 |
44 | def __init__(self, id, name, lang, date, url, text):
45 | self.id = id
46 | self.name = name
47 | self.lang = lang
48 | self.date = date
49 | self.url = url
50 | self.text = text
51 |
52 |
53 | class BsnlpAnnotated(Record):
54 | __attributes__ = ['id', 'name', 'substrings']
55 |
56 | def __init__(self, id, name, substrings):
57 | self.id = id
58 | self.name = name
59 | self.substrings = substrings
60 |
61 |
62 | class BsnlpSubstring(Record):
63 | __attributes__ = ['text', 'normal', 'type', 'id']
64 |
65 | def __init__(self, text, normal, type, id):
66 | self.text = text
67 | self.normal = normal
68 | self.type = type
69 | self.id = id
70 |
71 |
72 | class BsnlpMarkup(Record):
73 | __attributes__ = ['id', 'name', 'lang', 'date', 'url', 'text', 'substrings']
74 |
75 | def __init__(self, id, name, lang, date, url, text, substrings):
76 | self.id = id
77 | self.name = name
78 | self.lang = lang
79 | self.date = date
80 | self.url = url
81 | self.text = text
82 | self.substrings = substrings
83 |
84 |
85 | def walk(dir):
86 | def onerror(error):
87 | raise error
88 |
89 | return walk_(dir, onerror=onerror)
90 |
91 |
92 | def load_ids(dir, langs):
93 | if not langs:
94 | langs = LANGS
95 |
96 | # root bsnlp/sample_pl_cs_ru_bg/raw/cs
97 | # filename brexit_cs.txt_file_100.txt
98 | for root, subdirs, filenames in walk(dir):
99 | tail, lang = split_path(root)
100 | if lang not in langs:
101 | continue
102 |
103 | tail, type = split_path(tail)
104 | if type not in (ANNOTATED, RAW):
105 | # raw/nord_stream/ru/nord_stream_ru.txt_file_44.txt
106 | tail, type = split_path(tail)
107 | assert type in (ANNOTATED, RAW), root
108 |
109 | for filename in filenames:
110 | name, ext = split_ext(filename)
111 | if ext not in (TXT, OUT):
112 | continue
113 | path = join_path(root, filename)
114 | yield BsnlpId(lang, type, name, path)
115 |
116 |
117 | def select_type(ids, type):
118 | for id in ids:
119 | if id.type == type:
120 | yield id
121 |
122 |
123 | RAW_PATTERN = re.compile(r'''
124 | ^([^\n]+)\n
125 | (ru|bg|cs|pl)\n
126 | (\d\d\d\d-\d\d-\d\d)\n
127 | (https?://[^\n]+)?\n
128 | ''', re.X)
129 |
130 |
131 | def parse_raw(name, text):
132 | match = RAW_PATTERN.search(text)
133 | assert match, text
134 |
135 | id, lang, date, url = match.groups()
136 | date = datetime.strptime(date, '%Y-%m-%d')
137 | text = text[match.end():]
138 | return BsnlpRaw(id, name, lang, date, url, text)
139 |
140 |
141 | def load_raw(records):
142 | for record in records:
143 | text = load_text(record.path)
144 | yield parse_raw(record.name, text)
145 |
146 |
147 | # Евросоюза ORG ORG-European-Union
148 | ANNOTATED_PATTERN = re.compile(r'''
149 | ^([^\t]+)
150 | \t([^\t]+)?
151 | \t([^\t]+)
152 | \t?([^\t]+)?$
153 | ''', re.X)
154 |
155 |
156 | def parse_substrings(lines):
157 | for line in lines:
158 | match = ANNOTATED_PATTERN.match(line)
159 | if not match:
160 | # single case
161 | # ЕНП ЕНП ORG ORG-EPP ЕС ЕС ORG ORG-European-Union
162 | continue
163 | text, normal, type, id = match.groups()
164 | yield BsnlpSubstring(text, normal, type, id)
165 |
166 |
167 | def parse_annotated(name, lines):
168 | id = next(lines).lstrip('\ufeff')
169 | substrings = list(parse_substrings(lines))
170 | return BsnlpAnnotated(id, name, substrings)
171 |
172 |
173 | def load_annotated(records):
174 | for record in records:
175 | lines = load_lines(record.path)
176 | yield parse_annotated(record.name, lines)
177 |
178 |
179 | def merge(raw, annotated):
180 | id_raw = {_.name: _ for _ in raw}
181 | for record in annotated:
182 | raw = id_raw[record.name]
183 | yield BsnlpMarkup(
184 | raw.id, raw.name, raw.lang, raw.date, raw.url,
185 | raw.text, record.substrings
186 | )
187 |
188 |
189 | def load_bsnlp(dir, langs=[RU]):
190 | ids = list(load_ids(dir, langs))
191 | raw = load_raw(select_type(ids, RAW))
192 | annotated = load_annotated(select_type(ids, ANNOTATED))
193 | return merge(raw, annotated)
194 |
--------------------------------------------------------------------------------
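A minimal usage sketch for load_bsnlp, assuming the shared-task archives have been unpacked per the BSNLP-2019 instruction in corus/sources/meta.py:

from corus import load_bsnlp

for markup in load_bsnlp('test_pl_cs_ru_bg', langs=['ru']):
    print(markup.name, markup.lang, markup.url)
    for substring in markup.substrings:
        print(substring.type, substring.text, substring.normal)
    break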
/corus/sources/taiga/common.py:
--------------------------------------------------------------------------------
1 |
2 | from io import TextIOWrapper
3 | from itertools import islice as head
4 | import tarfile
5 |
6 | from corus.record import Record
7 | from corus.path import (
8 | get_filename,
9 | split_ext
10 | )
11 | from corus.zip import (
12 | open_zip,
13 | read_zip_header,
14 | read_zip_data
15 | )
16 | from corus.io import (
17 | match_names,
18 |
19 | parse_tsv,
20 | skip_header,
21 | )
22 |
23 |
24 | class ArchiveRecord(Record):
25 | __attributes__ = ['name', 'offset', 'file']
26 |
27 | def __init__(self, name, offset, file):
28 | self.name = name
29 | self.offset = offset
30 | self.file = file
31 |
32 |
33 | class TaigaRecord(Record):
34 | __attributes__ = ['id', 'meta', 'text']
35 |
36 | def __init__(self, id, meta, text):
37 | self.id = id
38 | self.meta = meta
39 | self.text = text
40 |
41 |
42 | class Author(Record):
43 | __attributes__ = ['name', 'readers', 'texts', 'profession', 'about', 'url']
44 |
45 | def __init__(self, name, readers=None, texts=None,
46 | profession=None, about=None, url=None):
47 | self.name = name
48 | self.readers = readers
49 | self.texts = texts
50 | self.profession = profession
51 | self.about = about
52 | self.url = url
53 |
54 |
55 | class Meta(Record):
56 | __attributes__ = ['id', 'timestamp', 'tags',
57 | 'themes', 'rubric', 'genre', 'topic',
58 | 'author', 'lang', 'title', 'url']
59 |
60 | def __init__(self, id, timestamp=None, tags=None,
61 | themes=None, rubric=None, genre=None, topic=None,
62 | author=None, lang=None, title=None, url=None):
63 | self.id = id
64 | self.timestamp = timestamp
65 | self.tags = tags
66 | self.themes = themes
67 | self.rubric = rubric
68 | self.genre = genre
69 | self.topic = topic
70 | self.author = author
71 | self.lang = lang
72 | self.title = title
73 | self.url = url
74 |
75 |
76 | def load_tar(path, offset=0):
77 | with tarfile.open(path) as tar:
78 | tar.fileobj.seek(offset)
79 | while True:
80 | member = tarfile.TarInfo.fromtarfile(tar)
81 | if not member.isfile():
82 | continue
83 |
84 | file = tar.extractfile(member)
85 | yield ArchiveRecord(
86 | name=member.name,
87 | offset=member.offset,
88 | file=file
89 | )
90 |
91 | tar.members = []
92 | tar.fileobj.seek(tar.offset)
93 |
94 |
95 | def load_zip(path, offset=0):
96 | with open_zip(path) as zip:
97 | zip.seek(offset)
98 | while True:
99 | offset = zip.tell()
100 |
101 | header = read_zip_header(zip)
102 | if not header:
103 | break
104 | if not header.uncompressed:
105 | continue
106 |
107 | file = read_zip_data(zip, header)
108 | yield ArchiveRecord(
109 | name=header.name,
110 | offset=offset,
111 | file=file
112 | )
113 |
114 |
115 | def parse_meta(file, encoding='utf8'):
116 | lines = TextIOWrapper(file, encoding)
117 | rows = parse_tsv(lines)
118 | header = skip_header(rows)
119 | for row in rows:
120 | yield dict(zip(header, row))
121 |
122 |
123 | def load_metas(path, pattern, offset, count, load):
124 | records = load(path, offset)
125 | records = match_names(records, pattern)
126 | records = head(records, count)
127 | for record in records:
128 | for item in parse_meta(record.file):
129 | yield item
130 |
131 |
132 | def load_tar_metas(path, pattern, offset, count):
133 | return load_metas(path, pattern, offset, count, load_tar)
134 |
135 |
136 | def load_zip_metas(path, pattern, offset, count):
137 | return load_metas(path, pattern, offset, count, load_zip)
138 |
139 |
140 | def load_texts(path, pattern, offset, count, parse_id, load, encoding='utf8'):
141 | records = load(path, offset=offset)
142 | records = match_names(records, pattern)
143 | records = head(records, count)
144 | for record in records:
145 | id = parse_id(record.name)
146 | file = TextIOWrapper(record.file, encoding)
147 | text = file.read()
148 | yield TaigaRecord(
149 | id=id,
150 | meta=None,
151 | text=text
152 | )
153 |
154 |
155 | def parse_filename_id(path):
156 | id, _ = split_ext(get_filename(path))
157 | return id
158 |
159 |
160 | def load_tar_texts(path, pattern, offset, count, parse_id=parse_filename_id):
161 | return load_texts(path, pattern, offset, count, parse_id, load_tar)
162 |
163 |
164 | def load_zip_texts(path, pattern, offset, count, parse_id=parse_filename_id):
165 | return load_texts(path, pattern, offset, count, parse_id, load_zip)
166 |
167 |
168 | def merge_metas(records, metas=None):
169 | if not metas:
170 | for record in records:
171 | yield record
172 | else:
173 | metas = {_.id: _ for _ in metas}
174 | for record in records:
175 | record.meta = metas.get(record.id)
176 | yield record
177 |
178 |
179 | def patch_month(date, months):
180 | for source, target in months.items():
181 | if source in date:
182 | return date.replace(source, target)
183 |
--------------------------------------------------------------------------------
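A minimal usage sketch for the Taiga loaders built on these helpers, assuming Arzamas.tar.gz has been extracted from retagged_taiga.tar.gz per the Taiga instruction in corus/sources/meta.py, and assuming the per-source loader accepts just the archive path:

from corus import load_taiga_arzamas

for record in load_taiga_arzamas('taiga/Arzamas.tar.gz'):
    print(record.id)
    print(record.meta)
    print(record.text[:200])
    break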
/corus/sources/factru.py:
--------------------------------------------------------------------------------
1 |
2 | import re
3 | from corus.path import (
4 | list_dir,
5 | join_path
6 | )
7 | from corus.record import Record
8 | from corus.io import (
9 | load_text,
10 | load_lines,
11 | )
12 |
13 |
14 | DEVSET = 'devset'
15 | TESTSET = 'testset'
16 |
17 | TXT = 'txt'
18 | SPANS = 'spans'
19 | OBJECTS = 'objects'
20 | COREF = 'coref'
21 | FACTS = 'facts'
22 |
23 |
24 | class FactruSpan(Record):
25 | __attributes__ = ['id', 'type', 'start', 'stop']
26 |
27 | def __init__(self, id, type, start, stop):
28 | self.id = id
29 | self.type = type
30 | self.start = start
31 | self.stop = stop
32 |
33 |
34 | class FactruObject(Record):
35 | __attributes__ = ['id', 'type', 'spans']
36 |
37 | def __init__(self, id, type, spans):
38 | self.id = id
39 | self.type = type
40 | self.spans = spans
41 |
42 |
43 | class FactruCorefSlot(Record):
44 | __attributes__ = ['type', 'value']
45 |
46 | def __init__(self, type, value):
47 | self.type = type
48 | self.value = value
49 |
50 |
51 | class FactruCoref(Record):
52 | __attributes__ = ['id', 'objects', 'slots']
53 |
54 | def __init__(self, id, objects, slots):
55 | self.id = id
56 | self.objects = objects
57 | self.slots = slots
58 |
59 |
60 | class FactruFactSlot(Record):
61 | __attributes__ = ['type', 'ref', 'value']
62 |
63 | def __init__(self, type, ref, value):
64 | self.type = type
65 | self.ref = ref
66 | self.value = value
67 |
68 |
69 | class FactruFact(Record):
70 | __attributes__ = ['id', 'type', 'slots']
71 |
72 | def __init__(self, id, type, slots):
73 | self.id = id
74 | self.type = type
75 | self.slots = slots
76 |
77 |
78 | class FactruMarkup(Record):
79 | __attributes__ = ['id', 'text', 'objects', 'corefs', 'facts']
80 |
81 | def __init__(self, id, text, objects, corefs, facts):
82 | self.id = id
83 | self.text = text
84 | self.objects = objects
85 | self.corefs = corefs
86 | self.facts = facts
87 |
88 |
89 | def list_ids(dir, set):
90 | for filename in list_dir(join_path(dir, set)):
91 | match = re.match(r'^book_(\d+)\.txt$', filename)
92 | if match:
93 | yield match.group(1)
94 |
95 |
96 | def part_path(id, dir, set, part):
97 | return join_path(dir, set, 'book_%s.%s' % (id, part))
98 |
99 |
100 | def parse_spans(lines):
101 | # 32962 loc_name 17 6 89971 1 # 89971 Италии
102 | # 32963 org_name 26 4 89973 1 # 89973 миде
103 | # 32965 loc_name 31 6 89974 1 # 89974 Грузии
104 |
105 | for line in lines:
106 | id, type, start, size, _ = line.split(None, 4)
107 | start = int(start)
108 | stop = start + int(size)
109 | yield FactruSpan(id, type, start, stop)
110 |
111 |
112 | def parse_objects(lines, spans):
113 | # 16972 LocOrg 32962 # Италии
114 | # 16975 Org 32963 32965 # миде Грузии
115 |
116 | id_spans = {_.id: _ for _ in spans}
117 | for line in lines:
118 | parts = iter(line.split())
119 | id = next(parts)
120 | type = next(parts)
121 | spans = []
122 | for index in parts:
123 | if not index.isdigit():
124 | break
125 | span = id_spans[index]
126 | spans.append(span)
127 | yield FactruObject(id, type, spans)
128 |
129 |
130 | def parse_coref_slots(lines):
131 | for line in lines:
132 | if not line:
133 | break
134 |
135 | parts = line.split(None, 1)
136 | if len(parts) == 1:
137 | # 1101 18638 18654
138 | # name Венгрия
139 | # wikidata
140 | # lastname
141 | continue
142 |
143 | type, value = parts
144 | yield FactruCorefSlot(type, value)
145 |
146 |
147 | def parse_corefs(lines, objects):
148 | # 3 16968 16970 16974
149 | # name Грузия
150 | #
151 | # 5 16969
152 | # firstname Виторио
153 | # lastname Сандали
154 |
155 | id_objects = {_.id: _ for _ in objects}
156 | for line in lines:
157 | parts = iter(line.split())
158 | id = next(parts)
159 | objects = [id_objects[_] for _ in parts]
160 | slots = list(parse_coref_slots(lines))
161 | yield FactruCoref(id, objects, slots)
162 |
163 |
164 | def parse_facts_slots(lines, id_corefs, id_spans):
165 | for line in lines:
166 | if not line:
167 | break
168 | type, line = line.split(None, 1)
169 | values = line.split(' | ')
170 | for value in values:
171 | # Participant obj90 Industrial and Commercial Bank of China | Промышленный и коммерческий банк Китая
172 | # Participant obj3640 WhatsApp
173 | # Type купля/продажа
174 | match = re.search(r'^(obj|span)(\d+)', value)
175 | if match:
176 | ref, id = match.groups()
177 | if ref == 'obj':
178 | value = id_corefs[id]
179 | elif ref == 'span':
180 | value = id_spans[id]
181 | else:
182 | ref = None
183 | yield FactruFactSlot(type, ref, value)
184 |
185 |
186 | def parse_facts(lines, corefs, spans):
187 | # 58-0 Meeting
188 | # Participant obj5 Сандали Виторио
189 | # Participant obj6 Налбандов Александр
190 | #
191 | # 58-1 Occupation
192 | # Who obj5 Сандали Виторио
193 | # Where obj2 Италия
194 | # Position span32958 чрезвычайный и полномочный посол | span64007 чрезвычайный и полномочный посол Италии в Грузии
195 |
196 | id_corefs = {_.id: _ for _ in corefs}
197 | id_spans = {_.id: _ for _ in spans}
198 | for line in lines:
199 | id, type = line.split(None, 1)
200 | slots = list(parse_facts_slots(lines, id_corefs, id_spans))
201 | yield FactruFact(id, type, slots)
202 |
203 |
204 | def load_id(id, dir, set):
205 | path = part_path(id, dir, set, TXT)
206 | text = load_text(path)
207 |
208 | path = part_path(id, dir, set, SPANS)
209 | lines = load_lines(path)
210 | spans = list(parse_spans(lines))
211 |
212 | path = part_path(id, dir, set, OBJECTS)
213 | lines = load_lines(path)
214 | objects = list(parse_objects(lines, spans))
215 |
216 | path = part_path(id, dir, set, COREF)
217 | lines = load_lines(path)
218 | corefs = list(parse_corefs(lines, objects))
219 |
220 | path = part_path(id, dir, set, FACTS)
221 | lines = load_lines(path)
222 | facts = list(parse_facts(lines, corefs, spans))
223 |
224 | return FactruMarkup(id, text, objects, corefs, facts)
225 |
226 |
227 | def load_factru(dir, sets=[DEVSET, TESTSET]):
228 | for set in sets:
229 | for id in list_ids(dir, set):
230 | yield load_id(id, dir, set)
231 |
--------------------------------------------------------------------------------
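A minimal usage sketch for load_factru, assuming the factRuEval-2016 repository has been unpacked per the instruction in corus/sources/meta.py:

from corus import load_factru

for markup in load_factru('factRuEval-2016-master', sets=['devset']):
    for obj in markup.objects:
        for span in obj.spans:
            print(obj.type, span.type, markup.text[span.start:span.stop])
    break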
/data/sample.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Buriy"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 10,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# from os.path import (\n",
17 | "# join,\n",
18 | "# expanduser\n",
19 | "# )\n",
20 | "# from itertools import islice as head\n",
21 | "# import tarfile\n",
22 | "# from io import BytesIO\n",
23 | "\n",
24 | "# from tqdm import tqdm_notebook as log_progress\n",
25 | "\n",
26 | "\n",
27 | "# def source_path(filename, dir='~/proj/corus-data/buriy/'):\n",
28 | "# return join(\n",
29 | "# expanduser(dir),\n",
30 | "# filename\n",
31 | "# )\n",
32 | "\n",
33 | "\n",
34 | "# def target_path(filename, dir='buriy'):\n",
35 | "# return join(dir, filename)\n",
36 | "\n",
37 | "\n",
38 | "# def top_lines(file, count):\n",
39 | "# lines = head(file, count)\n",
40 | "# return b''.join(lines)\n",
41 | "\n",
42 | "\n",
43 | "# def sample(source, target, count):\n",
44 | "# with tarfile.open(source) as source, tarfile.open(target, 'w:bz2') as target:\n",
45 | "# for member in log_progress(head(source, count)):\n",
46 | "# if not member.isfile():\n",
47 | "# continue\n",
48 | "# file = source.extractfile(member)\n",
49 | "# data = top_lines(file, 2)\n",
50 | "# member.size = len(data)\n",
51 | "# file = BytesIO(data)\n",
52 | "# target.addfile(member, file)\n",
53 | "\n",
54 | "\n",
55 | "# filenames = [\n",
56 | "# 'lenta.tar.bz2',\n",
57 | "# 'news-articles-2014.tar.bz2',\n",
58 | "# 'news-articles-2015-part1.tar.bz2',\n",
59 | "# 'news-articles-2015-part2.tar.bz2',\n",
60 | "# 'webhose-2016.tar.bz2',\n",
61 | "# ]\n",
62 | "# for filename in filenames:\n",
63 | "# print(filename)\n",
64 | "# sample(source_path(filename), target_path(filename), 10)"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "# Taiga"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 29,
77 | "metadata": {
78 | "scrolled": false
79 | },
80 | "outputs": [],
81 | "source": [
82 | "from os.path import (\n",
83 | " join,\n",
84 | " expanduser\n",
85 | ")\n",
86 | "\n",
87 | "from tqdm import tqdm_notebook as log_progress\n",
88 | "\n",
89 | "from corus.io import (\n",
90 | " load_tar,\n",
91 | " load_zip\n",
92 | ")\n",
93 | "\n",
94 | "\n",
95 | "def source_path(filename, dir='~/corus-data/taiga/'):\n",
96 | " return join(\n",
97 | " expanduser(dir),\n",
98 | " filename\n",
99 | " )\n",
100 | "\n",
101 | "\n",
102 | "def target_path(filename, dir='taiga'):\n",
103 | " return join(dir, filename)"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {
109 | "heading_collapsed": true
110 | },
111 | "source": [
112 | "## Offsets"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 30,
118 | "metadata": {
119 | "hidden": true
120 | },
121 | "outputs": [],
122 | "source": [
123 | "# def get_offsets(records, patterns):\n",
124 | "# offsets = {}\n",
125 | "# for record in records:\n",
126 | "# for pattern in patterns:\n",
127 | "# if pattern not in offsets and pattern in record.name:\n",
128 | "# offsets[pattern] = record.offset\n",
129 | "# if len(offsets) == len(patterns):\n",
130 | "# break\n",
131 | "# return offsets\n",
132 | "\n",
133 | "\n",
134 | "# patterns = ['meta', '/texts', '/tagged']\n",
135 | "# files = [\n",
136 | "# ('Arzamas.tar.gz', load_tar, patterns),\n",
137 | "# ('Fontanka.tar.gz', load_tar, patterns),\n",
138 | "# ('Interfax.tar.gz', load_tar, patterns),\n",
139 | "# ('KP.tar.gz', load_tar, patterns),\n",
140 | "# ('Lenta.tar.gz', load_tar, patterns),\n",
141 | "# ('Magazines.tar.gz', load_tar, patterns),\n",
142 | "# ('NPlus1.tar.gz', load_tar, patterns),\n",
143 | "# ('Subtitles.tar.gz', load_tar, patterns),\n",
144 | " \n",
145 | "# ('social.tar.gz', load_tar, ['/texts', '/tagged']),\n",
146 | "\n",
147 | "# ('proza_ru.zip', load_zip, patterns),\n",
148 | "# ('stihi_ru.zip', load_zip, patterns),\n",
149 | "# ]\n",
150 | "# for filename, load, patterns in files:\n",
151 | "# path = source_path(filename)\n",
152 | "# records = load(path)\n",
153 | "# print(filename)\n",
154 | "# offsets = get_offsets(log_progress(records), patterns)\n",
155 | "# for pattern in patterns:\n",
156 | "# print('', offsets.get(pattern), pattern, sep='\\t')"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 5,
162 | "metadata": {
163 | "hidden": true
164 | },
165 | "outputs": [],
166 | "source": [
167 | "# Arzamas.tar.gz\n",
168 | "# \t512\tmeta\n",
169 | "# \t144896\t/texts\n",
170 | "# \t5112320\t/tagged\n",
171 | "\n",
172 | "# Fontanka.tar.gz\n",
173 | "# \t512\tmeta\n",
174 | "# \t306359296\t/texts\n",
175 | "# \t1394093568\t/tagged\n",
176 | "\n",
177 | "# Interfax.tar.gz\n",
178 | "# \t512\tmeta\n",
179 | "# \t11447296\t/texts\n",
180 | "# \t140434432\t/tagged\n",
181 | "\n",
182 | "# KP.tar.gz\n",
183 | "# \t512\tmeta\n",
184 | "# \t13042176\t/texts\n",
185 | "# \t126222848\t/tagged\n",
186 | "\n",
187 | "# Lenta.tar.gz\n",
188 | "# \t512\tmeta\n",
189 | "# \t12800000\t/texts\n",
190 | "# \t140551168\t/tagged\n",
191 | "\n",
192 | "# Magazines.tar.gz\n",
193 | "# \t512\tmeta\n",
194 | "# \t7292416\t/texts\n",
195 | "# \t2390665216\t/tagged\n",
196 | "\n",
197 | "# NPlus1.tar.gz\n",
198 | "# \t512\tmeta\n",
199 | "# \t1919488\t/texts\n",
200 | "# \t33988608\t/tagged\n",
201 | "\n",
202 | "# Subtitles.tar.gz\n",
203 | "# \t512\tmeta\n",
204 | "# \t2113024\t/texts\n",
205 | "# \t974075904\t/tagged\n",
206 | "\n",
207 | "# social.tar.gz\n",
208 | "# \t3985892864\t/texts\n",
209 | "# \t1024\t/tagged\n",
210 | "\n",
211 | "# proza_ru.zip\n",
212 | "# \t636\tmeta\n",
213 | "# \t51432715409\t/texts\n",
214 | "# \t201377139\t/tagged\n",
215 | "\n",
216 | "# stihi_ru.zip\n",
217 | "# \t899\tmeta\n",
218 | "# \t22304202421\t/texts\n",
219 | "# \t381570084\t/tagged"
220 | ]
221 | },
222 | {
223 | "cell_type": "markdown",
224 | "metadata": {
225 | "heading_collapsed": true
226 | },
227 | "source": [
228 | "## Sample"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 2,
234 | "metadata": {
235 | "hidden": true
236 | },
237 | "outputs": [],
238 | "source": [
239 | "def tar_copy_(lines, target, info):\n",
240 | " data = b''.join(lines)\n",
241 | " file = BytesIO(data)\n",
242 | " info.size = len(data)\n",
243 | " target.addfile(info, file)\n",
244 | "\n",
245 | "\n",
246 | "def tar_copy_text(source, target, info, count):\n",
247 | " file = source.extractfile(info)\n",
248 | " lines = islice(file, count)\n",
249 | " tar_copy_(lines, target, info)\n",
250 | "\n",
251 | "\n",
252 | "def tar_copy_meta(source, target, info, pattern, encoding='utf8'):\n",
253 | " file = source.extractfile(info)\n",
254 | " lines = [\n",
255 | " _ for index, _\n",
256 | " in enumerate(file)\n",
257 | " if pattern in _.decode(encoding) or index == 0\n",
258 | " ]\n",
259 | " if len(lines) == 1: # just header\n",
260 | " return\n",
261 | " tar_copy_(lines, target, info)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 3,
267 | "metadata": {
268 | "hidden": true,
269 | "scrolled": false
270 | },
271 | "outputs": [],
272 | "source": [
273 | "# def sample(source, target, pattern):\n",
274 | "# with tarfile.open(source) as source, tarfile.open(target, 'w:gz') as target:\n",
275 | "# texts = 0\n",
276 | "# for info in log_progress(source):\n",
277 | "# if not info.isfile():\n",
278 | "# continue\n",
279 | "# name = info.name\n",
280 | "# if 'metadata' in name or 'metatable' in name:\n",
281 | "# tar_copy_meta(source, target, info, pattern)\n",
282 | "# print(name)\n",
283 | "# elif ('/tagged' in name or '/text' in name) and pattern in name:\n",
284 | "# tar_copy_text(source, target, info, 100)\n",
285 | "# print(name)\n",
286 | "# texts += 1\n",
287 | "# if texts >= 2:\n",
288 | "# break\n",
289 | "# source.members = []\n",
290 | "\n",
291 | "\n",
292 | "# FILENAMES = [\n",
293 | "# ('Arzamas.tar.gz', '101'),\n",
294 | "# ('Fontanka.tar.gz', '20070101001'),\n",
295 | "# ('Interfax.tar.gz', 'business199005'),\n",
296 | "# ('KP.tar.gz', '10@2598286'),\n",
297 | "# ('Lenta.tar.gz', '20091231boeviks'),\n",
298 | "# ('NPlus1.tar.gz', '20160915'),\n",
299 | "# ('Magazines.tar.gz', '103870'),\n",
300 | "# ('Subtitles.tar.gz', 'Pilot.HDTV.XII'),\n",
301 | "# ]\n",
302 | "\n",
303 | "# for filename, pattern in FILENAMES:\n",
304 | "# sample(source_path(filename), target_path(filename), pattern)\n",
305 | "# print(source, '->', target)"
306 | ]
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": 11,
311 | "metadata": {
312 | "hidden": true
313 | },
314 | "outputs": [],
315 | "source": [
316 | "# filename = 'social.tar.gz'\n",
317 | "# source = source_path(filename)\n",
318 | "# target = source_path(filename)\n",
319 | "\n",
320 | "# with tarfile.open(source) as source, tarfile.open(target, 'w:gz') as target:\n",
321 | "# for info in log_progress(source):\n",
322 | "# if not info.isfile():\n",
323 | "# continue\n",
324 | "# if '/text' in info.name:\n",
325 | "# tar_copy_text(source, target, info, 4)\n",
326 | "# print(info.name)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 5,
332 | "metadata": {
333 | "hidden": true
334 | },
335 | "outputs": [],
336 | "source": [
337 | "# import zipfile\n",
338 | "# from corus.zip import open_zip, list_zip, read_zip\n",
339 | "\n",
340 | "\n",
341 | "# def zip_copy_(lines, target, record):\n",
342 | "# data = '\\n'.join(lines)\n",
343 | "# target.writestr(record.name, data)\n",
344 | "\n",
345 | "\n",
346 | "# def zip_copy_text(source, target, record, count):\n",
347 | "# text = read_zip(source, record)\n",
348 | "# lines = text.splitlines()[:count]\n",
349 | "# zip_copy_(lines, target, record)\n",
350 | "\n",
351 | "\n",
352 | "# def zip_copy_meta(source, target, record, pattern):\n",
353 | "# text = read_zip(source, record)\n",
354 | "# lines = text.splitlines()\n",
355 | "# lines = [\n",
356 | "# _ for index, _\n",
357 | "# in enumerate(lines)\n",
358 | "# if pattern in _ or index == 0\n",
359 | "# ]\n",
360 | "# if len(lines) == 1: # just header\n",
361 | "# return\n",
362 | "# zip_copy_(lines, target, record)\n",
363 | "\n",
364 | "\n",
365 | "# def sample(source, target, pattern, count):\n",
366 | "# with open_zip(source) as source, zipfile.ZipFile(target, 'w') as target:\n",
367 | "# texts = 0\n",
368 | "# for record in log_progress(list_zip(source)):\n",
369 | "# if not record.uncompressed: # not a file\n",
370 | "# continue\n",
371 | "# name = record.name\n",
372 | "# if 'metatable' in name:\n",
373 | "# zip_copy_meta(source, target, record, pattern)\n",
374 | "# print(name)\n",
375 | "# elif ('/tagged' in name or '/text' in name) and pattern in name:\n",
376 | "# zip_copy_text(source, target, record, count)\n",
377 | "# print(name)\n",
378 | "# texts += 1\n",
379 | "# if texts >= 2:\n",
380 | "# break\n",
381 | "\n",
382 | "\n",
383 | "# FILENAMES = [\n",
384 | "# ('proza_ru.zip', '20151231005', 10),\n",
385 | "# ('stihi_ru.zip', '20151231001', 100)\n",
386 | "# ]\n",
387 | "\n",
388 | "# for filename, pattern, count in FILENAMES:\n",
389 | "# source = source_path(filename)\n",
390 | "# target = target_path(filename)\n",
391 | "# sample(source, target, pattern, count)\n",
392 | "# print(source, '->', target)"
393 | ]
394 | },
395 | {
396 | "cell_type": "code",
397 | "execution_count": null,
398 | "metadata": {
399 | "hidden": true
400 | },
401 | "outputs": [],
402 | "source": []
403 | }
404 | ],
405 | "metadata": {
406 | "kernelspec": {
407 | "display_name": "Python 3",
408 | "language": "python",
409 | "name": "python3"
410 | },
411 | "language_info": {
412 | "codemirror_mode": {
413 | "name": "ipython",
414 | "version": 3
415 | },
416 | "file_extension": ".py",
417 | "mimetype": "text/x-python",
418 | "name": "python",
419 | "nbconvert_exporter": "python",
420 | "pygments_lexer": "ipython3",
421 | "version": "3.5.1"
422 | }
423 | },
424 | "nbformat": 4,
425 | "nbformat_minor": 2
426 | }
427 |
--------------------------------------------------------------------------------
/corus/sources/meta.py:
--------------------------------------------------------------------------------
1 |
2 | from corus.record import Record
3 |
4 | from . import (
5 | load_mokoron,
6 | load_wiki,
7 | load_simlex,
8 | load_omnia,
9 | load_gramru,
10 | load_corpora,
11 | load_ruadrect,
12 |
13 | load_factru,
14 | load_gareev,
15 | load_lenta,
16 | load_lenta2,
17 | load_librusec,
18 | load_ne5,
19 | load_wikiner,
20 | load_bsnlp,
21 | load_persons,
22 | load_rudrec,
23 |
24 | load_taiga_arzamas,
25 | load_taiga_fontanka,
26 | load_taiga_interfax,
27 | load_taiga_kp,
28 | load_taiga_lenta,
29 | load_taiga_nplus1,
30 | load_taiga_magazines,
31 | load_taiga_subtitles,
32 | load_taiga_social,
33 | load_taiga_proza,
34 | load_taiga_stihi,
35 |
36 | load_buriy_news,
37 | load_buriy_webhose,
38 |
39 | load_ods_interfax,
40 | load_ods_gazeta,
41 | load_ods_izvestia,
42 | load_ods_meduza,
43 | load_ods_ria,
44 | load_ods_rt,
45 | load_ods_tass,
46 |
47 | load_ria_raw,
48 | load_ria,
49 |
50 | load_ud_gsd,
51 | load_ud_taiga,
52 | load_ud_pud,
53 | load_ud_syntag,
54 |
55 | load_morphoru_gicrya,
56 | load_morphoru_rnc,
57 | load_morphoru_corpora,
58 |
59 | load_russe_hj,
60 | load_russe_rt,
61 | load_russe_ae,
62 |
63 | load_toloka_lrwc,
64 | )
65 |
66 |
67 | class Meta(Record):
68 | __attributes__ = ['title', 'url',
69 | 'description', 'stats', 'instruction',
70 | 'tags', 'functions']
71 |
72 | def __init__(self, title, url=None,
73 | description=None, stats=None, instruction=(),
74 | tags=(), functions=()):
75 | self.title = title
76 | self.url = url
77 | self.description = description
78 | self.stats = stats
79 | self.instruction = instruction
80 | self.tags = tags
81 | self.functions = functions
82 |
83 |
84 | class Group(Record):
85 | __attributes__ = ['title', 'url', 'description', 'instruction', 'metas']
86 |
87 | def __init__(self, title, url=None, description=None, instruction=(), metas=()):
88 | self.title = title
89 | self.url = url
90 | self.description = description
91 | self.instruction = instruction
92 | self.metas = metas
93 |
94 |
95 | def is_group(item):
96 | return isinstance(item, Group)
97 |
98 |
99 | class Stats(Record):
100 | __attributes__ = ['bytes', 'count']
101 |
102 | def __init__(self, bytes=None, count=None):
103 | self.bytes = bytes
104 | self.count = count
105 |
106 |
107 | NER = 'ner'
108 | NEWS = 'news'
109 | FICTION = 'fiction'
110 | SOCIAL = 'social'
111 | MORPH = 'morph'
112 | SYNTAX = 'syntax'
113 | EMB = 'emb'
114 | SIM = 'sim'
115 | SENTIMENT = 'sentiment'
116 | WEB = 'web'
117 |
118 | METAS = [
119 | Group(
120 | title='Lenta.ru',
121 | url='https://github.com/yutkin/Lenta.Ru-News-Dataset',
122 | metas=[
123 | Meta(
124 | title='Lenta.ru v1.0',
125 | stats=Stats(
126 | bytes=1785632079,
127 | count=739351
128 | ),
129 | instruction=[
130 | 'wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz'
131 | ],
132 | tags=[NEWS],
133 | functions=[load_lenta]
134 | ),
135 | Meta(
136 | title='Lenta.ru v1.1+',
137 | stats=Stats(
138 | bytes=2084746431,
139 | count=800975
140 | ),
141 | instruction=[
142 | 'wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2'
143 | ],
144 | tags=[NEWS],
145 | functions=[load_lenta2]
146 | ),
147 | ]
148 | ),
149 | Meta(
150 | title='Lib.rus.ec',
151 | url='https://russe.nlpub.org/downloads/',
152 | description='Dump of lib.rus.ec prepared for RUSSE workshop',
153 | stats=Stats(
154 | count=301871,
155 | bytes=155611193945
156 | ),
157 | instruction=[
158 | 'wget http://panchenko.me/data/russe/librusec_fb2.plain.gz'
159 | ],
160 | tags=[FICTION],
161 | functions=[load_librusec]
162 | ),
163 | Meta(
164 | title='Rossiya Segodnya',
165 | url='https://github.com/RossiyaSegodnya/ria_news_dataset',
166 | stats=Stats(
167 | count=1003869,
168 | bytes=3974121040
169 | ),
170 | instruction=[
171 | 'wget https://github.com/RossiyaSegodnya/ria_news_dataset/raw/master/ria.json.gz'
172 | ],
173 | tags=[NEWS],
174 | functions=[load_ria_raw, load_ria]
175 | ),
176 | Meta(
177 | title='Mokoron Russian Twitter Corpus',
178 | url='http://study.mokoron.com/',
179 | description='Russian Twitter sentiment markup',
180 | instruction=[
181 | 'Manually download https://www.dropbox.com/s/9egqjszeicki4ho/db.sql'
182 | ],
183 | stats=Stats(
184 | count=17633417,
185 | bytes=1998559570
186 | ),
187 | tags=[SOCIAL, SENTIMENT],
188 | functions=[load_mokoron],
189 | ),
190 | Meta(
191 | title='Wikipedia',
192 | url='https://dumps.wikimedia.org/',
193 | description='Russian Wiki dump',
194 | instruction=[
195 | 'wget https://dumps.wikimedia.org/ruwiki/latest/ruwiki-latest-pages-articles.xml.bz2'
196 | ],
197 | stats=Stats(
198 | count=1541401,
199 | bytes=13895798340
200 | ),
201 | functions=[load_wiki],
202 | ),
203 | Meta(
204 | title='GramEval2020',
205 | url='https://github.com/dialogue-evaluation/GramEval2020',
206 | instruction=[
207 | 'wget https://github.com/dialogue-evaluation/GramEval2020/archive/master.zip',
208 | 'unzip master.zip',
209 | 'mv GramEval2020-master/dataTrain train',
210 | 'mv GramEval2020-master/dataOpenTest dev',
211 | 'rm -r master.zip GramEval2020-master',
212 | 'wget https://github.com/AlexeySorokin/GramEval2020/raw/master/data/GramEval_private_test.conllu'
213 | ],
214 | stats=Stats(
215 | count=162372,
216 | bytes=31503713
217 | ),
218 | functions=[load_gramru],
219 | ),
220 | Meta(
221 | title='OpenCorpora',
222 | url='http://opencorpora.org/',
223 | instruction=[
224 | 'wget http://opencorpora.org/files/export/annot/annot.opcorpora.xml.zip'
225 | ],
226 | stats=Stats(
227 | count=4030,
228 | bytes=21194932
229 | ),
230 | tags=[MORPH],
231 | functions=[load_corpora],
232 | ),
233 | Meta(
234 | title='RusVectores SimLex-965',
235 | instruction=[
236 | 'wget https://rusvectores.org/static/testsets/ru_simlex965_tagged.tsv',
237 | 'wget https://rusvectores.org/static/testsets/ru_simlex965.tsv'
238 | ],
239 | tags=[EMB, SIM],
240 | functions=[load_simlex],
241 | ),
242 | Meta(
243 | title='Omnia Russica',
244 | url='https://omnia-russica.github.io/',
245 | description='Taiga + Wiki + Araneum. Read "Even larger Russian corpus" https://events.spbu.ru/eventsContent/events/2019/corpora/corp_sborn.pdf',
246 | instruction=[
247 | 'Manually download http://bit.ly/2ZT4BY9'
248 | ],
249 | stats=Stats(
250 | bytes=525728427750
251 | ),
252 | tags=[MORPH, WEB, FICTION],
253 | functions=[load_omnia]
254 | ),
255 |
256 |
257 | ###########
258 | #
259 | # NER
260 | #
261 | ############
262 |
263 |
264 | Meta(
265 | title='factRuEval-2016',
266 | url='https://github.com/dialogue-evaluation/factRuEval-2016/',
267 | description='Manual PER, LOC, ORG markup prepared for 2016 Dialog competition',
268 | stats=Stats(
269 | count=254,
270 | bytes=992532
271 | ),
272 | instruction=[
273 | 'wget https://github.com/dialogue-evaluation/factRuEval-2016/archive/master.zip',
274 | 'unzip master.zip',
275 | 'rm master.zip'
276 | ],
277 | tags=[NER, NEWS],
278 | functions=[load_factru]
279 | ),
280 | Meta(
281 | title='Gareev',
282 | url='https://www.researchgate.net/publication/262203599_Introducing_Baselines_for_Russian_Named_Entity_Recognition',
283 | description='Manual PER, ORG markup (no LOC)',
284 | stats=Stats(
285 | count=97,
286 | bytes=465938
287 | ),
288 | instruction=[
289 | 'Email Rinat Gareev (gareev-rm@yandex.ru) ask for dataset',
290 | 'tar -xvf rus-ner-news-corpus.iob.tar.gz',
291 | 'rm rus-ner-news-corpus.iob.tar.gz'
292 | ],
293 | tags=[NER, NEWS],
294 | functions=[load_gareev]
295 | ),
296 | Meta(
297 | title='Collection5',
298 | url='http://www.labinform.ru/pub/named_entities/',
299 | description='News articles with manual PER, LOC, ORG markup',
300 | stats=Stats(
301 | count=1000,
302 | bytes=3105146
303 | ),
304 | instruction=[
305 | 'wget http://www.labinform.ru/pub/named_entities/collection5.zip',
306 | 'unzip collection5.zip',
307 | 'rm collection5.zip'
308 | ],
309 | tags=[NER, NEWS],
310 | functions=[load_ne5]
311 | ),
312 | Meta(
313 | title='WiNER',
314 | url='https://www.aclweb.org/anthology/I17-1042',
315 | description='Sentences from Wiki auto annotated with PER, LOC, ORG tags',
316 | stats=Stats(
317 | count=203287,
318 | bytes=37907651
319 | ),
320 | instruction=[
321 | 'wget https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-ru-wp3.bz2'
322 | ],
323 | tags=[NER],
324 | functions=[load_wikiner]
325 | ),
326 | Meta(
327 | title='BSNLP-2019',
328 | url='http://bsnlp.cs.helsinki.fi/shared_task.html',
329 | description='Markup prepared for 2019 BSNLP Shared Task',
330 | stats=Stats(
331 | count=464,
332 | bytes=1211300
333 | ),
334 | instruction=[
335 | 'wget http://bsnlp.cs.helsinki.fi/TRAININGDATA_BSNLP_2019_shared_task.zip',
336 | 'wget http://bsnlp.cs.helsinki.fi/TESTDATA_BSNLP_2019_shared_task.zip',
337 | 'unzip TRAININGDATA_BSNLP_2019_shared_task.zip',
338 | 'unzip TESTDATA_BSNLP_2019_shared_task.zip -d test_pl_cs_ru_bg',
339 | 'rm TRAININGDATA_BSNLP_2019_shared_task.zip TESTDATA_BSNLP_2019_shared_task.zip'
340 | ],
341 | tags=[NER],
342 | functions=[load_bsnlp]
343 | ),
344 | Meta(
345 | title='Persons-1000',
346 | url='http://ai-center.botik.ru/Airec/index.php/ru/collections/28-persons-1000',
347 | description='Same as Collection5, only PER markup + normalized names',
348 | stats=Stats(
349 | count=1000,
350 | bytes=3105146
351 | ),
352 | instruction=[
353 | 'wget http://ai-center.botik.ru/Airec/ai-resources/Persons-1000.zip'
354 | ],
355 | tags=[NER, NEWS],
356 | functions=[load_persons]
357 | ),
358 | Meta(
359 | title='The Russian Drug Reaction Corpus (RuDReC)',
360 | url='https://github.com/cimm-kzn/RuDReC',
361 | description=(
362 | 'RuDReC is a new partially annotated corpus of consumer reviews in Russian about pharmaceutical '
363 | 'products for the detection of health-related named entities and the effectiveness of pharmaceutical products. '
364 | 'Here you can download and work with the annotated part, to get the raw part (1.4M reviews) '
365 | 'please refer to https://github.com/cimm-kzn/RuDReC.'
366 | ),
367 | stats=Stats(
368 | count=4809,
369 | bytes=1773
370 | ),
371 | instruction=[
372 | 'wget https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json'
373 | ],
374 | tags=[NER],
375 | functions=[load_rudrec]
376 | ),
377 |
378 | ##########
379 | #
380 | # TAIGA
381 | #
382 | ###########
383 |
384 |
385 | Group(
386 | title='Taiga',
387 | url='https://tatianashavrina.github.io/taiga_site/',
388 |         description='Large collection of Russian texts from various sources: news sites, magazines, literature, social networks',
389 | instruction=[
390 | 'wget https://linghub.ru/static/Taiga/retagged_taiga.tar.gz',
391 | 'tar -xzvf retagged_taiga.tar.gz'
392 | ],
393 | metas=[
394 | Meta(
395 | title='Arzamas',
396 | stats=Stats(
397 | count=311,
398 | bytes=4721604
399 | ),
400 | tags=[NEWS],
401 | functions=[load_taiga_arzamas],
402 | ),
403 | Meta(
404 | title='Fontanka',
405 | stats=Stats(
406 | count=342683,
407 | bytes=824419630
408 | ),
409 | tags=[NEWS],
410 | functions=[load_taiga_fontanka],
411 | ),
412 | Meta(
413 | title='Interfax',
414 | stats=Stats(
415 | count=46429,
416 | bytes=81320006
417 | ),
418 | tags=[NEWS],
419 | functions=[load_taiga_interfax],
420 | ),
421 | Meta(
422 | title='KP',
423 | stats=Stats(
424 | count=45503,
425 | bytes=64789612
426 | ),
427 | tags=[NEWS],
428 | functions=[load_taiga_kp],
429 | ),
430 | Meta(
431 | title='Lenta',
432 | stats=Stats(
433 | count=36446,
434 | bytes=99772679
435 | ),
436 | tags=[NEWS],
437 | functions=[load_taiga_lenta],
438 | ),
439 | Meta(
440 | title='Taiga/N+1',
441 | stats=Stats(
442 | count=7696,
443 | bytes=26167631
444 | ),
445 | tags=[NEWS],
446 | functions=[load_taiga_nplus1],
447 | ),
448 | Meta(
449 | title='Magazines',
450 | stats=Stats(
451 | count=39890,
452 | bytes=2352629006
453 | ),
454 | functions=[load_taiga_magazines]
455 | ),
456 | Meta(
457 | title='Subtitles',
458 | stats=Stats(
459 | count=19011,
460 | bytes=953237022
461 | ),
462 | functions=[load_taiga_subtitles]
463 | ),
464 | Meta(
465 | title='Social',
466 | stats=Stats(
467 | count=1876442,
468 | bytes=679670941
469 | ),
470 | tags=[SOCIAL],
471 | functions=[load_taiga_social]
472 | ),
473 | Meta(
474 | title='Proza',
475 | stats=Stats(
476 | count=1732434,
477 | bytes=41067043857
478 | ),
479 | tags=[FICTION],
480 | functions=[load_taiga_proza]
481 | ),
482 | Meta(
483 | title='Stihi',
484 | stats=Stats(
485 | count=9157686,
486 | bytes=13745805334
487 | ),
488 | functions=[load_taiga_stihi]
489 | ),
490 | ]
491 | ),
492 |
493 |
494 | #############
495 | #
496 | # BURIY
497 | #
498 | ##########
499 |
500 |
501 | Group(
502 | title='Russian NLP Datasets',
503 | url='https://github.com/buriy/russian-nlp-datasets/releases',
504 | description='Several Russian news datasets from webhose.io, lenta.ru and other news sites.',
505 | metas=[
506 | Meta(
507 | title='News',
508 | description='Dump of top 40 news + 20 fashion news sites.',
509 | instruction=[
510 | 'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2014.tar.bz2',
511 | 'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part1.tar.bz2',
512 | 'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part2.tar.bz2'
513 | ],
514 | stats=Stats(
515 | count=2154801,
516 | bytes=7340672169
517 | ),
518 | tags=[NEWS],
519 | functions=[load_buriy_news],
520 | ),
521 | Meta(
522 | title='Webhose',
523 | description='Dump from webhose.io, 300 sources for one month.',
524 | instruction=[
525 | 'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/webhose-2016.tar.bz2'
526 | ],
527 | stats=Stats(
528 | count=285965,
529 | bytes=901066314
530 | ),
531 | tags=[NEWS],
532 | functions=[load_buriy_webhose],
533 | ),
534 | ]
535 | ),
536 |
537 |
538 | #############
539 | #
540 | # ODS
541 | #
542 | #########
543 |
544 |
545 | Group(
546 | title='ODS #proj_news_viz',
547 | url='https://github.com/ods-ai-ml4sg/proj_news_viz/releases/tag/data',
548 | description='Several news sites scraped by members of #proj_news_viz ODS project.',
549 | metas=[
550 | Meta(
551 | title='Interfax',
552 | instruction=[
553 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/interfax.csv.gz',
554 | ],
555 | stats=Stats(
556 | count=543961,
557 | bytes=1314462876,
558 | ),
559 | tags=[NEWS],
560 | functions=[load_ods_interfax],
561 | ),
562 | Meta(
563 | title='Gazeta',
564 | instruction=[
565 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/gazeta.csv.gz',
566 | ],
567 | stats=Stats(
568 | count=865847,
569 | bytes=1752712320
570 | ),
571 | tags=[NEWS],
572 | functions=[load_ods_gazeta],
573 | ),
574 | Meta(
575 | title='Izvestia',
576 | instruction=[
577 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/iz.csv.gz',
578 | ],
579 | stats=Stats(
580 | count=86601,
581 | bytes=322117124
582 | ),
583 | tags=[NEWS],
584 | functions=[load_ods_izvestia],
585 | ),
586 | Meta(
587 | title='Meduza',
588 | instruction=[
589 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/meduza.csv.gz',
590 | ],
591 | stats=Stats(
592 | count=71806,
593 | bytes=283233963
594 | ),
595 | tags=[NEWS],
596 | functions=[load_ods_meduza],
597 | ),
598 | Meta(
599 | title='RIA',
600 | instruction=[
601 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/ria.csv.gz',
602 | ],
603 | stats=Stats(
604 | count=101543,
605 | bytes=245236791
606 | ),
607 | tags=[NEWS],
608 | functions=[load_ods_ria],
609 | ),
610 | Meta(
611 | title='Russia Today',
612 | instruction=[
613 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/rt.csv.gz',
614 | ],
615 | stats=Stats(
616 | count=106644,
617 | bytes=196212474
618 | ),
619 | tags=[NEWS],
620 | functions=[load_ods_rt],
621 | ),
622 | Meta(
623 | title='TASS',
624 | instruction=[
625 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/tass-001.csv.gz',
626 | ],
627 | stats=Stats(
628 | count=1135635,
629 | bytes=3515136716
630 | ),
631 | tags=[NEWS],
632 | functions=[load_ods_tass],
633 | ),
634 |
635 | ]
636 | ),
637 |
638 |
639 | #############
640 | #
641 | # UD
642 | #
643 | #########
644 |
645 |
646 | Group(
647 | title='Universal Dependencies',
648 | url='https://universaldependencies.org/',
649 | metas=[
650 | Meta(
651 | title='GSD',
652 | instruction=[
653 | 'wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-dev.conllu',
654 | 'wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-test.conllu',
655 | 'wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-train.conllu'
656 | ],
657 | stats=Stats(
658 | count=5030,
659 | bytes=1059114
660 | ),
661 | tags=[MORPH, SYNTAX],
662 | functions=[load_ud_gsd],
663 | ),
664 | Meta(
665 | title='Taiga',
666 | instruction=[
667 | 'wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-dev.conllu',
668 | 'wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-test.conllu',
669 | 'wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-train.conllu'
670 | ],
671 | stats=Stats(
672 | count=3264,
673 | bytes=362293
674 | ),
675 | tags=[MORPH, SYNTAX],
676 | functions=[load_ud_taiga],
677 | ),
678 | Meta(
679 | title='PUD',
680 | instruction=[
681 | 'wget https://github.com/UniversalDependencies/UD_Russian-PUD/raw/master/ru_pud-ud-test.conllu',
682 | ],
683 | stats=Stats(
684 | count=1000,
685 | bytes=212766
686 | ),
687 | tags=[MORPH, SYNTAX],
688 | functions=[load_ud_pud],
689 | ),
690 | Meta(
691 | title='SynTagRus',
692 | instruction=[
693 | 'wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-dev.conllu',
694 | 'wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-test.conllu',
695 | 'wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-train.conllu',
696 | ],
697 | stats=Stats(
698 | count=61889,
699 | bytes=11877258
700 | ),
701 | tags=[MORPH, SYNTAX],
702 | functions=[load_ud_syntag],
703 | ),
704 | ]
705 | ),
706 |
707 |
708 | #############
709 | #
710 | # MORPHORUEVAL
711 | #
712 | #########
713 |
714 |
715 | Group(
716 | title='morphoRuEval-2017',
717 | url='https://github.com/dialogue-evaluation/morphoRuEval-2017',
718 | metas=[
719 | Meta(
720 | title='General Internet-Corpus',
721 | instruction=[
722 | 'wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/GIKRYA_texts_new.zip',
723 | 'unzip GIKRYA_texts_new.zip',
724 | 'rm GIKRYA_texts_new.zip'
725 | ],
726 | stats=Stats(
727 | count=83148,
728 | bytes=11091464
729 | ),
730 | tags=[MORPH],
731 | functions=[load_morphoru_gicrya],
732 | ),
733 | Meta(
734 | title='Russian National Corpus',
735 | instruction=[
736 | 'wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/RNC_texts.rar',
737 | 'unrar x RNC_texts.rar',
738 | 'rm RNC_texts.rar'
739 | ],
740 | stats=Stats(
741 | count=98892,
742 | bytes=13330673
743 | ),
744 | tags=[MORPH],
745 | functions=[load_morphoru_rnc],
746 | ),
747 | Meta(
748 | title='OpenCorpora',
749 | instruction=[
750 | 'wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/OpenCorpora_Texts.rar',
751 | 'unrar x OpenCorpora_Texts.rar',
752 | 'rm OpenCorpora_Texts.rar'
753 | ],
754 | stats=Stats(
755 | count=38510,
756 | bytes=5028255
757 | ),
758 | tags=[MORPH],
759 | functions=[load_morphoru_corpora],
760 | ),
761 | ]
762 | ),
763 |
764 |
765 | #############
766 | #
767 | # RUSSE SEM
768 | #
769 | #########
770 |
771 |
772 | Group(
773 | title='RUSSE Russian Semantic Relatedness',
774 | url='https://russe.nlpub.org/downloads/',
775 | metas=[
776 | Meta(
777 | title='HJ: Human Judgements of Word Pairs',
778 | instruction=[
779 | 'wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/hj.csv'
780 | ],
781 | tags=[EMB, SIM],
782 | functions=[load_russe_hj],
783 | ),
784 | Meta(
785 | title='RT: Synonyms and Hypernyms from the Thesaurus RuThes',
786 | instruction=[
787 | 'wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/rt.csv'
788 | ],
789 | tags=[EMB, SIM],
790 | functions=[load_russe_rt],
791 | ),
792 | Meta(
793 | title='AE: Cognitive Associations from the Sociation.org Experiment',
794 | instruction=[
795 | 'wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-train.csv',
796 | 'wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-test.csv',
797 | 'wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/ae2.csv'
798 | ],
799 | tags=[EMB, SIM],
800 | functions=[load_russe_ae],
801 | ),
802 | ]
803 | ),
804 |
805 |
806 | #############
807 | #
808 | # TOLOKA
809 | #
810 | #########
811 |
812 |
813 | Group(
814 | title='Toloka Datasets',
815 | url='https://toloka.yandex.ru/datasets/',
816 | metas=[
817 | Meta(
818 | title='Lexical Relations from the Wisdom of the Crowd (LRWC)',
819 | instruction=[
820 | 'wget https://tlk.s3.yandex.net/dataset/LRWC.zip',
821 | 'unzip LRWC.zip',
822 | 'rm LRWC.zip'
823 | ],
824 | tags=[EMB, SIM],
825 | functions=[load_toloka_lrwc],
826 | ),
827 | Meta(
828 | title='The Russian Adverse Drug Reaction Corpus of Tweets (RuADReCT)',
829 | url='https://github.com/cimm-kzn/RuDReC',
830 | description='This corpus was developed for the Social Media Mining for Health Applications (#SMM4H) '
831 | 'Shared Task 2020',
832 | instruction=[
833 | 'wget https://github.com/cimm-kzn/RuDReC/raw/master/data/RuADReCT.zip',
834 | 'unzip RuADReCT.zip',
835 | 'rm RuADReCT.zip'
836 | ],
837 | stats=Stats(
838 | count=9515,
839 | bytes=2190063
840 | ),
841 | tags=[SOCIAL],
842 | functions=[load_ruadrect],
843 | ),
844 | ]
845 | ),
846 | ]
847 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 
5 |
6 | Links to publicly available Russian corpora + code for loading and parsing. 20+ datasets, 350Gb+ of text.
7 |
8 | ## Usage
9 |
10 | For example, let's use the lenta.ru dump by @yutkin. Manually download the archive (the link is in the Reference section):
11 | ```bash
12 | wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
13 | ```
14 |
15 | Use `corus` to load the data:
16 |
17 | ```python
18 | >>> from corus import load_lenta
19 |
20 | >>> path = 'lenta-ru-news.csv.gz'
21 | >>> records = load_lenta(path)
22 | >>> next(records)
23 |
24 | LentaRecord(
25 | url='https://lenta.ru/news/2018/12/14/cancer/',
26 | title='Названы регионы России с\xa0самой высокой смертностью от\xa0рака',
27 | text='Вице-премьер по социальным вопросам Татьяна Голикова рассказала, в каких регионах России зафиксирована наиболее высокая смертность от рака, сооб...',
28 | topic='Россия',
29 | tags='Общество'
30 | )
31 | ```
32 |
33 | Iterate over texts:
34 |
35 | ```python
36 | >>> records = load_lenta(path)
37 | >>> for record in records:
38 | ... text = record.text
39 | ... ...
40 |
41 | ```
42 |
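Loaders are lazy iterators: records are parsed one at a time, so even multi-gigabyte dumps can be scanned without loading everything into memory. For a quick look at a dataset, slice the iterator with the standard library:

```python
>>> from itertools import islice

>>> records = load_lenta(path)
>>> for record in islice(records, 5):
...     print(record.title)

```
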
43 | For links to other datasets and their loaders see the Reference section.
44 |
45 | ## Documentation
46 |
47 | Materials are in Russian:
48 |
49 | * Corus page on natasha.github.io
50 | * Corus section of Datafest 2020 talk
51 |
52 | ## Install
53 |
54 | `corus` supports Python 3.5+, PyPy 3.
55 |
56 | ```bash
57 | $ pip install corus
58 | ```
59 |
60 | ## Reference
61 |
| Dataset | API `from corus import` | Tags | Texts | Uncompressed | Description |
|---------|-------------------------|------|-------|--------------|-------------|
| **Lenta.ru** | | | | | |
| Lenta.ru v1.0 | `load_lenta` | news | 739 351 | 1.66 Gb | `wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz` |
| Lenta.ru v1.1+ | `load_lenta2` | news | 800 975 | 1.94 Gb | `wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2` |
| Lib.rus.ec | `load_librusec` | fiction | 301 871 | 144.92 Gb | Dump of lib.rus.ec prepared for the RUSSE workshop.<br/>`wget http://panchenko.me/data/russe/librusec_fb2.plain.gz` |
| Rossiya Segodnya | `load_ria_raw`<br/>`load_ria` | news | 1 003 869 | 3.70 Gb | `wget https://github.com/RossiyaSegodnya/ria_news_dataset/raw/master/ria.json.gz` |
| Mokoron Russian Twitter Corpus | `load_mokoron` | social, sentiment | 17 633 417 | 1.86 Gb | Russian Twitter sentiment markup.<br/>Manually download https://www.dropbox.com/s/9egqjszeicki4ho/db.sql |
| Wikipedia | `load_wiki` | | 1 541 401 | 12.94 Gb | Russian Wiki dump.<br/>`wget https://dumps.wikimedia.org/ruwiki/latest/ruwiki-latest-pages-articles.xml.bz2` |
| GramEval2020 | `load_gramru` | | 162 372 | 30.04 Mb | `wget https://github.com/dialogue-evaluation/GramEval2020/archive/master.zip`<br/>`unzip master.zip`<br/>`mv GramEval2020-master/dataTrain train`<br/>`mv GramEval2020-master/dataOpenTest dev`<br/>`rm -r master.zip GramEval2020-master`<br/>`wget https://github.com/AlexeySorokin/GramEval2020/raw/master/data/GramEval_private_test.conllu` |
| OpenCorpora | `load_corpora` | morph | 4 030 | 20.21 Mb | `wget http://opencorpora.org/files/export/annot/annot.opcorpora.xml.zip` |
| RusVectores SimLex-965 | `load_simlex` | emb, sim | | | `wget https://rusvectores.org/static/testsets/ru_simlex965_tagged.tsv`<br/>`wget https://rusvectores.org/static/testsets/ru_simlex965.tsv` |
| Omnia Russica | `load_omnia` | morph, web, fiction | | 489.62 Gb | Taiga + Wiki + Araneum. Read "Even larger Russian corpus": https://events.spbu.ru/eventsContent/events/2019/corpora/corp_sborn.pdf<br/>Manually download http://bit.ly/2ZT4BY9 |
| factRuEval-2016 | `load_factru` | ner, news | 254 | 969.27 Kb | Manual PER, LOC, ORG markup prepared for the 2016 Dialog competition.<br/>`wget https://github.com/dialogue-evaluation/factRuEval-2016/archive/master.zip`<br/>`unzip master.zip`<br/>`rm master.zip` |
| Gareev | `load_gareev` | ner, news | 97 | 455.02 Kb | Manual PER, ORG markup (no LOC).<br/>Email Rinat Gareev (gareev-rm@yandex.ru) to ask for the dataset.<br/>`tar -xvf rus-ner-news-corpus.iob.tar.gz`<br/>`rm rus-ner-news-corpus.iob.tar.gz` |
| Collection5 | `load_ne5` | ner, news | 1 000 | 2.96 Mb | News articles with manual PER, LOC, ORG markup.<br/>`wget http://www.labinform.ru/pub/named_entities/collection5.zip`<br/>`unzip collection5.zip`<br/>`rm collection5.zip` |
| WiNER | `load_wikiner` | ner | 203 287 | 36.15 Mb | Sentences from Wiki automatically annotated with PER, LOC, ORG tags.<br/>`wget https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-ru-wp3.bz2` |
| BSNLP-2019 | `load_bsnlp` | ner | 464 | 1.16 Mb | Markup prepared for the 2019 BSNLP Shared Task.<br/>`wget http://bsnlp.cs.helsinki.fi/TRAININGDATA_BSNLP_2019_shared_task.zip`<br/>`wget http://bsnlp.cs.helsinki.fi/TESTDATA_BSNLP_2019_shared_task.zip`<br/>`unzip TRAININGDATA_BSNLP_2019_shared_task.zip`<br/>`unzip TESTDATA_BSNLP_2019_shared_task.zip -d test_pl_cs_ru_bg`<br/>`rm TRAININGDATA_BSNLP_2019_shared_task.zip TESTDATA_BSNLP_2019_shared_task.zip` |
| Persons-1000 | `load_persons` | ner, news | 1 000 | 2.96 Mb | Same as Collection5, only PER markup + normalized names.<br/>`wget http://ai-center.botik.ru/Airec/ai-resources/Persons-1000.zip` |
| The Russian Drug Reaction Corpus (RuDReC) | `load_rudrec` | ner | 4 809 | 1.73 Kb | Partially annotated corpus of consumer reviews in Russian about pharmaceutical products, annotated for health-related named entities and drug effectiveness. The loader covers the annotated part; for the raw part (1.4M reviews) see https://github.com/cimm-kzn/RuDReC.<br/>`wget https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json` |
| **Taiga** | | | | | Large collection of Russian texts from various sources: news sites, magazines, literature, social networks.<br/>`wget https://linghub.ru/static/Taiga/retagged_taiga.tar.gz`<br/>`tar -xzvf retagged_taiga.tar.gz` |
| Arzamas | `load_taiga_arzamas` | news | 311 | 4.50 Mb | |
| Fontanka | `load_taiga_fontanka` | news | 342 683 | 786.23 Mb | |
| Interfax | `load_taiga_interfax` | news | 46 429 | 77.55 Mb | |
| KP | `load_taiga_kp` | news | 45 503 | 61.79 Mb | |
| Lenta | `load_taiga_lenta` | news | 36 446 | 95.15 Mb | |
| N+1 | `load_taiga_nplus1` | news | 7 696 | 24.96 Mb | |
| Magazines | `load_taiga_magazines` | | 39 890 | 2.19 Gb | |
| Subtitles | `load_taiga_subtitles` | | 19 011 | 909.08 Mb | |
| Social | `load_taiga_social` | social | 1 876 442 | 648.18 Mb | |
| Proza | `load_taiga_proza` | fiction | 1 732 434 | 38.25 Gb | |
| Stihi | `load_taiga_stihi` | | 9 157 686 | 12.80 Gb | |
| **Russian NLP Datasets** | | | | | Several Russian news datasets from webhose.io, lenta.ru and other news sites. |
| News | `load_buriy_news` | news | 2 154 801 | 6.84 Gb | Dump of top 40 news + 20 fashion news sites.<br/>`wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2014.tar.bz2`<br/>`wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part1.tar.bz2`<br/>`wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part2.tar.bz2` |
| Webhose | `load_buriy_webhose` | news | 285 965 | 859.32 Mb | Dump from webhose.io, 300 sources for one month.<br/>`wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/webhose-2016.tar.bz2` |
| **ODS #proj_news_viz** | | | | | Several news sites scraped by members of the #proj_news_viz ODS project. |
| Interfax | `load_ods_interfax` | news | 543 961 | 1.22 Gb | `wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/interfax.csv.gz` |
| Gazeta | `load_ods_gazeta` | news | 865 847 | 1.63 Gb | `wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/gazeta.csv.gz` |
| Izvestia | `load_ods_izvestia` | news | 86 601 | 307.19 Mb | `wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/iz.csv.gz` |
| Meduza | `load_ods_meduza` | news | 71 806 | 270.11 Mb | `wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/meduza.csv.gz` |
| RIA | `load_ods_ria` | news | 101 543 | 233.88 Mb | `wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/ria.csv.gz` |
| Russia Today | `load_ods_rt` | news | 106 644 | 187.12 Mb | `wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/rt.csv.gz` |
| TASS | `load_ods_tass` | news | 1 135 635 | 3.27 Gb | `wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/tass-001.csv.gz` |
| **Universal Dependencies** | | | | | |
| GSD | `load_ud_gsd` | morph, syntax | 5 030 | 1.01 Mb | `wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-dev.conllu`<br/>`wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-test.conllu`<br/>`wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-train.conllu` |
| Taiga | `load_ud_taiga` | morph, syntax | 3 264 | 353.80 Kb | `wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-dev.conllu`<br/>`wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-test.conllu`<br/>`wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-train.conllu` |
| PUD | `load_ud_pud` | morph, syntax | 1 000 | 207.78 Kb | `wget https://github.com/UniversalDependencies/UD_Russian-PUD/raw/master/ru_pud-ud-test.conllu` |
| SynTagRus | `load_ud_syntag` | morph, syntax | 61 889 | 11.33 Mb | `wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-dev.conllu`<br/>`wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-test.conllu`<br/>`wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-train.conllu` |
| **morphoRuEval-2017** | | | | | |
| General Internet-Corpus | `load_morphoru_gicrya` | morph | 83 148 | 10.58 Mb | `wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/GIKRYA_texts_new.zip`<br/>`unzip GIKRYA_texts_new.zip`<br/>`rm GIKRYA_texts_new.zip` |
| Russian National Corpus | `load_morphoru_rnc` | morph | 98 892 | 12.71 Mb | `wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/RNC_texts.rar`<br/>`unrar x RNC_texts.rar`<br/>`rm RNC_texts.rar` |
| OpenCorpora | `load_morphoru_corpora` | morph | 38 510 | 4.80 Mb | `wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/OpenCorpora_Texts.rar`<br/>`unrar x OpenCorpora_Texts.rar`<br/>`rm OpenCorpora_Texts.rar` |
| **RUSSE Russian Semantic Relatedness** | | | | | |
| HJ: Human Judgements of Word Pairs | `load_russe_hj` | emb, sim | | | `wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/hj.csv` |
| RT: Synonyms and Hypernyms from the Thesaurus RuThes | `load_russe_rt` | emb, sim | | | `wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/rt.csv` |
| AE: Cognitive Associations from the Sociation.org Experiment | `load_russe_ae` | emb, sim | | | `wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-train.csv`<br/>`wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-test.csv`<br/>`wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/ae2.csv` |
| **Toloka Datasets** | | | | | |
| Lexical Relations from the Wisdom of the Crowd (LRWC) | `load_toloka_lrwc` | emb, sim | | | `wget https://tlk.s3.yandex.net/dataset/LRWC.zip`<br/>`unzip LRWC.zip`<br/>`rm LRWC.zip` |
| The Russian Adverse Drug Reaction Corpus of Tweets (RuADReCT) | `load_ruadrect` | social | 9 515 | 2.09 Mb | This corpus was developed for the Social Media Mining for Health Applications (#SMM4H) Shared Task 2020.<br/>`wget https://github.com/cimm-kzn/RuDReC/raw/master/data/RuADReCT.zip`<br/>`unzip RuADReCT.zip`<br/>`rm RuADReCT.zip` |
1322 |
1323 |
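Most loaders follow the same pattern as `load_lenta` in the Usage section: download the data with the commands from the Description column, pass the path of the downloaded file to the loader and iterate over records. A sketch for one of the Taiga subsets (the archive path here is an assumption and depends on where `retagged_taiga.tar.gz` was unpacked):

```python
>>> from corus import load_taiga_proza

>>> records = load_taiga_proza('taiga/proza_ru.zip')
>>> next(records)
```
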
1324 | ## Support
1325 |
1326 | - Chat — https://t.me/natural_language_processing
1327 | - Issues — https://github.com/natasha/corus/issues
1328 | - Commercial support — https://lab.alexkuk.ru
1329 |
1330 | ## Add new source
1331 |
1332 | 1. Implement `corus/sources/<source>.py` (see the sketch below)
1333 | 2. Add an import into `corus/sources/__init__.py`
1334 | 3. Add a meta entry into `corus/sources/meta.py`
1335 | 4. Add an example into `docs.ipynb` (check that the meta table is correct)
1336 | 5. Run tests (check that the readme is updated)
1337 |
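Each source module boils down to a `load_<source>(path)` function that reads the downloaded file and yields record objects (step 1 above). Below is a minimal, self-contained sketch of such a module; the names are hypothetical and the format is a toy one — real modules reuse the shared helpers from `corus/record.py` and `corus/io.py`, so follow an existing source for the actual conventions:

```python
# corus/sources/mysource.py -- hypothetical example, not an actual corus module

import gzip


class MySourceRecord(object):
    def __init__(self, id, text):
        self.id = id
        self.text = text


def load_mysource(path):
    # assumes a toy format: one "id<TAB>text" record per line in a .gz file
    with gzip.open(path, 'rt', encoding='utf8') as file:
        for line in file:
            id, text = line.rstrip('\n').split('\t', 1)
            yield MySourceRecord(id, text)
```

Step 2 is then a one-line import in `corus/sources/__init__.py`, and step 3 registers a `Meta(...)` entry (title, stats, tags, functions) alongside the ones in `corus/sources/meta.py`.
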
1338 | ## Development
1339 |
1340 | Dev env
1341 |
1342 | ```bash
1343 | python -m venv ~/.venvs/natasha-corus
1344 | source ~/.venvs/natasha-corus/bin/activate
1345 |
1346 | pip install -r requirements/dev.txt
1347 | pip install -e .
1348 |
1349 | python -m ipykernel install --user --name natasha-corus
1350 | ```
1351 |
1352 | Lint + update docs
1353 |
1354 | ```bash
1355 | make lint
1356 | make exec-docs
1357 | ```
1358 |
1359 | Release
1360 |
1361 | ```bash
1362 | # Update setup.py version
1363 |
1364 | git commit -am 'Up version'
1365 | git tag v0.10.0
1366 |
1367 | git push
1368 | git push --tags
1369 | ```
1370 |
--------------------------------------------------------------------------------