├── corus ├── third │ └── __init__.py ├── __init__.py ├── sources │ ├── gramru.py │ ├── taiga │ │ ├── __init__.py │ │ ├── subtitles.py │ │ ├── kp.py │ │ ├── interfax.py │ │ ├── lenta.py │ │ ├── magazines.py │ │ ├── nplus1.py │ │ ├── arzamas.py │ │ ├── social.py │ │ ├── proza.py │ │ ├── fontanka.py │ │ └── common.py │ ├── simlex.py │ ├── librusec.py │ ├── __init__.py │ ├── wikiner.py │ ├── lenta.py │ ├── gareev.py │ ├── wiki.py │ ├── russe.py │ ├── rudrec.py │ ├── ne5.py │ ├── persons.py │ ├── ria.py │ ├── buriy.py │ ├── toloka.py │ ├── morphoru.py │ ├── mokoron.py │ ├── ud.py │ ├── ods.py │ ├── omnia.py │ ├── corpora.py │ ├── bsnlp.py │ ├── factru.py │ └── meta.py ├── path.py ├── record.py ├── zip.py ├── io.py └── readme.py ├── requirements ├── dev.txt └── ci.txt ├── data ├── ria.json.gz ├── ods │ ├── iz.csv.gz │ ├── ria.csv.gz │ ├── rt.csv.gz │ ├── gazeta.csv.gz │ ├── meduza.csv.gz │ ├── interfax.csv.gz │ └── tass-001.csv.gz ├── Persons-1000.zip ├── ru_om1000a.x1_.xz ├── taiga │ ├── KP.tar.gz │ ├── Lenta.tar.gz │ ├── NPlus1.tar.gz │ ├── proza_ru.zip │ ├── social.tar.gz │ ├── stihi_ru.zip │ ├── Arzamas.tar.gz │ ├── Fontanka.tar.gz │ ├── Interfax.tar.gz │ ├── Magazines.tar.gz │ └── Subtitles.tar.gz ├── buriy │ ├── lenta.tar.bz2 │ ├── webhose-2016.tar.bz2 │ ├── news-articles-2014.tar.bz2 │ ├── news-articles-2015-part1.tar.bz2 │ └── news-articles-2015-part2.tar.bz2 ├── aij-wikiner-ru-wp3.bz2 ├── lenta-ru-news.csv.bz2 ├── lenta-ru-news.csv.gz ├── librusec_fb2.plain.gz ├── annot.opcorpora.xml.byfile.zip ├── ruwiki-latest-pages-articles.xml.bz2 ├── russe │ └── sem │ │ ├── ae2.csv │ │ ├── rt.csv │ │ ├── hj.csv │ │ └── ae-train.csv ├── simlex │ └── ru_simlex965_tagged.tsv ├── factRuEval-2016-master │ └── devset │ │ ├── book_58.coref │ │ ├── book_58.objects │ │ ├── book_58.txt │ │ ├── book_58.facts │ │ ├── book_58.tokens │ │ └── book_58.spans ├── toloka │ ├── ruadrect │ │ └── task2_ru_test.tsv │ └── lrwc-1.1-aggregated.tsv ├── rus-ner-news-corpus.iob │ └── biztass-1.txt.iob ├── Collection5 │ ├── 001.ann │ └── 001.txt ├── morphoru │ ├── gikrya_new_test.out │ ├── unamb_sent_14_6.conllu │ └── RNCgoldInUD_Morpho.conll ├── rudrec │ └── rudrec_annotated.json ├── bsnlp │ └── test_pl_cs_ru_bg │ │ ├── annotated │ │ └── nord_stream │ │ │ └── ru │ │ │ ├── Nord_Stream_2_extra.xml_file_1.out │ │ │ └── Nord_Stream_2_extra.xml_file_7.out │ │ └── raw │ │ └── nord_stream │ │ └── ru │ │ ├── Nord_Stream_2_extra.xml_file_7.txt │ │ └── Nord_Stream_2_extra.xml_file_1.txt ├── ud │ ├── ru_taiga-ud-dev.conllu │ ├── ru_gsd-ud-dev.conllu │ ├── ru_syntagrus-ud-dev.conllu │ └── ru_pud-ud-test.conllu ├── gramru │ └── GramEval_private_test.conllu ├── mokoron │ └── db.sql └── sample.ipynb ├── setup.cfg ├── .gitignore ├── Makefile ├── .github └── workflows │ ├── pypi.yml │ └── test.yml ├── setup.py ├── LICENSE └── README.md /corus/third/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /corus/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .sources import * # noqa 3 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | ipykernel 3 | nbconvert 4 | -------------------------------------------------------------------------------- /requirements/ci.txt: 
-------------------------------------------------------------------------------- 1 | flake8==5.0.4 2 | jupyter==1.0.0 3 | nbconvert==7.2.8 4 | -------------------------------------------------------------------------------- /data/ria.json.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ria.json.gz -------------------------------------------------------------------------------- /data/ods/iz.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/iz.csv.gz -------------------------------------------------------------------------------- /data/ods/ria.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/ria.csv.gz -------------------------------------------------------------------------------- /data/ods/rt.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/rt.csv.gz -------------------------------------------------------------------------------- /data/Persons-1000.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/Persons-1000.zip -------------------------------------------------------------------------------- /data/ods/gazeta.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/gazeta.csv.gz -------------------------------------------------------------------------------- /data/ods/meduza.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/meduza.csv.gz -------------------------------------------------------------------------------- /data/ru_om1000a.x1_.xz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ru_om1000a.x1_.xz -------------------------------------------------------------------------------- /data/taiga/KP.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/KP.tar.gz -------------------------------------------------------------------------------- /data/buriy/lenta.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/lenta.tar.bz2 -------------------------------------------------------------------------------- /data/ods/interfax.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/interfax.csv.gz -------------------------------------------------------------------------------- /data/ods/tass-001.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ods/tass-001.csv.gz -------------------------------------------------------------------------------- /data/taiga/Lenta.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Lenta.tar.gz 
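A minimal usage sketch for the fixture archives listed above, assuming they sit at the listed paths relative to the repository root; the loader names and signatures are the ones defined in corus/sources (kp.py and persons.py appear later in this dump), and the snippet is illustrative rather than part of the repository:

from corus import load_taiga_kp_metas, load_taiga_kp, load_persons

# Taiga KP: load per-article metadata first, then merge it into the text records
metas = load_taiga_kp_metas('data/taiga/KP.tar.gz')
records = load_taiga_kp('data/taiga/KP.tar.gz', metas=metas)
print(next(iter(records)))

# Persons-1000: zip archive with text.txt + anno.markup.xml per document
markups = load_persons('data/Persons-1000.zip')
print(next(iter(markups)))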
-------------------------------------------------------------------------------- /data/taiga/NPlus1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/NPlus1.tar.gz -------------------------------------------------------------------------------- /data/taiga/proza_ru.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/proza_ru.zip -------------------------------------------------------------------------------- /data/taiga/social.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/social.tar.gz -------------------------------------------------------------------------------- /data/taiga/stihi_ru.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/stihi_ru.zip -------------------------------------------------------------------------------- /data/aij-wikiner-ru-wp3.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/aij-wikiner-ru-wp3.bz2 -------------------------------------------------------------------------------- /data/lenta-ru-news.csv.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/lenta-ru-news.csv.bz2 -------------------------------------------------------------------------------- /data/lenta-ru-news.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/lenta-ru-news.csv.gz -------------------------------------------------------------------------------- /data/librusec_fb2.plain.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/librusec_fb2.plain.gz -------------------------------------------------------------------------------- /data/taiga/Arzamas.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Arzamas.tar.gz -------------------------------------------------------------------------------- /data/taiga/Fontanka.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Fontanka.tar.gz -------------------------------------------------------------------------------- /data/taiga/Interfax.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Interfax.tar.gz -------------------------------------------------------------------------------- /data/taiga/Magazines.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Magazines.tar.gz -------------------------------------------------------------------------------- /data/taiga/Subtitles.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/taiga/Subtitles.tar.gz 
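A similar hedged sketch for the single-file corpora linked above (lenta-ru-news.csv.gz, aij-wikiner-ru-wp3.bz2, librusec_fb2.plain.gz), again assuming the paths shown in the tree; the loaders are defined in lenta.py, wikiner.py and librusec.py later in this dump and all of them return lazy generators:

from corus import load_lenta, load_wikiner, load_librusec

# Lenta.ru news: gzip-compressed CSV, yields LentaRecord objects
for record in load_lenta('data/lenta-ru-news.csv.gz'):
    print(record.title)
    break

# WikiNER: bz2-compressed "token|POS|tag" markup, one sentence per line
sent = next(load_wikiner('data/aij-wikiner-ru-wp3.bz2'))

# Librusec: gzip-compressed plain text, one LibrusecRecord per book
book = next(load_librusec('data/librusec_fb2.plain.gz'))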
-------------------------------------------------------------------------------- /data/buriy/webhose-2016.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/webhose-2016.tar.bz2 -------------------------------------------------------------------------------- /corus/sources/gramru.py: -------------------------------------------------------------------------------- 1 | 2 | from .ud import load_ud 3 | 4 | 5 | def load_gramru(path): 6 | return load_ud(path) 7 | -------------------------------------------------------------------------------- /data/annot.opcorpora.xml.byfile.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/annot.opcorpora.xml.byfile.zip -------------------------------------------------------------------------------- /data/buriy/news-articles-2014.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/news-articles-2014.tar.bz2 -------------------------------------------------------------------------------- /data/ruwiki-latest-pages-articles.xml.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/ruwiki-latest-pages-articles.xml.bz2 -------------------------------------------------------------------------------- /data/buriy/news-articles-2015-part1.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/news-articles-2015-part1.tar.bz2 -------------------------------------------------------------------------------- /data/buriy/news-articles-2015-part2.tar.bz2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/natasha/corus/HEAD/data/buriy/news-articles-2015-part2.tar.bz2 -------------------------------------------------------------------------------- /data/russe/sem/ae2.csv: -------------------------------------------------------------------------------- 1 | word1,word2,sim 2 | абажур,торшер,1 3 | абажур,люстра,1 4 | абажур,лампа,1 5 | абажур,свет,1 6 | абажур,ночник,1 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | 2 | [flake8] 3 | # E501 line too long 4 | # W503 line break before binary op 5 | extend-ignore = E501,W503 6 | exclude = corus/third -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .cache 2 | .coverage 3 | coverage.xml 4 | .pytest_cache 5 | .ipynb_checkpoints 6 | .DS_Store 7 | *.pyc 8 | *.egg-info 9 | build 10 | dist 11 | notes 12 | -------------------------------------------------------------------------------- /data/russe/sem/rt.csv: -------------------------------------------------------------------------------- 1 | word1,word2,sim 2 | аберрация,год,0 3 | аберрация,человек,0 4 | аберрация,заблуждение,1 5 | абзац,отрывок,1 6 | абзац,время,0 7 | абзац,район,0 8 | абиссиния,население,0 9 | -------------------------------------------------------------------------------- /data/russe/sem/hj.csv: 
-------------------------------------------------------------------------------- 1 | word1,word2,sim 2 | автомобиль,машина,0.958333 3 | маг,волшебник,0.958333 4 | доллар,бакс,0.952381 5 | мальчик,парень,0.952381 6 | машина,автомобиль,0.952381 7 | кладбище,погост,0.916667 8 | -------------------------------------------------------------------------------- /corus/path.py: -------------------------------------------------------------------------------- 1 | 2 | from os import listdir as list_dir # noqa 3 | from os.path import join as join_path # noqa 4 | from os.path import basename as get_filename # noqa 5 | from os.path import splitext as split_ext # noqa 6 | -------------------------------------------------------------------------------- /data/russe/sem/ae-train.csv: -------------------------------------------------------------------------------- 1 | word1,word2,related,sim 2 | автомат,калашникова,assoc,1 3 | автомат,пулемет,assoc,1 4 | автомат,пистолет,assoc,1 5 | автомат,война,assoc,1 6 | автомат,газ. вода,assoc,1 7 | автомат,год,random,0 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | lint: 3 | flake8 corus 4 | 5 | exec-docs: 6 | python -m nbconvert \ 7 | --ExecutePreprocessor.kernel_name=python3 \ 8 | --ClearMetadataPreprocessor.enabled=True \ 9 | --execute --to notebook --inplace \ 10 | docs.ipynb 11 | -------------------------------------------------------------------------------- /data/simlex/ru_simlex965_tagged.tsv: -------------------------------------------------------------------------------- 1 | # Word1 Word2 Average Score 2 | авария_NOUN бедствие_NOUN 6.15 3 | август_NOUN месяц_NOUN 2.85 4 | авиация_NOUN полет_NOUN 6.77 5 | эксцентричный_ADJ странный_ADJ 6.31 6 | эластичный_ADJ гибкий_ADJ 7.92 7 | элегантность_NOUN стиль_NOUN 6.46 8 | -------------------------------------------------------------------------------- /data/factRuEval-2016-master/devset/book_58.coref: -------------------------------------------------------------------------------- 1 | 2 16967 16972 2 | name Италия 3 | 4 | 3 16968 16970 16974 5 | name Грузия 6 | 7 | 4 16975 8 | name МИД Грузии 9 | 10 | 5 16969 11 | firstname Виторио 12 | lastname Сандали 13 | 14 | 6 16971 15 | firstname Александр 16 | lastname Налбандов 17 | 18 | -------------------------------------------------------------------------------- /data/toloka/ruadrect/task2_ru_test.tsv: -------------------------------------------------------------------------------- 1 | tweet_id tweet label 2 | 892079521922416641 @A_Kapustin запретить на хрен.. недосмотр однако.. только прозак, только хардкор 0 3 | 1089927935031676929 не тратьте деньги на образование, тратьте на транквилизаторы: какая разница какие у тебя оценки когда ты залипла под ксанаксом? 
0 4 | -------------------------------------------------------------------------------- /data/factRuEval-2016-master/devset/book_58.objects: -------------------------------------------------------------------------------- 1 | 16972 LocOrg 32962 # Италии 2 | 16975 Org 32963 32965 # миде Грузии 3 | 16974 LocOrg 32965 # Грузии 4 | 16967 LocOrg 32951 # Италии 5 | 16968 LocOrg 32952 # Грузии 6 | 16969 Person 32953 32954 # Виторио Сандали 7 | 16970 LocOrg 32955 # Грузии 8 | 16971 Person 32956 32957 # Александром Налбандовым 9 | -------------------------------------------------------------------------------- /data/factRuEval-2016-master/devset/book_58.txt: -------------------------------------------------------------------------------- 1 | Встреча с послом Италии в миде Грузии 2 | 3 | По инициативе итальянской стороны чрезвычайный и полномочный посол Италии в Грузии Виторио Сандали встретился с заместителем министра иностранных дел Грузии Александром Налбандовым. Предметом обсуждения стали вопросы сотрудничества в международных организациях. 4 | -------------------------------------------------------------------------------- /corus/sources/taiga/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .arzamas import * # noqa 3 | from .fontanka import * # noqa 4 | from .interfax import * # noqa 5 | from .kp import * # noqa 6 | from .lenta import * # noqa 7 | from .magazines import * # noqa 8 | from .nplus1 import * # noqa 9 | from .subtitles import * # noqa 10 | from .social import * # noqa 11 | from .proza import * # noqa 12 | -------------------------------------------------------------------------------- /data/factRuEval-2016-master/devset/book_58.facts: -------------------------------------------------------------------------------- 1 | 58-0 Meeting 2 | Participant obj5 Сандали Виторио 3 | Participant obj6 Налбандов Александр 4 | 5 | 58-1 Occupation 6 | Who obj5 Сандали Виторио 7 | Where obj2 Италия 8 | Position span32958 чрезвычайный и полномочный посол | span64007 чрезвычайный и полномочный посол Италии в Грузии 9 | 10 | 58-2 Occupation 11 | Who obj6 Налбандов Александр 12 | Position span32959 заместителем министра иностранных дел 13 | Where obj3 Грузия 14 | -------------------------------------------------------------------------------- /data/toloka/lrwc-1.1-aggregated.tsv: -------------------------------------------------------------------------------- 1 | INPUT:hyponym INPUT:hypernym INPUT:genitive OUTPUT:judgement CONFIDENCE:judgement 2 | автомобиль автомашина автомашины true 99.75% 3 | автомобиль автомототранспорт автомототранспорта true 99.96% 4 | автомобиль автомототранспортный автомототранспортного true 99.99% 5 | автомобиль автомототранспортное_средство автомототранспортного_средства true 99.99% 6 | автомобиль внедорожник внедорожника false 61.28% 7 | автомобиль железный_конь железного_коня false 77.76% 8 | -------------------------------------------------------------------------------- /data/rus-ner-news-corpus.iob/biztass-1.txt.iob: -------------------------------------------------------------------------------- 1 | МОСКВА O 2 | , O 3 | 21 O 4 | июня O 5 | . O 6 | / O 7 | БИЗНЕС-ТАСС B-ORG 8 | / O 9 | . O 10 | Группа O 11 | НЛМК B-ORG 12 | заняла O 13 | второе O 14 | место O 15 | в O 16 | рейтинге O 17 | 35 O 18 | наиболее O 19 | конкурентоспособных O 20 | сталелитейных O 21 | компаний O 22 | мира O 23 | . 
O 24 | Рейтинг O 25 | составлялся O 26 | World B-ORG 27 | Steel I-ORG 28 | Dynamics I-ORG 29 | , O 30 | ведущей O 31 | международной O 32 | исследовательской O 33 | компанией O 34 | , O 35 | на O 36 | основе O 37 | оценки O 38 | 23 O 39 | -------------------------------------------------------------------------------- /data/Collection5/001.ann: -------------------------------------------------------------------------------- 1 | T1 GEOPOLIT 0 6 Россия 2 | T2 GEOPOLIT 50 53 США 3 | T3 GEOPOLIT 57 63 Грузию 4 | T4 LOC 87 93 МОСКВА 5 | T5 MEDIA 103 114 РИА Новости 6 | T6 GEOPOLIT 116 122 Россия 7 | T7 GEOPOLIT 141 144 США 8 | T8 GEOPOLIT 161 168 Тбилиси 9 | T9 GEOPOLIT 301 307 России 10 | T10 PER 308 324 Григорий Карасин 11 | T11 GEOPOLIT 383 386 США 12 | T12 PER 387 402 Дэниэлом Фридом 13 | T13 GEOPOLIT 505 517 Южной Осетии 14 | T14 GEOPOLIT 703 709 Россия 15 | T15 GEOPOLIT 723 730 Тбилиси 16 | T16 GEOPOLIT 815 825 Вашингтона 17 | T17 ORG 838 841 МИД 18 | T18 GEOPOLIT 842 848 России 19 | -------------------------------------------------------------------------------- /data/morphoru/gikrya_new_test.out: -------------------------------------------------------------------------------- 1 | 1 А а CONJ _ 2 | 2 потом потом ADV Degree=Pos 3 | 3 опять опять ADV Degree=Pos 4 | 4 появлялись появляться VERB Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid 5 | 5 эсэсовцы эсэсовец NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Plur 6 | 6 . . PUNCT _ 7 | 8 | 1 Вокруг вокруг ADP _ 9 | 2 него он PRON Case=Gen|Gender=Masc|Number=Sing|Person=3 10 | 3 вспыхнул вспыхнуть VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 11 | 4 зеленый зелёный ADJ Case=Nom|Degree=Pos|Gender=Masc|Number=Sing 12 | 5 свет свет NOUN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 13 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish PyPi 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.10' 19 | 20 | - name: Install dependencies 21 | run: pip install wheel 22 | 23 | - name: Build package 24 | run: python setup.py sdist bdist_wheel 25 | 26 | - name: Publish PyPI 27 | uses: pypa/gh-action-pypi-publish@release/v1 28 | with: 29 | password: ${{ secrets.PYPI_API_TOKEN }} 30 | 31 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: ['3.8', '3.9', '3.10', '3.11'] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | - name: Set up Python 17 | uses: actions/setup-python@v4 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | 21 | - name: Install dependencies 22 | run: | 23 | pip install -r requirements/ci.txt 24 | pip install -e . 
25 | 26 | - name: Test 27 | run: | 28 | make lint 29 | make exec-docs 30 | -------------------------------------------------------------------------------- /corus/sources/simlex.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import ( 4 | load_lines, 5 | parse_tsv, 6 | skip_header 7 | ) 8 | 9 | 10 | class SimlexRecord(Record): 11 | __attributes__ = ['word1', 'word2', 'score'] 12 | 13 | def __init__(self, word1, word2, score): 14 | self.word1 = word1 15 | self.word2 = word2 16 | self.score = score 17 | 18 | 19 | def parse_simlex(lines): 20 | skip_header(lines) 21 | records = parse_tsv(lines) 22 | for word1, word2, score in records: 23 | score = float(score) 24 | yield SimlexRecord(word1, word2, score) 25 | 26 | 27 | def load_simlex(path): 28 | lines = load_lines(path) 29 | return parse_simlex(lines) 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | from setuptools import setup, find_packages 3 | 4 | 5 | with open('README.md') as file: 6 | description = file.read() 7 | 8 | 9 | setup( 10 | name='corus', 11 | version='0.10.0', 12 | description='Links to russian corpora, functions for loading and parsing', 13 | long_description=description, 14 | long_description_content_type='text/markdown', 15 | url='https://github.com/natasha/corus', 16 | author='Alexander Kukushkin', 17 | author_email='alex@alexkuk.ru', 18 | license='MIT', 19 | classifiers=[ 20 | 'License :: OSI Approved :: MIT License', 21 | 'Programming Language :: Python :: 3', 22 | ], 23 | keywords='corpora, russian, nlp, datasets', 24 | install_requires=[], 25 | packages=find_packages(), 26 | ) 27 | 28 | -------------------------------------------------------------------------------- /data/Collection5/001.txt: -------------------------------------------------------------------------------- 1 | Россия рассчитывает на конструктивное воздействие США на Грузию 2 | 3 | 04/08/2008 12:08 4 | 5 | МОСКВА, 4 авг - РИА Новости. Россия рассчитывает, что США воздействуют на Тбилиси в связи с обострением ситуации в зоне грузино-осетинского конфликта. Об этом статс-секретарь - заместитель министра иностранных дел России Григорий Карасин заявил в телефонном разговоре с заместителем госсекретаря США Дэниэлом Фридом. 6 | 7 | "С российской стороны выражена глубокая озабоченность в связи с новым витком напряженности вокруг Южной Осетии, противозаконными действиями грузинской стороны по наращиванию своих вооруженных сил в регионе, бесконтрольным строительством фортификационных сооружений", - говорится в сообщении. 8 | 9 | "Россия уже призвала Тбилиси к ответственной линии и рассчитывает также на конструктивное воздействие со стороны Вашингтона", - сообщил МИД России. -------------------------------------------------------------------------------- /data/rudrec/rudrec_annotated.json: -------------------------------------------------------------------------------- 1 | {"file_name": "172744.tsv", "text": "нам прописали, так мой ребенок сыпью покрылся, глаза опухли, сверху и снизу на веках высыпала сыпь, ( 8 месяцев сыну)А от виферона такого не было... 
У кого ещё такие побочки, отзовитесь!1 Чем спасались?\n", "entities": [{"start": 122, "entity_type": "Drugform", "end": 130, "entity_id": "*[0]_se", "entity_text": "виферона", "concept_id": "C0021735", "concept_name": NaN}, {"start": 31, "entity_type": "ADR", "end": 45, "entity_id": "*[1]", "entity_text": "сыпью покрылся", "concept_id": "C0015230", "concept_name": NaN}, {"start": 47, "entity_type": "ADR", "end": 59, "entity_id": "*[2]", "entity_text": "глаза опухли", "concept_id": "C4760994", "concept_name": NaN}, {"start": 76, "entity_type": "ADR", "end": 98, "entity_id": "*[3]", "entity_text": "на веках высыпала сыпь", "concept_id": "C0015230", "concept_name": NaN}], "sentence_id": 0} -------------------------------------------------------------------------------- /data/bsnlp/test_pl_cs_ru_bg/annotated/nord_stream/ru/Nord_Stream_2_extra.xml_file_1.out: -------------------------------------------------------------------------------- 1 | ru-ryanair-new-extra-1 2 | "Газпрому" "Газпром" ORG ORG-Gazprom 3 | Nord stream-2 Nord stream-2 PRO PRO-Nord-Stream-2 4 | Андрей Коболев Андрей Коболев PER PER-Andrey-Kobolev 5 | Брюсселе Брюссель LOC GPE-Brussles 6 | ЕС ЕС ORG ORG-European-Union 7 | Европу Европа LOC LOC-Europe 8 | Климкин Климкин PER PER-Pavel-Klimkin 9 | Линасом Линкявичюсом Линас Линкявичюс PER PER-Linas-Linkavichus 10 | Литвы Литва LOC GPE-Lithuania 11 | МИД Литвы МИД Литвы ORG ORG-Foreign-Office-Lithuania 12 | МИД МИД ORG ORG-Foreign-Office 13 | НАК "Нафтогаз Украины" НАК "Нафтогаз Украины" ORG ORG-Naftogaz 14 | Павел Климкин Павел Климкин PER PER-Pavel-Klimkin 15 | Россией Россия LOC GPE-Russia 16 | России Россия LOC GPE-Russia 17 | Украина Украина LOC GPE-Ukraine 18 | Украину Украина LOC GPE-Ukraine 19 | Украины Украина LOC GPE-Ukraine 20 | -------------------------------------------------------------------------------- /corus/sources/librusec.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | from corus.record import Record 5 | from corus.io import load_gz_lines 6 | 7 | 8 | class LibrusecRecord(Record): 9 | __attributes__ = ['id', 'text'] 10 | 11 | def __init__(self, id, text): 12 | self.id = id 13 | self.text = text 14 | 15 | 16 | def flush(id, buffer): 17 | return LibrusecRecord(id, '\n'.join(buffer)) 18 | 19 | 20 | def parse_librusec(lines): 21 | id = None 22 | buffer = [] 23 | for line in lines: 24 | match = re.match(r'^(\d+)\.fb2', line) 25 | if match: 26 | if id: 27 | yield flush(id, buffer) 28 | buffer = [] 29 | id = match.group(1) 30 | line = line[match.end() + 1:] # extra space 31 | buffer.append(line) 32 | yield flush(id, buffer) 33 | 34 | 35 | def load_librusec(path): 36 | lines = load_gz_lines(path) 37 | return parse_librusec(lines) 38 | -------------------------------------------------------------------------------- /data/bsnlp/test_pl_cs_ru_bg/annotated/nord_stream/ru/Nord_Stream_2_extra.xml_file_7.out: -------------------------------------------------------------------------------- 1 | ru-ryanair-new-extra-7 2 | Nord Stream AG Nord Stream AG ORG ORG-Nord-Stream-AG 3 | Берлина Берлин LOC GPE-Berlin 4 | Вашингтона Вашингтон LOC GPE-WashingtonDC 5 | Герхарда Шредера Герхард Шредер PER PER-Gerhard-Fritz-Kurt-Schröder 6 | Европарламент Европарламент ORG ORG-European-Parlament 7 | Европе Европа LOC LOC-Europe 8 | Европы Европа LOC LOC-Europe 9 | МИД Германии МИД Германии ORG ORG-Ministry-of-Foreign-Affairs-Germany 10 | МИД ФРГ МИД ФРГ ORG ORG-Federal-Foreign-Office-Germany 11 | Роснефти Роснефть ORG ORG-Rosneft 
12 | США США LOC GPE-USA 13 | Северного потока-2 Северный поток-2 PRO PRO-Nord-Stream-2 14 | Северный поток-2 Северный поток-2 PRO PRO-Nord-Stream-2 15 | Северный поток Северный поток PRO PRO-Nord-Stream-1 16 | Украина Украина LOC GPE-Ukraine 17 | ФРГ ФРГ LOC GPE-Germany 18 | Хайко Маас Хайко Маас PER PER-Heiko-Maas 19 | Шредер Шредер PER PER-Gerhard-Fritz-Kurt-Schröder 20 | -------------------------------------------------------------------------------- /data/factRuEval-2016-master/devset/book_58.tokens: -------------------------------------------------------------------------------- 1 | 89968 0 7 Встреча 2 | 89969 8 1 с 3 | 89970 10 6 послом 4 | 89971 17 6 Италии 5 | 89972 24 1 в 6 | 89973 26 4 миде 7 | 89974 31 6 Грузии 8 | 9 | 89975 39 2 По 10 | 89976 42 10 инициативе 11 | 89977 53 11 итальянской 12 | 89978 65 7 стороны 13 | 89979 73 12 чрезвычайный 14 | 89980 86 1 и 15 | 89981 88 11 полномочный 16 | 89982 100 5 посол 17 | 89983 106 6 Италии 18 | 89984 113 1 в 19 | 89985 115 6 Грузии 20 | 89986 122 7 Виторио 21 | 89987 130 7 Сандали 22 | 89988 138 10 встретился 23 | 89989 149 1 с 24 | 89990 151 12 заместителем 25 | 89991 164 8 министра 26 | 89992 173 11 иностранных 27 | 89993 185 3 дел 28 | 89994 189 6 Грузии 29 | 89995 196 11 Александром 30 | 89996 208 11 Налбандовым 31 | 89997 219 1 . 32 | 33 | 89998 221 9 Предметом 34 | 89999 231 10 обсуждения 35 | 90000 242 5 стали 36 | 90001 248 7 вопросы 37 | 90002 256 14 сотрудничества 38 | 90003 271 1 в 39 | 90004 273 13 международных 40 | 90005 287 12 организациях 41 | 90006 299 1 . 42 | 43 | -------------------------------------------------------------------------------- /data/morphoru/unamb_sent_14_6.conllu: -------------------------------------------------------------------------------- 1 | 1 « « PUNCT _ _ _ _ _ _ 2 | 2 Школа ШКОЛА NOUN _ Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing _ _ _ _ 3 | 3 злословия ЗЛОСЛОВИЕ NOUN _ Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing _ _ _ _ 4 | 4 » » PUNCT _ _ _ _ _ _ 5 | 5 учит УЧИТЬ VERB _ Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Notpast|VerbForm=Fin _ _ _ _ 6 | 6 прикусить ПРИКУСИТЬ VERB _ Aspect=Perf|VerbForm=Inf _ _ _ _ 7 | 7 язык ЯЗЫК NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing _ _ _ _ 8 | 9 | 1 Сохранится СОХРАНИТЬСЯ VERB _ Aspect=Perf|Mood=Ind|Number=Sing|Person=3|Tense=Notpast|VerbForm=Fin _ _ _ _ 10 | 2 ли ЛИ PART _ _ _ _ _ _ 11 | 3 градус ГРАДУС NOUN _ Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing _ _ _ _ 12 | 4 дискуссии ДИСКУССИЯ NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing _ _ _ _ 13 | 5 в В ADP _ _ _ _ _ _ 14 | 6 новом НОВЫЙ ADJ _ Case=Loc|Gender=Masc|Number=Sing _ _ _ _ 15 | 7 сезоне СЕЗОН NOUN _ Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing _ _ _ _ 16 | 8 ? ? 
PUNCT _ _ _ _ _ _ 17 | -------------------------------------------------------------------------------- /corus/sources/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .factru import load_factru # noqa 3 | from .gareev import load_gareev # noqa 4 | from .lenta import load_lenta, load_lenta2 # noqa 5 | from .librusec import load_librusec # noqa 6 | from .ne5 import load_ne5 # noqa 7 | from .wikiner import load_wikiner # noqa 8 | from .bsnlp import load_bsnlp # noqa 9 | from .persons import load_persons # noqa 10 | from .taiga import * # noqa 11 | from .buriy import * # noqa 12 | from .mokoron import * # noqa 13 | from .wiki import load_wiki # noqa 14 | from .ods import * # noqa 15 | from .ria import * # noqa 16 | from .ud import * # noqa 17 | from .morphoru import * # noqa 18 | from .gramru import load_gramru # noqa 19 | from .corpora import load_corpora # noqa 20 | from .russe import * # noqa 21 | from .toloka import load_toloka_lrwc # noqa 22 | from .simlex import load_simlex # noqa 23 | from .omnia import load_omnia # noqa 24 | from .toloka import load_ruadrect # noqa 25 | from .rudrec import load_rudrec # noqa 26 | -------------------------------------------------------------------------------- /data/ud/ru_taiga-ud-dev.conllu: -------------------------------------------------------------------------------- 1 | # newpar 2 | # sent_id = instagram-16 3 | # speaker = screened-18 4 | # genre = social 5 | # text = @screened-88 ✅взабраться на статую Христа - только что!😄 6 | 1 @screened-88 @screened-88 X _ Foreign=Yes 3 vocative _ _ 7 | 2 ✅ ✅ PUNCT _ _ 3 punct _ SpaceAfter=No 8 | 3 взабраться взобраться VERB _ Aspect=Perf|VerbForm=Inf|Voice=Mid 0 root _ _ 9 | 4 на на ADP _ _ 5 case _ _ 10 | 5 статую статуя NOUN _ Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing 3 obl _ _ 11 | 6 Христа Христос PROPN _ Animacy=Anim|Case=Gen|Gender=Masc|Number=Sing 5 nmod _ _ 12 | 7 - - PUNCT _ _ 9 punct _ _ 13 | 8 только только PART _ _ 9 advmod _ _ 14 | 9 что что PRON _ Case=Nom 3 parataxis _ SpaceAfter=No 15 | 10 ! ! 
PUNCT _ _ 3 punct _ SpaceAfter=No 16 | 11 😄 😄 SYM _ _ 3 discourse _ _ 17 | 18 | # newpar 19 | # sent_id = instagram-17 20 | # speaker = screened-18 21 | # genre = social 22 | # text = @screened-58 😊спасибо 23 | 1 @screened-58 @screened-58 X _ Foreign=Yes 3 vocative _ _ 24 | 2 😊 😊 SYM _ _ 3 discourse _ SpaceAfter=No 25 | 3 спасибо спасибо INTJ _ _ 0 root _ _ 26 | -------------------------------------------------------------------------------- /corus/sources/wikiner.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import load_bz2_lines 4 | 5 | 6 | class WikinerToken(Record): 7 | __attributes__ = ['text', 'pos', 'tag'] 8 | 9 | def __init__(self, text, pos, tag): 10 | self.text = text 11 | self.pos = pos 12 | self.tag = tag 13 | 14 | 15 | class WikinerMarkup(Record): 16 | __attributes__ = ['tokens'] 17 | 18 | def __init__(self, tokens): 19 | self.tokens = tokens 20 | 21 | 22 | def parse_wikiner(line): 23 | if not line: 24 | # skip empy lines 25 | return 26 | 27 | # На|PR|O севере|S|O граничит|V|O с|PR|O Латвией|S|I-LOC 28 | tokens = [] 29 | for part in line.split(): 30 | text, pos, tag = part.split('|', 2) 31 | token = WikinerToken(text, pos, tag) 32 | tokens.append(token) 33 | 34 | return WikinerMarkup(tokens) 35 | 36 | 37 | def load_wikiner(path): 38 | lines = load_bz2_lines(path) 39 | for line in lines: 40 | record = parse_wikiner(line) 41 | if record: 42 | yield record 43 | -------------------------------------------------------------------------------- /data/factRuEval-2016-master/devset/book_58.spans: -------------------------------------------------------------------------------- 1 | 32962 loc_name 17 6 89971 1 # 89971 Италии 2 | 32963 org_name 26 4 89973 1 # 89973 миде 3 | 32965 loc_name 31 6 89974 1 # 89974 Грузии 4 | 32966 job 10 6 89970 1 # 89970 послом 5 | 64002 job 10 13 89970 2 # 89970 89971 послом Италии 6 | 32951 loc_name 106 6 89983 1 # 89983 Италии 7 | 32952 loc_name 115 6 89985 1 # 89985 Грузии 8 | 32953 name 122 7 89986 1 # 89986 Виторио 9 | 32954 surname 130 7 89987 1 # 89987 Сандали 10 | 32955 loc_name 189 6 89994 1 # 89994 Грузии 11 | 32956 name 196 11 89995 1 # 89995 Александром 12 | 32957 surname 208 11 89996 1 # 89996 Налбандовым 13 | 32958 job 73 32 89979 4 # 89979 89980 89981 89982 чрезвычайный и полномочный посол 14 | 32959 job 151 37 89990 4 # 89990 89991 89992 89993 заместителем министра иностранных дел 15 | 32960 job 164 24 89991 3 # 89991 89992 89993 министра иностранных дел 16 | 32961 job 100 5 89982 1 # 89982 посол 17 | 64007 job 73 48 89979 7 # 89979 89980 89981 89982 89983 89984 89985 чрезвычайный и полномочный посол Италии в Грузии 18 | 64013 job 100 21 89982 4 # 89982 89983 89984 89985 посол Италии в Грузии 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 
13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /corus/sources/lenta.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | 4 | from corus.record import Record 5 | from corus.io import ( 6 | load_gz_lines, 7 | load_bz2_lines, 8 | parse_csv, 9 | skip_header 10 | ) 11 | 12 | 13 | class LentaRecord(Record): 14 | __attributes__ = ['url', 'title', 'text', 'topic', 'tags', 'date'] 15 | 16 | def __init__(self, url, title, text, topic, tags, date=None): 17 | self.url = url 18 | self.title = title 19 | self.text = text 20 | self.topic = topic 21 | self.tags = tags 22 | self.date = date 23 | 24 | 25 | def parse_lenta(lines): 26 | rows = parse_csv(lines) 27 | skip_header(rows) 28 | for cells in rows: 29 | yield LentaRecord(*cells) 30 | 31 | 32 | def parse_lenta2(lines): 33 | for record in parse_lenta(lines): 34 | record.date = datetime.strptime(record.date, '%Y/%m/%d') 35 | yield record 36 | 37 | 38 | def load_lenta(path): 39 | lines = load_gz_lines(path) 40 | return parse_lenta(lines) 41 | 42 | 43 | def load_lenta2(path): 44 | lines = load_bz2_lines(path) 45 | return parse_lenta2(lines) 46 | -------------------------------------------------------------------------------- /data/morphoru/RNCgoldInUD_Morpho.conll: -------------------------------------------------------------------------------- 1 | ==> blogs.xhtml <== 2 | ==newfile== 3 | Кстати кстати H _ _ 4 | о о ADP _ _ 5 | вопросе вопрос NOUN Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing _ 6 | " " PUNCT _ _ 7 | Пушкин Пушкин NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing NameType=Sur 8 | и и CONJ _ _ 9 | святитель святитель NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing _ 10 | Филарет Филарет NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing NameType=Giv 11 | , , PUNCT _ _ 12 | митрополит митрополит NOUN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing _ 13 | Московский московский ADJ Case=Nom|Gender=Masc|Number=Sing|Variant=Full _ 14 | "... "... PUNCT _ _ 15 | ты ты PRON Case=Nom|Number=Sing|Person=2 _ 16 | надумал надумать VERB Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act Subcat=Tran|Aspect=Perf 17 | , , PUNCT _ _ 18 | что что PRON Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing _ 19 | можно можно ADV Degree=Pos Predic=Yes 20 | сказать сказать VERB VerbForm=Inf|Voice=Act Subcat=Tran|Aspect=Perf 21 | ? ? PUNCT _ _ 22 | 23 | Да да PART _ _ 24 | ! ! PUNCT _ _ 25 | 26 | И и CONJ _ _ 27 | чтооо что PRON Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing Typo=Yes 28 | же же PART _ _ 29 | ? ? 
PUNCT _ _ -------------------------------------------------------------------------------- /corus/sources/gareev.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | from corus.path import ( 5 | list_dir, 6 | join_path 7 | ) 8 | from corus.io import load_lines 9 | from corus.record import Record 10 | 11 | 12 | class GareevToken(Record): 13 | __attributes__ = ['text', 'tag'] 14 | 15 | def __init__(self, text, tag): 16 | self.text = text 17 | self.tag = tag 18 | 19 | 20 | class GareevRecord(Record): 21 | __attributes__ = ['tokens'] 22 | 23 | def __init__(self, tokens): 24 | self.tokens = tokens 25 | 26 | 27 | def parse_conll(lines): 28 | for line in lines: 29 | text, tag = line.split('\t', 1) 30 | yield GareevToken(text, tag) 31 | 32 | 33 | def parse_gareev(lines): 34 | tokens = list(parse_conll(lines)) 35 | return GareevRecord(tokens) 36 | 37 | 38 | def load_id(id, dir): 39 | path = join_path(dir, '%s.txt.iob' % id) 40 | lines = load_lines(path) 41 | return parse_gareev(lines) 42 | 43 | 44 | def list_ids(dir): 45 | for filename in list_dir(dir): 46 | match = re.match(r'^(.+).txt.iob', filename) 47 | if match: 48 | yield match.group(1) 49 | 50 | 51 | def load_gareev(dir): 52 | for id in list_ids(dir): 53 | yield load_id(id, dir) 54 | -------------------------------------------------------------------------------- /corus/sources/wiki.py: -------------------------------------------------------------------------------- 1 | 2 | from io import StringIO 3 | import json 4 | 5 | from corus.record import Record 6 | from corus.io import load_bz2_lines 7 | from corus.third.WikiExtractor import ( 8 | options, 9 | pages_from, 10 | Extractor 11 | ) 12 | 13 | 14 | options.write_json = True 15 | 16 | 17 | class WikiRecord(Record): 18 | __attributes__ = ['id', 'url', 'title', 'text'] 19 | 20 | def __init__(self, id, url, title, text): 21 | self.id = id 22 | self.url = url 23 | self.title = title 24 | self.text = text 25 | 26 | @classmethod 27 | def from_json(cls, data): 28 | return cls( 29 | id=data['id'], 30 | url=data['url'], 31 | title=data['title'], 32 | text=data['text'] 33 | ) 34 | 35 | 36 | class Extractor_(Extractor): 37 | def extract_(self): 38 | output = StringIO() 39 | self.extract(output) 40 | return json.loads(output.getvalue()) 41 | 42 | 43 | def load_wiki(path): 44 | lines = load_bz2_lines(path) 45 | records = pages_from(lines) 46 | for record in records: 47 | id, revision, title, _, _, page = record 48 | extractor = Extractor_(id, revision, title, page) 49 | data = extractor.extract_() 50 | yield WikiRecord.from_json(data) 51 | -------------------------------------------------------------------------------- /data/bsnlp/test_pl_cs_ru_bg/raw/nord_stream/ru/Nord_Stream_2_extra.xml_file_7.txt: -------------------------------------------------------------------------------- 1 | ru-ryanair-new-extra-7 2 | ru 3 | 2019-01-10 4 | https://www.rbc.ru/rbcfreenews/5c379d0b9a79470901bdea3b 5 | МИД ФРГ заявил о невозможности решать в США энергетические вопросы Европы 6 | 7 | Глава МИД Германии Хайко Маас назвал неприемлемым введение санкций США против газопровода Северный поток-2, пишет Вопросы европейской энергетической политики нужно решать в Европе, а не в США. Наложение односторонних санкций против Северного потока-2 точно не тот путь, сказал он. 8 | 9 | В середине декабря палата представителей конгресса США приняла резолюцию против Северного потока-2. В документе газопровод называют радикальным шагом назад для энергетической безопасности Европы. 
Тогда же Европарламент принял свою резолюцию с призывом прекратить реализацию данного проекта. Европейские депутаты отметили, что в сетях энергоснабжения региона решающую роль играет Украина. По мнению экс-канцлера ФРГ Герхарда Шредера, давление на Северный поток-2 в США оказывают из-за планов Вашингтона стать поставщиком газа для Берлина. Шредер с сентября 2017 года входит в совет директоров Роснефти и является его председателем. После ухода с поста канцлера, Шредер возглавлял наблюдательный совет, а позже комитет акционеров компании Nord Stream AG, созданной для управления магистральным газопроводом Северный поток. 10 | -------------------------------------------------------------------------------- /corus/sources/russe.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import ( 4 | load_lines, 5 | parse_csv, 6 | dict_csv 7 | ) 8 | 9 | 10 | class RusseSemRecord(Record): 11 | __attributes__ = ['word1', 'word2', 'sim'] 12 | 13 | def __init__(self, word1, word2, sim): 14 | self.word1 = word1 15 | self.word2 = word2 16 | self.sim = sim 17 | 18 | 19 | # word1,word2,related,sim 20 | # автомат,калашникова,assoc,1 21 | # автомат,пулемет,assoc,1 22 | # автомат,пистолет,assoc,1 23 | # автомат,война,assoc,1 24 | # автомат,газ. вода,assoc,1 25 | # автомат,год,random,0 26 | # автомат,человек,random,0 27 | # автомат,время,random,0 28 | # автомат,район,random,0 29 | 30 | 31 | def parse_russe(lines): 32 | records = parse_csv(lines) 33 | items = dict_csv(records) 34 | for item in items: 35 | word1 = item['word1'] 36 | word2 = item['word2'] 37 | sim = float(item['sim']) 38 | yield RusseSemRecord(word1, word2, sim) 39 | 40 | 41 | def load_russe(path): 42 | lines = load_lines(path) 43 | return parse_russe(lines) 44 | 45 | 46 | def load_russe_hj(path): 47 | return load_russe(path) 48 | 49 | 50 | def load_russe_rt(path): 51 | return load_russe(path) 52 | 53 | 54 | def load_russe_ae(path): 55 | return load_russe(path) 56 | 57 | 58 | __all__ = [ 59 | 'load_russe_hj', 60 | 'load_russe_rt', 61 | 'load_russe_ae', 62 | ] 63 | -------------------------------------------------------------------------------- /corus/sources/taiga/subtitles.py: -------------------------------------------------------------------------------- 1 | 2 | from .common import ( 3 | Meta, 4 | load_tar_metas, 5 | load_tar_texts, 6 | parse_filename_id, 7 | merge_metas 8 | ) 9 | 10 | 11 | # [{'filepath': 'Heroes - 3x12 - Our Father.HDTV.LOL.en.txt', 12 | # 'id': '8940', 13 | # 'languages': 'en', 14 | # 'title': 'Heroes - 3x12 - Our Father.HDTV.LOL.en.srt'}, 15 | # {'filepath': 'Friends - 3x17 - The One Without The Ski Trip.ru.txt', 16 | # 'id': '7553', 17 | # 'languages': 'ru', 18 | # 'title': 'Friends - 3x17 - The One Without The Ski Trip.ru.srt'}, 19 | 20 | 21 | def parse_metas(items): 22 | for item in items: 23 | id = parse_filename_id(item['filepath']) 24 | lang = item['languages'] 25 | title = item['title'] 26 | yield Meta( 27 | id=id, 28 | lang=lang, 29 | title=title 30 | ) 31 | 32 | 33 | def load_taiga_subtitles_metas(path, offset=0, count=1): 34 | items = load_tar_metas(path, '*/metatable.csv', offset, count) 35 | return parse_metas(items) 36 | 37 | 38 | # home/tsha/Subtitles/texts/12 Monkeys/12 Monkeys - 1x01 - Splinter.HDTV.KILLERS.en.txt 39 | # home/tsha/Subtitles/texts/12 Monkeys/12 Monkeys - 1x01 - Splinter.HDTV.KILLERS.ru.txt 40 | 41 | 42 | def load_taiga_subtitles(path, metas=None, offset=2113024, count=19011): 43 | records = 
load_tar_texts(path, '*/texts/*.txt', offset, count) 44 | return merge_metas(records, metas) 45 | -------------------------------------------------------------------------------- /corus/sources/rudrec.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import ( 4 | parse_jsonl, 5 | load_lines 6 | ) 7 | 8 | 9 | class RuDReCRecord(Record): 10 | __attributes__ = ['file_name', 'text', 'sentence_id', 'entities'] 11 | 12 | def __init__(self, file_name, text, sentence_id, entities): 13 | self.file_name = file_name 14 | self.text = text 15 | self.sentence_id = sentence_id 16 | self.entities = entities 17 | 18 | 19 | class RuDReCEntity(Record): 20 | __attributes__ = [ 21 | 'entity_id', 'entity_text', 'entity_type', 22 | 'start', 'end', 'concept_id', 'concept_name' 23 | ] 24 | 25 | def __init__(self, entity_id, entity_text, entity_type, start, end, concept_id, concept_name): 26 | self.entity_id = entity_id 27 | self.entity_text = entity_text 28 | self.entity_type = entity_type 29 | self.start = start 30 | self.end = end 31 | self.concept_id = concept_id 32 | self.concept_name = concept_name 33 | 34 | 35 | def parse_entities(items): 36 | for item in items: 37 | yield RuDReCEntity( 38 | item['entity_id'], 39 | item['entity_text'], 40 | item['entity_type'], 41 | item['start'], 42 | item['end'], 43 | item.get('concept_id'), 44 | item.get('concept_name') 45 | ) 46 | 47 | 48 | def parse_rudrec(items): 49 | for item in items: 50 | entities = list(parse_entities(item['entities'])) 51 | yield RuDReCRecord( 52 | item['file_name'], 53 | item['text'], 54 | item['sentence_id'], 55 | entities 56 | ) 57 | 58 | 59 | def load_rudrec(path): 60 | lines = load_lines(path) 61 | items = parse_jsonl(lines) 62 | return parse_rudrec(items) 63 | -------------------------------------------------------------------------------- /corus/record.py: -------------------------------------------------------------------------------- 1 | 2 | class Record(object): 3 | __attributes__ = [] 4 | 5 | def __eq__(self, other): 6 | return ( 7 | type(self) == type(other) 8 | and all( 9 | (getattr(self, _) == getattr(other, _)) 10 | for _ in self.__attributes__ 11 | ) 12 | ) 13 | 14 | def __ne__(self, other): 15 | return not self == other 16 | 17 | def __iter__(self): 18 | return (getattr(self, _) for _ in self.__attributes__) 19 | 20 | def __hash__(self): 21 | return hash(tuple(self)) 22 | 23 | def __repr__(self): 24 | name = self.__class__.__name__ 25 | args = ', '.join( 26 | '{key}={value!r}'.format( 27 | key=_, 28 | value=getattr(self, _) 29 | ) 30 | for _ in self.__attributes__ 31 | ) 32 | return '{name}({args})'.format( 33 | name=name, 34 | args=args 35 | ) 36 | 37 | def _repr_pretty_(self, printer, cycle): 38 | name = self.__class__.__name__ 39 | if cycle: 40 | printer.text('{name}(...)'.format(name=name)) 41 | else: 42 | printer.text('{name}('.format(name=name)) 43 | keys = self.__attributes__ 44 | size = len(keys) 45 | if size: 46 | with printer.indent(4): 47 | printer.break_() 48 | for index, key in enumerate(keys): 49 | printer.text(key + '=') 50 | value = getattr(self, key) 51 | printer.pretty(value) 52 | if index < size - 1: 53 | printer.text(',') 54 | printer.break_() 55 | printer.break_() 56 | printer.text(')') 57 | -------------------------------------------------------------------------------- /corus/sources/ne5.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | from corus.path 
import ( 5 | list_dir, 6 | join_path 7 | ) 8 | from corus.record import Record 9 | from corus.io import load_lines 10 | 11 | 12 | class Ne5Span(Record): 13 | __attributes__ = ['index', 'type', 'start', 'stop', 'text'] 14 | 15 | def __init__(self, index, type, start, stop, text): 16 | self.index = index 17 | self.type = type 18 | self.start = start 19 | self.stop = stop 20 | self.text = text 21 | 22 | 23 | class Ne5Markup(Record): 24 | __attributes__ = ['id', 'text', 'spans'] 25 | 26 | def __init__(self, id, text, spans): 27 | self.id = id 28 | self.text = text 29 | self.spans = spans 30 | 31 | 32 | def list_ids(dir): 33 | for filename in list_dir(dir): 34 | match = re.match(r'^(.+).txt$', filename) 35 | if match: 36 | yield match.group(1) 37 | 38 | 39 | def txt_path(id, dir): 40 | return join_path(dir, '%s.txt' % id) 41 | 42 | 43 | def ann_path(id, dir): 44 | return join_path(dir, '%s.ann' % id) 45 | 46 | 47 | def parse_spans(lines): 48 | # brat format http://brat.nlplab.org/standoff.html 49 | for line in lines: 50 | index, type, start, stop, text = line.split(None, 4) 51 | start = int(start) 52 | stop = int(stop) 53 | yield Ne5Span(index, type, start, stop, text) 54 | 55 | 56 | def load_text(path): 57 | # do not convert \r\n to \n 58 | with open(path, newline='') as file: 59 | return file.read() 60 | 61 | 62 | def load_id(id, dir): 63 | path = txt_path(id, dir) 64 | text = load_text(path) 65 | path = ann_path(id, dir) 66 | lines = load_lines(path) 67 | spans = list(parse_spans(lines)) 68 | return Ne5Markup(id, text, spans) 69 | 70 | 71 | def load_ne5(dir): 72 | for id in list_ids(dir): 73 | yield load_id(id, dir) 74 | -------------------------------------------------------------------------------- /data/gramru/GramEval_private_test.conllu: -------------------------------------------------------------------------------- 1 | 1 А а CCONJ _ _ 5 cc _ _ 2 | 2 потом потом ADV _ Degree=Pos 5 advmod _ _ 3 | 3 мы мы PRON _ Case=Nom|Number=Plur|Person=1 5 nsubj _ _ 4 | 4 все весь DET _ Case=Nom|Number=Plur 3 det _ _ 5 | 5 погрузились погружаться VERB _ Aspect=Perf|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Mid 0 root _ _ 6 | 6 в в ADP _ _ 7 case _ _ 7 | 7 автобус автобус NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 5 obl _ _ 8 | 8 и и CCONJ _ _ 9 cc _ _ 9 | 9 поехали поехать VERB _ Aspect=Perf|Mood=Ind|Number=Plur|Tense=Past|VerbForm=Fin|Voice=Act 5 conj _ _ 10 | 10 их они PRON _ Case=Acc|Number=Plur|Person=3 9 obj _ _ 11 | 11 в в ADP _ _ 12 case _ _ 12 | 12 аэропорт аэропорт NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 13 obl _ _ 13 | 13 провожать провожать VERB _ Aspect=Imp|VerbForm=Inf|Voice=Act 9 xcomp _ _ 14 | 14 . . PUNCT _ _ 13 punct _ _ 15 | 16 | 1 Маменька маменька PROPN _ Animacy=Anim|Case=Nom|Gender=Fem|Number=Sing 2 nsubj _ _ 17 | 2 сбежала сбежать VERB _ Aspect=Perf|Gender=Fem|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ _ 18 | 3 ? ? 
PUNCT _ _ 2 punct _ _ 19 | 20 | 1 Писано писать VERB _ Aspect=Perf|Gender=Neut|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass 0 root _ _ 21 | 2 въ в PROPN _ Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 1 nsubj:pass _ _ 22 | 3 нашемъ нашемъ PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 2 nmod _ _ 23 | 4 строеніи строеніь NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 3 nmod _ _ 24 | 5 въ в PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 4 nmod _ _ 25 | 6 Воскресенскомъ Воскресенскомъ PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 5 nmod _ _ 26 | 7 монастырѣ монастырѣ NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 2 nmod _ _ 27 | 8 въ в PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 7 nmod _ _ 28 | 9 Новомъ Новомъ PROPN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 8 nmod _ _ 29 | -------------------------------------------------------------------------------- /corus/sources/persons.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from corus.record import Record 4 | from corus.io import ( 5 | list_zip, 6 | load_zip_texts, 7 | parse_xml, 8 | ) 9 | 10 | 11 | TEXT = 'text.txt' 12 | ANNO = 'anno.markup.xml' 13 | 14 | 15 | class PersonsSpan(Record): 16 | __attributes__ = ['id', 'start', 'stop', 'value'] 17 | 18 | def __init__(self, id, start, stop, value): 19 | self.id = id 20 | self.start = start 21 | self.stop = stop 22 | self.value = value 23 | 24 | 25 | class PersonsMarkup(Record): 26 | __attributes__ = ['text', 'spans'] 27 | 28 | def __init__(self, text, spans): 29 | self.text = text 30 | self.spans = spans 31 | 32 | 33 | def list_ids(path): 34 | for name in list_zip(path): 35 | match = re.match(r'^Persons-1000/collection/([^/]+)/text\.txt$', name) 36 | if match: 37 | yield match.group(1) 38 | 39 | 40 | def part_names(ids, part): 41 | for id in ids: 42 | yield 'Persons-1000/collection/%s/%s' % (id, part) 43 | 44 | 45 | def parse_anno(text): 46 | xml = parse_xml(text) 47 | for entry in xml.findall('entry'): 48 | id = int(entry.find('id').text) 49 | start = int(entry.find('offset').text) 50 | size = int(entry.find('length').text) 51 | stop = start + size 52 | attribute = entry.find('attribute') 53 | value = attribute.find('value').text 54 | yield PersonsSpan(id, start, stop, value) 55 | 56 | 57 | def load_ids(ids, path): 58 | names = part_names(ids, TEXT) 59 | texts = load_zip_texts(path, names, 'cp1251') 60 | 61 | names = part_names(ids, ANNO) 62 | annos = load_zip_texts(path, names, 'utf-8') 63 | for text, anno in zip(texts, annos): 64 | spans = list(parse_anno(anno)) 65 | yield PersonsMarkup(text, spans) 66 | 67 | 68 | def load_persons(path): 69 | ids = list(list_ids(path)) 70 | return load_ids(ids, path) 71 | -------------------------------------------------------------------------------- /corus/sources/taiga/kp.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | 4 | from .common import ( 5 | Author, 6 | Meta, 7 | load_tar_metas, 8 | load_tar_texts, 9 | merge_metas 10 | ) 11 | 12 | 13 | # {'author': 'Мария ГОШИНА', 14 | # 'authorreaders': '', 15 | # 'authortexts': '', 16 | # 'date': '2017-01-20', 17 | # 'magazine': '', 18 | # 'segment': 'KP', 19 | # 'source': 'http://www.kp.ru/online/news/2632060/', 20 | # 'tags': '', 21 | # 'textdiff': '', 22 | # 'textid': '10@2632060', 23 | # 'textname': 'В Саратове спасатели помогли родственникам попасть в квартиру пенсионерки', 24 | # 'textregion': 
'www.saratov.kp.ru', 25 | # 'textrubric': 'Общество>Общество', 26 | # 'time': '09:27:00+03:00'}, 27 | 28 | 29 | def parse_metas(items): 30 | for item in items: 31 | id = item['textid'] 32 | timestamp = item['date'] + item['time'][:8] 33 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d%H:%M:%S') 34 | 35 | name = item['author'] or None 36 | author = Author(name=name) 37 | 38 | rubric = item['textrubric'] 39 | title = item['textname'] 40 | url = item['source'] 41 | yield Meta( 42 | id=id, 43 | timestamp=timestamp, 44 | rubric=rubric, 45 | author=author, 46 | title=title, 47 | url=url 48 | ) 49 | 50 | 51 | def load_taiga_kp_metas(path, offset=0, count=1): 52 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count) 53 | return parse_metas(items) 54 | 55 | 56 | # home/tsha/KP/texts/10@2598286.txt 57 | # home/tsha/KP/texts/10@2598287.txt 58 | # home/tsha/KP/texts/10@2598289.txt 59 | 60 | 61 | def load_taiga_kp(path, metas=None, offset=13042176, count=45503): 62 | records = load_tar_texts(path, '*/texts/*.txt', offset, count) 63 | return merge_metas(records, metas) 64 | 65 | 66 | __all__ = [ 67 | 'load_taiga_kp_metas', 68 | 'load_taiga_kp' 69 | ] 70 | -------------------------------------------------------------------------------- /corus/sources/taiga/interfax.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | 4 | from .common import ( 5 | Meta, 6 | load_tar_metas, 7 | load_tar_texts, 8 | merge_metas 9 | ) 10 | 11 | 12 | # {'author': '', 13 | # 'authorreaders': '', 14 | # 'authortexts': '', 15 | # 'date': '2013-02-24', 16 | # 'magazine': '', 17 | # 'segment': 'Interfax', 18 | # 'source': 'http://www.interfax.ru/russia/292151', 19 | # 'tags': 'Кубань', 20 | # 'textdiff': '', 21 | # 'textid': 'russia292151', 22 | # 'textname': '60 тысяч жителей Туапсинского района остались без электричества', 23 | # 'textregion': '', 24 | # 'textrubric': 'В России', 25 | # 'time': '16:10'}, 26 | 27 | 28 | def parse_metas(items): 29 | for item in items: 30 | id = item['textid'] 31 | 32 | timestamp = item['date'] + item['time'] 33 | try: 34 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d%H:%M') 35 | except ValueError: 36 | # rare, date='' time='2011-09-12' 37 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d') 38 | 39 | title = item['textname'] 40 | tags = item['tags'] 41 | rubric = item.get('rubric') 42 | url = item['source'] 43 | yield Meta( 44 | id=id, 45 | timestamp=timestamp, 46 | title=title, 47 | rubric=rubric, 48 | tags=tags, 49 | url=url 50 | ) 51 | 52 | 53 | def load_taiga_interfax_metas(path, offset=0, count=1): 54 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count) 55 | return parse_metas(items) 56 | 57 | 58 | # home/tsha/Interfax/texts/business225067.txt 59 | # home/tsha/Interfax/texts/business225113.txt 60 | # home/tsha/Interfax/texts/business225178.txt 61 | 62 | 63 | def load_taiga_interfax(path, metas=None, offset=11447296, count=46429): 64 | records = load_tar_texts(path, '*/texts/*.txt', offset, count) 65 | return merge_metas(records, metas) 66 | 67 | 68 | __all__ = [ 69 | 'load_taiga_interfax_metas', 70 | 'load_taiga_interfax' 71 | ] 72 | -------------------------------------------------------------------------------- /corus/sources/ria.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | from corus.record import Record 5 | from corus.io import ( 6 | load_gz_lines, 7 | parse_jsonl 8 | ) 9 | 10 | 11 | class RiaRawRecord(Record): 12 
| __attributes__ = ['title', 'text'] 13 | 14 | def __init__(self, title, text): 15 | self.title = title 16 | self.text = text 17 | 18 | 19 | class RiaRecord(Record): 20 | __attributes__ = ['title', 'prefix', 'text'] 21 | 22 | def __init__(self, title, prefix, text): 23 | self.title = title 24 | self.prefix = prefix 25 | self.text = text 26 | 27 | 28 | def parse_ria_raw(lines): 29 | records = parse_jsonl(lines) 30 | for record in records: 31 | yield RiaRawRecord( 32 | record['title'], 33 | record['text'] 34 | ) 35 | 36 | 37 | def load_ria_raw(path): 38 | lines = load_gz_lines(path) 39 | return parse_ria_raw(lines) 40 | 41 | 42 | def untag(text): 43 | return re.sub(r'<[^>]+>', '', text) 44 | 45 | 46 | def unescape(text): 47 | text = text.replace('&lt;', '<') 48 | text = text.replace('&gt;', '>') 49 | text = text.replace('&amp;', '&') 50 | text = text.replace('&ndash;', '-') 51 | text = text.replace('&nbsp;', ' ') 52 | return text 53 | 54 | 55 | def first_sent(text): 56 | # москва, 31 янв - риа новости. 57 | # фарнборо (великобритания), 21 июл - риа новости, александр смотров. 58 | index = text.find('. ') # len('. ') 59 | if index > 0: 60 | index += 2 61 | sent, suffix = text[:index], text[index:] 62 | if 'риа новости' in sent and len(sent) < 70: 63 | sent = sent.strip() 64 | return sent, suffix 65 | return None, text 66 | 67 | 68 | def parse_ria(records): 69 | for record in records: 70 | text = record.text 71 | text = untag(text) 72 | text = unescape(text) 73 | prefix, text = first_sent(text) 74 | yield RiaRecord( 75 | record.title, 76 | prefix, 77 | text 78 | ) 79 | 80 | 81 | def load_ria(path): 82 | records = load_ria_raw(path) 83 | return parse_ria(records) 84 | 85 | 86 | __all__ = [ 87 | 'load_ria_raw', 88 | 'load_ria' 89 | ] 90 | -------------------------------------------------------------------------------- /corus/sources/buriy.py: -------------------------------------------------------------------------------- 1 | 2 | import tarfile 3 | from io import TextIOWrapper 4 | from datetime import datetime 5 | 6 | from corus.record import Record 7 | from corus.io import ( 8 | parse_csv, 9 | skip_header, 10 | ) 11 | 12 | 13 | class BuriyRecord(Record): 14 | __attributes__ = ['timestamp', 'url', 'edition', 'topics', 'title', 'text'] 15 | 16 | def __init__(self, timestamp, url, edition, topics, title, text): 17 | self.timestamp = timestamp 18 | self.url = url 19 | self.edition = edition 20 | self.topics = topics 21 | self.title = title 22 | self.text = text 23 | 24 | 25 | def load_tar(path, encoding='utf8'): 26 | with tarfile.open(path) as tar: 27 | for member in tar: 28 | if not member.isfile(): 29 | continue 30 | file = tar.extractfile(member) 31 | yield TextIOWrapper(file, encoding) 32 | 33 | 34 | def parse_timestamp(timestamp): 35 | for pattern in ['%Y-%m-%d %H:%M:%S', '%Y-%m-%d']: 36 | try: 37 | return datetime.strptime(timestamp, pattern) 38 | except ValueError: 39 | continue 40 | 41 | 42 | def maybe_none(value, none=('',)): 43 | if value in none: 44 | return 45 | return value 46 | 47 | 48 | def parse_buriy(lines, max_text=10000000): 49 | rows = parse_csv(lines, max_field=max_text) 50 | skip_header(rows) 51 | for row in rows: 52 | timestamp, url, edition, topics, title, text = row 53 | timestamp = parse_timestamp(timestamp) 54 | edition = maybe_none(edition, ('', '-')) 55 | topics = maybe_none(topics) 56 | yield BuriyRecord( 57 | timestamp=timestamp, 58 | url=url, 59 | edition=edition, 60 | topics=topics, 61 | title=title, 62 | text=text 63 | ) 64 | 65 | 66 | def load_buriy(path): 67 | for lines in
load_tar(path): 68 | for record in parse_buriy(lines): 69 | yield record 70 | 71 | 72 | def load_buriy_news(path): 73 | return load_buriy(path) 74 | 75 | 76 | def load_buriy_webhose(path): 77 | return load_buriy(path) 78 | 79 | 80 | __all__ = [ 81 | 'load_buriy_news', 82 | 'load_buriy_webhose' 83 | ] 84 | -------------------------------------------------------------------------------- /corus/sources/taiga/lenta.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | 4 | from .common import ( 5 | Meta, 6 | load_tar_metas, 7 | load_tar_texts, 8 | patch_month, 9 | merge_metas 10 | ) 11 | 12 | 13 | # {'author': '', 14 | # 'authorreaders': '', 15 | # 'authortexts': '', 16 | # 'date': '8 марта 2011', 17 | # 'magazine': '', 18 | # 'segment': 'Lenta', 19 | # 'source': 'https://lenta.ru/news/2011/03/08/hobgoblin/', 20 | # 'tags': '', 21 | # 'textdiff': '', 22 | # 'textid': '20110308hobgoblin', 23 | # 'textname': 'HBO запустит сериал о волшебной войне с Гитлером', 24 | # 'textregion': '', 25 | # 'textrubric': 'Культура', 26 | # 'time': '14:33'}, 27 | 28 | 29 | LENTA_MONTHS = { 30 | 'января': 'Jan', 31 | 'февраля': 'Feb', 32 | 'марта': 'Mar', 33 | 'апреля': 'Apr', 34 | 'мая': 'May', 35 | 'июня': 'Jun', 36 | 'июля': 'Jul', 37 | 'августа': 'Aug', 38 | 'сентября': 'Sep', 39 | 'октября': 'Oct', 40 | 'ноября': 'Nov', 41 | 'декабря': 'Dec', 42 | } 43 | 44 | 45 | def parse_metas(items): 46 | for item in items: 47 | id = item['textid'] 48 | 49 | date, time, timestamp = item['date'], item['time'], None 50 | if date and time: 51 | timestamp = patch_month(date, LENTA_MONTHS) + time 52 | timestamp = datetime.strptime(timestamp, '%d %b %Y%H:%M') 53 | 54 | title = item['textname'] 55 | rubric = item['textrubric'] 56 | url = item['source'] or None 57 | yield Meta( 58 | id=id, 59 | timestamp=timestamp, 60 | title=title, 61 | rubric=rubric, 62 | url=url 63 | ) 64 | 65 | 66 | def load_taiga_lenta_metas(path, offset=0, count=1): 67 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count) 68 | return parse_metas(items) 69 | 70 | 71 | # home/tsha/Lenta/texts/20100101three.txt 72 | # home/tsha/Lenta/texts/20100101tomsk.txt 73 | # home/tsha/Lenta/texts/20100101urus.txt 74 | 75 | 76 | def load_taiga_lenta(path, metas=None, offset=12800000, count=36446): 77 | records = load_tar_texts(path, '*/texts/*.txt', offset, count) 78 | return merge_metas(records, metas) 79 | 80 | 81 | __all__ = [ 82 | 'load_taiga_lenta_metas', 83 | 'load_taiga_lenta' 84 | ] 85 | -------------------------------------------------------------------------------- /data/bsnlp/test_pl_cs_ru_bg/raw/nord_stream/ru/Nord_Stream_2_extra.xml_file_1.txt: -------------------------------------------------------------------------------- 1 | ru-ryanair-new-extra-1 2 | ru 3 | 2019-01-10 4 | https://www.epravda.com.ua/rus/news/2019/01/10/644185/ 5 | Климкин рассказал, чего ожидает от газовых переговоров с Россией в Брюсселе 6 | 7 | Украина во время второго раунда трехсторонних переговоров в Брюсселе будет настаивать на том, что транзит газа должен происходить в соответствии с нормами европейского законодательства. Об этом заявил министр иностранных дел Украины Павел Климкин на совместном брифинге с главой МИД Литвы Линасом Линкявичюсом в четверг, 10 января, передает 8 | 9 | "Наша позиция относительно будущего транзита, а также по европейской энергетической безопасности предельно четкая: мы готовы выполнить все соответствующие европейские регуляторные нормы для нашей газотранспортной системы. 
Мы считаем, что будущий транзит должен базироваться на прозрачных и эффективных регуляторах ЕС", - сказал Климкин. "Это означает, например, что газ будет покупаться на восточной границе Украины, что транзитный тариф будет рассчитываться по европейской методологии. То есть общий смысл нашей позиции - транзит должно происходить в соответствии с европейским законодательством ", - отметил глава МИД. 10 | 11 | Он подчеркнул, что общей позицией Украины и Литвы является то, что требования европейского законодательства должны быть распространены и на Nord stream-2, что фундаментально повысит европейскую энергетическую безопасность против России. Подытоживая, Климкин подчеркнул, что Украина готова к конструктивному диалогу о будущем транзита газа в Европу, но он должен базироваться на очень четких условиях, самое главное из которых - соответствие нормам европейского законодательства для обеспечения четких, эффективных и прозрачных предпосылок транзита. "Поэтому во время переговоров в Брюсселе 21 января мы будем исходить из этих очень простых, но очень важных требований", - отметил Климкин. Ранее глава НАК "Нафтогаз Украины" Андрей Коболев заявил о готовности Украины, при согласовании этого вопроса правительством, обсудить возможность просмотра или отказа от второго транзитного иска к российскому "Газпрому" на сумму более 12 млрд долларов в привязке с заключением нового долгосрочного контракта. Действующий контракт по транзиту российского газа через Украину заканчивается в конце 2019 года. 12 | -------------------------------------------------------------------------------- /corus/sources/taiga/magazines.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from datetime import datetime 4 | 5 | from .common import ( 6 | Meta, 7 | load_tar_metas, 8 | load_tar_texts, 9 | merge_metas, 10 | ) 11 | 12 | 13 | # {'author': '', 14 | # 'authorreaders': '', 15 | # 'authortexts': '', 16 | # 'date': '2007, 10', 17 | # 'magazine': 'Знамя', 18 | # 'segment': 'Журнальный зал', 19 | # 'source': 'http://magazines.russ.ru/znamia/2007/10/tu26.html', 20 | # 'tags': '', 21 | # 'textdiff': '', 22 | # 'textid': '50005', 23 | # 'textname': 'Михаил Копелиович. 
Рецензия &#8211; любовь моя', 24 | # 'textregion': '', 25 | # 'textrubric': 'article', 26 | # 'time': ''}, 27 | 28 | 29 | def parse_metas(items): 30 | for item in items: 31 | id = item.get('textid') 32 | if not id: 33 | continue 34 | 35 | timestamp = item.get('date') 36 | if timestamp: 37 | try: 38 | timestamp = datetime.strptime(timestamp, '%Y, %m') 39 | except ValueError: 40 | # rare 2002, 7-8 41 | pass 42 | 43 | title = item['textname'] or None 44 | rubric = item.get('textrubric') or None 45 | 46 | url = None 47 | if 'source' in item: 48 | url = item['source'] 49 | match = re.search(r'russ\.ru/([^/]+)', url) 50 | label = match.group(1) 51 | id = label + '_' + id 52 | 53 | yield Meta( 54 | id=id, 55 | timestamp=timestamp, 56 | title=title, 57 | rubric=rubric, 58 | url=url 59 | ) 60 | 61 | 62 | def load_taiga_magazines_metas(path, offset=0, count=36): 63 | items = load_tar_metas(path, '*/corpus_*_metadata.csv', offset, count) 64 | return parse_metas(items) 65 | 66 | 67 | # home/tsha/Magazines/texts/corpus_arion_10658.txt 68 | # home/tsha/Magazines/texts/corpus_arion_10659.txt 69 | 70 | 71 | def parse_magazines_id(name): 72 | match = re.search(r'corpus_([\d\w_]+)\.txt', name) 73 | return match.group(1) 74 | 75 | 76 | def load_taiga_magazines(path, metas=None, offset=7292416, count=39890): 77 | records = load_tar_texts(path, '*/texts/*.txt', offset, count) 78 | return merge_metas(records, metas) 79 | 80 | 81 | __all__ = [ 82 | 'load_taiga_magazines_metas', 83 | 'load_taiga_magazines' 84 | ] 85 | -------------------------------------------------------------------------------- /corus/sources/toloka.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import ( 4 | load_lines, 5 | parse_tsv, 6 | skip_header, 7 | ) 8 | 9 | 10 | class LRWCRecord(Record): 11 | __attributes__ = ['hyponym', 'hypernym', 'genitive', 'judgement', 'confidence'] 12 | 13 | def __init__(self, hyponym, hypernym, genitive, judgement, confidence): 14 | self.hyponym = hyponym 15 | self.hypernym = hypernym 16 | self.genitive = genitive 17 | self.judgement = judgement 18 | self.confidence = confidence 19 | 20 | 21 | # INPUT:hyponym INPUT:hypernym INPUT:genitive OUTPUT:judgement CONFIDENCE:judgement 22 | # автомобиль автомашина автомашины true 99.75% 23 | # автомобиль автомототранспорт автомототранспорта true 99.96% 24 | # автомобиль автомототранспортный автомототранспортного true 99.99% 25 | 26 | 27 | def parse_judgement(value): 28 | if value == 'true': 29 | return 1.0 30 | elif value == 'false': 31 | return 0.0 32 | 33 | 34 | def parse_confidence(value): 35 | return float(value[:-1]) 36 | 37 | 38 | def parse_toloka_lrwc(lines): 39 | skip_header(lines) 40 | records = parse_tsv(lines) 41 | for record in records: 42 | hyponym, hypernym, genitive, judgement, confidence = record 43 | judgement = parse_judgement(judgement) 44 | confidence = parse_confidence(confidence) 45 | yield LRWCRecord(hyponym, hypernym, genitive, judgement, confidence) 46 | 47 | 48 | def load_toloka_lrwc(path): 49 | lines = load_lines(path) 50 | return parse_toloka_lrwc(lines) 51 | 52 | 53 | class RuADReCTRecord(Record): 54 | __attributes__ = ['tweet_id', 'tweet', 'label'] 55 | 56 | def __init__(self, tweet_id, tweet, label): 57 | self.tweet_id = tweet_id 58 | self.tweet = tweet 59 | self.label = label 60 | 61 | # – tweet_id: уникальный номер сообщения в системе twitter; 62 | # – tweet: текст сообщения (твита); 63 | # - label: класс твита, 1 - содержит упоминание побочного 
эффекта, 0 - не содердит 64 | 65 | 66 | def parse_ruadrect(lines): 67 | rows = parse_tsv(lines) 68 | skip_header(rows) 69 | for cells in rows: 70 | yield RuADReCTRecord(*cells) 71 | 72 | 73 | def load_ruadrect(path): 74 | lines = load_lines(path) 75 | return parse_ruadrect(lines) 76 | -------------------------------------------------------------------------------- /corus/sources/taiga/nplus1.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | 4 | from .common import ( 5 | Author, 6 | Meta, 7 | load_tar_metas, 8 | load_tar_texts, 9 | patch_month, 10 | merge_metas, 11 | ) 12 | 13 | 14 | # {'author': 'Владимир Королев', 15 | # 'authorreaders': '', 16 | # 'authortexts': '', 17 | # 'date': '21 Янв. 2017', 18 | # 'magazine': '', 19 | # 'segment': 'nplus1', 20 | # 'source': 'https://nplus1.ru/news/2017/01/21/Asphaltene-3d', 21 | # 'tags': '', 22 | # 'textdiff': '5.2', 23 | # 'textid': '20170121Asphaltene-3d', 24 | # 'textname': '«Архипелаги» асфальтенов ощупали в 3D', 25 | # 'textregion': '', 26 | # 'textrubric': 'Наука', 27 | # 'time': '17:34'}, 28 | 29 | 30 | NPLUS1_MONTHS = { 31 | 'Янв.': 'Jan', 32 | 'Фев.': 'Feb', 33 | 'Март': 'Mar', 34 | 'Апр.': 'Apr', 35 | 'Май': 'May', 36 | 'Июнь': 'Jun', 37 | 'Июль': 'Jul', 38 | 'Авг.': 'Aug', 39 | 'Сен.': 'Sep', 40 | 'Окт.': 'Oct', 41 | 'Нояб.': 'Nov', 42 | 'Дек.': 'Dec', 43 | } 44 | 45 | 46 | def parse_metas(items): 47 | for item in items: 48 | id = item['textid'] 49 | 50 | timestamp, date, time = None, item['date'], item['time'] 51 | if date and time: 52 | timestamp = patch_month(date, NPLUS1_MONTHS) + time 53 | timestamp = datetime.strptime(timestamp, '%d %b %Y%H:%M') 54 | 55 | name = item['author'] or None 56 | author = Author(name=name) 57 | 58 | title = item['textname'] 59 | rubric = item['textrubric'] or None 60 | url = item['source'] 61 | yield Meta( 62 | id=id, 63 | timestamp=timestamp, 64 | author=author, 65 | title=title, 66 | rubric=rubric, 67 | url=url 68 | ) 69 | 70 | 71 | def load_taiga_nplus1_metas(path, offset=0, count=1): 72 | items = load_tar_metas(path, '*/newmetadata.csv', offset, count) 73 | return parse_metas(items) 74 | 75 | 76 | # home/tsha/NPlus1/texts/20150320drone.txt 77 | # home/tsha/NPlus1/texts/20150320nitrogen.txt 78 | # home/tsha/NPlus1/texts/20150320silica.txt 79 | 80 | 81 | def load_taiga_nplus1(path, metas=None, offset=1919488, count=7696): 82 | records = load_tar_texts(path, '*/texts/*.txt', offset, count) 83 | return merge_metas(records, metas) 84 | 85 | 86 | __all__ = [ 87 | 'load_taiga_nplus1_metas', 88 | 'load_taiga_nplus1' 89 | ] 90 | -------------------------------------------------------------------------------- /corus/sources/taiga/arzamas.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from datetime import datetime 4 | 5 | from .common import ( 6 | Author, 7 | Meta, 8 | load_tar_metas, 9 | load_tar_texts, 10 | merge_metas, 11 | ) 12 | 13 | 14 | # {'About_author': '', 15 | # 'Author_profession': 'Кандидат исторических наук. Креативный директор Фонда Егора Гайдара. 
Один из\xa0участников сетевого проекта «Прожито», создающего полный электронный корпус дневников советского времени.', 16 | # 'URL': 'http://arzamas.academy/mag/427-chapter7Историк советской литературы и\xa0культуры', 17 | # 'author': 'Илья Венявкин', 18 | # 'id': '427', 19 | # 'source': 'Arzamas', 20 | # 'tags': "['Документ', 'СССР']", 21 | # 'textdate': '27.04.2017', 22 | # 'theme': "['Литература', 'История']", 23 | # 'title': 'Советский писатель внутри Большого террора. Глава 7 • '} 24 | 25 | 26 | def parse_metas(items): 27 | for item in items: 28 | id = item['id'] 29 | timestamp = datetime.strptime(item['textdate'], '%d.%m.%Y') 30 | tags = eval(item['tags']) 31 | themes = eval(item['theme']) 32 | name = item['author'] or None 33 | profession = item['Author_profession'] or None 34 | about = item['About_author'] or None 35 | author = Author( 36 | name=name, 37 | profession=profession, 38 | about=about 39 | ) 40 | title = item['title'].strip(u'• ') 41 | url = item['URL'] 42 | yield Meta( 43 | id=id, 44 | timestamp=timestamp, 45 | tags=tags, 46 | themes=themes, 47 | author=author, 48 | title=title, 49 | url=url 50 | ) 51 | 52 | 53 | def load_taiga_arzamas_metas(path, offset=0, count=1): 54 | items = load_tar_metas(path, '*/metatable.csv', offset, count=1) 55 | return parse_metas(items) 56 | 57 | 58 | # home/tsha/Arzamas/texts/arzamas_449.txt 59 | # home/tsha/Arzamas/texts/arzamas_450.txt 60 | # home/tsha/Arzamas/texts/arzamas_452.txt 61 | 62 | 63 | def parse_id(name): 64 | match = re.search(r'arzamas_(\d+)\.txt', name) 65 | return match.group(1) 66 | 67 | 68 | def load_taiga_arzamas(path, metas=None, offset=144896, count=311): 69 | records = load_tar_texts(path, '*/texts/*.txt', offset, count, parse_id) 70 | return merge_metas(records, metas) 71 | 72 | 73 | __all__ = [ 74 | 'load_taiga_arzamas_metas', 75 | 'load_taiga_arzamas' 76 | ] 77 | -------------------------------------------------------------------------------- /data/mokoron/db.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 5.6.12, for osx10.6 (x86_64) 2 | -- 3 | -- Host: localhost Database: neu 4 | -- ------------------------------------------------------ 5 | -- Server version 5.6.12 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Table structure for table `sentiment` 20 | -- 21 | 22 | DROP TABLE IF EXISTS `sentiment`; 23 | /*!40101 SET @saved_cs_client = @@character_set_client */; 24 | /*!40101 SET character_set_client = utf8 */; 25 | CREATE TABLE `sentiment` ( 26 | `id` bigint(32) NOT NULL AUTO_INCREMENT, 27 | `tdate` varchar(128) DEFAULT NULL, 28 | `tname` varchar(128) DEFAULT NULL, 29 | `ttext` varchar(256) DEFAULT NULL, 30 | `ttype` int(10) DEFAULT '0', 31 | `trep` int(10) DEFAULT '0', 32 | `trtw` int(10) DEFAULT '0', 33 | `tfav` int(10) DEFAULT '0', 34 | `tstcount` int(10) DEFAULT '0', 35 | `tfoll` int(10) DEFAULT '0', 36 | `tfrien` int(10) DEFAULT 
'0', 37 | `listcount` int(10) DEFAULT '0', 38 | PRIMARY KEY (`id`) 39 | ) ENGINE=MyISAM AUTO_INCREMENT=441644379397451777 DEFAULT CHARSET=utf8; 40 | /*!40101 SET character_set_client = @saved_cs_client */; 41 | 42 | -- 43 | -- Dumping data for table `sentiment` 44 | -- 45 | 46 | LOCK TABLES `sentiment` WRITE; 47 | /*!40000 ALTER TABLE `sentiment` DISABLE KEYS */; 48 | INSERT INTO `sentiment` VALUES (408906695721877504,'1386325928','Va5ilina','Пропавшая в Хабаровске школьница почти сутки провела в яме у коллектор',2,0,0,0,183,95,158,0),(408906695700520960,'1386325928','i_wont_judge_ya','ЛЕНТА, Я СЕГОДНЯ ПОЛГОДА ДИРЕКШИОНЕЕЕЕР! С:\nХОТЯ ВСЕ РАВНО НИКТО НЕ ПОЗДРАВИТ ЛОЛ',2,0,0,0,19809,804,257,11) 49 | INSERT INTO `sentiment` VALUES (410005806927847424,'1386587976','Victorika_nya','Открытые аудиозаписи нужны, чтобы прийти в гости и включить их ^.^',2,0,0,0,426,12,20,0),(408906695663161344,'1386325928','victorypanasenk','Царствие Божие внутрь вас есть.',2,0,0,0,1080,986,412,0) 50 | -------------------------------------------------------------------------------- /corus/zip.py: -------------------------------------------------------------------------------- 1 | 2 | from collections import namedtuple 3 | 4 | import zlib 5 | from io import BytesIO 6 | from struct import ( 7 | calcsize, 8 | unpack 9 | ) 10 | 11 | 12 | def open_zip(path): 13 | return open(path, 'rb') 14 | 15 | 16 | # File: APPNOTE.TXT - .ZIP File Format Specification 17 | # https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT 18 | 19 | # 4.3.7 Local file header: 20 | # local file header signature 4 bytes (0x04034b50) 21 | # version needed to extract 2 bytes 22 | # general purpose bit flag 2 bytes 23 | # compression method 2 bytes 24 | # last mod file time 2 bytes 25 | # last mod file date 2 bytes 26 | # crc-32 4 bytes 27 | # compressed size 4 bytes 28 | # uncompressed size 4 bytes 29 | # file name length 2 bytes 30 | # extra field length 2 bytes 31 | 32 | # file name (variable size) 33 | # extra field (variable size) 34 | 35 | 36 | HEADER_FORMAT = '<4s5HL2L2H' 37 | HEADER_SIGNATURE = b'PK\x03\x04' 38 | 39 | NO_COMPRESSION = 0 40 | DEFLATED = 8 41 | 42 | 43 | ZipHeader = namedtuple( 44 | 'ZipHeader', 45 | ['signature', 'extract_by', 'flags', 'compression', 46 | 'time', 'date', 'crc', 'compressed', 'uncompressed', 47 | 'name', 'extra'] 48 | ) 49 | 50 | 51 | def decode_name(name): 52 | # since assert flags == 0 53 | return name.decode('cp437') 54 | 55 | 56 | def read_zip_header(file): 57 | size = calcsize(HEADER_FORMAT) 58 | buffer = file.read(size) 59 | if len(buffer) < size: 60 | return 61 | 62 | data = unpack(HEADER_FORMAT, buffer) 63 | header = ZipHeader._make(data) 64 | if not is_zip_header(header): 65 | return 66 | 67 | assert_zip_header(header) 68 | name = file.read(header.name) 69 | header = header._replace(name=decode_name(name)) 70 | file.read(header.extra) # skip extra 71 | return header 72 | 73 | 74 | def is_zip_header(record): 75 | return record.signature == HEADER_SIGNATURE 76 | 77 | 78 | def assert_zip_header(record): 79 | assert record.flags == 0, record.flags 80 | assert record.compression in (NO_COMPRESSION, DEFLATED), record.compression 81 | 82 | 83 | def read_zip_data(file, header): 84 | data = file.read(header.compressed) 85 | if header.compression == DEFLATED: 86 | data = zlib.decompress(data, -15) 87 | # TODO Maybe do buffered reading to save memory 88 | return BytesIO(data) 89 | -------------------------------------------------------------------------------- /data/ud/ru_gsd-ud-dev.conllu: 
-------------------------------------------------------------------------------- 1 | # sent_id = dev-s3 2 | # text = Он и являлся'' полным властелином всей Ахсауской местности'' и родоначальником Телакуровых, построивших здесь свой замок. 3 | 1 Он он PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3 3 nsubj _ _ 4 | 2 и и PART UH _ 3 advmod _ _ 5 | 3 являлся являться VERB VBC Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Mid 0 root _ SpaceAfter=No 6 | 4 '' '' PUNCT `` _ 6 punct _ _ 7 | 5 полным полный ADJ JJL Case=Ins|Degree=Pos|Gender=Masc|Number=Sing 6 amod _ _ 8 | 6 властелином властелин NOUN NN Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing 3 xcomp _ _ 9 | 7 всей весь DET DT Case=Gen|Gender=Fem|Number=Sing 9 det _ _ 10 | 8 Ахсауской ахсауский ADJ JJL Case=Gen|Degree=Pos|Gender=Fem|Number=Sing 9 amod _ _ 11 | 9 местности местность NOUN NN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 6 nmod _ SpaceAfter=No 12 | 10 '' '' PUNCT '' _ 6 punct _ _ 13 | 11 и и CCONJ CC _ 12 cc _ _ 14 | 12 родоначальником родоначальник NOUN NN Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing 6 conj _ _ 15 | 13 Телакуровых Телакуров PROPN NNP Animacy=Anim|Case=Gen|Gender=Masc|Number=Plur 12 nmod _ SpaceAfter=No 16 | 14 , , PUNCT , _ 15 punct _ _ 17 | 15 построивших построить VERB VBNL Animacy=Anim|Aspect=Perf|Case=Gen|Number=Plur|Tense=Past|VerbForm=Part|Voice=Act 13 acl _ _ 18 | 16 здесь здесь ADV RB Degree=Pos 15 advmod _ _ 19 | 17 свой свой DET PRP$ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 18 det _ _ 20 | 18 замок замок NOUN NN Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 15 obj _ SpaceAfter=No 21 | 19 . . PUNCT . _ 6 punct _ _ 22 | 23 | # sent_id = dev-s4 24 | # text = Сержант посоветовал Баклсу пойти на работу водителем машины скорой помощи. 25 | 1 Сержант сержант NOUN NN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing 2 nsubj _ _ 26 | 2 посоветовал посоветовать VERB VBC Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ _ 27 | 3 Баклсу Баклс PROPN NNP Animacy=Anim|Case=Dat|Gender=Masc|Number=Sing 2 iobj _ _ 28 | 4 пойти пойти VERB VB Aspect=Perf|VerbForm=Inf|Voice=Act 2 xcomp _ _ 29 | 5 на на ADP IN _ 6 case _ _ 30 | 6 работу работа NOUN NN Animacy=Inan|Case=Acc|Gender=Fem|Number=Sing 4 obl _ _ 31 | 7 водителем водитель NOUN NN Animacy=Anim|Case=Ins|Gender=Masc|Number=Sing 4 xcomp _ _ 32 | 8 машины машина NOUN NN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 7 nmod _ _ 33 | 9 скорой скорый ADJ JJL Case=Gen|Degree=Pos|Gender=Fem|Number=Sing 10 amod _ _ 34 | 10 помощи помощь NOUN NN Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 8 nmod _ SpaceAfter=No 35 | 11 . . PUNCT . 
_ 2 punct _ _ 36 | -------------------------------------------------------------------------------- /corus/sources/morphoru.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import load_lines 4 | 5 | from .ud import group_sents, parse_row, parse_feats 6 | 7 | 8 | class MorphoSent(Record): 9 | __attributes__ = ['tokens', 'attrs'] 10 | 11 | def __init__(self, tokens, attrs=()): 12 | self.tokens = tokens 13 | self.attrs = attrs 14 | 15 | 16 | class MorphoToken(Record): 17 | __attributes__ = ['text', 'lemma', 'pos', 'feats', 'feats2'] 18 | 19 | def __init__(self, text, lemma, pos, feats, feats2=None): 20 | self.text = text 21 | self.lemma = lemma 22 | self.pos = pos 23 | self.feats = feats 24 | self.feats2 = feats2 25 | 26 | 27 | def parse_morphoru(lines, parse_sent): 28 | for group in group_sents(lines): 29 | tokens = list(parse_sent(group)) 30 | yield MorphoSent(tokens) 31 | 32 | 33 | def parse_morphoru_gicrya_sent(lines): 34 | for line in lines: 35 | _, text, lemma, pos, feats = parse_row(line) 36 | feats = dict(parse_feats(feats)) 37 | yield MorphoToken(text, lemma, pos, feats) 38 | 39 | 40 | def parse_morphoru_corpora_sent(lines): 41 | for line in lines: 42 | parts = parse_row(line) 43 | _, text, lemma, pos, _, feats = parts[:6] 44 | feats = dict(parse_feats(feats)) 45 | yield MorphoToken(text, lemma, pos, feats) 46 | 47 | 48 | def parse_morphoru_rnc(lines): 49 | # ==> blogs.xhtml <== 50 | # ==newfile== 51 | # Кстати кстати H _ _ 52 | # о о ADP _ _ 53 | 54 | for group in group_sents(lines): 55 | attrs, tokens = [], [] 56 | for line in group: 57 | if line.startswith('=='): 58 | attrs.append(line) 59 | else: 60 | _, text, lemma, pos, feats, feats2 = parse_row(line) 61 | feats = dict(parse_feats(feats)) 62 | feats2 = dict(parse_feats(feats2)) 63 | token = MorphoToken(text, lemma, pos, feats, feats2) 64 | tokens.append(token) 65 | yield MorphoSent(tokens, attrs) 66 | 67 | 68 | def load_morphoru_gicrya(path): 69 | lines = load_lines(path) 70 | return parse_morphoru(lines, parse_morphoru_gicrya_sent) 71 | 72 | 73 | def load_morphoru_rnc(path): 74 | lines = load_lines(path) 75 | return parse_morphoru_rnc(lines) 76 | 77 | 78 | def load_morphoru_corpora(path): 79 | lines = load_lines(path) 80 | return parse_morphoru(lines, parse_morphoru_corpora_sent) 81 | 82 | 83 | __all__ = [ 84 | 'load_morphoru_gicrya', 85 | 'load_morphoru_rnc', 86 | 'load_morphoru_corpora' 87 | ] 88 | -------------------------------------------------------------------------------- /data/ud/ru_syntagrus-ud-dev.conllu: -------------------------------------------------------------------------------- 1 | # sent_id = 2013Algoritm.xml_16 2 | # text = Различные определения алгоритма в явной или неявной форме содержат следующий ряд общих требований: 3 | 1 Различные различный ADJ _ Case=Nom|Degree=Pos|Number=Plur 2 amod 2:amod _ 4 | 2 определения определение NOUN _ Animacy=Inan|Case=Nom|Gender=Neut|Number=Plur 9 nsubj 9:nsubj _ 5 | 3 алгоритма алгоритм NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing 2 nmod 2:nmod _ 6 | 4 в в ADP _ _ 8 case 8:case _ 7 | 5 явной явный ADJ _ Case=Loc|Degree=Pos|Gender=Fem|Number=Sing 8 amod 8:amod _ 8 | 6 или или CCONJ _ _ 7 cc 7:cc _ 9 | 7 неявной неявный ADJ _ Case=Loc|Degree=Pos|Gender=Fem|Number=Sing 5 conj 5:conj _ 10 | 8 форме форма NOUN _ Animacy=Inan|Case=Loc|Gender=Fem|Number=Sing 9 obl 9:obl _ 11 | 9 содержат содержать VERB _ Aspect=Imp|Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 
0 root 0:root _ 12 | 10 следующий следующий ADJ _ Animacy=Inan|Case=Acc|Degree=Pos|Gender=Masc|Number=Sing 11 amod 11:amod _ 13 | 11 ряд ряд NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 9 obj 9:obj _ 14 | 12 общих общий ADJ _ Case=Gen|Degree=Pos|Number=Plur 13 amod 13:amod _ 15 | 13 требований требование NOUN _ Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur 11 nmod 11:nmod SpaceAfter=No 16 | 14 : : PUNCT _ _ 9 punct 9:punct _ 17 | 18 | # sent_id = 2013Algoritm.xml_17 19 | # text = - Дискретность - алгоритм должен представлять процесс решения задачи как последовательное выполнение некоторых простых шагов. 20 | 1 - - PUNCT _ _ 2 punct 2:punct _ 21 | 2 Дискретность дискретность NOUN _ Animacy=Inan|Case=Nom|Gender=Fem|Number=Sing 0 root 0:root _ 22 | 3 - - PUNCT _ _ 5 punct 5:punct _ 23 | 4 алгоритм алгоритм NOUN _ Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 5 nsubj 5:nsubj _ 24 | 5 должен должен ADJ _ Degree=Pos|Gender=Masc|Number=Sing|Variant=Short 2 parataxis 2:parataxis _ 25 | 6 представлять представлять VERB _ Aspect=Imp|VerbForm=Inf|Voice=Act 5 xcomp 5:xcomp _ 26 | 7 процесс процесс NOUN _ Animacy=Inan|Case=Acc|Gender=Masc|Number=Sing 6 obj 6:obj _ 27 | 8 решения решение NOUN _ Animacy=Inan|Case=Gen|Gender=Neut|Number=Sing 7 nmod 7:nmod _ 28 | 9 задачи задача NOUN _ Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 8 nmod 8:nmod _ 29 | 10 как как SCONJ _ _ 12 mark 12:mark _ 30 | 11 последовательное последовательный ADJ _ Case=Acc|Degree=Pos|Gender=Neut|Number=Sing 12 amod 12:amod _ 31 | 12 выполнение выполнение NOUN _ Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing 6 advcl 6:advcl _ 32 | 13 некоторых некоторый DET _ Case=Gen|Number=Plur 15 det 15:det _ 33 | 14 простых простой ADJ _ Case=Gen|Degree=Pos|Number=Plur 15 amod 15:amod _ 34 | 15 шагов шаг NOUN _ Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur 12 nmod 12:nmod SpaceAfter=No 35 | 16 . . PUNCT _ _ 2 punct 2:punct _ 36 | -------------------------------------------------------------------------------- /data/ud/ru_pud-ud-test.conllu: -------------------------------------------------------------------------------- 1 | # newdoc id = n01010 2 | # sent_id = n01010042 3 | # text = «Был момент, — сказал господин Панвалкар, — когда он чувствовал, что они должны покинуть здание». 4 | # english_text = There was a time, Mr Panvalkar said, when he felt that they should leave the building. 
5 | 1 « « PUNCT `` _ 2 punct _ SpaceAfter=No 6 | 2 Был быть AUX VBC Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 0 root _ _ 7 | 3 момент момент NOUN NN Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 2 nsubj _ SpaceAfter=No 8 | 4 , , PUNCT , _ 6 punct _ _ 9 | 5 — — PUNCT - _ 6 punct _ OrigForm=-- 10 | 6 сказал сказать VERB VBC Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 2 parataxis _ _ 11 | 7 господин господин NOUN NN Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing 6 nsubj _ _ 12 | 8 Панвалкар Панвалкар PROPN NNP Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing 7 flat:name _ SpaceAfter=No 13 | 9 , , PUNCT , _ 6 punct _ _ 14 | 10 — — PUNCT - _ 6 punct _ OrigForm=-- 15 | 11 когда когда SCONJ IN _ 13 mark _ _ 16 | 12 он он PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3 13 nsubj _ _ 17 | 13 чувствовал чувствовать VERB VBC Aspect=Imp|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act 3 advcl _ SpaceAfter=No 18 | 14 , , PUNCT , _ 17 punct _ _ 19 | 15 что что SCONJ IN _ 17 mark _ _ 20 | 16 они они PRON PRP Case=Nom|Number=Plur|Person=3 17 nsubj _ _ 21 | 17 должны должен ADJ JJH Degree=Pos|Number=Plur|Variant=Short 13 ccomp _ _ 22 | 18 покинуть покинуть VERB VB Aspect=Perf|VerbForm=Inf|Voice=Act 17 xcomp _ _ 23 | 19 здание здание NOUN NN Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing 18 obj _ SpaceAfter=No 24 | 20 » » PUNCT ' _ 2 punct _ SpaceAfter=No 25 | 21 . . PUNCT . _ 2 punct _ _ 26 | 27 | # newdoc id = n01011 28 | # sent_id = n01011004 29 | # text = Ей также предъявлено обвинение в покушении на убийство ее двухлетней дочери. 30 | # english_text = She has also been charged with trying to kill her two-year-old daughter. 31 | 1 Ей она PRON PRP Case=Dat|Gender=Fem|Number=Sing|Person=3 3 iobj _ _ 32 | 2 также также ADV RB Degree=Pos 3 advmod _ _ 33 | 3 предъявлено предъявить VERB VBNH Aspect=Perf|Gender=Neut|Number=Sing|Tense=Past|Variant=Short|VerbForm=Part|Voice=Pass 0 root _ _ 34 | 4 обвинение обвинение NOUN NN Animacy=Inan|Case=Nom|Gender=Neut|Number=Sing 3 nsubj:pass _ _ 35 | 5 в в ADP IN _ 6 case _ _ 36 | 6 покушении покушение NOUN NN Animacy=Inan|Case=Loc|Gender=Neut|Number=Sing 4 nmod _ _ 37 | 7 на на ADP IN _ 8 case _ _ 38 | 8 убийство убийство NOUN NN Animacy=Inan|Case=Acc|Gender=Neut|Number=Sing 6 nmod _ _ 39 | 9 ее ее DET PRP$ _ 11 det _ _ 40 | 10 двухлетней двухлетний ADJ JJ Case=Gen|Degree=Pos|Gender=Fem|Number=Sing 11 amod _ _ 41 | 11 дочери дочь NOUN NN Animacy=Anim|Case=Gen|Gender=Fem|Number=Sing 8 nmod _ SpaceAfter=No 42 | 12 . . PUNCT . 
_ 3 punct _ _ 43 | -------------------------------------------------------------------------------- /corus/sources/taiga/social.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from itertools import islice as head 4 | 5 | from corus.record import Record 6 | from corus.io import match_names 7 | 8 | from .common import ( 9 | load_tar, 10 | parse_filename_id 11 | ) 12 | 13 | 14 | FB = 'fb' 15 | LJ = 'lj' 16 | TWITTER = 'twitter' 17 | VK = 'vk' 18 | NETWORKS = { 19 | 'fbtexts': FB, 20 | 'LiveJournalPostsandcommentsGICR': LJ, 21 | 'twtexts': TWITTER, 22 | 'vktexts': VK 23 | } 24 | 25 | 26 | class TaigaSocialRecord(Record): 27 | __attributes__ = ['id', 'network', 'text'] 28 | 29 | def __init__(self, id, network, text): 30 | self.id = id 31 | self.network = network 32 | self.text = text 33 | 34 | 35 | def parse_lines(file, encoding='utf8'): 36 | for line in file: 37 | line = line.decode(encoding) 38 | yield line.lstrip('\ufeff').rstrip('\r\n') 39 | 40 | 41 | def parse_lj(file): 42 | for text in parse_lines(file): 43 | yield TaigaSocialRecord( 44 | id=None, 45 | network=LJ, 46 | text=text 47 | ) 48 | 49 | 50 | # DataBaseItem: 6_5756d99b5dd2dc3dac164155 51 | # Кстати, как неожиданно КПРФ 52 | # DataBaseItem: 6_5756d9a85dd2dc3dac1645ae 53 | # [id12890229|Евгений], можно и по-другому сказать: "убогая клоунада" КПРФ - это 54 | 55 | 56 | def flush(network, id, buffer): 57 | text = '\n'.join(buffer) 58 | return TaigaSocialRecord( 59 | id=id, 60 | network=network, 61 | text=text 62 | ) 63 | 64 | 65 | def parse_social_(file, network): 66 | lines = parse_lines(file) 67 | previous = None 68 | buffer = [] 69 | for line in lines: 70 | match = re.match(r'^DataBaseItem: (.+)$', line) 71 | if match: 72 | if previous: 73 | yield flush(network, previous, buffer) 74 | buffer = [] 75 | previous = match.group(1) 76 | else: 77 | buffer.append(line) 78 | if previous: 79 | yield flush(network, previous, buffer) 80 | buffer = [] 81 | 82 | 83 | def parse_social(file, network): 84 | if network == LJ: 85 | return parse_lj(file) 86 | else: 87 | return parse_social_(file, network) 88 | 89 | 90 | def load_taiga_social(path, offset=3985892864, count=4): 91 | records = load_tar(path, offset=offset) 92 | records = match_names(records, '*/texts/*.txt') 93 | records = head(records, count) 94 | for record in records: 95 | network = parse_filename_id(record.name) 96 | network = NETWORKS[network] 97 | for record in parse_social(record.file, network): 98 | yield record 99 | 100 | 101 | __all__ = [ 102 | 'load_taiga_social' 103 | ] 104 | -------------------------------------------------------------------------------- /corus/sources/taiga/proza.py: -------------------------------------------------------------------------------- 1 | 2 | from datetime import datetime 3 | 4 | from .common import ( 5 | Author, 6 | Meta, 7 | load_zip_metas, 8 | load_zip_texts, 9 | merge_metas 10 | ) 11 | 12 | 13 | # {'URL': 'http://www.stihi.ru/2015/12/31/9302', 14 | # 'author': 'Макс Майер-Младший', 15 | # 'author_readers': '26', 16 | # 'author_texts': '2085', 17 | # 'authorlink': 'http://www.stihi.ru/avtor/380979994453', 18 | # 'date': '31.12.2015', 19 | # 'genre': 'лирика', 20 | # 'path': '/home/tsha/stihi_ru/texts/2015/12/20151231001.txt', 21 | # 'textid': '20151231001', 22 | # 'time': '23:56', 23 | # 'title': 'Ти знов являЕшся менi у снi', 24 | # 'topic': 'любовная лирика'} 25 | 26 | 27 | def parse_metas(items): 28 | for item in items: 29 | id = item['textid'] 30 | 31 | timestamp = item['date'] + 
item['time'] 32 | timestamp = datetime.strptime(timestamp, '%d.%m.%Y%H:%M') 33 | 34 | name = item['author'] 35 | readers = item['author_readers'] or None 36 | if readers: 37 | readers = int(readers) 38 | texts = item['author_texts'] or None 39 | if texts: 40 | texts = int(texts) 41 | url = item['authorlink'] 42 | author = Author( 43 | name=name, 44 | readers=readers, 45 | texts=texts, 46 | url=url 47 | ) 48 | 49 | genre = item['genre'] 50 | topic = item['topic'] 51 | title = item['title'] 52 | url = item['URL'] 53 | yield Meta( 54 | id=id, 55 | timestamp=timestamp, 56 | author=author, 57 | genre=genre, 58 | topic=topic, 59 | title=title, 60 | url=url 61 | ) 62 | 63 | 64 | def load_taiga_proza_metas(path, offset=0, count=2017 - 2005 + 1): 65 | items = load_zip_metas(path, '*/metatable_texts.txt', offset, count) 66 | return parse_metas(items) 67 | 68 | 69 | def load_taiga_stihi_metas(path, offset=0, count=2017 - 2015 + 1): 70 | items = load_zip_metas(path, '*/metatable_texts.txt', offset, count) 71 | return parse_metas(items) 72 | 73 | 74 | # proza_ru/home/tsha/proza_ru/tagged_texts/2015/12/20151231005.txt 75 | # proza_ru/home/tsha/proza_ru/texts/2015/12/20151231005.txt 76 | 77 | 78 | def load_taiga_proza(path, metas=None, offset=51432715409, count=1732589): 79 | records = load_zip_texts(path, '*/texts/*.txt', offset, count) 80 | return merge_metas(records, metas) 81 | 82 | 83 | def load_taiga_stihi(path, metas=None, offset=22304202421, count=9157973): 84 | records = load_zip_texts(path, '*/texts/*.txt', offset, count) 85 | return merge_metas(records, metas) 86 | 87 | 88 | __all__ = [ 89 | 'load_taiga_proza_metas', 90 | 'load_taiga_proza', 91 | 'load_taiga_stihi_metas', 92 | 'load_taiga_stihi', 93 | ] 94 | -------------------------------------------------------------------------------- /corus/sources/taiga/fontanka.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from datetime import datetime 4 | 5 | from .common import ( 6 | Meta, 7 | load_tar_metas, 8 | load_tar_texts, 9 | merge_metas 10 | ) 11 | 12 | 13 | # {'author': '', 14 | # 'authorreaders': '', 15 | # 'authortexts': '', 16 | # 'date': '04.04.2015', 17 | # 'magazine': '', 18 | # 'segment': 'Fontanka', 19 | # 'tags': 'Санкт-Петербург, Петербург, СПб, фонтанка, фонтанка.ру, АЖУР, Агентство Журналистских расследований, СМИ, новости, новости Петербурга, политика, экономика, криминал, Фонтанка, информация, события, город, культура, политика, бизнес, общество, происшествия, спорт, свободное время, авто, недвижимость, зарубежная недвижимость, Охта центр, финансы, туризм, работа, особое мнениеhttp://www.fontanka.ru/2015/04/04/068/', 20 | # 'textdiff': '', 21 | # 'textid': '20150404068', 22 | # 'textname': 'Минобороны: Россия не отстает от США в разработке лазерного оружия', 23 | # 'textregion': '', 24 | # 'textrubric': 'Технологии', 25 | # 'time': '20:59'}, 26 | 27 | 28 | def parse_metas(items): 29 | for item in items: 30 | id = item['textid'] 31 | # {'segment': 'Fontanka', 'textname': '"', 'textid': '20100205145'} 32 | tags, url, rubric, title = (), None, None, None 33 | 34 | if 'date' in item and 'time' in item: 35 | timestamp = item['date'] + item['time'] 36 | if timestamp: 37 | timestamp = datetime.strptime(timestamp, '%d.%m.%Y%H:%M') 38 | 39 | if 'tags' in item: 40 | tags = item['tags'] 41 | match = re.search(r'(http://.+)$', tags) 42 | if match: 43 | url = match.group(1) 44 | tags = re.split(r',\s+', tags[:match.start()]) 45 | 46 | rubric = item.get('textrubric') 47 | title = 
item.get('textname') 48 | yield Meta( 49 | id=id, 50 | timestamp=timestamp, 51 | tags=tags, 52 | rubric=rubric, 53 | title=title, 54 | url=url 55 | ) 56 | 57 | 58 | def load_taiga_fontanka_metas(path, offset=0, count=2017 - 2005 + 1): 59 | items = load_tar_metas(path, '*/metatable_*.csv', offset, count) 60 | return parse_metas(items) 61 | 62 | 63 | # home/tsha/Fontanka/texts/2007/fontanka_20070101001.txt 64 | # home/tsha/Fontanka/texts/2007/fontanka_20070101002.txt 65 | # home/tsha/Fontanka/texts/2007/fontanka_20070101004.txt 66 | # home/tsha/Fontanka/texts/2007/fontanka_20070101003.txt 67 | 68 | 69 | def parse_id(name): 70 | match = re.search(r'fontanka_(\d+)\.txt', name) 71 | return match.group(1) 72 | 73 | 74 | def load_taiga_fontanka(path, metas=None, offset=306359296, count=342683): 75 | records = load_tar_texts(path, '*/texts/*.txt', offset, count, parse_id) 76 | return merge_metas(records, metas) 77 | 78 | 79 | __all__ = [ 80 | 'load_taiga_fontanka_metas', 81 | 'load_taiga_fontanka' 82 | ] 83 | -------------------------------------------------------------------------------- /corus/io.py: -------------------------------------------------------------------------------- 1 | 2 | import gzip 3 | import bz2 4 | from zipfile import ZipFile 5 | 6 | import csv 7 | import json 8 | 9 | import xml.etree.ElementTree as ET 10 | 11 | from fnmatch import fnmatch as match_pattern 12 | 13 | 14 | ####### 15 | # 16 | # UTILS 17 | # 18 | ####### 19 | 20 | 21 | def match_names(records, pattern): 22 | for record in records: 23 | if match_pattern(record.name, pattern): 24 | yield record 25 | 26 | 27 | ####### 28 | # 29 | # TEXT 30 | # 31 | ######## 32 | 33 | 34 | def rstrip(text): 35 | return text.rstrip('\r\n') 36 | 37 | 38 | def load_text(path): 39 | with open(path) as file: 40 | return file.read() 41 | 42 | 43 | def dump_text(text, path): 44 | with open(path, 'w') as file: 45 | file.write(text) 46 | 47 | 48 | def load_lines(path, encoding="utf-8"): 49 | with open(path, encoding=encoding) as file: 50 | for line in file: 51 | yield rstrip(line) 52 | 53 | 54 | ##### 55 | # 56 | # XML 57 | # 58 | ###### 59 | 60 | 61 | def parse_xml(text): 62 | return ET.fromstring(text) 63 | 64 | 65 | ######### 66 | # 67 | # GZ, BZ, XZ 68 | # 69 | ##### 70 | 71 | 72 | def load_z_lines(path, open, encoding='utf8'): 73 | with open(path, mode='rt', encoding=encoding) as file: 74 | for line in file: 75 | yield rstrip(line) 76 | 77 | 78 | def load_gz_lines(path): 79 | return load_z_lines(path, gzip.open) 80 | 81 | 82 | def load_bz2_lines(path): 83 | return load_z_lines(path, bz2.open) 84 | 85 | 86 | def load_xz_lines(path): 87 | # Python may be built without lzma support 88 | # https://github.com/pandas-dev/pandas/issues/27532 89 | import lzma 90 | 91 | return load_z_lines(path, lzma.open) 92 | 93 | 94 | ####### 95 | # 96 | # ZIP 97 | # 98 | ######## 99 | 100 | 101 | def list_zip(path): 102 | with ZipFile(path) as zip: 103 | return zip.namelist() 104 | 105 | 106 | def load_zip_lines(path, name, encoding='utf8'): 107 | with ZipFile(path) as zip: 108 | with zip.open(name) as file: 109 | for line in file: 110 | yield rstrip(line.decode(encoding)) 111 | 112 | 113 | def load_zip_texts(path, names, encoding='utf8'): 114 | with ZipFile(path) as zip: 115 | for name in names: 116 | with zip.open(name) as file: 117 | yield file.read().decode(encoding) 118 | 119 | 120 | ######## 121 | # 122 | # CSV 123 | # 124 | ####### 125 | 126 | 127 | def parse_csv(lines, delimiter=',', max_field=None): 128 | if max_field: 129 | 
csv.field_size_limit(max_field) 130 | return csv.reader(lines, delimiter=delimiter) 131 | 132 | 133 | def parse_tsv(lines): 134 | return parse_csv(lines, delimiter='\t') 135 | 136 | 137 | def skip_header(rows): 138 | return next(rows) 139 | 140 | 141 | def dict_csv(rows): 142 | header = next(rows) 143 | for row in rows: 144 | yield dict(zip(header, row)) 145 | 146 | 147 | ######### 148 | # 149 | # JSONL 150 | # 151 | ####### 152 | 153 | 154 | def parse_jsonl(lines): 155 | for line in lines: 156 | yield json.loads(line) 157 | -------------------------------------------------------------------------------- /corus/sources/mokoron.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from datetime import datetime 4 | 5 | from corus.record import Record 6 | from corus.io import load_lines 7 | 8 | 9 | # – id: уникальный номер сообщения в системе twitter; 10 | # – tdate: дата публикации сообщения (твита); 11 | # – tmane: имя пользователя, опубликовавшего сообщение; 12 | # – ttext: текст сообщения (твита); 13 | # – ttype: поле в котором в дальнейшем будет указано к кому классу относится твит (положительный, отрицательный, нейтральный); 14 | # – trep: количество реплаев к данному сообщению. В настоящий момент API твиттера не отдает эту информацию; 15 | # – tfav: число сколько раз данное сообщение было добавлено в избранное другими пользователями; 16 | # – tstcount: число всех сообщений пользователя в сети twitter; 17 | # – tfol: количество фоловеров пользователя (тех людей, которые читают пользователя); 18 | # – tfrien: количество друзей пользователя (те люди, которых читает пользователь); 19 | # – listcount: количество листов-подписок в которые добавлен твиттер-пользователь. 20 | 21 | 22 | class MokoronRecord(Record): 23 | __attributes__ = [ 24 | 'id', 'timestamp', 'user', 'text', 'sentiment', 25 | 'replies', 'retweets', 'favourites', 'posts', 26 | 'followers', 'friends', 'lists' 27 | ] 28 | 29 | def __init__(self, id, timestamp, user, text, sentiment, 30 | replies, retweets, favourites, posts, followers, friends, lists): 31 | self.id = id 32 | self.timestamp = timestamp 33 | self.user = user 34 | self.text = text 35 | self.sentiment = sentiment 36 | self.replies = replies 37 | self.retweets = retweets 38 | self.favourites = favourites 39 | self.posts = posts 40 | self.followers = followers 41 | self.friends = friends 42 | self.lists = lists 43 | 44 | @classmethod 45 | def from_match(cls, match): 46 | dict = match.groupdict() 47 | for key in ['id', 'sentiment', 'replies', 'retweets', 48 | 'favourites', 'posts', 'followers', 'friends', 'lists']: 49 | dict[key] = int(dict[key]) 50 | dict['timestamp'] = datetime.utcfromtimestamp(float(dict['timestamp'])) 51 | return cls(**dict) 52 | 53 | 54 | # INSERT INTO `sentiment` VALUES (408906695721877504,'1386325928','Va5ilina','Пропавшая в Хабаровске школьница почти сутки провела в яме у коллектор',2,0,0,0,183,95,158,0),(408906695700520960,'1386325928','i_wont_judge_ya','ЛЕНТА, Я СЕГОДНЯ ПОЛГОДА ДИРЕКШИОНЕЕЕЕР! 
С:\nХОТЯ ВСЕ РАВНО НИКТО НЕ ПОЗДРАВИТ ЛОЛ',2,0,0,0,19809,804,257,11), 55 | 56 | 57 | INSERT = 'INSERT INTO `sentiment` VALUES' 58 | RECORD = re.compile(r''' 59 | \( 60 | (?P<id>\d+), 61 | '(?P<timestamp>\d+)', 62 | '(?P<user>.+?)', 63 | '(?P<text>.+?)', 64 | (?P<sentiment>\d+), 65 | (?P<replies>\d+), 66 | (?P<retweets>\d+), 67 | (?P<favourites>\d+), 68 | (?P<posts>\d+), 69 | (?P<followers>\d+), 70 | (?P<friends>\d+), 71 | (?P<lists>\d+) 72 | \) 73 | ''', re.X) 74 | 75 | 76 | def load_mokoron(path): 77 | for line in load_lines(path): 78 | if line.startswith(INSERT): 79 | for match in RECORD.finditer(line): 80 | yield MokoronRecord.from_match(match) 81 | 82 | 83 | __all__ = [ 84 | 'load_mokoron' 85 | ] 86 | -------------------------------------------------------------------------------- /corus/sources/ud.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import load_lines 4 | 5 | 6 | class UDSent(Record): 7 | __attributes__ = ['id', 'text', 'attrs', 'tokens'] 8 | 9 | def __init__(self, id, text, attrs, tokens): 10 | self.id = id 11 | self.text = text 12 | self.attrs = attrs 13 | self.tokens = tokens 14 | 15 | 16 | class UDToken(Record): 17 | __attributes__ = ['id', 'text', 'lemma', 'pos', 'feats', 'head_id', 'rel'] 18 | 19 | def __init__(self, id, text, lemma, pos, feats, head_id, rel): 20 | self.id = id 21 | self.text = text 22 | self.lemma = lemma 23 | self.pos = pos 24 | self.feats = feats 25 | self.head_id = head_id 26 | self.rel = rel 27 | 28 | 29 | def group_sents(lines): 30 | buffer = [] 31 | for line in lines: 32 | if not line: 33 | yield buffer 34 | buffer = [] 35 | else: 36 | buffer.append(line) 37 | if buffer: 38 | yield buffer 39 | 40 | 41 | def parse_feats(tags): 42 | if not tags: 43 | return 44 | 45 | for pair in tags.split('|'): 46 | key, value = pair.split('=', 1) 47 | yield key, value 48 | 49 | 50 | def _none(value): 51 | if value == '_': 52 | return 53 | return value 54 | 55 | 56 | def parse_row(line): 57 | return [_none(_) for _ in line.split('\t')] 58 | 59 | 60 | def parse_attr(line): 61 | # newdoc 62 | # title = instagram-2019 63 | # newpar 64 | # sent_id = instagram-1 65 | # speaker = screened-18 66 | 67 | line = line.lstrip('# ') 68 | if ' = ' in line: 69 | return line.split(' = ', 1) 70 | else: 71 | return line, None 72 | 73 | 74 | def parse_token(line): 75 | id, text, lemma, pos, _, feats, head_id, rel, _, _ = parse_row(line) 76 | feats = dict(parse_feats(feats)) 77 | return UDToken(id, text, lemma, pos, feats, head_id, rel) 78 | 79 | 80 | def parse_ud(lines): 81 | # newdoc id = n01001 82 | # sent_id = n01001011 83 | # text = «Если передача цифровых технологий сегодня 84 | # 1 « « PUNCT `` _ 19 punct _ SpaceA 85 | # 2 Если если SCONJ IN _ 9 mark _ _ 86 | # 3 передача передача NOUN NN Animacy=Inan|Case=N 87 | 88 | for group in group_sents(lines): 89 | attrs = {} 90 | tokens = [] 91 | for line in group: 92 | if line.startswith('#'): 93 | key, value = parse_attr(line) 94 | attrs[key] = value 95 | else: 96 | token = parse_token(line) 97 | tokens.append(token) 98 | 99 | id = attrs.pop('sent_id', None) 100 | text = attrs.pop('text', None) 101 | yield UDSent(id, text, attrs, tokens) 102 | 103 | 104 | def load_ud(path): 105 | lines = load_lines(path) 106 | return parse_ud(lines) 107 | 108 | 109 | def load_ud_gsd(path): 110 | return load_ud(path) 111 | 112 | 113 | def load_ud_taiga(path): 114 | return load_ud(path) 115 | 116 | 117 | def load_ud_pud(path): 118 | return load_ud(path) 119 | 120 | 121 | def load_ud_syntag(path): 122 | return load_ud(path) 123 | 124 | 125 | __all__ = [ 126 |
'load_ud_gsd', 127 | 'load_ud_taiga', 128 | 'load_ud_pud', 129 | 'load_ud_syntag', 130 | ] 131 | -------------------------------------------------------------------------------- /corus/sources/ods.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | 3 | from datetime import datetime 4 | 5 | from corus.record import Record 6 | from corus.io import ( 7 | load_gz_lines, 8 | parse_csv, 9 | skip_header 10 | ) 11 | 12 | 13 | class NewsRecord(Record): 14 | __attributes__ = [ 15 | 'timestamp', 'url', 'edition', 'topics', 16 | 'authors', 'title', 'text', 'stats' 17 | ] 18 | 19 | def __init__(self, timestamp, url, edition, topics, authors, title, text, stats): 20 | self.timestamp = timestamp 21 | self.url = url 22 | self.edition = edition 23 | self.topics = topics 24 | self.authors = authors 25 | self.title = title 26 | self.text = text 27 | self.stats = stats 28 | 29 | 30 | class Stats(Record): 31 | __attributes__ = [ 32 | 'fb', 'vk', 'ok', 'twitter', 'lj', 'tg', 33 | 'likes', 'views', 'comments' 34 | ] 35 | 36 | def __init__(self, fb, vk, ok, twitter, lj, tg, likes, views, comments): 37 | self.fb = fb 38 | self.vk = vk 39 | self.ok = ok 40 | self.twitter = twitter 41 | self.lj = lj 42 | self.tg = tg 43 | self.likes = likes 44 | self.views = views 45 | self.comments = comments 46 | 47 | 48 | def none_row(row): 49 | for cell in row: 50 | if not cell or cell == '-': 51 | cell = None 52 | yield cell 53 | 54 | 55 | def maybe_int(value): 56 | if value: 57 | return int(value) 58 | return 59 | 60 | 61 | def parse_news(lines): 62 | # tass raises "field larger than field limit" 63 | rows = parse_csv(lines, max_field=100000000) 64 | skip_header(rows) 65 | for row in rows: 66 | (timestamp, url, edition, topics, authors, title, text, 67 | fb, vk, ok, twitter, lj, tg, likes, views, comments) = none_row(row) 68 | 69 | timestamp = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S') 70 | 71 | if authors: 72 | authors = authors.split(',') 73 | 74 | # empty texts in meduza 75 | text = text or '' 76 | 77 | stats = Stats( 78 | maybe_int(fb), 79 | maybe_int(vk), 80 | maybe_int(ok), 81 | maybe_int(twitter), 82 | maybe_int(lj), 83 | maybe_int(tg), 84 | maybe_int(likes), 85 | maybe_int(views), 86 | maybe_int(comments) 87 | ) 88 | yield NewsRecord( 89 | timestamp, url, edition, topics, authors, 90 | title, text, stats 91 | ) 92 | 93 | 94 | def load_news(path): 95 | lines = load_gz_lines(path) 96 | return parse_news(lines) 97 | 98 | 99 | def load_ods_interfax(path): 100 | return load_news(path) 101 | 102 | 103 | def load_ods_gazeta(path): 104 | return load_news(path) 105 | 106 | 107 | def load_ods_izvestia(path): 108 | return load_news(path) 109 | 110 | 111 | def load_ods_meduza(path): 112 | return load_news(path) 113 | 114 | 115 | def load_ods_ria(path): 116 | return load_news(path) 117 | 118 | 119 | def load_ods_rt(path): 120 | return load_news(path) 121 | 122 | 123 | def load_ods_tass(path): 124 | return load_news(path) 125 | 126 | 127 | __all__ = [ 128 | 'load_ods_interfax', 129 | 'load_ods_gazeta', 130 | 'load_ods_izvestia', 131 | 'load_ods_meduza', 132 | 'load_ods_ria', 133 | 'load_ods_rt', 134 | 'load_ods_tass', 135 | ] 136 | -------------------------------------------------------------------------------- /corus/sources/omnia.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | from corus.record import Record 5 | from corus.io import load_xz_lines 6 | 7 | 8 | class OmniaDoc(Record): 9 | __attributes__ = 
['id', 'attrs', 'pars'] 10 | 11 | def __init__(self, id, attrs, pars): 12 | self.id = id 13 | self.attrs = attrs 14 | self.pars = pars 15 | 16 | 17 | class OmniaPar(Record): 18 | __attributes__ = ['sents'] 19 | 20 | def __init__(self, sents): 21 | self.sents = sents 22 | 23 | 24 | class OmniaSent(Record): 25 | __attributes__ = ['tokens'] 26 | 27 | def __init__(self, tokens): 28 | self.tokens = tokens 29 | 30 | 31 | class OmniaToken(Record): 32 | __attributes__ = ['text', 'lemma', 'atag', 'tag', 'ztag', 'g'] 33 | 34 | def __init__(self, text, lemma, atag, tag, ztag, g): 35 | self.text = text 36 | self.lemma = lemma 37 | self.atag = atag 38 | self.tag = tag 39 | self.ztag = ztag 40 | self.g = g 41 | 42 | 43 | DID = 'did' 44 | G_TAG = '' 45 | S_END = '' 46 | P_END = '

' 47 | DOC_END = '' 48 | 49 | 50 | def take_until(stream, value): 51 | for item in stream: 52 | if item == value: 53 | break 54 | yield item 55 | 56 | 57 | def group_bounds(stream, end): 58 | for _ in stream: 59 | yield take_until(stream, end) 60 | 61 | 62 | def group_doc_bounds(stream): 63 | for header in stream: 64 | group = take_until(stream, DOC_END) 65 | yield header, group 66 | 67 | 68 | def group_pairs(stream): 69 | previous = None 70 | for item in stream: 71 | if previous: 72 | yield previous, item 73 | previous = item 74 | if previous: 75 | yield previous, None 76 | 77 | 78 | def parse_tokens(lines): 79 | pairs = group_pairs(lines) 80 | for line, next in pairs: 81 | if line == G_TAG: 82 | continue 83 | 84 | parts = line.split('\t') 85 | if len(parts) != 5: 86 | # наблюдать наблюдать Vb Vmn----a-e 1 87 | # интерес интерес Nn Ncmsan 1 88 | # Э Э Zz - 1 89 | # 90 | # <дуарда> 91 | # Г Г Zz - 1 92 | # 93 | # <еоргиевича> 94 | # к к Pp Sp-d 1 95 | # попыткам попытка Nn Ncfpdn 1 96 | 97 | # weird tag lines 98 | # <нрзб> <НРЗБ> 99 | # <дуарда> 100 | # <еоргиевича> 101 | 102 | # just skip them 103 | continue 104 | 105 | # Refs on atag and tag: 106 | # http://unesco.uniba.sk/aranea_about/aut.html 107 | # http://nl.ijs.si/ME/V4/msd/html/msd-ru.html 108 | text, lemma, atag, tag, ztag = parts 109 | g = next == G_TAG 110 | 111 | yield OmniaToken(text, lemma, atag, tag, ztag, g) 112 | 113 | 114 | def parse_sents(lines): 115 | groups = group_bounds(lines, S_END) 116 | for group in groups: 117 | tokens = list(parse_tokens(group)) 118 | yield OmniaSent(tokens) 119 | 120 | 121 | def parse_pars(lines): 122 | groups = group_bounds(lines, P_END) 123 | for group in groups: 124 | sents = list(parse_sents(group)) 125 | yield OmniaPar(sents) 126 | 127 | 128 | def parse_tag_attrs(tag): 129 | matches = re.finditer(r'([^= ]+)="([^"]+)"', tag) 130 | for match in matches: 131 | yield match.groups() 132 | 133 | 134 | def parse_doc_header(header): 135 | attrs = dict(parse_tag_attrs(header)) 136 | id = attrs.pop(DID) 137 | return id, attrs 138 | 139 | 140 | def parse_docs(lines): 141 | groups = group_doc_bounds(lines) 142 | for header, group in groups: 143 | id, attrs = parse_doc_header(header) 144 | pars = list(parse_pars(group)) 145 | yield OmniaDoc(id, attrs, pars) 146 | 147 | 148 | def load_omnia(path): 149 | lines = load_xz_lines(path) 150 | yield from parse_docs(lines) 151 | -------------------------------------------------------------------------------- /corus/sources/corpora.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | from corus.io import ( 4 | list_zip, 5 | load_zip_texts, 6 | parse_xml 7 | ) 8 | 9 | 10 | class CorporaText(Record): 11 | __attributes__ = ['id', 'parent_id', 'name', 'tags', 'pars'] 12 | 13 | def __init__(self, id, parent_id, name, tags, pars): 14 | self.id = id 15 | self.parent_id = parent_id 16 | self.name = name 17 | self.tags = tags 18 | self.pars = pars 19 | 20 | 21 | class CorporaPar(Record): 22 | __attributes__ = ['id', 'sents'] 23 | 24 | def __init__(self, id, sents): 25 | self.id = id 26 | self.sents = sents 27 | 28 | 29 | class CorporaSent(Record): 30 | __attributes__ = ['id', 'text', 'tokens'] 31 | 32 | def __init__(self, id, text, tokens): 33 | self.id = id 34 | self.text = text 35 | self.tokens = tokens 36 | 37 | 38 | class CorporaToken(Record): 39 | __attributes__ = ['id', 'rev_id', 'text', 'forms'] 40 | 41 | def __init__(self, id, rev_id, text, forms): 42 | self.id = id 43 | self.rev_id = 
rev_id 44 | self.text = text 45 | self.forms = forms 46 | 47 | 48 | class CorporaForm(Record): 49 | __attributes__ = ['id', 'text', 'grams'] 50 | 51 | def __init__(self, id, text, grams): 52 | self.id = id 53 | self.text = text 54 | self.grams = grams 55 | 56 | 57 | # 58 | # 59 | # 60 | # url:http://www.chaskor.ru/news/tak_kto_komu_dolzhen_18043 61 | # Год:2010 62 | # Дата:19/06 63 | # Тема:ЧасКор:Экономика 64 | # Тема:ЧасКор:Экономика/Сырье 65 | # 66 | # 67 | # 68 | # 69 | # Так кто кому должен? 70 | # 71 | # 72 | # 73 | # 74 | # 75 | # 76 | # 77 | # 78 | # 79 | # 80 | # 81 | # 82 | # 83 | 84 | 85 | def parse_grams(xml): 86 | for item in xml: 87 | yield item.get('v') 88 | 89 | 90 | def parse_forms(xml): 91 | for item in xml: 92 | lemma = item.find('l') 93 | id = lemma.get('id') 94 | text = lemma.get('t') 95 | grams = list(parse_grams(lemma)) 96 | yield CorporaForm(id, text, grams) 97 | 98 | 99 | def parse_tokens(xml): 100 | for token in xml: 101 | id = token.get('id') 102 | text = token.get('text') 103 | forms = token.find('tfr') 104 | rev_id = forms.get('rev_id') 105 | forms = list(parse_forms(forms)) 106 | yield CorporaToken(id, rev_id, text, forms) 107 | 108 | 109 | def parse_sents(xml): 110 | for sent in xml: 111 | id = sent.get('id') 112 | source, tokens = sent 113 | text = source.text 114 | tokens = list(parse_tokens(tokens)) 115 | yield CorporaSent(id, text, tokens) 116 | 117 | 118 | def parse_pars(xml): 119 | for par in xml: 120 | id = par.get('id') 121 | sents = list(parse_sents(par)) 122 | yield CorporaPar(id, sents) 123 | 124 | 125 | def parse_tags(xml): 126 | for tag in xml: 127 | yield tag.text 128 | 129 | 130 | def parse_text(xml): 131 | id = xml.get('id') 132 | parent_id = xml.get('parent') 133 | name = xml.get('name') 134 | tags, pars = xml 135 | tags = list(parse_tags(tags)) 136 | pars = list(parse_pars(pars)) 137 | return CorporaText(id, parent_id, name, tags, pars) 138 | 139 | 140 | def load_corpora(path): 141 | names = list_zip(path) 142 | texts = load_zip_texts(path, names) 143 | for text in texts: 144 | xml = parse_xml(text) 145 | yield parse_text(xml) 146 | -------------------------------------------------------------------------------- /corus/readme.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | from .sources.meta import is_group 5 | from .io import ( 6 | load_text, 7 | dump_text 8 | ) 9 | 10 | 11 | COMMANDS = ('wget', 'unzip', 'unrar', 'rm', 'mv', 'tar') 12 | 13 | KB = 1024 14 | MB = 1024 * KB 15 | GB = 1024 * MB 16 | 17 | LABELS = { 18 | KB: 'Kb', 19 | MB: 'Mb', 20 | GB: 'Gb' 21 | } 22 | 23 | 24 | def is_command(step, commands=COMMANDS): 25 | return step.startswith(commands) 26 | 27 | 28 | def format_bytes(value): 29 | value /= KB 30 | unit = KB 31 | for _ in range(2): 32 | if value < KB: 33 | break 34 | value /= KB 35 | unit *= KB 36 | return '%.2f %s' % (value, LABELS[unit]) 37 | 38 | 39 | def format_count(value): 40 | # https://stackoverflow.com/questions/16670125/python-format-string-thousand-separator-with-spaces/ 41 | return format(value, ',').replace(',', ' ') 42 | 43 | 44 | def unfold_metas(items): 45 | for item in items: 46 | if is_group(item): 47 | yield True, item 48 | for meta in item.metas: 49 | yield False, meta 50 | else: 51 | yield False, item 52 | 53 | 54 | def format_metas_(metas, nbviewer=None): 55 | yield '' 56 | yield '' 57 | yield '' 58 | yield '' 59 | yield '' 60 | yield '' 61 | yield '' 62 | yield '' 63 | yield '' 64 | for group, meta in unfold_metas(metas): 65 | yield '' 66 | 
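        # one table row per item: the title cell (linked when meta.url is set),
        # the loader function names from meta.functions, the tags, the record
        # count, the uncompressed size, and the description plus the shell
        # instruction steps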
67 | yield '' 73 | 74 | if not group: 75 | yield '' 92 | 93 | yield '' 98 | 99 | yield '' 103 | 104 | yield '' 108 | 109 | if group: 110 | yield '' 127 | 128 | yield '' 129 | yield '
DatasetAPI from corus importTagsTextsUncompressedDescription
' 68 | if meta.url: 69 | yield '%s' % (meta.url, meta.title) 70 | else: 71 | yield meta.title 72 | yield '' 76 | for index, function in enumerate(meta.functions): 77 | if index > 0: 78 | yield '
' 79 | name = function.__name__ 80 | yield '' % name 81 | if nbviewer: 82 | yield ( 83 | '{name}'.format( 84 | nbviewer=nbviewer, 85 | name=name 86 | ) 87 | ) 88 | yield '#' % name 89 | else: 90 | yield '{name}'.format(name=name) 91 | yield '
' 94 | if meta.tags: 95 | for tag in meta.tags: 96 | yield '%s' % tag 97 | yield '' 100 | if meta.stats and meta.stats.count: 101 | yield format_count(meta.stats.count) 102 | yield '' 105 | if meta.stats and meta.stats.bytes: 106 | yield format_bytes(meta.stats.bytes) 107 | yield '' 111 | else: 112 | yield '' 113 | if meta.description: 114 | yield meta.description 115 | if meta.instruction: 116 | yield '
' 117 | yield '
' 118 | 119 | for index, step in enumerate(meta.instruction): 120 | if index > 0: 121 | yield '
' 122 | if is_command(step): 123 | yield '%s' % step 124 | else: 125 | yield step 126 | yield '
' 130 | 131 | 132 | def format_metas(metas, url=None): 133 | return '\n'.join(format_metas_(metas, url)) 134 | 135 | 136 | def show_html(html): 137 | from IPython.display import display, HTML 138 | 139 | display(HTML(html)) 140 | 141 | 142 | def patch_readme(html, path): 143 | text = load_text(path) 144 | text = re.sub( 145 | r'(.+)', 146 | '\n' + html + '\n', 147 | text, 148 | flags=re.S 149 | ) 150 | dump_text(text, path) 151 | -------------------------------------------------------------------------------- /corus/sources/bsnlp.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from datetime import datetime 4 | from os import walk as walk_ 5 | from os.path import ( 6 | split as split_path, 7 | splitext as split_ext, 8 | join as join_path 9 | ) 10 | 11 | from corus.record import Record 12 | from corus.io import ( 13 | load_text, 14 | load_lines 15 | ) 16 | 17 | 18 | RU = 'ru' 19 | BG = 'bg' 20 | CS = 'cs' 21 | PL = 'pl' 22 | LANGS = [RU, BG, CS, PL] 23 | 24 | ANNOTATED = 'annotated' 25 | RAW = 'raw' 26 | 27 | TXT = '.txt' 28 | OUT = '.out' 29 | 30 | 31 | class BsnlpId(Record): 32 | __attributes__ = ['lang', 'type', 'name', 'path'] 33 | 34 | def __init__(self, lang, type, name, path): 35 | self.lang = lang 36 | self.type = type 37 | self.name = name 38 | self.path = path 39 | 40 | 41 | class BsnlpRaw(Record): 42 | __attributes__ = ['id', 'name', 'lang', 'date', 'url', 'text'] 43 | 44 | def __init__(self, id, name, lang, date, url, text): 45 | self.id = id 46 | self.name = name 47 | self.lang = lang 48 | self.date = date 49 | self.url = url 50 | self.text = text 51 | 52 | 53 | class BsnlpAnnotated(Record): 54 | __attributes__ = ['id', 'name', 'substrings'] 55 | 56 | def __init__(self, id, name, substrings): 57 | self.id = id 58 | self.name = name 59 | self.substrings = substrings 60 | 61 | 62 | class BsnlpSubstring(Record): 63 | __attributes__ = ['text', 'normal', 'type', 'id'] 64 | 65 | def __init__(self, text, normal, type, id): 66 | self.text = text 67 | self.normal = normal 68 | self.type = type 69 | self.id = id 70 | 71 | 72 | class BsnlpMarkup(Record): 73 | __attributes__ = ['id', 'name', 'lang', 'date', 'url', 'text', 'substrings'] 74 | 75 | def __init__(self, id, name, lang, date, url, text, substrings): 76 | self.id = id 77 | self.name = name 78 | self.lang = lang 79 | self.date = date 80 | self.url = url 81 | self.text = text 82 | self.substrings = substrings 83 | 84 | 85 | def walk(dir): 86 | def onerror(error): 87 | raise error 88 | 89 | return walk_(dir, onerror=onerror) 90 | 91 | 92 | def load_ids(dir, langs): 93 | if not langs: 94 | langs = LANGS 95 | 96 | # root bsnlp/sample_pl_cs_ru_bg/raw/cs 97 | # filename brexit_cs.txt_file_100.txt 98 | for root, subdirs, filenames in walk(dir): 99 | tail, lang = split_path(root) 100 | if lang not in langs: 101 | continue 102 | 103 | tail, type = split_path(tail) 104 | if type not in (ANNOTATED, RAW): 105 | # raw/nord_stream/ru/nord_stream_ru.txt_file_44.txt 106 | tail, type = split_path(tail) 107 | assert type in (ANNOTATED, RAW), root 108 | 109 | for filename in filenames: 110 | name, ext = split_ext(filename) 111 | if ext not in (TXT, OUT): 112 | continue 113 | path = join_path(root, filename) 114 | yield BsnlpId(lang, type, name, path) 115 | 116 | 117 | def select_type(ids, type): 118 | for id in ids: 119 | if id.type == type: 120 | yield id 121 | 122 | 123 | RAW_PATTERN = re.compile(r''' 124 | ^([^\n]+)\n 125 | (ru|bg|cs|pl)\n 126 | (\d\d\d\d-\d\d-\d\d)\n 127 | 
(https?://[^\n]+)?\n 128 | ''', re.X) 129 | 130 | 131 | def parse_raw(name, text): 132 | match = RAW_PATTERN.search(text) 133 | assert match, text 134 | 135 | id, lang, date, url = match.groups() 136 | date = datetime.strptime(date, '%Y-%m-%d') 137 | text = text[match.end():] 138 | return BsnlpRaw(id, name, lang, date, url, text) 139 | 140 | 141 | def load_raw(records): 142 | for record in records: 143 | text = load_text(record.path) 144 | yield parse_raw(record.name, text) 145 | 146 | 147 | # Евросоюза ORG ORG-European-Union 148 | ANNOTATED_PATTERN = re.compile(r''' 149 | ^([^\t]+) 150 | \t([^\t]+)? 151 | \t([^\t]+) 152 | \t?([^\t]+)?$ 153 | ''', re.X) 154 | 155 | 156 | def parse_substrings(lines): 157 | for line in lines: 158 | match = ANNOTATED_PATTERN.match(line) 159 | if not match: 160 | # single case 161 | # ЕНП ЕНП ORG ORG-EPP ЕС ЕС ORG ORG-European-Union 162 | continue 163 | text, normal, type, id = match.groups() 164 | yield BsnlpSubstring(text, normal, type, id) 165 | 166 | 167 | def parse_annotated(name, lines): 168 | id = next(lines).lstrip('\ufeff') 169 | substrings = list(parse_substrings(lines)) 170 | return BsnlpAnnotated(id, name, substrings) 171 | 172 | 173 | def load_annotated(records): 174 | for record in records: 175 | lines = load_lines(record.path) 176 | yield parse_annotated(record.name, lines) 177 | 178 | 179 | def merge(raw, annotated): 180 | id_raw = {_.name: _ for _ in raw} 181 | for record in annotated: 182 | raw = id_raw[record.name] 183 | yield BsnlpMarkup( 184 | raw.id, raw.name, raw.lang, raw.date, raw.url, 185 | raw.text, record.substrings 186 | ) 187 | 188 | 189 | def load_bsnlp(dir, langs=[RU]): 190 | ids = list(load_ids(dir, langs)) 191 | raw = load_raw(select_type(ids, RAW)) 192 | annotated = load_annotated(select_type(ids, ANNOTATED)) 193 | return merge(raw, annotated) 194 | -------------------------------------------------------------------------------- /corus/sources/taiga/common.py: -------------------------------------------------------------------------------- 1 | 2 | from io import TextIOWrapper 3 | from itertools import islice as head 4 | import tarfile 5 | 6 | from corus.record import Record 7 | from corus.path import ( 8 | get_filename, 9 | split_ext 10 | ) 11 | from corus.zip import ( 12 | open_zip, 13 | read_zip_header, 14 | read_zip_data 15 | ) 16 | from corus.io import ( 17 | match_names, 18 | 19 | parse_tsv, 20 | skip_header, 21 | ) 22 | 23 | 24 | class ArchiveRecord(Record): 25 | __attributes__ = ['name', 'offset', 'file'] 26 | 27 | def __init__(self, name, offset, file): 28 | self.name = name 29 | self.offset = offset 30 | self.file = file 31 | 32 | 33 | class TaigaRecord(Record): 34 | __attributes__ = ['id', 'meta', 'text'] 35 | 36 | def __init__(self, id, meta, text): 37 | self.id = id 38 | self.meta = meta 39 | self.text = text 40 | 41 | 42 | class Author(Record): 43 | __attributes__ = ['name', 'readers', 'texts', 'profession', 'about', 'url'] 44 | 45 | def __init__(self, name, readers=None, texts=None, 46 | profession=None, about=None, url=None): 47 | self.name = name 48 | self.readers = readers 49 | self.texts = texts 50 | self.profession = profession 51 | self.about = about 52 | self.url = url 53 | 54 | 55 | class Meta(Record): 56 | __attributes__ = ['id', 'timestamp', 'tags', 57 | 'themes', 'rubric', 'genre', 'topic', 58 | 'author', 'lang', 'title', 'url'] 59 | 60 | def __init__(self, id, timestamp=None, tags=None, 61 | themes=None, rubric=None, genre=None, topic=None, 62 | author=None, lang=None, title=None, url=None): 63 | 
self.id = id 64 | self.timestamp = timestamp 65 | self.tags = tags 66 | self.themes = themes 67 | self.rubric = rubric 68 | self.genre = genre 69 | self.topic = topic 70 | self.author = author 71 | self.lang = lang 72 | self.title = title 73 | self.url = url 74 | 75 | 76 | def load_tar(path, offset=0): 77 | with tarfile.open(path) as tar: 78 | tar.fileobj.seek(offset) 79 | while True: 80 | member = tarfile.TarInfo.fromtarfile(tar) 81 | if not member.isfile(): 82 | continue 83 | 84 | file = tar.extractfile(member) 85 | yield ArchiveRecord( 86 | name=member.name, 87 | offset=member.offset, 88 | file=file 89 | ) 90 | 91 | tar.members = [] 92 | tar.fileobj.seek(tar.offset) 93 | 94 | 95 | def load_zip(path, offset=0): 96 | with open_zip(path) as zip: 97 | zip.seek(offset) 98 | while True: 99 | offset = zip.tell() 100 | 101 | header = read_zip_header(zip) 102 | if not header: 103 | break 104 | if not header.uncompressed: 105 | continue 106 | 107 | file = read_zip_data(zip, header) 108 | yield ArchiveRecord( 109 | name=header.name, 110 | offset=offset, 111 | file=file 112 | ) 113 | 114 | 115 | def parse_meta(file, encoding='utf8'): 116 | lines = TextIOWrapper(file, encoding) 117 | rows = parse_tsv(lines) 118 | header = skip_header(rows) 119 | for row in rows: 120 | yield dict(zip(header, row)) 121 | 122 | 123 | def load_metas(path, pattern, offset, count, load): 124 | records = load(path, offset) 125 | records = match_names(records, pattern) 126 | records = head(records, count) 127 | for record in records: 128 | for item in parse_meta(record.file): 129 | yield item 130 | 131 | 132 | def load_tar_metas(path, pattern, offset, count): 133 | return load_metas(path, pattern, offset, count, load_tar) 134 | 135 | 136 | def load_zip_metas(path, pattern, offset, count): 137 | return load_metas(path, pattern, offset, count, load_zip) 138 | 139 | 140 | def load_texts(path, pattern, offset, count, parse_id, load, encoding='utf8'): 141 | records = load(path, offset=offset) 142 | records = match_names(records, pattern) 143 | records = head(records, count) 144 | for record in records: 145 | id = parse_id(record.name) 146 | file = TextIOWrapper(record.file, encoding) 147 | text = file.read() 148 | yield TaigaRecord( 149 | id=id, 150 | meta=None, 151 | text=text 152 | ) 153 | 154 | 155 | def parse_filename_id(path): 156 | id, _ = split_ext(get_filename(path)) 157 | return id 158 | 159 | 160 | def load_tar_texts(path, pattern, offset, count, parse_id=parse_filename_id): 161 | return load_texts(path, pattern, offset, count, parse_id, load_tar) 162 | 163 | 164 | def load_zip_texts(path, pattern, offset, count, parse_id=parse_filename_id): 165 | return load_texts(path, pattern, offset, count, parse_id, load_zip) 166 | 167 | 168 | def merge_metas(records, metas=None): 169 | if not metas: 170 | for record in records: 171 | yield record 172 | else: 173 | metas = {_.id: _ for _ in metas} 174 | for record in records: 175 | record.meta = metas.get(record.id) 176 | yield record 177 | 178 | 179 | def patch_month(date, months): 180 | for source, target in months.items(): 181 | if source in date: 182 | return date.replace(source, target) 183 | -------------------------------------------------------------------------------- /corus/sources/factru.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | from corus.path import ( 4 | list_dir, 5 | join_path 6 | ) 7 | from corus.record import Record 8 | from corus.io import ( 9 | load_text, 10 | load_lines, 11 | ) 12 | 13 | 14 | 
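# Usage sketch for the Taiga archive helpers in corus/sources/taiga/common.py
# (illustrative only: the archive path below is hypothetical, and the
# pattern/offset/count arguments of load_tar_metas/load_tar_texts are
# dataset-specific — the per-dataset taiga loaders are expected to pass
# suitable values):
#
#     from corus.sources.taiga.common import load_tar
#
#     # stream members of a Taiga .tar.gz without unpacking it to disk
#     for record in load_tar('taiga/Arzamas.tar.gz'):
#         print(record.name, record.offset)  # ArchiveRecord(name, offset, file)
#         break
#
#     # texts and their metadata live in separate archive members;
#     # merge_metas(records, metas) attaches Meta objects to TaigaRecords by id.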
DEVSET = 'devset' 15 | TESTSET = 'testset' 16 | 17 | TXT = 'txt' 18 | SPANS = 'spans' 19 | OBJECTS = 'objects' 20 | COREF = 'coref' 21 | FACTS = 'facts' 22 | 23 | 24 | class FactruSpan(Record): 25 | __attributes__ = ['id', 'type', 'start', 'stop'] 26 | 27 | def __init__(self, id, type, start, stop): 28 | self.id = id 29 | self.type = type 30 | self.start = start 31 | self.stop = stop 32 | 33 | 34 | class FactruObject(Record): 35 | __attributes__ = ['id', 'type', 'spans'] 36 | 37 | def __init__(self, id, type, spans): 38 | self.id = id 39 | self.type = type 40 | self.spans = spans 41 | 42 | 43 | class FactruCorefSlot(Record): 44 | __attributes__ = ['type', 'value'] 45 | 46 | def __init__(self, type, value): 47 | self.type = type 48 | self.value = value 49 | 50 | 51 | class FactruCoref(Record): 52 | __attributes__ = ['id', 'objects', 'slots'] 53 | 54 | def __init__(self, id, objects, slots): 55 | self.id = id 56 | self.objects = objects 57 | self.slots = slots 58 | 59 | 60 | class FactruFactSlot(Record): 61 | __attributes__ = ['type', 'ref', 'value'] 62 | 63 | def __init__(self, type, ref, value): 64 | self.type = type 65 | self.ref = ref 66 | self.value = value 67 | 68 | 69 | class FactruFact(Record): 70 | __attributes__ = ['id', 'type', 'slots'] 71 | 72 | def __init__(self, id, type, slots): 73 | self.id = id 74 | self.type = type 75 | self.slots = slots 76 | 77 | 78 | class FactruMarkup(Record): 79 | __attributes__ = ['id', 'text', 'objects', 'corefs', 'facts'] 80 | 81 | def __init__(self, id, text, objects, corefs, facts): 82 | self.id = id 83 | self.text = text 84 | self.objects = objects 85 | self.corefs = corefs 86 | self.facts = facts 87 | 88 | 89 | def list_ids(dir, set): 90 | for filename in list_dir(join_path(dir, set)): 91 | match = re.match(r'^book_(\d+)\.txt$', filename) 92 | if match: 93 | yield match.group(1) 94 | 95 | 96 | def part_path(id, dir, set, part): 97 | return join_path(dir, set, 'book_%s.%s' % (id, part)) 98 | 99 | 100 | def parse_spans(lines): 101 | # 32962 loc_name 17 6 89971 1 # 89971 Италии 102 | # 32963 org_name 26 4 89973 1 # 89973 миде 103 | # 32965 loc_name 31 6 89974 1 # 89974 Грузии 104 | 105 | for line in lines: 106 | id, type, start, size, _ = line.split(None, 4) 107 | start = int(start) 108 | stop = start + int(size) 109 | yield FactruSpan(id, type, start, stop) 110 | 111 | 112 | def parse_objects(lines, spans): 113 | # 16972 LocOrg 32962 # Италии 114 | # 16975 Org 32963 32965 # миде Грузии 115 | 116 | id_spans = {_.id: _ for _ in spans} 117 | for line in lines: 118 | parts = iter(line.split()) 119 | id = next(parts) 120 | type = next(parts) 121 | spans = [] 122 | for index in parts: 123 | if not index.isdigit(): 124 | break 125 | span = id_spans[index] 126 | spans.append(span) 127 | yield FactruObject(id, type, spans) 128 | 129 | 130 | def parse_coref_slots(lines): 131 | for line in lines: 132 | if not line: 133 | break 134 | 135 | parts = line.split(None, 1) 136 | if len(parts) == 1: 137 | # 1101 18638 18654 138 | # name Венгрия 139 | # wikidata 140 | # lastname 141 | continue 142 | 143 | type, value = parts 144 | yield FactruCorefSlot(type, value) 145 | 146 | 147 | def parse_corefs(lines, objects): 148 | # 3 16968 16970 16974 149 | # name Грузия 150 | # 151 | # 5 16969 152 | # firstname Виторио 153 | # lastname Сандали 154 | 155 | id_objects = {_.id: _ for _ in objects} 156 | for line in lines: 157 | parts = iter(line.split()) 158 | id = next(parts) 159 | objects = [id_objects[_] for _ in parts] 160 | slots = list(parse_coref_slots(lines)) 161 | 
yield FactruCoref(id, objects, slots) 162 | 163 | 164 | def parse_facts_slots(lines, id_corefs, id_spans): 165 | for line in lines: 166 | if not line: 167 | break 168 | type, line = line.split(None, 1) 169 | values = line.split(' | ') 170 | for value in values: 171 | # Participant obj90 Industrial and Commercial Bank of China | Промышленный и коммерческий банк Китая 172 | # Participant obj3640 WhatsApp 173 | # Type купля/продажа 174 | match = re.search(r'^(obj|span)(\d+)', value) 175 | if match: 176 | ref, id = match.groups() 177 | if ref == 'obj': 178 | value = id_corefs[id] 179 | elif ref == 'span': 180 | value = id_spans[id] 181 | else: 182 | ref = None 183 | yield FactruFactSlot(type, ref, value) 184 | 185 | 186 | def parse_facts(lines, corefs, spans): 187 | # 58-0 Meeting 188 | # Participant obj5 Сандали Виторио 189 | # Participant obj6 Налбандов Александр 190 | # 191 | # 58-1 Occupation 192 | # Who obj5 Сандали Виторио 193 | # Where obj2 Италия 194 | # Position span32958 чрезвычайный и полномочный посол | span64007 чрезвычайный и полномочный посол Италии в Грузии 195 | 196 | id_corefs = {_.id: _ for _ in corefs} 197 | id_spans = {_.id: _ for _ in spans} 198 | for line in lines: 199 | id, type = line.split(None, 1) 200 | slots = list(parse_facts_slots(lines, id_corefs, id_spans)) 201 | yield FactruFact(id, type, slots) 202 | 203 | 204 | def load_id(id, dir, set): 205 | path = part_path(id, dir, set, TXT) 206 | text = load_text(path) 207 | 208 | path = part_path(id, dir, set, SPANS) 209 | lines = load_lines(path) 210 | spans = list(parse_spans(lines)) 211 | 212 | path = part_path(id, dir, set, OBJECTS) 213 | lines = load_lines(path) 214 | objects = list(parse_objects(lines, spans)) 215 | 216 | path = part_path(id, dir, set, COREF) 217 | lines = load_lines(path) 218 | corefs = list(parse_corefs(lines, objects)) 219 | 220 | path = part_path(id, dir, set, FACTS) 221 | lines = load_lines(path) 222 | facts = list(parse_facts(lines, corefs, spans)) 223 | 224 | return FactruMarkup(id, text, objects, corefs, facts) 225 | 226 | 227 | def load_factru(dir, sets=[DEVSET, TESTSET]): 228 | for set in sets: 229 | for id in list_ids(dir, set): 230 | yield load_id(id, dir, set) 231 | -------------------------------------------------------------------------------- /data/sample.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Buriy" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 10, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# from os.path import (\n", 17 | "# join,\n", 18 | "# expanduser\n", 19 | "# )\n", 20 | "# from itertools import islice as head\n", 21 | "# import tarfile\n", 22 | "# from io import BytesIO\n", 23 | "\n", 24 | "# from tqdm import tqdm_notebook as log_progress\n", 25 | "\n", 26 | "\n", 27 | "# def source_path(filename, dir='~/proj/corus-data/buriy/'):\n", 28 | "# return join(\n", 29 | "# expanduser(dir),\n", 30 | "# filename\n", 31 | "# )\n", 32 | "\n", 33 | "\n", 34 | "# def target_path(filename, dir='buriy'):\n", 35 | "# return join(dir, filename)\n", 36 | "\n", 37 | "\n", 38 | "# def top_lines(file, count):\n", 39 | "# lines = head(file, count)\n", 40 | "# return b''.join(lines)\n", 41 | "\n", 42 | "\n", 43 | "# def sample(source, target, count):\n", 44 | "# with tarfile.open(source) as source, tarfile.open(target, 'w:bz2') as target:\n", 45 | "# for member in log_progress(head(source, count)):\n", 46 | 
"# if not member.isfile():\n", 47 | "# continue\n", 48 | "# file = source.extractfile(member)\n", 49 | "# data = top_lines(file, 2)\n", 50 | "# member.size = len(data)\n", 51 | "# file = BytesIO(data)\n", 52 | "# target.addfile(member, file)\n", 53 | "\n", 54 | "\n", 55 | "# filenames = [\n", 56 | "# 'lenta.tar.bz2',\n", 57 | "# 'news-articles-2014.tar.bz2',\n", 58 | "# 'news-articles-2015-part1.tar.bz2',\n", 59 | "# 'news-articles-2015-part2.tar.bz2',\n", 60 | "# 'webhose-2016.tar.bz2',\n", 61 | "# ]\n", 62 | "# for filename in filenames:\n", 63 | "# print(filename)\n", 64 | "# sample(source_path(filename), target_path(filename), 10)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "# Taiga" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 29, 77 | "metadata": { 78 | "scrolled": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "from os.path import (\n", 83 | " join,\n", 84 | " expanduser\n", 85 | ")\n", 86 | "\n", 87 | "from tqdm import tqdm_notebook as log_progress\n", 88 | "\n", 89 | "from corus.io import (\n", 90 | " load_tar,\n", 91 | " load_zip\n", 92 | ")\n", 93 | "\n", 94 | "\n", 95 | "def source_path(filename, dir='~/corus-data/taiga/'):\n", 96 | " return join(\n", 97 | " expanduser(dir),\n", 98 | " filename\n", 99 | " )\n", 100 | "\n", 101 | "\n", 102 | "def target_path(filename, dir='taiga'):\n", 103 | " return join(dir, filename)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "heading_collapsed": true 110 | }, 111 | "source": [ 112 | "## Offsets" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 30, 118 | "metadata": { 119 | "hidden": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "# def get_offsets(records, patterns):\n", 124 | "# offsets = {}\n", 125 | "# for record in records:\n", 126 | "# for pattern in patterns:\n", 127 | "# if pattern not in offsets and pattern in record.name:\n", 128 | "# offsets[pattern] = record.offset\n", 129 | "# if len(offsets) == len(patterns):\n", 130 | "# break\n", 131 | "# return offsets\n", 132 | "\n", 133 | "\n", 134 | "# patterns = ['meta', '/texts', '/tagged']\n", 135 | "# files = [\n", 136 | "# ('Arzamas.tar.gz', load_tar, patterns),\n", 137 | "# ('Fontanka.tar.gz', load_tar, patterns),\n", 138 | "# ('Interfax.tar.gz', load_tar, patterns),\n", 139 | "# ('KP.tar.gz', load_tar, patterns),\n", 140 | "# ('Lenta.tar.gz', load_tar, patterns),\n", 141 | "# ('Magazines.tar.gz', load_tar, patterns),\n", 142 | "# ('NPlus1.tar.gz', load_tar, patterns),\n", 143 | "# ('Subtitles.tar.gz', load_tar, patterns),\n", 144 | " \n", 145 | "# ('social.tar.gz', load_tar, ['/texts', '/tagged']),\n", 146 | "\n", 147 | "# ('proza_ru.zip', load_zip, patterns),\n", 148 | "# ('stihi_ru.zip', load_zip, patterns),\n", 149 | "# ]\n", 150 | "# for filename, load, patterns in files:\n", 151 | "# path = source_path(filename)\n", 152 | "# records = load(path)\n", 153 | "# print(filename)\n", 154 | "# offsets = get_offsets(log_progress(records), patterns)\n", 155 | "# for pattern in patterns:\n", 156 | "# print('', offsets.get(pattern), pattern, sep='\\t')" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 5, 162 | "metadata": { 163 | "hidden": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "# Arzamas.tar.gz\n", 168 | "# \t512\tmeta\n", 169 | "# \t144896\t/texts\n", 170 | "# \t5112320\t/tagged\n", 171 | "\n", 172 | "# Fontanka.tar.gz\n", 173 | "# \t512\tmeta\n", 174 | "# 
\t306359296\t/texts\n", 175 | "# \t1394093568\t/tagged\n", 176 | "\n", 177 | "# Interfax.tar.gz\n", 178 | "# \t512\tmeta\n", 179 | "# \t11447296\t/texts\n", 180 | "# \t140434432\t/tagged\n", 181 | "\n", 182 | "# KP.tar.gz\n", 183 | "# \t512\tmeta\n", 184 | "# \t13042176\t/texts\n", 185 | "# \t126222848\t/tagged\n", 186 | "\n", 187 | "# Lenta.tar.gz\n", 188 | "# \t512\tmeta\n", 189 | "# \t12800000\t/texts\n", 190 | "# \t140551168\t/tagged\n", 191 | "\n", 192 | "# Magazines.tar.gz\n", 193 | "# \t512\tmeta\n", 194 | "# \t7292416\t/texts\n", 195 | "# \t2390665216\t/tagged\n", 196 | "\n", 197 | "# NPlus1.tar.gz\n", 198 | "# \t512\tmeta\n", 199 | "# \t1919488\t/texts\n", 200 | "# \t33988608\t/tagged\n", 201 | "\n", 202 | "# Subtitles.tar.gz\n", 203 | "# \t512\tmeta\n", 204 | "# \t2113024\t/texts\n", 205 | "# \t974075904\t/tagged\n", 206 | "\n", 207 | "# social.tar.gz\n", 208 | "# \t3985892864\t/texts\n", 209 | "# \t1024\t/tagged\n", 210 | "\n", 211 | "# proza_ru.zip\n", 212 | "# \t636\tmeta\n", 213 | "# \t51432715409\t/texts\n", 214 | "# \t201377139\t/tagged\n", 215 | "\n", 216 | "# stihi_ru.zip\n", 217 | "# \t899\tmeta\n", 218 | "# \t22304202421\t/texts\n", 219 | "# \t381570084\t/tagged" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "heading_collapsed": true 226 | }, 227 | "source": [ 228 | "## Sample" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 2, 234 | "metadata": { 235 | "hidden": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "def tar_copy_(lines, target, info):\n", 240 | " data = b''.join(lines)\n", 241 | " file = BytesIO(data)\n", 242 | " info.size = len(data)\n", 243 | " target.addfile(info, file)\n", 244 | "\n", 245 | "\n", 246 | "def tar_copy_text(source, target, info, count):\n", 247 | " file = source.extractfile(info)\n", 248 | " lines = islice(file, count)\n", 249 | " tar_copy_(lines, target, info)\n", 250 | "\n", 251 | "\n", 252 | "def tar_copy_meta(source, target, info, pattern, encoding='utf8'):\n", 253 | " file = source.extractfile(info)\n", 254 | " lines = [\n", 255 | " _ for index, _\n", 256 | " in enumerate(file)\n", 257 | " if pattern in _.decode(encoding) or index == 0\n", 258 | " ]\n", 259 | " if len(lines) == 1: # just header\n", 260 | " return\n", 261 | " tar_copy_(lines, target, info)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 3, 267 | "metadata": { 268 | "hidden": true, 269 | "scrolled": false 270 | }, 271 | "outputs": [], 272 | "source": [ 273 | "# def sample(source, target, pattern):\n", 274 | "# with tarfile.open(source) as source, tarfile.open(target, 'w:gz') as target:\n", 275 | "# texts = 0\n", 276 | "# for info in log_progress(source):\n", 277 | "# if not info.isfile():\n", 278 | "# continue\n", 279 | "# name = info.name\n", 280 | "# if 'metadata' in name or 'metatable' in name:\n", 281 | "# tar_copy_meta(source, target, info, pattern)\n", 282 | "# print(name)\n", 283 | "# elif ('/tagged' in name or '/text' in name) and pattern in name:\n", 284 | "# tar_copy_text(source, target, info, 100)\n", 285 | "# print(name)\n", 286 | "# texts += 1\n", 287 | "# if texts >= 2:\n", 288 | "# break\n", 289 | "# source.members = []\n", 290 | "\n", 291 | "\n", 292 | "# FILENAMES = [\n", 293 | "# ('Arzamas.tar.gz', '101'),\n", 294 | "# ('Fontanka.tar.gz', '20070101001'),\n", 295 | "# ('Interfax.tar.gz', 'business199005'),\n", 296 | "# ('KP.tar.gz', '10@2598286'),\n", 297 | "# ('Lenta.tar.gz', '20091231boeviks'),\n", 298 | "# ('NPlus1.tar.gz', '20160915'),\n", 299 | 
"# ('Magazines.tar.gz', '103870'),\n", 300 | "# ('Subtitles.tar.gz', 'Pilot.HDTV.XII'),\n", 301 | "# ]\n", 302 | "\n", 303 | "# for filename, pattern in FILENAMES:\n", 304 | "# sample(source_path(filename), target_path(filename), pattern)\n", 305 | "# print(source, '->', target)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 11, 311 | "metadata": { 312 | "hidden": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "# filename = 'social.tar.gz'\n", 317 | "# source = source_path(filename)\n", 318 | "# target = source_path(filename)\n", 319 | "\n", 320 | "# with tarfile.open(source) as source, tarfile.open(target, 'w:gz') as target:\n", 321 | "# for info in log_progress(source):\n", 322 | "# if not info.isfile():\n", 323 | "# continue\n", 324 | "# if '/text' in info.name:\n", 325 | "# tar_copy_text(source, target, info, 4)\n", 326 | "# print(info.name)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 5, 332 | "metadata": { 333 | "hidden": true 334 | }, 335 | "outputs": [], 336 | "source": [ 337 | "# import zipfile\n", 338 | "# from corus.zip import open_zip, list_zip, read_zip\n", 339 | "\n", 340 | "\n", 341 | "# def zip_copy_(lines, target, record):\n", 342 | "# data = '\\n'.join(lines)\n", 343 | "# target.writestr(record.name, data)\n", 344 | "\n", 345 | "\n", 346 | "# def zip_copy_text(source, target, record, count):\n", 347 | "# text = read_zip(source, record)\n", 348 | "# lines = text.splitlines()[:count]\n", 349 | "# zip_copy_(lines, target, record)\n", 350 | "\n", 351 | "\n", 352 | "# def zip_copy_meta(source, target, record, pattern):\n", 353 | "# text = read_zip(source, record)\n", 354 | "# lines = text.splitlines()\n", 355 | "# lines = [\n", 356 | "# _ for index, _\n", 357 | "# in enumerate(lines)\n", 358 | "# if pattern in _ or index == 0\n", 359 | "# ]\n", 360 | "# if len(lines) == 1: # just header\n", 361 | "# return\n", 362 | "# zip_copy_(lines, target, record)\n", 363 | "\n", 364 | "\n", 365 | "# def sample(source, target, pattern, count):\n", 366 | "# with open_zip(source) as source, zipfile.ZipFile(target, 'w') as target:\n", 367 | "# texts = 0\n", 368 | "# for record in log_progress(list_zip(source)):\n", 369 | "# if not record.uncompressed: # not a file\n", 370 | "# continue\n", 371 | "# name = record.name\n", 372 | "# if 'metatable' in name:\n", 373 | "# zip_copy_meta(source, target, record, pattern)\n", 374 | "# print(name)\n", 375 | "# elif ('/tagged' in name or '/text' in name) and pattern in name:\n", 376 | "# zip_copy_text(source, target, record, count)\n", 377 | "# print(name)\n", 378 | "# texts += 1\n", 379 | "# if texts >= 2:\n", 380 | "# break\n", 381 | "\n", 382 | "\n", 383 | "# FILENAMES = [\n", 384 | "# ('proza_ru.zip', '20151231005', 10),\n", 385 | "# ('stihi_ru.zip', '20151231001', 100)\n", 386 | "# ]\n", 387 | "\n", 388 | "# for filename, pattern, count in FILENAMES:\n", 389 | "# source = source_path(filename)\n", 390 | "# target = target_path(filename)\n", 391 | "# sample(source, target, pattern, count)\n", 392 | "# print(source, '->', target)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": null, 398 | "metadata": { 399 | "hidden": true 400 | }, 401 | "outputs": [], 402 | "source": [] 403 | } 404 | ], 405 | "metadata": { 406 | "kernelspec": { 407 | "display_name": "Python 3", 408 | "language": "python", 409 | "name": "python3" 410 | }, 411 | "language_info": { 412 | "codemirror_mode": { 413 | "name": "ipython", 414 | "version": 3 415 | }, 416 | 
"file_extension": ".py", 417 | "mimetype": "text/x-python", 418 | "name": "python", 419 | "nbconvert_exporter": "python", 420 | "pygments_lexer": "ipython3", 421 | "version": "3.5.1" 422 | } 423 | }, 424 | "nbformat": 4, 425 | "nbformat_minor": 2 426 | } 427 | -------------------------------------------------------------------------------- /corus/sources/meta.py: -------------------------------------------------------------------------------- 1 | 2 | from corus.record import Record 3 | 4 | from . import ( 5 | load_mokoron, 6 | load_wiki, 7 | load_simlex, 8 | load_omnia, 9 | load_gramru, 10 | load_corpora, 11 | load_ruadrect, 12 | 13 | load_factru, 14 | load_gareev, 15 | load_lenta, 16 | load_lenta2, 17 | load_librusec, 18 | load_ne5, 19 | load_wikiner, 20 | load_bsnlp, 21 | load_persons, 22 | load_rudrec, 23 | 24 | load_taiga_arzamas, 25 | load_taiga_fontanka, 26 | load_taiga_interfax, 27 | load_taiga_kp, 28 | load_taiga_lenta, 29 | load_taiga_nplus1, 30 | load_taiga_magazines, 31 | load_taiga_subtitles, 32 | load_taiga_social, 33 | load_taiga_proza, 34 | load_taiga_stihi, 35 | 36 | load_buriy_news, 37 | load_buriy_webhose, 38 | 39 | load_ods_interfax, 40 | load_ods_gazeta, 41 | load_ods_izvestia, 42 | load_ods_meduza, 43 | load_ods_ria, 44 | load_ods_rt, 45 | load_ods_tass, 46 | 47 | load_ria_raw, 48 | load_ria, 49 | 50 | load_ud_gsd, 51 | load_ud_taiga, 52 | load_ud_pud, 53 | load_ud_syntag, 54 | 55 | load_morphoru_gicrya, 56 | load_morphoru_rnc, 57 | load_morphoru_corpora, 58 | 59 | load_russe_hj, 60 | load_russe_rt, 61 | load_russe_ae, 62 | 63 | load_toloka_lrwc, 64 | ) 65 | 66 | 67 | class Meta(Record): 68 | __attributes__ = ['title', 'url', 69 | 'description', 'stats', 'instruction', 70 | 'tags', 'functions'] 71 | 72 | def __init__(self, title, url=None, 73 | description=None, stats=None, instruction=(), 74 | tags=(), functions=()): 75 | self.title = title 76 | self.url = url 77 | self.description = description 78 | self.stats = stats 79 | self.instruction = instruction 80 | self.tags = tags 81 | self.functions = functions 82 | 83 | 84 | class Group(Record): 85 | __attributes__ = ['title', 'url', 'description', 'instruction', 'metas'] 86 | 87 | def __init__(self, title, url=None, description=None, instruction=(), metas=()): 88 | self.title = title 89 | self.url = url 90 | self.description = description 91 | self.instruction = instruction 92 | self.metas = metas 93 | 94 | 95 | def is_group(item): 96 | return isinstance(item, Group) 97 | 98 | 99 | class Stats(Record): 100 | __attributes__ = ['bytes', 'count'] 101 | 102 | def __init__(self, bytes=None, count=None): 103 | self.bytes = bytes 104 | self.count = count 105 | 106 | 107 | NER = 'ner' 108 | NEWS = 'news' 109 | FICTION = 'fiction' 110 | SOCIAL = 'social' 111 | MORPH = 'morph' 112 | SYNTAX = 'syntax' 113 | EMB = 'emb' 114 | SIM = 'sim' 115 | SENTIMENT = 'sentiment' 116 | WEB = 'web' 117 | 118 | METAS = [ 119 | Group( 120 | title='Lenta.ru', 121 | url='https://github.com/yutkin/Lenta.Ru-News-Dataset', 122 | metas=[ 123 | Meta( 124 | title='Lenta.ru v1.0', 125 | stats=Stats( 126 | bytes=1785632079, 127 | count=739351 128 | ), 129 | instruction=[ 130 | 'wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz' 131 | ], 132 | tags=[NEWS], 133 | functions=[load_lenta] 134 | ), 135 | Meta( 136 | title='Lenta.ru v1.1+', 137 | stats=Stats( 138 | bytes=2084746431, 139 | count=800975 140 | ), 141 | instruction=[ 142 | 'wget 
https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2' 143 | ], 144 | tags=[NEWS], 145 | functions=[load_lenta2] 146 | ), 147 | ] 148 | ), 149 | Meta( 150 | title='Lib.rus.ec', 151 | url='https://russe.nlpub.org/downloads/', 152 | description='Dump of lib.rus.ec prepared for RUSSE workshop', 153 | stats=Stats( 154 | count=301871, 155 | bytes=155611193945 156 | ), 157 | instruction=[ 158 | 'wget http://panchenko.me/data/russe/librusec_fb2.plain.gz' 159 | ], 160 | tags=[FICTION], 161 | functions=[load_librusec] 162 | ), 163 | Meta( 164 | title='Rossiya Segodnya', 165 | url='https://github.com/RossiyaSegodnya/ria_news_dataset', 166 | stats=Stats( 167 | count=1003869, 168 | bytes=3974121040 169 | ), 170 | instruction=[ 171 | 'wget https://github.com/RossiyaSegodnya/ria_news_dataset/raw/master/ria.json.gz' 172 | ], 173 | tags=[NEWS], 174 | functions=[load_ria_raw, load_ria] 175 | ), 176 | Meta( 177 | title='Mokoron Russian Twitter Corpus', 178 | url='http://study.mokoron.com/', 179 | description='Russian Twitter sentiment markup', 180 | instruction=[ 181 | 'Manually download https://www.dropbox.com/s/9egqjszeicki4ho/db.sql' 182 | ], 183 | stats=Stats( 184 | count=17633417, 185 | bytes=1998559570 186 | ), 187 | tags=[SOCIAL, SENTIMENT], 188 | functions=[load_mokoron], 189 | ), 190 | Meta( 191 | title='Wikipedia', 192 | url='https://dumps.wikimedia.org/', 193 | description='Russian Wiki dump', 194 | instruction=[ 195 | 'wget https://dumps.wikimedia.org/ruwiki/latest/ruwiki-latest-pages-articles.xml.bz2' 196 | ], 197 | stats=Stats( 198 | count=1541401, 199 | bytes=13895798340 200 | ), 201 | functions=[load_wiki], 202 | ), 203 | Meta( 204 | title='GramEval2020', 205 | url='https://github.com/dialogue-evaluation/GramEval2020', 206 | instruction=[ 207 | 'wget https://github.com/dialogue-evaluation/GramEval2020/archive/master.zip', 208 | 'unzip master.zip', 209 | 'mv GramEval2020-master/dataTrain train', 210 | 'mv GramEval2020-master/dataOpenTest dev', 211 | 'rm -r master.zip GramEval2020-master', 212 | 'wget https://github.com/AlexeySorokin/GramEval2020/raw/master/data/GramEval_private_test.conllu' 213 | ], 214 | stats=Stats( 215 | count=162372, 216 | bytes=31503713 217 | ), 218 | functions=[load_gramru], 219 | ), 220 | Meta( 221 | title='OpenCorpora', 222 | url='http://opencorpora.org/', 223 | instruction=[ 224 | 'wget http://opencorpora.org/files/export/annot/annot.opcorpora.xml.zip' 225 | ], 226 | stats=Stats( 227 | count=4030, 228 | bytes=21194932 229 | ), 230 | tags=[MORPH], 231 | functions=[load_corpora], 232 | ), 233 | Meta( 234 | title='RusVectores SimLex-965', 235 | instruction=[ 236 | 'wget https://rusvectores.org/static/testsets/ru_simlex965_tagged.tsv', 237 | 'wget https://rusvectores.org/static/testsets/ru_simlex965.tsv' 238 | ], 239 | tags=[EMB, SIM], 240 | functions=[load_simlex], 241 | ), 242 | Meta( 243 | title='Omnia Russica', 244 | url='https://omnia-russica.github.io/', 245 | description='Taiga + Wiki + Araneum. 
Read "Even larger Russian corpus" https://events.spbu.ru/eventsContent/events/2019/corpora/corp_sborn.pdf', 246 | instruction=[ 247 | 'Manually download http://bit.ly/2ZT4BY9' 248 | ], 249 | stats=Stats( 250 | bytes=525728427750 251 | ), 252 | tags=[MORPH, WEB, FICTION], 253 | functions=[load_omnia] 254 | ), 255 | 256 | 257 | ########### 258 | # 259 | # NER 260 | # 261 | ############ 262 | 263 | 264 | Meta( 265 | title='factRuEval-2016', 266 | url='https://github.com/dialogue-evaluation/factRuEval-2016/', 267 | description='Manual PER, LOC, ORG markup prepared for 2016 Dialog competition', 268 | stats=Stats( 269 | count=254, 270 | bytes=992532 271 | ), 272 | instruction=[ 273 | 'wget https://github.com/dialogue-evaluation/factRuEval-2016/archive/master.zip', 274 | 'unzip master.zip', 275 | 'rm master.zip' 276 | ], 277 | tags=[NER, NEWS], 278 | functions=[load_factru] 279 | ), 280 | Meta( 281 | title='Gareev', 282 | url='https://www.researchgate.net/publication/262203599_Introducing_Baselines_for_Russian_Named_Entity_Recognition', 283 | description='Manual PER, ORG markup (no LOC)', 284 | stats=Stats( 285 | count=97, 286 | bytes=465938 287 | ), 288 | instruction=[ 289 | 'Email Rinat Gareev (gareev-rm@yandex.ru) ask for dataset', 290 | 'tar -xvf rus-ner-news-corpus.iob.tar.gz', 291 | 'rm rus-ner-news-corpus.iob.tar.gz' 292 | ], 293 | tags=[NER, NEWS], 294 | functions=[load_gareev] 295 | ), 296 | Meta( 297 | title='Collection5', 298 | url='http://www.labinform.ru/pub/named_entities/', 299 | description='News articles with manual PER, LOC, ORG markup', 300 | stats=Stats( 301 | count=1000, 302 | bytes=3105146 303 | ), 304 | instruction=[ 305 | 'wget http://www.labinform.ru/pub/named_entities/collection5.zip', 306 | 'unzip collection5.zip', 307 | 'rm collection5.zip' 308 | ], 309 | tags=[NER, NEWS], 310 | functions=[load_ne5] 311 | ), 312 | Meta( 313 | title='WiNER', 314 | url='https://www.aclweb.org/anthology/I17-1042', 315 | description='Sentences from Wiki auto annotated with PER, LOC, ORG tags', 316 | stats=Stats( 317 | count=203287, 318 | bytes=37907651 319 | ), 320 | instruction=[ 321 | 'wget https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-ru-wp3.bz2' 322 | ], 323 | tags=[NER], 324 | functions=[load_wikiner] 325 | ), 326 | Meta( 327 | title='BSNLP-2019', 328 | url='http://bsnlp.cs.helsinki.fi/shared_task.html', 329 | description='Markup prepared for 2019 BSNLP Shared Task', 330 | stats=Stats( 331 | count=464, 332 | bytes=1211300 333 | ), 334 | instruction=[ 335 | 'wget http://bsnlp.cs.helsinki.fi/TRAININGDATA_BSNLP_2019_shared_task.zip', 336 | 'wget http://bsnlp.cs.helsinki.fi/TESTDATA_BSNLP_2019_shared_task.zip', 337 | 'unzip TRAININGDATA_BSNLP_2019_shared_task.zip', 338 | 'unzip TESTDATA_BSNLP_2019_shared_task.zip -d test_pl_cs_ru_bg', 339 | 'rm TRAININGDATA_BSNLP_2019_shared_task.zip TESTDATA_BSNLP_2019_shared_task.zip' 340 | ], 341 | tags=[NER], 342 | functions=[load_bsnlp] 343 | ), 344 | Meta( 345 | title='Persons-1000', 346 | url='http://ai-center.botik.ru/Airec/index.php/ru/collections/28-persons-1000', 347 | description='Same as Collection5, only PER markup + normalized names', 348 | stats=Stats( 349 | count=1000, 350 | bytes=3105146 351 | ), 352 | instruction=[ 353 | 'wget http://ai-center.botik.ru/Airec/ai-resources/Persons-1000.zip' 354 | ], 355 | tags=[NER, NEWS], 356 | functions=[load_persons] 357 | ), 358 | Meta( 359 | title='The Russian Drug Reaction Corpus (RuDReC)', 360 | url='https://github.com/cimm-kzn/RuDReC', 361 | description=( 362 | 'RuDReC is 
a new partially annotated corpus of consumer reviews in Russian about pharmaceutical ' 363 | 'products for the detection of health-related named entities and the effectiveness of pharmaceutical products. ' 364 | 'Here you can download and work with the annotated part, to get the raw part (1.4M reviews) ' 365 | 'please refer to https://github.com/cimm-kzn/RuDReC.' 366 | ), 367 | stats=Stats( 368 | count=4809, 369 | bytes=1773 370 | ), 371 | instruction=[ 372 | 'wget https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json' 373 | ], 374 | tags=[NER], 375 | functions=[load_rudrec] 376 | ), 377 | 378 | ########## 379 | # 380 | # TAIGA 381 | # 382 | ########### 383 | 384 | 385 | Group( 386 | title='Taiga', 387 | url='https://tatianashavrina.github.io/taiga_site/', 388 | description='Large collection of Russian texts from various sources: news sites, magazines, literacy, social networks', 389 | instruction=[ 390 | 'wget https://linghub.ru/static/Taiga/retagged_taiga.tar.gz', 391 | 'tar -xzvf retagged_taiga.tar.gz' 392 | ], 393 | metas=[ 394 | Meta( 395 | title='Arzamas', 396 | stats=Stats( 397 | count=311, 398 | bytes=4721604 399 | ), 400 | tags=[NEWS], 401 | functions=[load_taiga_arzamas], 402 | ), 403 | Meta( 404 | title='Fontanka', 405 | stats=Stats( 406 | count=342683, 407 | bytes=824419630 408 | ), 409 | tags=[NEWS], 410 | functions=[load_taiga_fontanka], 411 | ), 412 | Meta( 413 | title='Interfax', 414 | stats=Stats( 415 | count=46429, 416 | bytes=81320006 417 | ), 418 | tags=[NEWS], 419 | functions=[load_taiga_interfax], 420 | ), 421 | Meta( 422 | title='KP', 423 | stats=Stats( 424 | count=45503, 425 | bytes=64789612 426 | ), 427 | tags=[NEWS], 428 | functions=[load_taiga_kp], 429 | ), 430 | Meta( 431 | title='Lenta', 432 | stats=Stats( 433 | count=36446, 434 | bytes=99772679 435 | ), 436 | tags=[NEWS], 437 | functions=[load_taiga_lenta], 438 | ), 439 | Meta( 440 | title='Taiga/N+1', 441 | stats=Stats( 442 | count=7696, 443 | bytes=26167631 444 | ), 445 | tags=[NEWS], 446 | functions=[load_taiga_nplus1], 447 | ), 448 | Meta( 449 | title='Magazines', 450 | stats=Stats( 451 | count=39890, 452 | bytes=2352629006 453 | ), 454 | functions=[load_taiga_magazines] 455 | ), 456 | Meta( 457 | title='Subtitles', 458 | stats=Stats( 459 | count=19011, 460 | bytes=953237022 461 | ), 462 | functions=[load_taiga_subtitles] 463 | ), 464 | Meta( 465 | title='Social', 466 | stats=Stats( 467 | count=1876442, 468 | bytes=679670941 469 | ), 470 | tags=[SOCIAL], 471 | functions=[load_taiga_social] 472 | ), 473 | Meta( 474 | title='Proza', 475 | stats=Stats( 476 | count=1732434, 477 | bytes=41067043857 478 | ), 479 | tags=[FICTION], 480 | functions=[load_taiga_proza] 481 | ), 482 | Meta( 483 | title='Stihi', 484 | stats=Stats( 485 | count=9157686, 486 | bytes=13745805334 487 | ), 488 | functions=[load_taiga_stihi] 489 | ), 490 | ] 491 | ), 492 | 493 | 494 | ############# 495 | # 496 | # BURIY 497 | # 498 | ########## 499 | 500 | 501 | Group( 502 | title='Russian NLP Datasets', 503 | url='https://github.com/buriy/russian-nlp-datasets/releases', 504 | description='Several Russian news datasets from webhose.io, lenta.ru and other news sites.', 505 | metas=[ 506 | Meta( 507 | title='News', 508 | description='Dump of top 40 news + 20 fashion news sites.', 509 | instruction=[ 510 | 'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2014.tar.bz2', 511 | 'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part1.tar.bz2', 512 | 
'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part2.tar.bz2' 513 | ], 514 | stats=Stats( 515 | count=2154801, 516 | bytes=7340672169 517 | ), 518 | tags=[NEWS], 519 | functions=[load_buriy_news], 520 | ), 521 | Meta( 522 | title='Webhose', 523 | description='Dump from webhose.io, 300 sources for one month.', 524 | instruction=[ 525 | 'wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/webhose-2016.tar.bz2' 526 | ], 527 | stats=Stats( 528 | count=285965, 529 | bytes=901066314 530 | ), 531 | tags=[NEWS], 532 | functions=[load_buriy_webhose], 533 | ), 534 | ] 535 | ), 536 | 537 | 538 | ############# 539 | # 540 | # ODS 541 | # 542 | ######### 543 | 544 | 545 | Group( 546 | title='ODS #proj_news_viz', 547 | url='https://github.com/ods-ai-ml4sg/proj_news_viz/releases/tag/data', 548 | description='Several news sites scraped by members of #proj_news_viz ODS project.', 549 | metas=[ 550 | Meta( 551 | title='Interfax', 552 | instruction=[ 553 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/interfax.csv.gz', 554 | ], 555 | stats=Stats( 556 | count=543961, 557 | bytes=1314462876, 558 | ), 559 | tags=[NEWS], 560 | functions=[load_ods_interfax], 561 | ), 562 | Meta( 563 | title='Gazeta', 564 | instruction=[ 565 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/gazeta.csv.gz', 566 | ], 567 | stats=Stats( 568 | count=865847, 569 | bytes=1752712320 570 | ), 571 | tags=[NEWS], 572 | functions=[load_ods_gazeta], 573 | ), 574 | Meta( 575 | title='Izvestia', 576 | instruction=[ 577 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/iz.csv.gz', 578 | ], 579 | stats=Stats( 580 | count=86601, 581 | bytes=322117124 582 | ), 583 | tags=[NEWS], 584 | functions=[load_ods_izvestia], 585 | ), 586 | Meta( 587 | title='Meduza', 588 | instruction=[ 589 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/meduza.csv.gz', 590 | ], 591 | stats=Stats( 592 | count=71806, 593 | bytes=283233963 594 | ), 595 | tags=[NEWS], 596 | functions=[load_ods_meduza], 597 | ), 598 | Meta( 599 | title='RIA', 600 | instruction=[ 601 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/ria.csv.gz', 602 | ], 603 | stats=Stats( 604 | count=101543, 605 | bytes=245236791 606 | ), 607 | tags=[NEWS], 608 | functions=[load_ods_ria], 609 | ), 610 | Meta( 611 | title='Russia Today', 612 | instruction=[ 613 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/rt.csv.gz', 614 | ], 615 | stats=Stats( 616 | count=106644, 617 | bytes=196212474 618 | ), 619 | tags=[NEWS], 620 | functions=[load_ods_rt], 621 | ), 622 | Meta( 623 | title='TASS', 624 | instruction=[ 625 | 'wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/tass-001.csv.gz', 626 | ], 627 | stats=Stats( 628 | count=1135635, 629 | bytes=3515136716 630 | ), 631 | tags=[NEWS], 632 | functions=[load_ods_tass], 633 | ), 634 | 635 | ] 636 | ), 637 | 638 | 639 | ############# 640 | # 641 | # UD 642 | # 643 | ######### 644 | 645 | 646 | Group( 647 | title='Universal Dependencies', 648 | url='https://universaldependencies.org/', 649 | metas=[ 650 | Meta( 651 | title='GSD', 652 | instruction=[ 653 | 'wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-dev.conllu', 654 | 'wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-test.conllu', 655 | 'wget 
https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-train.conllu' 656 | ], 657 | stats=Stats( 658 | count=5030, 659 | bytes=1059114 660 | ), 661 | tags=[MORPH, SYNTAX], 662 | functions=[load_ud_gsd], 663 | ), 664 | Meta( 665 | title='Taiga', 666 | instruction=[ 667 | 'wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-dev.conllu', 668 | 'wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-test.conllu', 669 | 'wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-train.conllu' 670 | ], 671 | stats=Stats( 672 | count=3264, 673 | bytes=362293 674 | ), 675 | tags=[MORPH, SYNTAX], 676 | functions=[load_ud_taiga], 677 | ), 678 | Meta( 679 | title='PUD', 680 | instruction=[ 681 | 'wget https://github.com/UniversalDependencies/UD_Russian-PUD/raw/master/ru_pud-ud-test.conllu', 682 | ], 683 | stats=Stats( 684 | count=1000, 685 | bytes=212766 686 | ), 687 | tags=[MORPH, SYNTAX], 688 | functions=[load_ud_pud], 689 | ), 690 | Meta( 691 | title='SynTagRus', 692 | instruction=[ 693 | 'wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-dev.conllu', 694 | 'wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-test.conllu', 695 | 'wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-train.conllu', 696 | ], 697 | stats=Stats( 698 | count=61889, 699 | bytes=11877258 700 | ), 701 | tags=[MORPH, SYNTAX], 702 | functions=[load_ud_syntag], 703 | ), 704 | ] 705 | ), 706 | 707 | 708 | ############# 709 | # 710 | # MORPHORUEVAL 711 | # 712 | ######### 713 | 714 | 715 | Group( 716 | title='morphoRuEval-2017', 717 | url='https://github.com/dialogue-evaluation/morphoRuEval-2017', 718 | metas=[ 719 | Meta( 720 | title='General Internet-Corpus', 721 | instruction=[ 722 | 'wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/GIKRYA_texts_new.zip', 723 | 'unzip GIKRYA_texts_new.zip', 724 | 'rm GIKRYA_texts_new.zip' 725 | ], 726 | stats=Stats( 727 | count=83148, 728 | bytes=11091464 729 | ), 730 | tags=[MORPH], 731 | functions=[load_morphoru_gicrya], 732 | ), 733 | Meta( 734 | title='Russian National Corpus', 735 | instruction=[ 736 | 'wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/RNC_texts.rar', 737 | 'unrar x RNC_texts.rar', 738 | 'rm RNC_texts.rar' 739 | ], 740 | stats=Stats( 741 | count=98892, 742 | bytes=13330673 743 | ), 744 | tags=[MORPH], 745 | functions=[load_morphoru_rnc], 746 | ), 747 | Meta( 748 | title='OpenCorpora', 749 | instruction=[ 750 | 'wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/OpenCorpora_Texts.rar', 751 | 'unrar x OpenCorpora_Texts.rar', 752 | 'rm OpenCorpora_Texts.rar' 753 | ], 754 | stats=Stats( 755 | count=38510, 756 | bytes=5028255 757 | ), 758 | tags=[MORPH], 759 | functions=[load_morphoru_corpora], 760 | ), 761 | ] 762 | ), 763 | 764 | 765 | ############# 766 | # 767 | # RUSSE SEM 768 | # 769 | ######### 770 | 771 | 772 | Group( 773 | title='RUSSE Russian Semantic Relatedness', 774 | url='https://russe.nlpub.org/downloads/', 775 | metas=[ 776 | Meta( 777 | title='HJ: Human Judgements of Word Pairs', 778 | instruction=[ 779 | 'wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/hj.csv' 780 | ], 781 | tags=[EMB, SIM], 782 | functions=[load_russe_hj], 783 | ), 784 | Meta( 785 | title='RT: Synonyms and Hypernyms from the Thesaurus RuThes', 786 | 
instruction=[ 787 | 'wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/rt.csv' 788 | ], 789 | tags=[EMB, SIM], 790 | functions=[load_russe_rt], 791 | ), 792 | Meta( 793 | title='AE: Cognitive Associations from the Sociation.org Experiment', 794 | instruction=[ 795 | 'wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-train.csv', 796 | 'wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-test.csv', 797 | 'wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/ae2.csv' 798 | ], 799 | tags=[EMB, SIM], 800 | functions=[load_russe_ae], 801 | ), 802 | ] 803 | ), 804 | 805 | 806 | ############# 807 | # 808 | # TOLOKA 809 | # 810 | ######### 811 | 812 | 813 | Group( 814 | title='Toloka Datasets', 815 | url='https://toloka.yandex.ru/datasets/', 816 | metas=[ 817 | Meta( 818 | title='Lexical Relations from the Wisdom of the Crowd (LRWC)', 819 | instruction=[ 820 | 'wget https://tlk.s3.yandex.net/dataset/LRWC.zip', 821 | 'unzip LRWC.zip', 822 | 'rm LRWC.zip' 823 | ], 824 | tags=[EMB, SIM], 825 | functions=[load_toloka_lrwc], 826 | ), 827 | Meta( 828 | title='The Russian Adverse Drug Reaction Corpus of Tweets (RuADReCT)', 829 | url='https://github.com/cimm-kzn/RuDReC', 830 | description='This corpus was developed for the Social Media Mining for Health Applications (#SMM4H) ' 831 | 'Shared Task 2020', 832 | instruction=[ 833 | 'wget https://github.com/cimm-kzn/RuDReC/raw/master/data/RuADReCT.zip', 834 | 'unzip RuADReCT.zip', 835 | 'rm RuADReCT.zip' 836 | ], 837 | stats=Stats( 838 | count=9515, 839 | bytes=2190063 840 | ), 841 | tags=[SOCIAL], 842 | functions=[load_ruadrect], 843 | ), 844 | ] 845 | ), 846 | ] 847 | --------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

![CI](https://github.com/natasha/corus/actions/workflows/test.yml/badge.svg)

Links to publicly available Russian corpora + code for loading and parsing. 20+ datasets, 350Gb+ of text.

## Usage

For example, let's use the dump of lenta.ru by @yutkin. Manually download the archive (link in the Reference section):

```bash
wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
```

Use `corus` to load the data:

```python
>>> from corus import load_lenta

>>> path = 'lenta-ru-news.csv.gz'
>>> records = load_lenta(path)
>>> next(records)

LentaRecord(
    url='https://lenta.ru/news/2018/12/14/cancer/',
    title='Названы регионы России с\xa0самой высокой смертностью от\xa0рака',
    text='Вице-премьер по социальным вопросам Татьяна Голикова рассказала, в каких регионах России зафиксирована наиболее высокая смертность от рака, сооб...',
    topic='Россия',
    tags='Общество'
)
```

Iterate over texts:

```python
>>> records = load_lenta(path)
>>> for record in records:
...     text = record.text
...     ...
```

For links to other datasets and their loaders see the Reference section.

## Documentation

Materials are in Russian:

* Corus page on natasha.github.io
* Corus section of Datafest 2020 talk

## Install

`corus` supports Python 3.5+, PyPy 3.

```bash
$ pip install corus
```

## Reference

| Dataset | API `from corus import` | Tags | Texts | Uncompressed | Description |
|---------|-------------------------|------|-------|--------------|-------------|
| **Lenta.ru** | | | | | |
| Lenta.ru v1.0 | `load_lenta` | news | 739 351 | 1.66 Gb | wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz |
| Lenta.ru v1.1+ | `load_lenta2` | news | 800 975 | 1.94 Gb | wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.1/lenta-ru-news.csv.bz2 |
| **Lib.rus.ec** | `load_librusec` | fiction | 301 871 | 144.92 Gb | Dump of lib.rus.ec prepared for RUSSE workshop <br/> wget http://panchenko.me/data/russe/librusec_fb2.plain.gz |
| **Rossiya Segodnya** | `load_ria_raw`, `load_ria` | news | 1 003 869 | 3.70 Gb | wget https://github.com/RossiyaSegodnya/ria_news_dataset/raw/master/ria.json.gz |
| **Mokoron Russian Twitter Corpus** | `load_mokoron` | social, sentiment | 17 633 417 | 1.86 Gb | Russian Twitter sentiment markup <br/> Manually download https://www.dropbox.com/s/9egqjszeicki4ho/db.sql |
| **Wikipedia** | `load_wiki` | | 1 541 401 | 12.94 Gb | Russian Wiki dump <br/> wget https://dumps.wikimedia.org/ruwiki/latest/ruwiki-latest-pages-articles.xml.bz2 |
| **GramEval2020** | `load_gramru` | | 162 372 | 30.04 Mb | wget https://github.com/dialogue-evaluation/GramEval2020/archive/master.zip <br/> unzip master.zip <br/> mv GramEval2020-master/dataTrain train <br/> mv GramEval2020-master/dataOpenTest dev <br/> rm -r master.zip GramEval2020-master <br/> wget https://github.com/AlexeySorokin/GramEval2020/raw/master/data/GramEval_private_test.conllu |
| **OpenCorpora** | `load_corpora` | morph | 4 030 | 20.21 Mb | wget http://opencorpora.org/files/export/annot/annot.opcorpora.xml.zip |
| **RusVectores SimLex-965** | `load_simlex` | emb, sim | | | wget https://rusvectores.org/static/testsets/ru_simlex965_tagged.tsv <br/> wget https://rusvectores.org/static/testsets/ru_simlex965.tsv |
| **Omnia Russica** | `load_omnia` | morph, web, fiction | | 489.62 Gb | Taiga + Wiki + Araneum. Read "Even larger Russian corpus" https://events.spbu.ru/eventsContent/events/2019/corpora/corp_sborn.pdf <br/> Manually download http://bit.ly/2ZT4BY9 |
| **factRuEval-2016** | `load_factru` | ner, news | 254 | 969.27 Kb | Manual PER, LOC, ORG markup prepared for 2016 Dialog competition <br/> wget https://github.com/dialogue-evaluation/factRuEval-2016/archive/master.zip <br/> unzip master.zip <br/> rm master.zip |
| **Gareev** | `load_gareev` | ner, news | 97 | 455.02 Kb | Manual PER, ORG markup (no LOC) <br/> Email Rinat Gareev (gareev-rm@yandex.ru) ask for dataset <br/> tar -xvf rus-ner-news-corpus.iob.tar.gz <br/> rm rus-ner-news-corpus.iob.tar.gz |
| **Collection5** | `load_ne5` | ner, news | 1 000 | 2.96 Mb | News articles with manual PER, LOC, ORG markup <br/> wget http://www.labinform.ru/pub/named_entities/collection5.zip <br/> unzip collection5.zip <br/> rm collection5.zip |
| **WiNER** | `load_wikiner` | ner | 203 287 | 36.15 Mb | Sentences from Wiki auto annotated with PER, LOC, ORG tags <br/> wget https://github.com/dice-group/FOX/raw/master/input/Wikiner/aij-wikiner-ru-wp3.bz2 |
| **BSNLP-2019** | `load_bsnlp` | ner | 464 | 1.16 Mb | Markup prepared for 2019 BSNLP Shared Task <br/> wget http://bsnlp.cs.helsinki.fi/TRAININGDATA_BSNLP_2019_shared_task.zip <br/> wget http://bsnlp.cs.helsinki.fi/TESTDATA_BSNLP_2019_shared_task.zip <br/> unzip TRAININGDATA_BSNLP_2019_shared_task.zip <br/> unzip TESTDATA_BSNLP_2019_shared_task.zip -d test_pl_cs_ru_bg <br/> rm TRAININGDATA_BSNLP_2019_shared_task.zip TESTDATA_BSNLP_2019_shared_task.zip |
| **Persons-1000** | `load_persons` | ner, news | 1 000 | 2.96 Mb | Same as Collection5, only PER markup + normalized names <br/> wget http://ai-center.botik.ru/Airec/ai-resources/Persons-1000.zip |
| **The Russian Drug Reaction Corpus (RuDReC)** | `load_rudrec` | ner | 4 809 | 1.73 Kb | RuDReC is a new partially annotated corpus of consumer reviews in Russian about pharmaceutical products for the detection of health-related named entities and the effectiveness of pharmaceutical products. Here you can download and work with the annotated part, to get the raw part (1.4M reviews) please refer to https://github.com/cimm-kzn/RuDReC. <br/> wget https://github.com/cimm-kzn/RuDReC/raw/master/data/rudrec_annotated.json |
| **Taiga** | | | | | Large collection of Russian texts from various sources: news sites, magazines, literature, social networks <br/> wget https://linghub.ru/static/Taiga/retagged_taiga.tar.gz <br/> tar -xzvf retagged_taiga.tar.gz |
| Arzamas | `load_taiga_arzamas` | news | 311 | 4.50 Mb | |
| Fontanka | `load_taiga_fontanka` | news | 342 683 | 786.23 Mb | |
| Interfax | `load_taiga_interfax` | news | 46 429 | 77.55 Mb | |
| KP | `load_taiga_kp` | news | 45 503 | 61.79 Mb | |
| Lenta | `load_taiga_lenta` | news | 36 446 | 95.15 Mb | |
| Taiga/N+1 | `load_taiga_nplus1` | news | 7 696 | 24.96 Mb | |
| Magazines | `load_taiga_magazines` | | 39 890 | 2.19 Gb | |
| Subtitles | `load_taiga_subtitles` | | 19 011 | 909.08 Mb | |
| Social | `load_taiga_social` | social | 1 876 442 | 648.18 Mb | |
| Proza | `load_taiga_proza` | fiction | 1 732 434 | 38.25 Gb | |
| Stihi | `load_taiga_stihi` | | 9 157 686 | 12.80 Gb | |
| **Russian NLP Datasets** | | | | | Several Russian news datasets from webhose.io, lenta.ru and other news sites. |
| News | `load_buriy_news` | news | 2 154 801 | 6.84 Gb | Dump of top 40 news + 20 fashion news sites. <br/> wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2014.tar.bz2 <br/> wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part1.tar.bz2 <br/> wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/news-articles-2015-part2.tar.bz2 |
| Webhose | `load_buriy_webhose` | news | 285 965 | 859.32 Mb | Dump from webhose.io, 300 sources for one month. <br/> wget https://github.com/buriy/russian-nlp-datasets/releases/download/r4/webhose-2016.tar.bz2 |
| **ODS #proj_news_viz** | | | | | Several news sites scraped by members of #proj_news_viz ODS project. |
| Interfax | `load_ods_interfax` | news | 543 961 | 1.22 Gb | wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/interfax.csv.gz |
| Gazeta | `load_ods_gazeta` | news | 865 847 | 1.63 Gb | wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/gazeta.csv.gz |
| Izvestia | `load_ods_izvestia` | news | 86 601 | 307.19 Mb | wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/iz.csv.gz |
| Meduza | `load_ods_meduza` | news | 71 806 | 270.11 Mb | wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/meduza.csv.gz |
| RIA | `load_ods_ria` | news | 101 543 | 233.88 Mb | wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/ria.csv.gz |
| Russia Today | `load_ods_rt` | news | 106 644 | 187.12 Mb | wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/rt.csv.gz |
| TASS | `load_ods_tass` | news | 1 135 635 | 3.27 Gb | wget https://github.com/ods-ai-ml4sg/proj_news_viz/releases/download/data/tass-001.csv.gz |
| **Universal Dependencies** | | | | | |
| GSD | `load_ud_gsd` | morph, syntax | 5 030 | 1.01 Mb | wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-dev.conllu <br/> wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-test.conllu <br/> wget https://github.com/UniversalDependencies/UD_Russian-GSD/raw/master/ru_gsd-ud-train.conllu |
| Taiga | `load_ud_taiga` | morph, syntax | 3 264 | 353.80 Kb | wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-dev.conllu <br/> wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-test.conllu <br/> wget https://github.com/UniversalDependencies/UD_Russian-Taiga/raw/master/ru_taiga-ud-train.conllu |
| PUD | `load_ud_pud` | morph, syntax | 1 000 | 207.78 Kb | wget https://github.com/UniversalDependencies/UD_Russian-PUD/raw/master/ru_pud-ud-test.conllu |
| SynTagRus | `load_ud_syntag` | morph, syntax | 61 889 | 11.33 Mb | wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-dev.conllu <br/> wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-test.conllu <br/> wget https://github.com/UniversalDependencies/UD_Russian-SynTagRus/raw/master/ru_syntagrus-ud-train.conllu |
| **morphoRuEval-2017** | | | | | |
| General Internet-Corpus | `load_morphoru_gicrya` | morph | 83 148 | 10.58 Mb | wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/GIKRYA_texts_new.zip <br/> unzip GIKRYA_texts_new.zip <br/> rm GIKRYA_texts_new.zip |
| Russian National Corpus | `load_morphoru_rnc` | morph | 98 892 | 12.71 Mb | wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/RNC_texts.rar <br/> unrar x RNC_texts.rar <br/> rm RNC_texts.rar |
| OpenCorpora | `load_morphoru_corpora` | morph | 38 510 | 4.80 Mb | wget https://github.com/dialogue-evaluation/morphoRuEval-2017/raw/master/OpenCorpora_Texts.rar <br/> unrar x OpenCorpora_Texts.rar <br/> rm OpenCorpora_Texts.rar |
| **RUSSE Russian Semantic Relatedness** | | | | | |
| HJ: Human Judgements of Word Pairs | `load_russe_hj` | emb, sim | | | wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/hj.csv |
| RT: Synonyms and Hypernyms from the Thesaurus RuThes | `load_russe_rt` | emb, sim | | | wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/rt.csv |
| AE: Cognitive Associations from the Sociation.org Experiment | `load_russe_ae` | emb, sim | | | wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-train.csv <br/> wget https://github.com/nlpub/russe-evaluation/raw/master/russe/evaluation/ae-test.csv <br/> wget https://raw.githubusercontent.com/nlpub/russe-evaluation/master/russe/evaluation/ae2.csv |
| **Toloka Datasets** | | | | | |
| Lexical Relations from the Wisdom of the Crowd (LRWC) | `load_toloka_lrwc` | emb, sim | | | wget https://tlk.s3.yandex.net/dataset/LRWC.zip <br/> unzip LRWC.zip <br/> rm LRWC.zip |
| The Russian Adverse Drug Reaction Corpus of Tweets (RuADReCT) | `load_ruadrect` | social | 9 515 | 2.09 Mb | This corpus was developed for the Social Media Mining for Health Applications (#SMM4H) Shared Task 2020 <br/> wget https://github.com/cimm-kzn/RuDReC/raw/master/data/RuADReCT.zip <br/> unzip RuADReCT.zip <br/> rm RuADReCT.zip |
1322 | 1323 | 1324 | ## Support 1325 | 1326 | - Chat — https://t.me/natural_language_processing 1327 | - Issues — https://github.com/natasha/corus/issues 1328 | - Commercial support — https://lab.alexkuk.ru 1329 | 1330 | ## Add new source 1331 | 1332 | 1. Implement `corus/sources/.py` 1333 | 2. Add import into `corus/sources/__init__.py` 1334 | 3. Add meta into `corus/source/meta.py` 1335 | 4. Add example into `docs.ipynb` (check meta table is correct) 1336 | 5. Run tests (readme is updated) 1337 | 1338 | ## Development 1339 | 1340 | Dev env 1341 | 1342 | ```bash 1343 | python -m venv ~/.venvs/natasha-corus 1344 | source ~/.venvs/natasha-corus/bin/activate 1345 | 1346 | pip install -r requirements/dev.txt 1347 | pip install -e . 1348 | 1349 | python -m ipykernel install --user --name natasha-corus 1350 | ``` 1351 | 1352 | Lint + update docs 1353 | 1354 | ```bash 1355 | make lint 1356 | make exec-docs 1357 | ``` 1358 | 1359 | Release 1360 | 1361 | ```bash 1362 | # Update setup.py version 1363 | 1364 | git commit -am 'Up version' 1365 | git tag v0.10.0 1366 | 1367 | git push 1368 | git push --tags 1369 | ``` 1370 | --------------------------------------------------------------------------------