├── perl └── lib │ ├── ISO │ ├── LMF.pm │ └── LMF │ │ ├── EntityBase.pm │ │ ├── LexicalResource.pm │ │ ├── Lexicon.pm │ │ ├── WordForm.pm │ │ └── LexicalEntry.pm │ ├── Lingua │ └── AOT │ │ ├── MorphDict │ │ ├── FormSpec.pm │ │ ├── AccentParadigm.pm │ │ ├── Form.pm │ │ ├── MorphVariant.pm │ │ ├── Paradigm.pm │ │ └── Gramtab.pm │ │ └── test.pl │ └── OpenCorpora │ └── Dict │ ├── SimpleReader.pm │ └── Entry.pm ├── lib ├── .htaccess ├── timer.php ├── lib_mail.php └── header_ajax.php ├── anaphora ├── NE_extract │ ├── bad_noun.txt │ ├── pronouns.txt │ ├── run_toma.sh │ ├── kwtypes.proto │ ├── run_parse.sh │ ├── maindic.gzt │ ├── config.proto │ └── facttypes.proto ├── ana_test.pairs ├── features │ └── runF.sh ├── export_pairs.py ├── learning │ ├── baseline.py │ ├── learn.py │ └── scorer.py ├── ana_test.groups └── pairs.py ├── migrations ├── .htaccess ├── 20140101000000_initial_db.php ├── 20150703113519_add_pool_proto_name.php ├── 20160203212016_multiword_types.php ├── 20150610120658_merge_fails_comments.php ├── 20160124093035_add_prop_order.php ├── 20150628112302_turn_game_on_for_all.php ├── 20160130085413_more_on_multiwords.php ├── 20160110211650_remove_old_ne_prop.php ├── 20151209163951_add_ne_book_moderator.php ├── 20151210150743_add_moderator_column_to_ne_par.php ├── 20140930101915_add_timer.php ├── 20190504181939_add_sentence_quality.php ├── 20160716091852_ne_annot_number_per_tagset.php ├── 20160122154639_obj_property_multiple_values.php ├── 20200625120954_long_good_sentences.php ├── 20151028150709_move_object_type_to2_level.php ├── 20150720124123_last_dict_revision.php ├── 20160109110445_ne_objects_properties.php ├── 20140923233544_add_permission_ne_moderator.php ├── 20151105153227_many_entities2_many_mentions.php ├── 20171203130148_drop_sentence_check.php ├── 20140830004643_named_entities_event_log.php ├── 20140828143321_named_entities_comments.php ├── 20151107191437_color_for_types.php ├── 20190416073732_add_user_generated_dict_revisions.php ├── 
20150420150929_new_achievements_table.php ├── 20150709223738_simplify_user_rating.php ├── 20150603164038_remove_old_achievements.php ├── 20150623182816_remove_user_meta_options.php ├── 20151108101851_many_books2_many_tagsets.php ├── 20160129161233_mw_basic_structure.php ├── 20150216164636_undelete_lemmata.php ├── 20150624200024_change_candidate_samples.php ├── 20151023153845_add_ne_tagsets.php └── 20150314214304_add_tables_for_selective_backup.php ├── postagging └── brill │ └── unsupervised │ ├── python │ ├── learn_rules │ │ └── __init__.py │ ├── spearman_test │ │ ├── 1.txt │ │ ├── 1r.txt │ │ ├── 2.txt │ │ └── 3.txt │ ├── check_disjoint.sh │ ├── learning_test │ │ ├── rand1.tab │ │ └── rand0.tab │ ├── apply.py │ ├── pictures.py │ └── sample_corpus.py │ ├── cpp │ ├── include │ │ ├── corpora_io.h │ │ ├── utils.h │ │ ├── dict.h │ │ └── sentence.h │ ├── train │ │ ├── Makefile │ │ ├── aux.h │ │ └── aux.cpp │ ├── lemmatizer │ │ └── Makefile │ └── lib │ │ └── brill.cpp │ └── perl │ └── diff_tab_markup.pl ├── scripts ├── oc2conllu │ └── requirements.txt ├── train_tokenizer.sh ├── aot_import │ ├── lists │ │ ├── add_Arch_ADJF.txt │ │ ├── add_Dist_PRTS.txt │ │ ├── add_Infr_ablt_plur.txt │ │ ├── add_Erro_PRTS.txt │ │ ├── add_Dist_aux.txt │ │ ├── add_Litr.txt │ │ ├── abbr_del.txt │ │ ├── remove_ANim.txt │ │ ├── Del_anim-inan&Add_ANim.txt │ │ ├── add_Coun_gent_plur.txt │ │ ├── list_adjf_fixd_NOUN.txt │ │ ├── add_Erro_ADJS.txt │ │ ├── add_Infr_ADJS.txt │ │ ├── add_Infr_COMP.txt │ │ ├── add_Infr_ablt_sing.txt │ │ ├── pred_to_intj.txt │ │ ├── list_adjf_fixd_ADVB.txt │ │ ├── adjs_forms_del.txt │ │ ├── list_numr_dupl_gent.txt │ │ └── Unite_Paras&Add_ANim.txt │ ├── read_mrd.pl │ ├── rgramtab.tab │ ├── list_paradigm.pl │ ├── morphs.mrd.patch │ └── bad_lemma_grammems.txt ├── export_and_stats.sh ├── exports.sh ├── subst.txt ├── stats │ └── update_stats.sh ├── ma_pools │ ├── post_merge.php │ ├── autopublish.php │ └── unpublish_pools.py ├── tokenizer │ ├── train.php │ ├── tokenize.php │ ├── 
cronrunner.pl │ ├── tokenizer_exceptions.txt │ └── check_sentence_split.pl ├── invalidate_auth_tokens.pl ├── run_validators.sh ├── mwords │ ├── rules.txt │ └── search.php ├── json2ini.py ├── cronrunner.pl ├── validators │ ├── par_validator.py │ └── url_validator.py ├── export │ └── export_dict.sh ├── check_dog_achievement.php ├── backup.sh ├── consistency │ ├── form2tf.pl │ ├── form2lemma.pl │ └── dict_update_forms.pl └── delete_unused_files.pl ├── export ├── annot │ ├── disamb_nonmod_tests │ │ ├── pools.txt │ │ ├── pool_158.tab │ │ └── annot.opcorpora.canon_out.xml │ └── no_homonymy_constants.py ├── pools │ └── export_pools.sh ├── pools.pl └── database │ ├── backup.sh │ └── copy_nulled_tables.sql ├── favicon.ico ├── doc ├── presentations │ ├── 2012_September29_OnePage.sh │ ├── 2012_May31_Dialog_RoundTable.sh │ ├── img │ │ ├── annotation-lifecycle.sh │ │ ├── 2012_miem_1.png │ │ ├── 2012_miem_2.png │ │ ├── 2012_miem_3.png │ │ ├── 2012_miem_4.png │ │ ├── 2012_miem_5.png │ │ ├── 2012_miem_6.png │ │ ├── markupUI2.png │ │ ├── markupUI2-part.png │ │ ├── 2011_nlpseminar_1.png │ │ ├── 2011_nlpseminar_2.png │ │ ├── 2011_nlpseminar_3.png │ │ └── annotation-lifecycle.png │ └── 2012_September29_OnePage.tex └── articles │ └── img │ ├── 2011_Dialog_img1.png │ └── 2012_MIEM_img1.png ├── assets ├── img │ ├── sf.ttf │ ├── grey.png │ ├── logo.png │ ├── star.png │ ├── appeal1.png │ ├── appeal2.png │ ├── fb-pic.png │ ├── lang_en.png │ ├── lang_ru.png │ ├── robot.png │ ├── icon_plus.png │ ├── robot_big.png │ ├── tiny_grid.png │ ├── ajax-loader.gif │ ├── icon_glass.png │ ├── icon_smile.png │ ├── icon_target.png │ ├── wiki │ │ ├── Markup.png │ │ ├── Buttonanswer.png │ │ ├── Buttonnomore.png │ │ ├── Buttonother.png │ │ ├── Buttonskip.png │ │ ├── Tasklist_php.png │ │ ├── Buttoncomment.png │ │ ├── Buttonwantmore.png │ │ ├── Leftcontextex.png │ │ └── Rightcontextex.png │ ├── icon_speed_60.png │ ├── icon_star_gray.png │ ├── icon_star_red.png │ ├── icon_user_blue.png │ ├── tiny_grid_blue.png │ 
├── icon_speedometer.png │ ├── icon_star_green.png │ ├── icon_star_orange.png │ ├── icon_star_yellow.png │ ├── icon_trophy_black.png │ ├── icon_user_orange.png │ └── badges │ │ ├── aist-400x400.png │ │ ├── bobr-400x400.png │ │ ├── dog-400x400.png │ │ ├── fish-400x400.png │ │ ├── ainl2015-400x400.png │ │ ├── chameleon-400x400.png │ │ ├── wantmore-400x400.png │ │ ├── aist-100x100-grayscale.png │ │ ├── bobr-100x100-grayscale.png │ │ ├── dog-100x100-grayscale.png │ │ ├── fish-100x100-grayscale.png │ │ ├── ainl2015-100x100-grayscale.png │ │ ├── chameleon-100x100-grayscale.png │ │ └── wantmore-100x100-grayscale.png └── js │ └── merge_fails.js ├── no_js.php ├── README.md ├── templates ├── error.tpl ├── static │ ├── doc │ │ └── annotation.tpl │ ├── faq.tpl │ └── downloads.row.tpl ├── qa │ ├── tasks_guest.tpl │ ├── empty_books.tpl │ ├── sent_split.tpl │ ├── unknowns.tpl │ ├── good_sentences.tpl │ ├── dl_urls.tpl │ ├── pool_tabs.tpl │ ├── game_status.tpl │ ├── book_tags.tpl │ ├── pool_candidates.tpl │ ├── useful_pools.tpl │ ├── tokenizer.tpl │ └── pool_types.tpl ├── common_no_js.tpl ├── dict │ ├── links_main.tpl │ ├── absent.tpl │ └── links_single.tpl ├── search.tpl ├── books.tpl ├── top100.tpl ├── openid_license.tpl ├── addtext.tpl ├── footer.tpl ├── ner │ └── _partials │ │ └── objects-modal.tpl ├── tag_stats.tpl ├── comments.tpl ├── sentence_syntax_groups_moderator.tpl └── sentence_syntax_moderator.tpl ├── tokenizer_monitor.php ├── ajax ├── readonly.php ├── wantmore.php ├── clck_log.php ├── merge_tokens.php ├── tokenizer_monitor.php ├── add_book_tag.php ├── set_option.php ├── download_url.php ├── run_test.php ├── run_generator.php ├── set_token_text.php ├── game_mark_shown.php ├── lemma_search.php ├── select_book.php ├── publish_update.php ├── tag_autocomplete.php ├── get_context.php ├── paradigm_info.php ├── dict_pending.php ├── save_check.php ├── merge_fails.php ├── own_book.php ├── lastpar.php ├── anaphora.php ├── dict_reload.php ├── annot.php ├── post_comment.php ├── 
bind_book.php ├── get_comments.php └── guess_wiki_categ.php ├── SECURITY.md ├── search.php ├── comments.php ├── .gitignore ├── dict_diff.php ├── dict_history.php ├── diff.php ├── robots.txt ├── revert.php ├── sources.php ├── user.php ├── add.php ├── history.php ├── generator_cp.php ├── manual.php ├── syntax.php ├── composer.json ├── ner.php ├── phinx.php └── options.php /perl/lib/ISO/LMF.pm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1; 4 | -------------------------------------------------------------------------------- /lib/.htaccess: -------------------------------------------------------------------------------- 1 | Order allow,deny 2 | Deny from all 3 | -------------------------------------------------------------------------------- /anaphora/NE_extract/bad_noun.txt: -------------------------------------------------------------------------------- 1 | быль 2 | были 3 | уж 4 | -------------------------------------------------------------------------------- /migrations/.htaccess: -------------------------------------------------------------------------------- 1 | Order deny,allow 2 | Deny from all 3 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/learn_rules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/oc2conllu/requirements.txt: -------------------------------------------------------------------------------- 1 | xmltodict 2 | russian_tagsets 3 | -------------------------------------------------------------------------------- /scripts/train_tokenizer.sh: -------------------------------------------------------------------------------- 1 | php /corpus/scripts/tokenizer/train.php 2 | -------------------------------------------------------------------------------- 
/export/annot/disamb_nonmod_tests/pools.txt: -------------------------------------------------------------------------------- 1 | 158 NOUN&plur&gent@NOUN&plur&accs 4 2 | -------------------------------------------------------------------------------- /favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/favicon.ico -------------------------------------------------------------------------------- /doc/presentations/2012_September29_OnePage.sh: -------------------------------------------------------------------------------- 1 | pdflatex 2012_September29_OnePage.tex 2 | -------------------------------------------------------------------------------- /assets/img/sf.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/sf.ttf -------------------------------------------------------------------------------- /no_js.php: -------------------------------------------------------------------------------- 1 | display('common_no_js.tpl'); 4 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/spearman_test/1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/spearman_test/1r.txt: -------------------------------------------------------------------------------- 1 | 6 2 | 5 3 | 4 4 | 3 5 | 2 6 | 1 7 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/spearman_test/2.txt: -------------------------------------------------------------------------------- 1 | 2 2 | 1 3 | 4 4 | 3 5 | 6 6 | 5 7 | 
-------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/spearman_test/3.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | -------------------------------------------------------------------------------- /assets/img/grey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/grey.png -------------------------------------------------------------------------------- /assets/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/logo.png -------------------------------------------------------------------------------- /assets/img/star.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/star.png -------------------------------------------------------------------------------- /doc/presentations/2012_May31_Dialog_RoundTable.sh: -------------------------------------------------------------------------------- 1 | pdflatex 2012_May31_Dialog_RoundTable.tex 2 | -------------------------------------------------------------------------------- /assets/img/appeal1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/appeal1.png -------------------------------------------------------------------------------- /assets/img/appeal2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/appeal2.png -------------------------------------------------------------------------------- /assets/img/fb-pic.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/fb-pic.png -------------------------------------------------------------------------------- /assets/img/lang_en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/lang_en.png -------------------------------------------------------------------------------- /assets/img/lang_ru.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/lang_ru.png -------------------------------------------------------------------------------- /assets/img/robot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/robot.png -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Arch_ADJF.txt: -------------------------------------------------------------------------------- 1 | #Добавить этим полным прилагательным помету Устар. 
2 | самоё -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Dist_PRTS.txt: -------------------------------------------------------------------------------- 1 | #Добавить этим кратким причастиям помету Искаж 2 | стера 3 | -------------------------------------------------------------------------------- /assets/img/icon_plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_plus.png -------------------------------------------------------------------------------- /assets/img/robot_big.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/robot_big.png -------------------------------------------------------------------------------- /assets/img/tiny_grid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/tiny_grid.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | opencorpora 2 | =========== 3 | 4 | A web-based engine for creating and annotating textual corpora 5 | -------------------------------------------------------------------------------- /assets/img/ajax-loader.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/ajax-loader.gif -------------------------------------------------------------------------------- /assets/img/icon_glass.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_glass.png 
-------------------------------------------------------------------------------- /assets/img/icon_smile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_smile.png -------------------------------------------------------------------------------- /assets/img/icon_target.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_target.png -------------------------------------------------------------------------------- /assets/img/wiki/Markup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Markup.png -------------------------------------------------------------------------------- /assets/img/icon_speed_60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_speed_60.png -------------------------------------------------------------------------------- /assets/img/icon_star_gray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_star_gray.png -------------------------------------------------------------------------------- /assets/img/icon_star_red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_star_red.png -------------------------------------------------------------------------------- /assets/img/icon_user_blue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_user_blue.png 
-------------------------------------------------------------------------------- /assets/img/tiny_grid_blue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/tiny_grid_blue.png -------------------------------------------------------------------------------- /assets/img/icon_speedometer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_speedometer.png -------------------------------------------------------------------------------- /assets/img/icon_star_green.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_star_green.png -------------------------------------------------------------------------------- /assets/img/icon_star_orange.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_star_orange.png -------------------------------------------------------------------------------- /assets/img/icon_star_yellow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_star_yellow.png -------------------------------------------------------------------------------- /assets/img/icon_trophy_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_trophy_black.png -------------------------------------------------------------------------------- /assets/img/icon_user_orange.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/icon_user_orange.png -------------------------------------------------------------------------------- /assets/img/wiki/Buttonanswer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Buttonanswer.png -------------------------------------------------------------------------------- /assets/img/wiki/Buttonnomore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Buttonnomore.png -------------------------------------------------------------------------------- /assets/img/wiki/Buttonother.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Buttonother.png -------------------------------------------------------------------------------- /assets/img/wiki/Buttonskip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Buttonskip.png -------------------------------------------------------------------------------- /assets/img/wiki/Tasklist_php.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Tasklist_php.png -------------------------------------------------------------------------------- /doc/presentations/img/annotation-lifecycle.sh: -------------------------------------------------------------------------------- 1 | dot img/annotation-lifecycle.dot -Tpng -o img/annotation-lifecycle.png 2 | -------------------------------------------------------------------------------- /scripts/aot_import/read_mrd.pl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/scripts/aot_import/read_mrd.pl -------------------------------------------------------------------------------- /scripts/aot_import/rgramtab.tab: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/scripts/aot_import/rgramtab.tab -------------------------------------------------------------------------------- /assets/img/badges/aist-400x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/aist-400x400.png -------------------------------------------------------------------------------- /assets/img/badges/bobr-400x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/bobr-400x400.png -------------------------------------------------------------------------------- /assets/img/badges/dog-400x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/dog-400x400.png -------------------------------------------------------------------------------- /assets/img/badges/fish-400x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/fish-400x400.png -------------------------------------------------------------------------------- /assets/img/wiki/Buttoncomment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Buttoncomment.png 
-------------------------------------------------------------------------------- /assets/img/wiki/Buttonwantmore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Buttonwantmore.png -------------------------------------------------------------------------------- /assets/img/wiki/Leftcontextex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Leftcontextex.png -------------------------------------------------------------------------------- /assets/img/wiki/Rightcontextex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/wiki/Rightcontextex.png -------------------------------------------------------------------------------- /scripts/export_and_stats.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | /corpus/scripts/exports.sh 3 | /corpus/scripts/stats/update_stats.sh 4 | -------------------------------------------------------------------------------- /doc/articles/img/2011_Dialog_img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/articles/img/2011_Dialog_img1.png -------------------------------------------------------------------------------- /doc/articles/img/2012_MIEM_img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/articles/img/2012_MIEM_img1.png -------------------------------------------------------------------------------- /doc/presentations/img/2012_miem_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2012_miem_1.png -------------------------------------------------------------------------------- /doc/presentations/img/2012_miem_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2012_miem_2.png -------------------------------------------------------------------------------- /doc/presentations/img/2012_miem_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2012_miem_3.png -------------------------------------------------------------------------------- /doc/presentations/img/2012_miem_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2012_miem_4.png -------------------------------------------------------------------------------- /doc/presentations/img/2012_miem_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2012_miem_5.png -------------------------------------------------------------------------------- /doc/presentations/img/2012_miem_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2012_miem_6.png -------------------------------------------------------------------------------- /doc/presentations/img/markupUI2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/markupUI2.png -------------------------------------------------------------------------------- 
/scripts/aot_import/list_paradigm.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/scripts/aot_import/list_paradigm.pl -------------------------------------------------------------------------------- /scripts/aot_import/morphs.mrd.patch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/scripts/aot_import/morphs.mrd.patch -------------------------------------------------------------------------------- /templates/error.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

{$error_text}

5 | {/block} 6 | -------------------------------------------------------------------------------- /assets/img/badges/ainl2015-400x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/ainl2015-400x400.png -------------------------------------------------------------------------------- /assets/img/badges/chameleon-400x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/chameleon-400x400.png -------------------------------------------------------------------------------- /assets/img/badges/wantmore-400x400.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/wantmore-400x400.png -------------------------------------------------------------------------------- /doc/presentations/img/markupUI2-part.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/markupUI2-part.png -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/check_disjoint.sh: -------------------------------------------------------------------------------- 1 | ls *.tab | xargs -I XXX grep -E -o "^[0-9]+" XXX | sort | uniq -c | sort -n | head 2 | -------------------------------------------------------------------------------- /templates/static/doc/annotation.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name='content'} 4 | {$content} 5 | {/block} 6 | -------------------------------------------------------------------------------- /anaphora/NE_extract/pronouns.txt: 
-------------------------------------------------------------------------------- 1 | он 2 | она 3 | оно 4 | они 5 | который 6 | себя 7 | себе 8 | собой 9 | их 10 | его 11 | ее 12 | свой 13 | -------------------------------------------------------------------------------- /assets/img/badges/aist-100x100-grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/aist-100x100-grayscale.png -------------------------------------------------------------------------------- /assets/img/badges/bobr-100x100-grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/bobr-100x100-grayscale.png -------------------------------------------------------------------------------- /assets/img/badges/dog-100x100-grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/dog-100x100-grayscale.png -------------------------------------------------------------------------------- /assets/img/badges/fish-100x100-grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/fish-100x100-grayscale.png -------------------------------------------------------------------------------- /doc/presentations/img/2011_nlpseminar_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2011_nlpseminar_1.png -------------------------------------------------------------------------------- /doc/presentations/img/2011_nlpseminar_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2011_nlpseminar_2.png -------------------------------------------------------------------------------- /doc/presentations/img/2011_nlpseminar_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/2011_nlpseminar_3.png -------------------------------------------------------------------------------- /export/annot/no_homonymy_constants.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | DECISION_OTHER = 'Other' 4 | DECISION_UNKNOWN = 'UNKN' -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Infr_ablt_plur.txt: -------------------------------------------------------------------------------- 1 | #Поставить этим формам сущ твор мн помету Разг 2 | дверьми 3 | дочерями 4 | лошадями 5 | плечьми 6 | -------------------------------------------------------------------------------- /doc/presentations/img/annotation-lifecycle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/doc/presentations/img/annotation-lifecycle.png -------------------------------------------------------------------------------- /templates/qa/tasks_guest.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Разметка

5 | {$content} 6 | {/block} 7 | -------------------------------------------------------------------------------- /assets/img/badges/ainl2015-100x100-grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/ainl2015-100x100-grayscale.png -------------------------------------------------------------------------------- /assets/img/badges/chameleon-100x100-grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/chameleon-100x100-grayscale.png -------------------------------------------------------------------------------- /assets/img/badges/wantmore-100x100-grayscale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenCorpora/opencorpora/HEAD/assets/img/badges/wantmore-100x100-grayscale.png -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Erro_PRTS.txt: -------------------------------------------------------------------------------- 1 | #Добавить этим кратким причастиям помету Опеч 2 | раздана 3 | замыслен 4 | замыслена 5 | замыслено 6 | замыслены 7 | -------------------------------------------------------------------------------- /tokenizer_monitor.php: -------------------------------------------------------------------------------- 1 | display('tokenizer_monitor.tpl'); 7 | ?> 8 | -------------------------------------------------------------------------------- /scripts/exports.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | /corpus/scripts/export/export_annot.sh 3 | /corpus/scripts/export/export_ngram.sh 4 | /corpus/scripts/export/export_dict.sh 5 | -------------------------------------------------------------------------------- 
/scripts/aot_import/lists/add_Dist_aux.txt: -------------------------------------------------------------------------------- 1 | #Добавить этим формам помету Искаж 2 | алё 3 | зарям 4 | издалека 5 | ложут 6 | ольгя 7 | придти 8 | никитичом 9 | саввичом 10 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Litr.txt: -------------------------------------------------------------------------------- 1 | #Поставить этим компаративам помету Литературн 2 | горше 3 | огнь 4 | заградогнь 5 | угль 6 | церквам 7 | церквами 8 | церквах 9 | 10 | -------------------------------------------------------------------------------- /anaphora/ana_test.pairs: -------------------------------------------------------------------------------- 1 | 1_8 ANA_MATCH 2 | 6_11 PARA 3 | 6_13 PART_MATCH 4 | 13_31 PARA 5 | 29_32 PRON 6 | 47_52 ANA_MATCH 7 | 45_58 PARA 8 | 58_61 PARA 9 | 61_77 PARA 10 | 11 | -------------------------------------------------------------------------------- /ajax/readonly.php: -------------------------------------------------------------------------------- 1 | file_exists($config['project']['readonly_flag']) ? 
1 : 0); 4 | die(json_encode($result)); 5 | ?> 6 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/abbr_del.txt: -------------------------------------------------------------------------------- 1 | #удалить сокращения из парадигмы 2 | кг 3 | км 4 | млрд 5 | мм 6 | млн 7 | перев 8 | ред 9 | рис 10 | им 11 | см 12 | руб 13 | спб 14 | ст 15 | табл 16 | трлн 17 | тыс 18 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/remove_ANim.txt: -------------------------------------------------------------------------------- 1 | азотобактерия 2 | архебактерия 3 | архибактерия 4 | бактерия 5 | железобактерия 6 | нитробактерия 7 | серобактерия 8 | уробактерия 9 | фотобактерия 10 | энтеробактерия 11 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/Del_anim-inan&Add_ANim.txt: -------------------------------------------------------------------------------- 1 | #дано: перечислены леммы 2 | #задача: заменить anim или inan на ANim 3 | бомбардировщик 4 | вирус 5 | зародыш 6 | менингококк 7 | разведчик 8 | стрептококк 9 | эмбрион -------------------------------------------------------------------------------- /templates/common_no_js.tpl: -------------------------------------------------------------------------------- 1 | {extends "common.tpl"} 2 | 3 | {block name="nojs"}{/block} 4 | {block name="content"} 5 |
6 |

Ошибка

7 |

Для корректной работы сайта необходим JavaScript.

8 | {/block} -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Как сообщить об уязвимости 2 | 3 | Пожалуйста, пишите на granovsky@opencorpora.org и bocharov@opencorpora.org. 4 | 5 | # How to report a vulnerability 6 | 7 | Please contact granovsky@opencorpora.org and bocharov@opencorpora.org. 8 | -------------------------------------------------------------------------------- /scripts/subst.txt: -------------------------------------------------------------------------------- 1 | общий 2 | такой 3 | особенный 4 | русский 5 | английский 6 | больной 7 | весь 8 | один 9 | дальнейший 10 | знакомый 11 | каждый 12 | бессознательный 13 | другой 14 | обычной 15 | хороший 16 | несовершеннолетний 17 | угловой 18 | -------------------------------------------------------------------------------- /search.php: -------------------------------------------------------------------------------- 1 | assign('search', get_search_results($search, GET('exact_form', true))); 7 | $smarty->display('search.tpl'); 8 | log_timing(); 9 | -------------------------------------------------------------------------------- /comments.php: -------------------------------------------------------------------------------- 1 | assign('comments', get_latest_comments($skip)); 8 | $smarty->assign('skip', $skip); 9 | $smarty->display('comments.tpl'); 10 | log_timing(); 11 | ?> 12 | -------------------------------------------------------------------------------- /scripts/stats/update_stats.sh: -------------------------------------------------------------------------------- 1 | /corpus/scripts/cronrunner.pl "/corpus/scripts/stats/update_stats.pl /corpus/config.ini" 2 | /corpus/scripts/cronrunner.pl "/corpus/scripts/stats/update_tag_stats.pl /corpus/config.ini" 3 | /corpus/scripts/cronrunner.pl "php /corpus/scripts/stats/sentence_quality.php" 4 | 
-------------------------------------------------------------------------------- /templates/dict/links_main.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name='content'} 4 |

Виды связей

5 |
    6 | {foreach item=name key=typeid from=$data} 7 |
  1. {$name}
  2. 8 | {/foreach} 9 |
10 | {/block} 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .htaccess 2 | /assets/vendor 3 | /composer.phar 4 | /config.ini 5 | /config.json 6 | /doc/articles/*.pdf 7 | /doc/presentations/*.pdf 8 | /files 9 | /vendor 10 | /yadisk-auth 11 | /yandex_*.html 12 | /yandex_*.txt 13 | *.swp 14 | *.project 15 | *.prefs 16 | *.pydevproject 17 | *.pyc 18 | -------------------------------------------------------------------------------- /anaphora/features/runF.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | perl lineal.pl -m $1 -g $2 -p $3 > lineal.tmp 4 | python morph.py $2 $1 < $3 | cut -f 2- > morph.tmp 5 | perl synt.pl > synt.tmp 6 | python ParDistance.py $3 $2 $1 | cut -f 2 > ParDistance.tmp 7 | paste lineal.tmp morph.tmp synt.tmp ParDistance.tmp 8 | rm *.tmp 9 | -------------------------------------------------------------------------------- /ajax/wantmore.php: -------------------------------------------------------------------------------- 1 | emit(EventTypes::WANT_MORE); 11 | 12 | echo json_encode($result); -------------------------------------------------------------------------------- /ajax/clck_log.php: -------------------------------------------------------------------------------- 1 | 14 | -------------------------------------------------------------------------------- /scripts/ma_pools/post_merge.php: -------------------------------------------------------------------------------- 1 | 14 | -------------------------------------------------------------------------------- /dict_diff.php: -------------------------------------------------------------------------------- 1 | assign('diff', dict_diff($lemma_id, $set_id)); 8 | $smarty->display('dict/diff.tpl'); 9 | log_timing(); 10 | ?> 11 | -------------------------------------------------------------------------------- 
/scripts/ma_pools/autopublish.php: -------------------------------------------------------------------------------- 1 | 15 | -------------------------------------------------------------------------------- /dict_history.php: -------------------------------------------------------------------------------- 1 | assign('history', dict_history($lemma_id, $skip)); 7 | $smarty->assign('skip', $skip); 8 | $smarty->display('dict/history.tpl'); 9 | log_timing(); 10 | ?> 11 | -------------------------------------------------------------------------------- /scripts/aot_import/bad_lemma_grammems.txt: -------------------------------------------------------------------------------- 1 | * sing 2 | * plur 3 | * indc 4 | * nomn 5 | * gent 6 | * datv 7 | * accs 8 | * ablt 9 | * loct 10 | VERB excl 11 | VERB impr 12 | VERB pres 13 | VERB past 14 | VERB masc 15 | VERB neut 16 | GRND pres 17 | GRND past 18 | GRND V-sh 19 | ADJS neut 20 | PRTF masc 21 | PRTS femn 22 | COMP V-ej 23 | -------------------------------------------------------------------------------- /ajax/set_option.php: -------------------------------------------------------------------------------- 1 | 15 | -------------------------------------------------------------------------------- /diff.php: -------------------------------------------------------------------------------- 1 | assign('diff', main_diff($sent_id, $set_id, $rev_id)); 9 | $smarty->display('diff.tpl'); 10 | log_timing(); 11 | ?> 12 | -------------------------------------------------------------------------------- /ajax/run_test.php: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /ajax/run_generator.php: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /ajax/set_token_text.php: 
-------------------------------------------------------------------------------- 1 | execute(file_get_contents(__DIR__.'/initial_schema.sql')); 9 | } 10 | 11 | public function down() { 12 | throw new Exception("Not implemented"); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /ajax/game_mark_shown.php: -------------------------------------------------------------------------------- 1 | set_all_seen(); 9 | } 10 | catch (Exception $e) { 11 | $result['error'] = 1; 12 | } 13 | 14 | log_timing(true); 15 | die(json_encode($result)); 16 | -------------------------------------------------------------------------------- /ajax/lemma_search.php: -------------------------------------------------------------------------------- 1 | 14 | -------------------------------------------------------------------------------- /ajax/select_book.php: -------------------------------------------------------------------------------- 1 | $title) { 10 | $result['books'][] = array('id' => $id, 'title' => $title); 11 | } 12 | 13 | log_timing(true); 14 | die(json_encode($result)); 15 | ?> 16 | -------------------------------------------------------------------------------- /perl/lib/Lingua/AOT/MorphDict/FormSpec.pm: -------------------------------------------------------------------------------- 1 | package Lingua::AOT::MorphDict::FormSpec; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use Encode; 7 | 8 | our $VERSION = "0.01"; 9 | 10 | 11 | sub new { 12 | my $self = {}; 13 | my $class; 14 | ($class, $self->{flex}, $self->{ancode}, $self->{prefix}) = @_; 15 | 16 | bless($self, $class); 17 | 18 | return $self; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /ajax/publish_update.php: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- 
/migrations/20150703113519_add_pool_proto_name.php: -------------------------------------------------------------------------------- 1 | table("morph_annot_pool_types"); 10 | $types->addColumn('pool_proto_name', 'string', array('limit' => 120)) 11 | ->update(); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /ajax/tag_autocomplete.php: -------------------------------------------------------------------------------- 1 | 12 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/list_adjf_fixd_NOUN.txt: -------------------------------------------------------------------------------- 1 | аллегри 2 | апаш 3 | апплике 4 | барокко 5 | беж 6 | бордо 7 | брокколи 8 | буфф 9 | валансьен 10 | дум-дум 11 | клёш 12 | либерти 13 | макси 14 | маренго 15 | миди 16 | модерн 17 | морзе 18 | онлайн 19 | пик 20 | пике 21 | плаке 22 | плиссе 23 | пралине 24 | рамбулье 25 | ретро 26 | рококо 27 | сомон 28 | фантази 29 | фри 30 | хаки 31 | экстра 32 | электрик 33 | -------------------------------------------------------------------------------- /perl/lib/ISO/LMF/EntityBase.pm: -------------------------------------------------------------------------------- 1 | package OpenCorpora::ISO::LMF::EntityBase; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | our $VERSION = "0.01"; 8 | 9 | 10 | 11 | 12 | sub new { 13 | my($class, %args) = @_; 14 | 15 | my $self = bless({}, $class); 16 | 17 | $self->{xmlatt} = {}; 18 | $self->{feat} = {}; 19 | $self->{fsr} = {}; 20 | 21 | return $self; 22 | } 23 | 24 | 25 | 1; 26 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/learning_test/rand1.tab: -------------------------------------------------------------------------------- 1 | sent 2 | 100 муму 1 мумуа TAGA 2 мумуб TAGB 3 | 101 куку 3 куку TAGC 4 | 102 . 0 . 
PNCT 5 | /sent 6 | 7 | sent 8 | 103 пупу 1 пупуа TAGA 9 | 104 зузу 3 зузу TAGC 10 | 105 . 0 . PNCT 11 | /sent 12 | 13 | sent 14 | 106 фуфу 2 фуфуб TAGB 15 | 107 . 0 . PNCT 16 | /sent 17 | 18 | sent 19 | 108 муму 1 мумуа TAGA 2 мумуб TAGB 20 | 109 . 0 . PNCT 21 | /sent 22 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Erro_ADJS.txt: -------------------------------------------------------------------------------- 1 | взаимосвязанна 2 | взаимосвязанны 3 | взаимосвязанно 4 | излюбленна 5 | излюбленны 6 | излюбленно 7 | незаверенна 8 | незаверенны 9 | незаверенно 10 | приверженна 11 | приверженны 12 | приверженно 13 | самоотвержена 14 | самоотвержено 15 | самоотвержены 16 | безветрено 17 | завихренно 18 | закомплексованно 19 | неуверено 20 | отверженно 21 | превыспренно 22 | превыспренны 23 | 24 | 25 | -------------------------------------------------------------------------------- /scripts/tokenizer/train.php: -------------------------------------------------------------------------------- 1 | train($limit); 16 | -------------------------------------------------------------------------------- /migrations/20160203212016_multiword_types.php: -------------------------------------------------------------------------------- 1 | table("mw_main") 11 | ->addColumn('mw_type', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 12 | ->save(); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /books 3 | Disallow: /dict 4 | Disallow: /diff 5 | Disallow: /files/saved 6 | Disallow: /history 7 | Disallow: /login 8 | Disallow: /options 9 | Disallow: /pools 10 | Disallow: /revert 11 | Disallow: /scripts 12 | Disallow: /sentence 13 | Disallow: /tasks 14 | Disallow: /user.php 15 | Disallow: /w/ 16 | Disallow: /wiki/Special:Search 
17 | Disallow: /wiki/Special:Random 18 | Disallow: /?page=top100 19 | -------------------------------------------------------------------------------- /scripts/invalidate_auth_tokens.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use DBI; 4 | use Config::INI::Reader; 5 | 6 | #reading config 7 | my $conf = Config::INI::Reader->read_file($ARGV[0]); 8 | $conf = $conf->{mysql}; 9 | 10 | my $dbh = DBI->connect('DBI:mysql:'.$conf->{'dbname'}.':'.$conf->{'host'}, $conf->{'user'}, $conf->{'passwd'}) or die $DBI::errstr; 11 | $dbh->do("DELETE FROM user_tokens WHERE timestamp<".(time()-60*60*24*7)); 12 | -------------------------------------------------------------------------------- /scripts/run_validators.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CONFIG_INI=/corpus/config.ini 3 | CONFIG_JSON=/corpus/config.json 4 | 5 | python /corpus/scripts/validators/year_valid.py $CONFIG_INI 6 | python /corpus/scripts/validators/author_validator.py $CONFIG_INI 7 | python /corpus/scripts/validators/url_validator.py $CONFIG_INI 8 | python /corpus/scripts/validators/par_validator.py $CONFIG_INI 9 | /corpus/scripts/find_good_sentences.py $CONFIG_JSON 10 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/learning_test/rand0.tab: -------------------------------------------------------------------------------- 1 | sent 2 | 100 муму 1 мумуа TAGA 2 мумуб TAGB 3 | 101 куку 3 куку TAGC 4 | 102 . 0 . PNCT 5 | /sent 6 | 7 | sent 8 | 103 абаб 4 абаба TAGD 9 | 104 пупу 1 пупуа TAGA 10 | 105 зузу 3 зузу TAGC 11 | 106 . 0 . PNCT 12 | /sent 13 | 14 | sent 15 | 107 фуфу 2 фуфуб TAGB 16 | 108 . 0 . PNCT 17 | /sent 18 | 19 | sent 20 | 109 муму 1 мумуа TAGA 2 мумуб TAGB 21 | 110 . 0 . 
PNCT 22 | /sent 23 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/include/corpora_io.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "sentence.h" 6 | #include "tag.h" 7 | 8 | #ifndef __CORPORA_IO_H 9 | #define __CORPORA_IO_H 10 | 11 | typedef std::vector SentenceCollection; 12 | 13 | void readCorpus(const std::string &fn, SentenceCollection &sc); 14 | 15 | std::set makeVariants(const std::string &s); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /anaphora/NE_extract/run_toma.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | if [ -z "$1" ] || [ -z "$2" ] 4 | then 5 | echo "./run_toma.sh PlaintextDirName ResXmlDirName" 6 | exit 1 7 | else 8 | 9 | mkdir -p $2 10 | for d in $1/* 11 | do 12 | XML=$(basename $d) 13 | echo $XML 14 | D=`echo ${d//\//\\\/}` 15 | echo $D 16 | sed -i "s/Dir = \".*\"/Dir = \"$D\"/" config.proto 17 | ./tomita-upd config.proto > $2/$XML.xml 18 | done 19 | fi 20 | -------------------------------------------------------------------------------- /perl/lib/Lingua/AOT/MorphDict/AccentParadigm.pm: -------------------------------------------------------------------------------- 1 | package Lingua::AOT::MorphDict::AccentParadigm; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | our $VERSION = "0.01"; 8 | 9 | 10 | sub new { 11 | my ($class, $line) = @_; 12 | my $self = {}; 13 | 14 | @{$self->{forms}} = split(/;/, $line); 15 | 16 | bless($self, $class); 17 | return $self; 18 | } 19 | 20 | sub GetLastFormNo { 21 | my $self = shift; 22 | return $#{$self->{forms}}; 23 | } 24 | -------------------------------------------------------------------------------- /templates/static/faq.tpl: -------------------------------------------------------------------------------- 1 | {* 
Smarty *} 2 | {extends file='common.tpl'} 3 | {block name='content'} 4 |

О проекте

5 | 11 | {if isset($title)}

{$title}

{/if} 12 | {$content} 13 | {/block} 14 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Infr_ADJS.txt: -------------------------------------------------------------------------------- 1 | #Добавить этим кратким прилагательным помету Разг. 2 | вёртка 3 | квела 4 | сажённа 5 | сверхлёгка 6 | хлёстка 7 | чётка 8 | весёлы 9 | волён 10 | выспренне 11 | выспренни 12 | далёки 13 | добродушно-весёлы 14 | жестки 15 | искренно 16 | искренны 17 | кисл 18 | шустр 19 | неискренно 20 | неудовлетворённо 21 | общо 22 | угнетённа 23 | угнетённо 24 | угнетённы 25 | черствы 26 | нетвёрды 27 | твёрды 28 | огорчённа 29 | огорчённо 30 | остер 31 | -------------------------------------------------------------------------------- /ajax/get_context.php: -------------------------------------------------------------------------------- 1 | 17 | -------------------------------------------------------------------------------- /templates/dict/absent.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name='content'} 4 | 8 |

Top 500 токенов с UNKN

9 |
    10 | {foreach from=$words item=word} 11 |
  1. {$word.word|htmlspecialchars} [{$word.count}]
  2. 12 | {/foreach} 13 |
14 | {/block} 15 | -------------------------------------------------------------------------------- /templates/dict/links_single.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name='content'} 4 |

Всего пар лемм со связью типа {$data.name}: {$data.total}. Ниже не более 100 примеров.

5 | 14 | {/block} 15 | -------------------------------------------------------------------------------- /anaphora/NE_extract/kwtypes.proto: -------------------------------------------------------------------------------- 1 | import "base.proto"; // подключаем описания protobuf-типов (TAuxDicArticle и прочих) 2 | import "articles_base.proto"; // Файлы base.proto и articles_base.proto встроены в компилятор. 3 | 4 | message complex_prep: TAuxDicArticle { }; 5 | message complex_adv: TAuxDicArticle { }; 6 | message complex_conj: TAuxDicArticle { }; 7 | message introduct: TAuxDicArticle { }; 8 | message bad_noun: TAuxDicArticle { }; 9 | message pronoun: TAuxDicArticle { }; 10 | -------------------------------------------------------------------------------- /scripts/mwords/rules.txt: -------------------------------------------------------------------------------- 1 | # правила для поиска кандидатов в мультитокены 2 | # c # можно начинать строки с комментариями 3 | 4 | # поиск по умолчанию - по точной форме, без учёта регистра 5 | 6 | # через @ можно написать тип, он определяет надпись на кнопке у юзера, 7 | # по умолчанию там написано "мультитокен" 8 | # @1 - "сложный предлог" 9 | # @2 - "сложный союз" 10 | 11 | 12 | в течение @ 1 13 | в течении @ 1 14 | несмотря на @ 1 15 | т . к . @ 2 16 | так как @ 2 17 | хотя б 18 | хотя бы 19 | -------------------------------------------------------------------------------- /templates/qa/empty_books.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Пустые тексты

5 |

Это тексты, к которым не приписано ни одного раздела и ни одного предложения.

6 |

Список обновляется при каждом обращении к этой странице.

7 |
    8 | {foreach item=book from=$books} 9 |
  1. {$book.name|htmlspecialchars}
  2. 10 | {foreachelse} 11 |

    Список пуст.

    12 | {/foreach} 13 |
14 | {/block} 15 | -------------------------------------------------------------------------------- /migrations/20150610120658_merge_fails_comments.php: -------------------------------------------------------------------------------- 1 | table('morph_annot_merge_comments', array('id' => false, 'primary_key' => array('sample_id'))); 10 | $tbl->addColumn('sample_id', 'integer', array('signed' => false)) 11 | ->addColumn('comment', 'text') 12 | ->create(); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /templates/search.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Результаты поиска

5 |

Всего найдено: {$search.total}

6 | {foreach from=$search.results item=s name=m} 7 |

{$smarty.foreach.m.index + 1}. {foreach from=$s.context item=word key=tid}{if $tid == $s.mainword}{$word|htmlspecialchars}{else}{$word|htmlspecialchars}{/if} {/foreach}

8 | {/foreach} 9 | {/block} 10 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/train/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | INC_PATH=../include/ 3 | LIB_PATH=../lib/ 4 | CFLAGS=-c -O3 -std=c++0x -Wall -I$(INC_PATH) 5 | SOURCES=main.cpp aux.cpp $(LIB_PATH)corpora_io.cpp $(LIB_PATH)corpus_stat.cpp $(LIB_PATH)brill.cpp 6 | OBJECTS=$(SOURCES:.cpp=.o) 7 | INCLUDES=$(wildcard $(INC_PATH)/*.h) $(wildcard *.h) 8 | 9 | all: train 10 | 11 | train: $(OBJECTS) 12 | $(CC) -O3 $(OBJECTS) -o train 13 | 14 | .cpp.o: $< ${INCLUDES} 15 | $(CC) $(CFLAGS) $< -o $@ 16 | 17 | clean: 18 | rm -rf *.o train 19 | -------------------------------------------------------------------------------- /migrations/20160124093035_add_prop_order.php: -------------------------------------------------------------------------------- 1 | execute("alter table ne_object_props add column `order` int unsigned not null default 0"); 14 | } 15 | 16 | /** 17 | * Migrate Down. 
18 | */ 19 | public function down() 20 | { 21 | $this->table("ne_object_props")->removeColumn("order"); 22 | } 23 | } -------------------------------------------------------------------------------- /perl/lib/ISO/LMF/LexicalResource.pm: -------------------------------------------------------------------------------- 1 | package OpenCorpora::ISO::LMF::LexicalResource; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | our $VERSION = "0.01"; 8 | 9 | sub new { 10 | my($class, %args) = @_; 11 | my $base = OpenCorpora::ISO::LMF::EntityBase->new(); 12 | my $self = bless($base, $class); 13 | 14 | $self->{lexicon} = (); 15 | 16 | return $self; 17 | } 18 | 19 | sub add_lexicon { 20 | my ($self, @lexicon) = @_; 21 | foreach my $l (@lexicon) { 22 | push @{$self->{lexicon}}, $l; 23 | } 24 | } 25 | 26 | 1; 27 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/apply.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import sys 4 | 5 | from learn_rules.utils import apply_rule, read_corpus, write_corpus, parse_rule 6 | 7 | 8 | if __name__ == '__main__': 9 | TYPES = {'tag': 0, 'word': 1} 10 | rules = [] 11 | inc = read_corpus(sys.stdin) 12 | for line in open(sys.argv[1], 'r'): 13 | if not line: 14 | continue 15 | r = parse_rule(line) 16 | rules.append(r) 17 | inc = list(apply_rule(r, inc)) 18 | write_corpus(inc) 19 | -------------------------------------------------------------------------------- /scripts/json2ini.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import json 4 | 5 | 6 | def main(path): 7 | with open(path) as fin: 8 | data = json.load(fin) 9 | for block_name, block_items in data.items(): 10 | print("[{}]".format(block_name)) 11 | for k, v in block_items.items(): 12 | vstr = ','.join(list(map(str, v))) if isinstance(v, list) else v 13 | print("{} = 
{}".format(k, vstr)) 14 | print() 15 | 16 | 17 | if __name__ == "__main__": 18 | main(sys.argv[1]) 19 | -------------------------------------------------------------------------------- /templates/qa/sent_split.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Странное деление предложений

5 |

Список обновляется раз в час.

6 | 7 | 8 | {foreach from=$sentences item=s} 9 | 10 | 11 | 12 | 13 | 14 | {/foreach} 15 |
idТекст 
{$s.id}{$s.text|htmlspecialchars}исправить
16 | {/block} 17 | -------------------------------------------------------------------------------- /perl/lib/Lingua/AOT/MorphDict/Form.pm: -------------------------------------------------------------------------------- 1 | package Lingua::AOT::MorphDict::Form; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | use Encode; 7 | 8 | our $VERSION = "0.01"; 9 | 10 | 11 | sub new { 12 | my $self = {}; 13 | my ($class, $ref_dic, $text, $ancode, $lemma_ancode) = @_; 14 | ($self->{text}, $self->{ancode}) = ($text, $ancode); 15 | 16 | bless($self, $class); 17 | return $self; 18 | } 19 | 20 | sub Text { 21 | my $self = shift; 22 | return $self->{text}; 23 | } 24 | 25 | sub Ancode { 26 | my $self = shift; 27 | return $self->{ancode}; 28 | } 29 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/include/utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #ifndef __UTILS_H 7 | #define __UTILS_H 8 | 9 | inline std::vector &split(const std::string &s, char delim, std::vector &elems) { 10 | // std::cerr << "split(\"" << s << "\", \'" << delim << "\' ...)" << std::endl; 11 | std::stringstream ss(s); 12 | std::string item; 13 | while(std::getline(ss, item, delim)) { 14 | elems.push_back(item); 15 | } 16 | return elems; 17 | } 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /ajax/paradigm_info.php: -------------------------------------------------------------------------------- 1 | $para['lemma_gram'], 'suffix' => $para['lemma_suffix_len']); 8 | $result['forms'] = array(); 9 | foreach ($para['forms'] as $form) { 10 | $result['forms'][] = array('gram' => join(', ', $form['grm']), 'suffix' => $form['suffix']); 11 | } 12 | } 13 | else 14 | $result['error'] = 1; 15 | 16 | log_timing(true); 17 | die(json_encode($result)); 18 | ?> 19 | 
-------------------------------------------------------------------------------- /revert.php: -------------------------------------------------------------------------------- 1 | 24 | -------------------------------------------------------------------------------- /scripts/tokenizer/tokenize.php: -------------------------------------------------------------------------------- 1 | tokenize($line) as $token) { 13 | echo implode("\t", array($token->start_pos, $token->end_pos, $token->get_feats_str_binary(), $token->border_weight)) . "\n"; 14 | } 15 | } 16 | 17 | ?> 18 | -------------------------------------------------------------------------------- /migrations/20150628112302_turn_game_on_for_all.php: -------------------------------------------------------------------------------- 1 | table("users"); 11 | $users->removeColumn("show_game"); 12 | } 13 | 14 | public function down() 15 | { 16 | $users = $this->table("users"); 17 | $users->addColumn("show_game", "integer", array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 18 | ->save(); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Infr_COMP.txt: -------------------------------------------------------------------------------- 1 | # Поставить компаративу помету Разг 2 | бойчее 3 | врожденнее 4 | врожденней 5 | поврожденнее 6 | поврожденней 7 | дешевее 8 | дичее 9 | закаленнее 10 | закаленней 11 | позакаленней 12 | позакаленнее 13 | неудовлетворённее 14 | неудовлетворённей 15 | понеудовлетворённее 16 | понеудовлетворённей 17 | угнетенней 18 | угнетеннее 19 | поугнетенней 20 | поугнетеннее 21 | позднее 22 | поздней 23 | попозднее 24 | попоздней 25 | прирожденнее 26 | прирожденней 27 | поприрожденнее 28 | поприрожденней 29 | старее 30 | старей 31 | постарее 32 | постарей 33 | хлеще 34 | похлеще 35 | -------------------------------------------------------------------------------- /templates/qa/unknowns.tpl: 
-------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Токены из словаря, но с разбором UNKN ({$tokens|sizeof})

5 | 6 | {foreach item=token from=$tokens} 7 | 8 | 9 | 10 | 11 | {/foreach} 12 |
{$token.text|htmlspecialchars}{foreach from=$token.comments item=comment}

{$comment.text|htmlspecialchars} ({$comment.author|htmlspecialchars})

{/foreach}
13 | {/block} 14 | -------------------------------------------------------------------------------- /scripts/cronrunner.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $cmd = shift or exit print qq{Usage: cronrunner.pl "cmd --args"}; 7 | 8 | my @pieces = grep defined, (split / /, $cmd)[0, 1]; 9 | my $file = (grep -e $_, @pieces)[0]; 10 | 11 | my $lock; 12 | if(defined $file) { 13 | $file = (split '/', $file)[-1]; 14 | $lock = "/var/lock/$file.lock"; 15 | } 16 | else { 17 | $lock = "/var/lock/$pieces[0].lock"; 18 | } 19 | 20 | # List-form system: $lock and $cmd reach flock verbatim, with no extra shell quoting layer around $cmd. 21 | system('flock', '--exclusive', '--nonblock', $lock, '--command', $cmd) 22 | and exit print "Failed to acquire run lock: $!"; 23 | 24 | exit 0; 25 | -------------------------------------------------------------------------------- /migrations/20160130085413_more_on_multiwords.php: -------------------------------------------------------------------------------- 1 | table('mw_answers') 11 | ->addColumn('answer', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 12 | ->save(); 13 | 14 | $this->table('mw_main') 15 | ->addColumn('applied', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 16 | ->save(); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/add_Infr_ablt_sing.txt: -------------------------------------------------------------------------------- 1 | #поставить форма твор падежа помету Разг 2 | баржой 3 | баржою 4 | бомжем 5 | военспецем 6 | главспецем 7 | голбцом 8 | гуляшем 9 | ильичем 10 | кешом 11 | киём 12 | кишмишом 13 | козьмичем 14 | кряжом 15 | кузьмичем 16 | кэшом 17 | ломтём 18 | лукичем 19 | мальцем 20 | мацей 21 | метакэшем 22 | миражом 23 | неровнёй 24 | неровнёю 25 | обжой 26 | петлей 27 | петлею 28 | пешней 29 | пешнею 30 | пращей 31 | ровнёй 32 | ровнёю 33 | ряжом 34 | сажем 35 | сазандарём 36 | слэшом 
37 | спецем 38 | ставцем 39 | стукачем 40 | фомичем 41 | углем 42 | чавычой 43 | чувашом 44 | шлицом 45 | 46 | -------------------------------------------------------------------------------- /scripts/tokenizer/cronrunner.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $cmd = shift or exit print qq{Usage: cronrunner.pl "cmd --args"}; 7 | 8 | my @pieces = grep defined, (split / /, $cmd)[0, 1]; 9 | my $file = (grep -e $_, @pieces)[0]; 10 | 11 | my $lock; 12 | if(defined $file) { 13 | $file = (split '/', $file)[-1]; 14 | $lock = "/var/lock/$file.lock"; 15 | } 16 | else { 17 | $lock = "/var/lock/$pieces[0].lock"; 18 | } 19 | 20 | # flock has no "--non-block" option; use "--nonblock". List-form system passes $lock and $cmd to flock verbatim. 21 | system('flock', '--exclusive', '--nonblock', $lock, '--command', $cmd) 22 | and exit print "Failed to acquire run lock: $!"; 23 | 24 | exit 0; 25 | -------------------------------------------------------------------------------- /perl/lib/ISO/LMF/Lexicon.pm: -------------------------------------------------------------------------------- 1 | package OpenCorpora::ISO::LMF::Lexicon; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | use OpenCorpora::ISO::LMF::EntityBase; 8 | 9 | our $VERSION = "0.01"; 10 | 11 | 12 | sub new { 13 | my($class, %args) = @_; 14 | my $base = OpenCorpora::ISO::LMF::EntityBase->new(); 15 | my $self = bless($base, $class); 16 | 17 | $self->{lexical_entries} = []; # array ref; the old "= ()" stored undef in scalar context 18 | 19 | return $self; 20 | } 21 | 22 | sub add_lexical_entry { 23 | my ($self, @lexical_entries) = @_; 24 | foreach my $le (@lexical_entries) { 25 | push @{$self->{lexical_entries}}, $le; 26 | } 27 | } 28 | 29 | 1; 30 | -------------------------------------------------------------------------------- /ajax/dict_pending.php: -------------------------------------------------------------------------------- 1 | 24 | -------------------------------------------------------------------------------- /migrations/20160110211650_remove_old_ne_prop.php: 
-------------------------------------------------------------------------------- 1 | table("ne_objects") 10 | ->removeColumn("canon_name") 11 | ->removeColumn("wikidata_id") 12 | ->update(); 13 | } 14 | 15 | public function down() 16 | { 17 | $this->table("ne_objects") 18 | ->addColumn("canon_name", "string", array("limit" => 255)) 19 | ->addColumn("wikidata_id", "integer") 20 | ->save(); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /migrations/20151209163951_add_ne_book_moderator.php: -------------------------------------------------------------------------------- 1 | table("ne_books_tagsets") 12 | ->addColumn("moderator_id", "integer", array("default" => 0)) 13 | ->update(); 14 | } 15 | 16 | /** 17 | * Migrate Down. 18 | */ 19 | public function down() { 20 | $this->table("ne_books_tagsets") 21 | ->removeColumn("moderator_id") 22 | ->update(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /templates/qa/good_sentences.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Наименее омонимичные предложения

5 |

Список обновляется раз в сутки.

6 |

{if isset($smarty.get.no_zero)}показать неомонимичные{else}скрыть неомонимичные{/if}

7 | 8 | 9 | {foreach item=sentence from=$sentences} 10 | 11 | {/foreach} 12 |
#Всего словОмонимичных слов
{$sentence.id}{$sentence.total}{$sentence.homonymous}
13 | {/block} 14 | -------------------------------------------------------------------------------- /perl/lib/ISO/LMF/WordForm.pm: -------------------------------------------------------------------------------- 1 | package OpenCorpora::ISO::LMF::WordForm; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | use OpenCorpora::ISO::LMF::EntityBase; 8 | 9 | our $VERSION = "0.01"; 10 | 11 | 12 | sub new { 13 | my($class, %args) = @_; 14 | my $base = OpenCorpora::ISO::LMF::EntityBase->new(); 15 | my $self = bless($base, $class); 16 | 17 | $self->{form_representations} = []; # array ref; the old "= ()" stored undef in scalar context 18 | 19 | return $self; 20 | } 21 | 22 | sub add_form_representation { 23 | my ($self, @form_representations) = @_; 24 | foreach my $fr (@form_representations) { 25 | push @{$self->{form_representations}}, $fr; 26 | } 27 | } 28 | 29 | 1; 30 | -------------------------------------------------------------------------------- /lib/timer.php: -------------------------------------------------------------------------------- 1 | 0) 12 | $user_id = $_SESSION['user_id']; 13 | 14 | $page = $_SERVER['REQUEST_URI']; 15 | sql_pe( 16 | "INSERT INTO timing (user_id, page, total_time, is_ajax) VALUES (?, ?, ?, ?)", 17 | array($user_id, $page, $total_time, $is_ajax ? 1 : 0) 18 | ); 19 | } 20 | ?> 21 | -------------------------------------------------------------------------------- /migrations/20151210150743_add_moderator_column_to_ne_par.php: -------------------------------------------------------------------------------- 1 | table("ne_paragraphs") 12 | ->addColumn("is_moderator", "boolean", array("default" => false)) 13 | ->update(); 14 | } 15 | 16 | /** 17 | * Migrate Down. 
18 | */ 19 | public function down() { 20 | $this->table("ne_paragraphs") 21 | ->removeColumn("is_moderator") 22 | ->update(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /perl/lib/Lingua/AOT/MorphDict/MorphVariant.pm: -------------------------------------------------------------------------------- 1 | package Lingua::AOT::MorphDict::MorphVariant; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | use Lingua::AOT::MorphDict::FormSpec; 8 | use Lingua::AOT::MorphDict::Form; 9 | use Lingua::AOT::MorphDict::Paradigm; 10 | 11 | our $VERSION = "0.01"; 12 | 13 | # Constructor: stores the lemma id and ancode; the blessed object is returned as the value of bless(). 14 | sub new { 15 | my ($class, $lemma_id, $ancode) = @_; 16 | my $self = {}; 17 | ($self->{lid}, $self->{ancode}) = ($lemma_id, $ancode); 18 | bless($self, $class); 19 | } 20 | 21 | sub LemmaId { 22 | my $self = shift; 23 | return $self->{lid}; 24 | } 25 | 26 | sub Ancode { 27 | my $self = shift; 28 | return $self->{ancode}; 29 | } 30 | 31 | 1; # explicit true value so require/use never depends on the last evaluated statement 32 | -------------------------------------------------------------------------------- /ajax/save_check.php: -------------------------------------------------------------------------------- 1 | 24 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/pred_to_intj.txt: -------------------------------------------------------------------------------- 1 | аллилуйя 2 | аминь 3 | брык 4 | верть 5 | вот 6 | дёрг 7 | да 8 | динь-динь-динь 9 | добро 10 | дрыг 11 | дудки 12 | кувырк 13 | мерси 14 | молчок 15 | морг 16 | на-ка 17 | нате-ка 18 | нет 19 | неужели 20 | неужто 21 | ни-ни 22 | паф 23 | пиф-паф 24 | плюс-минус 25 | пожалуйста 26 | прыг 27 | растудыть 28 | скок 29 | спасибо 30 | стук 31 | то-то 32 | топ 33 | трух-трух 34 | трюх-трюх 35 | тык 36 | тырк 37 | тю-тю 38 | умора 39 | фук 40 | фырк 41 | хап 42 | хлысть 43 | хлясть 44 | хоп 45 | цап 46 | цап-царап 47 | царап 48 | цоп 49 | чебурах 50 | чик-чирик 51 | шабаш 52 | шарк 53 | швах 54 | швырк 55 | шмыг 56 | шмяк 57 | 
щип 58 | юрк 59 | -------------------------------------------------------------------------------- /ajax/merge_fails.php: -------------------------------------------------------------------------------- 1 | $TMPDIR/pools.txt 12 | 13 | for id in $( cat $TMPDIR/pools.txt | gawk '{ print $1 }' ) 14 | do 15 | wget -q "http://localhost/pools.php?act=samples&pool_id=$id&tabs=1&mod_ans" --output-document=$TMPDIR/pool_$id.tab 16 | done 17 | 18 | cd $TMPDIR 19 | zip -q9 $EXPORT_DIR/pools.zip pool*.t* 20 | tar -cjf $EXPORT_DIR/pools.tar.bz2 pool*.tab pools.txt --remove-files 21 | 22 | rm -rf $TMPDIR 23 | -------------------------------------------------------------------------------- /migrations/20140930101915_add_timer.php: -------------------------------------------------------------------------------- 1 | execute("CREATE TABLE timing ( 10 | `timestamp` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, 11 | `user_id` SMALLINT UNSIGNED NOT NULL DEFAULT 0, 12 | `page` VARCHAR(255) NOT NULL, 13 | `total_time` FLOAT NOT NULL, 14 | `is_ajax` TINYINT UNSIGNED NOT NULL 15 | ) ENGINE=INNODB"); 16 | } 17 | 18 | public function down() 19 | { 20 | $this->dropTable("timing"); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /sources.php: -------------------------------------------------------------------------------- 1 | assign('sources', get_sources_page($skip, $what, $src)); 17 | $smarty->assign('what', $what); 18 | $smarty->assign('skip', $skip); 19 | $smarty->display('templates/sources.tpl'); 20 | } 21 | log_timing(); 22 | ?> 23 | -------------------------------------------------------------------------------- /user.php: -------------------------------------------------------------------------------- 1 | assign('user', get_user_info($id)); 10 | $smarty->assign('user_id', $id); 11 | 12 | $smarty->assign('complexity', array( 13 | 0 => 'Сложность неизвестна', 14 | 1 => 'Очень простые задания', 15 | 2 => 'Простые задания', 16 | 3 => 
'Сложные задания', 17 | 4 => 'Очень сложные задания')); 18 | 19 | $am2 = new AchievementsManager($id); 20 | $smarty->assign('achievements', $a = $am2->pull_all()); 21 | 22 | $smarty->display('user.tpl'); 23 | log_timing(); 24 | -------------------------------------------------------------------------------- /anaphora/NE_extract/run_parse.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | # All three arguments are required: with an empty ResDirName the "rm $3/*" below would expand to "rm /*". 3 | if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ] 4 | then 5 | echo "./run_parse.sh MorphDirName XmlDirName ResDirName" 6 | exit 1 7 | else 8 | # ${XML}_sorted needs braces: $XML_sorted would be read as an (unset) variable named XML_sorted. 9 | mkdir -p $3 10 | rm $3/* 11 | for d in $1/* 12 | do 13 | XML=$(basename $d) 14 | perl parse_xmlfacts.pl -m $d -x $2/$XML.xml > $3/$XML.tsv 2>>lost 15 | sort -u -k2,2 $3/$XML.tsv | sort -n > $3/${XML}_sorted.tsv 16 | mv $3/${XML}_sorted.tsv $3/$XML.tsv 17 | cat $3/$XML.tsv >> $3/groups.tsv 18 | done 19 | 20 | sort -u -k2,2 $3/groups.tsv | sort -n > $3/groups_sorted.tsv 21 | mv $3/groups_sorted.tsv $3/groups.tsv 22 | grep -P "\t17" $3/groups.tsv > $3/pronouns.tsv 23 | fi 24 | -------------------------------------------------------------------------------- /migrations/20190504181939_add_sentence_quality.php: -------------------------------------------------------------------------------- 1 | table('sentence_quality', array('id' => false, 'engine' => 'InnoDB')); 11 | $sq->addColumn('length', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_SMALL)) 12 | ->addColumn('status', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 13 | ->addColumn('count', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_MEDIUM)) 14 | ->create(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /doc/presentations/2012_September29_OnePage.tex: -------------------------------------------------------------------------------- 1 | \documentclass{beamer} 2 | \usepackage{cmap} 3 | \usepackage[utf8]{inputenc} 4 | 
\usepackage[russian]{babel} 5 | \usepackage{listings} 6 | \usetheme{Antibes} 7 | \usecolortheme{beaver} 8 | \usepackage{graphicx} 9 | \graphicspath{{img/}} 10 | 11 | \title{Проект Открытый корпус / Морфологическая разметка} 12 | \begin{document} 13 | 14 | %slide 01 15 | \begin{frame} 16 | \frametitle{Присоединяйтесь к созданию разметки} 17 | \large{http://opencorpora.org} 18 | \hspace{1.4cm} 19 | \small{\color{gray}\{vk.com|twitter.com\}/opencorpora} 20 | \begin{figure} 21 | \center{\includegraphics[width=1\linewidth]{markupUI2-part.png}} 22 | \end{figure} 23 | \end{frame} 24 | 25 | \end{document} 26 | -------------------------------------------------------------------------------- /templates/qa/dl_urls.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Сохранённые копии текстов источников

5 |

Список обновляется при каждом обращении к этой странице.

6 | 7 | 8 | {foreach item=obj from=$urls} 9 | 10 | 11 | 12 | 13 | 14 | {/foreach} 15 |
ТекстurlФайл
{$obj.book_name|htmlspecialchars}{$obj.url|truncate}{$obj.filename}{if $obj.filename && !$obj.exists}, не существует{/if}
16 | {/block} 17 | -------------------------------------------------------------------------------- /ajax/own_book.php: -------------------------------------------------------------------------------- 1 | 0 ? $_SESSION['user_id'] : 0; 16 | 17 | sql_pe("UPDATE sources SET user_id=? WHERE source_id=? LIMIT 1", array($user_id, $sid)); 18 | } 19 | catch (Exception $e) { 20 | $result['error'] = 1; 21 | } 22 | log_timing(true); 23 | die(json_encode($result)); 24 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/list_adjf_fixd_ADVB.txt: -------------------------------------------------------------------------------- 1 | адажио 2 | алегретто 3 | аллегро 4 | альсекко 5 | альфреско 6 | анданте 7 | андантино 8 | аппассионато 9 | арпеджио 10 | арпеджо 11 | брутто 12 | вибрато 13 | виваче 14 | виво 15 | глиссандо 16 | граве 17 | декрешендо 18 | декрещендо 19 | диминуэндо 20 | дольче 21 | ин-кварто 22 | ин-октаво 23 | ин-фолио 24 | кантабиле 25 | крешендо 26 | крещендо 27 | ларгетто 28 | ларго 29 | легато 30 | ленто 31 | маэстозо 32 | модерато 33 | неглиже 34 | нетто 35 | пианиссимо 36 | пиано 37 | пиццикато 38 | пиччикато 39 | престиссимо 40 | престо 41 | ритенуто 42 | соло 43 | стаккато 44 | стретто 45 | субито 46 | сфорцандо 47 | сфорцато 48 | тремоландо 49 | факсимиле 50 | форте 51 | фортиссимо 52 | фуриозо 53 | -------------------------------------------------------------------------------- /add.php: -------------------------------------------------------------------------------- 1 | assign('check', addtext_check(POST('txt'), POST('book_id', 0))); 14 | $smarty->display('addtext_check.tpl'); 15 | break; 16 | default: 17 | check_permission(PERM_ADDER); 18 | $smarty->assign('txt', POST('txt', '')); 19 | $smarty->display('addtext.tpl'); 20 | } 21 | 22 | log_timing(); 23 | -------------------------------------------------------------------------------- /templates/books.tpl: 
-------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Тексты

5 |

Всего книг: {$books.num}{if $user_permission_adder}, 6 | добавить: 7 |

8 | {/if} 9 |
    10 | {foreach item=book from=$books.list} 11 |
  • {$book.title}
  • 12 | {/foreach} 13 |
14 | {/block} 15 | -------------------------------------------------------------------------------- /history.php: -------------------------------------------------------------------------------- 1 | setCaching(Smarty::CACHING_LIFETIME_SAVED); 12 | $smarty->setCacheLifetime(90); 13 | $cache_id = "$sent_id@$set_id@$skip@$maa@$user_id"; 14 | 15 | if (!is_cached('history.tpl', $cache_id)) { 16 | $smarty->assign('history', main_history($sent_id, $set_id, $skip, $maa, $user_id)); 17 | $smarty->assign('skip', $skip); 18 | $smarty->assign('maa', $maa); 19 | $smarty->assign('user_id', $user_id); 20 | } 21 | $smarty->display('history.tpl', $cache_id); 22 | log_timing(); 23 | ?> 24 | -------------------------------------------------------------------------------- /migrations/20160716091852_ne_annot_number_per_tagset.php: -------------------------------------------------------------------------------- 1 | table("ne_tagsets") 11 | ->addColumn('annots_per_text', 'integer', array( 12 | 'signed' => false, 13 | 'limit' => MysqlAdapter::INT_TINY, 14 | 'default' => 4 15 | )) 16 | ->addColumn('active_texts', 'integer', array( 17 | 'signed' => false, 18 | 'limit' => MysqlAdapter::INT_SMALL, 19 | 'default' => 10 20 | )) 21 | ->save(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /templates/qa/pool_tabs.tpl: -------------------------------------------------------------------------------- 1 | {foreach from=$pool.samples item=sample} 2 | {$sample.id} {$sample.token_id} {strip} 3 | {foreach $sample.context as $token_id => $word}{if $token_id == $sample.mainword}[[{$word}]]{else}{$word}{/if} {/foreach} 4 | {/strip} {strip} 5 | {foreach from=$sample.comments item=comment} 6 | {$comment.text|replace:"\n":'\n'} ({$comment.author}, {$comment.timestamp|date_format:"%d.%m.%Y, %H:%M"}); 7 | {/foreach} 8 | {/strip} {strip} 9 | {foreach from=$sample.instances item=instance}{if $instance.answer_num == $smarty.const.MA_ANSWER_OTHER}Other {elseif 
$instance.answer_num > 0}{$instance.answer_gram} {/if} 10 | {/foreach} 11 | {/strip}{if isset($smarty.get.mod_ans) && isset($sample.moder_answer_gram)} {$sample.moder_answer_gram}{else}{/if} 12 | 13 | {/foreach} 14 | -------------------------------------------------------------------------------- /templates/top100.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name='content'} 4 | 5 | {if $smarty.get.what == 'colloc'} 6 | 7 | {foreach $stats as $i=>$s} 8 | 9 | {/foreach} 10 | {else} 11 | 12 | {foreach $stats as $i=>$s} 13 | 14 | {/foreach} 15 | {/if} 16 |
# Абс. частота 1Абс. частота 2Совм. частотаКоэфф.
{$i+1}{$s.lterm|htmlspecialchars} {$s.rterm|htmlspecialchars}{$s.lfreq}{$s.rfreq}{$s.cfreq}{$s.coeff}
#ТокенАбс. частотаipm (частота на миллион)
{$i+1}{$s.token|htmlspecialchars}{$s.abs}{$s.ipm}
17 | {/block} 18 | -------------------------------------------------------------------------------- /generator_cp.php: -------------------------------------------------------------------------------- 1 | assign('status', $current['status']); 10 | $smarty->assign('since', $current['since']); 11 | $smarty->assign('tag', $current['tag']); 12 | $smarty->assign('next', $current['next']); 13 | 14 | switch ($action) { 15 | case 'toggle': 16 | $new = toggle_generator_status(); 17 | $smarty->assign('status', $new['status']); 18 | $smarty->assign('since', $new['since']); 19 | $smarty->assign('tag', $new['tag']); 20 | $smarty->assign('next', $new['next']); 21 | 22 | break; 23 | } 24 | 25 | $smarty->display('generator_cp.tpl'); 26 | log_timing(); 27 | ?> 28 | -------------------------------------------------------------------------------- /migrations/20160122154639_obj_property_multiple_values.php: -------------------------------------------------------------------------------- 1 | execute("alter table ne_object_prop_vals drop primary key"); 12 | $this->execute("alter table ne_object_prop_vals add column val_id int unsigned not null auto_increment first, add primary key (val_id)"); 13 | } 14 | 15 | /** 16 | * Migrate Down. 17 | */ 18 | public function down() { 19 | $this->table("ne_object_prop_vals")->removeColumn("val_id")->update(); 20 | $this->execute("alter table ne_object_prop_vals add primary key (object_id, prop_id)"); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /templates/qa/game_status.tpl: -------------------------------------------------------------------------------- 1 |
2 |
3 |
{if $user_rating.remaining_percent < 85}{$user_rating.current}{/if}
4 |
5 | 6 | 7 | 8 |
9 |
10 |
11 | -------------------------------------------------------------------------------- /templates/openid_license.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Согласие с лицензией

5 |

Вы входите на наш сайт в первый раз. Для того, чтобы продолжить работу, вы должны подтвердить свое согласие с лицензией.

6 |
7 | 8 | 9 |
10 | {/block} 11 | -------------------------------------------------------------------------------- /perl/lib/ISO/LMF/LexicalEntry.pm: -------------------------------------------------------------------------------- 1 | package OpenCorpora::ISO::LMF::LexicalEntry; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | our $VERSION = "0.01"; 8 | 9 | 10 | 11 | 12 | sub new { 13 | my($class, %args) = @_; 14 | 15 | my $self = bless({}, $class); 16 | 17 | # if (exists $args{handlers}) { 18 | # my $handlers = $args{handlers}; 19 | # $self->{handler_lemma} = exists $handlers->{lemma} ? $handlers->{lemma} : \&nop_function; 20 | # } 21 | 22 | $self->{lemma}->{text} = ""; 23 | $self->{lemma}->{gram} = {}; 24 | $self->{forms} = {}; 25 | 26 | return $self; 27 | } 28 | 29 | sub lemma_text { 30 | my $self = shift; 31 | if (@_) { 32 | my $text = shift; 33 | $self->{lemma}->{text} = $text; 34 | } 35 | 36 | return $self->{lemma}->{text}; 37 | } 38 | 39 | 1; 40 | -------------------------------------------------------------------------------- /manual.php: -------------------------------------------------------------------------------- 1 | assign('content', get_wiki_page("Инструкция для модераторов")); 11 | break; 12 | case 'newslist_announce': 13 | $smarty->assign('content', get_wiki_page("Newslist opencorpora-dev")); 14 | break; 15 | default: 16 | if ($pool_type) 17 | $smarty->assign('content', get_wiki_page(get_pool_manual_page($pool_type))); 18 | else 19 | $smarty->assign('content', get_wiki_page("Инструкция по интерфейсу для снятия омонимии")); 20 | } 21 | 22 | $smarty->display('static/doc/annotation.tpl'); 23 | log_timing(); 24 | ?> 25 | -------------------------------------------------------------------------------- /templates/addtext.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Добавляем текст

5 |
6 | {if isset($txt)} 7 | 8 | {else} 9 | 10 | {/if} 11 | {if isset($smarty.get.to)} 12 | 13 | {/if} 14 |

15 | 16 |
17 | {/block} 18 | -------------------------------------------------------------------------------- /assets/js/merge_fails.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | $(".approve-sample").change(function(event) { 3 | var $c = $(event.target).closest('td'); 4 | $.post("ajax/merge_fails.php", { 5 | act: "approve", 6 | value: $(this).is(":checked") ? 1 : 0, 7 | id: $(this).attr("data-id") 8 | }, 9 | function() { 10 | $c.addClass('bggreen'); 11 | }); 12 | }); 13 | 14 | $(".comment-cell").blur(function(event) { 15 | var $c = $(event.target).closest('td'); 16 | $.post("ajax/merge_fails.php", { 17 | act: "comment", 18 | id: $(this).attr("data-id"), 19 | text: $(this).text() 20 | }, 21 | function() { 22 | $c.addClass('bggreen'); 23 | }); 24 | }); 25 | }); 26 | -------------------------------------------------------------------------------- /scripts/aot_import/lists/adjs_forms_del.txt: -------------------------------------------------------------------------------- 1 | #Удалить из парадигмы кратких прилагательных следующие формы 2 | вынуждена 3 | вынуждено 4 | вынуждены 5 | изощрена 6 | изощрено 7 | изощрены 8 | искажена 9 | искажено 10 | искажены 11 | обижена 12 | обижено 13 | обижены 14 | обнажена 15 | обнажено 16 | обнажены 17 | обречена 18 | обречено 19 | обречены 20 | огорчена 21 | огорчено 22 | огорчены 23 | обособлена 24 | обособлено 25 | обособлены 26 | одарена 27 | одарено 28 | одарены 29 | предана 30 | предано 31 | преданы 32 | связана 33 | смущена 34 | убеждена 35 | уверена 36 | углублена 37 | удалена 38 | удручена 39 | утомлена 40 | связано 41 | смущено 42 | убеждено 43 | уверено 44 | углублено 45 | удалено 46 | удручено 47 | утомлено 48 | связаны 49 | смущены 50 | убеждены 51 | уверены 52 | углублены 53 | удалены 54 | удручены 55 | утомлены 56 | -------------------------------------------------------------------------------- /syntax.php: 
-------------------------------------------------------------------------------- 1 | assign('page', get_books_with_syntax()); 25 | $smarty->display('syntax/main.tpl'); 26 | } 27 | log_timing(); 28 | ?> 29 | -------------------------------------------------------------------------------- /export/annot/disamb_nonmod_tests/pool_158.tab: -------------------------------------------------------------------------------- 1 | 122834 332515 После подсчёта голосов 10 % избирательных участков , результаты выборов , в нижнюю палату парламента — Палату [[депутатов]] , выглядят так : NOUN & plur & gent NOUN & plur & gent NOUN & plur & accs 2 | 122769 124015 Также следует учесть , что в Австралии , с одной стороны , значительная часть аборигенов продолжала жить в таких же условиях , как жили их предки лет сто назад , с другой стороны , появилась небольшая городская прослойка [[аборигенов]] , проживавших в гетто и имевшие хорошее представление о том , что такое рок-музыка . NOUN & plur & gent NOUN & plur & gent NOUN & plur & gent 3 | 122850 473525 Рустэм [[Хамитов]] опроверг появившуюся в СМИ информацию о необычно высокой пенсии Муртазы Рахимова . 
Хамитов - ед.ч., именительный (quorax, 01.09.2012, 12:13); Other Other Other 4 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/lemmatizer/Makefile: -------------------------------------------------------------------------------- 1 | CC=g++ 2 | LD=g++ 3 | INC_PATH=../include/ 4 | LIB_PATH=../lib/ 5 | CFLAGS=-c -O3 -Wall -std=c++0x -I$(INC_PATH) `pkg-config --cflags glibmm-2.4` 6 | LDFLAGS=`pkg-config --libs glibmm-2.4` 7 | 8 | all: lemmatizer 9 | 10 | lemmatizer: main.o corpora_io.o dict.o 11 | $(LD) -O3 main.o corpora_io.o dict.o $(LDFLAGS) -o lemmatizer 12 | 13 | main.o: main.cpp $(INC_PATH)tag.h $(INC_PATH)token.h $(INC_PATH)utils.h $(INC_PATH)sentence.h $(INC_PATH)corpora_io.h $(INC_PATH)dict.h 14 | $(CC) $(CFLAGS) main.cpp 15 | 16 | corpora_io.o: $(LIB_PATH)corpora_io.cpp $(INC_PATH)corpora_io.h $(INC_PATH)sentence.h $(INC_PATH)token.h $(INC_PATH)tag.h $(INC_PATH)utils.h 17 | $(CC) $(CFLAGS) $(LIB_PATH)corpora_io.cpp 18 | 19 | dict.o: $(LIB_PATH)dict.cpp $(INC_PATH)dict.h 20 | $(CC) $(CFLAGS) $(LIB_PATH)dict.cpp 21 | -------------------------------------------------------------------------------- /templates/footer.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | 13 | -------------------------------------------------------------------------------- /export/pools.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use utf8; 5 | use DBI; 6 | use Config::INI::Reader; 7 | 8 | $ARGV[0] or die "Usage: $0 "; 9 | #reading config 10 | my $conf = Config::INI::Reader->read_file($ARGV[0]); 11 | $conf = $conf->{mysql}; 12 | 13 | #main 14 | my $dbh = DBI->connect('DBI:mysql:'.$conf->{'dbname'}.':'.$conf->{'host'}, $conf->{'user'}, $conf->{'passwd'}); 15 | if (!$dbh) { 16 | die $DBI::errstr; 17 | } 18 | 19 | my $scan = $dbh->prepare(" 20 | SELECT pool_id, status, grammemes 
21 | FROM morph_annot_pools p 22 | LEFT JOIN morph_annot_pool_types t 23 | ON (p.pool_type = t.type_id) 24 | ORDER BY pool_id 25 | "); 26 | $scan->execute(); 27 | while (my $r = $scan->fetchrow_hashref()) { 28 | printf "%d\t%s\t%d\n", 29 | $r->{'pool_id'}, $r->{'grammemes'}, $r->{'status'}; 30 | } 31 | -------------------------------------------------------------------------------- /anaphora/export_pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append('/corpus/python') 5 | from Annotation import AnnotationEditor 6 | 7 | CONFIG_PATH = "/corpus/config.ini" 8 | 9 | def do_export(dbh): 10 | dbh.execute(""" 11 | SELECT token_id, group_id, book_id 12 | FROM anaphora 13 | LEFT JOIN tokens ON (anaphora.token_id = tokens.tf_id) 14 | JOIN sentences USING (sent_id) 15 | JOIN paragraphs USING (par_id) 16 | ORDER BY book_id, group_id, token_id 17 | """) 18 | 19 | for row in dbh.fetchall(): 20 | print("{2}\t{0}\t{1}".format(row['token_id'], row['group_id'], row['book_id'])) 21 | 22 | def main(): 23 | editor = AnnotationEditor(CONFIG_PATH) 24 | do_export(editor.db_cursor) 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /migrations/20200625120954_long_good_sentences.php: -------------------------------------------------------------------------------- 1 | table('good_sentences'); 11 | $gs->changeColumn('num_words', 'integer', ['signed' => false, 'limit' => MysqlAdapter::INT_SMALL]); 12 | $gs->changeColumn('num_homonymous', 'integer', ['signed' => false, 'limit' => MysqlAdapter::INT_SMALL]); 13 | } 14 | 15 | public function down() 16 | { 17 | $gs = $this->table('good_sentences'); 18 | $gs->changeColumn('num_words', 'integer', ['signed' => false, 'limit' => MysqlAdapter::INT_TINY]); 19 | $gs->changeColumn('num_homonymous', 'integer', ['signed' => false, 'limit' => 
MysqlAdapter::INT_TINY]); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /migrations/20151028150709_move_object_type_to2_level.php: -------------------------------------------------------------------------------- 1 | table("ne_objects") 12 | ->removeColumn("object_type_id") 13 | ->update(); 14 | 15 | $this->table("ne_mentions") 16 | ->addColumn('object_type_id', 'integer') 17 | ->update(); 18 | } 19 | 20 | /** 21 | * Migrate Down. 22 | */ 23 | public function down() { 24 | $this->table("ne_mentions") 25 | ->removeColumn("object_type_id") 26 | ->update(); 27 | 28 | $this->table("ne_objects") 29 | ->addColumn('object_type_id', 'integer') 30 | ->update(); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/include/dict.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include "tag.h" 9 | 10 | #ifndef __DICT_H 11 | #define __DICT_H 12 | 13 | class Dict { 14 | 15 | std::set unknown; 16 | std::vector> v_interp; 17 | std::tr1::unordered_map d; 18 | // std::tr1::unordered_map > d; 19 | // std::vector< std::set > vmi; 20 | // std::map< std::set, size_t > mmi; 21 | 22 | public: 23 | Dict() { 24 | MorphInterp mi(0, "UNKN"); 25 | unknown.insert(mi); 26 | } 27 | 28 | void load(const std::string &fn); 29 | 30 | const std::set& lookup(const Glib::ustring &str) const; 31 | // bool lookup(const Glib::ustring &str, ...) 
# -*- coding: utf-8 -*-
"""Rebuild the tag_errors rows of error type 5.

Error type 5 marks a book that is the parent of other books and also owns
paragraphs directly. All existing type-5 rows are deleted and one row is
inserted per offending book.

Usage: par_validator.py <config.ini>
The INI file must have a [mysql] section with host, dbname, user and passwd.
"""
import sys
import ConfigParser, MySQLdb

config = ConfigParser.ConfigParser()
config.read(sys.argv[1])

hostname = config.get('mysql', 'host')
dbname = config.get('mysql', 'dbname')
username = config.get('mysql', 'user')
password = config.get('mysql', 'passwd')

db = MySQLdb.connect(hostname, username, password, dbname, use_unicode=True)
try:
    cursor = db.cursor()
    cursor.execute('SET NAMES utf8')
    cursor.execute("""DELETE FROM tag_errors WHERE error_type=5""")
    cursor.execute("""SELECT book_id FROM books WHERE (book_id in (SELECT distinct parent_id FROM books)) and (book_id IN (SELECT distinct book_id FROM paragraphs))""")

    # One (book_id, '', 5) row per offending book. The values are bound by
    # the driver instead of being formatted into the SQL text.
    rows = [(record[0], '', 5) for record in cursor.fetchall()]
    if rows:
        cursor.executemany("""INSERT INTO tag_errors VALUES (%s, %s, %s)""", rows)
    db.commit()
finally:
    # The original wrote `db.close` without parentheses, which never closed
    # the connection.
    db.close()
| touch $RO_FLAG 7 | newpath=$ROOT_PATH/files/export/dict/dict.opcorpora 8 | $ROOT_PATH/export/dict/export_dict.pl $1 <$ROOT_PATH/config.ini >$newpath.xml 9 | rm $RO_FLAG 10 | if [ `ls -l $newpath.xml | awk '{print $5}'` -gt 100 ]; then 11 | bzip2 -cq9 $newpath.xml >$newpath.xml.bz2.new 12 | mv $newpath.xml.bz2.new $newpath.xml.bz2 13 | zip -jq9 $newpath.xml.zip.new $newpath.xml 14 | mv $newpath.xml.zip.new $newpath.xml.zip 15 | rm $newpath.xml 16 | 17 | touch $RO_FLAG 18 | $ROOT_PATH/export/dict/export_dict.pl $1 -p <$ROOT_PATH/config.ini >$newpath.txt 19 | rm $RO_FLAG 20 | bzip2 -cq9 $newpath.txt >$newpath.txt.bz2.new 21 | mv $newpath.txt.bz2.new $newpath.txt.bz2 22 | zip -jq9 $newpath.txt.zip.new $newpath.txt 23 | mv $newpath.txt.zip.new $newpath.txt.zip 24 | rm $newpath.txt 25 | fi 26 | -------------------------------------------------------------------------------- /templates/ner/_partials/objects-modal.tpl: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /templates/tag_stats.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Статистика

5 | 11 | 16 | 17 | {foreach from=$stats item=group key=gname} 18 | 19 | {foreach from=$group item=elem} 20 | 21 | {/foreach} 22 | {/foreach} 23 |
{$gname}текстовслов
{$elem.value|htmlspecialchars}{$elem.texts}{$elem.words}
24 | {/block} 25 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "require": { 3 | "robmorgan/phinx": ">=0.4.3", 4 | "symfony/class-loader": "2.6.0", 5 | "symfony/config": "2.6.0", 6 | "symfony/console": "2.6.0", 7 | "symfony/yaml": "2.6.0", 8 | "swiftmailer/swiftmailer": "5.4.12", 9 | "robloach/component-installer": "@stable", 10 | "monolog/monolog": "@stable", 11 | 12 | "components/jquery": "1.10", 13 | "opencorpora/bootstrap": "dev-master", 14 | "mouse0270/bootstrap-notify": "3.1.3", 15 | "smarty/smarty": "4.3.1" 16 | 17 | }, 18 | 19 | "repositories": [ 20 | { 21 | "type": "vcs", 22 | "url": "https://github.com/opencorpora/bootstrap" 23 | }, 24 | { 25 | "type": "vcs", 26 | "url": "https://github.com/mouse0270/bootstrap-notify" 27 | } 28 | ], 29 | 30 | "config": { 31 | "component-dir": "assets/vendor" 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /scripts/check_dog_achievement.php: -------------------------------------------------------------------------------- 1 | emit(EventTypes::MONTH_PASSED); 21 | } 22 | print "Achievements pinged, finishing\n"; 23 | } -------------------------------------------------------------------------------- /ajax/anaphora.php: -------------------------------------------------------------------------------- 1 | Ошибки в тегах 5 |

Список обновляется раз в час.

6 | 7 | {foreach item=err from=$errata} 8 | 9 | 10 | 11 | {strip} 12 | 20 | {/strip} 21 | 22 | {foreachelse} 23 | 24 | {/foreach} 25 |
{$err.book_id}{$err.tag_name|htmlspecialchars|truncate:100|default:" "}{if $err.error_type == 1}Ошибка в годе 13 | {elseif $err.error_type == 2}Ошибка в дате 14 | {elseif $err.error_type == 3}Не хватает тега "Автор:" 15 | {elseif $err.error_type == 4}Ссылка на википроект без версии 16 | {elseif $err.error_type == 5}Приписаны дочерние тексты и параграфы 17 | {elseif $err.error_type == 6}Не хватает URL 18 | {else}Неизвестная ошибка{/if} 19 |
Список пуст.
26 | {/block} 27 | -------------------------------------------------------------------------------- /scripts/mwords/search.php: -------------------------------------------------------------------------------- 1 | 1) { 8 | if(in_array($argv[1], array('--help', '-help', '-h', '-?'))) 9 | die("Скрипт для поиска кандидатов в мультитокены. Поиск шаблонов из rules.txt \nЗапуск без аргументов или с единственным аргументом - ограничением строк(и) в rules.txt в формате N или N-M.\nСтроки нумеруются с 1.\n"); 10 | else { 11 | $str = $argv[1]; 12 | $borders = explode("-", $str); 13 | if (sizeof($borders) > 1) 14 | $limit = range((int)$borders[0]-1, (int)$borders[1]-1); 15 | else 16 | $limit = array((int)$str-1); 17 | } 18 | } 19 | 20 | set_include_path(get_include_path().PATH_SEPARATOR.'/corpus'); 21 | require_once('lib/header_ajax.php'); 22 | require_once('lib/lib_multiwords.php'); 23 | 24 | $searcher = new MultiWordFinder(getcwd() . "/rules.txt", $limit); 25 | $searcher->find(); 26 | -------------------------------------------------------------------------------- /migrations/20150720124123_last_dict_revision.php: -------------------------------------------------------------------------------- 1 | table("dict_revisions"); 10 | $rev->addColumn('is_last', 'boolean', array('signed' => false)) 11 | ->addIndex(array('is_last')) 12 | ->save(); 13 | 14 | $tmp = $this->table("tmp_dict_rev", array('id' => false)); 15 | $tmp->addColumn('rev_id', 'integer')->save(); 16 | 17 | $this->execute("INSERT INTO tmp_dict_rev (SELECT MAX(rev_id) FROM dict_revisions GROUP BY lemma_id)"); 18 | $this->execute("UPDATE dict_revisions LEFT JOIN tmp_dict_rev USING (rev_id) SET is_last = 1 WHERE tmp_dict_rev.rev_id IS NOT NULL"); 19 | 20 | $this->dropTable("tmp_dict_rev"); 21 | } 22 | 23 | public function down() 24 | { 25 | $this->table("dict_revisions")->removeColumn('is_last')->update(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /ner.php: 
-------------------------------------------------------------------------------- 1 | assign('active_page', 'tasks'); 9 | 10 | check_logged(); 11 | 12 | $action = GET('act', ''); 13 | $tagset_id = get_current_tagset(); 14 | 15 | switch ($action) { 16 | 17 | case 'manual': 18 | $smarty->assign('content', get_wiki_page("nermanual/" . (int)GET('id'))); 19 | $smarty->display('static/doc/annotation.tpl'); 20 | break; 21 | 22 | default: 23 | $is_ner_mod = user_has_permission(PERM_NE_MODER); 24 | 25 | $smarty->assign('possible_guidelines', get_ne_guidelines()); 26 | $smarty->assign('is_ner_mod', $is_ner_mod); 27 | $smarty->assign('current_guideline', $tagset_id); 28 | $smarty->assign('page', get_books_with_NE($tagset_id, !$is_ner_mod)); 29 | $smarty->display(($is_ner_mod ? 'ner/main-moderator.tpl' : 'ner/main.tpl')); 30 | } 31 | log_timing(); 32 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/train/aux.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "tag.h" 6 | #include "sentence.h" 7 | 8 | //std::string toString(const std::map &m); 9 | std::string toString(const std::map &m); 10 | 11 | std::string PrintRules(const std::list& lr); 12 | std::string PrintSC(const SentenceCollection &sc); 13 | 14 | template 15 | struct less_by_second { 16 | std::map& rmap; 17 | less_by_second(std::map& _rmap) : rmap(_rmap) { } 18 | 19 | bool operator()(const T& a, const T& b) const { 20 | return rmap[a] > rmap[b]; 21 | } 22 | }; 23 | 24 | //class TagStat; 25 | /* 26 | template 27 | struct less_by_from_freq { 28 | std::map& rmap; 29 | less_by_from_freq(std::map& _rmap) : rmap(_rmap) { } 30 | 31 | bool operator()(const T& a, const T& b) const { 32 | return rmap[a.from].freq > rmap[b.from].freq; 33 | } 34 | };*/ 35 | -------------------------------------------------------------------------------- 
def read_leading_ints(path):
    """Return the first whitespace-separated field of every line of *path* as an int.

    Trailing whitespace and newlines at the end of the file are ignored. A
    blank line in the middle of the file raises IndexError, as in the
    original inline code.
    """
    with open(path, 'r') as handle:
        return [int(line.lstrip().split()[0])
                for line in handle.read().rstrip().split('\n')]


if __name__ == '__main__':
    # argv[1] is the directory of token-count files and argv[2] the directory
    # of rule-count files. The original listed these directories but then
    # opened 'tokens\<name>' and 'rules\<name>', which only worked on Windows
    # with those exact directory names in the working directory.
    tokens_dir, rules_dir = sys.argv[1], sys.argv[2]

    # os.listdir() returns names in arbitrary order. Sort both listings so the
    # i-th tokens file is paired with the i-th rules file deterministically.
    token_files = sorted(os.listdir(tokens_dir))
    rule_files = sorted(os.listdir(rules_dir))

    token_counts = []
    rule_counts = []
    for tokens_name, rules_name in zip(token_files, rule_files):
        token_counts += read_leading_ints(os.path.join(tokens_dir, tokens_name))
        rule_counts += read_leading_ints(os.path.join(rules_dir, rules_name))
    tokens_rules(token_counts, rule_counts)
-------------------------------------------------------------------------------- 1 | table("ne_object_props", array("id" => "prop_id", "engine" => "InnoDB")); 12 | $props 13 | ->addColumn("prop_key", "string", array("limit" => 100)) 14 | ->save(); 15 | $vals = $this->table("ne_object_prop_vals", array("id" => false, "primary_key" => array("object_id", "prop_id"), "engine" => "InnoDB")); 16 | $vals 17 | ->addColumn("object_id", "integer") 18 | ->addColumn("prop_id", "integer") 19 | ->addColumn("prop_val", "string") 20 | ->save(); 21 | } 22 | 23 | /** 24 | * Migrate Down. 25 | */ 26 | public function down() { 27 | $this->dropTable("ne_object_props"); 28 | $this->dropTable("ne_object_prop_vals"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /migrations/20140923233544_add_permission_ne_moderator.php: -------------------------------------------------------------------------------- 1 | execute(" 27 | ALTER TABLE `user_permissions` 28 | ADD COLUMN `perm_check_ne` tinyint(3) unsigned not null;"); 29 | } 30 | 31 | /** 32 | * Migrate Down. 33 | */ 34 | public function down() 35 | { 36 | $this->table('user_permissions')->removeColumn('perm_check_ne'); 37 | } 38 | } -------------------------------------------------------------------------------- /migrations/20151105153227_many_entities2_many_mentions.php: -------------------------------------------------------------------------------- 1 | table("ne_entities") 12 | ->removeColumn("mention_id") 13 | ->update(); 14 | $this->table("ne_entities_mentions", array("id" => false, "primary_key" => array("entity_id", "mention_id"), "engine" => "InnoDB")) 15 | ->addColumn("entity_id", "integer") 16 | ->addColumn("mention_id", "integer") 17 | ->addIndex("entity_id") 18 | ->addIndex("mention_id") 19 | ->save(); 20 | } 21 | 22 | /** 23 | * Migrate Down. 
24 | */ 25 | public function down() { 26 | $this->table('ne_entities') 27 | ->addColumn('mention_id', 'integer', array('default' => 0)) 28 | ->update(); 29 | $this->dropTable("ne_entities_mentions"); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /lib/lib_mail.php: -------------------------------------------------------------------------------- 1 | setSubject($header) 10 | // Set the From address with an associative array 11 | ->setFrom(array('robot@opencorpora.org' => 'OpenCorpora')) 12 | // Set the To addresses with an associative array 13 | ->setTo($to) 14 | // Give it a body 15 | ->setBody($body); 16 | // Create the Transport 17 | global $config; 18 | $transport = Swift_SmtpTransport::newInstance($config['mail']['host'], $config['mail']['port'], $config['mail']['encrypt']) 19 | ->setUsername($config['mail']['user']) 20 | ->setPassword($config['mail']['password']); 21 | // Create the Mailer using your created Transport 22 | $mailer = Swift_Mailer::newInstance($transport); 23 | return $mailer->send($message); 24 | } 25 | -------------------------------------------------------------------------------- /migrations/20171203130148_drop_sentence_check.php: -------------------------------------------------------------------------------- 1 | dropTable("sentence_check"); 15 | } 16 | 17 | /** 18 | * Migrate Down. 
19 | */ 20 | public function down() 21 | { 22 | $tbl = $this->table("sentence_check", array("id" => false, "engine" => "InnoDB")); 23 | $tbl->addColumn('sent_id', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_MEDIUM)) 24 | ->addColumn('user_id', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_SMALL)) 25 | ->addColumn('status', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 26 | ->addColumn('timestamp', 'integer', array('signed' => false)) 27 | ->addIndex('sent_id') 28 | ->addIndex('user_id') 29 | ->save(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /migrations/20140830004643_named_entities_event_log.php: -------------------------------------------------------------------------------- 1 | table('ne_event_log', array('id' => 'event_id', 'engine' => 'InnoDB')); 26 | $users->addColumn('user_id', 'integer') 27 | ->addColumn('message', 'text') 28 | ->addColumn('created', 'timestamp', array('default' => 'CURRENT_TIMESTAMP')) 29 | ->save(); 30 | } 31 | 32 | /** 33 | * Migrate Down. 34 | */ 35 | public function down() 36 | { 37 | $this->dropTable('ne_event_log'); 38 | } 39 | } -------------------------------------------------------------------------------- /scripts/backup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | RO_FLAG=/corpus/readonly.tmp 3 | 4 | touch $RO_FLAG 5 | TEMP_DUMP=/backup/temp.sql 6 | if [ ! 
-d /backup/`date +%Y%m` ]; then 7 | mkdir /backup/`date +%Y%m` 8 | fi 9 | NOW=`date +%Y%m%d_%H%M` 10 | mysqldump \ 11 | --host ocdb \ 12 | --ignore-table=opcorpora.dict_errata \ 13 | --ignore-table=opcorpora.form2lemma \ 14 | --ignore-table=opcorpora.form2tf \ 15 | --ignore-table=opcorpora.tokenizer_strange \ 16 | opcorpora > $TEMP_DUMP 17 | mysqldump \ 18 | --host ocdb \ 19 | --no-data \ 20 | opcorpora \ 21 | dict_errata \ 22 | form2lemma \ 23 | form2tf \ 24 | tokenizer_strange \ 25 | >> $TEMP_DUMP 26 | rm $RO_FLAG 27 | nice xz -cze8 $TEMP_DUMP >/backup/`date +%Y%m`/oc$NOW.sql.xz 28 | rm $TEMP_DUMP 29 | mysqldump \ 30 | --host ocdb \ 31 | wikidb | xz -ze8 > /backup/`date +%Y%m`/wiki$NOW.sql.xz 32 | 33 | # backup to Yandex.Disk 34 | curl -s --user `cat /corpus/yadisk-auth` -T /backup/`date +%Y%m`/oc$NOW.sql.xz https://webdav.yandex.ru/opencorpora/backup/ || echo "Failed to upload backup to YaDisk" 35 | -------------------------------------------------------------------------------- /lib/header_ajax.php: -------------------------------------------------------------------------------- 1 | setAttribute(PDO::ATTR_EMULATE_PREPARES, false); 21 | $pdo_db->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_SILENT); 22 | $pdo_db->query("SET NAMES utf8"); 23 | $pdo_db->query("SET session sql_mode = ''"); 24 | 25 | mb_internal_encoding('UTF-8'); 26 | mb_regex_encoding('UTF-8'); 27 | 28 | $result = array('error' => 0); // this will end up being json-encoded and returned 29 | ?> 30 | -------------------------------------------------------------------------------- /ajax/dict_reload.php: -------------------------------------------------------------------------------- 1 | "; 13 | foreach ($pset->parses as $parse) { 14 | $result['xml'] .= ''; 15 | foreach($parse->gramlist as $gram) { 16 | if (OPTION(OPT_GRAMNAMES) == 1) { 17 | $result['xml'] .= ''; 18 | } else { 19 | $result['xml'] .= ''; 20 | } 21 | } 22 | $result['xml'] .= ''; 23 | } 24 | $result['xml'] .= ''; 25 | } 26 | catch 
package Lingua::AOT::MorphDict::Paradigm;

use strict;
use warnings;
use utf8;

use Lingua::AOT::MorphDict::FormSpec;

our $VERSION = "0.01";


# new($class, $line, $optRemoveAccentDoublicates)
#
# Parse one paradigm line of an .mrd dictionary into a Paradigm object.
#
#   $line  - the paradigm text. It must not contain any non-empty text after
#            a '#' character, otherwise the constructor dies.
#   $optRemoveAccentDoublicates
#          - when non-zero, a form specification whose full text has already
#            been seen in this line is skipped. When 0 or omitted, every
#            match is kept.
#
# Every "%...*...[*...]" item found in the line is turned into a
# Lingua::AOT::MorphDict::FormSpec built from its three captured parts.
# Dies if the line yields no form at all.
sub new {
    my ($class, $line, $optRemoveAccentDoublicates) = @_;
    my $self = {};

    # An omitted flag behaves like 0, as it always did, but no longer
    # triggers an "uninitialized value" warning in the comparison below.
    $optRemoveAccentDoublicates = 0 unless defined $optRemoveAccentDoublicates;

    my ($paradigm_text, $other) = split(/\#/, $line);
    if (defined($other) && length($other) > 0) {
        # Report the offending line. The original interpolated $_, which is
        # unrelated to the argument being parsed.
        die "mrd parsing error: $line";
    }
    # split() returns nothing for an empty line. Keep the text defined so the
    # failure below is reported cleanly.
    $paradigm_text = '' unless defined $paradigm_text;

    my %h_known_fs;
    while ($paradigm_text =~ /%(([\-А-ЯЁ]*)\*([А-ЯЁа-яё]+)(\*([А-ЯЁ]*))?)/g) {
        # Copy the captures right away so later code cannot disturb them.
        # $last_part is undef when the optional "*..." tail is absent.
        my ($spec_text, $first_part, $second_part, $last_part) = ($1, $2, $3, $5);

        if (!exists($h_known_fs{$spec_text}) || 0 == $optRemoveAccentDoublicates) {
            push @{$self->{forms}},
                Lingua::AOT::MorphDict::FormSpec->new($first_part, $second_part, $last_part);
            $h_known_fs{$spec_text} = 1;
        }
    }

    if (!defined($self->{forms})) {
        die "can't parse paradigm \"$paradigm_text\"\n";
    }

    bless($self, $class);

    return $self;
}

# FormSpecs() - return the array reference holding this paradigm's FormSpec
# objects, in the order they appeared in the source line.
sub FormSpecs {
    my $self = shift;
    return $self->{forms};
}

# GetLastFormNo() - return the index of the last form (number of forms - 1).
sub GetLastFormNo {
    my $self = shift;
    return $#{$self->{forms}};
}

1;
#!/usr/bin/env bash
#
# Build the public development dump of the corpus database:
#   1. run copy_nulled_tables.sql to refresh the sanitized copies of the
#      user tables (`*_for_selective_backup`);
#   2. dump the database without the real user tables and without the data
#      of the large derived tables (their structure is dumped separately);
#   3. rename the sanitized copies back to `users` / `user_tokens` inside
#      the dump, gzip it and move it into place atomically.
TMPFILE=/tmp/devbackup.sql
DESTFILE=/corpus/files/export/database/database-dev.sql
DBHOST=`cat /corpus/config.ini | grep -A4 '\[mysql\]' | grep host | cut -d'=' -f2 | sed 's/ //g'`
DBNAME=`cat /corpus/config.ini | grep -A4 '\[mysql\]' | grep dbname | cut -d'=' -f2 | sed 's/ //g'`

# Never leave the intermediate SQL file behind, whichever way the script exits.
trap 'rm -f "$TMPFILE"' EXIT

mysql --host "$DBHOST" "$DBNAME" < "$(dirname "$0")/copy_nulled_tables.sql" || exit 1

# --ignore-table needs the schema-qualified name. It must use the database
# that is actually being dumped: with a hard-coded schema name the real
# `users` and `user_tokens` tables would be exported whenever the database
# is not called "opcorpora".
mysqldump --host "$DBHOST" \
    --ignore-table="$DBNAME.users" \
    --ignore-table="$DBNAME.user_tokens" \
    --ignore-table="$DBNAME.dict_errata" \
    --ignore-table="$DBNAME.form2lemma" \
    --ignore-table="$DBNAME.form2tf" \
    --ignore-table="$DBNAME.tokenizer_strange" \
    "$DBNAME" > "$TMPFILE" || exit 1

# Structure only for the tables whose data was skipped above.
mysqldump --host "$DBHOST" \
    --no-data "$DBNAME" \
    dict_errata \
    form2lemma \
    form2tf \
    tokenizer_strange \
    >> "$TMPFILE" || exit 1

# Give the sanitized copies the names of the tables they stand in for.
sed -i 's/`users_for_selective_backup`/`users`/g' "$TMPFILE" || exit 1
sed -i 's/`user_tokens_for_selective_backup`/`user_tokens`/g' "$TMPFILE" || exit 1

# Compress next to the destination first so readers never see a partial file.
gzip -c "$TMPFILE" >"$DESTFILE.tmp.gz" && mv "$DESTFILE.tmp.gz" "$DESTFILE.gz" || exit 1
SET NAMES utf8;

-- table `users` --> `users_for_selective_backup`
-- Copy of `users` with the password and e-mail columns blanked.
-- IF EXISTS keeps the script re-runnable when the helper table is missing
-- (for example after a run that was interrupted between DROP and CREATE).

DROP TABLE IF EXISTS `users_for_selective_backup`;
CREATE TABLE `users_for_selective_backup` LIKE `users`;

INSERT INTO `users_for_selective_backup` (
    `user_id`,
    `user_name`,
    `user_passwd`,
    `user_email`,
    `user_reg`,
    `user_shown_name`,
    `user_team`,
    `user_level`,
    `user_shown_level`,
    `user_rating10`
) SELECT
    `user_id`,
    `user_name`,
    '' AS `user_passwd`,
    '' AS `user_email`,
    `user_reg`,
    `user_shown_name`,
    `user_team`,
    `user_level`,
    `user_shown_level`,
    `user_rating10`
FROM `users`
WHERE 1 = 1;

-- table `user_tokens` --> `user_tokens_for_selective_backup`
-- Copy of `user_tokens` with every token value replaced by 0.

DROP TABLE IF EXISTS `user_tokens_for_selective_backup`;
CREATE TABLE `user_tokens_for_selective_backup` LIKE `user_tokens`;

INSERT INTO `user_tokens_for_selective_backup` (
    `user_id`,
    `token`,
    `timestamp`
) SELECT
    `user_id`,
    0 AS `token`,
    `timestamp`
FROM `user_tokens`
WHERE 1 = 1;
#reading config 14 | my $conf = Config::INI::Reader->read_file($ARGV[0]); 15 | $conf = $conf->{mysql}; 16 | 17 | my $dbh = DBI->connect('DBI:mysql:'.$conf->{'dbname'}.':'.$conf->{'host'}, $conf->{'user'}, $conf->{'passwd'}) or die $DBI::errstr; 18 | $dbh->do("SET NAMES utf8"); 19 | my $sent = $dbh->prepare("SELECT `sent_id`, `source` FROM sentences"); 20 | my $str_drop = $dbh->prepare("TRUNCATE TABLE sentences_strange"); 21 | my $str_ins = $dbh->prepare("INSERT INTO sentences_strange VALUES(?)"); 22 | my $str; 23 | $str_drop->execute(); 24 | $sent->execute(); 25 | 26 | while (my $ref = $sent->fetchrow_hashref()) { 27 | $str = decode('utf8', $ref->{'source'}); 28 | if ($str =~ /\s([^А-ЯЁA-Z0-9\s]+)[\.\!\?]\s+[А-ЯЁA-Z]/) { 29 | next if exists $exc{$1}; 30 | $str_ins->execute($ref->{'sent_id'}); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /migrations/20151107191437_color_for_types.php: -------------------------------------------------------------------------------- 1 | table("ne_tags") 26 | ->addColumn("color_number", "integer", array("default" => 1)) 27 | ->update(); 28 | $this->table("ne_object_types") 29 | ->addColumn("color_number", "integer", array("default" => 1)) 30 | ->update(); 31 | } 32 | 33 | /** 34 | * Migrate Down. 35 | */ 36 | public function down() 37 | { 38 | $this->table("ne_tags") 39 | ->removeColumn("color_number") 40 | ->update(); 41 | $this->table("ne_object_types") 42 | ->removeColumn("color_number") 43 | ->update(); 44 | } 45 | } -------------------------------------------------------------------------------- /anaphora/NE_extract/maindic.gzt: -------------------------------------------------------------------------------- 1 | encoding "utf8"; // указываем кодировку, в которой написан этот файл 2 | 3 | import "base.proto"; // подключаем описания protobuf-типов (TAuxDicArticle и прочих) 4 | import "articles_base.proto"; // Файлы base.proto и articles_base.proto встроены в компилятор. 
5 | // Их необходимо включать в начало любого gzt-словаря. 6 | 7 | import "facttypes.proto"; 8 | import "kwtypes.proto"; 9 | 10 | TAuxDicArticle "именные_группы" 11 | { 12 | key = { "tomita:ne_base.cxx" type=CUSTOM } 13 | } 14 | 15 | //fio "_фио" 16 | //{ 17 | // key = {"alg:fio" type=CUSTOM} 18 | //} 19 | 20 | complex_prep "сложный_предлог" 21 | { 22 | key = { "complex_prep.txt" type=FILE }; 23 | } 24 | 25 | complex_adv "наречное_выражение" 26 | { 27 | key = { "complex_adv.txt" type=FILE }; 28 | } 29 | 30 | complex_conj "сложный_союз" 31 | { 32 | key = { "complex_conj.txt" type=FILE }; 33 | } 34 | 35 | introduct "вводный_оборот" 36 | { 37 | key = { "vvodny_oborot.txt" type=FILE }; 38 | } 39 | 40 | bad_noun "плохие_сущ" 41 | { 42 | key = { "bad_noun.txt" type=FILE }; 43 | } 44 | pronoun "анаф_мест" 45 | { 46 | key = { "pronouns.txt" type=FILE }; 47 | } 48 | -------------------------------------------------------------------------------- /migrations/20190416073732_add_user_generated_dict_revisions.php: -------------------------------------------------------------------------------- 1 | table('dict_revisions_ugc', array('id' => 'rev_id' , 'engine' => 'InnoDB')); 11 | $revs_ugc->addColumn('user_id', 'integer', array('signed' => false)) 12 | ->addColumn('created_ts', 'timestamp', array('default' => 'CURRENT_TIMESTAMP')) 13 | ->addColumn('lemma_id', 'integer', array('signed' => false)) 14 | ->addColumn('rev_text', 'text') 15 | ->addColumn('comment', 'text') 16 | ->addColumn('status', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY, 'default' => 0)) 17 | ->addColumn('moder_id', 'integer', array('signed' => false, 'default' => 0)) 18 | ->addIndex(['user_id']) 19 | ->create(); 20 | 21 | $revs_main = $this->table('dict_revisions'); 22 | $revs_main->addColumn('ugc_rev_id', 'integer', array('signed' => false, 'default' => 0)) 23 | ->update(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- 
/ajax/annot.php: -------------------------------------------------------------------------------- 1 | emit(EventTypes::TASK_DONE); 30 | } 31 | } 32 | catch (Exception $e) { 33 | $result['status'] = $e->getMessage(); 34 | $result['error'] = 1; 35 | } 36 | log_timing(true); 37 | die(json_encode($result)); 38 | -------------------------------------------------------------------------------- /migrations/20150420150929_new_achievements_table.php: -------------------------------------------------------------------------------- 1 | table('user_achievements', array('id' => 'achievement_id', 'engine' => 'InnoDB')); 12 | 13 | $users 14 | ->addColumn('user_id', 'integer') 15 | ->addColumn('achievement_type', 'text') 16 | ->addColumn('level', 'integer', array('default' => 0)) 17 | ->addColumn('progress', 'integer', array('default' => 0)) 18 | ->addColumn('seen', 'boolean', array('default' => TRUE)) 19 | ->addColumn('updated', 'timestamp', array('default' => 'CURRENT_TIMESTAMP', 20 | 'update' => 'CURRENT_TIMESTAMP')) 21 | ->addIndex('user_id') 22 | ->save(); 23 | 24 | $this->execute("ALTER TABLE `user_achievements` 25 | ADD UNIQUE `user_id_achievement_type` 26 | (`user_id`, `achievement_type`(128));"); 27 | } 28 | 29 | /** 30 | * Migrate Down. 
package Lingua::AOT::MorphDict::Gramtab;

# Reader for an AOT "gramtab" file: maps two-letter ancodes to a
# comma-separated string made of the part of speech and the grammemes.

use strict;
use warnings;
use utf8;
use Encode;

our $VERSION = "0.01";


# new($class, $fn)
#   Creates the object and immediately loads the gramtab file $fn.
#   Dies if the file cannot be opened.
sub new {
    my ($class, $fn) = @_;
    my $self = {};
    bless($self, $class);

    $self->load_gramtab($fn);

    return $self;
}

# load_gramtab($self, $fn)
#   Reads $fn (decoded as windows-1251) line by line and fills
#   $self->{ancodes}. Lines that do not match the expected
#   "<ancode> <letter> <POS> <grammemes>" layout are silently skipped.
sub load_gramtab {
    my ($self, $fn) = @_;

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle cannot clash with other code.
    open(my $fh, '<:encoding(windows-1251)', $fn)
        or die "can't open gramtab file \"$fn\": $!";
    while (my $line = <$fh>) {
        chomp $line;
        if ($line =~ /^([А-ЯЁа-яё]{2,2})\s+(\w)\s+([А-ЯЁа-яё_\-\*]+)\s+([а-яё\-0-9\,]*)/) {
            my ($ancode, $pos, $gram_line) = ($1, $3, $4);
            $gram_line =~ s/,\s*$//;      # drop a trailing comma
            $gram_line =~ s/,,/,/g;       # collapse doubled commas
            # "*" in the POS column means "no part of speech".
            $self->{ancodes}->{$ancode} = ("*" ne $pos ? $pos : "");
            if (length($self->{ancodes}->{$ancode}) > 0 && length($gram_line) > 0) {
                $self->{ancodes}->{$ancode} .= ", ";
            }
            $self->{ancodes}->{$ancode} .= $gram_line;
        }
    }
    close($fh);

    return;
}

# Ancode2Grammems($self, $ancode)
#   Returns the "POS, grammemes" string stored for $ancode.
#   Dies if the ancode was not present in the loaded file.
sub Ancode2Grammems {
    my ($self, $ancode) = @_;
    if (!exists $self->{ancodes}->{$ancode}) {
        die "can't find ancode\"$ancode\"";
    }
    return $self->{ancodes}->{$ancode};
}

1;
и ЕД., у которых должно быть по два вариант (одуш и неодуш, взять из объединяемых парадигм) 6 | АЗОТОБАКТЕР 7 | АМФИБИЯ 8 | АНАЭРОБ 9 | АСЦИДИЯ 10 | АЭРОБ 11 | БАКТЕРИОФАГ 12 | БАКТЕРОИД 13 | БАЦИЛЛА 14 | ВИБРИОН 15 | ВОБЛА 16 | ГАСТРОЗОИД 17 | ГОНОКОКК 18 | ДАКТИЛОЗОИД 19 | ДАМКА 20 | ДВОРНИК 21 | ДИПЛОКОКК 22 | ИНФУЗОРИЯ 23 | ИСТРЕБИТЕЛЬ 24 | КОЗЫРЬ 25 | КОКК 26 | КРЕВЕТКА 27 | КУКЛА 28 | МАНЕКЕН 29 | МИКРОБ 30 | МИКРОКОКК 31 | МНОГОНОЖКА 32 | ПЕРСОНАЖ 33 | ПЕШКА 34 | ПИЛИДИЙ 35 | ПНЕВМОКОКК 36 | ПОЛИП 37 | ПОЛИПОИД 38 | ПРИВИДЕНИЕ 39 | ПРОТОБИОНТ 40 | ПРОТОТИП 41 | ПРОЭМБРИОН 42 | СЕЛЁДКА 43 | СЕЛЁДОЧКА 44 | СЕЛЬДЬ 45 | СИМБИОНТ 46 | СПОРОВИК 47 | СТАФИЛОКОКК 48 | СУЩЕСТВО 49 | УСТРИЦА 50 | ЭНТЕРОКОКК 51 | ЭХИНОКОКК -------------------------------------------------------------------------------- /anaphora/NE_extract/config.proto: -------------------------------------------------------------------------------- 1 | encoding "utf8"; // указываем кодировку, в которой написан конфигурационный файл 2 | 3 | TTextMinerConfig { 4 | Dictionary = "maindic.gzt"; // путь к корневому словарю 5 | 6 | PrettyOutput = "NE.html"; 7 | 8 | Input = { 9 | Dir = "learn_data/learn_plaintext/AnaphFiles/OFC"; // путь к входным файлам 10 | // File = "data/1238.txt"; // путь к входным файлам 11 | } 12 | 13 | Articles = [ 14 | { Name = "именные_группы" } // название статьи в корневом словаре, 15 | // которая содержит запускаемую грамматику 16 | ] 17 | 18 | Facts = [ 19 | { 20 | Name = "NamedEntity"; 21 | } 22 | { 23 | Name = "ComplexNE"; 24 | } 25 | { 26 | Name = "ComplexNE1"; 27 | } 28 | { 29 | Name = "ComplexNE2"; 30 | } 31 | { 32 | Name = "ComplexNE3"; 33 | } 34 | { 35 | Name = "ComplexNE4"; 36 | } 37 | { 38 | Name = "ComplexNE5"; 39 | } 40 | { 41 | Name = "ComplexNE6"; 42 | } 43 | { 44 | Name = "ComplexNE7"; 45 | } 46 | ] 47 | 48 | Output = { 49 | // File = "newsru.xml"; 50 | Format = xml; 51 | // Format = text; 52 | // Mode = append; 53 | } 54 | 55 | // PrintRules = "rules.txt"; 56 | // 
PrintTree = "tree.txt"; 57 | } 58 | -------------------------------------------------------------------------------- /migrations/20150603164038_remove_old_achievements.php: -------------------------------------------------------------------------------- 1 | dropTable("user_badges_types"); 13 | $this->dropTable("user_badges"); 14 | } 15 | 16 | /** 17 | * Migrate Down. 18 | */ 19 | public function down() 20 | { 21 | $this->execute(" 22 | CREATE TABLE `user_badges` ( 23 | `user_id` smallint(5) unsigned NOT NULL, 24 | `badge_id` tinyint(3) unsigned NOT NULL, 25 | `shown` int(10) unsigned NOT NULL, 26 | KEY `user_id` (`user_id`), 27 | KEY `shown` (`shown`) 28 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 29 | 30 | CREATE TABLE `user_badges_types` ( 31 | `badge_id` tinyint(3) unsigned NOT NULL, 32 | `badge_name` varchar(127) NOT NULL, 33 | `badge_descr` text NOT NULL, 34 | `badge_image` varchar(255) NOT NULL, 35 | `badge_group` tinyint(3) unsigned NOT NULL, 36 | PRIMARY KEY (`badge_id`) 37 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 38 | "); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /templates/qa/pool_candidates.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Найденные примеры для типа {$data.id}

5 |

Пулы этого типа называются {$data.pool_name}

6 |

Показано не более 200 случайно выбранных примеров из {$data.found_samples}.

7 | {if $is_admin} 8 |
9 | случайных
10 | первых
11 | сделать таких пулов
12 |
13 |
14 |
15 | {/if} 16 | 17 | {foreach from=$data.samples item=c} 18 | 25 | {/foreach} 26 |
{strip} 19 | {foreach from=$c.context item=word key=tf_id} 20 | {if $tf_id == $c.mainword}{$word|htmlspecialchars} 21 | {else}{$word|htmlspecialchars}{/if} 22 |   23 | {/foreach} 24 | {/strip}
27 | {/block} 28 | -------------------------------------------------------------------------------- /ajax/bind_book.php: -------------------------------------------------------------------------------- 1 | 38 | -------------------------------------------------------------------------------- /templates/comments.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |

Свежие комментарии

5 | 9 | 10 | {foreach item=comment from=$comments.c} 11 | 12 | 13 | 14 | 15 | 16 | 17 | {/foreach} 18 |
Предложение {$comment.sent_id}{$comment.user_name|htmlspecialchars}{$comment.ts|date_format:"%a %d.%m.%Y, %H:%M"}{$comment.text|htmlspecialchars}
19 | 23 | {/block} 24 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/train/aux.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "tag.h" 6 | #include "sentence.h" 7 | #include "corpora_io.h" 8 | 9 | #include "brill.h" 10 | 11 | using namespace std; 12 | 13 | string PrintSC(const SentenceCollection &sc) { 14 | stringstream ss; 15 | 16 | SentenceCollection::const_iterator cit = sc.begin(); 17 | while (sc.end() != cit) { 18 | ss << cit->str() << endl; 19 | cit++; 20 | } 21 | 22 | return ss.str(); 23 | } 24 | 25 | string PrintRules(const list& lr) { 26 | stringstream ss; 27 | 28 | list::const_iterator cit = lr.begin(); 29 | while (lr.end() != cit) { 30 | ss << cit->str() << endl; 31 | cit++; 32 | } 33 | 34 | return ss.str(); 35 | } 36 | 37 | /*string toString(const map &m) { 38 | map::const_iterator cit = m.begin(); 39 | stringstream ss; 40 | while (m.end() != cit) { 41 | ss << '\t' << cit->first.str() << '\t' << cit->second << endl; 42 | cit++; 43 | } 44 | 45 | return ss.str(); 46 | }*/ 47 | 48 | string toString(const map &m) { 49 | map::const_iterator cit = m.begin(); 50 | stringstream ss; 51 | while (m.end() != cit) { 52 | ss << '\t' << cit->first << '\t' << cit->second << endl; 53 | cit++; 54 | } 55 | 56 | return ss.str(); 57 | } 58 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/include/sentence.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "token.h" 7 | 8 | #ifndef __SENTENCE_H 9 | #define __SENTENCE_H 10 | 11 | class Sentence { 12 | std::vector v; 13 | std::vector vId; 14 | std::map id2pos; 15 | 16 | public: 17 | Sentence() { 18 | v.reserve(20); 19 | vId.reserve(20); 20 | } 21 | 22 | void clear() { 23 | v.clear(); 24 | 
#!/usr/bin/perl
# Fills the form2tf lookup table with normalised (lower-cased, "ё" -> "е")
# token texts for tokens that do not have an entry yet. At most 500 tokens
# are processed per run; everything happens in a single transaction.
#
# Usage: form2tf.pl <config.ini>   (INI file with a [mysql] section)
use strict;
use utf8;
use Encode;
use DBI;
use Config::INI::Reader;

binmode(STDERR, ':utf8');

# Connection settings
my $mysql = Config::INI::Reader->read_file($ARGV[0])->{mysql};

# Database handle with manual transaction control
my $dsn = join(':', 'DBI', 'mysql', $mysql->{'dbname'}, $mysql->{'host'});
my $dbh = DBI->connect($dsn, $mysql->{'user'}, $mysql->{'passwd'}) or die $DBI::errstr;
$dbh->do("SET NAMES utf8");
$dbh->{'AutoCommit'} = 0;
die "Setting AutoCommit failed" if $dbh->{'AutoCommit'};

my $tokens_max_sth = $dbh->prepare("SELECT MAX(tf_id) AS max1 FROM tokens");
my $forms_max_sth  = $dbh->prepare("SELECT MAX(tf_id) AS max2 FROM form2tf");
my $pending_sth    = $dbh->prepare("SELECT tf_id, tf_text FROM tokens WHERE tf_id NOT IN (SELECT tf_id FROM form2tf) ORDER BY tf_id LIMIT ?");
my $insert_sth     = $dbh->prepare("INSERT INTO form2tf VALUES(?, ?)");

# Nothing to do when the highest token id is already present in form2tf.
$tokens_max_sth->execute();
$forms_max_sth->execute();
my $tokens_max = $tokens_max_sth->fetchrow_hashref()->{'max1'};
my $forms_max  = $forms_max_sth->fetchrow_hashref()->{'max2'};
if ($tokens_max == $forms_max) {
    $dbh->commit();
    exit 0;
}

$pending_sth->execute(500);
while (my $row = $pending_sth->fetchrow_hashref()) {
    my $form = decode('utf-8', $row->{'tf_text'});
    $form =~ tr/А-Я/а-я/;    # lower-case the basic Cyrillic range
    $form =~ tr/Ёё/ее/;      # "ё" (either case) is stored as "е"
    $insert_sth->execute($form, $row->{'tf_id'});
}

$dbh->commit();

Пулы, сильнее всего влияющие на объём корпуса со снятой омонимией

14 |

Список хороших предложений обновляется раз в сутки.

15 |

Тип: 16 | 21 | 22 | 23 | {foreach item=pool from=$pools} 24 | 25 | 34 | {/foreach} 35 |
ПулЦиферка
26 | {if $pool.status == 4} 27 | {elseif $pool.status == 5} 28 | {elseif $pool.status == 6} 29 | {/if} 30 | {$pool.name|htmlspecialchars} 31 | {if $pool.moderator} (модератор – {$pool.moderator|htmlspecialchars}){/if} 32 | {$pool.count} 33 |
# coding: utf-8

# python baseline.py test.pairs ref.pairs output

import sys
from itertools import groupby

import random


def baseline_random(X_test, y_test, output=sys.stdout):
    """Mark one randomly chosen candidate per anaphor as the antecedent.

    X_test is an iterable of 'antecedent__anaphor' strings in which all
    candidates of one anaphor are adjacent. For every pair a line
    '<pair>\\t1' (chosen) or '<pair>\\t0' is written to `output`.
    y_test is accepted for signature symmetry and is not used.
    """
    for anph, antc in groupby(X_test, key=lambda x: x.split('__')[1]):
        antc = list(antc)
        k = random.randint(0, len(antc) - 1)
        for i, pair in enumerate(antc):
            label = '1' if i == k else '0'
            # write() produces the same bytes as the former print statement
            # and works under both Python 2 and Python 3.
            output.write(pair + '\t' + label + '\n')


def baseline_near(X_test, y_test, output=sys.stdout):
    """Mark the last listed candidate of every anaphor as the antecedent.

    Input and output format are the same as for baseline_random().
    y_test is accepted for signature symmetry and is not used.
    """
    for anph, antc in groupby(X_test, key=lambda x: x.split('__')[1]):
        # groupby yields a one-shot iterator; it has no len(), so the group
        # must be materialised before its last index can be computed.
        antc = list(antc)
        k = len(antc) - 1
        for i, pair in enumerate(antc):
            label = '1' if i == k else '0'
            output.write(pair + '\t' + label + '\n')


def baseline_random_probs(X_test, y_test):
    """Placeholder; not implemented."""
    pass

if __name__ == '__main__':
    test = []
    ref = []

    with open(sys.argv[1]) as t:
        for line in t:
            test.append(line.rstrip('\r\n'))
    with open(sys.argv[2]) as r:
        for line in r:
            ref.append(line.rstrip('\r\n'))
    with open(sys.argv[3], 'w') as pred:
        baseline_random(test, ref, output=pred)

Странно токенизированные места

5 |

Обновлено {$obj.timestamp|date_format:"%d.%m.%Y, %H:%M"}, однозначные решения в {$obj.coeff}% случаев.

6 | {foreach from=$obj.broken item=token} 7 |

Сломалось на предложении {$token.sent_id}, токен «{$token.token_text|htmlspecialchars}».

8 | {/foreach} 9 |

10 | {if isset($smarty.get.newest)} 11 | важные сверху 12 | {else} 13 | новые сверху 14 | {/if} 15 |

16 | 17 | {foreach item=i from=$obj.items} 18 | 19 | 25 | 26 | 32 | 33 | 34 | {/foreach} 35 |
20 | Предложение {$i.sent_id} 21 | {if $i.comments == 1} 22 | (комментарии) 23 | {/if} 24 | {$i.coeff}{strip} 27 | {$i.lcontext|htmlspecialchars} 28 | {$i.focus|htmlspecialchars} 29 | {if $i.border}  {/if} 30 | {$i.rcontext|htmlspecialchars} 31 | {/strip}исправить
36 | {/block} 37 | -------------------------------------------------------------------------------- /templates/sentence_syntax_groups_moderator.tpl: -------------------------------------------------------------------------------- 1 | {* Вспомогательный шаблон - таблица групп *} 2 | 3 | {* Требует: groups, group_types *} 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | {foreach $groups.simple as $group} 14 | 15 | 16 | 17 | 18 | 19 | 20 | {/foreach} 21 | {foreach $groups.complex as $group} 22 | 23 | 24 | 25 | 26 | 30 | 31 | {/foreach} 32 | 33 |
ГруппаТипВершина
{$group.text}{$group_types[$group.type]}{$group.token_texts[$group.head_id]}
{$group.text}{$group_types[$group.type]}{foreach $group.children_texts as $tid => $ttext} 27 | {if $group.head_id == $tid}{$ttext[1]|htmlspecialchars}{/if} 28 | {/foreach} 29 |
34 | -------------------------------------------------------------------------------- /scripts/delete_unused_files.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use DBI; 4 | use Config::INI::Reader; 5 | 6 | #reading config 7 | my $conf = Config::INI::Reader->read_file($ARGV[0]); 8 | $conf = $conf->{mysql}; 9 | 10 | my $dbh = DBI->connect('DBI:mysql:'.$conf->{'dbname'}.':'.$conf->{'host'}, $conf->{'user'}, $conf->{'passwd'}) or die $DBI::errstr; 11 | my $scan = $dbh->prepare("SELECT book_id from book_tags WHERE tag_name = (SELECT CONCAT('url:', url) FROM downloaded_urls WHERE filename=?)"); 12 | 13 | my $GO = $ARGV[1] eq 'go' ? 1 : 0; 14 | my $count_all = 0; 15 | my $count_to_delete = 0; 16 | my $total_size = 0; 17 | 18 | opendir D, '../files/saved' or die "Failed to open dir"; 19 | while (my $f = readdir D) { 20 | next unless -f "../files/saved/$f"; 21 | my $ff = $f; 22 | $ff =~ s/\.html?$//; 23 | $scan->execute($ff); 24 | ++$count_all; 25 | if (!$scan->fetchrow_hashref()) { 26 | $count_to_delete++; 27 | $total_size += (stat("../files/saved/$f"))[7]; 28 | if ($GO) { 29 | print "deleting $f\n"; 30 | unlink "../files/saved/$f" or warn "Failed to delete $f"; 31 | } else { 32 | print "should delete $f\n"; 33 | } 34 | } 35 | } 36 | close D; 37 | 38 | printf "Total files: %d, %s: %d (%.2f Mb)\n", $count_all, ($GO ? 
# -*- coding: utf-8 -*-
# Rebuilds tag_errors rows of type 4 and 6:
#   type 4 - books under parent 8 or 56 whose 'url:' tag has no 'oldid=' part
#   type 6 - books that have no 'url:' tag at all
#
# Usage: url_validator.py <config.ini>   (INI file with a [mysql] section)
import ConfigParser, MySQLdb
import sys
config = ConfigParser.ConfigParser()
config.read(sys.argv[1])

hostname = config.get('mysql', 'host')
dbname = config.get('mysql', 'dbname')
username = config.get('mysql', 'user')
password = config.get('mysql', 'passwd')

db = MySQLdb.connect(hostname, username, password, dbname, use_unicode=True)

cursor = db.cursor()
cursor.execute('SET NAMES utf8')
cursor.execute("""DELETE FROM tag_errors WHERE error_type = 4""")
cursor.execute("""DELETE FROM tag_errors WHERE error_type = 6""")

# These SELECTs are executed without a parameter tuple, so the '%' signs in
# the LIKE patterns are sent to the server verbatim.
cursor.execute("""SELECT book_id, tag_name FROM book_tags WHERE
    (book_id in (select book_id from books where parent_id = 8 or parent_id = 56)) and (tag_name LIKE 'url:%') and (tag_name NOT LIKE '%oldid=%')""")
data = cursor.fetchall()
for i in data:
    # Let the driver quote the values: tag names are URLs and may contain
    # quotes or backslashes that would break a hand-built statement.
    cursor.execute("""INSERT INTO tag_errors VALUES(%s, %s, %s)""", (i[0], i[1], 4))

cursor.execute("""SELECT distinct book_id FROM books WHERE book_id NOT IN (SELECT book_id FROM book_tags WHERE tag_name LIKE 'url:%')""")
data1 = cursor.fetchall()
#for i in data:
#    query = """INSERT INTO tag_errors VALUES(%d, '%s', %d)""" % (i[0], i[1], 4)
#    cursor.execute(query)
for j in data1:
    cursor.execute("""INSERT INTO tag_errors VALUES(%s, %s, %s)""", (j[0], '', 6))
db.commit()
db.close()  # was a bare attribute reference, so the connection was never closed
26 | */ 27 | public function down() { 28 | $this->table("books") 29 | ->addColumn("ne_on", "boolean") 30 | ->addIndex("ne_on") 31 | ->update(); 32 | $books_on = $this->fetchAll("SELECT book_id FROM ne_books_tagsets"); 33 | $book_ids = array(); 34 | foreach ($books_on as $book) 35 | $book_ids[] = $book["book_id"]; 36 | $this->execute("UPDATE books SET ne_on = 1 WHERE book_id IN (" . implode(', ', $book_ids) . ")"); 37 | $this->dropTable("ne_books_tagsets"); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /anaphora/learning/learn.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import numpy as np 4 | from sklearn.grid_search import GridSearchCV 5 | from sklearn.metrics import make_scorer 6 | from sklearn.svm import SVC 7 | from sklearn.naive_bayes import GaussianNB 8 | 9 | from scorer import score 10 | 11 | 12 | def svc(X_train, y_train): 13 | #clf = GridSearchCV(SVC(), scoring=make_scorer(score)) 14 | clf = SVC() 15 | clf.fit(X_train, y_train) 16 | print "The best classifier is: ", clf.best_estimator_ 17 | return clf.best_estimator_ 18 | 19 | 20 | def gnb(X_train, y_train): 21 | clf = GaussianNB() 22 | clf.fit(X_train, y_train) 23 | return clf 24 | 25 | 26 | def load_files(): 27 | train = np.loadtxt('learning.tab') 28 | test = np.loadtxt('test.tab') 29 | 30 | X_train = train[:, 1:-1] 31 | y_train = train[:, -1] 32 | X_test = test[:, 1:-1] 33 | y_test = test[:, -1] 34 | 35 | # pairs 36 | names_train = train[:, 0] 37 | names_test = test[:, 0] 38 | 39 | return X_train, y_train, X_test, y_test, names_train, names_test 40 | 41 | 42 | if __name__ == '__main__': 43 | X_train, y_train, X_test, y_test, train, test = load_files() 44 | estimator = svc(X_train, y_train) 45 | 46 | y_train_predict = estimator.predict(X_train) 47 | y_test_predict = estimator.predict(X_test) 48 | 49 | print score(y_test_predict) 50 | 51 | np.savetxt('learning.pred.tab', 
y_train_predict) 52 | np.savetxt('test.pred.tab', y_test_predict) 53 | -------------------------------------------------------------------------------- /perl/lib/Lingua/AOT/test.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use utf8; 3 | 4 | use Data::Dumper; 5 | use Lingua::AOT::MorphDict; 6 | 7 | my $d = new Lingua::AOT::MorphDict(Mrd=>"morphs.mrd", Gramtab=>"rgramtab.tab"); 8 | 9 | binmode(STDOUT, ":encoding(utf-8)"); 10 | 11 | my $text = "Косил косой косой косой а зомби зомби зомби . Эти типы стали есть в цехе . Мама мыла раму и стекло ."; 12 | my @words = split(/\s+/, $text); 13 | 14 | foreach my $w (@words) { 15 | my $i = $d->Lookup($w); 16 | print "$w\t"; 17 | if (!defined($i)) { 18 | print "UNKNOWN_WORD"; 19 | } else { 20 | foreach my $mv (@{$i}) { 21 | my $lemma = $d->GetLemma($mv->LemmaId()); 22 | my $form_grm = $d->Ancode2Grammems($mv->Ancode()); 23 | my $lemma_grm = $d->Ancode2Grammems($lemma->Ancode()) if defined $lemma->Ancode(); 24 | print " # " . $lemma->GetDefForm()->Text() . "/" . $form_grm; 25 | if (length($lemma_grm) > 0) { 26 | print ", " . $lemma_grm; 27 | } 28 | } 29 | } 30 | print "\n"; 31 | } 32 | die; 33 | for (my $l = 0; $l < $d->MaxLemmaNo(); $l++) { 34 | my $lemma = $d->GetLemma($l); 35 | print "PARA " . $lemma->ParadigmId() . "\n"; 36 | 37 | for (my $f = 0; $f < $lemma->MaxFormNo(); $f++) { 38 | my $form = $lemma->GetForm($f); 39 | 40 | print $form->Text() . "\t". $d->Ancode2Grammems($form->Ancode()); 41 | if (defined $lemma->Ancode()) { 42 | print ", " . $d->Ancode2Grammems($lemma->Ancode()) . 
"\n"; 43 | } else { 44 | print "\n"; 45 | } 46 | } 47 | 48 | print "\n"; 49 | } 50 | 51 | -------------------------------------------------------------------------------- /scripts/consistency/form2lemma.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use utf8; 4 | use DBI; 5 | use Encode; 6 | use Config::INI::Reader; 7 | 8 | #reading config 9 | my $conf = Config::INI::Reader->read_file($ARGV[0]); 10 | $conf = $conf->{mysql}; 11 | 12 | #main 13 | my $dbh = DBI->connect('DBI:mysql:'.$conf->{'dbname'}.':'.$conf->{'host'}, $conf->{'user'}, $conf->{'passwd'}) or die $DBI::errstr; 14 | $dbh->do("SET NAMES utf8"); 15 | $dbh->{'AutoCommit'} = 0; 16 | if ($dbh->{'AutoCommit'}) { 17 | die "Setting AutoCommit failed"; 18 | } 19 | 20 | my $scan = $dbh->prepare("SELECT rev_id, lemma_id, rev_text FROM dict_revisions WHERE f2l_check=0 ORDER BY rev_id LIMIT 2000"); 21 | my $del = $dbh->prepare("DELETE FROM form2lemma WHERE lemma_id=?"); 22 | my $ins = $dbh->prepare("INSERT INTO form2lemma VALUES(?, ?, ?, ?)"); 23 | my $upd = $dbh->prepare("UPDATE dict_revisions SET f2l_check=1 WHERE rev_id=? 
# coding: utf-8

# python scorer.py *.ref.pairs *.groups < *.pairs

import sys

_groups = 'learning.groups'
_ref = 'learning.pairs'


def load_pred(p):
    """Yield every line of the iterable `p` split on tabs (newline removed)."""
    for line in p:
        line = line.rstrip('\n').split('\t')
        yield line


def load_heads(h):
    """Read the tab-separated groups file `h`.

    Returns (heads, hlist): `heads` maps column 0 of each line to column 2,
    `hlist` holds column 2 of every line in file order.
    """
    heads = {}
    hlist = []
    # Open the file that was passed in; the path used to be taken from
    # sys.argv[2] regardless of the argument.
    with open(h) as gfile:
        for line in gfile:
            line = line.rstrip('\n').split('\t')
            heads[line[0]] = line[2]
            hlist.append(line[2])
    return heads, hlist


def score(pred, groups=_groups, ref=_ref):
    """Return the share of class-'1' predictions that hit the reference head.

    `pred` is an iterable of (pair, label) items where pair has the form
    'antecedent__anaphor'. `groups` and `ref` are paths of the groups file
    and the reference pairs file.
    Raises ZeroDivisionError when no item is labelled '1'.
    """
    heads, hl = load_heads(groups)
    pairs = load_pairs(ref)

    corr = 0.0
    total = 0
    for i, line in enumerate(pred):
        pair, cl = line
        if cl == '1':
            antc, anph = pair.split('__')
            if hl[i] == heads[pairs[anph]]:
                corr += 1
            total += 1
    return corr / total


def load_pairs(r):
    """Read the reference file `r` ('antecedent<TAB>anaphor' per line).

    Returns a dict mapping each anaphor to its antecedent.
    """
    pairs = {}
    with open(r) as ref:
        for line in ref:
            antc, anph = line.rstrip('\n').split('\t')
            pairs[anph] = antc
    # The mapping used to be discarded here, so callers received None.
    return pairs

if __name__ == '__main__':
    # Argument order follows the usage line at the top of the file:
    # argv[1] is the reference pairs file, argv[2] is the groups file.
    sys.stdout.write("Precision on class 1 is {:.2%}\n".format(
        score(load_pred(sys.stdin), groups=sys.argv[2], ref=sys.argv[1])))
55 | -------------------------------------------------------------------------------- /anaphora/NE_extract/facttypes.proto: -------------------------------------------------------------------------------- 1 | import "base.proto"; // описание protobuf-типов 2 | import "facttypes_base.proto"; // описание protobuf-типа NFactType.TFact 3 | 4 | message NamedEntity : NFactType.TFact 5 | { 6 | required string Self = 1; 7 | required string Type = 2; 8 | required string Main = 3; 9 | } 10 | 11 | message ComplexNE : NFactType.TFact 12 | { 13 | required string Self = 1; 14 | required string Type = 2; 15 | required string Main = 3; 16 | } 17 | 18 | message ComplexNE1 : NFactType.TFact 19 | { 20 | required string Self = 1; 21 | required string Type = 2; 22 | required string Main = 3; 23 | } 24 | 25 | message ComplexNE2 : NFactType.TFact 26 | { 27 | required string Self = 1; 28 | required string Type = 2; 29 | required string Main = 3; 30 | } 31 | message ComplexNE3 : NFactType.TFact 32 | { 33 | required string Self = 1; 34 | required string Type = 2; 35 | required string Main = 3; 36 | } 37 | 38 | message ComplexNE4 : NFactType.TFact 39 | { 40 | required string Self = 1; 41 | required string Type = 2; 42 | required string Main = 3; 43 | } 44 | message ComplexNE5 : NFactType.TFact 45 | { 46 | required string Self = 1; 47 | required string Type = 2; 48 | required string Main = 3; 49 | } 50 | message ComplexNE6 : NFactType.TFact 51 | { 52 | required string Self = 1; 53 | required string Type = 2; 54 | required string Main = 3; 55 | } 56 | message ComplexNE7 : NFactType.TFact 57 | { 58 | required string Self = 1; 59 | required string Type = 2; 60 | required string Main = 3; 61 | } 62 | -------------------------------------------------------------------------------- /templates/qa/pool_types.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='common.tpl'} 3 | {block name=content} 4 |
5 | 6 | 7 | {foreach $data as $id => $t} 8 | 9 | 10 | 11 | 18 | 19 | 20 | {/foreach} 21 |
{$id}{$t.grammemes} 12 | 13 | 14 | 15 | 16 | 17 |
22 | 23 |
24 | {/block} 25 | -------------------------------------------------------------------------------- /options.php: -------------------------------------------------------------------------------- 1 | emit(EventTypes::JOINED_TEAM); 20 | } 21 | alert_set('success','Настройки сохранены'); 22 | header('Location:options.php'); 23 | break; 24 | case 'readonly_on': 25 | set_readonly_on(); 26 | header('Location:options.php'); 27 | break; 28 | case 'readonly_off': 29 | set_readonly_off(); 30 | header('Location:options.php'); 31 | break; 32 | default: 33 | check_logged(); 34 | $mgr = new UserOptionsManager(); 35 | $smarty->assign('meta', $mgr->get_all_options(true)); 36 | $smarty->assign('current_email', get_user_email($_SESSION['user_id'])); 37 | $smarty->assign('current_name', get_user_shown_name($_SESSION['user_id'])); 38 | $smarty->assign('teams',get_team_list()); 39 | $smarty->assign('user_team',get_user_team($_SESSION['user_id'])); 40 | $smarty->display('options.tpl'); 41 | } 42 | 43 | log_timing(); 44 | -------------------------------------------------------------------------------- /perl/lib/OpenCorpora/Dict/SimpleReader.pm: -------------------------------------------------------------------------------- 1 | package OpenCorpora::Dict::SimpleReader; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | use OpenCorpora::Dict::Entry; 8 | 9 | our $VERSION = "0.01"; 10 | 11 | 12 | 13 | 14 | sub new { 15 | my($class, %args) = @_; 16 | 17 | my $self = bless({}, $class); 18 | 19 | if (exists $args{handlers}) { 20 | my $handlers = $args{handlers}; 21 | $self->{handler_lemma} = exists $handlers->{lemma} ? 
$handlers->{lemma} : \&nop_function; 22 | } 23 | 24 | $self->{buffer} = ""; 25 | 26 | return $self; 27 | } 28 | 29 | sub parse { 30 | my $self = shift; 31 | my $chunk = shift; 32 | 33 | $self->{buffer} .= $chunk; 34 | 35 | if ($self->{buffer} =~ s/(.+?)<\/dr>//ms) { 36 | my $dr = $1; 37 | my $dict_entry = OpenCorpora::Dict::Entry->new(); 38 | while ($dr =~ s/(.+?)<\/l>//ms) { 39 | if (!defined($dict_entry->lemma_text)) { 40 | $dict_entry->lemma_text($1); 41 | my $gram_chunk = $2; 42 | while ($gram_chunk =~ //g) { 43 | $dict_entry->lemma_gram_add($1); 44 | } 45 | } else { 46 | die "multiple lemma in article $dr"; 47 | } 48 | } 49 | 50 | while ($dr =~ s/(.+?)<\/f>//ms) { 51 | my $fid = $dict_entry->add_form($1); 52 | my $gram_chunk = $2; 53 | while ($gram_chunk =~ //g) { 54 | $dict_entry->add_form_gram($fid, $1); 55 | } 56 | } 57 | 58 | $self->{handler_lemma}($dict_entry); 59 | } 60 | } 61 | 62 | sub nop_function { } 63 | -------------------------------------------------------------------------------- /anaphora/ana_test.groups: -------------------------------------------------------------------------------- 1 | 1 1,2,3,4 2 2 | 2 7 7 3 | 3 10,11 10 4 | 4 11 11 5 | 5 13,14 14 6 | 6 17,18,19,20 18 7 | 7 23 23 8 | 8 25,26 26 9 | 9 28,29 29 10 | 10 32,33 32 11 | 11 33 33 12 | 12 39 39 13 | 13 50,51,52,53 51 14 | 14 55,56,57,58 55 15 | 15 61 61 16 | 16 63 63 17 | 17 67,68,69,70,71 70 18 | 18 82,83 83 19 | 19 86 86 20 | 20 88,89,90 ALL 21 | 21 92,93,94 ALL 22 | 22 102,103 103 23 | 23 104,105 104 24 | 24 110,111,112,113 110 25 | 25 115,116 115 26 | 26 119 119 27 | 27 122 122 28 | 28 134,135,136 134 29 | 29 135,136 136 30 | 30 142,143,144 143 31 | 31 144 144 32 | 32 148 148 33 | 33 152,153,154,155 153 34 | 34 158,159 158 35 | 35 161,162 161 36 | 36 162 162 37 | 37 171 171 38 | 38 178,179,180,181,182 178 39 | 39 186,187,188,189,190,191,192,193 186 40 | 40 189,190 ALL 41 | 41 192,193 ALL 42 | 42 196 196 43 | 43 203 203 44 | 44 207,208,209,210,211 207 45 | 45 208,209,210,211 209 
46 | 46 214 214 47 | 47 217,218,219 218 48 | 48 221 221 49 | 49 223 223 50 | 50 226 226 51 | 51 232,233 233 52 | 52 237,238,239 238 53 | 53 248,249 248 54 | 54 255,256 255 55 | 55 256 256 56 | 56 258,259,260 258 57 | 57 259,260 259 58 | 58 260 260 59 | 59 269 269 60 | 60 278,279,280,281,282 278 61 | 61 279,280,281,282 280 62 | 62 288 288 63 | 63 293,294,295,296 296 64 | 64 301,302,303,304 303 65 | 65 306,307 307 66 | 66 309,310,311 ALL 67 | 67 313 313 68 | 68 327,328 327 69 | 69 328 328 70 | 70 334,335 334 71 | 71 335 335 72 | 72 338,339 338 73 | 73 339 339 74 | 74 344,345,349,350,351,352 344 75 | 75 355 355 76 | 76 358,359,360,361,362,363,364,365,366,367,368,369,370,371,372 360 77 | 77 380,381,382,383 381 78 | 78 387,388,389,390,391 387 79 | -------------------------------------------------------------------------------- /scripts/consistency/dict_update_forms.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use utf8; 4 | use DBI; 5 | use Config::INI::Reader; 6 | 7 | #reading config 8 | my $conf = Config::INI::Reader->read_file($ARGV[0]); 9 | $conf = $conf->{mysql}; 10 | 11 | #main 12 | my $dbh = DBI->connect('DBI:mysql:'.$conf->{'dbname'}.':'.$conf->{'host'}, $conf->{'user'}, $conf->{'passwd'}) or die $DBI::errstr; 13 | $dbh->do("SET NAMES utf8"); 14 | $dbh->{'AutoCommit'} = 0; 15 | if ($dbh->{'AutoCommit'}) { 16 | die "Setting AutoCommit failed"; 17 | } 18 | 19 | #if there are any words still not checked by form2tf, we should do nothing 20 | my $max1 = $dbh->prepare("SELECT MAX(tf_id) AS max1 FROM tokens"); 21 | my $max2 = $dbh->prepare("SELECT MAX(tf_id) AS max2 FROM form2tf"); 22 | $max1->execute(); 23 | $max2->execute(); 24 | if ($max1->fetchrow_hashref()->{'max1'} != $max2->fetchrow_hashref()->{'max2'}) { 25 | $dbh->commit(); 26 | exit 0; 27 | } 28 | 29 | my $scan = $dbh->prepare("SELECT form_text, rev_id FROM updated_forms LIMIT ?"); 30 | my $scan_f2tf = $dbh->prepare("SELECT tf_id 
FROM form2tf WHERE form_text=?"); 31 | my $del = $dbh->prepare("DELETE FROM updated_forms WHERE form_text=? AND rev_id=?"); 32 | my $ins = $dbh->prepare("INSERT INTO updated_tokens VALUES (?, ?)"); 33 | 34 | $scan->execute(10); 35 | while(my $ref = $scan->fetchrow_hashref()) { 36 | $scan_f2tf->execute($ref->{'form_text'}); 37 | while(my $ref1 = $scan_f2tf->fetchrow_hashref()) { 38 | $ins->execute($ref1->{'tf_id'}, $ref->{'rev_id'}); 39 | } 40 | $del->execute($ref->{'form_text'}, $ref->{'rev_id'}); 41 | } 42 | 43 | $dbh->commit(); 44 | -------------------------------------------------------------------------------- /migrations/20160129161233_mw_basic_structure.php: -------------------------------------------------------------------------------- 1 | table("mw_main", array("id" => "mw_id", "engine" => "InnoDB")); 13 | $main 14 | ->addColumn('status', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 15 | ->save(); 16 | 17 | $tokens = $this->table("mw_tokens", array("id" => false, "primary_key" => array("mw_id", "tf_id"), "engine" => "InnoDB")); 18 | $tokens 19 | ->addColumn("mw_id", "integer") 20 | ->addColumn("tf_id", "integer", array("signed" => false)) 21 | ->addIndex("mw_id") 22 | ->addIndex("tf_id") 23 | ->save(); 24 | 25 | $answers = $this->table("mw_answers", array("id" => false, "primary_key" => array("mw_id", "user_id"), "engine" => "InnoDB")); 26 | $answers 27 | ->addColumn("mw_id", "integer") 28 | ->addColumn("user_id", "integer") 29 | ->addColumn("ts", "timestamp", array("default" => "CURRENT_TIMESTAMP")) 30 | ->addIndex("mw_id") 31 | ->addIndex("user_id") 32 | ->save(); 33 | 34 | } 35 | 36 | /** 37 | * Migrate Down. 
38 | */ 39 | public function down() { 40 | $this->dropTable("mw_main"); 41 | $this->dropTable("mw_tokens"); 42 | $this->dropTable("mw_answers"); 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /migrations/20150216164636_undelete_lemmata.php: -------------------------------------------------------------------------------- 1 | execute(" 13 | ALTER TABLE dict_lemmata 14 | ADD COLUMN `deleted` tinyint(3) unsigned not null; 15 | "); 16 | $this->execute(" 17 | ALTER TABLE dict_lemmata 18 | ADD INDEX(`deleted`) 19 | "); 20 | $this->execute(" 21 | INSERT INTO dict_lemmata ( 22 | SELECT lemma_id, lemma_text, 1 23 | FROM dict_lemmata_deleted 24 | ) 25 | "); 26 | $this->dropTable('dict_lemmata_deleted'); 27 | } 28 | 29 | /** 30 | * Migrate Down. 31 | */ 32 | public function down() 33 | { 34 | $this->table('dict_lemmata_deleted', array('id' => false)) 35 | ->addColumn('lemma_id', 'integer') 36 | ->addColumn('lemma_text', 'string', array('limit' => 50)) 37 | ->save(); 38 | $this->execute(" 39 | INSERT INTO dict_lemmata_deleted ( 40 | SELECT lemma_id, lemma_text 41 | FROM dict_lemmata 42 | WHERE deleted=1 43 | ) 44 | "); 45 | $this->execute(" 46 | DELETE FROM dict_lemmata 47 | WHERE deleted=1 48 | "); 49 | $this->table('dict_lemmata')->removeIndex(array('deleted')); 50 | $this->table('dict_lemmata')->removeColumn('deleted'); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /scripts/ma_pools/unpublish_pools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import time 5 | import json 6 | import MySQLdb 7 | from MySQLdb.cursors import DictCursor 8 | 9 | POOL_STATUS_PUBLISHED = 3 10 | POOL_STATUS_UNPUBLISHED = 4 11 | 12 | def check_pools(dbh): 13 | dbh.execute(""" 14 | SELECT pool_id 15 | FROM morph_annot_pools 16 | WHERE status = {0} 17 | AND pool_id NOT IN ( 18 | 
SELECT DISTINCT pool_id 19 | FROM morph_annot_instances 20 | LEFT JOIN morph_annot_samples USING (sample_id) 21 | LEFT JOIN morph_annot_pools USING (pool_id) 22 | WHERE status = {0} 23 | AND answer = 0 24 | ) 25 | """.format(POOL_STATUS_PUBLISHED)) 26 | 27 | for pool in dbh.fetchall(): 28 | set_pool_status(dbh, pool['pool_id'], POOL_STATUS_UNPUBLISHED) 29 | def set_pool_status(dbh, pool_id, status): 30 | dbh.execute("UPDATE morph_annot_pools SET status={0}, updated_ts={2} WHERE pool_id={1} LIMIT 1".format(status, pool_id, int(time.time()))) 31 | def main(): 32 | with open(sys.argv[1]) as fconf: 33 | config = json.load(fconf)['mysql'] 34 | hostname = config['host'] 35 | dbname = config['dbname'] 36 | username = config['user'] 37 | password = config['passwd'] 38 | 39 | db = MySQLdb.connect(hostname, username, password, dbname, use_unicode=True, charset="utf8") 40 | dbh = db.cursor(DictCursor) 41 | dbh.execute('START TRANSACTION') 42 | 43 | check_pools(dbh) 44 | db.commit() 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /ajax/get_comments.php: -------------------------------------------------------------------------------- 1 | $id, 8 | 'timestamp' => $comm[$id]['ts'], 9 | 'author' => $comm[$id]['author'], 10 | 'reply_to' => $comm[$id]['parent'], 11 | 'text' => htmlspecialchars($comm[$id]['text']) 12 | ); 13 | if (!isset($hier[$id])) 14 | return; 15 | if (!$id) { 16 | foreach($hier[$id] as $cid) { 17 | recursive_print($cid, $comm, $hier, $ret); 18 | } 19 | } else { 20 | foreach(array_reverse($hier[$id]) as $cid) { 21 | recursive_print($cid, $comm, $hier, $ret); 22 | } 23 | } 24 | } 25 | 26 | 27 | try { 28 | $sent_id = (int)POST('sent_id'); 29 | 30 | $comm = array(); 31 | $result['comments'] = array(); 32 | $res = sql_query("SELECT sc.*, u.user_name FROM sentence_comments sc LEFT JOIN users u ON (sc.user_id=u.user_id) WHERE sent_id=$sent_id ORDER BY timestamp"); 33 | while ($r = 
sql_fetch_array($res)) { 34 | $comm[$r['comment_id']] = array( 35 | 'ts' => date("d.m.y, H:i", $r['timestamp']), 36 | 'author' => $r['user_name'], 37 | 'parent' => $r['parent_id'], 38 | 'text' => nl2br($r['text']), 39 | ); 40 | $hier[$r['parent_id']][] = $r['comment_id']; 41 | } 42 | recursive_print(0, $comm, $hier, $result['comments']); 43 | } 44 | catch (Exception $e) { 45 | $result['error'] = 1; 46 | } 47 | 48 | log_timing(true); 49 | die(json_encode($result)); 50 | ?> 51 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/python/sample_corpus.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import sys 4 | import random 5 | import argparse 6 | 7 | 8 | def rand_sent(l, n, c): 9 | used = [] 10 | for i in range(c): 11 | nums = set(range(1, l)) - set(used) 12 | sample = random.sample(nums, n) 13 | used += sample 14 | yield sample 15 | 16 | 17 | def get_random_sentences(corpus, nums, out): 18 | i = 0 19 | with open(out, 'w') as out: 20 | for line in corpus: 21 | #print i 22 | line = line.rstrip().decode('utf-8') 23 | if not line: 24 | continue 25 | if line.startswith('sent'): 26 | i += 1 27 | if i in nums: 28 | print >> out, line.encode('utf-8') 29 | 30 | 31 | def get_corpora(corpus, l, c, n, out): 32 | randomnums = rand_sent(l, n, c) 33 | for i, nums in enumerate(randomnums): 34 | i_out = '%s%d.tab' % (out, i) 35 | get_random_sentences(corpus, nums, i_out) 36 | 37 | 38 | if __name__ == '__main__': 39 | p = argparse.ArgumentParser() 40 | p.add_argument('-l', '--limit', type=int, default=1000000, 41 | help='Specify the number of sentences from which random corpora should be extracted') 42 | p.add_argument('-n', type=int, default=1, 43 | help='Size of corpus to be extracted') 44 | p.add_argument('-c', type=int, default=1, 45 | help='Number of random corpora') 46 | p.add_argument('-p', default='rand', help='Output file prefix') 47 | args = 
p.parse_args() 48 | get_corpora(sys.stdin, args.limit, args.c, args.n, args.p) 49 | -------------------------------------------------------------------------------- /export/annot/disamb_nonmod_tests/annot.opcorpora.canon_out.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /migrations/20150624200024_change_candidate_samples.php: -------------------------------------------------------------------------------- 1 | execute("TRUNCATE TABLE morph_annot_candidate_samples"); // yes, this is irreversible 11 | $this->execute("DELETE FROM morph_annot_samples WHERE pool_id IN ( 12 | SELECT pool_id FROM morph_annot_pools WHERE status < 2 13 | )"); // and this 14 | $this->execute("DELETE FROM morph_annot_pools WHERE status < 2"); // and this 15 | 16 | $cs = $this->table("morph_annot_candidate_samples"); 17 | $cs->renameColumn("pool_id", "pool_type")->save(); 18 | 19 | $types = $this->table("morph_annot_pool_types"); 20 | $types->addColumn('last_auto_search', 'integer', array('signed' => false, 'default' => 0))->save(); 21 | 22 | $this->table("morph_annot_pools")->removeColumn('token_check'); 23 | } 24 | 25 | public function down() 26 | { 27 | $this->execute("TRUNCATE TABLE morph_annot_candidate_samples"); // yes, this is irreversible 28 | $cs = $this->table("morph_annot_candidate_samples"); 29 | $cs->renameColumn("pool_type", "pool_id")->save(); 30 | 31 | $types = $this->table("morph_annot_pool_types"); 32 | $types->removeColumn('last_auto_search'); 33 | 34 | $pools = $this->table("morph_annot_pools"); 35 | $pools->addColumn('token_check', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY, 'after' => 'pool_name')) 36 | ->save(); 37 | } 38 | } 39 | 
-------------------------------------------------------------------------------- /templates/sentence_syntax_moderator.tpl: -------------------------------------------------------------------------------- 1 | {* Smarty *} 2 | {extends file='sentence_syntax.tpl'} 3 | 4 | {block name=syntax_heading} 5 |
Вы являетесь модератором этой книги. Ваши именные группы будут использоваться при разметке анафоры.
6 | {/block} 7 | 8 | {block name=button_caption}Модератор{/block} 9 | {block name=inject_groups_json} 10 | 11 | {literal} 12 | 15 | {/literal} 16 | {/block} 17 | 18 | {block name=syntax_bottom} 19 |

Разметка других пользователей

20 | 21 | {foreach from=$all_groups item=gr key=uid} 22 | 23 | {if $uid == $smarty.session.user_id}{continue}{/if} 24 | 25 |
26 |
27 |
Разметка @{$group_owners[$uid].shown_name}:
28 |
29 | {foreach item=token from=$sentence.tokens}{$token.tf_text|htmlspecialchars}{/foreach} 30 |
31 |
32 |
33 |
Выделенные группы показать искусственные
34 |
35 | {include "sentence_syntax_groups_moderator.tpl" groups=$gr group_types=$group_types} 36 |
37 |
38 |
39 | {/foreach} 40 | {/block} 41 | -------------------------------------------------------------------------------- /anaphora/pairs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import sys 5 | import argparse 6 | import itertools 7 | 8 | parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS, description='Make pairs "group id - pronoun id".') 9 | parser.add_argument('infile1', nargs='?', type = argparse.FileType('r'), help = 'file with a list of groups') 10 | parser.add_argument('infile2', nargs='?', type = argparse.FileType('r'), help = 'file with a list of pronouns') 11 | args = parser.parse_args() 12 | 13 | 14 | def getid(fullid): 15 | return int(fullid.split('_')[1]) 16 | 17 | 18 | def gettextid(fullid): 19 | return int(fullid.split('_')[0]) 20 | 21 | groups = {} 22 | pron = {} 23 | 24 | 25 | for s in args.infile1: # dict group_id : max token 26 | if not s.rstrip('\r\n'): 27 | continue 28 | s = s.strip().split('\t') 29 | groups[s[0]] = s[1].split(',')[-1] 30 | 31 | 32 | for line in args.infile2: # dict pron_id : token number 33 | if not line.rstrip('\r\n'): 34 | continue 35 | line = line.strip().split('\t') 36 | pron[line[0]] = line[1] 37 | 38 | 39 | group_keys = sorted(groups.keys()) 40 | pronoun_keys = sorted(pron.keys()) 41 | 42 | 43 | result = itertools.product(group_keys, pronoun_keys) 44 | prev = 0 45 | for i in result: 46 | g = getid(groups[i[0]]) 47 | tg = gettextid(groups[i[0]]) 48 | tp = gettextid(pron[i[1]]) 49 | if tg == tp and g < getid(pron[i[1]]) \ 50 | and g > prev: 51 | sys.stdout.write('{0}__{1}'.format(str(i[0]), str(i[1]) + '\n')) 52 | prev = getid(pron[i[1]]) 53 | else: 54 | continue 55 | 56 | -------------------------------------------------------------------------------- /ajax/guess_wiki_categ.php: -------------------------------------------------------------------------------- 1 | array(), 'topic' => array()); 5 | $cat_str = 
POST('cat', ''); 6 | if ($cat_str) { 7 | $categ = explode('|', $cat_str); 8 | foreach ($categ as $cat) { 9 | $cat = str_replace('Категория:', '', $cat); 10 | if (in_array($cat, array('Опубликовано'))) { 11 | continue; 12 | } 13 | if (preg_match('/^(\d+) (\S+) (\d\d\d\d)$/', $cat, $matches)) { 14 | $result['cats']['date'] = sprintf("%02s", $matches[1]).'/'.month_to_number($matches[2]); 15 | $result['cats']['year'] = $matches[3]; 16 | } 17 | else { 18 | if (check_for_geo($cat)) { 19 | $result['cats']['geo'][] = $cat; 20 | } 21 | else { 22 | $result['cats']['topic'][] = $cat; 23 | } 24 | } 25 | } 26 | } 27 | 28 | function check_for_geo($s) { 29 | $res = sql_query("SELECT tag_name FROM book_tags WHERE tag_name = ? LIMIT 1", array('Гео:ВикиКатегория:'.$s)); 30 | return sizeof($res); 31 | } 32 | function month_to_number($s) { 33 | $months = array( 34 | 'января' => '01', 35 | 'февраля' => '02', 36 | 'марта' => '03', 37 | 'апреля' => '04', 38 | 'мая' => '05', 39 | 'июня' => '06', 40 | 'июля' => '07', 41 | 'августа' => '08', 42 | 'cентября' => '09', //latin 'c' 43 | 'сентября' => '09', //cyrillic 'с' 44 | 'октября' => '10', 45 | 'ноября' => '11', 46 | 'декабря' => '12' 47 | ); 48 | return $months[$s]; 49 | } 50 | 51 | log_timing(true); 52 | die(json_encode($result)); 53 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/cpp/lib/brill.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | #include "brill.h" 7 | 8 | using namespace std; 9 | 10 | void Context::parse(const std::string &s) { 11 | vector v; 12 | split(s, '&', v); 13 | 14 | for (vector::iterator cit = v.begin(); v.end() != cit; cit++) { 15 | //if ('&' == (*cit)[0]) (*cit) = cit->substr(1); 16 | //cerr << *cit << endl; 17 | Condition c(*cit); 18 | elements.insert(c); 19 | } 20 | } 21 | 22 | 23 | void Condition::parse(const string &s) { 24 | stringstream ss(s); 25 | char c; 
26 | 27 | while (ss.good() && ' ' == ss.peek()) ss.get(c); 28 | 29 | // Consume position 30 | signed int p; 31 | ss >> p; 32 | 33 | // Consume ':' 34 | ss >> c; 35 | if (':' != c) throw; 36 | 37 | // Consume condition type 38 | char buff[8]; 39 | EType w; 40 | if ('t' == ss.peek()) { 41 | ss.get(buff, 4); 42 | w = tag; 43 | } else if ('w' == ss.peek()) { 44 | ss.get(buff, 5); 45 | w = word; 46 | } else throw; 47 | 48 | // Consume '=' 49 | ss >> c; 50 | if ('=' != c) throw; 51 | 52 | if (tag == w) { 53 | // Consume tag set 54 | TagSet ts; 55 | while (!ss.eof()) { 56 | do { ss.get(c); } while (!ss.eof() && ' ' == c); 57 | if ('#' == c) break; 58 | ss.unget(); 59 | ss.get(buff, 5); 60 | if (0 == strlen(buff)) break; 61 | //buff[4] = 0; 62 | Tag t(buff); 63 | ts.insert(t); 64 | } 65 | 66 | value = ts; 67 | } else { 68 | stringbuf sb; 69 | ss.get(sb, ' '); 70 | 71 | form = sb.str(); 72 | } 73 | 74 | // Если мы тут, то всё прочиталось хорошо 75 | pos = p; 76 | what = w; 77 | } 78 | -------------------------------------------------------------------------------- /migrations/20151023153845_add_ne_tagsets.php: -------------------------------------------------------------------------------- 1 | table('ne_tagsets', array('id' => false, 'primary_key' => 'tagset_id', 'engine' => 'InnoDB')); 11 | $tagsets->addColumn('tagset_id', 'integer', array('signed' => false, 'identity' => true, 'limit' => MysqlAdapter::INT_TINY)) 12 | ->addColumn('tagset_name', 'string', array('limit' => 32)) 13 | ->save(); 14 | $this->execute("INSERT INTO ne_tagsets VALUES(1, 'NE_2014')"); 15 | 16 | $this->table('ne_tags') 17 | ->addColumn('tagset_id', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 18 | ->addIndex(array('tagset_id')) 19 | ->save(); 20 | $this->execute("UPDATE ne_tags SET tagset_id = 1"); 21 | 22 | $this->table('ne_paragraphs') 23 | ->addColumn('tagset_id', 'integer', array('signed' => false, 'limit' => MysqlAdapter::INT_TINY)) 24 | ->addIndex(array('tagset_id')) 
25 | ->save(); 26 | $this->execute("UPDATE ne_paragraphs SET tagset_id = 1"); 27 | 28 | $this->table('ne_paragraph_comments')->renameColumn('par_id', 'annot_id')->update(); 29 | } 30 | 31 | public function down() 32 | { 33 | $this->table('ne_tags')->removeColumn('tagset_id')->update(); 34 | $this->table('ne_paragraphs')->removeColumn('tagset_id')->update(); 35 | $this->table('ne_paragraph_comments')->renameColumn('annot_id', 'par_id')->update(); 36 | $this->dropTable('ne_tagsets'); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /migrations/20150314214304_add_tables_for_selective_backup.php: -------------------------------------------------------------------------------- 1 | execute("CREATE TABLE `users_for_selective_backup` ( 10 | `user_id` smallint(5) unsigned NOT NULL AUTO_INCREMENT, 11 | `user_name` varchar(120) NOT NULL, 12 | `user_passwd` varchar(32) NOT NULL, 13 | `user_email` varchar(100) NOT NULL, 14 | `user_reg` int(10) unsigned NOT NULL, 15 | `user_shown_name` varchar(120) NOT NULL, 16 | `user_team` smallint(5) unsigned NOT NULL, 17 | `user_level` tinyint(3) unsigned NOT NULL, 18 | `user_shown_level` tinyint(3) unsigned NOT NULL, 19 | `user_rating10` int(10) unsigned NOT NULL, 20 | `show_game` tinyint(3) unsigned NOT NULL, 21 | PRIMARY KEY (`user_id`), 22 | KEY `user_team` (`user_team`), 23 | KEY `user_rating10` (`user_rating10`) 24 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;"); 25 | 26 | $this->execute("CREATE TABLE `user_tokens_for_selective_backup` ( 27 | `user_id` SMALLINT(5) UNSIGNED NOT NULL, 28 | `token` INT(10) UNSIGNED NOT NULL, 29 | `timestamp` INT(10) UNSIGNED NOT NULL, 30 | KEY `user_id` (`user_id`) 31 | ) ENGINE=INNODB DEFAULT CHARSET=utf8;"); 32 | } 33 | 34 | public function down() 35 | { 36 | $this->dropTable("users_for_selective_backup"); 37 | $this->dropTable("user_tokens_for_selective_backup"); 38 | } 39 | } 
-------------------------------------------------------------------------------- /perl/lib/OpenCorpora/Dict/Entry.pm: -------------------------------------------------------------------------------- 1 | package OpenCorpora::Dict::Entry; 2 | 3 | use strict; 4 | use warnings; 5 | use utf8; 6 | 7 | our $VERSION = "0.01"; 8 | 9 | 10 | 11 | 12 | sub new { 13 | my($class, %args) = @_; 14 | 15 | my $self = bless({}, $class); 16 | 17 | # if (exists $args{handlers}) { 18 | # my $handlers = $args{handlers}; 19 | # $self->{handler_lemma} = exists $handlers->{lemma} ? $handlers->{lemma} : \&nop_function; 20 | # } 21 | 22 | $self->{lemma}->{text} = undef; 23 | $self->{lemma}->{gram} = {}; 24 | $self->{forms} = []; 25 | 26 | return $self; 27 | } 28 | 29 | sub lemma_text { 30 | my $self = shift; 31 | if (@_) { 32 | my $text = shift; 33 | $self->{lemma}->{text} = $text; 34 | } 35 | 36 | return $self->{lemma}->{text}; 37 | } 38 | 39 | sub lemma_gram_add { 40 | my $self = shift; 41 | my $gram = shift; 42 | $self->{lemma}->{gram}->{$gram} = 1; 43 | } 44 | 45 | sub lemma_gram { 46 | my $self = shift; 47 | my $gram = shift; 48 | if (exists $self->{lemma}->{gram}->{$gram}) { 49 | return 1; 50 | } 51 | return 0; 52 | } 53 | 54 | sub add_form { 55 | my $self = shift; 56 | my $text = shift; 57 | push @{$self->{forms}}, {form => $text}; 58 | return $#{$self->{forms}}; 59 | } 60 | 61 | sub add_form_gram { 62 | my $self = shift; 63 | my ($fid, $gram) = @_; 64 | $self->{forms}->[$fid]->{gram}->{$gram} = 1; 65 | } 66 | 67 | sub get_form_ids { 68 | my $self = shift; 69 | return map {$_} 0..$#{$self->{forms}}; 70 | } 71 | 72 | sub get_form_text { 73 | my ($self, $fid) = @_; 74 | return $self->{forms}->[$fid]->{form}; 75 | } 76 | 77 | sub get_form_grams { 78 | my ($self, $fid) = @_; 79 | return sort keys %{$self->{forms}->[$fid]->{gram}}; 80 | } 81 | 82 | 1; 83 | -------------------------------------------------------------------------------- /postagging/brill/unsupervised/perl/diff_tab_markup.pl: 
# Compare the "before" and "after" versions of line-oriented markup files.
#
# Usage: diff_tab_markup.pl PREFIX [PREFIX ...]
#
# For every PREFIX the files "PREFIX.orig" and "PREFIX.final" are loaded and
# compared record by record (a record is one non-empty line).  One summary
# line per PREFIX is written to STDOUT:
#
#     PREFIX <records> <differing records> <differing records / records>
#
# Every differing pair is echoed to STDERR as: old line, new line, blank line.
use strict;
use warnings;

# Run the driver only when executed as a script, so that the subs below can
# also be loaded and exercised on their own.
main(@ARGV) unless caller;

# main(@prefixes)
#   Loads every PREFIX.orig / PREFIX.final pair first, then compares and
#   reports them in order.  Loading everything up front means an unreadable
#   file aborts the run before any summary line is printed.
sub main {
    my @prefixes = @_;

    my @files;
    for my $prefix (@prefixes) {
        push @files, {
            before => load_tab_markup($prefix . ".orig"),
            after  => load_tab_markup($prefix . ".final"),
        };
    }

    for my $i (0 .. $#files) {
        my %cmp_result = compare_markup($files[$i]{before}, $files[$i]{after});

        # Report the real number of records (not the last index), i.e. the
        # same total that ndiff_percent is computed over.
        print join(' ',
            $prefixes[$i],
            scalar @{ $files[$i]{before} },
            $cmp_result{ndiff},
            $cmp_result{ndiff_percent},
        ) . "\n";

        my ($old_lines, $new_lines) = @{ $cmp_result{diff} }{ 1, 2 };
        for my $j (0 .. $#{$old_lines}) {
            print STDERR $old_lines->[$j] . "\n";
            print STDERR $new_lines->[$j] . "\n";
            print STDERR "\n";
        }
    }

    return;
}

# compare_markup($rm1, $rm2)
#   $rm1, $rm2 - array refs of records as produced by load_tab_markup();
#                only the {str} field of each record is compared.
#   Dies when the two lists hold a different number of records.
#   Returns a hash (as a list) with the keys:
#     ndiff         - number of positions whose {str} values differ
#     ndiff_percent - ndiff divided by the number of records, as a fraction
#                     in 0..1 (0 when there are no records at all)
#     diff          - { 1 => [differing lines of $rm1],
#                       2 => [matching lines of $rm2] }
sub compare_markup {
    my ($rm1, $rm2) = @_;

    my $total = scalar @{$rm1};
    if ($total != scalar @{$rm2}) {
        die "$total != " . scalar(@{$rm2});
    }

    # Both lists always exist, so callers can iterate without checks.
    my %r = (diff => { 1 => [], 2 => [] });

    my $n = 0;
    for my $i (0 .. $total - 1) {
        next if $rm1->[$i]{str} eq $rm2->[$i]{str};

        $n += 1;
        push @{ $r{diff}{1} }, $rm1->[$i]{str};
        push @{ $r{diff}{2} }, $rm2->[$i]{str};
    }

    $r{ndiff} = $n;
    # Divide by the record count; the guard keeps empty input from dying
    # with a division by zero.
    $r{ndiff_percent} = $total ? $n / $total : 0;

    return %r;
}

# load_tab_markup($fn)
#   Reads file $fn and returns an array ref with one hash ref per non-empty
#   line:
#     str - the line without its trailing newline
#     tid - the leading run of digits, set only when the line starts with a
#           digit (presumably a token id -- confirm against the file format)
#   Dies when the file cannot be opened.
sub load_tab_markup {
    my ($fn) = @_;
    my @m;

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode, and the handle is private to this call.
    open(my $fh, '<', $fn) or die "can't open $fn: $!";
    while (my $line = <$fh>) {
        chomp $line;
        next if $line eq '';

        my %h = (str => $line);
        if ($line =~ /^(\d+)/) {
            $h{tid} = $1;
        }

        push @m, \%h;
    }
    close($fh);

    return \@m;
}