├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature-request.md ├── .gitignore ├── .readthedocs.yml ├── CNAME ├── Jenkinsfile ├── LICENSE ├── MANIFEST.in ├── NLP_OSS_DeepPavlov_ACL_Demo_final.pdf ├── README.md ├── _config.yml ├── _layouts └── default.html ├── deeppavlov ├── __init__.py ├── __main__.py ├── _meta.py ├── configs │ ├── __init__.py │ ├── classifiers │ │ ├── boolqa_rubert.json │ │ ├── few_shot_roberta.json │ │ ├── glue │ │ │ ├── glue_cola_roberta.json │ │ │ ├── glue_mnli_cased_bert_torch.json │ │ │ ├── glue_mnli_mm_cased_bert_torch.json │ │ │ ├── glue_mnli_roberta.json │ │ │ ├── glue_mrpc_roberta.json │ │ │ ├── glue_qnli_roberta.json │ │ │ ├── glue_qqp_roberta.json │ │ │ ├── glue_rte_cased_bert_torch.json │ │ │ ├── glue_rte_roberta_mnli.json │ │ │ ├── glue_sst2_roberta.json │ │ │ ├── glue_stsb_roberta.json │ │ │ └── glue_wnli_roberta.json │ │ ├── insults_kaggle_bert.json │ │ ├── paraphraser_convers_distilrubert_2L.json │ │ ├── paraphraser_convers_distilrubert_6L.json │ │ ├── paraphraser_rubert.json │ │ ├── query_pr.json │ │ ├── rusentiment_bert.json │ │ ├── rusentiment_convers_bert.json │ │ ├── rusentiment_convers_distilrubert_2L.json │ │ ├── rusentiment_convers_distilrubert_6L.json │ │ ├── sentiment_sst_conv_bert.json │ │ ├── sentiment_twitter.json │ │ ├── superglue │ │ │ ├── superglue_boolq_roberta_mnli.json │ │ │ ├── superglue_copa_roberta.json │ │ │ ├── superglue_record_roberta.json │ │ │ └── superglue_wic_bert.json │ │ └── topics_distilbert_base_uncased.json │ ├── doc_retrieval │ │ ├── en_ranker_pop_wiki.json │ │ ├── en_ranker_tfidf_wiki.json │ │ └── ru_ranker_tfidf_wiki.json │ ├── embedder │ │ ├── bert_embedder.json │ │ └── bert_sentence_embedder.json │ ├── entity_extraction │ │ ├── entity_detection_en.json │ │ ├── entity_detection_ru.json │ │ ├── entity_extraction_en.json │ │ ├── entity_extraction_ru.json │ │ ├── entity_linking_en.json │ │ └── entity_linking_ru.json │ ├── faq │ │ └── fasttext_logreg.json │ ├── kbqa │ │ ├── kbqa_cq_en.json │ │ ├── kbqa_cq_ru.json │ │ └── wiki_parser.json │ ├── morpho_syntax_parser │ │ ├── morpho_ru_syntagrus_bert.json │ │ ├── ru_syntagrus_joint_parsing.json │ │ └── syntax_ru_syntagrus_bert.json │ ├── multitask │ │ ├── mt_glue.json │ │ └── multitask_example.json │ ├── ner │ │ ├── ner_bert_base.json │ │ ├── ner_case_agnostic_mdistilbert.json │ │ ├── ner_collection3_bert.json │ │ ├── ner_conll2003_bert.json │ │ ├── ner_conll2003_deberta_crf.json │ │ ├── ner_ontonotes_bert.json │ │ ├── ner_ontonotes_bert_mult.json │ │ ├── ner_ontonotes_deberta_crf.json │ │ ├── ner_rus_bert.json │ │ ├── ner_rus_bert_probas.json │ │ ├── ner_rus_convers_distilrubert_2L.json │ │ └── ner_rus_convers_distilrubert_6L.json │ ├── odqa │ │ ├── en_odqa_infer_wiki.json │ │ ├── en_odqa_pop_infer_wiki.json │ │ └── ru_odqa_infer_wiki.json │ ├── ranking │ │ ├── path_ranking_nll_roberta_en.json │ │ ├── ranking_ubuntu_v2_torch_bert_uncased.json │ │ ├── rel_ranking_nll_bert_ru.json │ │ └── rel_ranking_roberta_en.json │ ├── regressors │ │ └── translation_ranker.json │ ├── relation_extraction │ │ ├── re_docred.json │ │ └── re_rured.json │ ├── russian_super_glue │ │ ├── russian_superglue_danetqa_rubert.json │ │ ├── russian_superglue_lidirus_rubert.json │ │ ├── russian_superglue_muserc_rubert.json │ │ ├── russian_superglue_parus_rubert.json │ │ ├── russian_superglue_rcb_rubert.json │ │ ├── russian_superglue_rucos_rubert.json │ │ ├── russian_superglue_russe_rubert.json │ │ ├── russian_superglue_rwsd_rubert.json │ │ └── russian_superglue_terra_rubert.json 
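Each JSON file under `deeppavlov/configs` above is a complete, self-contained pipeline definition that can be referenced by its bare file name. A minimal usage sketch, not part of the repository itself: `insults_kaggle_bert` is one of the classifier configs listed above, and the exact labels returned depend on the model.

```python
from deeppavlov import build_model

# install=True pulls the config's per-component requirements;
# download=True fetches the pretrained weights and data declared
# in the config's metadata.download section.
model = build_model('insults_kaggle_bert', install=True, download=True)

print(model(['you are stupid!', 'have a great day']))
```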
│ ├── sentence_segmentation │ │ └── sentseg_dailydialog_bert.json │ ├── spelling_correction │ │ ├── brillmoore_wikitypos_en.json │ │ └── levenshtein_corrector_ru.json │ └── squad │ │ ├── qa_multisberquad_bert.json │ │ ├── qa_nq_psgcls_bert.json │ │ ├── qa_squad2_bert.json │ │ ├── squad_bert.json │ │ ├── squad_ru_bert.json │ │ ├── squad_ru_convers_distilrubert_2L.json │ │ └── squad_ru_convers_distilrubert_6L.json ├── core │ ├── __init__.py │ ├── commands │ │ ├── __init__.py │ │ ├── infer.py │ │ ├── train.py │ │ └── utils.py │ ├── common │ │ ├── __init__.py │ │ ├── aliases.py │ │ ├── base.py │ │ ├── chainer.py │ │ ├── cross_validation.py │ │ ├── errors.py │ │ ├── file.py │ │ ├── log.py │ │ ├── log_events.py │ │ ├── metrics_registry.json │ │ ├── metrics_registry.py │ │ ├── params.py │ │ ├── params_search.py │ │ ├── paths.py │ │ ├── prints.py │ │ ├── registry.json │ │ ├── registry.py │ │ └── requirements_registry.json │ ├── data │ │ ├── __init__.py │ │ ├── data_fitting_iterator.py │ │ ├── data_learning_iterator.py │ │ ├── dataset_reader.py │ │ ├── simple_vocab.py │ │ └── utils.py │ ├── models │ │ ├── __init__.py │ │ ├── component.py │ │ ├── estimator.py │ │ ├── nn_model.py │ │ ├── serializable.py │ │ └── torch_model.py │ └── trainers │ │ ├── __init__.py │ │ ├── fit_trainer.py │ │ ├── nn_trainer.py │ │ ├── torch_trainer.py │ │ └── utils.py ├── dataset_iterators │ ├── __init__.py │ ├── basic_classification_iterator.py │ ├── huggingface_dataset_iterator.py │ ├── morphotagger_iterator.py │ ├── multitask_iterator.py │ ├── siamese_iterator.py │ ├── sqlite_iterator.py │ ├── squad_iterator.py │ └── typos_iterator.py ├── dataset_readers │ ├── __init__.py │ ├── basic_classification_reader.py │ ├── boolqa_reader.py │ ├── conll2003_reader.py │ ├── docred_reader.py │ ├── faq_reader.py │ ├── huggingface_dataset_reader.py │ ├── imdb_reader.py │ ├── line_reader.py │ ├── morphotagging_dataset_reader.py │ ├── multitask_reader.py │ ├── odqa_reader.py │ ├── paraphraser_reader.py │ ├── rel_ranking_reader.py │ ├── rured_reader.py │ ├── sq_reader.py │ ├── squad_dataset_reader.py │ ├── typos_reader.py │ └── ubuntu_v2_reader.py ├── deep.py ├── download.py ├── metrics │ ├── __init__.py │ ├── accuracy.py │ ├── bleu.py │ ├── correlation.py │ ├── elmo_metrics.py │ ├── fmeasure.py │ ├── google_bleu.py │ ├── log_loss.py │ ├── mse.py │ ├── recall_at_k.py │ ├── record_metrics.py │ ├── roc_auc_score.py │ └── squad_metrics.py ├── models │ ├── __init__.py │ ├── api_requester │ │ ├── __init__.py │ │ ├── api_requester.py │ │ └── api_router.py │ ├── classifiers │ │ ├── __init__.py │ │ ├── cos_sim_classifier.py │ │ ├── dnnc_proba2labels.py │ │ ├── proba2labels.py │ │ ├── re_bert.py │ │ ├── torch_classification_model.py │ │ ├── torch_nets.py │ │ └── utils.py │ ├── doc_retrieval │ │ ├── __init__.py │ │ ├── bpr.py │ │ ├── logit_ranker.py │ │ ├── pop_ranker.py │ │ ├── tfidf_ranker.py │ │ └── utils.py │ ├── embedders │ │ ├── __init__.py │ │ ├── abstract_embedder.py │ │ ├── fasttext_embedder.py │ │ ├── tfidf_weighted_embedder.py │ │ └── transformers_embedder.py │ ├── entity_extraction │ │ ├── __init__.py │ │ ├── entity_detection_parser.py │ │ ├── entity_linking.py │ │ ├── find_word.py │ │ └── ner_chunker.py │ ├── kbqa │ │ ├── __init__.py │ │ ├── query_generator.py │ │ ├── query_generator_base.py │ │ ├── rel_ranking_infer.py │ │ ├── ru_adj_to_noun.py │ │ ├── sentence_answer.py │ │ ├── template_matcher.py │ │ ├── tree_to_sparql.py │ │ ├── type_define.py │ │ ├── utils.py │ │ └── wiki_parser.py │ ├── morpho_syntax_parser │ │ ├── __init__.py │ 
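The `core` and `dataset_*` packages above implement the three top-level blocks every config file is assembled from: a `dataset_reader`, a `dataset_iterator`, and a `chainer` of registered components. Because `build_model` accepts a plain dict as well as a config path, a pipeline can also be sketched inline. An illustrative example only, not a shipped config (`str_lower` is a registered preprocessor from `deeppavlov/models/preprocessors/str_lower.py`):

```python
from deeppavlov import build_model

# A chainer with a single registered component: data flows from the
# chainer's "in" names through each pipe element to its "out" names.
config = {
    'chainer': {
        'in': ['text'],
        'pipe': [
            {'class_name': 'str_lower', 'in': ['text'], 'out': ['text_lower']}
        ],
        'out': ['text_lower'],
    }
}

lower = build_model(config)
print(lower(['Hello World']))  # -> ['hello world']
```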
│ ├── dependency_decoding.py │ │ ├── joint.py │ │ ├── spacy_lemmatizer.py │ │ └── syntax_parsing.py │ ├── preprocessors │ │ ├── __init__.py │ │ ├── dirty_comments_preprocessor.py │ │ ├── dnnc_preprocessor.py │ │ ├── mask.py │ │ ├── multitask_preprocessor.py │ │ ├── ner_preprocessor.py │ │ ├── odqa_preprocessors.py │ │ ├── one_hotter.py │ │ ├── re_preprocessor.py │ │ ├── response_base_loader.py │ │ ├── sanitizer.py │ │ ├── sentseg_preprocessor.py │ │ ├── squad_preprocessor.py │ │ ├── str_lower.py │ │ ├── str_token_reverser.py │ │ ├── str_utf8_encoder.py │ │ ├── torch_transformers_preprocessor.py │ │ └── transformers_preprocessor.py │ ├── ranking │ │ ├── __init__.py │ │ └── metrics.py │ ├── relation_extraction │ │ ├── __init__.py │ │ ├── losses.py │ │ └── relation_extraction_bert.py │ ├── sklearn │ │ ├── __init__.py │ │ └── sklearn_component.py │ ├── spelling_correction │ │ ├── __init__.py │ │ ├── brillmoore │ │ │ ├── __init__.py │ │ │ └── error_model.py │ │ ├── electors │ │ │ ├── __init__.py │ │ │ ├── kenlm_elector.py │ │ │ └── top1_elector.py │ │ └── levenshtein │ │ │ ├── __init__.py │ │ │ ├── levenshtein_searcher.py │ │ │ ├── searcher_component.py │ │ │ └── tabled_trie.py │ ├── tokenizers │ │ ├── __init__.py │ │ ├── lazy_tokenizer.py │ │ ├── nltk_moses_tokenizer.py │ │ ├── nltk_tokenizer.py │ │ ├── spacy_tokenizer.py │ │ ├── split_tokenizer.py │ │ └── utils.py │ ├── torch_bert │ │ ├── __init__.py │ │ ├── crf.py │ │ ├── multitask_transformer.py │ │ ├── torch_bert_ranker.py │ │ ├── torch_transformers_classifier.py │ │ ├── torch_transformers_el_ranker.py │ │ ├── torch_transformers_multiplechoice.py │ │ ├── torch_transformers_nll_ranking.py │ │ ├── torch_transformers_sequence_tagger.py │ │ ├── torch_transformers_squad.py │ │ └── torch_transformers_syntax_parser.py │ └── vectorizers │ │ ├── __init__.py │ │ └── hashing_tfidf_vectorizer.py ├── paramsearch.py ├── requirements │ ├── datasets.txt │ ├── dependency_decoding.txt │ ├── en_core_web_sm.txt │ ├── faiss.txt │ ├── fasttext.txt │ ├── hdt.txt │ ├── kenlm.txt │ ├── lxml.txt │ ├── opt_einsum.txt │ ├── protobuf.txt │ ├── pytorch.txt │ ├── rapidfuzz.txt │ ├── razdel.txt │ ├── ru_core_news_sm.txt │ ├── sacremoses.txt │ ├── sentencepiece.txt │ ├── slovnet.txt │ ├── sortedcontainers.txt │ ├── torchcrf.txt │ ├── transformers.txt │ ├── udapi.txt │ └── whapi.txt ├── settings.py ├── utils │ ├── __init__.py │ ├── benchmarks │ │ ├── __init__.py │ │ └── benchmarks.py │ ├── connector │ │ ├── __init__.py │ │ └── dialog_logger.py │ ├── pip_wrapper │ │ ├── __init__.py │ │ └── pip_wrapper.py │ ├── server │ │ ├── __init__.py │ │ ├── metrics.py │ │ └── server.py │ ├── settings │ │ ├── __init__.py │ │ ├── dialog_logger_config.json │ │ ├── log_config.json │ │ └── server_config.json │ └── socket │ │ ├── __init__.py │ │ └── socket.py └── vocabs │ ├── __init__.py │ ├── typos.py │ └── wiki_sqlite.py ├── docs ├── Makefile ├── _static │ ├── aws_ec2 │ │ ├── 01_login_to_aws.png │ │ ├── 02_choose_ubuntu.png │ │ ├── 03_select_instance_type.png │ │ ├── 04_add_storage.png │ │ ├── 05_review_instance.png │ │ ├── 06_go_to_running_instances.png │ │ ├── 07_wait_init.png │ │ ├── 08_01_set_sec_group.png │ │ ├── 08_02_set_inbound.png │ │ ├── 09_01_select_connect.png │ │ └── 09_02_connection_info.png │ ├── deeppavlov.css │ ├── deeppavlov.png │ ├── deeppavlov_logo.png │ ├── dp_agnt_diag.png │ ├── gobot_diagram.png │ ├── ipavlov_footer.png │ ├── kvret_diagram.png │ ├── my_blocks.css │ ├── social │ │ ├── Medium_Monogram.svg │ │ ├── Twitter_Social_Icon_Circle_Color.svg │ │ ├── telegram.png 
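`deeppavlov/utils/server` and `deeppavlov/utils/socket` above expose any built pipeline over REST or a raw socket, configured by `deeppavlov/utils/settings/server_config.json`. A hedged sketch of launching the documented `riseapi` entry point and querying it; the payload keys must match the chosen config's `chainer.in` names, and using `x` here is an assumption that holds for the NER configs:

```python
import subprocess

import requests

# Equivalent to the documented CLI:
#   python -m deeppavlov riseapi ner_ontonotes_bert -p 5000 -d
# (-d downloads the model files on the first run).
server = subprocess.Popen(
    ['python', '-m', 'deeppavlov', 'riseapi', 'ner_ontonotes_bert', '-p', '5000', '-d'])

# Once the server reports it is up, POST a batch to the /model endpoint.
response = requests.post('http://127.0.0.1:5000/model',
                         json={'x': ['Bob Ross lived in Florida']})
print(response.json())

server.terminate()
```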
│ │ └── youtube_social_circle_red.png │ └── tree.png ├── _templates │ └── footer.html ├── apiref │ ├── core.rst │ ├── core │ │ ├── commands.rst │ │ ├── common.rst │ │ ├── data.rst │ │ ├── models.rst │ │ └── trainers.rst │ ├── dataset_iterators.rst │ ├── dataset_readers.rst │ ├── metrics.rst │ ├── models.rst │ ├── models │ │ ├── api_requester.rst │ │ ├── classifiers.rst │ │ ├── doc_retrieval.rst │ │ ├── embedders.rst │ │ ├── entity_extraction.rst │ │ ├── kbqa.rst │ │ ├── preprocessors.rst │ │ ├── relation_extraction.rst │ │ ├── sklearn.rst │ │ ├── spelling_correction.rst │ │ ├── tokenizers.rst │ │ ├── torch_bert.rst │ │ └── vectorizers.rst │ └── vocabs.rst ├── conf.py ├── devguides │ ├── contribution_guide.rst │ └── registry.rst ├── features │ ├── hypersearch.rst │ ├── models │ │ ├── KBQA.ipynb │ │ ├── NER.ipynb │ │ ├── ODQA.ipynb │ │ ├── SQuAD.ipynb │ │ ├── bert.rst │ │ ├── classification.ipynb │ │ ├── entity_extraction.ipynb │ │ ├── few_shot_classification.ipynb │ │ ├── morpho_tagger.ipynb │ │ ├── multitask_bert.rst │ │ ├── neural_ranking.ipynb │ │ ├── popularity_ranking.rst │ │ ├── relation_extraction.ipynb │ │ ├── spelling_correction.ipynb │ │ ├── superglue.rst │ │ ├── syntax_parser.ipynb │ │ └── tfidf_ranking.ipynb │ ├── overview.rst │ └── pretrained_vectors.rst ├── index.rst ├── integrations │ ├── aws_ec2.rst │ ├── rest_api.rst │ ├── settings.rst │ └── socket_api.rst ├── internships │ └── internships.rst └── intro │ ├── configuration.rst │ ├── installation.rst │ ├── overview.rst │ ├── python.ipynb │ └── quick_start.rst ├── requirements.txt ├── setup.py ├── tests ├── __init__.py ├── test_configs │ └── doc_retrieval │ │ ├── en_ranker_pop_wiki_test.json │ │ ├── en_ranker_tfidf_wiki_test.json │ │ └── ru_ranker_tfidf_wiki_test.json └── test_quick_start.py └── utils ├── Docker ├── Dockerfile ├── README.md ├── cmd.sh └── docker-compose.yml ├── __init__.py └── prepare ├── __init__.py ├── hashes.py ├── optimize_ipynb.py ├── registry.py └── upload.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report on a bug you encountered 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first. 11 | 12 | Please enter all the information below, otherwise your issue may be closed without a warning. 
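The snippet below collects everything the fields below ask for in one go (a convenience sketch; running `pip show deeppavlov` in a shell works equally well):

```python
import platform
import sys

import deeppavlov

# Prints the version, Python and OS details requested by this template.
print('DeepPavlov version:', deeppavlov.__version__)
print('Python version:', sys.version)
print('Operating system:', platform.platform())
```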
13 | 14 | 15 | **DeepPavlov version** (you can look it up by running `pip show deeppavlov`): 16 | 17 | **Python version**: 18 | 19 | **Operating system** (ubuntu linux, windows, ...): 20 | 21 | **Issue**: 22 | 23 | 24 | **Content or a name of a configuration file**: 25 | ``` 26 | 27 | ``` 28 | 29 | 30 | **Command that led to error**: 31 | ``` 32 | 33 | ``` 34 | 35 | **Error (including full traceback)**: 36 | ``` 37 | 38 | ``` 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Ask a question 4 | url: https://forum.deeppavlov.ai/ 5 | about: If you have a different question, please ask it in the forum https://forum.deeppavlov.ai 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest a feature to improve the DeepPavlov library 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first. 11 | 12 | 13 | **What problem are we trying to solve?**: 14 | ``` 15 | 16 | ``` 17 | 18 | **How can we solve it?**: 19 | ``` 20 | 21 | ``` 22 | 23 | **Are there other issues that block this solution?**: 24 | ``` 25 | 26 | ``` 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | #IDEA 104 | .idea/ 105 | 106 | #Atom IDE 107 | .ftpconfig 108 | 109 | #vscode IDE 110 | .vscode 111 | 112 | # Vim 113 | *.vim 114 | *.vimrc 115 | 116 | #GIT 117 | .git/ 118 | 119 | #Default usr dir 120 | download/ 121 | 122 | #project test 123 | /test/ 124 | .pytest_cache 125 | 126 | # project data 127 | /data/ 128 | 129 | # local dockerfiles 130 | /Dockerfile 131 | /entrypoint.sh 132 | /.dockerignore 133 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | version: 2 3 | 4 | build: 5 | os: "ubuntu-20.04" 6 | tools: 7 | python: "3.10" 8 | formats: [] 9 | 10 | python: 11 | install: 12 | - method: pip 13 | path: . 14 | extra_requirements: 15 | - docs 16 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | deeppavlov.ai -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | node('cuda-module') { 2 | timestamps { 3 | try { 4 | stage('Clean') { 5 | sh "rm -rf .[^.] 
.??* *" 6 | } 7 | stage('Checkout') { 8 | checkout scm 9 | } 10 | stage('Setup') { 11 | env.TFHUB_CACHE_DIR="tfhub_cache" 12 | sh """ 13 | EPOCH=\$(date +%s) docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG build 14 | """ 15 | } 16 | stage('Tests') { 17 | sh """ 18 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py36 py37 19 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 20 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py38 py39 21 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 22 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310 py311 23 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 || exit 0 24 | """ 25 | currentBuild.result = 'SUCCESS' 26 | } 27 | } 28 | catch(e) { 29 | currentBuild.result = 'FAILURE' 30 | throw e 31 | } 32 | finally { 33 | sh """ 34 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG rm -f 35 | docker network rm \$(echo $BUILD_TAG | awk '{print tolower(\$0)}')_default 36 | """ 37 | emailext to: "\${DEFAULT_RECIPIENTS}", 38 | subject: "${env.JOB_NAME} - Build # ${currentBuild.number} - ${currentBuild.result}!", 39 | body: '${BRANCH_NAME} - ${BUILD_URL}', 40 | attachLog: true 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include requirements.txt 4 | include deeppavlov/requirements/*.txt 5 | recursive-include deeppavlov *.json 6 | recursive-include deeppavlov *.md 7 | -------------------------------------------------------------------------------- /NLP_OSS_DeepPavlov_ACL_Demo_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/NLP_OSS_DeepPavlov_ACL_Demo_final.pdf -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-leap-day 2 | google_analytics: UA-139843736-5 3 | include: 4 | - _static 5 | -------------------------------------------------------------------------------- /deeppavlov/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | import sys 16 | from pathlib import Path 17 | from typing import Union 18 | from ._meta import __author__, __description__, __email__, __keywords__, __license__, __version__ 19 | from .configs import configs 20 | from .core.commands.infer import build_model 21 | from .core.commands.train import train_evaluate_model_from_config 22 | from .core.common.base import Element, Model 23 | from .core.common.chainer import Chainer 24 | from .core.common.log import init_logger 25 | from .download import deep_download 26 | 27 | 28 | # TODO: make better 29 | def train_model(config: Union[str, Path, dict], install: bool = False, 30 | download: bool = False, recursive: bool = False) -> Chainer: 31 | train_evaluate_model_from_config(config, install=install, download=download, recursive=recursive) 32 | return build_model(config, load_trained=True) 33 | 34 | 35 | def evaluate_model(config: Union[str, Path, dict], install: bool = False, 36 | download: bool = False, recursive: bool = False) -> dict: 37 | return train_evaluate_model_from_config(config, to_train=False, install=install, 38 | download=download, recursive=recursive) 39 | 40 | 41 | # check version 42 | assert sys.hexversion >= 0x3060000, 'Does not work in python3.5 or lower' 43 | 44 | # resolve conflicts with previous DeepPavlov installations versioned up to 0.0.9 45 | dot_dp_path = Path('~/.deeppavlov').expanduser().resolve() 46 | if dot_dp_path.is_file(): 47 | dot_dp_path.unlink() 48 | 49 | # initiate logging 50 | init_logger() 51 | -------------------------------------------------------------------------------- /deeppavlov/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == '__main__': 2 | from .deep import main 3 | 4 | main() 5 | -------------------------------------------------------------------------------- /deeppavlov/_meta.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.7.0' 2 | __author__ = 'Neural Networks and Deep Learning lab, MIPT' 3 | __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
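The functions defined in `deeppavlov/__init__.py` above are the library's main Python entry points. A minimal sketch of the intended flow (config names resolve as in `deeppavlov/configs`; checkpoints go to the paths set in the config's `metadata.variables`):

```python
from deeppavlov import build_model, evaluate_model, train_model

# Fine-tune a config end to end, then reload the trained weights
# for inference (train_model above does exactly these two steps).
model = train_model('insults_kaggle_bert', install=True, download=True)
print(model(['some comment to score']))

# Score an already-trained model on the config's evaluation_targets.
metrics = evaluate_model('insults_kaggle_bert')
print(metrics)
```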
4 | __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'] 5 | __license__ = 'Apache License, Version 2.0' 6 | __email__ = 'info@deeppavlov.ai' 7 | -------------------------------------------------------------------------------- /deeppavlov/configs/classifiers/boolqa_rubert.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "boolqa_reader", 4 | "data_path": "{DOWNLOADS_PATH}/boolqa_data", 5 | "language": "ru" 6 | }, 7 | "dataset_iterator": { 8 | "class_name": "basic_classification_iterator", 9 | "seed": 243 10 | }, 11 | "chainer": { 12 | "in": ["text_a", "text_b"], 13 | "in_y": ["y"], 14 | "pipe": [ 15 | { 16 | "class_name": "torch_transformers_preprocessor", 17 | "vocab_file": "{TRANSFORMER}", 18 | "do_lower_case": false, 19 | "max_seq_length": 128, 20 | "in": ["text_a", "text_b"], 21 | "out": ["bert_features"] 22 | }, 23 | { 24 | "class_name": "torch_transformers_classifier", 25 | "n_classes": 2, 26 | "pretrained_bert": "{TRANSFORMER}", 27 | "save_path": "{MODELS_PATH}/boolqa_rubert/model_rubert", 28 | "load_path": "{MODELS_PATH}/boolqa_rubert/model_rubert", 29 | "optimizer": "AdamW", 30 | "optimizer_parameters": {"lr": 2e-05}, 31 | "learning_rate_drop_patience": 3, 32 | "learning_rate_drop_div": 2.0, 33 | "in": ["bert_features"], 34 | "in_y": ["y"], 35 | "out": ["predictions"] 36 | } 37 | ], 38 | "out": ["predictions"] 39 | }, 40 | "train": { 41 | "epochs": 50, 42 | "batch_size": 32, 43 | "train_metrics": ["f1", "acc"], 44 | "metrics": ["f1", "acc"], 45 | "validation_patience": 5, 46 | "val_every_n_epochs": 1, 47 | "log_every_n_epochs": 1, 48 | "evaluation_targets": ["valid", "train"], 49 | "show_examples": false, 50 | "class_name": "torch_trainer" 51 | }, 52 | "metadata": { 53 | "variables": { 54 | "ROOT_PATH": "~/.deeppavlov", 55 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 56 | "MODELS_PATH": "{ROOT_PATH}/models", 57 | "TRANSFORMER": "DeepPavlov/rubert-base-cased" 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /deeppavlov/configs/classifiers/few_shot_roberta.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["texts", "dataset"], 4 | "in_y": ["y_true"], 5 | "pipe": [ 6 | { 7 | "class_name": "dnnc_pair_generator", 8 | "in": ["texts", "dataset"], 9 | "out": ["x", "x_support", "x_populated", "y_support"], 10 | "bidirectional": true 11 | }, 12 | { 13 | "class_name": "torch_transformers_preprocessor", 14 | "in": ["x_populated", "x_support"], 15 | "out": ["bert_features"], 16 | "vocab_file": "{BASE_MODEL}", 17 | "do_lower_case": true, 18 | "max_seq_length": 128 19 | }, 20 | { 21 | "class_name": "torch_transformers_classifier", 22 | "main": true, 23 | "in": ["bert_features"], 24 | "out": ["simmilarity_scores"], 25 | "n_classes": 2, 26 | "return_probas": true, 27 | "pretrained_bert": "{BASE_MODEL}", 28 | "save_path": "{MODEL_PATH}/model", 29 | "load_path": "{MODEL_PATH}/model", 30 | "is_binary": "{BINARY_CLASSIFICATION}" 31 | }, 32 | { 33 | "class_name": "dnnc_proba2labels", 34 | "is_binary": "{BINARY_CLASSIFICATION}", 35 | "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"], 36 | "out": ["y_pred"], 37 | "confidence_threshold": 0.0 38 | } 39 | ], 40 | "out": ["y_pred"] 41 | }, 42 | "metadata": { 43 | "variables": { 44 | "ROOT_PATH": "~/.deeppavlov", 45 | "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10", 46 | "BINARY_CLASSIFICATION": true, 
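`few_shot_roberta.json` above takes two inputs at inference time: the utterances to classify and a support set of labelled examples to compare them against. A hedged sketch; the support-set layout as `[text, label]` pairs, one set per query, is an assumption here, so check the few-shot classification docs for the exact structure:

```python
from deeppavlov import build_model

model = build_model('few_shot_roberta', download=True)

# Labelled examples the DNNC pair generator populates pairs from.
support = [
    ['please set an alarm for mid day', 'alarm_set'],
    ['will it rain tomorrow', 'weather_query'],
]
texts = ['wake me up at noon']

# The chainer's two inputs ("texts", "dataset") map to two call arguments.
print(model(texts, [support] * len(texts)))
```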
47 | "BASE_MODEL": "roberta-base" 48 | }, 49 | "download": [ 50 | { 51 | "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz", 52 | "subdir": "{MODEL_PATH}" 53 | } 54 | ] 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /deeppavlov/configs/classifiers/glue/glue_stsb_roberta.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "huggingface_dataset_reader", 4 | "path": "{COMPETITION}", 5 | "name": "{TASK}", 6 | "train": "train", 7 | "valid": "validation", 8 | "test": "test" 9 | }, 10 | "dataset_iterator": { 11 | "class_name": "huggingface_dataset_iterator", 12 | "features": ["sentence1", "sentence2"], 13 | "label": "label", 14 | "use_label_name": false, 15 | "seed": 42 16 | }, 17 | "chainer": { 18 | "in": ["sentence1", "sentence2"], 19 | "in_y": ["y"], 20 | "pipe": [ 21 | { 22 | "class_name": "torch_transformers_preprocessor", 23 | "vocab_file": "{BASE_MODEL}", 24 | "do_lower_case": false, 25 | "max_seq_length": 64, 26 | "in": ["sentence1", "sentence2"], 27 | "out": ["bert_features"] 28 | }, 29 | { 30 | "class_name": "torch_transformers_classifier", 31 | "n_classes": 1, 32 | "return_probas": false, 33 | "pretrained_bert": "{BASE_MODEL}", 34 | "save_path": "{MODEL_PATH}/model", 35 | "load_path": "{MODEL_PATH}/model", 36 | "optimizer": "AdamW", 37 | "optimizer_parameters": { 38 | "lr": 2e-05 39 | }, 40 | "learning_rate_drop_patience": 3, 41 | "learning_rate_drop_div": 2.0, 42 | "in": ["bert_features"], 43 | "in_y": ["y"], 44 | "out": ["y_pred"] 45 | } 46 | ], 47 | "out": ["y_pred"] 48 | }, 49 | "train": { 50 | "batch_size": 32, 51 | "metrics": [ 52 | "pearson_correlation", 53 | "spearman_correlation" 54 | ], 55 | "validation_patience": 10, 56 | "val_every_n_epochs": 1, 57 | "log_every_n_epochs": 1, 58 | "show_examples": false, 59 | "evaluation_targets": ["train", "valid"], 60 | "class_name": "torch_trainer", 61 | "tensorboard_log_dir": "{MODEL_PATH}/", 62 | "pytest_max_batches": 2 63 | }, 64 | "metadata": { 65 | "variables": { 66 | "BASE_MODEL": "roberta-large", 67 | "ROOT_PATH": "~/.deeppavlov", 68 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 69 | "MODELS_PATH": "{ROOT_PATH}/models", 70 | "COMPETITION": "glue", 71 | "TASK": "stsb", 72 | "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}" 73 | }, 74 | "download": [ 75 | { 76 | "url": "http://files.deeppavlov.ai/v1/glue/glue_stsb_roberta.tar.gz", 77 | "subdir": "{MODEL_PATH}" 78 | } 79 | ] 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /deeppavlov/configs/classifiers/paraphraser_rubert.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "paraphraser_reader", 4 | "data_path": "{DOWNLOADS_PATH}/paraphraser_data", 5 | "do_lower_case": false 6 | }, 7 | "dataset_iterator": { 8 | "class_name": "siamese_iterator", 9 | "seed": 243, 10 | "len_valid": 500 11 | }, 12 | "chainer": { 13 | "in": ["text_a", "text_b"], 14 | "in_y": ["y"], 15 | "pipe": [ 16 | { 17 | "class_name": "torch_transformers_preprocessor", 18 | "vocab_file": "{TRANSFORMER}", 19 | "do_lower_case": false, 20 | "max_seq_length": 64, 21 | "in": ["text_a", "text_b"], 22 | "out": ["bert_features"] 23 | }, 24 | { 25 | "class_name": "torch_transformers_classifier", 26 | "n_classes": 2, 27 | "pretrained_bert": "{TRANSFORMER}", 28 | "save_path": "{MODEL_PATH}/model", 29 | "load_path": 
"{MODEL_PATH}/model", 30 | "optimizer": "AdamW", 31 | "optimizer_parameters": {"lr": 2e-05}, 32 | "learning_rate_drop_patience": 3, 33 | "learning_rate_drop_div": 2.0, 34 | "in": ["bert_features"], 35 | "in_y": ["y"], 36 | "out": ["predictions"] 37 | } 38 | ], 39 | "out": ["predictions"] 40 | }, 41 | "train": { 42 | "batch_size": 64, 43 | "pytest_max_batches": 2, 44 | "train_metrics": ["f1", "acc"], 45 | "metrics": ["f1", "acc"], 46 | "validation_patience": 7, 47 | "val_every_n_batches": 50, 48 | "log_every_n_batches": 50, 49 | "evaluation_targets": ["valid", "test"], 50 | "class_name": "torch_trainer" 51 | }, 52 | "metadata": { 53 | "variables": { 54 | "ROOT_PATH": "~/.deeppavlov", 55 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 56 | "MODELS_PATH": "{ROOT_PATH}/models", 57 | "MODEL_PATH": "{MODELS_PATH}/classifiers/paraphraser_rubert_torch", 58 | "TRANSFORMER": "DeepPavlov/rubert-base-cased" 59 | }, 60 | "download": [ 61 | { 62 | "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip", 63 | "subdir": "{DOWNLOADS_PATH}/paraphraser_data" 64 | }, 65 | { 66 | "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip", 67 | "subdir": "{DOWNLOADS_PATH}/paraphraser_data" 68 | }, 69 | { 70 | "url": "http://files.deeppavlov.ai/v1/classifiers/paraphraser_rubert/paraphraser_rubert_v1.tar.gz", 71 | "subdir": "{MODEL_PATH}" 72 | } 73 | ] 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /deeppavlov/configs/doc_retrieval/en_ranker_pop_wiki.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "odqa_reader", 4 | "data_path": "{DOWNLOADS_PATH}/odqa/enwiki", 5 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db", 6 | "dataset_format": "wiki" 7 | }, 8 | "dataset_iterator": { 9 | "class_name": "sqlite_iterator", 10 | "shuffle": false, 11 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" 12 | }, 13 | "chainer": { 14 | "in": ["docs"], 15 | "in_y": ["doc_ids", "doc_nums"], 16 | "out": ["pop_doc_ids"], 17 | "pipe": [ 18 | { 19 | "class_name": "hashing_tfidf_vectorizer", 20 | "id": "vectorizer", 21 | "fit_on": ["docs", "doc_ids", "doc_nums"], 22 | "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", 23 | "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", 24 | "tokenizer": { 25 | "class_name": "stream_spacy_tokenizer", 26 | "lemmas": true, 27 | "lowercase": true, 28 | "filter_stopwords": true, 29 | "ngram_range": [1, 3] 30 | } 31 | }, 32 | { 33 | "class_name": "tfidf_ranker", 34 | "top_n": 100, 35 | "in": ["docs"], 36 | "out": ["tfidf_doc_ids", "tfidf_doc_scores"], 37 | "vectorizer": "#vectorizer" 38 | }, 39 | { 40 | "class_name": "pop_ranker", 41 | "pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki_popularities.json", 42 | "load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib", 43 | "top_n": 100, 44 | "in": ["tfidf_doc_ids", "tfidf_doc_scores"], 45 | "out": ["pop_doc_ids", "pop_doc_scores"] 46 | } 47 | ] 48 | }, 49 | "train": { 50 | "batch_size": 10000, 51 | "evaluation_targets": [], 52 | "class_name": "fit_trainer" 53 | }, 54 | "metadata": { 55 | "variables": { 56 | "ROOT_PATH": "~/.deeppavlov", 57 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 58 | "MODELS_PATH": "{ROOT_PATH}/models" 59 | }, 60 | "download": [ 61 | { 62 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz", 63 | "subdir": "{DOWNLOADS_PATH}/odqa" 64 | }, 65 | { 66 | "url": 
"http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz", 67 | "subdir": "{MODELS_PATH}/odqa" 68 | }, 69 | { 70 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz", 71 | "subdir": "{DOWNLOADS_PATH}/odqa" 72 | }, 73 | { 74 | "url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib", 75 | "subdir": "{MODELS_PATH}/odqa" 76 | } 77 | ] 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "odqa_reader", 4 | "data_path": "{DOWNLOADS_PATH}/odqa/enwiki", 5 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db", 6 | "dataset_format": "wiki" 7 | }, 8 | "dataset_iterator": { 9 | "class_name": "sqlite_iterator", 10 | "shuffle": false, 11 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" 12 | }, 13 | "chainer": { 14 | "in": ["docs"], 15 | "in_y": ["doc_ids", "doc_nums"], 16 | "out": ["tfidf_doc_ids"], 17 | "pipe": [ 18 | { 19 | "class_name": "hashing_tfidf_vectorizer", 20 | "id": "vectorizer", 21 | "fit_on": ["docs", "doc_ids", "doc_nums"], 22 | "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", 23 | "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz", 24 | "tokenizer": { 25 | "class_name": "stream_spacy_tokenizer", 26 | "lemmas": true, 27 | "lowercase": true, 28 | "filter_stopwords": true, 29 | "ngram_range": [1, 3] 30 | } 31 | }, 32 | { 33 | "class_name": "tfidf_ranker", 34 | "top_n": 100, 35 | "in": ["docs"], 36 | "out": ["tfidf_doc_ids", "tfidf_doc_scores"], 37 | "vectorizer": "#vectorizer" 38 | } 39 | ] 40 | }, 41 | "train": { 42 | "batch_size": 10000, 43 | "evaluation_targets": [], 44 | "class_name": "fit_trainer" 45 | }, 46 | "metadata": { 47 | "variables": { 48 | "ROOT_PATH": "~/.deeppavlov", 49 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 50 | "MODELS_PATH": "{ROOT_PATH}/models" 51 | }, 52 | "download": [ 53 | { 54 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz", 55 | "subdir": "{DOWNLOADS_PATH}/odqa" 56 | }, 57 | { 58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz", 59 | "subdir": "{MODELS_PATH}/odqa" 60 | } 61 | ] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "odqa_reader", 4 | "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki", 5 | "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db", 6 | "dataset_format": "wiki" 7 | }, 8 | "dataset_iterator": { 9 | "class_name": "sqlite_iterator", 10 | "shuffle": false, 11 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db" 12 | }, 13 | "chainer": { 14 | "in": ["docs"], 15 | "in_y": ["doc_ids", "doc_nums"], 16 | "out": ["tfidf_doc_ids"], 17 | "pipe": [ 18 | { 19 | "class_name": "hashing_tfidf_vectorizer", 20 | "id": "vectorizer", 21 | "fit_on": ["docs", "doc_ids", "doc_nums"], 22 | "save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz", 23 | "load_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz", 24 | "tokenizer": { 25 | "class_name": "stream_spacy_tokenizer", 26 | "spacy_model": "ru_core_news_sm", 27 | "lemmas": true, 28 | "lowercase": true, 29 | 
"filter_stopwords": true, 30 | "ngram_range": [1, 3] 31 | } 32 | }, 33 | { 34 | "class_name": "tfidf_ranker", 35 | "top_n": 100, 36 | "in": ["docs"], 37 | "out": ["tfidf_doc_ids", "tfidf_doc_scores"], 38 | "vectorizer": "#vectorizer" 39 | } 40 | ] 41 | }, 42 | "train": { 43 | "batch_size": 10000, 44 | "evaluation_targets": [], 45 | "class_name": "fit_trainer" 46 | }, 47 | "metadata": { 48 | "variables": { 49 | "ROOT_PATH": "~/.deeppavlov", 50 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 51 | "MODELS_PATH": "{ROOT_PATH}/models" 52 | }, 53 | "download": [ 54 | { 55 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_par_page_compr.tar.gz", 56 | "subdir": "{DOWNLOADS_PATH}/odqa" 57 | }, 58 | { 59 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_tfidf_matrix_compr.tar.gz", 60 | "subdir": "{MODELS_PATH}/odqa" 61 | } 62 | ] 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /deeppavlov/configs/embedder/bert_embedder.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["texts"], 4 | "pipe": [ 5 | { 6 | "class_name": "transformers_bert_preprocessor", 7 | "vocab_file": "{BERT_PATH}/vocab.txt", 8 | "do_lower_case": false, 9 | "max_seq_length": 512, 10 | "in": ["texts"], 11 | "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] 12 | }, 13 | { 14 | "class_name": "transformers_bert_embedder", 15 | "bert_config_path": "{BERT_PATH}/bert_config.json", 16 | "load_path": "{BERT_PATH}", 17 | "truncate": true, 18 | "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], 19 | "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] 20 | } 21 | ], 22 | "out": ["tokens", "word_emb", "subword_tokens", "subword_emb", "max_emb", "mean_emb", "pooler_output"] 23 | }, 24 | "train": {}, 25 | "metadata": { 26 | "variables": { 27 | "ROOT_PATH": "~/.deeppavlov", 28 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 29 | "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12_pt" 30 | }, 31 | "labels": {}, 32 | "download": [ 33 | { 34 | "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz", 35 | "subdir": "{DOWNLOADS_PATH}/bert_models" 36 | } 37 | ] 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /deeppavlov/configs/embedder/bert_sentence_embedder.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["texts"], 4 | "pipe": [ 5 | { 6 | "class_name": "transformers_bert_preprocessor", 7 | "vocab_file": "{BERT_PATH}/vocab.txt", 8 | "do_lower_case": false, 9 | "max_seq_length": 512, 10 | "in": ["texts"], 11 | "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"] 12 | }, 13 | { 14 | "class_name": "transformers_bert_embedder", 15 | "bert_config_path": "{BERT_PATH}/config.json", 16 | "load_path": "{BERT_PATH}", 17 | "truncate": false, 18 | "in": ["subword_tok_ids", "startofword_markers", "attention_mask"], 19 | "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"] 20 | } 21 | ], 22 | "out": ["max_emb", "mean_emb", "pooler_output"] 23 | }, 24 | "train": {}, 25 | "metadata": { 26 | "variables": { 27 | "ROOT_PATH": "~/.deeppavlov", 28 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 29 | "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt_v1" 30 | }, 31 | "labels": 
{}, 32 | "download": [ 33 | { 34 | "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz", 35 | "subdir": "{DOWNLOADS_PATH}/bert_models" 36 | } 37 | ] 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /deeppavlov/configs/entity_extraction/entity_detection_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["x"], 4 | "pipe": [ 5 | { 6 | "class_name": "ner_chunker", 7 | "batch_size": 16, 8 | "max_seq_len" : 300, 9 | "vocab_file": "{TRANSFORMER}", 10 | "in": ["x"], 11 | "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"] 12 | }, 13 | { 14 | "thres_proba": 0.6, 15 | "o_tag": "O", 16 | "tags_file": "{NER_PATH}/tag.dict", 17 | "class_name": "entity_detection_parser", 18 | "id": "edp" 19 | }, 20 | { 21 | "class_name": "ner_chunk_model", 22 | "ner": { 23 | "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json", 24 | "overwrite": { 25 | "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"] 26 | } 27 | }, 28 | "ner_parser": "#edp", 29 | "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"], 30 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] 31 | } 32 | ], 33 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] 34 | }, 35 | "metadata": { 36 | "variables": { 37 | "ROOT_PATH": "~/.deeppavlov", 38 | "MODELS_PATH": "{ROOT_PATH}/models", 39 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", 40 | "TRANSFORMER": "bert-base-cased", 41 | "NER_PATH": "{MODELS_PATH}/ner_ontonotes_bert_torch_crf" 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /deeppavlov/configs/entity_extraction/entity_detection_ru.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["x"], 4 | "pipe": [ 5 | { 6 | "class_name": "ner_chunker", 7 | "batch_size": 16, 8 | "max_seq_len" : 300, 9 | "vocab_file": "{TRANSFORMER}", 10 | "in": ["x"], 11 | "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"] 12 | }, 13 | { 14 | "thres_proba": 0.05, 15 | "o_tag": "O", 16 | "tags_file": "{NER_PATH}/tag.dict", 17 | "class_name": "entity_detection_parser", 18 | "id": "edp" 19 | }, 20 | { 21 | "class_name": "ner_chunk_model", 22 | "ner": {"config_path": "{CONFIGS_PATH}/ner/ner_rus_bert_probas.json"}, 23 | "ner_parser": "#edp", 24 | "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"], 25 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] 26 | } 27 | ], 28 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] 29 | }, 30 | "metadata": { 31 | "variables": { 32 | "ROOT_PATH": "~/.deeppavlov", 33 | "MODELS_PATH": "{ROOT_PATH}/models", 34 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs", 35 | "TRANSFORMER": "DeepPavlov/rubert-base-cased", 36 | "NER_PATH": "{MODELS_PATH}/wiki_ner_rus_bert" 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /deeppavlov/configs/entity_extraction/entity_extraction_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["x"], 4 | "pipe": [ 5 | { 6 | 
"config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_en.json", 7 | "in": ["x"], 8 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] 9 | }, 10 | { 11 | "config_path": "{CONFIGS_PATH}/entity_extraction/entity_linking_en.json", 12 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], 13 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"] 14 | } 15 | ], 16 | "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"] 17 | }, 18 | "metadata": { 19 | "variables": { 20 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /deeppavlov/configs/entity_extraction/entity_extraction_ru.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["x"], 4 | "pipe": [ 5 | { 6 | "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_ru.json", 7 | "in": ["x"], 8 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"] 9 | }, 10 | { 11 | "config_path": "{CONFIGS_PATH}/entity_extraction/entity_linking_ru.json", 12 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], 13 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"] 14 | } 15 | ], 16 | "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"] 17 | }, 18 | "metadata": { 19 | "variables": { 20 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /deeppavlov/configs/entity_extraction/entity_linking_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], 4 | "pipe": [ 5 | { 6 | "class_name": "torch_transformers_entity_ranker_infer", 7 | "id": "entity_descr_ranking", 8 | "pretrained_bert": "{TRANSFORMER}", 9 | "encoder_weights_path": "{MODELS_PATH}/entity_linking_eng/encoder.pth.tar", 10 | "bilinear_weights_path": "{MODELS_PATH}/entity_linking_eng/bilinear.pth.tar", 11 | "special_token_id": 30522, 12 | "emb_size": 512, 13 | "block_size": 8 14 | }, 15 | { 16 | "class_name": "entity_linker", 17 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"], 18 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"], 19 | "load_path": "{DOWNLOADS_PATH}/entity_linking_eng", 20 | "entities_database_filename": "el_eng_v2.db", 21 | "entity_ranker": "#entity_descr_ranking", 22 | "rank_in_runtime": true, 23 | "num_entities_for_bert_ranking": 20, 24 | "include_mention": false, 25 | "num_entities_to_return": 3, 26 | "lemmatize": true, 27 | "use_descriptions": true, 28 | "use_connections": true, 29 | "use_tags": true, 30 | "full_paragraph": true, 31 | "return_confidences": true, 32 | "lang": "en" 33 | } 34 | ], 35 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"] 36 | }, 37 | "metadata": { 38 | "variables": { 39 | "ROOT_PATH": "~/.deeppavlov", 40 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 41 | "MODELS_PATH": "{ROOT_PATH}/models", 42 | "TRANSFORMER": "prajjwal1/bert-small" 43 | }, 44 | "download": [ 45 | { 46 | 
"url": "http://files.deeppavlov.ai/kbqa/downloads/el_db_eng_v2.tar.gz", 47 | "subdir": "{DOWNLOADS_PATH}/entity_linking_eng" 48 | }, 49 | { 50 | "url": "http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_eng.tar.gz", 51 | "subdir": "{MODELS_PATH}/entity_linking_eng" 52 | } 53 | ] 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /deeppavlov/configs/kbqa/wiki_parser.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["parser_info", "query"], 4 | "pipe": [ 5 | { 6 | "class_name": "wiki_parser", 7 | "in": ["parser_info", "query"], 8 | "out": ["wiki_parser_output"], 9 | "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_compr.pickle", 10 | "file_format": "pickle", 11 | "lang": "@en" 12 | } 13 | ], 14 | "out": ["wiki_parser_output"] 15 | }, 16 | "metadata": { 17 | "variables": { 18 | "ROOT_PATH": "~/.deeppavlov", 19 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 20 | "MODELS_PATH": "{ROOT_PATH}/models", 21 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" 22 | }, 23 | "download": [ 24 | { 25 | "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_compr.pickle", 26 | "subdir": "{DOWNLOADS_PATH}/wikidata" 27 | } 28 | ] 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /deeppavlov/configs/morpho_syntax_parser/ru_syntagrus_joint_parsing.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["x_words"], 4 | "pipe": [ 5 | { 6 | "id": "main", 7 | "class_name": "joint_tagger_parser", 8 | "tagger": { 9 | "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/morpho_ru_syntagrus_bert.json", 10 | "overwrite": {"chainer.pipe.6.return_string": false} 11 | }, 12 | "parser": { 13 | "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/syntax_ru_syntagrus_bert.json", 14 | "overwrite": {"chainer.pipe.6.return_string": false} 15 | }, 16 | "in": ["x_words"], 17 | "out": ["y_parsed"] 18 | } 19 | ], 20 | "out": ["y_parsed"] 21 | }, 22 | "metadata": { 23 | "variables": { 24 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /deeppavlov/configs/ner/ner_bert_base.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["x"], 4 | "in_y": ["y"], 5 | "pipe": [ 6 | { 7 | "class_name": "torch_transformers_ner_preprocessor", 8 | "vocab_file": "{BASE_MODEL}", 9 | "in": ["x"], 10 | "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"] 11 | }, 12 | { 13 | "id": "tag_vocab", 14 | "class_name": "simple_vocab", 15 | "unk_token": ["O"], 16 | "save_path": "{MODEL_PATH}/tag.dict", 17 | "load_path": "{MODEL_PATH}/tag.dict", 18 | "fit_on": ["y"], 19 | "in": ["y"], 20 | "out": ["y_ind"] 21 | }, 22 | { 23 | "class_name": "torch_transformers_sequence_tagger", 24 | "n_tags": "#tag_vocab.len", 25 | "pretrained_bert": "{BASE_MODEL}", 26 | "save_path": "{MODEL_PATH}/model", 27 | "load_path": "{MODEL_PATH}/model", 28 | "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"], 29 | "in_y": ["y_ind"], 30 | "out": ["y_pred_ind", "probas"] 31 | }, 32 | { 33 | "ref": "tag_vocab", 34 | "in": ["y_pred_ind"], 35 | "out": ["y_pred"] 36 | } 37 | ], 38 | "out": ["x_tokens", "y_pred"] 39 | }, 40 | "metadata": { 41 | "variables": { 42 | "BASE_MODEL": 
"bert-base-multilingual-cased", 43 | "ROOT_PATH": "~/.deeppavlov", 44 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 45 | "MODELS_PATH": "{ROOT_PATH}/models", 46 | "MODEL_PATH": "{MODELS_PATH}/ner/{BASE_MODEL}" 47 | }, 48 | "download": [ 49 | { 50 | "url": "http://files.deeppavlov.ai/v1/ner/ner_bert_base.tar.gz", 51 | "subdir": "{MODEL_PATH}" 52 | } 53 | ] 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /deeppavlov/configs/odqa/en_odqa_infer_wiki.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["question_raw"], 4 | "out": ["answer", "answer_score", "answer_place"], 5 | "pipe": [ 6 | { 7 | "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_tfidf_wiki.json", 8 | "in": ["question_raw"], 9 | "out": ["tfidf_doc_ids"] 10 | }, 11 | { 12 | "class_name": "bpr", 13 | "load_path": "{MODELS_PATH}/bpr/eng", 14 | "query_encoder_file": "query_encoder_en.pth.tar", 15 | "bpr_index": "bpr_finetuned_nq_adv.idx", 16 | "pretrained_model": "bert-base-uncased", 17 | "top_n": 100, 18 | "in": ["question_raw"], 19 | "out": ["bpr_doc_ids"] 20 | }, 21 | { 22 | "class_name": "concat_lists", 23 | "in": ["tfidf_doc_ids", "bpr_doc_ids"], 24 | "out": ["doc_ids"] 25 | }, 26 | { 27 | "class_name": "wiki_sqlite_vocab", 28 | "in": ["doc_ids"], 29 | "out": ["doc_text"], 30 | "join_docs": false, 31 | "shuffle": false, 32 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" 33 | }, 34 | { 35 | "class_name": "string_multiplier", 36 | "in": ["question_raw", "doc_text"], 37 | "out":["questions"] 38 | }, 39 | { 40 | "class_name": "logit_ranker", 41 | "batch_size": 64, 42 | "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"}, 43 | "sort_noans": true, 44 | "in": ["doc_text", "questions"], 45 | "out": ["answer", "answer_score", "answer_place"] 46 | } 47 | ] 48 | }, 49 | "metadata": { 50 | "variables": { 51 | "ROOT_PATH": "~/.deeppavlov", 52 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 53 | "MODELS_PATH": "{ROOT_PATH}/models", 54 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" 55 | }, 56 | "download": [ 57 | { 58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz", 59 | "subdir": "{MODELS_PATH}/bpr/eng" 60 | } 61 | ] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /deeppavlov/configs/odqa/en_odqa_pop_infer_wiki.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["question_raw"], 4 | "out": ["answer", "answer_score", "answer_place"], 5 | "pipe": [ 6 | { 7 | "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_pop_wiki.json", 8 | "in": ["question_raw"], 9 | "out": ["tfidf_doc_ids"] 10 | }, 11 | { 12 | "class_name": "bpr", 13 | "load_path": "{MODELS_PATH}/bpr/eng", 14 | "query_encoder_file": "query_encoder_en.pth.tar", 15 | "bpr_index": "bpr_finetuned_nq_adv.idx", 16 | "pretrained_model": "bert-base-uncased", 17 | "top_n": 100, 18 | "in": ["question_raw"], 19 | "out": ["bpr_doc_ids"] 20 | }, 21 | { 22 | "class_name": "concat_lists", 23 | "in": ["tfidf_doc_ids", "bpr_doc_ids"], 24 | "out": ["doc_ids"] 25 | }, 26 | { 27 | "class_name": "wiki_sqlite_vocab", 28 | "in": ["doc_ids"], 29 | "out": ["doc_text"], 30 | "join_docs": false, 31 | "shuffle": false, 32 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db" 33 | }, 34 | { 35 | "class_name": "string_multiplier", 36 | "in": ["question_raw", "doc_text"], 37 | "out":["questions"] 38 | }, 39 
| { 40 | "class_name": "logit_ranker", 41 | "batch_size": 64, 42 | "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"}, 43 | "sort_noans": true, 44 | "in": ["doc_text", "questions"], 45 | "out": ["answer", "answer_score", "answer_place"] 46 | } 47 | ] 48 | }, 49 | "metadata": { 50 | "variables": { 51 | "ROOT_PATH": "~/.deeppavlov", 52 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 53 | "MODELS_PATH": "{ROOT_PATH}/models", 54 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" 55 | }, 56 | "download": [ 57 | { 58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz", 59 | "subdir": "{MODELS_PATH}/bpr/eng" 60 | } 61 | ] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /deeppavlov/configs/odqa/ru_odqa_infer_wiki.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["question_raw"], 4 | "out": ["best_answer"], 5 | "pipe": [ 6 | { 7 | "config_path": "{CONFIGS_PATH}/doc_retrieval/ru_ranker_tfidf_wiki.json", 8 | "in": ["question_raw"], 9 | "out": ["tfidf_doc_ids"] 10 | }, 11 | { 12 | "class_name": "wiki_sqlite_vocab", 13 | "in": ["tfidf_doc_ids"], 14 | "out": ["tfidf_doc_text"], 15 | "join_docs": false, 16 | "shuffle": false, 17 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db" 18 | }, 19 | { 20 | "class_name": "string_multiplier", 21 | "in": ["question_raw", "tfidf_doc_text"], 22 | "out":["questions"] 23 | }, 24 | { 25 | "class_name": "logit_ranker", 26 | "batch_size": 64, 27 | "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_multisberquad_bert.json"}, 28 | "sort_noans": true, 29 | "in": ["tfidf_doc_text", "questions"], 30 | "out": ["best_answer", "best_answer_score"] 31 | } 32 | ] 33 | }, 34 | "metadata": { 35 | "variables": { 36 | "ROOT_PATH": "~/.deeppavlov", 37 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 38 | "MODELS_PATH": "{ROOT_PATH}/models", 39 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" 40 | }, 41 | "download": [ 42 | ] 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /deeppavlov/configs/ranking/path_ranking_nll_roberta_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["question", "rels"], 4 | "pipe": [ 5 | { 6 | "class_name": "path_ranking_preprocessor", 7 | "vocab_file": "{TRANSFORMER}", 8 | "do_lower_case": false, 9 | "additional_special_tokens": ["", "", "", "", "", "", ""], 10 | "max_seq_length": 96, 11 | "in": ["question", "rels"], 12 | "out": ["bert_features"] 13 | }, 14 | { 15 | "class_name": "torch_transformers_nll_ranker", 16 | "in": ["bert_features"], 17 | "out": ["model_output"], 18 | "return_probas": true, 19 | "save_path": "{MODEL_PATH}/model", 20 | "load_path": "{MODEL_PATH}/model", 21 | "encoder_save_path": "{MODEL_PATH}/encoder", 22 | "linear_save_path": "{MODEL_PATH}/linear", 23 | "pretrained_bert": "{TRANSFORMER}", 24 | "learning_rate_drop_patience": 5, 25 | "learning_rate_drop_div": 1.5, 26 | "optimizer_parameters": {"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6} 27 | } 28 | ], 29 | "out": ["model_output"] 30 | }, 31 | "metadata": { 32 | "variables": { 33 | "TRANSFORMER": "haisongzhang/roberta-tiny-cased", 34 | "MODEL_PATH": "~/.deeppavlov/models/classifiers/path_ranking_nll_roberta_lcquad2" 35 | }, 36 | "download": [ 37 | { 38 | "url": "http://files.deeppavlov.ai/kbqa/models/path_ranking_nll_roberta_lcquad2.tar.gz", 39 | "subdir": "{MODEL_PATH}" 40 | } 
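The ODQA configs above chain sparse (TF-IDF) and dense (BPR) retrieval over a Wikipedia dump with a reading-comprehension model. A minimal sketch for the English pipeline; note that the first `download=True` run fetches a multi-gigabyte Wikipedia index:

```python
from deeppavlov import build_model

odqa = build_model('en_odqa_infer_wiki', download=True)

# One answer, score and rank per question: the "answer",
# "answer_score" and "answer_place" outputs of the chainer above.
answers, scores, places = odqa(['Where did guinea pigs originate?'])
print(answers, scores, places)
```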
41 | ] 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /deeppavlov/configs/ranking/ranking_ubuntu_v2_torch_bert_uncased.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "ubuntu_v2_reader", 4 | "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data" 5 | }, 6 | "dataset_iterator": { 7 | "class_name": "siamese_iterator", 8 | "seed": 243 9 | }, 10 | "chainer": { 11 | "in": [ 12 | "x" 13 | ], 14 | "in_y": [ 15 | "y" 16 | ], 17 | "pipe": [ 18 | { 19 | "class_name": "torch_bert_ranker_preprocessor", 20 | "vocab_file": "bert-base-uncased", 21 | "do_lower_case": true, 22 | "max_seq_length": 128, 23 | "in": [ 24 | "x" 25 | ], 26 | "out": [ 27 | "bert_features" 28 | ] 29 | }, 30 | { 31 | "class_name": "torch_bert_ranker", 32 | "pretrained_bert": "bert-base-uncased", 33 | "save_path": "{MODEL_PATH}/model", 34 | "load_path": "{MODEL_PATH}/model", 35 | "optimizer": "AdamW", 36 | "optimizer_parameters": { 37 | "lr": 2e-5, 38 | "weight_decay": 1e-2, 39 | "betas": [ 40 | 0.9, 41 | 0.999 42 | ], 43 | "eps": 1e-6 44 | }, 45 | "clip_norm": 1.0, 46 | "in": [ 47 | "bert_features" 48 | ], 49 | "in_y": [ 50 | "y" 51 | ], 52 | "out": [ 53 | "predictions" 54 | ] 55 | } 56 | ], 57 | "out": [ 58 | "predictions" 59 | ] 60 | }, 61 | "train": { 62 | "batch_size": 32, 63 | "pytest_max_batches": 2, 64 | "train_metrics": [], 65 | "metrics": [ 66 | "r@1", 67 | "r@2", 68 | "r@5" 69 | ], 70 | "validation_patience": 1, 71 | "val_every_n_epochs": 1, 72 | "log_every_n_epochs": 1, 73 | "evaluation_targets": [ 74 | "valid", 75 | "test" 76 | ], 77 | "class_name": "torch_trainer" 78 | }, 79 | "metadata": { 80 | "variables": { 81 | "ROOT_PATH": "~/.deeppavlov", 82 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 83 | "MODELS_PATH": "{ROOT_PATH}/models", 84 | "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_torch_bert_model" 85 | }, 86 | "download": [ 87 | { 88 | "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz", 89 | "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data" 90 | }, 91 | { 92 | "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_torch_bert_model_v2.tar.gz", 93 | "subdir": "{MODELS_PATH}" 94 | } 95 | ] 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /deeppavlov/configs/ranking/rel_ranking_nll_bert_ru.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["question", "rels"], 4 | "pipe": [ 5 | { 6 | "class_name": "path_ranking_preprocessor", 7 | "vocab_file": "{TRANSFORMER}", 8 | "do_lower_case": false, 9 | "max_seq_length": 96, 10 | "in": ["question", "rels"], 11 | "out": ["bert_features"] 12 | }, 13 | { 14 | "class_name": "torch_transformers_nll_ranker", 15 | "in": ["bert_features"], 16 | "out": ["model_output"], 17 | "return_probas": true, 18 | "save_path": "{MODEL_PATH}/model", 19 | "load_path": "{MODEL_PATH}/model", 20 | "encoder_save_path": "{MODEL_PATH}/encoder", 21 | "linear_save_path": "{MODEL_PATH}/linear", 22 | "pretrained_bert": "{TRANSFORMER}", 23 | "learning_rate_drop_patience": 4, 24 | "learning_rate_drop_div": 1.5, 25 | "optimizer_parameters": {"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6} 26 | } 27 | ], 28 | "out": ["model_output"] 29 | }, 30 | "metadata": { 31 | "variables": { 32 | "ROOT_PATH": "~/.deeppavlov", 33 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 34 | "MODELS_PATH": "{ROOT_PATH}/models", 35 | "TRANSFORMER": 
"DeepPavlov/rubert-base-cased", 36 | "MODEL_PATH": "{MODELS_PATH}/classifiers/rel_ranking_nll_bert_ru" 37 | }, 38 | "download": [ 39 | { 40 | "url": "http://files.deeppavlov.ai/kbqa/models/rel_ranking_nll_bert_ru.tar.gz", 41 | "subdir": "{MODEL_PATH}" 42 | } 43 | ] 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "typos_wikipedia_reader", 4 | "data_path": "{DOWNLOADS_PATH}" 5 | }, 6 | "dataset_iterator": { 7 | "class_name": "typos_iterator", 8 | "test_ratio": 0.05 9 | }, 10 | "chainer":{ 11 | "in": ["x"], 12 | "in_y": ["y"], 13 | "pipe": [ 14 | { 15 | "class_name": "str_lower", 16 | "id": "lower", 17 | "in": ["x"], 18 | "out": ["x_lower"] 19 | }, 20 | { 21 | "class_name": "nltk_moses_tokenizer", 22 | "id": "tokenizer", 23 | "in": ["x_lower"], 24 | "out": ["x_tokens"] 25 | }, 26 | { 27 | "ref": "tokenizer", 28 | "in": ["y"], 29 | "out": ["y_tokens"] 30 | }, 31 | { 32 | "fit_on": ["x_tokens", "y_tokens"], 33 | "in": ["x_tokens"], 34 | "out": ["tokens_candidates"], 35 | "class_name": "spelling_error_model", 36 | "window": 1, 37 | "candidates_count": 4, 38 | "dictionary": { 39 | "class_name": "wikitionary_100K_vocab", 40 | "data_dir": "{DOWNLOADS_PATH}/vocabs" 41 | }, 42 | "save_path": "{MODELS_PATH}/error_model/error_model.tsv" 43 | }, 44 | { 45 | "class_name": "kenlm_elector", 46 | "in": ["tokens_candidates"], 47 | "out": ["y_predicted_tokens"], 48 | "load_path": "{DOWNLOADS_PATH}/language_models/en_wiki_no_punkt.arpa.binary" 49 | }, 50 | { 51 | "ref": "tokenizer", 52 | "in": ["y_predicted_tokens"], 53 | "out": ["y_predicted"] 54 | } 55 | ], 56 | "out": ["y_predicted"] 57 | }, 58 | "train": { 59 | "evaluation_targets": ["test"], 60 | "class_name": "fit_trainer" 61 | }, 62 | "metadata": { 63 | "variables": { 64 | "ROOT_PATH": "~/.deeppavlov", 65 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 66 | "MODELS_PATH": "{ROOT_PATH}/models" 67 | }, 68 | "download": [ 69 | { 70 | "url": "http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz", 71 | "subdir": "{MODELS_PATH}" 72 | }, 73 | { 74 | "url": "http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz", 75 | "subdir": "{DOWNLOADS_PATH}/language_models" 76 | }, 77 | { 78 | "url": "http://files.deeppavlov.ai/datasets/wiktionary/wikipedia_100K_vocab.tar.gz", 79 | "subdir": "{DOWNLOADS_PATH}/vocabs" 80 | } 81 | ] 82 | } 83 | } -------------------------------------------------------------------------------- /deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer":{ 3 | "in": ["x"], 4 | "pipe": [ 5 | { 6 | "class_name": "str_lower", 7 | "id": "lower", 8 | "in": ["x"], 9 | "out": ["x_lower"] 10 | }, 11 | { 12 | "class_name": "nltk_moses_tokenizer", 13 | "id": "tokenizer", 14 | "in": ["x_lower"], 15 | "out": ["x_tokens"] 16 | }, 17 | { 18 | "id": "vocab", 19 | "class_name": "simple_vocab", 20 | "save_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict", 21 | "load_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict" 22 | }, 23 | { 24 | "in": ["x_tokens"], 25 | "out": ["tokens_candidates"], 26 | "class_name": "spelling_levenshtein", 27 | "words": "#vocab.keys()" 28 | }, 29 | { 30 | "class_name": "kenlm_elector", 31 | "in": ["tokens_candidates"], 32 | "out": 
["y_predicted_tokens"], 33 | "load_path": "{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary" 34 | }, 35 | { 36 | "ref": "tokenizer", 37 | "in": ["y_predicted_tokens"], 38 | "out": ["y_predicted"] 39 | } 40 | ], 41 | "out": ["y_predicted"] 42 | }, 43 | "metadata": { 44 | "variables": { 45 | "ROOT_PATH": "~/.deeppavlov", 46 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 47 | "MODELS_PATH": "{ROOT_PATH}/models" 48 | }, 49 | "download": [ 50 | { 51 | "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz", 52 | "subdir": "{DOWNLOADS_PATH}/vocabs" 53 | }, 54 | { 55 | "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz", 56 | "subdir": "{DOWNLOADS_PATH}/language_models" 57 | } 58 | ] 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /deeppavlov/configs/squad/qa_nq_psgcls_bert.json: -------------------------------------------------------------------------------- 1 | { 2 | "chainer": { 3 | "in": ["context_raw", "question_raw"], 4 | "pipe": [ 5 | { 6 | "class_name": "torch_squad_transformers_preprocessor", 7 | "vocab_file": "{TRANSFORMER}", 8 | "do_lower_case": "{LOWERCASE}", 9 | "max_seq_length": 384, 10 | "in": ["question_raw", "context_raw"], 11 | "out": ["bert_features", "subtokens", "split_context"] 12 | }, 13 | { 14 | "class_name": "squad_bert_mapping", 15 | "do_lower_case": "{LOWERCASE}", 16 | "in": ["split_context", "bert_features", "subtokens"], 17 | "out": ["subtok2chars", "char2subtoks"] 18 | }, 19 | { 20 | "class_name": "torch_transformers_squad", 21 | "pretrained_bert": "{TRANSFORMER}", 22 | "save_path": "{MODEL_PATH}/model", 23 | "load_path": "{MODEL_PATH}/model", 24 | "torch_seed": 1, 25 | "optimizer": "AdamW", 26 | "optimizer_parameters": { 27 | "lr": 2e-05, 28 | "weight_decay": 0.01, 29 | "betas": [0.9, 0.999], 30 | "eps": 1e-06 31 | }, 32 | "random_seed": 1, 33 | "psg_cls": true, 34 | "learning_rate_drop_patience": 2, 35 | "learning_rate_drop_div": 2.0, 36 | "in": ["bert_features"], 37 | "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"] 38 | }, 39 | { 40 | "class_name": "squad_bert_ans_postprocessor", 41 | "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"], 42 | "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"] 43 | } 44 | ], 45 | "out": ["ans_predicted", "ans_start_predicted", "scores"] 46 | }, 47 | "metadata": { 48 | "variables": { 49 | "LOWERCASE": true, 50 | "TRANSFORMER": "bert-base-uncased", 51 | "ROOT_PATH": "~/.deeppavlov", 52 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 53 | "MODELS_PATH": "{ROOT_PATH}/models", 54 | "MODEL_PATH": "{MODELS_PATH}/passage_reader_classifier_eng" 55 | }, 56 | "download": [ 57 | { 58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/nq_psgcls_bert.tar.gz", 59 | "subdir": "{MODEL_PATH}" 60 | } 61 | ] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /deeppavlov/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/__init__.py -------------------------------------------------------------------------------- /deeppavlov/core/commands/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/commands/__init__.py -------------------------------------------------------------------------------- /deeppavlov/core/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/common/__init__.py -------------------------------------------------------------------------------- /deeppavlov/core/common/aliases.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ALIASES = { 16 | 'kbqa_cq': 'kbqa_cq_en', 17 | 'kbqa_cq_online': 'kbqa_cq_en', 18 | 'kbqa_cq_rus': 'kbqa_cq_ru', 19 | 'multi_squad_noans': 'qa_squad2_bert', 20 | 'multi_squad_noans_infer': 'qa_squad2_bert', 21 | 'multi_squad_retr_noans': 'qa_squad2_bert', 22 | 'ner_collection3_m1': 'ner_collection3_bert', 23 | 'ner_conll2003': 'ner_conll2003_bert', 24 | 'ner_conll2003_torch_bert': 'ner_conll2003_bert', 25 | 'ner_dstc2': 'ner_conll2003_bert', 26 | 'ner_few_shot_ru': 'ner_rus_bert', 27 | 'ner_few_shot_ru_simulate': 'ner_rus_bert', 28 | 'ner_ontonotes': 'ner_ontonotes_bert', 29 | 'ner_ontonotes_bert_emb': 'ner_ontonotes_bert', 30 | 'ner_ontonotes_bert_mult_torch': 'ner_ontonotes_bert_mult', 31 | 'ner_ontonotes_bert_torch': 'ner_ontonotes_bert', 32 | 'ner_rus': 'ner_rus_bert', 33 | 'paraphraser_bert': 'paraphraser_rubert', 34 | 'ru_odqa_infer_wiki_rubert': 'ru_odqa_infer_wiki', 35 | 'sentseg_dailydialog': 'sentseg_dailydialog_bert', 36 | 'squad': 'squad_bert', 37 | 'squad_bert_infer': 'squad_bert', 38 | 'squad_bert_multilingual_freezed_emb': 'squad_bert', 39 | 'squad_ru': 'squad_ru_bert', 40 | 'squad_ru_bert_infer': 'squad_ru_bert', 41 | 'squad_ru_convers_distilrubert_2L_infer': 'squad_ru_convers_distilrubert_2L', 42 | 'squad_ru_convers_distilrubert_6L_infer': 'squad_ru_convers_distilrubert_6L', 43 | 'squad_ru_rubert': 'squad_ru_bert', 44 | 'squad_ru_rubert_infer': 'squad_ru_bert', 45 | 'squad_torch_bert': 'squad_bert', 46 | 'squad_torch_bert_infer': 'squad_bert' 47 | } 48 | -------------------------------------------------------------------------------- /deeppavlov/core/common/errors.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class ConfigError(Exception): 21 | """Any configuration error.""" 22 | 23 | def __init__(self, message): 24 | super(ConfigError, self).__init__() 25 | self.message = message 26 | 27 | def __str__(self): 28 | return repr(self.message) 29 | -------------------------------------------------------------------------------- /deeppavlov/core/common/log.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import logging 17 | import logging.config 18 | from pathlib import Path 19 | 20 | from .paths import get_settings_path 21 | 22 | LOG_CONFIG_FILENAME = 'log_config.json' 23 | TRACEBACK_LOGGER_ERRORS = True 24 | 25 | root_path = Path(__file__).resolve().parents[3] 26 | 27 | log_config_path = get_settings_path() / LOG_CONFIG_FILENAME 28 | 29 | with log_config_path.open(encoding='utf8') as log_config_json: 30 | log_config = json.load(log_config_json) 31 | 32 | 33 | class ProbeFilter(logging.Filter): 34 | """ProbeFilter class is used to filter POST requests to /probe endpoint from logs.""" 35 | 36 | def filter(self, record: logging.LogRecord) -> bool: 37 | """To log the record method should return True.""" 38 | return 'POST /probe HTTP' not in record.getMessage() 39 | 40 | 41 | def init_logger(): 42 | configured_loggers = [log_config.get('root', {})] + [logger for logger in 43 | log_config.get('loggers', {}).values()] 44 | 45 | used_handlers = {handler for log in configured_loggers for handler in log.get('handlers', [])} 46 | 47 | for handler_id, handler in list(log_config['handlers'].items()): 48 | if handler_id not in used_handlers: 49 | del log_config['handlers'][handler_id] 50 | elif 'filename' in handler.keys(): 51 | filename = handler['filename'] 52 | logfile_path = Path(filename).expanduser().resolve() 53 | handler['filename'] = str(logfile_path) 54 | 55 | logging.config.dictConfig(log_config) 56 | -------------------------------------------------------------------------------- /deeppavlov/core/common/log_events.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from logging import getLogger 16 | from typing import Optional 17 | from deeppavlov.core.commands.utils import expand_path 18 | 19 | log = getLogger(__name__) 20 | 21 | 22 | class TBWriter: 23 | def __init__(self, tensorboard_log_dir: str): 24 | # TODO: After adding wandb logger, create common parent class for both loggers 25 | from torch.utils.tensorboard import SummaryWriter 26 | tensorboard_log_dir = expand_path(tensorboard_log_dir) 27 | self.tb_train_writer = SummaryWriter(str(tensorboard_log_dir / 'train_log')) 28 | self.tb_valid_writer = SummaryWriter(str(tensorboard_log_dir / 'valid_log')) 29 | 30 | # TODO: find how to write Summary 31 | def write_train(self, tag, scalar_value, global_step): 32 | self.tb_train_writer.add_scalar(tag, scalar_value, global_step) 33 | 34 | def write_valid(self, tag, scalar_value, global_step): 35 | self.tb_valid_writer.add_scalar(tag, scalar_value, global_step) 36 | 37 | def flush(self): 38 | self.tb_train_writer.flush() 39 | self.tb_valid_writer.flush() 40 | 41 | 42 | def get_tb_writer(tensorboard_log_dir: Optional[str]) -> Optional[TBWriter]: 43 | try: 44 | if tensorboard_log_dir is not None: 45 | tb_writer = TBWriter(tensorboard_log_dir) 46 | else: 47 | tb_writer = None 48 | except ImportError: 49 | log.error('Failed to import SummaryWriter from torch.utils.tensorboard. Failed to initialize the TensorBoard ' 50 | 'logger. Install an appropriate PyTorch version to use this logger or remove the tensorboard_log_dir ' 51 | 'parameter from the train parameters list in the configuration file.') 52 | tb_writer = None 53 | return tb_writer 54 | -------------------------------------------------------------------------------- /deeppavlov/core/common/metrics_registry.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
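# A minimal usage sketch for the registry defined below ('toy_accuracy' and the
# metric function are hypothetical; only the two public helpers in this module
# are assumed):
#
#     >>> from deeppavlov.core.common.metrics_registry import register_metric, get_metric_by_name
#     >>> @register_metric('toy_accuracy')
#     ... def toy_accuracy(y_true, y_predicted):
#     ...     return sum(t == p for t, p in zip(y_true, y_predicted)) / max(len(y_true), 1)
#     >>> get_metric_by_name('toy_accuracy')([1, 0, 1], [1, 1, 1])
#     0.6666666666666666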
14 | 15 | import importlib 16 | import json 17 | from logging import getLogger 18 | from pathlib import Path 19 | from typing import Callable, Any 20 | 21 | from deeppavlov.core.common.errors import ConfigError 22 | 23 | log = getLogger(__name__) 24 | 25 | _registry_path = Path(__file__).parent / 'metrics_registry.json' 26 | if _registry_path.exists(): 27 | with _registry_path.open(encoding='utf-8') as f: 28 | _REGISTRY = json.load(f) 29 | else: 30 | _REGISTRY = {} 31 | 32 | 33 | def fn_from_str(name: str) -> Callable[..., Any]: 34 | """Returns a function object with the name given in string.""" 35 | try: 36 | module_name, fn_name = name.split(':') 37 | return getattr(importlib.import_module(module_name), fn_name) 38 | except ValueError: 39 | raise ConfigError('Expected function description in a `module.submodules:function_name` form, but got `{}`' 40 | .format(name)) 41 | except AttributeError: 42 | # noinspection PyUnboundLocalVariable 43 | raise ConfigError(f"Incorrect metric: '{module_name}' has no attribute '{fn_name}'.") 44 | 45 | 46 | def register_metric(metric_name: str) -> Callable[..., Any]: 47 | """Decorator for metric registration.""" 48 | 49 | def decorate(fn): 50 | fn_name = fn.__module__ + ':' + fn.__name__ 51 | if metric_name in _REGISTRY and _REGISTRY[metric_name] != fn_name: 52 | log.warning('"{}" is already registered as a metric name, the old function will be ignored' 53 | .format(metric_name)) 54 | _REGISTRY[metric_name] = fn_name 55 | return fn 56 | 57 | return decorate 58 | 59 | 60 | def get_metric_by_name(name: str) -> Callable[..., Any]: 61 | """Returns a metric callable with a corresponding name.""" 62 | name = _REGISTRY.get(name, name) 63 | return fn_from_str(name) 64 | -------------------------------------------------------------------------------- /deeppavlov/core/common/paths.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
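# A minimal usage sketch (the directory value is illustrative; note that
# DP_SETTINGS_PATH is read once, below, at import time, so it must be set
# before deeppavlov is first imported):
#
#     >>> import os
#     >>> os.environ['DP_SETTINGS_PATH'] = '/tmp/dp_settings'
#     >>> from deeppavlov.core.common.paths import get_settings_path
#     >>> get_settings_path()  # copies the default *.json settings files on first use
#     PosixPath('/tmp/dp_settings')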
14 | import os 15 | import shutil 16 | 17 | from pathlib import Path 18 | 19 | _root_path = Path(__file__).resolve().parents[3] 20 | _default_settings_path: Path = _root_path / 'deeppavlov' / 'utils' / 'settings' 21 | _settings_path = Path(os.getenv('DP_SETTINGS_PATH', _default_settings_path)).expanduser().resolve() 22 | if _settings_path.is_file(): 23 | raise FileExistsError(f'DP_SETTINGS_PATH={_settings_path} is a file and not a directory') 24 | 25 | if _default_settings_path in _settings_path.parents: 26 | raise RecursionError(f'DP_SETTINGS_PATH={_settings_path} is relative' 27 | f' to the default settings path {_default_settings_path}') 28 | 29 | 30 | def get_settings_path() -> Path: 31 | """Return an absolute path to the DeepPavlov settings directory""" 32 | populate_settings_dir() 33 | return _settings_path 34 | 35 | 36 | def populate_settings_dir(force: bool = False) -> bool: 37 | """ 38 | Populate settings directory with default settings files 39 | 40 | Args: 41 | force: if ``True``, replace existing settings files with default ones 42 | 43 | Returns: 44 | ``True`` if any files were copied and ``False`` otherwise 45 | """ 46 | res = False 47 | if _default_settings_path == _settings_path: 48 | return res 49 | 50 | for src in list(_default_settings_path.glob('**/*.json')): 51 | dest = _settings_path / src.relative_to(_default_settings_path) 52 | if not force and dest.exists(): 53 | continue 54 | res = True 55 | dest.parent.mkdir(parents=True, exist_ok=True) 56 | shutil.copy(src, dest) 57 | return res 58 | -------------------------------------------------------------------------------- /deeppavlov/core/common/prints.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import sys 16 | from contextlib import redirect_stdout 17 | 18 | 19 | class RedirectedPrints(redirect_stdout): 20 | """Context manager for temporarily redirecting stdout to another stream """ 21 | 22 | def __init__(self, new_target=sys.stderr): 23 | super().__init__(new_target=new_target) 24 | -------------------------------------------------------------------------------- /deeppavlov/core/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/data/__init__.py -------------------------------------------------------------------------------- /deeppavlov/core/data/dataset_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List, Dict, Tuple, Any 16 | 17 | 18 | class DatasetReader: 19 | """An abstract class for reading data from some location and construction of a dataset.""" 20 | 21 | def read(self, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]: 22 | """Reads a file from a path and returns data as a list of tuples of inputs and correct outputs 23 | for every data type in ``train``, ``valid`` and ``test``. 24 | """ 25 | raise NotImplementedError 26 | -------------------------------------------------------------------------------- /deeppavlov/core/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/models/__init__.py -------------------------------------------------------------------------------- /deeppavlov/core/models/component.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABCMeta, abstractmethod 16 | 17 | from logging import getLogger 18 | 19 | log = getLogger(__name__) 20 | 21 | 22 | class Component(metaclass=ABCMeta): 23 | """Abstract class for all callables that could be used in Chainer's pipe.""" 24 | 25 | @abstractmethod 26 | def __call__(self, *args, **kwargs): 27 | pass 28 | 29 | def reset(self): 30 | pass 31 | 32 | def destroy(self): 33 | attr_list = list(self.__dict__.keys()) 34 | for attr_name in attr_list: 35 | attr = getattr(self, attr_name) 36 | if hasattr(attr, 'destroy'): 37 | attr.destroy() 38 | delattr(self, attr_name) 39 | -------------------------------------------------------------------------------- /deeppavlov/core/models/estimator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod 16 | 17 | from .component import Component 18 | from .serializable import Serializable 19 | 20 | 21 | class Estimator(Component, Serializable): 22 | """Abstract class for components that could be fitted on the data as a whole.""" 23 | 24 | @abstractmethod 25 | def fit(self, *args, **kwargs): 26 | pass 27 | -------------------------------------------------------------------------------- /deeppavlov/core/models/nn_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod 16 | 17 | from .component import Component 18 | from .serializable import Serializable 19 | 20 | 21 | class NNModel(Component, Serializable): 22 | """Abstract class for deep learning components.""" 23 | 24 | @abstractmethod 25 | def train_on_batch(self, x: list, y: list): 26 | pass 27 | 28 | def process_event(self, event_name, data): 29 | pass 30 | -------------------------------------------------------------------------------- /deeppavlov/core/models/serializable.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
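# A minimal sketch of a concrete subclass (ToyModel and the path are
# hypothetical; only the save/load contract of the class below is assumed):
#
#     >>> from deeppavlov.core.models.serializable import Serializable
#     >>> class ToyModel(Serializable):
#     ...     def save(self):
#     ...         self.save_path.write_text('weights')
#     ...     def load(self):
#     ...         return self.load_path.read_text() if self.load_path.exists() else None
#     >>> model = ToyModel(save_path='~/.deeppavlov/models/toy/model.txt')
#
# In 'infer' mode a missing load_path falls back to save_path (with a warning),
# and the parent directory of save_path is created automatically.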
14 | 15 | from abc import ABCMeta, abstractmethod 16 | from logging import getLogger 17 | from pathlib import Path 18 | from typing import Union, Optional 19 | 20 | from deeppavlov.core.commands.utils import expand_path 21 | 22 | log = getLogger(__name__) 23 | 24 | 25 | class Serializable(metaclass=ABCMeta): 26 | """Abstract base class that expresses the interface for all models that can serialize data to a path.""" 27 | 28 | def __init__(self, save_path: Optional[Union[str, Path]], load_path: Optional[Union[str, Path]] = None, 29 | mode: str = 'infer', 30 | *args, **kwargs) -> None: 31 | 32 | if save_path: 33 | self.save_path = expand_path(save_path) 34 | self.save_path.parent.mkdir(parents=True, exist_ok=True) 35 | else: 36 | self.save_path = None 37 | 38 | if load_path: 39 | self.load_path = expand_path(load_path) 40 | if mode != 'train' and self.save_path and self.load_path != self.save_path: 41 | log.warning("Load path '{}' differs from save path '{}' in '{}' mode for {}." 42 | .format(self.load_path, self.save_path, mode, self.__class__.__name__)) 43 | elif mode != 'train' and self.save_path: 44 | self.load_path = self.save_path 45 | log.warning("No load path is set for {} in '{}' mode. Using save path instead" 46 | .format(self.__class__.__name__, mode)) 47 | else: 48 | self.load_path = None 49 | log.warning("No load path is set for {}!".format(self.__class__.__name__)) 50 | 51 | @abstractmethod 52 | def save(self, *args, **kwargs): 53 | pass 54 | 55 | @abstractmethod 56 | def load(self, *args, **kwargs): 57 | pass 58 | -------------------------------------------------------------------------------- /deeppavlov/core/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .fit_trainer import FitTrainer 16 | from .nn_trainer import NNTrainer 17 | from .torch_trainer import TorchTrainer 18 | -------------------------------------------------------------------------------- /deeppavlov/core/trainers/torch_trainer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
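# The trainer below is selected through the "train" section of a configuration
# file, as in the ranking_ubuntu_v2_torch_bert_uncased.json config above; a
# minimal sketch (batch size and metrics are illustrative):
#
#     "train": {
#       "class_name": "torch_trainer",
#       "batch_size": 32,
#       "metrics": ["r@1"]
#     }
#
# Relative to NNTrainer it only switches the main component's torch module to
# eval() mode around evaluation and back to train() mode for batch training.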
14 | 15 | from logging import getLogger 16 | from typing import Tuple, Optional, Iterable, Collection, Any 17 | 18 | from deeppavlov.core.trainers.utils import Metric 19 | from deeppavlov.core.common.registry import register 20 | from deeppavlov.core.data.data_learning_iterator import DataLearningIterator 21 | from deeppavlov.core.trainers.nn_trainer import NNTrainer 22 | 23 | log = getLogger(__name__) 24 | 25 | 26 | @register('torch_trainer') 27 | class TorchTrainer(NNTrainer): 28 | 29 | def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]], 30 | metrics: Optional[Collection[Metric]] = None, *, 31 | start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict: 32 | self._chainer.get_main_component().model.eval() 33 | 34 | report = super(TorchTrainer, self).test(data=data, metrics=metrics, start_time=start_time, 35 | show_examples=show_examples) 36 | self._chainer.get_main_component().model.train() 37 | return report 38 | 39 | def train_on_batches(self, iterator: DataLearningIterator) -> None: 40 | self._chainer.get_main_component().model.train() 41 | super(TorchTrainer, self).train_on_batches(iterator=iterator) 42 | self._chainer.get_main_component().model.eval() 43 | -------------------------------------------------------------------------------- /deeppavlov/dataset_iterators/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/dataset_iterators/__init__.py -------------------------------------------------------------------------------- /deeppavlov/dataset_iterators/siamese_iterator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
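# A minimal sketch of the split behaviour on toy data (assumes, as in the base
# DataLearningIterator, that split() is invoked during construction):
#
#     >>> from deeppavlov.dataset_iterators.siamese_iterator import SiameseIterator
#     >>> data = {'train': [([f'context {i}', f'response {i}'], 1) for i in range(5000)],
#     ...         'valid': [], 'test': []}
#     >>> it = SiameseIterator(data, seed=243)
#     >>> len(it.train), len(it.valid), len(it.test)  # 1000 examples carved off for valid and for test
#     (3000, 1000, 1000)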
14 | 15 | from logging import getLogger 16 | from typing import Dict, List, Tuple 17 | 18 | from deeppavlov.core.common.registry import register 19 | from deeppavlov.core.data.data_learning_iterator import DataLearningIterator 20 | 21 | log = getLogger(__name__) 22 | 23 | 24 | @register('siamese_iterator') 25 | class SiameseIterator(DataLearningIterator): 26 | """The class contains methods for iterating over a dataset for ranking in training, validation and test mode.""" 27 | 28 | def split(self, *args, len_valid=1000, len_test=1000, **kwargs) -> None: 29 | if len(self.valid) == 0 and len_valid != 0: 30 | self.random.shuffle(self.train) 31 | self.valid = self.train[-len_valid:] 32 | self.train = self.train[:-len_valid] 33 | if len(self.test) == 0 and len_test != 0: 34 | self.random.shuffle(self.train) 35 | self.test = self.train[-len_test:] 36 | self.train = self.train[:-len_test] 37 | -------------------------------------------------------------------------------- /deeppavlov/dataset_iterators/typos_iterator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from deeppavlov.core.common.registry import register 16 | from deeppavlov.core.data.data_learning_iterator import DataLearningIterator 17 | 18 | 19 | @register('typos_iterator') 20 | class TyposDatasetIterator(DataLearningIterator): 21 | """Implementation of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for training 22 | :class:`~deeppavlov.models.spelling_correction.brillmoore.ErrorModel` 23 | 24 | """ 25 | 26 | def split(self, test_ratio: float = 0., *args, **kwargs): 27 | """Split all data into train and test 28 | 29 | Args: 30 | test_ratio: ratio of test data to train, from 0. to 1. 31 | """ 32 | self.train += self.valid + self.test 33 | 34 | split = int(len(self.train) * test_ratio) 35 | 36 | self.random.shuffle(self.train) 37 | 38 | self.test = self.train[:split] 39 | self.train = self.train[split:] 40 | self.valid = [] 41 | -------------------------------------------------------------------------------- /deeppavlov/dataset_readers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/dataset_readers/__init__.py -------------------------------------------------------------------------------- /deeppavlov/dataset_readers/faq_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict 16 | 17 | from pandas import read_csv 18 | 19 | from deeppavlov.core.common.registry import register 20 | from deeppavlov.core.data.dataset_reader import DatasetReader 21 | 22 | 23 | @register('faq_reader') 24 | class FaqDatasetReader(DatasetReader): 25 | """Reader for FAQ dataset""" 26 | 27 | def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict: 28 | """ 29 | Read the FAQ dataset from a specified csv file or remote url 30 | 31 | Parameters: 32 | data_path: path to csv file of FAQ 33 | data_url: url to csv file of FAQ 34 | x_col_name: name of Question column in csv file 35 | y_col_name: name of Answer column in csv file 36 | 37 | Returns: 38 | A dictionary containing training, validation and test parts of the dataset obtainable via 39 | ``train``, ``valid`` and ``test`` keys. 40 | """ 41 | 42 | if data_url is not None: 43 | data = read_csv(data_url) 44 | elif data_path is not None: 45 | data = read_csv(data_path) 46 | else: 47 | raise ValueError("Please specify the data_path or data_url parameter") 48 | 49 | x = data[x_col_name] 50 | y = data[y_col_name] 51 | 52 | train_xy_tuples = [(x[i].strip(), y[i].strip()) for i in range(len(x))] 53 | 54 | dataset = dict() 55 | dataset["train"] = train_xy_tuples 56 | dataset["valid"] = [] 57 | dataset["test"] = [] 58 | 59 | return dataset 60 | -------------------------------------------------------------------------------- /deeppavlov/dataset_readers/line_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict 16 | 17 | from deeppavlov.core.common.registry import register 18 | from deeppavlov.core.data.dataset_reader import DatasetReader 19 | 20 | 21 | @register('line_reader') 22 | class LineReader(DatasetReader): 23 | """Read a txt file by lines""" 24 | 25 | def read(self, data_path: str = None, *args, **kwargs) -> Dict: 26 | """Read lines from a txt file 27 | 28 | Args: 29 | data_path: path to txt file 30 | 31 | Returns: 32 | A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys.
33 | """ 34 | 35 | with open(data_path) as f: 36 | content = f.readlines() 37 | 38 | dataset = dict() 39 | dataset["train"] = [(line,) for line in content] 40 | dataset["valid"] = [] 41 | dataset["test"] = [] 42 | 43 | return dataset 44 | -------------------------------------------------------------------------------- /deeppavlov/dataset_readers/multitask_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import copy 16 | from logging import getLogger 17 | from typing import Dict 18 | 19 | from deeppavlov.core.common.registry import get_model, register 20 | from deeppavlov.core.data.dataset_reader import DatasetReader 21 | 22 | log = getLogger(__name__) 23 | 24 | 25 | @register('multitask_reader') 26 | class MultiTaskReader(DatasetReader): 27 | """Class to read several datasets simultaneously.""" 28 | 29 | def read(self, tasks: Dict[str, Dict[str, dict]], task_defaults: dict = None, **kwargs): 30 | """Creates dataset readers for tasks and returns what the task dataset readers' `read()` methods return. 31 | 32 | Args: 33 | tasks: dictionary whose keys are task names and whose values are dictionaries with param name - value pairs for 34 | nested dataset readers initialization. If a task has the key-value pair ``'use_task_defaults': False``, 35 | task_defaults for this task dataset reader will be ignored. 36 | task_defaults: default task parameters. 37 | 38 | Returns: 39 | dictionary whose keys are task names and whose values are what the task readers' `read()` methods returned. 40 | """ 41 | data = dict() 42 | if task_defaults is None: 43 | task_defaults = dict() 44 | for task_name, task_params in tasks.items(): 45 | if task_params.pop('use_task_defaults', True) is True: 46 | task_config = copy.deepcopy(task_defaults) 47 | task_config.update(task_params) 48 | else: 49 | task_config = task_params 50 | reader = get_model(task_config.pop('class_name'))() 51 | data[task_name] = reader.read(**task_config) 52 | return data 53 | -------------------------------------------------------------------------------- /deeppavlov/dataset_readers/paraphraser_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
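# A minimal usage sketch (the folder is hypothetical and must contain the
# paraphrases.xml and paraphrases_gold.xml files that the reader expects):
#
#     >>> from deeppavlov.dataset_readers.paraphraser_reader import ParaphraserReader
#     >>> data = ParaphraserReader().read('~/.deeppavlov/downloads/paraphraser_data')
#     >>> sorted(data.keys())  # values are ((text_1, text_2), label) pairs with label in {0, 1}
#     ['test', 'train', 'valid']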
14 | 15 | import xml.etree.ElementTree as ET 16 | from pathlib import Path 17 | from typing import Dict, List, Tuple 18 | 19 | from deeppavlov.core.commands.utils import expand_path 20 | from deeppavlov.core.common.registry import register 21 | from deeppavlov.core.data.dataset_reader import DatasetReader 22 | 23 | 24 | @register('paraphraser_reader') 25 | class ParaphraserReader(DatasetReader): 26 | """The class to read the paraphraser.ru dataset from files. 27 | 28 | Please, see https://paraphraser.ru. 29 | """ 30 | 31 | def read(self, 32 | data_path: str, 33 | do_lower_case: bool = True, 34 | *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]: 35 | """Read the paraphraser.ru dataset from files. 36 | 37 | Args: 38 | data_path: A path to a folder with dataset files. 39 | do_lower_case: Do you want to lowercase all texts 40 | """ 41 | 42 | data_path = expand_path(data_path) 43 | train_fname = data_path / 'paraphrases.xml' 44 | test_fname = data_path / 'paraphrases_gold.xml' 45 | 46 | train_data = self._build_data(train_fname, do_lower_case) 47 | test_data = self._build_data(test_fname, do_lower_case) 48 | return {"train": train_data, "valid": [], "test": test_data} 49 | 50 | @staticmethod 51 | def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]: 52 | root = ET.fromstring(data_path.read_text(encoding='utf8')) 53 | data = {} 54 | for paraphrase in root.findall('corpus/paraphrase'): 55 | key = (paraphrase.find('value[@name="text_1"]').text, 56 | paraphrase.find('value[@name="text_2"]').text) 57 | if do_lower_case: 58 | key = tuple([t.lower() for t in key]) 59 | 60 | data[key] = 1 if int(paraphrase.find('value[@name="class"]').text) >= 0 else 0 61 | return list(data.items()) 62 | -------------------------------------------------------------------------------- /deeppavlov/dataset_readers/rel_ranking_reader.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import xml.etree.ElementTree as ET 16 | from pathlib import Path 17 | from typing import Dict, List, Tuple 18 | 19 | from deeppavlov.core.commands.utils import expand_path 20 | from deeppavlov.core.common.registry import register 21 | from deeppavlov.core.data.dataset_reader import DatasetReader 22 | 23 | 24 | @register('rel_ranking_reader') 25 | class ParaphraserReader(DatasetReader): 26 | """The class to read the paraphraser.ru dataset from files. 27 | 28 | Please, see https://paraphraser.ru. 29 | """ 30 | 31 | def read(self, 32 | data_path: str, 33 | do_lower_case: bool = True, 34 | *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]: 35 | """Read the paraphraser.ru dataset from files. 36 | 37 | Args: 38 | data_path: A path to a folder with dataset files.
39 | do_lower_case: Do you want to lowercase all texts 40 | """ 41 | 42 | data_path = expand_path(data_path) 43 | train_fname = data_path / 'paraphrases.xml' 44 | test_fname = data_path / 'paraphrases_gold.xml' 45 | 46 | train_data = self._build_data(train_fname, do_lower_case) 47 | test_data = self._build_data(test_fname, do_lower_case) 48 | return {"train": train_data, "valid": [], "test": test_data} 49 | 50 | @staticmethod 51 | def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]: 52 | root = ET.fromstring(data_path.read_text(encoding='utf8')) 53 | data = [] 54 | for paraphrase in root.findall('corpus/paraphrase'): 55 | key = (paraphrase.find('value[@name="text_1"]').text, 56 | paraphrase.find('value[@name="text_2"]').text) 57 | if do_lower_case: 58 | key = tuple([t.lower() for t in key]) 59 | 60 | pos_or_neg = int(paraphrase.find('value[@name="class"]').text) 61 | data.append((key, pos_or_neg)) 62 | return data 63 | -------------------------------------------------------------------------------- /deeppavlov/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/metrics/__init__.py -------------------------------------------------------------------------------- /deeppavlov/metrics/correlation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from scipy.stats import pearsonr, spearmanr 16 | from sklearn.metrics import matthews_corrcoef 17 | 18 | from deeppavlov.core.common.metrics_registry import register_metric 19 | 20 | 21 | @register_metric('pearson_correlation') 22 | def pearson_correlation(y_true, y_predicted) -> float: 23 | return pearsonr(y_predicted, y_true)[0] 24 | 25 | 26 | @register_metric('spearman_correlation') 27 | def spearman_correlation(y_true, y_predicted) -> float: 28 | return spearmanr(y_predicted, y_true)[0] 29 | 30 | 31 | @register_metric('matthews_correlation') 32 | def matthews_correlation(y_true, y_predicted) -> float: 33 | return matthews_corrcoef(y_true, y_predicted) 34 | -------------------------------------------------------------------------------- /deeppavlov/metrics/elmo_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | 17 | import numpy as np 18 | 19 | from deeppavlov.core.common.metrics_registry import register_metric 20 | 21 | 22 | @register_metric('elmo_loss2ppl') 23 | def elmo_loss2ppl(losses: List[np.ndarray]) -> float: 24 | """ Calculates perplexity by loss 25 | 26 | Args: 27 | losses: list of numpy arrays of model losses 28 | 29 | Returns: 30 | perplexity : float 31 | """ 32 | avg_loss = np.mean(losses) 33 | return float(np.exp(avg_loss)) 34 | -------------------------------------------------------------------------------- /deeppavlov/metrics/log_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Union 17 | 18 | import numpy as np 19 | from sklearn.metrics import log_loss 20 | 21 | from deeppavlov.core.common.metrics_registry import register_metric 22 | 23 | 24 | @register_metric('log_loss') 25 | def sk_log_loss(y_true: Union[List[List[float]], List[List[int]], np.ndarray], 26 | y_predicted: Union[List[List[float]], List[List[int]], np.ndarray]) -> float: 27 | """ 28 | Calculates log loss. 29 | 30 | Args: 31 | y_true: list or array of true values 32 | y_predicted: list or array of predicted values 33 | 34 | Returns: 35 | Log loss 36 | 37 | Alias: 38 | log_loss 39 | """ 40 | return log_loss(y_true, y_predicted) 41 | -------------------------------------------------------------------------------- /deeppavlov/metrics/mse.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
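# A minimal usage sketch of the metric defined below:
#
#     >>> from deeppavlov.metrics.mse import mse
#     >>> float(mse([1.0, 2.0, 3.0], [1.0, 2.0, 2.0]))  # mean of squared errors (0, 0, 1)
#     0.3333333333333333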
14 | 15 | import numpy as np 16 | from sklearn.metrics import mean_squared_error 17 | from typing import Union 18 | 19 | from deeppavlov.core.common.metrics_registry import register_metric 20 | 21 | 22 | @register_metric('mean_squared_error') 23 | def mse(y_true: Union[np.ndarray, list], 24 | y_predicted: Union[np.ndarray, list], 25 | *args, 26 | **kwargs) -> float: 27 | """ 28 | Calculates mean squared error. 29 | Args: 30 | y_true: list of true values 31 | y_predicted: list of predicted values 32 | Returns: 33 | float: Mean squared error 34 | """ 35 | for value in [y_true, y_predicted]: 36 | assert np.isfinite(value).all() 37 | return mean_squared_error(y_true, y_predicted, *args, **kwargs) 38 | -------------------------------------------------------------------------------- /deeppavlov/metrics/recall_at_k.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List 17 | 18 | import numpy as np 19 | 20 | from deeppavlov.core.common.metrics_registry import register_metric 21 | 22 | 23 | def recall_at_k(y_true: List[int], y_pred: List[List[np.ndarray]], k: int) -> float: 24 | """ 25 | Calculates recall at k ranking metric. 26 | 27 | Args: 28 | y_true: Labels. Not used in the calculation of the metric. 29 | y_pred: Predictions. 30 | Each prediction contains ranking score of all ranking candidates for the particular data sample. 31 | It is supposed that the ranking score for the true candidate goes first in the prediction. 32 | k: Number of top-ranked candidates to consider. 33 | 34 | Returns: 35 | Recall at k 36 | """ 37 | num_examples = float(len(y_pred)) 38 | predictions = np.array(y_pred) 39 | predictions = np.flip(np.argsort(predictions, -1), -1)[:, :k] 40 | num_correct = 0 41 | for el in predictions: 42 | if 0 in el: 43 | num_correct += 1 44 | return float(num_correct) / num_examples 45 | 46 | 47 | @register_metric('r@1') 48 | def r_at_1(y_true, y_pred): 49 | return recall_at_k(y_true, y_pred, k=1) 50 | 51 | 52 | @register_metric('r@2') 53 | def r_at_2(y_true, y_pred): 54 | return recall_at_k(y_true, y_pred, k=2) 55 | 56 | 57 | @register_metric('r@5') 58 | def r_at_5(labels, predictions): 59 | return recall_at_k(labels, predictions, k=5) 60 | 61 | 62 | @register_metric('r@10') 63 | def r_at_10(labels, predictions): 64 | return recall_at_k(labels, predictions, k=10) 65 | -------------------------------------------------------------------------------- /deeppavlov/metrics/roc_auc_score.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import List, Union 17 | 18 | import numpy as np 19 | import sklearn.metrics 20 | 21 | from deeppavlov.core.common.metrics_registry import register_metric 22 | 23 | 24 | @register_metric('roc_auc') 25 | def roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray], 26 | y_pred: Union[List[List[float]], List[List[int]], np.ndarray]) -> float: 27 | """ 28 | Compute Area Under the Curve (AUC) from prediction scores. 29 | 30 | Args: 31 | y_true: true binary labels 32 | y_pred: target scores, such as probability estimates of the positive class 33 | 34 | Returns: 35 | Area Under the Curve (AUC) from prediction scores 36 | 37 | Alias: 38 | roc_auc 39 | """ 40 | try: 41 | return sklearn.metrics.roc_auc_score(np.squeeze(np.array(y_true)), 42 | np.squeeze(np.array(y_pred)), average="macro") 43 | except ValueError: 44 | return 0. 45 | -------------------------------------------------------------------------------- /deeppavlov/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | 17 | import nltk 18 | 19 | from deeppavlov.core.common.prints import RedirectedPrints 20 | 21 | if not os.environ.get('DP_SKIP_NLTK_DOWNLOAD'): 22 | with RedirectedPrints(): 23 | nltk.download('punkt', quiet=True) 24 | nltk.download('stopwords', quiet=True) 25 | nltk.download('perluniprops', quiet=True) 26 | nltk.download('nonbreaking_prefixes', quiet=True) 27 | -------------------------------------------------------------------------------- /deeppavlov/models/api_requester/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_requester import * 2 | -------------------------------------------------------------------------------- /deeppavlov/models/api_requester/api_router.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import concurrent 16 | from concurrent.futures import ProcessPoolExecutor 17 | from logging import getLogger 18 | from typing import List 19 | 20 | from deeppavlov.core.common.registry import register 21 | from deeppavlov.core.models.component import Component 22 | from deeppavlov.models.api_requester import ApiRequester 23 | 24 | logger = getLogger(__name__) 25 | 26 | 27 | @register("api_router") 28 | class ApiRouter(Component): 29 | """A helper class for running multiple API requesters on the same data in parallel 30 | 31 | Args: 32 | api_requesters: list of ApiRequester objects 33 | n_workers: The maximum number of subprocesses to run 34 | 35 | Attributes: 36 | api_requesters: list of ApiRequester objects 37 | n_workers: The maximum number of subprocesses to run 38 | """ 39 | 40 | def __init__(self, api_requesters: List[ApiRequester], n_workers: int = 1, *args, **kwargs): 41 | self.api_requesters = api_requesters 42 | self.n_workers = n_workers 43 | 44 | def __call__(self, *args): 45 | """ 46 | 47 | Args: 48 | *args: list of arguments to forward to the API requesters 49 | 50 | Returns: 51 | results of the requests 52 | """ 53 | with ProcessPoolExecutor(self.n_workers) as executor: 54 | futures = [executor.submit(api_requester, *args) for api_requester 55 | in 56 | self.api_requesters] 57 | 58 | concurrent.futures.wait(futures) 59 | results = [] 60 | for future, api_requester in zip(futures, self.api_requesters): 61 | result = future.result() 62 | if api_requester.out_count > 1: 63 | results += result 64 | else: 65 | results.append(result) 66 | 67 | return results 68 | -------------------------------------------------------------------------------- /deeppavlov/models/classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/classifiers/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/doc_retrieval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/doc_retrieval/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/doc_retrieval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Any, List 16 | 17 | import nltk 18 | 19 | from deeppavlov.core.common.registry import register 20 | 21 | 22 | @register('concat_lists') 23 | def concat_lists(list_a: List[List[Any]], list_b: List[List[Any]]): 24 | list_u = [] 25 | for element_a, element_b in zip(list_a, list_b): 26 | list_u.append(element_a + element_b) 27 | return list_u 28 | 29 | 30 | def find_answer_sentence(answer_pos: int, context: str) -> str: 31 | answer_sentence = "" 32 | context_sentences = nltk.sent_tokenize(context) 33 | start = 0 34 | context_sentences_offsets = [] 35 | for sentence in context_sentences: 36 | end = start + len(sentence) 37 | context_sentences_offsets.append((start, end)) 38 | start = end + 1 39 | 40 | for sentence, (start_offset, end_offset) in zip(context_sentences, context_sentences_offsets): 41 | if start_offset <= answer_pos < end_offset:  # inclusive start, so an answer at a sentence boundary is found 42 | answer_sentence = sentence 43 | break 44 | 45 | return answer_sentence 46 | -------------------------------------------------------------------------------- /deeppavlov/models/embedders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/embedders/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/embedders/fasttext_embedder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
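# A minimal usage sketch for the FasttextEmbedder defined below; the model path
# is illustrative and assumes a pre-trained fastText .bin file on disk plus the
# keyword arguments accepted by the base Embedder class:
#
#     from deeppavlov.models.embedders.fasttext_embedder import FasttextEmbedder
#     embedder = FasttextEmbedder(load_path='~/.deeppavlov/embeddings/wiki.en.bin')
#     vectors_batch = embedder([['hello', 'world']])  # one dim-sized vector per token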
14 | 15 | from logging import getLogger 16 | from typing import Iterator 17 | 18 | import fasttext 19 | 20 | import numpy as np 21 | 22 | from deeppavlov.core.common.registry import register 23 | from deeppavlov.models.embedders.abstract_embedder import Embedder 24 | 25 | log = getLogger(__name__) 26 | 27 | 28 | @register('fasttext') 29 | class FasttextEmbedder(Embedder): 30 | """ 31 | Class implements fastText embedding model 32 | 33 | Args: 34 | load_path: path where to load pre-trained embedding model from 35 | pad_zero: whether to pad samples or not 36 | 37 | Attributes: 38 | model: fastText model instance 39 | tok2emb: dictionary with already embedded tokens 40 | dim: dimension of embeddings 41 | pad_zero: whether to pad sequence of tokens with zeros or not 42 | load_path: path with pre-trained fastText binary model 43 | """ 44 | 45 | def _get_word_vector(self, w: str) -> np.ndarray: 46 | return self.model.get_word_vector(w) 47 | 48 | def load(self) -> None: 49 | """ 50 | Load fastText binary model from self.load_path 51 | """ 52 | log.debug(f"[loading fastText embeddings from `{self.load_path}`]") 53 | self.model = fasttext.load_model(str(self.load_path)) 54 | self.dim = self.model.get_dimension() 55 | 56 | def __iter__(self) -> Iterator[str]: 57 | """ 58 | Iterate over all words from fastText model vocabulary 59 | 60 | Returns: 61 | iterator 62 | """ 63 | yield from self.model.get_words() 64 | -------------------------------------------------------------------------------- /deeppavlov/models/entity_extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/entity_extraction/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/kbqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/kbqa/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/morpho_syntax_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/morpho_syntax_parser/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/morpho_syntax_parser/dependency_decoding.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
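# A toy sketch of the ChuLiuEdmonds component defined below: for a sentence of
# L words it takes an L x (L+1) matrix of head probabilities (column 0 is the
# root) and returns one head index per word; the numbers here are made up.
#
#     import numpy as np
#     from deeppavlov.models.morpho_syntax_parser.dependency_decoding import ChuLiuEdmonds
#     decoder = ChuLiuEdmonds()
#     probs = np.array([[0.9, 0.05, 0.05],   # word 1: the root is the likely head
#                       [0.1, 0.8, 0.1]])    # word 2: word 1 is the likely head
#     decoder([probs])  # -> [[0, 1]]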
14 | 15 | from typing import List 16 | 17 | import numpy as np 18 | from ufal.chu_liu_edmonds import chu_liu_edmonds 19 | 20 | from deeppavlov.core.common.registry import register 21 | from deeppavlov.core.models.component import Component 22 | 23 | 24 | @register('chu_liu_edmonds_transformer') 25 | class ChuLiuEdmonds(Component): 26 | """ 27 | A wrapper for the Chu-Liu-Edmonds maximum spanning tree algorithm 28 | """ 29 | 30 | def __init__(self, min_edge_prob=1e-6, **kwargs): 31 | self.min_edge_prob = min_edge_prob 32 | 33 | def __call__(self, probs: List[np.ndarray]) -> List[List[int]]: 34 | """Applies the Chu-Liu-Edmonds algorithm to the matrix of head probabilities. 35 | probs: a 3D-array of probabilities of shape B*L*(L+1) 36 | """ 37 | answer = [] 38 | for elem in probs: 39 | m, n = elem.shape 40 | if n == m + 1: 41 | elem = np.log10(np.maximum(self.min_edge_prob, elem)) - np.log10(self.min_edge_prob) 42 | elem = np.concatenate([np.zeros_like(elem[:1, :]), elem], axis=0) 43 | # this makes it impossible to create multiple edges 0->i 44 | elem[1:, 0] += np.log10(self.min_edge_prob) * len(elem) 45 | heads, _ = chu_liu_edmonds(elem.astype("float64")) 46 | answer.append(heads[1:]) 47 | else: 48 | raise ValueError("First and second axis lengths m, n of probs should satisfy the condition n == m + 1") 49 | return answer 50 | -------------------------------------------------------------------------------- /deeppavlov/models/morpho_syntax_parser/spacy_lemmatizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | 17 | import spacy 18 | 19 | from deeppavlov.core.common.registry import register 20 | from deeppavlov.core.models.component import Component 21 | 22 | 23 | @register('spacy_lemmatizer') 24 | class SpacyLemmatizer(Component): 25 | def __init__(self, model: str, **kwargs): 26 | self.nlp = spacy.load(model) 27 | 28 | def __call__(self, words_batch: List[List[str]]): 29 | return [[self.nlp(word)[0].lemma_ for word in words_list] for words_list in words_batch] 30 | -------------------------------------------------------------------------------- /deeppavlov/models/preprocessors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/preprocessors/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/preprocessors/dnnc_preprocessor.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from logging import getLogger 16 | from typing import List, Tuple 17 | 18 | import numpy as np 19 | 20 | from deeppavlov.core.common.registry import register 21 | from deeppavlov.core.models.component import Component 22 | 23 | log = getLogger(__name__) 24 | 25 | 26 | @register('dnnc_pair_generator') 27 | class PairGenerator(Component): 28 | """ 29 | Generates all possible ordered pairs from 'texts_batch' and 'support_dataset' 30 | 31 | Args: 32 | bidirectional: adds pairs in reverse order 33 | """ 34 | 35 | def __init__(self, bidirectional: bool = False, **kwargs) -> None: 36 | self.bidirectional = bidirectional 37 | 38 | def __call__(self, 39 | texts: List[str], 40 | dataset: List[List[str]], 41 | ) -> Tuple[List[str], List[str], List[str], List[str]]: 42 | hypothesis_batch = [] 43 | premise_batch = [] 44 | hypothesis_labels_batch = [] 45 | for [premise, [hypothesis, hypothesis_labels]] in zip(texts * len(dataset), 46 | np.repeat(dataset, len(texts), axis=0)): 47 | premise_batch.append(premise) 48 | hypothesis_batch.append(hypothesis) 49 | hypothesis_labels_batch.append(hypothesis_labels) 50 | 51 | if self.bidirectional: 52 | premise_batch.append(hypothesis) 53 | hypothesis_batch.append(premise) 54 | hypothesis_labels_batch.append(hypothesis_labels) 55 | return texts, hypothesis_batch, premise_batch, hypothesis_labels_batch 56 | -------------------------------------------------------------------------------- /deeppavlov/models/preprocessors/mask.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
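# A quick sketch of the Mask component defined below on a ragged batch of
# token lists:
#
#     from deeppavlov.models.preprocessors.mask import Mask
#     Mask()([['a', 'b', 'c'], ['d']])
#     # -> [[1., 1., 1.],
#     #     [1., 0., 0.]]  (a float32 numpy array)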
14 | 15 | import numpy as np 16 | 17 | from deeppavlov.core.common.registry import register 18 | from deeppavlov.core.models.component import Component 19 | 20 | 21 | @register('mask') 22 | class Mask(Component): 23 | """Takes a batch of tokens and returns the masks of corresponding length""" 24 | def __init__(self, *args, **kwargs): 25 | pass 26 | 27 | @staticmethod 28 | def __call__(tokens_batch, **kwargs): 29 | batch_size = len(tokens_batch) 30 | max_len = max(len(utt) for utt in tokens_batch) 31 | mask = np.zeros([batch_size, max_len], dtype=np.float32) 32 | for n, utterance in enumerate(tokens_batch): 33 | mask[n, :len(utterance)] = 1 34 | 35 | return mask 36 | -------------------------------------------------------------------------------- /deeppavlov/models/preprocessors/sentseg_preprocessor.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from deeppavlov.core.common.registry import register 4 | 5 | 6 | @register("sentseg_restore_sent") 7 | def SentSegRestoreSent(batch_words: List[List[str]], batch_tags: List[List[str]]) -> List[str]: 8 | ret = [] 9 | for words, tags in zip(batch_words, batch_tags): 10 | if len(tags) == 0: 11 | ret.append("") 12 | continue 13 | sent = words[0] 14 | punct = "" if tags[0] == "O" else tags[0][-1] 15 | for word, tag in zip(words[1:], tags[1:]): 16 | if tag != "O": 17 | sent += punct 18 | punct = tag[-1] 19 | sent += " " + word 20 | sent += punct 21 | ret.append(sent) 22 | 23 | return ret 24 | -------------------------------------------------------------------------------- /deeppavlov/models/preprocessors/str_lower.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
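# A quick sketch of str_lower, defined below; it recurses through arbitrarily
# nested lists and lowercases every string it finds:
#
#     from deeppavlov.models.preprocessors.str_lower import str_lower
#     str_lower([['Hello', 'World'], 'FOO'])  # -> [['hello', 'world'], 'foo']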
14 | 15 | from typing import Union 16 | 17 | from deeppavlov.core.common.registry import register 18 | 19 | 20 | @register('str_lower') 21 | def str_lower(batch: Union[str, list, tuple]): 22 | """Recursively search for strings in a list and convert them to lowercase 23 | 24 | Args: 25 | batch: a string or a list containing strings at some level of nesting 26 | 27 | Returns: 28 | the same structure where all strings are converted to lowercase 29 | """ 30 | if isinstance(batch, str): 31 | return batch.lower() 32 | else: 33 | return list(map(str_lower, batch)) 34 | -------------------------------------------------------------------------------- /deeppavlov/models/ranking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/ranking/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/ranking/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | 17 | from deeppavlov.core.common.metrics_registry import register_metric 18 | 19 | 20 | @register_metric('rank_response') 21 | def rank_response(y_true, y_pred): 22 | num_examples = float(len(y_pred)) 23 | predictions = np.array(y_pred) 24 | predictions = np.flip(np.argsort(predictions, -1), -1) 25 | rank_tot = 0 26 | for el in predictions: 27 | for i, x in enumerate(el): 28 | if x == 0: 29 | rank_tot += i 30 | break 31 | return float(rank_tot) / num_examples 32 | 33 | 34 | @register_metric('r@1_insQA') 35 | def r_at_1_insQA(y_true, y_pred): 36 | return recall_at_k_insQA(y_true, y_pred, k=1) 37 | 38 | 39 | def recall_at_k_insQA(y_true, y_pred, k): 40 | labels = np.repeat(np.expand_dims(np.asarray(y_true), axis=1), k, axis=1) 41 | predictions = np.array(y_pred) 42 | predictions = np.flip(np.argsort(predictions, -1), -1)[:, :k] 43 | flags = np.zeros_like(predictions) 44 | for i in range(predictions.shape[0]): 45 | for j in range(predictions.shape[1]): 46 | if predictions[i][j] in np.arange(labels[i][j]): 47 | flags[i][j] = 1. 
48 | return np.mean((np.sum(flags, -1) >= 1.).astype(float)) 49 | -------------------------------------------------------------------------------- /deeppavlov/models/relation_extraction/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/relation_extraction/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/relation_extraction/losses.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is copied from ATLOP algorithm (https://github.com/wzhouad/ATLOP/blob/main/losses.py) 3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch import Tensor 9 | 10 | 11 | class ATLoss(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, logits: Tensor, labels: Tensor) -> Tensor: 16 | """ 17 | Args: 18 | logits: predicted probabilities (shape: batch size x num classes) 19 | labels: one-hot encoded true labels (shape: batch size x num classes) 20 | """ 21 | 22 | # TH label 23 | th_label = torch.zeros_like(labels, dtype=torch.float).to(labels) 24 | th_label[:, 0] = 1.0 25 | labels[:, 0] = 0.0 26 | 27 | p_mask = labels + th_label # = 1 for the gold labels and for class 0 (the threshold class), 0 otherwise 28 | n_mask = 1 - labels # = 0 for the gold labels, 1 otherwise 29 | 30 | # Rank positive classes to TH 31 | logit1 = logits - (1 - p_mask) * 1e30 # original logits remain for gold labels + class 0, others are reduced by 1e30 32 | loss1 = -(F.log_softmax(logit1, dim=-1) * labels).sum(1) 33 | 34 | # Rank TH to negative classes 35 | logit2 = logits - (1 - n_mask) * 1e30 # original logits remain for not-gold, non-0 classes, others are reduced by 1e30 36 | loss2 = -(F.log_softmax(logit2, dim=-1) * th_label).sum(1) 37 | 38 | # Sum two parts 39 | loss = loss1 + loss2 40 | loss = loss.mean() 41 | return loss 42 | 43 | def get_label(self, logits: Tensor, num_labels: int = -1, threshold: float = None) -> Tensor: 44 | """ Calculates the labels """ 45 | if threshold: 46 | th_logit = torch.full((len(logits), 1), threshold) 47 | else: 48 | th_logit = logits[:, 0].unsqueeze(1) # vector of predicted probabilities for class 0 (negative class) 49 | output = torch.zeros_like(logits).to(logits) 50 | mask = (logits > th_logit) # for each sample: True, if prob for a class > prob for neg class, False otherwise 51 | if num_labels > 0: 52 | top_v, _ = torch.topk(logits, num_labels, dim=1) # num_labels max elements; sorted 53 | top_v = top_v[:, -1] # the smallest of the top num_labels probabilities for each sample 54 | mask = (logits >= top_v.unsqueeze(1)) & mask # mask + additionally: logits should reach the top-k minimum 55 | output[mask] = 1.0 56 | output[:, 0] = (output.sum(1) == 0.).to(logits) # no relation if no label matched 57 | return output 58 | -------------------------------------------------------------------------------- /deeppavlov/models/sklearn/__init__.py: -------------------------------------------------------------------------------- 1 | from .sklearn_component import * 2 | -------------------------------------------------------------------------------- /deeppavlov/models/spelling_correction/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/spelling_correction/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/spelling_correction/brillmoore/__init__.py: -------------------------------------------------------------------------------- 1 | from .error_model import ErrorModel 2 | -------------------------------------------------------------------------------- /deeppavlov/models/spelling_correction/electors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/spelling_correction/electors/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/spelling_correction/electors/top1_elector.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from logging import getLogger 16 | from typing import List, Tuple 17 | 18 | from deeppavlov.core.common.registry import register 19 | from deeppavlov.core.models.component import Component 20 | 21 | logger = getLogger(__name__) 22 | 23 | 24 | @register('top1_elector') 25 | class TopOneElector(Component): 26 | """Component that chooses a candidate with highest base probability for every token 27 | 28 | """ 29 | 30 | def __init__(self, *args, **kwargs): 31 | pass 32 | 33 | def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]: 34 | """Choose the best candidate for every token 35 | 36 | Args: 37 | batch: batch of probabilities and string values of candidates for every token in a sentence 38 | 39 | Returns: 40 | batch of corrected tokenized sentences 41 | """ 42 | return [[max(sublist)[1] for sublist in candidates] for candidates in batch] 43 | -------------------------------------------------------------------------------- /deeppavlov/models/spelling_correction/levenshtein/__init__.py: -------------------------------------------------------------------------------- 1 | from .searcher_component import LevenshteinSearcherComponent 2 | -------------------------------------------------------------------------------- /deeppavlov/models/tokenizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/tokenizers/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/tokenizers/lazy_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from logging import getLogger 16 | 17 | from nltk import word_tokenize 18 | 19 | from deeppavlov.core.common.registry import register 20 | 21 | log = getLogger(__name__) 22 | 23 | 24 | @register('lazy_tokenizer') 25 | def lazy_tokenizer(batch): 26 | """Tokenizes if there is something to tokenize.""" 27 | 28 | if len(batch) > 0 and isinstance(batch[0], str): 29 | batch = [word_tokenize(utt) for utt in batch] 30 | return batch 31 | -------------------------------------------------------------------------------- /deeppavlov/models/tokenizers/nltk_moses_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import Union, List 15 | 16 | from sacremoses import MosesDetokenizer, MosesTokenizer 17 | 18 | from deeppavlov.core.common.registry import register 19 | from deeppavlov.core.models.component import Component 20 | 21 | 22 | @register("nltk_moses_tokenizer") 23 | class NLTKMosesTokenizer(Component): 24 | """Class for splitting texts into tokens using the sacremoses MosesTokenizer 25 | 26 | Attributes: 27 | escape: whether to escape special characters for use in HTML markup 28 | tokenizer: tokenizer instance from sacremoses 29 | detokenizer: detokenizer instance from sacremoses 30 | 31 | Args: 32 | escape: whether to escape special characters for use in HTML markup 33 | """ 34 | 35 | def __init__(self, escape: bool = False, *args, **kwargs): 36 | self.escape = escape 37 | self.tokenizer = MosesTokenizer() 38 | self.detokenizer = MosesDetokenizer() 39 | 40 | def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]: 41 | """Tokenize given batch of strings or detokenize given batch of lists of tokens 42 | 43 | Args: 44 | batch: list of text samples or list of lists of tokens 45 | 46 | Returns: 47 | list of lists of tokens or list of text samples 48 | """ 49 | if isinstance(batch[0], str): 50 | return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch] 51 | else: 52 | return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape) 53 | for line in batch] 54 | -------------------------------------------------------------------------------- /deeppavlov/models/tokenizers/nltk_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
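# A short sketch of the NLTKTokenizer defined below in its default
# wordpunct_tokenize mode (assumes the nltk package is installed):
#
#     from deeppavlov.models.tokenizers.nltk_tokenizer import NLTKTokenizer
#     NLTKTokenizer()(["Don't panic!"])  # -> [['Don', "'", 't', 'panic', '!']]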
14 | 15 | from typing import List 16 | 17 | import nltk 18 | 19 | from deeppavlov.core.common.registry import register 20 | from deeppavlov.core.models.component import Component 21 | 22 | 23 | @register("nltk_tokenizer") 24 | class NLTKTokenizer(Component): 25 | """Class for splitting texts into tokens using NLTK 26 | 27 | Args: 28 | tokenizer: tokenization mode for `nltk.tokenize` 29 | download: whether to download nltk data 30 | 31 | Attributes: 32 | tokenizer: tokenizer function from nltk.tokenize 33 | """ 34 | 35 | def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False, 36 | *args, **kwargs): 37 | if download: 38 | nltk.download() 39 | self.tokenizer = getattr(nltk.tokenize, tokenizer, None) 40 | if not callable(self.tokenizer): 41 | raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer)) 42 | 43 | def __call__(self, batch: List[str]) -> List[List[str]]: 44 | """Tokenize given batch 45 | 46 | Args: 47 | batch: list of text samples 48 | 49 | Returns: 50 | list of lists of tokens 51 | """ 52 | return [self.tokenizer(sent) for sent in batch] 53 | -------------------------------------------------------------------------------- /deeppavlov/models/tokenizers/split_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | 17 | from deeppavlov.core.common.registry import register 18 | from deeppavlov.core.models.component import Component 19 | 20 | 21 | @register("split_tokenizer") 22 | class SplitTokenizer(Component): 23 | """ 24 | Splits utterances into tokens using plain Python ``str.split()``. 25 | 26 | Doesn't have any parameters. 27 | """ 28 | 29 | def __init__(self, **kwargs) -> None: 30 | pass 31 | 32 | def __call__(self, batch: List[str]) -> List[List[str]]: 33 | """ 34 | Tokenize given batch 35 | 36 | Args: 37 | batch: list of texts to tokenize 38 | 39 | Returns: 40 | tokenized batch 41 | """ 42 | if isinstance(batch, (list, tuple)): 43 | return [sample.split() for sample in batch] 44 | else: 45 | raise NotImplementedError('not implemented for types other than' 46 | ' list or tuple') 47 | -------------------------------------------------------------------------------- /deeppavlov/models/tokenizers/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import re 16 | from typing import List, Generator, Any 17 | 18 | 19 | def detokenize(tokens): 20 | """ 21 | Detokenizing a text undoes the tokenizing operation, restores 22 | punctuation and spaces to the places that people expect them to be. 23 | Ideally, `detokenize(tokenize(text))` should be identical to `text`, 24 | except for line breaks. 25 | """ 26 | text = ' '.join(tokens) 27 | step0 = text.replace('. . .', '...') 28 | step1 = step0.replace("`` ", '"').replace(" ''", '"') 29 | step2 = step1.replace(" ( ", " (").replace(" ) ", ") ") 30 | step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2) 31 | step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3) 32 | step5 = step4.replace(" '", "'").replace(" n't", "n't") \ 33 | .replace(" nt", "nt").replace("can not", "cannot") 34 | step6 = step5.replace(" ` ", " '") 35 | return step6.strip() 36 | 37 | 38 | def ngramize(items: List[str], ngram_range=(1, 1), doc: str = None) -> Generator[List[str], Any, None]: 39 | """ 40 | Make ngrams from a list of tokens/lemmas 41 | :param items: list of tokens, lemmas or other strings to form ngrams 42 | :param ngram_range: range for producing ngrams, ex. for unigrams + bigrams should be set to 43 | (1, 2), for bigrams only should be set to (2, 2) 44 | :return: ngrams (as strings) generator 45 | """ 46 | 47 | ngrams = [] 48 | ranges = [(0, i) for i in range(ngram_range[0], ngram_range[1] + 1)] 49 | for r in ranges: 50 | ngrams += list(zip(*[items[j:] for j in range(*r)])) 51 | 52 | formatted_ngrams = [' '.join(item) for item in ngrams] 53 | if doc is not None: 54 | doc_lower = doc.lower() 55 | formatted_ngrams = [ngram for ngram in formatted_ngrams if (ngram in doc or ngram in doc_lower)] 56 | 57 | yield formatted_ngrams 58 | -------------------------------------------------------------------------------- /deeppavlov/models/torch_bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/torch_bert/__init__.py -------------------------------------------------------------------------------- /deeppavlov/models/torch_bert/crf.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch import nn 4 | from torchcrf import CRF as CRFbase 5 | 6 | 7 | class CRF(CRFbase): 8 | """Class with Conditional Random Field from PyTorch-CRF library 9 | with modified training function 10 | """ 11 | 12 | def __init__(self, num_tags: int, batch_first: bool = False) -> None: 13 | super().__init__(num_tags=num_tags, batch_first=batch_first) 14 | nn.init.zeros_(self.transitions) 15 | nn.init.zeros_(self.start_transitions) 16 | nn.init.zeros_(self.end_transitions) 17 | self.stats = torch.zeros((num_tags, num_tags), dtype=torch.float) 18 | self.zeros = torch.zeros((num_tags, num_tags), dtype=torch.float) 19 | self.neg = torch.full((num_tags, num_tags), -1000.0) 20 | 21 | def forward(self, tags_batch: torch.LongTensor, y_masks: np.ndarray): 22 | seq_lengths = 
np.sum(y_masks, axis=1) 23 | for seq_len, tags_list in zip(seq_lengths, tags_batch): 24 | if seq_len > 1: 25 | for i in range(seq_len - 1): 26 | self.stats[int(tags_list[i])][int(tags_list[i + 1])] += 1.0 27 | with torch.no_grad(): 28 | self.transitions.copy_(torch.where(self.stats > 0, self.zeros, self.neg)) 29 | -------------------------------------------------------------------------------- /deeppavlov/models/vectorizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/datasets.txt: -------------------------------------------------------------------------------- 1 | datasets>=1.16.0,<2.5.0;python_version<="3.10" 2 | datasets==2.2.*;python_version=="3.11.*" 3 | -------------------------------------------------------------------------------- /deeppavlov/requirements/dependency_decoding.txt: -------------------------------------------------------------------------------- 1 | ufal.chu-liu-edmonds 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/en_core_web_sm.txt: -------------------------------------------------------------------------------- 1 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl 2 | spacy 3 | -------------------------------------------------------------------------------- /deeppavlov/requirements/faiss.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu==1.7.2;python_version<="3.10" 2 | faiss-cpu==1.7.4;python_version=="3.11.*" 3 | -------------------------------------------------------------------------------- /deeppavlov/requirements/fasttext.txt: -------------------------------------------------------------------------------- 1 | fasttext==0.9.* 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/hdt.txt: -------------------------------------------------------------------------------- 1 | hdt==2.3 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/kenlm.txt: -------------------------------------------------------------------------------- 1 | pypi-kenlm==0.1.20220713;python_version<="3.10" 2 | kenlm==0.2.*;python_version=="3.11.*" 3 | -------------------------------------------------------------------------------- /deeppavlov/requirements/lxml.txt: -------------------------------------------------------------------------------- 1 | lxml==4.9.* 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/opt_einsum.txt: -------------------------------------------------------------------------------- 1 | opt-einsum==3.3.* 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/protobuf.txt: -------------------------------------------------------------------------------- 1 | protobuf<=3.20 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/pytorch.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0,<1.14.0 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/rapidfuzz.txt: -------------------------------------------------------------------------------- 1 | 
rapidfuzz==2.1.* 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/razdel.txt: -------------------------------------------------------------------------------- 1 | razdel==0.5.0 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/ru_core_news_sm.txt: -------------------------------------------------------------------------------- 1 | https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl 2 | spacy 3 | -------------------------------------------------------------------------------- /deeppavlov/requirements/sacremoses.txt: -------------------------------------------------------------------------------- 1 | sacremoses==0.0.53 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/sentencepiece.txt: -------------------------------------------------------------------------------- 1 | sentencepiece==0.2.0 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/slovnet.txt: -------------------------------------------------------------------------------- 1 | slovnet==0.5.* 2 | navec 3 | -------------------------------------------------------------------------------- /deeppavlov/requirements/sortedcontainers.txt: -------------------------------------------------------------------------------- 1 | sortedcontainers==2.4.* 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/torchcrf.txt: -------------------------------------------------------------------------------- 1 | pytorch-crf==0.7.* 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/transformers.txt: -------------------------------------------------------------------------------- 1 | transformers>=4.13.0,<4.25.0;python_version<"3.8" 2 | transformers==4.30.0;python_version>="3.8" 3 | -------------------------------------------------------------------------------- /deeppavlov/requirements/udapi.txt: -------------------------------------------------------------------------------- 1 | udapi==0.3.* 2 | -------------------------------------------------------------------------------- /deeppavlov/requirements/whapi.txt: -------------------------------------------------------------------------------- 1 | bs4 2 | whapi==0.6.* 3 | -------------------------------------------------------------------------------- /deeppavlov/settings.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
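# Typical shell invocations of this settings utility once DeepPavlov is
# installed (the module prints the settings path, or restores defaults
# with -d):
#
#     python -m deeppavlov.settings        # show the current settings path
#     python -m deeppavlov.settings -d     # return settings to defaults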
14 | 15 | import argparse 16 | 17 | from deeppavlov.core.common.paths import get_settings_path, populate_settings_dir 18 | 19 | parser = argparse.ArgumentParser() 20 | 21 | parser.add_argument("-d", "--default", action="store_true", help="return to defaults") 22 | 23 | 24 | def main(): 25 | """DeepPavlov console configuration utility.""" 26 | args = parser.parse_args() 27 | path = get_settings_path() 28 | 29 | if args.default: 30 | if populate_settings_dir(force=True): 31 | print(f'Populated {path} with default settings files') 32 | else: 33 | print(f'{path} is already a default settings directory') 34 | else: 35 | print(f'Current DeepPavlov settings path: {path}') 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /deeppavlov/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/utils/__init__.py -------------------------------------------------------------------------------- /deeppavlov/utils/benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/utils/benchmarks/__init__.py -------------------------------------------------------------------------------- /deeppavlov/utils/connector/__init__.py: -------------------------------------------------------------------------------- 1 | from .dialog_logger import DialogLogger 2 | -------------------------------------------------------------------------------- /deeppavlov/utils/pip_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | from .pip_wrapper import * 2 | -------------------------------------------------------------------------------- /deeppavlov/utils/server/__init__.py: -------------------------------------------------------------------------------- 1 | from .server import get_server_params, get_ssl_params, redirect_root_to_docs, start_model_server 2 | -------------------------------------------------------------------------------- /deeppavlov/utils/server/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
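# A minimal sketch of plugging the middleware and /metrics handler defined
# below into a FastAPI app; the app object and the route path are illustrative.
#
#     from fastapi import FastAPI
#     app = FastAPI()
#     app.add_middleware(PrometheusMiddleware, ignore_paths=('/metrics',))
#     app.add_api_route('/metrics', metrics, methods=['GET'])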
14 | 15 | import time 16 | from typing import Tuple 17 | 18 | from prometheus_client import CONTENT_TYPE_LATEST, REGISTRY, generate_latest 19 | from prometheus_client import Counter, Gauge, Histogram 20 | from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint 21 | from starlette.requests import Request 22 | from starlette.responses import Response 23 | from starlette.types import ASGIApp 24 | 25 | REQUESTS_COUNT = Counter('http_requests_count', 'Number of processed requests', ['endpoint', 'status_code']) 26 | REQUESTS_LATENCY = Histogram('http_requests_latency_seconds', 'Request latency histogram', ['endpoint']) 27 | REQUESTS_IN_PROGRESS = Gauge('http_requests_in_progress', 'Number of requests currently being processed', ['endpoint']) 28 | 29 | 30 | def metrics(request: Request) -> Response: 31 | return Response(generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST) 32 | 33 | 34 | class PrometheusMiddleware(BaseHTTPMiddleware): 35 | def __init__(self, app: ASGIApp, ignore_paths: Tuple = ()) -> None: 36 | super().__init__(app) 37 | self.ignore_paths = ignore_paths 38 | 39 | async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response: 40 | endpoint = request.url.path 41 | 42 | if endpoint in self.ignore_paths: 43 | return await call_next(request) 44 | 45 | REQUESTS_IN_PROGRESS.labels(endpoint=endpoint).inc() 46 | 47 | start_time = time.perf_counter() 48 | status_code = 500 49 | 50 | try: 51 | response = await call_next(request) 52 | status_code = response.status_code 53 | finally: 54 | if status_code == 200: 55 | duration = time.perf_counter() - start_time 56 | REQUESTS_LATENCY.labels(endpoint=endpoint).observe(duration) 57 | REQUESTS_COUNT.labels(endpoint=endpoint, status_code=status_code).inc() 58 | REQUESTS_IN_PROGRESS.labels(endpoint=endpoint).dec() 59 | 60 | return response 61 | -------------------------------------------------------------------------------- /deeppavlov/utils/settings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/utils/settings/__init__.py -------------------------------------------------------------------------------- /deeppavlov/utils/settings/dialog_logger_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "enabled": false, 3 | "logger_name": "default", 4 | "log_path": "~/.deeppavlov/dialog_logs", 5 | "logfile_max_size_kb": 10240, 6 | "ensure_ascii": false 7 | } -------------------------------------------------------------------------------- /deeppavlov/utils/settings/log_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "disable_existing_loggers": false, 4 | "loggers": { 5 | "deeppavlov": { 6 | "level": "INFO", 7 | "handlers": [ 8 | "stderr" 9 | ], 10 | "propagate": true 11 | }, 12 | "uvicorn.access": { 13 | "level": "INFO", 14 | "handlers": [ 15 | "uvicorn_handler" 16 | ], 17 | "propagate": true 18 | }, 19 | "uvicorn.error": { 20 | "level": "INFO", 21 | "handlers": [ 22 | "uvicorn_handler" 23 | ], 24 | "propagate": true 25 | }, 26 | "train_report": { 27 | "level": "INFO", 28 | "handlers": [ 29 | "train_handler" 30 | ], 31 | "propagate": true 32 | }, 33 | "filelock": { 34 | "level": "WARNING", 35 | "handlers": [ 36 | "stdout" 37 | ], 38 | "propagate": true 39 | } 40 | }, 41 | "formatters": { 42 | "default": { 43 | "format": 
"%(asctime)s.%(msecs)d %(levelname)s in '%(name)s'['%(module)s'] at line %(lineno)d: %(message)s", 44 | "datefmt": "%Y-%m-%d %H:%M:%S" 45 | }, 46 | "uvicorn_fmt": { 47 | "format": "%(asctime)s %(message)s", 48 | "datefmt": "%Y-%m-%d %H:%M:%S" 49 | }, 50 | "message": { 51 | "format": "%(message)s" 52 | } 53 | }, 54 | "handlers": { 55 | "file": { 56 | "class": "logging.FileHandler", 57 | "level": "DEBUG", 58 | "formatter": "default", 59 | "filename": "~/.deeppavlov/log.log" 60 | }, 61 | "stdout": { 62 | "class": "logging.StreamHandler", 63 | "level": "DEBUG", 64 | "formatter": "default", 65 | "stream": "ext://sys.stdout" 66 | }, 67 | "stderr": { 68 | "class": "logging.StreamHandler", 69 | "level": "DEBUG", 70 | "formatter": "default", 71 | "stream": "ext://sys.stderr" 72 | }, 73 | "uvicorn_handler": { 74 | "class": "logging.StreamHandler", 75 | "level": "INFO", 76 | "formatter": "uvicorn_fmt", 77 | "stream": "ext://sys.stdout", 78 | "filters": ["probeFilter"] 79 | }, 80 | "train_handler": { 81 | "class": "logging.StreamHandler", 82 | "level": "INFO", 83 | "formatter": "message", 84 | "stream": "ext://sys.stdout" 85 | } 86 | }, 87 | "filters": { 88 | "probeFilter": { 89 | "()": "deeppavlov.core.common.log.ProbeFilter" 90 | } 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /deeppavlov/utils/settings/server_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "common_defaults": { 3 | "host": "0.0.0.0", 4 | "port": 5000, 5 | "model_args_names": [], 6 | "https": false, 7 | "https_cert_path": "", 8 | "https_key_path": "", 9 | "socket_type": "TCP", 10 | "unix_socket_file": "/tmp/deeppavlov_socket.s", 11 | "socket_launch_message": "launching socket server at" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /deeppavlov/utils/socket/__init__.py: -------------------------------------------------------------------------------- 1 | from .socket import encode, start_socket_server 2 | -------------------------------------------------------------------------------- /deeppavlov/vocabs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/vocabs/__init__.py -------------------------------------------------------------------------------- /deeppavlov/vocabs/wiki_sqlite.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from logging import getLogger 16 | from typing import List, Any, Optional, Union 17 | 18 | from deeppavlov.core.common.registry import register 19 | from deeppavlov.core.models.component import Component 20 | from deeppavlov.dataset_iterators.sqlite_iterator import SQLiteDataIterator 21 | 22 | logger = getLogger(__name__) 23 | 24 | 25 | @register('wiki_sqlite_vocab') 26 | class WikiSQLiteVocab(SQLiteDataIterator, Component): 27 | """Get content from SQLite database by document ids. 28 | 29 | Args: 30 | load_path: a path to the local DB file 31 | join_docs: whether to join extracted docs with ' ' or not 32 | shuffle: whether to shuffle data or not 33 | 34 | Attributes: 35 | join_docs: whether to join extracted docs with ' ' or not 36 | 37 | """ 38 | 39 | def __init__(self, load_path: str, join_docs: bool = True, shuffle: bool = False, **kwargs) -> None: 40 | SQLiteDataIterator.__init__(self, load_path=load_path, shuffle=shuffle) 41 | self.join_docs = join_docs 42 | 43 | def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) -> List[Union[str, List[str]]]: 44 | """Get the contents of documents, either joined with spaces or as separate strings. 45 | 46 | Args: 47 | doc_ids: a batch of lists of ids to get contents for 48 | 49 | Returns: 50 | a list of contents / list of lists of contents 51 | """ 52 | all_contents = [] 53 | if not doc_ids: 54 | logger.warning('No doc_ids provided to WikiSQLiteVocab, returning all docs') 55 | doc_ids = [self.get_doc_ids()] 56 | 57 | for ids in doc_ids: 58 | contents = [self.get_doc_content(doc_id) for doc_id in ids] 59 | if self.join_docs: 60 | contents = ' '.join(contents) 61 | all_contents.append(contents) 62 | 63 | return all_contents 64 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = -WT 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = DeepPavlov 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/aws_ec2/01_login_to_aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/01_login_to_aws.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/02_choose_ubuntu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/02_choose_ubuntu.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/03_select_instance_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/03_select_instance_type.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/04_add_storage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/04_add_storage.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/05_review_instance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/05_review_instance.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/06_go_to_running_instances.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/06_go_to_running_instances.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/07_wait_init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/07_wait_init.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/08_01_set_sec_group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/08_01_set_sec_group.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/08_02_set_inbound.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/08_02_set_inbound.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/09_01_select_connect.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/09_01_select_connect.png -------------------------------------------------------------------------------- /docs/_static/aws_ec2/09_02_connection_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/09_02_connection_info.png -------------------------------------------------------------------------------- /docs/_static/deeppavlov.css: -------------------------------------------------------------------------------- 1 | .wy-side-nav-search { 2 | background-color: #0176bd; 3 | } 4 | 5 | .wy-nav-content { 6 | max-width: 1000px; 7 | } 8 | 9 | .wy-side-nav-search>div.version { 10 | color: #ffffff; 11 | } -------------------------------------------------------------------------------- /docs/_static/deeppavlov.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/deeppavlov.png -------------------------------------------------------------------------------- /docs/_static/deeppavlov_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/deeppavlov_logo.png -------------------------------------------------------------------------------- /docs/_static/dp_agnt_diag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/dp_agnt_diag.png -------------------------------------------------------------------------------- /docs/_static/gobot_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/gobot_diagram.png -------------------------------------------------------------------------------- /docs/_static/ipavlov_footer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/ipavlov_footer.png -------------------------------------------------------------------------------- /docs/_static/kvret_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/kvret_diagram.png -------------------------------------------------------------------------------- /docs/_static/my_blocks.css: -------------------------------------------------------------------------------- 1 | button.copybtn svg { 2 | width: 1.3em; 3 | height: 1.3em; 4 | padding: 0.1em; 5 | } 6 | 7 | button.copybtn { 8 | top: 0.2em; 9 | width: 1.4em; 10 | height: 1.4em; 11 | } 12 | 13 | .rst-content .linenodiv pre, .rst-content div[class^=highlight] pre, .rst-content pre.literal-block { 14 | font-size: 13px; 15 | line-height: 1.4; 16 | } 17 | -------------------------------------------------------------------------------- /docs/_static/social/Medium_Monogram.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/Medium_Monogram.svg -------------------------------------------------------------------------------- /docs/_static/social/Twitter_Social_Icon_Circle_Color.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/Twitter_Social_Icon_Circle_Color.svg -------------------------------------------------------------------------------- /docs/_static/social/telegram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/telegram.png -------------------------------------------------------------------------------- /docs/_static/social/youtube_social_circle_red.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/youtube_social_circle_red.png -------------------------------------------------------------------------------- /docs/_static/tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/tree.png -------------------------------------------------------------------------------- /docs/apiref/core.rst: -------------------------------------------------------------------------------- 1 | core 2 | ==== 3 | DeepPavlov Core 4 | 5 | .. automodule:: deeppavlov.core 6 | :members: 7 | 8 | .. toctree:: 9 | :glob: 10 | :caption: Core 11 | 12 | core/* 13 | -------------------------------------------------------------------------------- /docs/apiref/core/commands.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.core.commands 2 | ======================== 3 | Basic training and inference functions. 4 | 5 | .. automodule:: deeppavlov.core.commands.infer 6 | :members: 7 | 8 | .. automodule:: deeppavlov.core.commands.train 9 | :members: 10 | -------------------------------------------------------------------------------- /docs/apiref/core/common.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.core.common 2 | ====================== 3 | Registration and classes initialization functionality, class method decorators. 4 | 5 | .. autoclass:: deeppavlov.core.common.chainer.Chainer 6 | :members: 7 | 8 | .. automethod:: __call__ 9 | 10 | .. autoclass:: deeppavlov.core.common.base.Element 11 | 12 | .. automethod:: __init__ 13 | 14 | .. autoclass:: deeppavlov.core.common.base.Model 15 | 16 | .. automethod:: __init__ 17 | 18 | .. automodule:: deeppavlov.core.common.metrics_registry 19 | :members: 20 | 21 | .. automodule:: deeppavlov.core.common.params 22 | :members: 23 | 24 | .. automodule:: deeppavlov.core.common.registry 25 | :members: 26 | -------------------------------------------------------------------------------- /docs/apiref/core/data.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.core.data 2 | ==================== 3 | DatasetReader, Vocab, DataLearningIterator and DataFittingIterator classes. 4 | 5 | .. autoclass:: deeppavlov.core.data.dataset_reader.DatasetReader 6 | 7 | .. autoclass:: deeppavlov.core.data.data_fitting_iterator.DataFittingIterator 8 | 9 | ..
autoclass:: deeppavlov.core.data.data_learning_iterator.DataLearningIterator 10 | 11 | .. autoclass:: deeppavlov.core.data.simple_vocab.SimpleVocabulary 12 | -------------------------------------------------------------------------------- /docs/apiref/core/models.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.core.models 2 | ====================== 3 | Abstract model classes and interfaces. 4 | 5 | .. autoclass:: deeppavlov.core.models.component.Component 6 | 7 | .. autoclass:: deeppavlov.core.models.serializable.Serializable 8 | 9 | .. autoclass:: deeppavlov.core.models.estimator.Estimator 10 | 11 | .. autoclass:: deeppavlov.core.models.nn_model.NNModel 12 | 13 | .. autoclass:: deeppavlov.core.models.torch_model.TorchModel 14 | -------------------------------------------------------------------------------- /docs/apiref/core/trainers.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.core.trainers 2 | ======================== 3 | Trainer classes. 4 | 5 | .. autoclass:: deeppavlov.core.trainers.FitTrainer 6 | :members: 7 | 8 | .. autoclass:: deeppavlov.core.trainers.NNTrainer 9 | :members: 10 | :inherited-members: 11 | -------------------------------------------------------------------------------- /docs/apiref/dataset_iterators.rst: -------------------------------------------------------------------------------- 1 | dataset_iterators 2 | ================= 3 | Concrete DatasetIterator classes. 4 | 5 | .. autoclass:: deeppavlov.dataset_iterators.basic_classification_iterator.BasicClassificationDatasetIterator 6 | :members: 7 | 8 | .. autoclass:: deeppavlov.dataset_iterators.siamese_iterator.SiameseIterator 9 | 10 | .. autoclass:: deeppavlov.dataset_iterators.sqlite_iterator.SQLiteDataIterator 11 | 12 | .. autoclass:: deeppavlov.dataset_iterators.squad_iterator.SquadIterator 13 | 14 | .. automodule:: deeppavlov.dataset_iterators.typos_iterator 15 | :members: 16 | 17 | .. automodule:: deeppavlov.dataset_iterators.multitask_iterator 18 | :members: 19 | -------------------------------------------------------------------------------- /docs/apiref/dataset_readers.rst: -------------------------------------------------------------------------------- 1 | dataset_readers 2 | =============== 3 | Concrete DatasetReader classes. 4 | 5 | .. autoclass:: deeppavlov.dataset_readers.basic_classification_reader.BasicClassificationDatasetReader 6 | :members: 7 | 8 | .. autoclass:: deeppavlov.dataset_readers.conll2003_reader.Conll2003DatasetReader 9 | 10 | .. autoclass:: deeppavlov.dataset_readers.faq_reader.FaqDatasetReader 11 | :members: 12 | 13 | .. autoclass:: deeppavlov.dataset_readers.line_reader.LineReader 14 | :members: 15 | 16 | .. autoclass:: deeppavlov.dataset_readers.paraphraser_reader.ParaphraserReader 17 | 18 | .. autoclass:: deeppavlov.dataset_readers.squad_dataset_reader.SquadDatasetReader 19 | :members: 20 | 21 | .. automodule:: deeppavlov.dataset_readers.typos_reader 22 | :members: 23 | 24 | .. automodule:: deeppavlov.dataset_readers.ubuntu_v2_reader 25 | :members: 26 | 27 | .. automodule:: deeppavlov.dataset_readers.multitask_reader 28 | :members: 29 | -------------------------------------------------------------------------------- /docs/apiref/metrics.rst: -------------------------------------------------------------------------------- 1 | metrics 2 | ======= 3 | Different Metric functions. 4 | 5 | .. automodule:: deeppavlov.metrics 6 | :members: 7 | 8 | .. 
autofunction:: deeppavlov.metrics.accuracy.sets_accuracy 9 | 10 | .. autofunction:: deeppavlov.metrics.fmeasure.round_f1 11 | 12 | .. autofunction:: deeppavlov.metrics.fmeasure.round_f1_macro 13 | 14 | .. autofunction:: deeppavlov.metrics.fmeasure.round_f1_weighted 15 | 16 | .. autofunction:: deeppavlov.metrics.fmeasure.ner_f1 17 | 18 | .. autofunction:: deeppavlov.metrics.fmeasure.ner_token_f1 19 | 20 | .. autofunction:: deeppavlov.metrics.log_loss.sk_log_loss 21 | 22 | .. autofunction:: deeppavlov.metrics.roc_auc_score.roc_auc_score 23 | -------------------------------------------------------------------------------- /docs/apiref/models.rst: -------------------------------------------------------------------------------- 1 | models 2 | ====== 3 | Concrete Model classes. 4 | 5 | .. automodule:: deeppavlov.models 6 | :members: 7 | 8 | .. toctree:: 9 | :glob: 10 | :caption: Models 11 | 12 | models/* -------------------------------------------------------------------------------- /docs/apiref/models/api_requester.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.api_requester 2 | =============================== 3 | 4 | .. automodule:: deeppavlov.models.api_requester 5 | :members: 6 | 7 | .. autoclass:: deeppavlov.models.api_requester.api_requester.ApiRequester 8 | 9 | .. automethod:: __call__ 10 | .. automethod:: get_async_response 11 | 12 | 13 | .. autoclass:: deeppavlov.models.api_requester.api_router.ApiRouter 14 | 15 | .. automethod:: __call__ 16 | -------------------------------------------------------------------------------- /docs/apiref/models/classifiers.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.classifiers 2 | ============================= 3 | 4 | .. automodule:: deeppavlov.models.classifiers 5 | :members: 6 | 7 | .. autoclass:: deeppavlov.models.classifiers.torch_classification_model.TorchTextClassificationModel 8 | :members: 9 | 10 | .. automethod:: __call__ 11 | 12 | .. autoclass:: deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier 13 | :members: 14 | 15 | .. automethod:: __call__ 16 | 17 | .. autoclass:: deeppavlov.models.classifiers.proba2labels.Proba2Labels 18 | :members: 19 | 20 | .. automethod:: __call__ 21 | -------------------------------------------------------------------------------- /docs/apiref/models/doc_retrieval.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.doc_retrieval 2 | =============================== 3 | 4 | Document retrieval classes. 5 | 6 | .. automodule:: deeppavlov.models.doc_retrieval 7 | 8 | .. autoclass:: deeppavlov.models.doc_retrieval.tfidf_ranker.TfidfRanker 9 | :members: 10 | 11 | .. automethod:: __call__ 12 | 13 | .. autoclass:: deeppavlov.models.doc_retrieval.logit_ranker.LogitRanker 14 | :members: 15 | 16 | .. automethod:: __call__ 17 | 18 | .. autoclass:: deeppavlov.models.doc_retrieval.pop_ranker.PopRanker 19 | :members: 20 | 21 | .. automethod:: __call__ -------------------------------------------------------------------------------- /docs/apiref/models/embedders.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.embedders 2 | ============================ 3 | 4 | .. autoclass:: deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder 5 | 6 | .. automethod:: __call__ 7 | .. automethod:: __iter__ 8 | 9 | .. 
autoclass:: deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder 10 | 11 | .. automethod:: __call__ 12 | 13 | .. autoclass:: deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder 14 | 15 | .. automethod:: __call__ 16 | -------------------------------------------------------------------------------- /docs/apiref/models/entity_extraction.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.entity_extraction 2 | =================================== 3 | 4 | .. autoclass:: deeppavlov.models.entity_extraction.ner_chunker.NerChunker 5 | 6 | .. automethod:: __init__ 7 | .. automethod:: __call__ 8 | 9 | .. autoclass:: deeppavlov.models.entity_extraction.entity_linking.EntityLinker 10 | 11 | .. automethod:: __init__ 12 | .. automethod:: __call__ 13 | 14 | .. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.EntityDetectionParser 15 | 16 | .. automethod:: __init__ 17 | .. automethod:: __call__ 18 | 19 | .. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.QuestionSignChecker 20 | -------------------------------------------------------------------------------- /docs/apiref/models/kbqa.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.kbqa 2 | ====================== 3 | 4 | .. automodule:: deeppavlov.models.kbqa 5 | 6 | .. autoclass:: deeppavlov.models.kbqa.type_define.AnswerTypesExtractor 7 | 8 | .. automethod:: __init__ 9 | .. automethod:: __call__ 10 | 11 | .. autoclass:: deeppavlov.models.kbqa.query_generator.QueryGenerator 12 | 13 | .. automethod:: __init__ 14 | .. automethod:: __call__ 15 | 16 | .. autoclass:: deeppavlov.models.kbqa.query_generator_base.QueryGeneratorBase 17 | 18 | .. automethod:: __init__ 19 | .. automethod:: __call__ 20 | 21 | .. autoclass:: deeppavlov.models.kbqa.rel_ranking_infer.RelRankerInfer 22 | 23 | .. automethod:: __init__ 24 | .. automethod:: __call__ 25 | 26 | .. autoclass:: deeppavlov.models.kbqa.template_matcher.TemplateMatcher 27 | 28 | .. automethod:: __init__ 29 | .. automethod:: __call__ 30 | 31 | .. autoclass:: deeppavlov.models.kbqa.ru_adj_to_noun.RuAdjToNoun 32 | 33 | .. automethod:: __init__ 34 | .. automethod:: __call__ 35 | 36 | .. autoclass:: deeppavlov.models.kbqa.tree_to_sparql.TreeToSparql 37 | 38 | .. automethod:: __init__ 39 | .. automethod:: __call__ 40 | 41 | .. autoclass:: deeppavlov.models.kbqa.wiki_parser.WikiParser 42 | 43 | .. automethod:: __init__ 44 | .. automethod:: __call__ 45 | -------------------------------------------------------------------------------- /docs/apiref/models/preprocessors.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.preprocessors 2 | =============================== 3 | 4 | .. autoclass:: deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor 5 | 6 | .. automethod:: __call__ 7 | 8 | .. autoclass:: deeppavlov.models.preprocessors.mask.Mask 9 | 10 | .. autoclass:: deeppavlov.models.preprocessors.one_hotter.OneHotter 11 | 12 | .. autoclass:: deeppavlov.models.preprocessors.sanitizer.Sanitizer 13 | 14 | .. autofunction:: deeppavlov.models.preprocessors.str_lower.str_lower 15 | 16 | .. autoclass:: deeppavlov.models.preprocessors.str_token_reverser.StrTokenReverser 17 | 18 | .. automethod:: __call__ 19 | 20 | .. autoclass:: deeppavlov.models.preprocessors.str_utf8_encoder.StrUTF8Encoder 21 | 22 | .. automethod:: __call__ 23 | 24 | .. 
autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.DocumentChunker 25 | 26 | .. automethod:: __call__ 27 | 28 | .. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.StringMultiplier 29 | 30 | .. automethod:: __call__ 31 | -------------------------------------------------------------------------------- /docs/apiref/models/relation_extraction.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.relation_extraction 2 | ===================================== 3 | 4 | .. autoclass:: deeppavlov.models.relation_extraction.relation_extraction_bert.REBertModel 5 | 6 | .. automethod:: __init__ 7 | .. automethod:: __call__ 8 | .. automethod:: train_on_batch 9 | -------------------------------------------------------------------------------- /docs/apiref/models/sklearn.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.sklearn 2 | ============================= 3 | 4 | .. automodule:: deeppavlov.models.sklearn 5 | :members: 6 | 7 | .. autoclass:: deeppavlov.models.sklearn.sklearn_component.SklearnComponent 8 | 9 | .. automethod:: __call__ 10 | .. automethod:: fit 11 | .. automethod:: init_from_scratch 12 | .. automethod:: load 13 | .. automethod:: save 14 | .. automethod:: compose_input_data 15 | .. automethod:: get_class_attributes 16 | .. automethod:: get_function_params 17 | -------------------------------------------------------------------------------- /docs/apiref/models/spelling_correction.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.spelling_correction 2 | ===================================== 3 | 4 | .. autoclass:: deeppavlov.models.spelling_correction.brillmoore.ErrorModel 5 | 6 | .. automethod:: __call__ 7 | .. automethod:: fit 8 | .. automethod:: save 9 | .. automethod:: load 10 | 11 | .. autoclass:: deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent 12 | 13 | .. automethod:: __call__ 14 | 15 | 16 | .. autoclass:: deeppavlov.models.spelling_correction.electors.top1_elector.TopOneElector 17 | 18 | .. automethod:: __call__ 19 | 20 | .. autoclass:: deeppavlov.models.spelling_correction.electors.kenlm_elector.KenlmElector 21 | 22 | .. automethod:: __call__ 23 | -------------------------------------------------------------------------------- /docs/apiref/models/tokenizers.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.tokenizers 2 | ============================ 3 | 4 | .. autoclass:: deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer 5 | 6 | .. automethod:: __call__ 7 | 8 | .. autoclass:: deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer 9 | 10 | .. automethod:: __call__ 11 | 12 | .. autoclass:: deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer 13 | 14 | .. autoclass:: deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer 15 | 16 | .. automethod:: __call__ -------------------------------------------------------------------------------- /docs/apiref/models/torch_bert.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.torch_bert 2 | ============================ 3 | 4 | .. automodule:: deeppavlov.models.torch_bert 5 | :members: 6 | 7 | .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor 8 | 9 | .. automethod:: __call__ 10 | 11 | .. 
autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersNerPreprocessor 12 | 13 | .. automethod:: __call__ 14 | 15 | .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertRankerPreprocessor 16 | 17 | .. automethod:: __call__ 18 | 19 | .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel 20 | 21 | .. automethod:: __call__ 22 | .. automethod:: train_on_batch 23 | 24 | .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger 25 | 26 | .. automethod:: __call__ 27 | .. automethod:: train_on_batch 28 | 29 | .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad 30 | 31 | .. automethod:: __call__ 32 | .. automethod:: train_on_batch 33 | 34 | .. autoclass:: deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel 35 | 36 | .. automethod:: __call__ 37 | .. automethod:: train_on_batch 38 | -------------------------------------------------------------------------------- /docs/apiref/models/vectorizers.rst: -------------------------------------------------------------------------------- 1 | deeppavlov.models.vectorizers 2 | ============================= 3 | 4 | 5 | .. autoclass:: deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer 6 | :members: 7 | 8 | .. automethod:: __call__ 9 | -------------------------------------------------------------------------------- /docs/apiref/vocabs.rst: -------------------------------------------------------------------------------- 1 | vocabs 2 | ====== 3 | Concrete Vocab classes. 4 | 5 | .. automodule:: deeppavlov.vocabs 6 | :members: 7 | 8 | .. autoclass:: deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab 9 | :members: 10 | 11 | .. automethod:: __call__ 12 | 13 | .. automodule:: deeppavlov.vocabs.typos 14 | :members: 15 | -------------------------------------------------------------------------------- /docs/devguides/registry.rst: -------------------------------------------------------------------------------- 1 | Register your model 2 | =================== 3 | 4 | In order to extend the library, you need to register your classes and functions; it is done in two steps. 5 | 6 | 1. Decorate your :class:`~deeppavlov.core.models.component.Component` 7 | (or :class:`~deeppavlov.core.data.dataset_reader.DatasetReader`, 8 | or :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`, 9 | or :class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator`) 10 | using :func:`~deeppavlov.core.common.registry.register` and/or your metrics function 11 | using :func:`~deeppavlov.core.common.metrics_registry.register_metric`. 12 | 13 | 2. Rebuild the registry by running, from the DeepPavlov root directory: 14 | 15 | :: 16 | 17 | python -m utils.prepare.registry 18 | 19 | This script imports all the modules in the deeppavlov package, builds the registry from them, and writes it to a file. 20 | 21 | 22 | However, it is possible to use some classes and functions inside configuration files without registering them explicitly.
23 | There are two options available here: 24 | 25 | - instead of ``{"class_name": "registered_component_name"}`` in the config file, use a key-value pair similar to 26 | ``{"class_name": "my_package.my_module:MyClass"}`` 27 | 28 | - if your classes/functions are properly decorated but not included in the registry, use the ``"metadata"`` section of 29 | your config file, specifying imports as ``"metadata": {"imports": ["my_local_package.my_module", "global_package.module"]}``; 30 | then the second step described above will be unnecessary (local packages are imported from the current working 31 | directory). 32 | -------------------------------------------------------------------------------- /docs/features/hypersearch.rst: -------------------------------------------------------------------------------- 1 | Hyperparameters optimization 2 | ============================ 3 | 4 | You can search for the best hyperparameters of your model in DeepPavlov by means of cross-validation. 5 | 6 | Cross-validation 7 | ~~~~~~~~~~~~~~~~ 8 | 9 | You can run cross-validation in DeepPavlov to select the best parameters of your model. 10 | For this purpose, run the special command 'paramsearch', for example: 11 | 12 | .. code:: bash 13 | 14 | python -m deeppavlov.paramsearch path_to_json_config.json --folds 5 15 | 16 | 17 | Parameters 18 | ---------- 19 | 20 | The cross-validation command has several parameters: 21 | 22 | - ``config_path``: 23 | The path to the config file of your model. 24 | - ``--folds``: 25 | The number of folds used in cross-validation. 26 | To use leave-one-out cross-validation instead of a fixed number of folds, 27 | specify ``--folds loo``. 28 | If you don't want to cross-validate, just omit this parameter. 29 | - ``--search_type``: 30 | This parameter is optional; the default value is "grid" (grid search). 31 | 32 | 33 | .. note:: 34 | 35 | Folds will be created automatically from the union of the train and validation datasets. 36 | 37 | 38 | Special parameters in config 39 | ---------------------------- 40 | The model config file should contain parameter ranges for the search. 41 | For example, if you want to optimize the regularization coefficient of a model, 42 | you should add an additional parameter to the config with the suffix '_range'. 43 | Here is an example for a logistic regression model: 44 | 45 | .. code:: python 46 | 47 | { 48 | "class_name": "faq_logreg_model", 49 | "in": "q_vect", 50 | "fit_on": ["q_vect", "y"], 51 | "c": {"search_choice": [1, 10, 100, 1000]}, 52 | "out": ["answer", "score"] 53 | } 54 | 55 | In this example, the parameter "c" is described via ``search_choice``; the values for grid search are: 56 | 57 | .. code:: python 58 | 59 | {"search_choice": [value_0, ..., value_n]} 60 | 61 | 62 | Results 63 | ------- 64 | As a result, you'll get a new JSON config with the best model parameters. 65 | It'll be stored in the same directory as the original config file and will have the suffix '_cvbest.json'. 66 | You'll also see final log messages about the best model: 67 | 68 | .. code:: bash 69 | 70 | INFO in '__main__'['paramsearch'] at line 169: Best model params: {'C': 10000, 'penalty': 'l1', 'accuracy': 0.81466} 71 | INFO in '__main__'['paramsearch'] at line 184: Best model saved in json-file: path_to_model_config_cvbest.json 72 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to DeepPavlov's documentation! 2 | ====================================== 3 | 4 | ..
toctree:: 5 | :glob: 6 | :maxdepth: 1 7 | 8 | Installation 9 | QuickStart 10 | General concepts 11 | Configuration file 12 | Python pipelines 13 | Models overview 14 | 15 | 16 | .. toctree:: 17 | :glob: 18 | :maxdepth: 2 19 | :caption: Features 20 | 21 | Pre-trained embeddings 22 | AutoML 23 | 24 | 25 | .. toctree:: 26 | :glob: 27 | :maxdepth: 1 28 | :caption: Models 29 | 30 | Multitask BERT 31 | Context Question Answering 32 | Classification 33 | Few-shot Classification 34 | Named Entity Recognition 35 | Entity Extraction 36 | BERT-based models 37 | Morphological Tagging 38 | Neural Ranking 39 | Spelling Correction 40 | Syntactic Parsing 41 | TF-IDF Ranking 42 | Popularity Ranking 43 | Knowledge Base Question answering 44 | Relation Extraction 45 | SuperGLUE Submission 46 | Open-Domain Question Answering 47 | 48 | 49 | .. toctree:: 50 | :glob: 51 | :maxdepth: 3 52 | :caption: Integrations 53 | 54 | REST API 55 | Socket API 56 | Amazon AWS deployment 57 | DeepPavlov settings 58 | 59 | 60 | .. toctree:: 61 | :glob: 62 | :maxdepth: 3 63 | :caption: Developer Guides 64 | 65 | Contribution guide 66 | Register your model 67 | 68 | 69 | .. toctree:: 70 | :glob: 71 | :maxdepth: 3 72 | :caption: Internships 73 | 74 | Internships 75 | 76 | 77 | .. toctree:: 78 | :glob: 79 | :maxdepth: 3 80 | :caption: Package Reference 81 | 82 | apiref/* 83 | 84 | 85 | Indices and tables 86 | ================== 87 | 88 | * :ref:`genindex` 89 | * :ref:`modindex` 90 | -------------------------------------------------------------------------------- /docs/integrations/settings.rst: -------------------------------------------------------------------------------- 1 | DeepPavlov settings 2 | =================== 3 | 4 | DeepPavlov provides some tools to facilitate its usage (e.g. dialog logging and settings management). This document guides you through them. 5 | 6 | 1. Settings files access and management 7 | --------------------------------------- 8 | 9 | Most DeepPavlov settings are located in settings files, which in turn are located in a settings folder. The default settings folder location is ``deeppavlov/utils/settings``. 10 | 11 | You can override the settings directory path by setting the ``DP_SETTINGS_PATH`` environment variable. Missing files will be added automatically when running any DeepPavlov script. 12 | 13 | You can get the current full path to the settings directory with ``python -m deeppavlov.settings``. 14 | To reset settings in the current settings directory, use ``python -m deeppavlov.settings -d``. 15 | 16 | 2. Dialog logging 17 | ----------------- 18 | 19 | DeepPavlov supports logging of inferred utterances and DeepPavlov model responses. You can manage dialog logging by 20 | editing the ``dialog_logger_config.json`` file in the settings directory. 21 | 22 | The following dialog logging settings are available: 23 | 24 | 1. **enabled** (default: ``false``): turns dialog logging on/off for a DeepPavlov instance; 25 | 2. **log_path** (default: ``~/.deeppavlov/dialog_logs``): sets the directory where dialog logs are stored; 26 | 3. **logger_name** (default: ``default``): sets the subdirectory name for storing dialog logs; 27 | 4. **logfile_max_size_kb** (default: ``10240``): sets the maximum log file size in kilobytes; if exceeded, a new log file is created; 28 | 5. **ensure_ascii** (default: ``false``): if ``true``, converts all non-ASCII symbols in logged content to Unicode code points. 29 |
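For example, a settings file that turns dialog logging on under a custom logger name (``my_bot`` here is just an illustrative name; the remaining keys keep the default values listed above) might look like this:

.. code:: json

    {
      "enabled": true,
      "logger_name": "my_bot",
      "log_path": "~/.deeppavlov/dialog_logs",
      "logfile_max_size_kb": 10240,
      "ensure_ascii": false
    }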
30 | 3. Environment variables 31 | ------------------------ 32 | 33 | - **DP_SETTINGS_PATH** — a custom path to a directory that contains settings files. It's automatically populated with missing files when running any DeepPavlov script. 34 | - **DP_SKIP_NLTK_DOWNLOAD** — set to ``TRUE`` to prevent automatic downloading of **nltk** packages (``punkt``, ``stopwords``, ``perluniprops``, ``nonbreaking_prefixes``). 35 | -------------------------------------------------------------------------------- /docs/internships/internships.rst: -------------------------------------------------------------------------------- 1 | 2 | Internships 3 | =========== 4 | 5 | Do you have ideas on how to improve dialog systems for everyone? Are you ready to make an impact across the world? 6 | Great, then join us! 7 | 8 | Let’s shape the future of Conversational AI together. An internship is for aspiring graduate and undergraduate students 9 | who are passionate about Conversational AI technology and offer diverse perspectives. 10 | 11 | As an intern, you will work on some of the most ambitious technical problems, develop new ML solutions that will impact 12 | future DeepPavlov products, and make the lives of DeepPavlov users easier. 13 | 14 | All interns are paired with a mentor and will participate directly in DeepPavlov's groundbreaking work. 15 | There are no restrictions on publications based on internships. International candidates are welcome to apply. 16 | 17 | Each of our research teams has specific test assignments for interested candidates, so please familiarize yourself 18 | with our `projects `_ and pick the ones that best match your skills and interests. 19 | 20 | `Apply now at our website `_. 21 | -------------------------------------------------------------------------------- /docs/intro/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | DeepPavlov supports **Linux**, **Windows 10+** (through WSL/WSL2), **MacOS** (Big Sur+) platforms, and **Python 3.6-3.11**. 5 | Depending on the model used, you may need from 4 to 16 GB of RAM. 6 | 7 | Install with pip 8 | ~~~~~~~~~~~~~~~~ 9 | 10 | You should install DeepPavlov in a `virtual environment `_. If you’re 11 | unfamiliar with Python virtual environments, take a look at this 12 | `guide `_. A virtual 13 | environment makes it easier to manage different projects and avoid compatibility issues between dependencies. 14 | 15 | #. Create a virtual environment: 16 | 17 | .. code:: bash 18 | 19 | python -m venv env 20 | 21 | #. Activate the virtual environment on Linux (``source`` can be replaced with ``.``): 22 | 23 | .. code:: bash 24 | 25 | source env/bin/activate 26 | 27 | #. Install DeepPavlov inside this virtual environment: 28 | 29 | .. code:: bash 30 | 31 | pip install deeppavlov 32 | 33 | Install from source 34 | ~~~~~~~~~~~~~~~~~~~ 35 | 36 | Install the DeepPavlov **dev** branch from source with the following command: 37 | 38 | .. code:: bash 39 | 40 | pip install git+http://github.com/deeppavlov/DeepPavlov@dev 41 | 42 | This command installs the bleeding-edge dev version rather than the latest release version. The dev version is useful 43 | for staying up to date with the latest developments, for instance when a bug has been fixed since the last release but 44 | a new release hasn’t been rolled out yet. However, this means the dev version may not always be stable.
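After installing, you can sanity-check which version actually ended up in your environment (this assumes ``deeppavlov.__version__`` is exposed, which ``deeppavlov/_meta.py`` appears to provide; the exact string will differ between release and dev installs):

.. code:: bash

    python -c "import deeppavlov; print(deeppavlov.__version__)"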
45 | 46 | Editable install 47 | ~~~~~~~~~~~~~~~~ 48 | 49 | You will need an editable install if you want to make changes to the DeepPavlov source code that take effect immediately, 50 | without requiring a new installation. 51 | 52 | Clone the repository and install DeepPavlov with the following commands: 53 | 54 | .. code:: bash 55 | 56 | git clone http://github.com/deeppavlov/DeepPavlov.git 57 | pip install -e DeepPavlov 58 | 59 | Docker Images 60 | ~~~~~~~~~~~~~ 61 | 62 | We have built several DeepPavlov-based Docker images, which include: 63 | 64 | * A DeepPavlov-based Jupyter notebook Docker image; 65 | * Docker images which serve some of our models and allow you to access them 66 | via REST API (:doc:`riseapi ` mode). 67 | 68 | Here is our `DockerHub repository `_ with 69 | images and deployment instructions. 70 | -------------------------------------------------------------------------------- /docs/intro/overview.rst: -------------------------------------------------------------------------------- 1 | Conceptual overview 2 | =================== 3 | 4 | Our goal is to provide AI-application developers and researchers with: 5 | 6 | - A set of pre-trained NLP models, pre-defined dialog system components 7 | (ML/DL/Rule-based), and pipeline templates; 8 | - A framework for implementing and testing their own dialog models; 9 | - Tools for application integration with adjacent infrastructure 10 | (messengers, helpdesk software, etc.); 11 | - Benchmarking environments for conversational models and uniform access 12 | to relevant datasets. 13 | 14 | .. image:: ../_static/dp_agnt_diag.png 15 | 16 | 17 | Key Concepts 18 | ------------ 19 | 20 | - A ``Model`` is any NLP model that doesn't necessarily communicate 21 | with the user in natural language. 22 | - A ``Component`` is a reusable functional part of a ``Model``. 23 | - ``Rule-based Models`` cannot be trained. 24 | - ``Machine Learning Models`` can only be trained standalone. 25 | - ``Deep Learning Models`` can be trained independently and in an 26 | end-to-end mode, being joined in a chain. 27 | - A ``Chainer`` builds a model pipeline from heterogeneous 28 | components (Rule-based/ML/DL). It allows one to train and infer models in 29 | a pipeline as a whole. 30 | 31 | The smallest building block of the library is a ``Component``. 32 | A ``Component`` stands for any kind of function in an NLP pipeline. It can 33 | be implemented as a neural network, a non-neural ML model, or a 34 | rule-based system. 35 | 36 | ``Component``\ s can be joined into a ``Model``. A ``Model`` 37 | solves a larger NLP task than a ``Component``. However, in terms of 38 | implementation, ``Model``\ s are not different from ``Component``\ s. 39 | 40 | Most DeepPavlov models are built on top of `PyTorch `__. 41 | Other external libraries can be used to build basic components.
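To make the ``Chainer`` concept concrete, here is a minimal inference sketch (it assumes the pre-trained ``ner_ontonotes_bert`` config shipped with the library; any other config name would work the same way):

.. code:: python

    from deeppavlov import build_model

    # build_model reads the config, assembles every pipeline component into a
    # single Chainer, and (with download=True) fetches the pre-trained files
    ner = build_model('ner_ontonotes_bert', download=True)

    # the whole pipeline is then called as one function: raw text in, tags out
    tokens, tags = ner(['DeepPavlov is developed in Moscow'])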
42 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi>=0.47.0,<=0.89.1 2 | filelock>=3.0.0,<3.10.0 3 | nltk>=3.2.4,<3.10.0 4 | numpy<1.24 5 | pandas>=1.0.0,<1.6.0 6 | prometheus-client>=0.13.0,<=1.16.0 7 | pydantic<2 8 | pybind11==2.10.3 9 | requests>=2.19.0,<3.0.0 10 | scikit-learn>=0.24,<1.1.0;python_version<="3.10" 11 | scikit-learn==1.4.0;python_version=="3.11.*" 12 | tqdm>=4.42.0,<4.65.0 13 | uvicorn>=0.13.0,<0.19.0 14 | wheel 15 | scipy<1.10.0;python_version<"3.8" 16 | scipy==1.10.0;python_version>="3.8" 17 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_configs/doc_retrieval/en_ranker_tfidf_wiki_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "odqa_reader", 4 | "data_path": "{DOWNLOADS_PATH}/odqa/enwiki_test", 5 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db", 6 | "dataset_format": "txt" 7 | }, 8 | "dataset_iterator": { 9 | "class_name": "sqlite_iterator", 10 | "shuffle": false, 11 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db" 12 | }, 13 | "chainer": { 14 | "in": [ 15 | "docs" 16 | ], 17 | "in_y": [ 18 | "doc_ids", 19 | "doc_nums" 20 | ], 21 | "out": [ 22 | "tfidf_doc_ids" 23 | ], 24 | "pipe": [ 25 | { 26 | "class_name": "hashing_tfidf_vectorizer", 27 | "id": "vectorizer", 28 | "fit_on": [ 29 | "docs", 30 | "doc_ids", 31 | "doc_nums" 32 | ], 33 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz", 34 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz", 35 | "tokenizer": { 36 | "class_name": "stream_spacy_tokenizer", 37 | "lemmas": true, 38 | "ngram_range": [ 39 | 1, 40 | 2 41 | ] 42 | } 43 | }, 44 | { 45 | "class_name": "tfidf_ranker", 46 | "top_n": 20, 47 | "in": [ 48 | "docs" 49 | ], 50 | "out": [ 51 | "tfidf_doc_ids", 52 | "tfidf_doc_scores" 53 | ], 54 | "vectorizer": "#vectorizer" 55 | } 56 | ] 57 | }, 58 | "train": { 59 | "batch_size": 2, 60 | "evaluation_targets": [], 61 | "class_name": "fit_trainer" 62 | }, 63 | "metadata": { 64 | "variables": { 65 | "ROOT_PATH": "~/.deeppavlov", 66 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 67 | "MODELS_PATH": "{ROOT_PATH}/models" 68 | }, 69 | "download": [ 70 | { 71 | "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz", 72 | "subdir": "{DOWNLOADS_PATH}/odqa" 73 | } 74 | ] 75 | } 76 | } -------------------------------------------------------------------------------- /tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_reader": { 3 | "class_name": "odqa_reader", 4 | "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test", 5 | "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test.db", 6 | "dataset_format": "txt" 7 | }, 8 | "dataset_iterator": { 9 | "class_name": "sqlite_iterator", 10 | "shuffle": false, 11 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test.db" 12 | }, 13 | "chainer": { 14 | "in": [ 15 | "docs" 16 | ], 17 | "in_y": [ 18 | "doc_ids", 19 | "doc_nums" 20 | ], 21 | "out": [ 22 | "tfidf_doc_ids" 23 | ], 24 | 
"pipe": [ 25 | { 26 | "class_name": "hashing_tfidf_vectorizer", 27 | "id": "vectorizer", 28 | "fit_on": [ 29 | "docs", 30 | "doc_ids", 31 | "doc_nums" 32 | ], 33 | "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz", 34 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz", 35 | "tokenizer": { 36 | "class_name": "stream_spacy_tokenizer", 37 | "spacy_model": "ru_core_news_sm", 38 | "lemmas": true, 39 | "lowercase": true, 40 | "filter_stopwords": true, 41 | "ngram_range": [ 42 | 1, 43 | 2 44 | ] 45 | } 46 | }, 47 | { 48 | "class_name": "tfidf_ranker", 49 | "top_n": 20, 50 | "in": [ 51 | "docs" 52 | ], 53 | "out": [ 54 | "tfidf_doc_ids", 55 | "tfidf_doc_scores" 56 | ], 57 | "vectorizer": "#vectorizer" 58 | } 59 | ] 60 | }, 61 | "train": { 62 | "batch_size": 2, 63 | "evaluation_targets": [], 64 | "class_name": "fit_trainer" 65 | }, 66 | "metadata": { 67 | "variables": { 68 | "ROOT_PATH": "~/.deeppavlov", 69 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", 70 | "MODELS_PATH": "{ROOT_PATH}/models" 71 | }, 72 | "download": [ 73 | { 74 | "url": "http://files.deeppavlov.ai/datasets/wikipedia/ruwiki_test.tar.gz", 75 | "subdir": "{DOWNLOADS_PATH}/odqa" 76 | } 77 | ] 78 | } 79 | } -------------------------------------------------------------------------------- /utils/Docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_IMAGE 2 | 3 | FROM $BASE_IMAGE 4 | 5 | SHELL ["/bin/bash", "-c"] 6 | 7 | ENV DP_PYTEST_API_PORT=5000 8 | ENV DP_PYTEST_NO_CACHE=True 9 | ENV LANG='en_US.UTF-8' 10 | 11 | ARG DEBIAN_FRONTEND=noninteractive 12 | ARG PYTHON_VERSION 13 | 14 | RUN rm -f /etc/apt/sources.list.d/cuda*.list && \ 15 | apt update && \ 16 | apt install -y --no-install-recommends \ 17 | build-essential \ 18 | dpkg-dev \ 19 | gcc \ 20 | git \ 21 | libbz2-dev \ 22 | libc6-dev \ 23 | libexpat1-dev \ 24 | libffi-dev \ 25 | libgdbm-dev \ 26 | liblzma-dev \ 27 | libncursesw5-dev \ 28 | libreadline-dev \ 29 | libsqlite3-dev \ 30 | libssl-dev \ 31 | libxslt-dev \ 32 | locales \ 33 | make \ 34 | pandoc \ 35 | tk-dev \ 36 | wget \ 37 | xz-utils \ 38 | zlib1g-dev && \ 39 | locale-gen en_US.UTF-8 && \ 40 | wget --no-check-certificate -O python.tar.xz https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tar.xz && \ 41 | mkdir -p /usr/src/python && \ 42 | tar -xC /usr/src/python --strip-components=1 -f python.tar.xz && \ 43 | rm python.tar.xz && \ 44 | cd /usr/src/python && \ 45 | ./configure && \ 46 | make -j "$(nproc)" altinstall && \ 47 | ln -s /usr/local/bin/python${PYTHON_VERSION%.*} /usr/local/bin/python && \ 48 | ln -s /usr/local/bin/pip${PYTHON_VERSION%.*} /usr/local/bin/pip && \ 49 | pip install --upgrade pip && \ 50 | pip install pybind11==2.2.4 && \ 51 | rm -rf /usr/src/python /var/lib/apt/lists/* 52 | 53 | WORKDIR /app 54 | 55 | # two commands to prevent caching of the next layers 56 | ARG EPOCH 57 | ENV EPOCH=$EPOCH 58 | 59 | COPY . . 
60 | 61 | CMD utils/Docker/cmd.sh 62 | -------------------------------------------------------------------------------- /utils/Docker/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/utils/Docker/README.md -------------------------------------------------------------------------------- /utils/Docker/cmd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | pip install .[tests,docs] 6 | 7 | rm -rf `find . -mindepth 1 -maxdepth 1 ! -name tests ! -name Jenkinsfile ! -name docs` 8 | 9 | cd docs 10 | make clean 11 | make html 12 | cd .. 13 | 14 | flake8 `python -c 'import deeppavlov; print(deeppavlov.__path__[0])'` --count --select=E9,F63,F7,F82 --show-source --statistics 15 | 16 | pytest -v --disable-warnings --instafail $PYTEST_ARGS 17 | -------------------------------------------------------------------------------- /utils/Docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | py36: 4 | build: 5 | context: ../../ 6 | dockerfile: utils/Docker/Dockerfile 7 | args: 8 | - EPOCH=$EPOCH 9 | - PYTHON_VERSION=3.6.15 10 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04 11 | user: '${UID}:${GID}' 12 | environment: 13 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_0 14 | - PYTEST_ARGS=$PYTEST_ARGS 15 | - DP_PYTEST_NO_CACHE=True 16 | py37: 17 | build: 18 | context: ../../ 19 | dockerfile: utils/Docker/Dockerfile 20 | args: 21 | - EPOCH=$EPOCH 22 | - PYTHON_VERSION=3.7.16 23 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04 24 | user: '${UID}:${GID}' 25 | environment: 26 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_1 27 | - PYTEST_ARGS=$PYTEST_ARGS 28 | - DP_PYTEST_NO_CACHE=True 29 | py38: 30 | build: 31 | context: ../../ 32 | dockerfile: utils/Docker/Dockerfile 33 | args: 34 | - EPOCH=$EPOCH 35 | - PYTHON_VERSION=3.8.16 36 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04 37 | user: '${UID}:${GID}' 38 | environment: 39 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_0 40 | - PYTEST_ARGS=$PYTEST_ARGS 41 | - DP_PYTEST_NO_CACHE=True 42 | py39: 43 | build: 44 | context: ../../ 45 | dockerfile: utils/Docker/Dockerfile 46 | args: 47 | - EPOCH=$EPOCH 48 | - PYTHON_VERSION=3.9.16 49 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04 50 | user: '${UID}:${GID}' 51 | environment: 52 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_1 53 | - PYTEST_ARGS=$PYTEST_ARGS 54 | - DP_PYTEST_NO_CACHE=True 55 | py310: 56 | build: 57 | context: ../../ 58 | dockerfile: utils/Docker/Dockerfile 59 | args: 60 | - EPOCH=$EPOCH 61 | - PYTHON_VERSION=3.10.9 62 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04 63 | user: '${UID}:${GID}' 64 | environment: 65 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_0 66 | - PYTEST_ARGS=$PYTEST_ARGS 67 | - DP_PYTEST_NO_CACHE=True 68 | py311: 69 | build: 70 | context: ../../ 71 | dockerfile: utils/Docker/Dockerfile 72 | args: 73 | - EPOCH=$EPOCH 74 | - PYTHON_VERSION=3.11.6 75 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04 76 | user: '${UID}:${GID}' 77 | environment: 78 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_1 79 | - PYTEST_ARGS=$PYTEST_ARGS 80 | - DP_PYTEST_NO_CACHE=True 81 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/utils/__init__.py -------------------------------------------------------------------------------- /utils/prepare/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/utils/prepare/__init__.py -------------------------------------------------------------------------------- /utils/prepare/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import pkgutil 17 | from importlib import import_module, reload 18 | 19 | import deeppavlov 20 | from deeppavlov.core.common.metrics_registry import _registry_path as m_registry_path, _REGISTRY as M_REGISTRY 21 | from deeppavlov.core.common.registry import _registry_path as c_registry_path, _REGISTRY as C_REGISTRY 22 | 23 | if __name__ == '__main__': 24 | C_REGISTRY.clear() 25 | M_REGISTRY.clear() 26 | 27 | for _, pkg_name, _ in pkgutil.walk_packages(deeppavlov.__path__, deeppavlov.__name__ + '.'): 28 | if pkg_name not in ('deeppavlov.core.common.registry', 'deeppavlov.core.common.metrics_registry'): 29 | reload(import_module(pkg_name)) 30 | 31 | with c_registry_path.open('w', encoding='utf-8') as f: 32 | json.dump(dict(sorted(C_REGISTRY.items())), f, indent=2) 33 | 34 | with m_registry_path.open('w', encoding='utf-8') as f: 35 | json.dump(dict(sorted(M_REGISTRY.items())), f, indent=2) 36 | --------------------------------------------------------------------------------
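The registry rebuild script above walks every module under the ``deeppavlov`` package, so any properly decorated class is picked up automatically. A toy example of such a decorated component (illustrative only, not part of the library):

    from deeppavlov.core.common.registry import register
    from deeppavlov.core.models.component import Component


    @register('my_lowercaser')
    class MyLowercaser(Component):
        """A toy component that lowercases a batch of utterances."""

        def __init__(self, **kwargs) -> None:
            pass

        def __call__(self, batch):
            return [utterance.lower() for utterance in batch]

After rebuilding the registry (or listing the defining module in a config's ``"metadata": {"imports": [...]}`` section), the class can be referenced in configs as ``{"class_name": "my_lowercaser"}``.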