├── .github
└── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── config.yml
│ └── feature-request.md
├── .gitignore
├── .readthedocs.yml
├── CNAME
├── Jenkinsfile
├── LICENSE
├── MANIFEST.in
├── NLP_OSS_DeepPavlov_ACL_Demo_final.pdf
├── README.md
├── _config.yml
├── _layouts
└── default.html
├── deeppavlov
├── __init__.py
├── __main__.py
├── _meta.py
├── configs
│ ├── __init__.py
│ ├── classifiers
│ │ ├── boolqa_rubert.json
│ │ ├── few_shot_roberta.json
│ │ ├── glue
│ │ │ ├── glue_cola_roberta.json
│ │ │ ├── glue_mnli_cased_bert_torch.json
│ │ │ ├── glue_mnli_mm_cased_bert_torch.json
│ │ │ ├── glue_mnli_roberta.json
│ │ │ ├── glue_mrpc_roberta.json
│ │ │ ├── glue_qnli_roberta.json
│ │ │ ├── glue_qqp_roberta.json
│ │ │ ├── glue_rte_cased_bert_torch.json
│ │ │ ├── glue_rte_roberta_mnli.json
│ │ │ ├── glue_sst2_roberta.json
│ │ │ ├── glue_stsb_roberta.json
│ │ │ └── glue_wnli_roberta.json
│ │ ├── insults_kaggle_bert.json
│ │ ├── paraphraser_convers_distilrubert_2L.json
│ │ ├── paraphraser_convers_distilrubert_6L.json
│ │ ├── paraphraser_rubert.json
│ │ ├── query_pr.json
│ │ ├── rusentiment_bert.json
│ │ ├── rusentiment_convers_bert.json
│ │ ├── rusentiment_convers_distilrubert_2L.json
│ │ ├── rusentiment_convers_distilrubert_6L.json
│ │ ├── sentiment_sst_conv_bert.json
│ │ ├── sentiment_twitter.json
│ │ ├── superglue
│ │ │ ├── superglue_boolq_roberta_mnli.json
│ │ │ ├── superglue_copa_roberta.json
│ │ │ ├── superglue_record_roberta.json
│ │ │ └── superglue_wic_bert.json
│ │ └── topics_distilbert_base_uncased.json
│ ├── doc_retrieval
│ │ ├── en_ranker_pop_wiki.json
│ │ ├── en_ranker_tfidf_wiki.json
│ │ └── ru_ranker_tfidf_wiki.json
│ ├── embedder
│ │ ├── bert_embedder.json
│ │ └── bert_sentence_embedder.json
│ ├── entity_extraction
│ │ ├── entity_detection_en.json
│ │ ├── entity_detection_ru.json
│ │ ├── entity_extraction_en.json
│ │ ├── entity_extraction_ru.json
│ │ ├── entity_linking_en.json
│ │ └── entity_linking_ru.json
│ ├── faq
│ │ └── fasttext_logreg.json
│ ├── kbqa
│ │ ├── kbqa_cq_en.json
│ │ ├── kbqa_cq_ru.json
│ │ └── wiki_parser.json
│ ├── morpho_syntax_parser
│ │ ├── morpho_ru_syntagrus_bert.json
│ │ ├── ru_syntagrus_joint_parsing.json
│ │ └── syntax_ru_syntagrus_bert.json
│ ├── multitask
│ │ ├── mt_glue.json
│ │ └── multitask_example.json
│ ├── ner
│ │ ├── ner_bert_base.json
│ │ ├── ner_case_agnostic_mdistilbert.json
│ │ ├── ner_collection3_bert.json
│ │ ├── ner_conll2003_bert.json
│ │ ├── ner_conll2003_deberta_crf.json
│ │ ├── ner_ontonotes_bert.json
│ │ ├── ner_ontonotes_bert_mult.json
│ │ ├── ner_ontonotes_deberta_crf.json
│ │ ├── ner_rus_bert.json
│ │ ├── ner_rus_bert_probas.json
│ │ ├── ner_rus_convers_distilrubert_2L.json
│ │ └── ner_rus_convers_distilrubert_6L.json
│ ├── odqa
│ │ ├── en_odqa_infer_wiki.json
│ │ ├── en_odqa_pop_infer_wiki.json
│ │ └── ru_odqa_infer_wiki.json
│ ├── ranking
│ │ ├── path_ranking_nll_roberta_en.json
│ │ ├── ranking_ubuntu_v2_torch_bert_uncased.json
│ │ ├── rel_ranking_nll_bert_ru.json
│ │ └── rel_ranking_roberta_en.json
│ ├── regressors
│ │ └── translation_ranker.json
│ ├── relation_extraction
│ │ ├── re_docred.json
│ │ └── re_rured.json
│ ├── russian_super_glue
│ │ ├── russian_superglue_danetqa_rubert.json
│ │ ├── russian_superglue_lidirus_rubert.json
│ │ ├── russian_superglue_muserc_rubert.json
│ │ ├── russian_superglue_parus_rubert.json
│ │ ├── russian_superglue_rcb_rubert.json
│ │ ├── russian_superglue_rucos_rubert.json
│ │ ├── russian_superglue_russe_rubert.json
│ │ ├── russian_superglue_rwsd_rubert.json
│ │ └── russian_superglue_terra_rubert.json
│ ├── sentence_segmentation
│ │ └── sentseg_dailydialog_bert.json
│ ├── spelling_correction
│ │ ├── brillmoore_wikitypos_en.json
│ │ └── levenshtein_corrector_ru.json
│ └── squad
│ │ ├── qa_multisberquad_bert.json
│ │ ├── qa_nq_psgcls_bert.json
│ │ ├── qa_squad2_bert.json
│ │ ├── squad_bert.json
│ │ ├── squad_ru_bert.json
│ │ ├── squad_ru_convers_distilrubert_2L.json
│ │ └── squad_ru_convers_distilrubert_6L.json
├── core
│ ├── __init__.py
│ ├── commands
│ │ ├── __init__.py
│ │ ├── infer.py
│ │ ├── train.py
│ │ └── utils.py
│ ├── common
│ │ ├── __init__.py
│ │ ├── aliases.py
│ │ ├── base.py
│ │ ├── chainer.py
│ │ ├── cross_validation.py
│ │ ├── errors.py
│ │ ├── file.py
│ │ ├── log.py
│ │ ├── log_events.py
│ │ ├── metrics_registry.json
│ │ ├── metrics_registry.py
│ │ ├── params.py
│ │ ├── params_search.py
│ │ ├── paths.py
│ │ ├── prints.py
│ │ ├── registry.json
│ │ ├── registry.py
│ │ └── requirements_registry.json
│ ├── data
│ │ ├── __init__.py
│ │ ├── data_fitting_iterator.py
│ │ ├── data_learning_iterator.py
│ │ ├── dataset_reader.py
│ │ ├── simple_vocab.py
│ │ └── utils.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── component.py
│ │ ├── estimator.py
│ │ ├── nn_model.py
│ │ ├── serializable.py
│ │ └── torch_model.py
│ └── trainers
│ │ ├── __init__.py
│ │ ├── fit_trainer.py
│ │ ├── nn_trainer.py
│ │ ├── torch_trainer.py
│ │ └── utils.py
├── dataset_iterators
│ ├── __init__.py
│ ├── basic_classification_iterator.py
│ ├── huggingface_dataset_iterator.py
│ ├── morphotagger_iterator.py
│ ├── multitask_iterator.py
│ ├── siamese_iterator.py
│ ├── sqlite_iterator.py
│ ├── squad_iterator.py
│ └── typos_iterator.py
├── dataset_readers
│ ├── __init__.py
│ ├── basic_classification_reader.py
│ ├── boolqa_reader.py
│ ├── conll2003_reader.py
│ ├── docred_reader.py
│ ├── faq_reader.py
│ ├── huggingface_dataset_reader.py
│ ├── imdb_reader.py
│ ├── line_reader.py
│ ├── morphotagging_dataset_reader.py
│ ├── multitask_reader.py
│ ├── odqa_reader.py
│ ├── paraphraser_reader.py
│ ├── rel_ranking_reader.py
│ ├── rured_reader.py
│ ├── sq_reader.py
│ ├── squad_dataset_reader.py
│ ├── typos_reader.py
│ └── ubuntu_v2_reader.py
├── deep.py
├── download.py
├── metrics
│ ├── __init__.py
│ ├── accuracy.py
│ ├── bleu.py
│ ├── correlation.py
│ ├── elmo_metrics.py
│ ├── fmeasure.py
│ ├── google_bleu.py
│ ├── log_loss.py
│ ├── mse.py
│ ├── recall_at_k.py
│ ├── record_metrics.py
│ ├── roc_auc_score.py
│ └── squad_metrics.py
├── models
│ ├── __init__.py
│ ├── api_requester
│ │ ├── __init__.py
│ │ ├── api_requester.py
│ │ └── api_router.py
│ ├── classifiers
│ │ ├── __init__.py
│ │ ├── cos_sim_classifier.py
│ │ ├── dnnc_proba2labels.py
│ │ ├── proba2labels.py
│ │ ├── re_bert.py
│ │ ├── torch_classification_model.py
│ │ ├── torch_nets.py
│ │ └── utils.py
│ ├── doc_retrieval
│ │ ├── __init__.py
│ │ ├── bpr.py
│ │ ├── logit_ranker.py
│ │ ├── pop_ranker.py
│ │ ├── tfidf_ranker.py
│ │ └── utils.py
│ ├── embedders
│ │ ├── __init__.py
│ │ ├── abstract_embedder.py
│ │ ├── fasttext_embedder.py
│ │ ├── tfidf_weighted_embedder.py
│ │ └── transformers_embedder.py
│ ├── entity_extraction
│ │ ├── __init__.py
│ │ ├── entity_detection_parser.py
│ │ ├── entity_linking.py
│ │ ├── find_word.py
│ │ └── ner_chunker.py
│ ├── kbqa
│ │ ├── __init__.py
│ │ ├── query_generator.py
│ │ ├── query_generator_base.py
│ │ ├── rel_ranking_infer.py
│ │ ├── ru_adj_to_noun.py
│ │ ├── sentence_answer.py
│ │ ├── template_matcher.py
│ │ ├── tree_to_sparql.py
│ │ ├── type_define.py
│ │ ├── utils.py
│ │ └── wiki_parser.py
│ ├── morpho_syntax_parser
│ │ ├── __init__.py
│ │ ├── dependency_decoding.py
│ │ ├── joint.py
│ │ ├── spacy_lemmatizer.py
│ │ └── syntax_parsing.py
│ ├── preprocessors
│ │ ├── __init__.py
│ │ ├── dirty_comments_preprocessor.py
│ │ ├── dnnc_preprocessor.py
│ │ ├── mask.py
│ │ ├── multitask_preprocessor.py
│ │ ├── ner_preprocessor.py
│ │ ├── odqa_preprocessors.py
│ │ ├── one_hotter.py
│ │ ├── re_preprocessor.py
│ │ ├── response_base_loader.py
│ │ ├── sanitizer.py
│ │ ├── sentseg_preprocessor.py
│ │ ├── squad_preprocessor.py
│ │ ├── str_lower.py
│ │ ├── str_token_reverser.py
│ │ ├── str_utf8_encoder.py
│ │ ├── torch_transformers_preprocessor.py
│ │ └── transformers_preprocessor.py
│ ├── ranking
│ │ ├── __init__.py
│ │ └── metrics.py
│ ├── relation_extraction
│ │ ├── __init__.py
│ │ ├── losses.py
│ │ └── relation_extraction_bert.py
│ ├── sklearn
│ │ ├── __init__.py
│ │ └── sklearn_component.py
│ ├── spelling_correction
│ │ ├── __init__.py
│ │ ├── brillmoore
│ │ │ ├── __init__.py
│ │ │ └── error_model.py
│ │ ├── electors
│ │ │ ├── __init__.py
│ │ │ ├── kenlm_elector.py
│ │ │ └── top1_elector.py
│ │ └── levenshtein
│ │ │ ├── __init__.py
│ │ │ ├── levenshtein_searcher.py
│ │ │ ├── searcher_component.py
│ │ │ └── tabled_trie.py
│ ├── tokenizers
│ │ ├── __init__.py
│ │ ├── lazy_tokenizer.py
│ │ ├── nltk_moses_tokenizer.py
│ │ ├── nltk_tokenizer.py
│ │ ├── spacy_tokenizer.py
│ │ ├── split_tokenizer.py
│ │ └── utils.py
│ ├── torch_bert
│ │ ├── __init__.py
│ │ ├── crf.py
│ │ ├── multitask_transformer.py
│ │ ├── torch_bert_ranker.py
│ │ ├── torch_transformers_classifier.py
│ │ ├── torch_transformers_el_ranker.py
│ │ ├── torch_transformers_multiplechoice.py
│ │ ├── torch_transformers_nll_ranking.py
│ │ ├── torch_transformers_sequence_tagger.py
│ │ ├── torch_transformers_squad.py
│ │ └── torch_transformers_syntax_parser.py
│ └── vectorizers
│ │ ├── __init__.py
│ │ └── hashing_tfidf_vectorizer.py
├── paramsearch.py
├── requirements
│ ├── datasets.txt
│ ├── dependency_decoding.txt
│ ├── en_core_web_sm.txt
│ ├── faiss.txt
│ ├── fasttext.txt
│ ├── hdt.txt
│ ├── kenlm.txt
│ ├── lxml.txt
│ ├── opt_einsum.txt
│ ├── protobuf.txt
│ ├── pytorch.txt
│ ├── rapidfuzz.txt
│ ├── razdel.txt
│ ├── ru_core_news_sm.txt
│ ├── sacremoses.txt
│ ├── sentencepiece.txt
│ ├── slovnet.txt
│ ├── sortedcontainers.txt
│ ├── torchcrf.txt
│ ├── transformers.txt
│ ├── udapi.txt
│ └── whapi.txt
├── settings.py
├── utils
│ ├── __init__.py
│ ├── benchmarks
│ │ ├── __init__.py
│ │ └── benchmarks.py
│ ├── connector
│ │ ├── __init__.py
│ │ └── dialog_logger.py
│ ├── pip_wrapper
│ │ ├── __init__.py
│ │ └── pip_wrapper.py
│ ├── server
│ │ ├── __init__.py
│ │ ├── metrics.py
│ │ └── server.py
│ ├── settings
│ │ ├── __init__.py
│ │ ├── dialog_logger_config.json
│ │ ├── log_config.json
│ │ └── server_config.json
│ └── socket
│ │ ├── __init__.py
│ │ └── socket.py
└── vocabs
│ ├── __init__.py
│ ├── typos.py
│ └── wiki_sqlite.py
├── docs
├── Makefile
├── _static
│ ├── aws_ec2
│ │ ├── 01_login_to_aws.png
│ │ ├── 02_choose_ubuntu.png
│ │ ├── 03_select_instance_type.png
│ │ ├── 04_add_storage.png
│ │ ├── 05_review_instance.png
│ │ ├── 06_go_to_running_instances.png
│ │ ├── 07_wait_init.png
│ │ ├── 08_01_set_sec_group.png
│ │ ├── 08_02_set_inbound.png
│ │ ├── 09_01_select_connect.png
│ │ └── 09_02_connection_info.png
│ ├── deeppavlov.css
│ ├── deeppavlov.png
│ ├── deeppavlov_logo.png
│ ├── dp_agnt_diag.png
│ ├── gobot_diagram.png
│ ├── ipavlov_footer.png
│ ├── kvret_diagram.png
│ ├── my_blocks.css
│ ├── social
│ │ ├── Medium_Monogram.svg
│ │ ├── Twitter_Social_Icon_Circle_Color.svg
│ │ ├── telegram.png
│ │ └── youtube_social_circle_red.png
│ └── tree.png
├── _templates
│ └── footer.html
├── apiref
│ ├── core.rst
│ ├── core
│ │ ├── commands.rst
│ │ ├── common.rst
│ │ ├── data.rst
│ │ ├── models.rst
│ │ └── trainers.rst
│ ├── dataset_iterators.rst
│ ├── dataset_readers.rst
│ ├── metrics.rst
│ ├── models.rst
│ ├── models
│ │ ├── api_requester.rst
│ │ ├── classifiers.rst
│ │ ├── doc_retrieval.rst
│ │ ├── embedders.rst
│ │ ├── entity_extraction.rst
│ │ ├── kbqa.rst
│ │ ├── preprocessors.rst
│ │ ├── relation_extraction.rst
│ │ ├── sklearn.rst
│ │ ├── spelling_correction.rst
│ │ ├── tokenizers.rst
│ │ ├── torch_bert.rst
│ │ └── vectorizers.rst
│ └── vocabs.rst
├── conf.py
├── devguides
│ ├── contribution_guide.rst
│ └── registry.rst
├── features
│ ├── hypersearch.rst
│ ├── models
│ │ ├── KBQA.ipynb
│ │ ├── NER.ipynb
│ │ ├── ODQA.ipynb
│ │ ├── SQuAD.ipynb
│ │ ├── bert.rst
│ │ ├── classification.ipynb
│ │ ├── entity_extraction.ipynb
│ │ ├── few_shot_classification.ipynb
│ │ ├── morpho_tagger.ipynb
│ │ ├── multitask_bert.rst
│ │ ├── neural_ranking.ipynb
│ │ ├── popularity_ranking.rst
│ │ ├── relation_extraction.ipynb
│ │ ├── spelling_correction.ipynb
│ │ ├── superglue.rst
│ │ ├── syntax_parser.ipynb
│ │ └── tfidf_ranking.ipynb
│ ├── overview.rst
│ └── pretrained_vectors.rst
├── index.rst
├── integrations
│ ├── aws_ec2.rst
│ ├── rest_api.rst
│ ├── settings.rst
│ └── socket_api.rst
├── internships
│ └── internships.rst
└── intro
│ ├── configuration.rst
│ ├── installation.rst
│ ├── overview.rst
│ ├── python.ipynb
│ └── quick_start.rst
├── requirements.txt
├── setup.py
├── tests
├── __init__.py
├── test_configs
│ └── doc_retrieval
│ │ ├── en_ranker_pop_wiki_test.json
│ │ ├── en_ranker_tfidf_wiki_test.json
│ │ └── ru_ranker_tfidf_wiki_test.json
└── test_quick_start.py
└── utils
├── Docker
├── Dockerfile
├── README.md
├── cmd.sh
└── docker-compose.yml
├── __init__.py
└── prepare
├── __init__.py
├── hashes.py
├── optimize_ipynb.py
├── registry.py
└── upload.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Report a bug you encountered
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first.
11 |
12 | Please provide all the information below; otherwise, your issue may be closed without warning.
13 |
14 |
15 | **DeepPavlov version** (you can look it up by running `pip show deeppavlov`):
16 |
17 | **Python version**:
18 |
19 | **Operating system** (Ubuntu Linux, Windows, ...):
20 |
21 | **Issue**:
22 |
23 |
24 | **Content or a name of a configuration file**:
25 | ```
26 |
27 | ```
28 |
29 |
30 | **Command that led to the error**:
31 | ```
32 |
33 | ```
34 |
35 | **Error (including full traceback)**:
36 | ```
37 |
38 | ```
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Ask a question
4 | url: https://forum.deeppavlov.ai/
5 | about: If you have a different question, please ask it in the forum https://forum.deeppavlov.ai
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest a feature to improve the DeepPavlov library
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | Want to contribute to DeepPavlov? Please read the [contributing guideline](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) first.
11 |
12 |
13 | **What problem are we trying to solve?**:
14 | ```
15 |
16 | ```
17 |
18 | **How can we solve it?**:
19 | ```
20 |
21 | ```
22 |
23 | **Are there other issues that block this solution?**:
24 | ```
25 |
26 | ```
27 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | #IDEA
104 | .idea/
105 |
106 | #Atom IDE
107 | .ftpconfig
108 |
109 | #vscode IDE
110 | .vscode
111 |
112 | # Vim
113 | *.vim
114 | *.vimrc
115 |
116 | #GIT
117 | .git/
118 |
119 | #Default usr dir
120 | download/
121 |
122 | #project test
123 | /test/
124 | .pytest_cache
125 |
126 | # project data
127 | /data/
128 |
129 | # local dockerfiles
130 | /Dockerfile
131 | /entrypoint.sh
132 | /.dockerignore
133 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | version: 2
3 |
4 | build:
5 | os: "ubuntu-20.04"
6 | tools:
7 | python: "3.10"
8 | formats: []
9 |
10 | python:
11 | install:
12 | - method: pip
13 | path: .
14 | extra_requirements:
15 | - docs
16 |
--------------------------------------------------------------------------------
/CNAME:
--------------------------------------------------------------------------------
1 | deeppavlov.ai
--------------------------------------------------------------------------------
/Jenkinsfile:
--------------------------------------------------------------------------------
1 | node('cuda-module') {
2 | timestamps {
3 | try {
4 | stage('Clean') {
5 | sh "rm -rf .[^.] .??* *"
6 | }
7 | stage('Checkout') {
8 | checkout scm
9 | }
10 | stage('Setup') {
11 | env.TFHUB_CACHE_DIR="tfhub_cache"
12 | sh """
13 | EPOCH=\$(date +%s) docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG build
14 | """
15 | }
16 | stage('Tests') {
17 | sh """
18 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py36 py37
19 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1
20 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py38 py39
21 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1
22 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG up py310 py311
23 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG ps | grep Exit | grep -v 'Exit 0' && exit 1 || exit 0
24 | """
25 | currentBuild.result = 'SUCCESS'
26 | }
27 | }
28 | catch(e) {
29 | currentBuild.result = 'FAILURE'
30 | throw e
31 | }
32 | finally {
33 | sh """
34 | docker-compose -f utils/Docker/docker-compose.yml -p $BUILD_TAG rm -f
35 | docker network rm \$(echo $BUILD_TAG | awk '{print tolower(\$0)}')_default
36 | """
37 | emailext to: "\${DEFAULT_RECIPIENTS}",
38 | subject: "${env.JOB_NAME} - Build # ${currentBuild.number} - ${currentBuild.result}!",
39 | body: '${BRANCH_NAME} - ${BUILD_URL}',
40 | attachLog: true
41 | }
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include requirements.txt
4 | include deeppavlov/requirements/*.txt
5 | recursive-include deeppavlov *.json
6 | recursive-include deeppavlov *.md
7 |
--------------------------------------------------------------------------------
/NLP_OSS_DeepPavlov_ACL_Demo_final.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/NLP_OSS_DeepPavlov_ACL_Demo_final.pdf
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-leap-day
2 | google_analytics: UA-139843736-5
3 | include:
4 | - _static
5 |
--------------------------------------------------------------------------------
/deeppavlov/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import sys
16 | from pathlib import Path
17 | from typing import Union
18 |
19 | from ._meta import __author__, __description__, __email__, __keywords__, __license__, __version__
20 | from .configs import configs
21 | from .core.commands.infer import build_model
22 | from .core.commands.train import train_evaluate_model_from_config
23 | from .core.common.base import Element, Model
24 | from .core.common.chainer import Chainer
25 | from .core.common.log import init_logger
26 | from .download import deep_download
27 |
28 |
29 | # TODO: make better
30 | def train_model(config: Union[str, Path, dict], install: bool = False,
31 |                 download: bool = False, recursive: bool = False) -> Chainer:
32 |     train_evaluate_model_from_config(config, install=install, download=download, recursive=recursive)
33 |     return build_model(config, load_trained=True)
34 |
35 |
36 | def evaluate_model(config: Union[str, Path, dict], install: bool = False,
37 |                    download: bool = False, recursive: bool = False) -> dict:
38 |     return train_evaluate_model_from_config(config, to_train=False, install=install,
39 |                                             download=download, recursive=recursive)
40 |
41 |
42 | # check version
43 | assert sys.hexversion >= 0x3060000, 'Does not work in python3.5 or lower'
44 |
45 | # resolve conflicts with previous DeepPavlov installations versioned up to 0.0.9
46 | dot_dp_path = Path('~/.deeppavlov').expanduser().resolve()
47 | if dot_dp_path.is_file():
48 |     dot_dp_path.unlink()
49 |
50 | # initiate logging
51 | init_logger()
52 |
--------------------------------------------------------------------------------
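The exports above (`build_model`, `train_model`, `evaluate_model`, `deep_download`, `configs`) make up the library's Python API. A minimal usage sketch, assuming network access for the pretrained files (`insults_kaggle_bert` is one of the bundled classifier configs listed in the tree):

```python
from deeppavlov import build_model, configs

# Download pretrained files and assemble the pipeline described by the config.
model = build_model(configs.classifiers.insults_kaggle_bert, download=True)

# The resulting Chainer is callable; inputs and outputs follow the config's
# chainer "in"/"out" declarations.
print(model(["You are kind", "You are stupid"]))
```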
/deeppavlov/__main__.py:
--------------------------------------------------------------------------------
1 | if __name__ == '__main__':
2 |     from .deep import main
3 |
4 |     main()
5 |
--------------------------------------------------------------------------------
/deeppavlov/_meta.py:
--------------------------------------------------------------------------------
1 | __version__ = '1.7.0'
2 | __author__ = 'Neural Networks and Deep Learning lab, MIPT'
3 | __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
4 | __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']
5 | __license__ = 'Apache License, Version 2.0'
6 | __email__ = 'info@deeppavlov.ai'
7 |
--------------------------------------------------------------------------------
/deeppavlov/configs/classifiers/boolqa_rubert.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "boolqa_reader",
4 | "data_path": "{DOWNLOADS_PATH}/boolqa_data",
5 | "language": "ru"
6 | },
7 | "dataset_iterator": {
8 | "class_name": "basic_classification_iterator",
9 | "seed": 243
10 | },
11 | "chainer": {
12 | "in": ["text_a", "text_b"],
13 | "in_y": ["y"],
14 | "pipe": [
15 | {
16 | "class_name": "torch_transformers_preprocessor",
17 | "vocab_file": "{TRANSFORMER}",
18 | "do_lower_case": false,
19 | "max_seq_length": 128,
20 | "in": ["text_a", "text_b"],
21 | "out": ["bert_features"]
22 | },
23 | {
24 | "class_name": "torch_transformers_classifier",
25 | "n_classes": 2,
26 | "pretrained_bert": "{TRANSFORMER}",
27 | "save_path": "{MODELS_PATH}/boolqa_rubert/model_rubert",
28 | "load_path": "{MODELS_PATH}/boolqa_rubert/model_rubert",
29 | "optimizer": "AdamW",
30 | "optimizer_parameters": {"lr": 2e-05},
31 | "learning_rate_drop_patience": 3,
32 | "learning_rate_drop_div": 2.0,
33 | "in": ["bert_features"],
34 | "in_y": ["y"],
35 | "out": ["predictions"]
36 | }
37 | ],
38 | "out": ["predictions"]
39 | },
40 | "train": {
41 | "epochs": 50,
42 | "batch_size": 32,
43 | "train_metrics": ["f1", "acc"],
44 | "metrics": ["f1", "acc"],
45 | "validation_patience": 5,
46 | "val_every_n_epochs": 1,
47 | "log_every_n_epochs": 1,
48 | "evaluation_targets": ["valid", "train"],
49 | "show_examples": false,
50 | "class_name": "torch_trainer"
51 | },
52 | "metadata": {
53 | "variables": {
54 | "ROOT_PATH": "~/.deeppavlov",
55 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
56 | "MODELS_PATH": "{ROOT_PATH}/models",
57 | "TRANSFORMER": "DeepPavlov/rubert-base-cased"
58 | }
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/deeppavlov/configs/classifiers/few_shot_roberta.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["texts", "dataset"],
4 | "in_y": ["y_true"],
5 | "pipe": [
6 | {
7 | "class_name": "dnnc_pair_generator",
8 | "in": ["texts", "dataset"],
9 | "out": ["x", "x_support", "x_populated", "y_support"],
10 | "bidirectional": true
11 | },
12 | {
13 | "class_name": "torch_transformers_preprocessor",
14 | "in": ["x_populated", "x_support"],
15 | "out": ["bert_features"],
16 | "vocab_file": "{BASE_MODEL}",
17 | "do_lower_case": true,
18 | "max_seq_length": 128
19 | },
20 | {
21 | "class_name": "torch_transformers_classifier",
22 | "main": true,
23 | "in": ["bert_features"],
24 | "out": ["simmilarity_scores"],
25 | "n_classes": 2,
26 | "return_probas": true,
27 | "pretrained_bert": "{BASE_MODEL}",
28 | "save_path": "{MODEL_PATH}/model",
29 | "load_path": "{MODEL_PATH}/model",
30 | "is_binary": "{BINARY_CLASSIFICATION}"
31 | },
32 | {
33 | "class_name": "dnnc_proba2labels",
34 | "is_binary": "{BINARY_CLASSIFICATION}",
35 | "in": ["simmilarity_scores", "x", "x_populated", "x_support", "y_support"],
36 | "out": ["y_pred"],
37 | "confidence_threshold": 0.0
38 | }
39 | ],
40 | "out": ["y_pred"]
41 | },
42 | "metadata": {
43 | "variables": {
44 | "ROOT_PATH": "~/.deeppavlov",
45 | "MODEL_PATH": "{ROOT_PATH}/models/fewshot/roberta_nli_mrpc_1_10",
46 | "BINARY_CLASSIFICATION": true,
47 | "BASE_MODEL": "roberta-base"
48 | },
49 | "download": [
50 | {
51 | "url": "http://files.deeppavlov.ai/v1/classifiers/fewshot/roberta_nli_mrpc_1_10.tar.gz",
52 | "subdir": "{MODEL_PATH}"
53 | }
54 | ]
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
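As the chainer's `in` declares, this few-shot pipeline takes both a batch of utterances and a support dataset of labeled examples, which `dnnc_pair_generator` expands into text pairs for the binary similarity classifier. A hedged sketch of calling it; the `[text, label]` support format and the intent names are illustrative assumptions:

```python
from deeppavlov import build_model, configs

model = build_model(configs.classifiers.few_shot_roberta, download=True)

# A small support set of [text, label] pairs (format assumed for illustration).
dataset = [["turn on the lights", "smart_home"],
           ["switch off the lamp", "smart_home"],
           ["will it rain today", "weather"]]

# One support set is passed alongside the batch of texts to classify.
print(model(["please make it brighter in here"], [dataset]))
```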
/deeppavlov/configs/classifiers/glue/glue_stsb_roberta.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "huggingface_dataset_reader",
4 | "path": "{COMPETITION}",
5 | "name": "{TASK}",
6 | "train": "train",
7 | "valid": "validation",
8 | "test": "test"
9 | },
10 | "dataset_iterator": {
11 | "class_name": "huggingface_dataset_iterator",
12 | "features": ["sentence1", "sentence2"],
13 | "label": "label",
14 | "use_label_name": false,
15 | "seed": 42
16 | },
17 | "chainer": {
18 | "in": ["sentence1", "sentence2"],
19 | "in_y": ["y"],
20 | "pipe": [
21 | {
22 | "class_name": "torch_transformers_preprocessor",
23 | "vocab_file": "{BASE_MODEL}",
24 | "do_lower_case": false,
25 | "max_seq_length": 64,
26 | "in": ["sentence1", "sentence2"],
27 | "out": ["bert_features"]
28 | },
29 | {
30 | "class_name": "torch_transformers_classifier",
31 | "n_classes": 1,
32 | "return_probas": false,
33 | "pretrained_bert": "{BASE_MODEL}",
34 | "save_path": "{MODEL_PATH}/model",
35 | "load_path": "{MODEL_PATH}/model",
36 | "optimizer": "AdamW",
37 | "optimizer_parameters": {
38 | "lr": 2e-05
39 | },
40 | "learning_rate_drop_patience": 3,
41 | "learning_rate_drop_div": 2.0,
42 | "in": ["bert_features"],
43 | "in_y": ["y"],
44 | "out": ["y_pred"]
45 | }
46 | ],
47 | "out": ["y_pred"]
48 | },
49 | "train": {
50 | "batch_size": 32,
51 | "metrics": [
52 | "pearson_correlation",
53 | "spearman_correlation"
54 | ],
55 | "validation_patience": 10,
56 | "val_every_n_epochs": 1,
57 | "log_every_n_epochs": 1,
58 | "show_examples": false,
59 | "evaluation_targets": ["train", "valid"],
60 | "class_name": "torch_trainer",
61 | "tensorboard_log_dir": "{MODEL_PATH}/",
62 | "pytest_max_batches": 2
63 | },
64 | "metadata": {
65 | "variables": {
66 | "BASE_MODEL": "roberta-large",
67 | "ROOT_PATH": "~/.deeppavlov",
68 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
69 | "MODELS_PATH": "{ROOT_PATH}/models",
70 | "COMPETITION": "glue",
71 | "TASK": "stsb",
72 | "MODEL_PATH": "{MODELS_PATH}/{COMPETITION}/{TASK}/{BASE_MODEL}"
73 | },
74 | "download": [
75 | {
76 | "url": "http://files.deeppavlov.ai/v1/glue/glue_stsb_roberta.tar.gz",
77 | "subdir": "{MODEL_PATH}"
78 | }
79 | ]
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
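Since STS-B is a regression task, the classifier head has `n_classes: 1`, `return_probas` is false, and quality is tracked with correlation metrics rather than accuracy. A sketch of scoring the pretrained checkpoint with `evaluate_model` (path given relative to the repository root):

```python
from deeppavlov import evaluate_model

# Runs evaluation only (to_train=False under the hood) and reports
# pearson_correlation / spearman_correlation on the evaluation_targets.
metrics = evaluate_model("deeppavlov/configs/classifiers/glue/glue_stsb_roberta.json",
                         download=True)
print(metrics)
```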
/deeppavlov/configs/classifiers/paraphraser_rubert.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "paraphraser_reader",
4 | "data_path": "{DOWNLOADS_PATH}/paraphraser_data",
5 | "do_lower_case": false
6 | },
7 | "dataset_iterator": {
8 | "class_name": "siamese_iterator",
9 | "seed": 243,
10 | "len_valid": 500
11 | },
12 | "chainer": {
13 | "in": ["text_a", "text_b"],
14 | "in_y": ["y"],
15 | "pipe": [
16 | {
17 | "class_name": "torch_transformers_preprocessor",
18 | "vocab_file": "{TRANSFORMER}",
19 | "do_lower_case": false,
20 | "max_seq_length": 64,
21 | "in": ["text_a", "text_b"],
22 | "out": ["bert_features"]
23 | },
24 | {
25 | "class_name": "torch_transformers_classifier",
26 | "n_classes": 2,
27 | "pretrained_bert": "{TRANSFORMER}",
28 | "save_path": "{MODEL_PATH}/model",
29 | "load_path": "{MODEL_PATH}/model",
30 | "optimizer": "AdamW",
31 | "optimizer_parameters": {"lr": 2e-05},
32 | "learning_rate_drop_patience": 3,
33 | "learning_rate_drop_div": 2.0,
34 | "in": ["bert_features"],
35 | "in_y": ["y"],
36 | "out": ["predictions"]
37 | }
38 | ],
39 | "out": ["predictions"]
40 | },
41 | "train": {
42 | "batch_size": 64,
43 | "pytest_max_batches": 2,
44 | "train_metrics": ["f1", "acc"],
45 | "metrics": ["f1", "acc"],
46 | "validation_patience": 7,
47 | "val_every_n_batches": 50,
48 | "log_every_n_batches": 50,
49 | "evaluation_targets": ["valid", "test"],
50 | "class_name": "torch_trainer"
51 | },
52 | "metadata": {
53 | "variables": {
54 | "ROOT_PATH": "~/.deeppavlov",
55 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
56 | "MODELS_PATH": "{ROOT_PATH}/models",
57 | "MODEL_PATH": "{MODELS_PATH}/classifiers/paraphraser_rubert_torch",
58 | "TRANSFORMER": "DeepPavlov/rubert-base-cased"
59 | },
60 | "download": [
61 | {
62 | "url": "http://files.deeppavlov.ai/datasets/paraphraser.zip",
63 | "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
64 | },
65 | {
66 | "url": "http://files.deeppavlov.ai/datasets/paraphraser_gold.zip",
67 | "subdir": "{DOWNLOADS_PATH}/paraphraser_data"
68 | },
69 | {
70 | "url": "http://files.deeppavlov.ai/v1/classifiers/paraphraser_rubert/paraphraser_rubert_v1.tar.gz",
71 | "subdir": "{MODEL_PATH}"
72 | }
73 | ]
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
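The chainer consumes two aligned batches, `text_a` and `text_b`, so inference takes two lists of equal length and yields one prediction per pair. A minimal sketch (the example pair is made up):

```python
from deeppavlov import build_model, configs

model = build_model(configs.classifiers.paraphraser_rubert, download=True)

# One paraphrase/non-paraphrase prediction per aligned sentence pair.
print(model(["Москва - столица России."],
            ["Москва является столицей России."]))
```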
/deeppavlov/configs/doc_retrieval/en_ranker_pop_wiki.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "odqa_reader",
4 | "data_path": "{DOWNLOADS_PATH}/odqa/enwiki",
5 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db",
6 | "dataset_format": "wiki"
7 | },
8 | "dataset_iterator": {
9 | "class_name": "sqlite_iterator",
10 | "shuffle": false,
11 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
12 | },
13 | "chainer": {
14 | "in": ["docs"],
15 | "in_y": ["doc_ids", "doc_nums"],
16 | "out": ["pop_doc_ids"],
17 | "pipe": [
18 | {
19 | "class_name": "hashing_tfidf_vectorizer",
20 | "id": "vectorizer",
21 | "fit_on": ["docs", "doc_ids", "doc_nums"],
22 | "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
23 | "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
24 | "tokenizer": {
25 | "class_name": "stream_spacy_tokenizer",
26 | "lemmas": true,
27 | "lowercase": true,
28 | "filter_stopwords": true,
29 | "ngram_range": [1, 3]
30 | }
31 | },
32 | {
33 | "class_name": "tfidf_ranker",
34 | "top_n": 100,
35 | "in": ["docs"],
36 | "out": ["tfidf_doc_ids", "tfidf_doc_scores"],
37 | "vectorizer": "#vectorizer"
38 | },
39 | {
40 | "class_name": "pop_ranker",
41 | "pop_dict_path": "{DOWNLOADS_PATH}/odqa/enwiki_popularities.json",
42 | "load_path": "{MODELS_PATH}/odqa/logreg_3features_v2.joblib",
43 | "top_n": 100,
44 | "in": ["tfidf_doc_ids", "tfidf_doc_scores"],
45 | "out": ["pop_doc_ids", "pop_doc_scores"]
46 | }
47 | ]
48 | },
49 | "train": {
50 | "batch_size": 10000,
51 | "evaluation_targets": [],
52 | "class_name": "fit_trainer"
53 | },
54 | "metadata": {
55 | "variables": {
56 | "ROOT_PATH": "~/.deeppavlov",
57 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
58 | "MODELS_PATH": "{ROOT_PATH}/models"
59 | },
60 | "download": [
61 | {
62 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz",
63 | "subdir": "{DOWNLOADS_PATH}/odqa"
64 | },
65 | {
66 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz",
67 | "subdir": "{MODELS_PATH}/odqa"
68 | },
69 | {
70 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_popularities.tar.gz",
71 | "subdir": "{DOWNLOADS_PATH}/odqa"
72 | },
73 | {
74 | "url": "http://files.deeppavlov.ai/deeppavlov_data/ranking/logreg_3features_v2.joblib",
75 | "subdir": "{MODELS_PATH}/odqa"
76 | }
77 | ]
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/deeppavlov/configs/doc_retrieval/en_ranker_tfidf_wiki.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "odqa_reader",
4 | "data_path": "{DOWNLOADS_PATH}/odqa/enwiki",
5 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki.db",
6 | "dataset_format": "wiki"
7 | },
8 | "dataset_iterator": {
9 | "class_name": "sqlite_iterator",
10 | "shuffle": false,
11 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
12 | },
13 | "chainer": {
14 | "in": ["docs"],
15 | "in_y": ["doc_ids", "doc_nums"],
16 | "out": ["tfidf_doc_ids"],
17 | "pipe": [
18 | {
19 | "class_name": "hashing_tfidf_vectorizer",
20 | "id": "vectorizer",
21 | "fit_on": ["docs", "doc_ids", "doc_nums"],
22 | "save_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
23 | "load_path": "{MODELS_PATH}/odqa/enwiki_tfidf_matrix_par_lite.npz",
24 | "tokenizer": {
25 | "class_name": "stream_spacy_tokenizer",
26 | "lemmas": true,
27 | "lowercase": true,
28 | "filter_stopwords": true,
29 | "ngram_range": [1, 3]
30 | }
31 | },
32 | {
33 | "class_name": "tfidf_ranker",
34 | "top_n": 100,
35 | "in": ["docs"],
36 | "out": ["tfidf_doc_ids", "tfidf_doc_scores"],
37 | "vectorizer": "#vectorizer"
38 | }
39 | ]
40 | },
41 | "train": {
42 | "batch_size": 10000,
43 | "evaluation_targets": [],
44 | "class_name": "fit_trainer"
45 | },
46 | "metadata": {
47 | "variables": {
48 | "ROOT_PATH": "~/.deeppavlov",
49 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
50 | "MODELS_PATH": "{ROOT_PATH}/models"
51 | },
52 | "download": [
53 | {
54 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_l100.tar.gz",
55 | "subdir": "{DOWNLOADS_PATH}/odqa"
56 | },
57 | {
58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/enwiki_tfidf_matrix_par_lite.tar.gz",
59 | "subdir": "{MODELS_PATH}/odqa"
60 | }
61 | ]
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
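Once fitted, the pipeline maps a query to the ids of the `top_n` (here 100) highest-scoring Wikipedia articles in the hashed TF-IDF index. A sketch:

```python
from deeppavlov import build_model, configs

ranker = build_model(configs.doc_retrieval.en_ranker_tfidf_wiki, download=True)

# Returns, per query, a list of document ids ranked by TF-IDF score.
doc_ids = ranker(["Who invented the telephone?"])
print(doc_ids[0][:5])
```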
/deeppavlov/configs/doc_retrieval/ru_ranker_tfidf_wiki.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "odqa_reader",
4 | "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki",
5 | "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db",
6 | "dataset_format": "wiki"
7 | },
8 | "dataset_iterator": {
9 | "class_name": "sqlite_iterator",
10 | "shuffle": false,
11 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db"
12 | },
13 | "chainer": {
14 | "in": ["docs"],
15 | "in_y": ["doc_ids", "doc_nums"],
16 | "out": ["tfidf_doc_ids"],
17 | "pipe": [
18 | {
19 | "class_name": "hashing_tfidf_vectorizer",
20 | "id": "vectorizer",
21 | "fit_on": ["docs", "doc_ids", "doc_nums"],
22 | "save_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz",
23 | "load_path": "{MODELS_PATH}/odqa/ruwiki_tfidf_matrix_compr.npz",
24 | "tokenizer": {
25 | "class_name": "stream_spacy_tokenizer",
26 | "spacy_model": "ru_core_news_sm",
27 | "lemmas": true,
28 | "lowercase": true,
29 | "filter_stopwords": true,
30 | "ngram_range": [1, 3]
31 | }
32 | },
33 | {
34 | "class_name": "tfidf_ranker",
35 | "top_n": 100,
36 | "in": ["docs"],
37 | "out": ["tfidf_doc_ids", "tfidf_doc_scores"],
38 | "vectorizer": "#vectorizer"
39 | }
40 | ]
41 | },
42 | "train": {
43 | "batch_size": 10000,
44 | "evaluation_targets": [],
45 | "class_name": "fit_trainer"
46 | },
47 | "metadata": {
48 | "variables": {
49 | "ROOT_PATH": "~/.deeppavlov",
50 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
51 | "MODELS_PATH": "{ROOT_PATH}/models"
52 | },
53 | "download": [
54 | {
55 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_par_page_compr.tar.gz",
56 | "subdir": "{DOWNLOADS_PATH}/odqa"
57 | },
58 | {
59 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/ruwiki_tfidf_matrix_compr.tar.gz",
60 | "subdir": "{MODELS_PATH}/odqa"
61 | }
62 | ]
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/deeppavlov/configs/embedder/bert_embedder.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["texts"],
4 | "pipe": [
5 | {
6 | "class_name": "transformers_bert_preprocessor",
7 | "vocab_file": "{BERT_PATH}/vocab.txt",
8 | "do_lower_case": false,
9 | "max_seq_length": 512,
10 | "in": ["texts"],
11 | "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"]
12 | },
13 | {
14 | "class_name": "transformers_bert_embedder",
15 | "bert_config_path": "{BERT_PATH}/bert_config.json",
16 | "load_path": "{BERT_PATH}",
17 | "truncate": true,
18 | "in": ["subword_tok_ids", "startofword_markers", "attention_mask"],
19 | "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"]
20 | }
21 | ],
22 | "out": ["tokens", "word_emb", "subword_tokens", "subword_emb", "max_emb", "mean_emb", "pooler_output"]
23 | },
24 | "train": {},
25 | "metadata": {
26 | "variables": {
27 | "ROOT_PATH": "~/.deeppavlov",
28 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
29 | "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/multi_cased_L-12_H-768_A-12_pt"
30 | },
31 | "labels": {},
32 | "download": [
33 | {
34 | "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/multi_cased_L-12_H-768_A-12_pt.tar.gz",
35 | "subdir": "{DOWNLOADS_PATH}/bert_models"
36 | }
37 | ]
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
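A call to this pipeline returns the seven values named in the chainer's `out`, in order: word-level tokens and embeddings, subword tokens and embeddings, and max/mean/pooler sentence-level vectors. A sketch of unpacking them:

```python
from deeppavlov import build_model, configs

embedder = build_model(configs.embedder.bert_embedder, download=True)

# The return order follows the config's chainer "out" field.
tokens, word_emb, subword_tokens, subword_emb, max_emb, mean_emb, pooler_output = \
    embedder(["Today is a beautiful day."])
print(tokens[0], len(word_emb[0]))
```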
/deeppavlov/configs/embedder/bert_sentence_embedder.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["texts"],
4 | "pipe": [
5 | {
6 | "class_name": "transformers_bert_preprocessor",
7 | "vocab_file": "{BERT_PATH}/vocab.txt",
8 | "do_lower_case": false,
9 | "max_seq_length": 512,
10 | "in": ["texts"],
11 | "out": ["tokens", "subword_tokens", "subword_tok_ids", "startofword_markers", "attention_mask"]
12 | },
13 | {
14 | "class_name": "transformers_bert_embedder",
15 | "bert_config_path": "{BERT_PATH}/config.json",
16 | "load_path": "{BERT_PATH}",
17 | "truncate": false,
18 | "in": ["subword_tok_ids", "startofword_markers", "attention_mask"],
19 | "out": ["word_emb", "subword_emb", "max_emb", "mean_emb", "pooler_output"]
20 | }
21 | ],
22 | "out": ["max_emb", "mean_emb", "pooler_output"]
23 | },
24 | "train": {},
25 | "metadata": {
26 | "variables": {
27 | "ROOT_PATH": "~/.deeppavlov",
28 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
29 | "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/sentence_multi_cased_L-12_H-768_A-12_pt_v1"
30 | },
31 | "labels": {},
32 | "download": [
33 | {
34 | "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/sentence_multi_cased_L-12_H-768_A-12_pt_v1.tar.gz",
35 | "subdir": "{DOWNLOADS_PATH}/bert_models"
36 | }
37 | ]
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/deeppavlov/configs/entity_extraction/entity_detection_en.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["x"],
4 | "pipe": [
5 | {
6 | "class_name": "ner_chunker",
7 | "batch_size": 16,
8 | "max_seq_len" : 300,
9 | "vocab_file": "{TRANSFORMER}",
10 | "in": ["x"],
11 | "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"]
12 | },
13 | {
14 | "thres_proba": 0.6,
15 | "o_tag": "O",
16 | "tags_file": "{NER_PATH}/tag.dict",
17 | "class_name": "entity_detection_parser",
18 | "id": "edp"
19 | },
20 | {
21 | "class_name": "ner_chunk_model",
22 | "ner": {
23 | "config_path": "{CONFIGS_PATH}/ner/ner_ontonotes_bert.json",
24 | "overwrite": {
25 | "chainer.out": ["x_tokens", "tokens_offsets", "y_pred", "probas"]
26 | }
27 | },
28 | "ner_parser": "#edp",
29 | "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"],
30 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
31 | }
32 | ],
33 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
34 | },
35 | "metadata": {
36 | "variables": {
37 | "ROOT_PATH": "~/.deeppavlov",
38 | "MODELS_PATH": "{ROOT_PATH}/models",
39 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
40 | "TRANSFORMER": "bert-base-cased",
41 | "NER_PATH": "{MODELS_PATH}/ner_ontonotes_bert_torch_crf"
42 | }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/deeppavlov/configs/entity_extraction/entity_detection_ru.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["x"],
4 | "pipe": [
5 | {
6 | "class_name": "ner_chunker",
7 | "batch_size": 16,
8 | "max_seq_len" : 300,
9 | "vocab_file": "{TRANSFORMER}",
10 | "in": ["x"],
11 | "out": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"]
12 | },
13 | {
14 | "thres_proba": 0.05,
15 | "o_tag": "O",
16 | "tags_file": "{NER_PATH}/tag.dict",
17 | "class_name": "entity_detection_parser",
18 | "id": "edp"
19 | },
20 | {
21 | "class_name": "ner_chunk_model",
22 | "ner": {"config_path": "{CONFIGS_PATH}/ner/ner_rus_bert_probas.json"},
23 | "ner_parser": "#edp",
24 | "in": ["x_chunk", "chunk_nums", "chunk_sentences_offsets", "chunk_sentences"],
25 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
26 | }
27 | ],
28 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
29 | },
30 | "metadata": {
31 | "variables": {
32 | "ROOT_PATH": "~/.deeppavlov",
33 | "MODELS_PATH": "{ROOT_PATH}/models",
34 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs",
35 | "TRANSFORMER": "DeepPavlov/rubert-base-cased",
36 | "NER_PATH": "{MODELS_PATH}/wiki_ner_rus_bert"
37 | }
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/deeppavlov/configs/entity_extraction/entity_extraction_en.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["x"],
4 | "pipe": [
5 | {
6 | "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_en.json",
7 | "in": ["x"],
8 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
9 | },
10 | {
11 | "config_path": "{CONFIGS_PATH}/entity_extraction/entity_linking_en.json",
12 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
13 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"]
14 | }
15 | ],
16 | "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"]
17 | },
18 | "metadata": {
19 | "variables": {
20 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
21 | }
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
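This config simply chains the two pipelines above by `config_path`: detection yields mention substrings, tags and offsets, which linking resolves to entity ids, pages and labels. A sketch (the example sentence is made up):

```python
from deeppavlov import build_model, configs

model = build_model(configs.entity_extraction.entity_extraction_en, download=True)

# Outputs follow the chainer "out" declaration.
entity_substr, tags, entity_offsets, entity_ids, entity_conf, entity_pages, entity_labels = \
    model(["Forrest Gump is a film directed by Robert Zemeckis."])
print(list(zip(entity_substr[0], entity_ids[0])))
```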
/deeppavlov/configs/entity_extraction/entity_extraction_ru.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["x"],
4 | "pipe": [
5 | {
6 | "config_path": "{CONFIGS_PATH}/entity_extraction/entity_detection_ru.json",
7 | "in": ["x"],
8 | "out": ["entity_substr", "entity_offsets", "entity_positions", "tags", "sentences_offsets", "sentences", "probas"]
9 | },
10 | {
11 | "config_path": "{CONFIGS_PATH}/entity_extraction/entity_linking_ru.json",
12 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
13 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"]
14 | }
15 | ],
16 | "out": ["entity_substr", "tags", "entity_offsets", "entity_ids", "entity_conf", "entity_pages", "entity_labels"]
17 | },
18 | "metadata": {
19 | "variables": {
20 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
21 | }
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/deeppavlov/configs/entity_extraction/entity_linking_en.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
4 | "pipe": [
5 | {
6 | "class_name": "torch_transformers_entity_ranker_infer",
7 | "id": "entity_descr_ranking",
8 | "pretrained_bert": "{TRANSFORMER}",
9 | "encoder_weights_path": "{MODELS_PATH}/entity_linking_eng/encoder.pth.tar",
10 | "bilinear_weights_path": "{MODELS_PATH}/entity_linking_eng/bilinear.pth.tar",
11 | "special_token_id": 30522,
12 | "emb_size": 512,
13 | "block_size": 8
14 | },
15 | {
16 | "class_name": "entity_linker",
17 | "in": ["entity_substr", "tags", "probas", "sentences", "entity_offsets", "sentences_offsets"],
18 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"],
19 | "load_path": "{DOWNLOADS_PATH}/entity_linking_eng",
20 | "entities_database_filename": "el_eng_v2.db",
21 | "entity_ranker": "#entity_descr_ranking",
22 | "rank_in_runtime": true,
23 | "num_entities_for_bert_ranking": 20,
24 | "include_mention": false,
25 | "num_entities_to_return": 3,
26 | "lemmatize": true,
27 | "use_descriptions": true,
28 | "use_connections": true,
29 | "use_tags": true,
30 | "full_paragraph": true,
31 | "return_confidences": true,
32 | "lang": "en"
33 | }
34 | ],
35 | "out": ["entity_ids", "entity_conf", "entity_pages", "entity_labels"]
36 | },
37 | "metadata": {
38 | "variables": {
39 | "ROOT_PATH": "~/.deeppavlov",
40 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
41 | "MODELS_PATH": "{ROOT_PATH}/models",
42 | "TRANSFORMER": "prajjwal1/bert-small"
43 | },
44 | "download": [
45 | {
46 | "url": "http://files.deeppavlov.ai/kbqa/downloads/el_db_eng_v2.tar.gz",
47 | "subdir": "{DOWNLOADS_PATH}/entity_linking_eng"
48 | },
49 | {
50 | "url": "http://files.deeppavlov.ai/deeppavlov_data/entity_linking/el_ranker_eng.tar.gz",
51 | "subdir": "{MODELS_PATH}/entity_linking_eng"
52 | }
53 | ]
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/deeppavlov/configs/kbqa/wiki_parser.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["parser_info", "query"],
4 | "pipe": [
5 | {
6 | "class_name": "wiki_parser",
7 | "in": ["parser_info", "query"],
8 | "out": ["wiki_parser_output"],
9 | "wiki_filename": "{DOWNLOADS_PATH}/wikidata/wikidata_compr.pickle",
10 | "file_format": "pickle",
11 | "lang": "@en"
12 | }
13 | ],
14 | "out": ["wiki_parser_output"]
15 | },
16 | "metadata": {
17 | "variables": {
18 | "ROOT_PATH": "~/.deeppavlov",
19 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
20 | "MODELS_PATH": "{ROOT_PATH}/models",
21 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
22 | },
23 | "download": [
24 | {
25 | "url": "http://files.deeppavlov.ai/kbqa/wikidata/wikidata_compr.pickle",
26 | "subdir": "{DOWNLOADS_PATH}/wikidata"
27 | }
28 | ]
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/deeppavlov/configs/morpho_syntax_parser/ru_syntagrus_joint_parsing.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["x_words"],
4 | "pipe": [
5 | {
6 | "id": "main",
7 | "class_name": "joint_tagger_parser",
8 | "tagger": {
9 | "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/morpho_ru_syntagrus_bert.json",
10 | "overwrite": {"chainer.pipe.6.return_string": false}
11 | },
12 | "parser": {
13 | "config_path": "{CONFIGS_PATH}/morpho_syntax_parser/syntax_ru_syntagrus_bert.json",
14 | "overwrite": {"chainer.pipe.6.return_string": false}
15 | },
16 | "in": ["x_words"],
17 | "out": ["y_parsed"]
18 | }
19 | ],
20 | "out": ["y_parsed"]
21 | },
22 | "metadata": {
23 | "variables": {
24 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
25 | }
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
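The `joint_tagger_parser` runs the referenced tagger and parser configs (with their final components' `return_string` overridden to false) and merges the two analyses into a single parse per sentence. A sketch (Russian example sentence made up):

```python
from deeppavlov import build_model, configs

model = build_model(configs.morpho_syntax_parser.ru_syntagrus_joint_parsing,
                    download=True)

# One merged morpho-syntactic analysis per input sentence.
for parse in model(["Мама мыла раму."]):
    print(parse)
```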
/deeppavlov/configs/ner/ner_bert_base.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["x"],
4 | "in_y": ["y"],
5 | "pipe": [
6 | {
7 | "class_name": "torch_transformers_ner_preprocessor",
8 | "vocab_file": "{BASE_MODEL}",
9 | "in": ["x"],
10 | "out": ["x_tokens", "x_subword_tokens", "x_subword_tok_ids", "startofword_markers", "attention_mask", "tokens_offsets"]
11 | },
12 | {
13 | "id": "tag_vocab",
14 | "class_name": "simple_vocab",
15 | "unk_token": ["O"],
16 | "save_path": "{MODEL_PATH}/tag.dict",
17 | "load_path": "{MODEL_PATH}/tag.dict",
18 | "fit_on": ["y"],
19 | "in": ["y"],
20 | "out": ["y_ind"]
21 | },
22 | {
23 | "class_name": "torch_transformers_sequence_tagger",
24 | "n_tags": "#tag_vocab.len",
25 | "pretrained_bert": "{BASE_MODEL}",
26 | "save_path": "{MODEL_PATH}/model",
27 | "load_path": "{MODEL_PATH}/model",
28 | "in": ["x_subword_tok_ids", "attention_mask", "startofword_markers"],
29 | "in_y": ["y_ind"],
30 | "out": ["y_pred_ind", "probas"]
31 | },
32 | {
33 | "ref": "tag_vocab",
34 | "in": ["y_pred_ind"],
35 | "out": ["y_pred"]
36 | }
37 | ],
38 | "out": ["x_tokens", "y_pred"]
39 | },
40 | "metadata": {
41 | "variables": {
42 | "BASE_MODEL": "bert-base-multilingual-cased",
43 | "ROOT_PATH": "~/.deeppavlov",
44 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
45 | "MODELS_PATH": "{ROOT_PATH}/models",
46 | "MODEL_PATH": "{MODELS_PATH}/ner/{BASE_MODEL}"
47 | },
48 | "download": [
49 | {
50 | "url": "http://files.deeppavlov.ai/v1/ner/ner_bert_base.tar.gz",
51 | "subdir": "{MODEL_PATH}"
52 | }
53 | ]
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
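Per the chainer's `out`, inference returns the tokenized inputs alongside one tag per token (with `O` as the out-of-entity tag from `tag_vocab`). A sketch:

```python
from deeppavlov import build_model, configs

ner = build_model(configs.ner.ner_bert_base, download=True)

tokens, tags = ner(["Bob Ross lived in Florida"])
# One tag per token; non-entity tokens are tagged 'O'.
print(list(zip(tokens[0], tags[0])))
```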
/deeppavlov/configs/odqa/en_odqa_infer_wiki.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["question_raw"],
4 | "out": ["answer", "answer_score", "answer_place"],
5 | "pipe": [
6 | {
7 | "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_tfidf_wiki.json",
8 | "in": ["question_raw"],
9 | "out": ["tfidf_doc_ids"]
10 | },
11 | {
12 | "class_name": "bpr",
13 | "load_path": "{MODELS_PATH}/bpr/eng",
14 | "query_encoder_file": "query_encoder_en.pth.tar",
15 | "bpr_index": "bpr_finetuned_nq_adv.idx",
16 | "pretrained_model": "bert-base-uncased",
17 | "top_n": 100,
18 | "in": ["question_raw"],
19 | "out": ["bpr_doc_ids"]
20 | },
21 | {
22 | "class_name": "concat_lists",
23 | "in": ["tfidf_doc_ids", "bpr_doc_ids"],
24 | "out": ["doc_ids"]
25 | },
26 | {
27 | "class_name": "wiki_sqlite_vocab",
28 | "in": ["doc_ids"],
29 | "out": ["doc_text"],
30 | "join_docs": false,
31 | "shuffle": false,
32 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
33 | },
34 | {
35 | "class_name": "string_multiplier",
36 | "in": ["question_raw", "doc_text"],
37 | "out":["questions"]
38 | },
39 | {
40 | "class_name": "logit_ranker",
41 | "batch_size": 64,
42 | "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"},
43 | "sort_noans": true,
44 | "in": ["doc_text", "questions"],
45 | "out": ["answer", "answer_score", "answer_place"]
46 | }
47 | ]
48 | },
49 | "metadata": {
50 | "variables": {
51 | "ROOT_PATH": "~/.deeppavlov",
52 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
53 | "MODELS_PATH": "{ROOT_PATH}/models",
54 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
55 | },
56 | "download": [
57 | {
58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz",
59 | "subdir": "{MODELS_PATH}/bpr/eng"
60 | }
61 | ]
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
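End to end: the question goes through both the TF-IDF ranker and the BPR retriever, the union of retrieved pages is read by the SQuAD-style model, and `logit_ranker` keeps the best answer together with its score and placement. A sketch:

```python
from deeppavlov import build_model, configs

odqa = build_model(configs.odqa.en_odqa_infer_wiki, download=True)

# Outputs follow the chainer "out": answer, answer_score, answer_place.
answers, scores, places = odqa(["Where did guinea pigs originate?"])
print(answers[0], scores[0], places[0])
```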
/deeppavlov/configs/odqa/en_odqa_pop_infer_wiki.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["question_raw"],
4 | "out": ["answer", "answer_score", "answer_place"],
5 | "pipe": [
6 | {
7 | "config_path": "{CONFIGS_PATH}/doc_retrieval/en_ranker_pop_wiki.json",
8 | "in": ["question_raw"],
9 | "out": ["tfidf_doc_ids"]
10 | },
11 | {
12 | "class_name": "bpr",
13 | "load_path": "{MODELS_PATH}/bpr/eng",
14 | "query_encoder_file": "query_encoder_en.pth.tar",
15 | "bpr_index": "bpr_finetuned_nq_adv.idx",
16 | "pretrained_model": "bert-base-uncased",
17 | "top_n": 100,
18 | "in": ["question_raw"],
19 | "out": ["bpr_doc_ids"]
20 | },
21 | {
22 | "class_name": "concat_lists",
23 | "in": ["tfidf_doc_ids", "bpr_doc_ids"],
24 | "out": ["doc_ids"]
25 | },
26 | {
27 | "class_name": "wiki_sqlite_vocab",
28 | "in": ["doc_ids"],
29 | "out": ["doc_text"],
30 | "join_docs": false,
31 | "shuffle": false,
32 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_l100.db"
33 | },
34 | {
35 | "class_name": "string_multiplier",
36 | "in": ["question_raw", "doc_text"],
37 | "out":["questions"]
38 | },
39 | {
40 | "class_name": "logit_ranker",
41 | "batch_size": 64,
42 | "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_nq_psgcls_bert.json"},
43 | "sort_noans": true,
44 | "in": ["doc_text", "questions"],
45 | "out": ["answer", "answer_score", "answer_place"]
46 | }
47 | ]
48 | },
49 | "metadata": {
50 | "variables": {
51 | "ROOT_PATH": "~/.deeppavlov",
52 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
53 | "MODELS_PATH": "{ROOT_PATH}/models",
54 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
55 | },
56 | "download": [
57 | {
58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/bpr_encoder_index_eng.tar.gz",
59 | "subdir": "{MODELS_PATH}/bpr/eng"
60 | }
61 | ]
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/deeppavlov/configs/odqa/ru_odqa_infer_wiki.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["question_raw"],
4 | "out": ["best_answer"],
5 | "pipe": [
6 | {
7 | "config_path": "{CONFIGS_PATH}/doc_retrieval/ru_ranker_tfidf_wiki.json",
8 | "in": ["question_raw"],
9 | "out": ["tfidf_doc_ids"]
10 | },
11 | {
12 | "class_name": "wiki_sqlite_vocab",
13 | "in": ["tfidf_doc_ids"],
14 | "out": ["tfidf_doc_text"],
15 | "join_docs": false,
16 | "shuffle": false,
17 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_par_page_compr.db"
18 | },
19 | {
20 | "class_name": "string_multiplier",
21 | "in": ["question_raw", "tfidf_doc_text"],
22 | "out":["questions"]
23 | },
24 | {
25 | "class_name": "logit_ranker",
26 | "batch_size": 64,
27 | "squad_model": {"config_path": "{CONFIGS_PATH}/squad/qa_multisberquad_bert.json"},
28 | "sort_noans": true,
29 | "in": ["tfidf_doc_text", "questions"],
30 | "out": ["best_answer", "best_answer_score"]
31 | }
32 | ]
33 | },
34 | "metadata": {
35 | "variables": {
36 | "ROOT_PATH": "~/.deeppavlov",
37 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
38 | "MODELS_PATH": "{ROOT_PATH}/models",
39 | "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
40 | },
41 | "download": [
42 | ]
43 | }
44 | }
45 |
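Note: the Russian pipeline mirrors the English one but uses TF-IDF retrieval only and declares a single chainer output, so inference returns one batched list instead of three. Its own download list is empty because the referenced ranker and reader configs declare their files themselves. A sketch (the question is illustrative):

    from deeppavlov import build_model, configs

    ru_odqa = build_model(configs.odqa.ru_odqa_infer_wiki, download=True)
    # single output `best_answer` -> one list, aligned with the input batch
    answers = ru_odqa(['Какой город является столицей России?'])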
--------------------------------------------------------------------------------
/deeppavlov/configs/ranking/path_ranking_nll_roberta_en.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["question", "rels"],
4 | "pipe": [
5 | {
6 | "class_name": "path_ranking_preprocessor",
7 | "vocab_file": "{TRANSFORMER}",
8 | "do_lower_case": false,
9 | "additional_special_tokens": ["", "", "", "", "", "", ""],
10 | "max_seq_length": 96,
11 | "in": ["question", "rels"],
12 | "out": ["bert_features"]
13 | },
14 | {
15 | "class_name": "torch_transformers_nll_ranker",
16 | "in": ["bert_features"],
17 | "out": ["model_output"],
18 | "return_probas": true,
19 | "save_path": "{MODEL_PATH}/model",
20 | "load_path": "{MODEL_PATH}/model",
21 | "encoder_save_path": "{MODEL_PATH}/encoder",
22 | "linear_save_path": "{MODEL_PATH}/linear",
23 | "pretrained_bert": "{TRANSFORMER}",
24 | "learning_rate_drop_patience": 5,
25 | "learning_rate_drop_div": 1.5,
26 | "optimizer_parameters": {"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6}
27 | }
28 | ],
29 | "out": ["model_output"]
30 | },
31 | "metadata": {
32 | "variables": {
33 | "TRANSFORMER": "haisongzhang/roberta-tiny-cased",
34 | "MODEL_PATH": "~/.deeppavlov/models/classifiers/path_ranking_nll_roberta_lcquad2"
35 | },
36 | "download": [
37 | {
38 | "url": "http://files.deeppavlov.ai/kbqa/models/path_ranking_nll_roberta_lcquad2.tar.gz",
39 | "subdir": "{MODEL_PATH}"
40 | }
41 | ]
42 | }
43 | }
44 |
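Note: placeholders such as {TRANSFORMER} and {MODEL_PATH} come from metadata.variables and are substituted across the whole config, so they can be overridden before building. A sketch that redirects the model directory (the target path is illustrative):

    from deeppavlov import build_model, configs
    from deeppavlov.core.common.file import read_json

    config = read_json(configs.ranking.path_ranking_nll_roberta_en)
    config['metadata']['variables']['MODEL_PATH'] = '~/experiments/path_ranking_nll'
    ranker = build_model(config, download=True)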
--------------------------------------------------------------------------------
/deeppavlov/configs/ranking/ranking_ubuntu_v2_torch_bert_uncased.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "ubuntu_v2_reader",
4 | "data_path": "{DOWNLOADS_PATH}/ubuntu_v2_data"
5 | },
6 | "dataset_iterator": {
7 | "class_name": "siamese_iterator",
8 | "seed": 243
9 | },
10 | "chainer": {
11 | "in": [
12 | "x"
13 | ],
14 | "in_y": [
15 | "y"
16 | ],
17 | "pipe": [
18 | {
19 | "class_name": "torch_bert_ranker_preprocessor",
20 | "vocab_file": "bert-base-uncased",
21 | "do_lower_case": true,
22 | "max_seq_length": 128,
23 | "in": [
24 | "x"
25 | ],
26 | "out": [
27 | "bert_features"
28 | ]
29 | },
30 | {
31 | "class_name": "torch_bert_ranker",
32 | "pretrained_bert": "bert-base-uncased",
33 | "save_path": "{MODEL_PATH}/model",
34 | "load_path": "{MODEL_PATH}/model",
35 | "optimizer": "AdamW",
36 | "optimizer_parameters": {
37 | "lr": 2e-5,
38 | "weight_decay": 1e-2,
39 | "betas": [
40 | 0.9,
41 | 0.999
42 | ],
43 | "eps": 1e-6
44 | },
45 | "clip_norm": 1.0,
46 | "in": [
47 | "bert_features"
48 | ],
49 | "in_y": [
50 | "y"
51 | ],
52 | "out": [
53 | "predictions"
54 | ]
55 | }
56 | ],
57 | "out": [
58 | "predictions"
59 | ]
60 | },
61 | "train": {
62 | "batch_size": 32,
63 | "pytest_max_batches": 2,
64 | "train_metrics": [],
65 | "metrics": [
66 | "r@1",
67 | "r@2",
68 | "r@5"
69 | ],
70 | "validation_patience": 1,
71 | "val_every_n_epochs": 1,
72 | "log_every_n_epochs": 1,
73 | "evaluation_targets": [
74 | "valid",
75 | "test"
76 | ],
77 | "class_name": "torch_trainer"
78 | },
79 | "metadata": {
80 | "variables": {
81 | "ROOT_PATH": "~/.deeppavlov",
82 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
83 | "MODELS_PATH": "{ROOT_PATH}/models",
84 | "MODEL_PATH": "{MODELS_PATH}/ubuntu_v2_uncased_torch_bert_model"
85 | },
86 | "download": [
87 | {
88 | "url": "http://files.deeppavlov.ai/datasets/ubuntu_v2_data.tar.gz",
89 | "subdir": "{DOWNLOADS_PATH}/ubuntu_v2_data"
90 | },
91 | {
92 | "url": "http://files.deeppavlov.ai/deeppavlov_data/ubuntu_v2_uncased_torch_bert_model_v2.tar.gz",
93 | "subdir": "{MODELS_PATH}"
94 | }
95 | ]
96 | }
97 | }
98 |
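Note: unlike the inference-only configs above, this one also carries dataset_reader, dataset_iterator and train sections, so it can be fine-tuned end to end; torch_trainer stops once validation recall fails to improve ("validation_patience": 1). A training sketch, assuming a recent DeepPavlov:

    from deeppavlov import train_model, configs

    # downloads ubuntu_v2_data and the pretrained checkpoint, then fine-tunes;
    # the result is saved under MODEL_PATH from metadata.variables
    ranker = train_model(configs.ranking.ranking_ubuntu_v2_torch_bert_uncased,
                         download=True)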
--------------------------------------------------------------------------------
/deeppavlov/configs/ranking/rel_ranking_nll_bert_ru.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["question", "rels"],
4 | "pipe": [
5 | {
6 | "class_name": "path_ranking_preprocessor",
7 | "vocab_file": "{TRANSFORMER}",
8 | "do_lower_case": false,
9 | "max_seq_length": 96,
10 | "in": ["question", "rels"],
11 | "out": ["bert_features"]
12 | },
13 | {
14 | "class_name": "torch_transformers_nll_ranker",
15 | "in": ["bert_features"],
16 | "out": ["model_output"],
17 | "return_probas": true,
18 | "save_path": "{MODEL_PATH}/model",
19 | "load_path": "{MODEL_PATH}/model",
20 | "encoder_save_path": "{MODEL_PATH}/encoder",
21 | "linear_save_path": "{MODEL_PATH}/linear",
22 | "pretrained_bert": "{TRANSFORMER}",
23 | "learning_rate_drop_patience": 4,
24 | "learning_rate_drop_div": 1.5,
25 | "optimizer_parameters": {"lr": 1e-5, "weight_decay": 0.01, "eps": 1e-6}
26 | }
27 | ],
28 | "out": ["model_output"]
29 | },
30 | "metadata": {
31 | "variables": {
32 | "ROOT_PATH": "~/.deeppavlov",
33 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
34 | "MODELS_PATH": "{ROOT_PATH}/models",
35 | "TRANSFORMER": "DeepPavlov/rubert-base-cased",
36 | "MODEL_PATH": "{MODELS_PATH}/classifiers/rel_ranking_nll_bert_ru"
37 | },
38 | "download": [
39 | {
40 | "url": "http://files.deeppavlov.ai/kbqa/models/rel_ranking_nll_bert_ru.tar.gz",
41 | "subdir": "{MODEL_PATH}"
42 | }
43 | ]
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "typos_wikipedia_reader",
4 | "data_path": "{DOWNLOADS_PATH}"
5 | },
6 | "dataset_iterator": {
7 | "class_name": "typos_iterator",
8 | "test_ratio": 0.05
9 | },
10 | "chainer":{
11 | "in": ["x"],
12 | "in_y": ["y"],
13 | "pipe": [
14 | {
15 | "class_name": "str_lower",
16 | "id": "lower",
17 | "in": ["x"],
18 | "out": ["x_lower"]
19 | },
20 | {
21 | "class_name": "nltk_moses_tokenizer",
22 | "id": "tokenizer",
23 | "in": ["x_lower"],
24 | "out": ["x_tokens"]
25 | },
26 | {
27 | "ref": "tokenizer",
28 | "in": ["y"],
29 | "out": ["y_tokens"]
30 | },
31 | {
32 | "fit_on": ["x_tokens", "y_tokens"],
33 | "in": ["x_tokens"],
34 | "out": ["tokens_candidates"],
35 | "class_name": "spelling_error_model",
36 | "window": 1,
37 | "candidates_count": 4,
38 | "dictionary": {
39 | "class_name": "wikitionary_100K_vocab",
40 | "data_dir": "{DOWNLOADS_PATH}/vocabs"
41 | },
42 | "save_path": "{MODELS_PATH}/error_model/error_model.tsv"
43 | },
44 | {
45 | "class_name": "kenlm_elector",
46 | "in": ["tokens_candidates"],
47 | "out": ["y_predicted_tokens"],
48 | "load_path": "{DOWNLOADS_PATH}/language_models/en_wiki_no_punkt.arpa.binary"
49 | },
50 | {
51 | "ref": "tokenizer",
52 | "in": ["y_predicted_tokens"],
53 | "out": ["y_predicted"]
54 | }
55 | ],
56 | "out": ["y_predicted"]
57 | },
58 | "train": {
59 | "evaluation_targets": ["test"],
60 | "class_name": "fit_trainer"
61 | },
62 | "metadata": {
63 | "variables": {
64 | "ROOT_PATH": "~/.deeppavlov",
65 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
66 | "MODELS_PATH": "{ROOT_PATH}/models"
67 | },
68 | "download": [
69 | {
70 | "url": "http://files.deeppavlov.ai/deeppavlov_data/error_model.tar.gz",
71 | "subdir": "{MODELS_PATH}"
72 | },
73 | {
74 | "url": "http://files.deeppavlov.ai/lang_models/en_wiki_no_punkt.arpa.binary.gz",
75 | "subdir": "{DOWNLOADS_PATH}/language_models"
76 | },
77 | {
78 | "url": "http://files.deeppavlov.ai/datasets/wiktionary/wikipedia_100K_vocab.tar.gz",
79 | "subdir": "{DOWNLOADS_PATH}/vocabs"
80 | }
81 | ]
82 | }
83 | }
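Note: here fit_trainer is used instead of a gradient trainer: spelling_error_model is fitted on the aligned typo pairs via its fit_on inputs, and the pipeline is then evaluated on the 5% test split. A sketch (the misspelled input is illustrative):

    from deeppavlov import train_model, configs

    model = train_model(configs.spelling_correction.brillmoore_wikitypos_en,
                        download=True)
    print(model(['helo wrold']))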
--------------------------------------------------------------------------------
/deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer":{
3 | "in": ["x"],
4 | "pipe": [
5 | {
6 | "class_name": "str_lower",
7 | "id": "lower",
8 | "in": ["x"],
9 | "out": ["x_lower"]
10 | },
11 | {
12 | "class_name": "nltk_moses_tokenizer",
13 | "id": "tokenizer",
14 | "in": ["x_lower"],
15 | "out": ["x_tokens"]
16 | },
17 | {
18 | "id": "vocab",
19 | "class_name": "simple_vocab",
20 | "save_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict",
21 | "load_path": "{DOWNLOADS_PATH}/vocabs/russian_words_vocab.dict"
22 | },
23 | {
24 | "in": ["x_tokens"],
25 | "out": ["tokens_candidates"],
26 | "class_name": "spelling_levenshtein",
27 | "words": "#vocab.keys()"
28 | },
29 | {
30 | "class_name": "kenlm_elector",
31 | "in": ["tokens_candidates"],
32 | "out": ["y_predicted_tokens"],
33 | "load_path": "{DOWNLOADS_PATH}/language_models/ru_wiyalen_no_punkt.arpa.binary"
34 | },
35 | {
36 | "ref": "tokenizer",
37 | "in": ["y_predicted_tokens"],
38 | "out": ["y_predicted"]
39 | }
40 | ],
41 | "out": ["y_predicted"]
42 | },
43 | "metadata": {
44 | "variables": {
45 | "ROOT_PATH": "~/.deeppavlov",
46 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
47 | "MODELS_PATH": "{ROOT_PATH}/models"
48 | },
49 | "download": [
50 | {
51 | "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz",
52 | "subdir": "{DOWNLOADS_PATH}/vocabs"
53 | },
54 | {
55 | "url": "http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz",
56 | "subdir": "{DOWNLOADS_PATH}/language_models"
57 | }
58 | ]
59 | }
60 | }
61 |
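Note the "words": "#vocab.keys()" entry: a string starting with "#" is DeepPavlov's in-config reference to the component with the matching "id", so the Levenshtein candidate generator reuses the keys of the simple_vocab above instead of loading the word list twice. An inference sketch (the input is illustrative):

    from deeppavlov import build_model, configs

    corrector = build_model(configs.spelling_correction.levenshtein_corrector_ru,
                            download=True)
    print(corrector(['сонце светит ярко']))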
--------------------------------------------------------------------------------
/deeppavlov/configs/squad/qa_nq_psgcls_bert.json:
--------------------------------------------------------------------------------
1 | {
2 | "chainer": {
3 | "in": ["context_raw", "question_raw"],
4 | "pipe": [
5 | {
6 | "class_name": "torch_squad_transformers_preprocessor",
7 | "vocab_file": "{TRANSFORMER}",
8 | "do_lower_case": "{LOWERCASE}",
9 | "max_seq_length": 384,
10 | "in": ["question_raw", "context_raw"],
11 | "out": ["bert_features", "subtokens", "split_context"]
12 | },
13 | {
14 | "class_name": "squad_bert_mapping",
15 | "do_lower_case": "{LOWERCASE}",
16 | "in": ["split_context", "bert_features", "subtokens"],
17 | "out": ["subtok2chars", "char2subtoks"]
18 | },
19 | {
20 | "class_name": "torch_transformers_squad",
21 | "pretrained_bert": "{TRANSFORMER}",
22 | "save_path": "{MODEL_PATH}/model",
23 | "load_path": "{MODEL_PATH}/model",
24 | "torch_seed": 1,
25 | "optimizer": "AdamW",
26 | "optimizer_parameters": {
27 | "lr": 2e-05,
28 | "weight_decay": 0.01,
29 | "betas": [0.9, 0.999],
30 | "eps": 1e-06
31 | },
32 | "random_seed": 1,
33 | "psg_cls": true,
34 | "learning_rate_drop_patience": 2,
35 | "learning_rate_drop_div": 2.0,
36 | "in": ["bert_features"],
37 | "out": ["ans_start_predicted", "ans_end_predicted", "logits", "scores", "inds"]
38 | },
39 | {
40 | "class_name": "squad_bert_ans_postprocessor",
41 | "in": ["ans_start_predicted", "ans_end_predicted", "split_context", "subtok2chars", "subtokens", "inds"],
42 | "out": ["ans_predicted", "ans_start_predicted", "ans_end_predicted"]
43 | }
44 | ],
45 | "out": ["ans_predicted", "ans_start_predicted", "scores"]
46 | },
47 | "metadata": {
48 | "variables": {
49 | "LOWERCASE": true,
50 | "TRANSFORMER": "bert-base-uncased",
51 | "ROOT_PATH": "~/.deeppavlov",
52 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
53 | "MODELS_PATH": "{ROOT_PATH}/models",
54 | "MODEL_PATH": "{MODELS_PATH}/passage_reader_classifier_eng"
55 | },
56 | "download": [
57 | {
58 | "url": "http://files.deeppavlov.ai/deeppavlov_data/odqa/nq_psgcls_bert.tar.gz",
59 | "subdir": "{MODEL_PATH}"
60 | }
61 | ]
62 | }
63 | }
64 |
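Note: this reader is the squad_model referenced by the English ODQA configs above; "psg_cls": true enables the passage-classification head whose scores the ODQA logit_ranker uses to compare answers across passages. Used standalone, it maps a batch of contexts and a batch of questions to answers. A sketch (texts are illustrative):

    from deeppavlov import build_model, configs

    reader = build_model(configs.squad.qa_nq_psgcls_bert, download=True)
    answers, starts, scores = reader(
        ['Kathmandu is the capital city of Nepal.'],
        ['What is the capital of Nepal?'])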
--------------------------------------------------------------------------------
/deeppavlov/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/core/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/commands/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/core/common/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/common/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/core/common/aliases.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ALIASES = {
16 | 'kbqa_cq': 'kbqa_cq_en',
17 | 'kbqa_cq_online': 'kbqa_cq_en',
18 | 'kbqa_cq_rus': 'kbqa_cq_ru',
19 | 'multi_squad_noans': 'qa_squad2_bert',
20 | 'multi_squad_noans_infer': 'qa_squad2_bert',
21 | 'multi_squad_retr_noans': 'qa_squad2_bert',
22 | 'ner_collection3_m1': 'ner_collection3_bert',
23 | 'ner_conll2003': 'ner_conll2003_bert',
24 | 'ner_conll2003_torch_bert': 'ner_conll2003_bert',
25 | 'ner_dstc2': 'ner_conll2003_bert',
26 | 'ner_few_shot_ru': 'ner_rus_bert',
27 | 'ner_few_shot_ru_simulate': 'ner_rus_bert',
28 | 'ner_ontonotes': 'ner_ontonotes_bert',
29 | 'ner_ontonotes_bert_emb': 'ner_ontonotes_bert',
30 | 'ner_ontonotes_bert_mult_torch': 'ner_ontonotes_bert_mult',
31 | 'ner_ontonotes_bert_torch': 'ner_ontonotes_bert',
32 | 'ner_rus': 'ner_rus_bert',
33 | 'paraphraser_bert': 'paraphraser_rubert',
34 | 'ru_odqa_infer_wiki_rubert': 'ru_odqa_infer_wiki',
35 | 'sentseg_dailydialog': 'sentseg_dailydialog_bert',
36 | 'squad': 'squad_bert',
37 | 'squad_bert_infer': 'squad_bert',
38 | 'squad_bert_multilingual_freezed_emb': 'squad_bert',
39 | 'squad_ru': 'squad_ru_bert',
40 | 'squad_ru_bert_infer': 'squad_ru_bert',
41 | 'squad_ru_convers_distilrubert_2L_infer': 'squad_ru_convers_distilrubert_2L',
42 | 'squad_ru_convers_distilrubert_6L_infer': 'squad_ru_convers_distilrubert_6L',
43 | 'squad_ru_rubert': 'squad_ru_bert',
44 | 'squad_ru_rubert_infer': 'squad_ru_bert',
45 | 'squad_torch_bert': 'squad_bert',
46 | 'squad_torch_bert_infer': 'squad_bert'
47 | }
48 |
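Note: ALIASES maps retired config names to their current equivalents; a lookup presumably falls back to the requested name when no alias exists, along the lines of this sketch (the helper function is hypothetical):

    from deeppavlov.core.common.aliases import ALIASES

    def resolve(config_name: str) -> str:
        # unknown names pass through unchanged
        return ALIASES.get(config_name, config_name)

    assert resolve('squad') == 'squad_bert'
    assert resolve('ner_bert_base') == 'ner_bert_base'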
--------------------------------------------------------------------------------
/deeppavlov/core/common/errors.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import logging
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
20 | class ConfigError(Exception):
21 | """Any configuration error."""
22 |
23 | def __init__(self, message):
24 | super(ConfigError, self).__init__()
25 | self.message = message
26 |
27 | def __str__(self):
28 | return repr(self.message)
29 |
--------------------------------------------------------------------------------
/deeppavlov/core/common/log.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import json
16 | import logging
17 | import logging.config
18 | from pathlib import Path
19 |
20 | from .paths import get_settings_path
21 |
22 | LOG_CONFIG_FILENAME = 'log_config.json'
23 | TRACEBACK_LOGGER_ERRORS = True
24 |
25 | root_path = Path(__file__).resolve().parents[3]
26 |
27 | log_config_path = get_settings_path() / LOG_CONFIG_FILENAME
28 |
29 | with log_config_path.open(encoding='utf8') as log_config_json:
30 | log_config = json.load(log_config_json)
31 |
32 |
33 | class ProbeFilter(logging.Filter):
34 | """ProbeFilter class is used to filter POST requests to /probe endpoint from logs."""
35 |
36 | def filter(self, record: logging.LogRecord) -> bool:
37 | """To log the record method should return True."""
38 | return 'POST /probe HTTP' not in record.getMessage()
39 |
40 |
41 | def init_logger():
42 | configured_loggers = [log_config.get('root', {})] + [logger for logger in
43 | log_config.get('loggers', {}).values()]
44 |
45 | used_handlers = {handler for log in configured_loggers for handler in log.get('handlers', [])}
46 |
47 | for handler_id, handler in list(log_config['handlers'].items()):
48 | if handler_id not in used_handlers:
49 | del log_config['handlers'][handler_id]
50 | elif 'filename' in handler.keys():
51 | filename = handler['filename']
52 | logfile_path = Path(filename).expanduser().resolve()
53 | handler['filename'] = str(logfile_path)
54 |
55 | logging.config.dictConfig(log_config)
56 |
--------------------------------------------------------------------------------
/deeppavlov/core/common/log_events.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 | from typing import Optional
17 | from deeppavlov.core.commands.utils import expand_path
18 |
19 | log = getLogger(__name__)
20 |
21 |
22 | class TBWriter:
23 | def __init__(self, tensorboard_log_dir: str):
24 | # TODO: After adding wandb logger, create common parent class for both loggers
25 | from torch.utils.tensorboard import SummaryWriter
26 | tensorboard_log_dir = expand_path(tensorboard_log_dir)
27 | self.tb_train_writer = SummaryWriter(str(tensorboard_log_dir / 'train_log'))
28 | self.tb_valid_writer = SummaryWriter(str(tensorboard_log_dir / 'valid_log'))
29 |
30 | # TODO: find how to write Summary
31 | def write_train(self, tag, scalar_value, global_step):
32 | self.tb_train_writer.add_scalar(tag, scalar_value, global_step)
33 |
34 | def write_valid(self, tag, scalar_value, global_step):
35 | self.tb_valid_writer.add_scalar(tag, scalar_value, global_step)
36 |
37 | def flush(self):
38 | self.tb_train_writer.flush()
39 | self.tb_valid_writer.flush()
40 |
41 |
42 | def get_tb_writer(tensorboard_log_dir: Optional[str]) -> Optional[TBWriter]:
43 | try:
44 | if tensorboard_log_dir is not None:
45 | tb_writer = TBWriter(tensorboard_log_dir)
46 | else:
47 | tb_writer = None
48 | except ImportError:
49 | log.error('Failed to import SummaryWriter from torch.utils.tensorboard. Failed to initialize Tensorboard '
50 | 'logger. Install appropriate Pytorch version to use this logger or remove tensorboard_log_dir '
51 | 'parameter from the train parameters list in the configuration file.')
52 | tb_writer = None
53 | return tb_writer
54 |
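Note: get_tb_writer degrades gracefully: it returns None when no log dir is configured or when tensorboard support cannot be imported, so callers are expected to guard on the result. A usage sketch (the log directory is illustrative):

    from deeppavlov.core.common.log_events import get_tb_writer

    tb = get_tb_writer('~/.deeppavlov/tensorboard')
    if tb is not None:
        tb.write_train('loss', 0.37, global_step=100)
        tb.write_valid('accuracy', 0.91, global_step=100)
        tb.flush()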
--------------------------------------------------------------------------------
/deeppavlov/core/common/metrics_registry.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import importlib
16 | import json
17 | from logging import getLogger
18 | from pathlib import Path
19 | from typing import Callable, Any
20 |
21 | from deeppavlov.core.common.errors import ConfigError
22 |
23 | log = getLogger(__name__)
24 |
25 | _registry_path = Path(__file__).parent / 'metrics_registry.json'
26 | if _registry_path.exists():
27 | with _registry_path.open(encoding='utf-8') as f:
28 | _REGISTRY = json.load(f)
29 | else:
30 | _REGISTRY = {}
31 |
32 |
33 | def fn_from_str(name: str) -> Callable[..., Any]:
34 | """Returns a function object with the name given in string."""
35 | try:
36 | module_name, fn_name = name.split(':')
37 | return getattr(importlib.import_module(module_name), fn_name)
38 | except ValueError:
39 | raise ConfigError('Expected function description in a `module.submodules:function_name` form, but got `{}`'
40 | .format(name))
41 | except AttributeError:
42 | # noinspection PyUnboundLocalVariable
43 | raise ConfigError(f"Incorrect metric: '{module_name}' has no attribute '{fn_name}'.")
44 |
45 |
46 | def register_metric(metric_name: str) -> Callable[..., Any]:
47 | """Decorator for metric registration."""
48 |
49 | def decorate(fn):
50 | fn_name = fn.__module__ + ':' + fn.__name__
51 | if metric_name in _REGISTRY and _REGISTRY[metric_name] != fn_name:
52 | log.warning('"{}" is already registered as a metric name, the old function will be ignored'
53 | .format(metric_name))
54 | _REGISTRY[metric_name] = fn_name
55 | return fn
56 |
57 | return decorate
58 |
59 |
60 | def get_metric_by_name(name: str) -> Callable[..., Any]:
61 | """Returns a metric callable with a corresponding name."""
62 | name = _REGISTRY.get(name, name)
63 | return fn_from_str(name)
64 |
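Note: a metric registered through the decorator becomes addressable by name from any config's metrics list; what gets stored is the 'module:function' string that fn_from_str later resolves and imports. A sketch of registering a custom metric (the name and body are illustrative):

    from deeppavlov.core.common.metrics_registry import register_metric

    @register_metric('exact_match_lower')
    def exact_match_lower(y_true, y_predicted) -> float:
        pairs = list(zip(y_true, y_predicted))
        # case-insensitive exact match over the batch
        return sum(t.lower() == p.lower() for t, p in pairs) / max(len(pairs), 1)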
--------------------------------------------------------------------------------
/deeppavlov/core/common/paths.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import os
15 | import shutil
16 |
17 | from pathlib import Path
18 |
19 | _root_path = Path(__file__).resolve().parents[3]
20 | _default_settings_path: Path = _root_path / 'deeppavlov' / 'utils' / 'settings'
21 | _settings_path = Path(os.getenv('DP_SETTINGS_PATH', _default_settings_path)).expanduser().resolve()
22 | if _settings_path.is_file():
23 | raise FileExistsError(f'DP_SETTINGS_PATH={_settings_path} is a file and not a directory')
24 |
25 | if _default_settings_path in _settings_path.parents:
26 |     raise RecursionError(f'DP_SETTINGS_PATH={_settings_path} is located inside'
27 |                          f' the default settings path {_default_settings_path}')
28 |
29 |
30 | def get_settings_path() -> Path:
31 | """Return an absolute path to the DeepPavlov settings directory"""
32 | populate_settings_dir()
33 | return _settings_path
34 |
35 |
36 | def populate_settings_dir(force: bool = False) -> bool:
37 | """
38 | Populate settings directory with default settings files
39 |
40 | Args:
41 | force: if ``True``, replace existing settings files with default ones
42 |
43 | Returns:
44 | ``True`` if any files were copied and ``False`` otherwise
45 | """
46 | res = False
47 | if _default_settings_path == _settings_path:
48 | return res
49 |
50 | for src in list(_default_settings_path.glob('**/*.json')):
51 | dest = _settings_path / src.relative_to(_default_settings_path)
52 | if not force and dest.exists():
53 | continue
54 | res = True
55 | dest.parent.mkdir(parents=True, exist_ok=True)
56 | shutil.copy(src, dest)
57 | return res
58 |
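Note: since _settings_path is resolved once at import time, DP_SETTINGS_PATH has to be set before deeppavlov is first imported. A sketch (the directory is illustrative):

    import os
    os.environ['DP_SETTINGS_PATH'] = '/data/dp_settings'  # before any deeppavlov import

    from deeppavlov.core.common.paths import get_settings_path, populate_settings_dir

    print(get_settings_path())         # /data/dp_settings, seeded with the default *.json
    populate_settings_dir(force=True)  # overwrite customized files with the defaults again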
--------------------------------------------------------------------------------
/deeppavlov/core/common/prints.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import sys
16 | from contextlib import redirect_stdout
17 |
18 |
19 | class RedirectedPrints(redirect_stdout):
20 | """Context manager for temporarily redirecting stdout to another stream """
21 |
22 | def __init__(self, new_target=sys.stderr):
23 | super().__init__(new_target=new_target)
24 |
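Note: RedirectedPrints is a thin wrapper over redirect_stdout that merely defaults the target to sys.stderr, handy when stdout must stay machine-readable. A sketch:

    from deeppavlov.core.common.prints import RedirectedPrints

    with RedirectedPrints():            # defaults to sys.stderr
        print('diagnostic message')     # kept off stdout
    print('payload')                    # reaches stdout as usual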
--------------------------------------------------------------------------------
/deeppavlov/core/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/data/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/core/data/dataset_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List, Dict, Tuple, Any
16 |
17 |
18 | class DatasetReader:
19 | """An abstract class for reading data from some location and construction of a dataset."""
20 |
21 | def read(self, data_path: str, *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
22 | """Reads a file from a path and returns data as a list of tuples of inputs and correct outputs
23 | for every data type in ``train``, ``valid`` and ``test``.
24 | """
25 | raise NotImplementedError
26 |
--------------------------------------------------------------------------------
/deeppavlov/core/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/core/models/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/core/models/component.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABCMeta, abstractmethod
16 |
17 | from logging import getLogger
18 |
19 | log = getLogger(__name__)
20 |
21 |
22 | class Component(metaclass=ABCMeta):
23 | """Abstract class for all callables that could be used in Chainer's pipe."""
24 |
25 | @abstractmethod
26 | def __call__(self, *args, **kwargs):
27 | pass
28 |
29 | def reset(self):
30 | pass
31 |
32 | def destroy(self):
33 | attr_list = list(self.__dict__.keys())
34 | for attr_name in attr_list:
35 | attr = getattr(self, attr_name)
36 | if hasattr(attr, 'destroy'):
37 | attr.destroy()
38 | delattr(self, attr_name)
39 |
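Note: every callable in a chainer pipe subclasses Component and registers under a string name that configs reference through "class_name". A minimal custom component sketch (the registered name is illustrative):

    from deeppavlov.core.common.registry import register
    from deeppavlov.core.models.component import Component

    @register('batch_upper')
    class BatchUpper(Component):
        """Uppercases every string in a batch."""

        def __call__(self, batch):
            return [s.upper() for s in batch]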
--------------------------------------------------------------------------------
/deeppavlov/core/models/estimator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod
16 |
17 | from .component import Component
18 | from .serializable import Serializable
19 |
20 |
21 | class Estimator(Component, Serializable):
22 | """Abstract class for components that could be fitted on the data as a whole."""
23 |
24 | @abstractmethod
25 | def fit(self, *args, **kwargs):
26 | pass
27 |
--------------------------------------------------------------------------------
/deeppavlov/core/models/nn_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import abstractmethod
16 |
17 | from .component import Component
18 | from .serializable import Serializable
19 |
20 |
21 | class NNModel(Component, Serializable):
22 | """Abstract class for deep learning components."""
23 |
24 | @abstractmethod
25 | def train_on_batch(self, x: list, y: list):
26 | pass
27 |
28 | def process_event(self, event_name, data):
29 | pass
30 |
--------------------------------------------------------------------------------
/deeppavlov/core/models/serializable.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from abc import ABCMeta, abstractmethod
16 | from logging import getLogger
17 | from pathlib import Path
18 | from typing import Union, Optional
19 |
20 | from deeppavlov.core.commands.utils import expand_path
21 |
22 | log = getLogger(__name__)
23 |
24 |
25 | class Serializable(metaclass=ABCMeta):
26 | """Abstract base class that expresses the interface for all models that can serialize data to a path."""
27 |
28 | def __init__(self, save_path: Optional[Union[str, Path]], load_path: Optional[Union[str, Path]] = None,
29 | mode: str = 'infer',
30 | *args, **kwargs) -> None:
31 |
32 | if save_path:
33 | self.save_path = expand_path(save_path)
34 | self.save_path.parent.mkdir(parents=True, exist_ok=True)
35 | else:
36 | self.save_path = None
37 |
38 | if load_path:
39 | self.load_path = expand_path(load_path)
40 | if mode != 'train' and self.save_path and self.load_path != self.save_path:
41 | log.warning("Load path '{}' differs from save path '{}' in '{}' mode for {}."
42 | .format(self.load_path, self.save_path, mode, self.__class__.__name__))
43 | elif mode != 'train' and self.save_path:
44 | self.load_path = self.save_path
45 | log.warning("No load path is set for {} in '{}' mode. Using save path instead"
46 | .format(self.__class__.__name__, mode))
47 | else:
48 | self.load_path = None
49 | log.warning("No load path is set for {}!".format(self.__class__.__name__))
50 |
51 | @abstractmethod
52 | def save(self, *args, **kwargs):
53 | pass
54 |
55 | @abstractmethod
56 | def load(self, *args, **kwargs):
57 | pass
58 |
--------------------------------------------------------------------------------
/deeppavlov/core/trainers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from .fit_trainer import FitTrainer
16 | from .nn_trainer import NNTrainer
17 | from .torch_trainer import TorchTrainer
18 |
--------------------------------------------------------------------------------
/deeppavlov/core/trainers/torch_trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 | from typing import Tuple, Optional, Iterable, Collection, Any
17 |
18 | from deeppavlov.core.trainers.utils import Metric
19 | from deeppavlov.core.common.registry import register
20 | from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
21 | from deeppavlov.core.trainers.nn_trainer import NNTrainer
22 |
23 | log = getLogger(__name__)
24 |
25 |
26 | @register('torch_trainer')
27 | class TorchTrainer(NNTrainer):
28 |
29 | def test(self, data: Iterable[Tuple[Collection[Any], Collection[Any]]],
30 | metrics: Optional[Collection[Metric]] = None, *,
31 | start_time: Optional[float] = None, show_examples: Optional[bool] = None) -> dict:
32 | self._chainer.get_main_component().model.eval()
33 |
34 | report = super(TorchTrainer, self).test(data=data, metrics=metrics, start_time=start_time,
35 | show_examples=show_examples)
36 | self._chainer.get_main_component().model.train()
37 | return report
38 |
39 | def train_on_batches(self, iterator: DataLearningIterator) -> None:
40 | self._chainer.get_main_component().model.train()
41 | super(TorchTrainer, self).train_on_batches(iterator=iterator)
42 | self._chainer.get_main_component().model.eval()
43 |
--------------------------------------------------------------------------------
/deeppavlov/dataset_iterators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/dataset_iterators/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/dataset_iterators/siamese_iterator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 | from typing import Dict, List, Tuple
17 |
18 | from deeppavlov.core.common.registry import register
19 | from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
20 |
21 | log = getLogger(__name__)
22 |
23 |
24 | @register('siamese_iterator')
25 | class SiameseIterator(DataLearningIterator):
26 | """The class contains methods for iterating over a dataset for ranking in training, validation and test mode."""
27 |
28 | def split(self, *args, len_valid=1000, len_test=1000, **kwargs) -> None:
29 | if len(self.valid) == 0 and len_valid != 0:
30 | self.random.shuffle(self.train)
31 | self.valid = self.train[-len_valid:]
32 | self.train = self.train[:-len_valid]
33 | if len(self.test) == 0 and len_test != 0:
34 | self.random.shuffle(self.train)
35 | self.test = self.train[-len_test:]
36 | self.train = self.train[:-len_test]
37 |
--------------------------------------------------------------------------------
/deeppavlov/dataset_iterators/typos_iterator.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from deeppavlov.core.common.registry import register
16 | from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
17 |
18 |
19 | @register('typos_iterator')
20 | class TyposDatasetIterator(DataLearningIterator):
21 | """Implementation of :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator` used for training
22 | :class:`~deeppavlov.models.spelling_correction.brillmoore.ErrorModel`
23 |
24 | """
25 |
26 | def split(self, test_ratio: float = 0., *args, **kwargs):
27 | """Split all data into train and test
28 |
29 | Args:
30 |             test_ratio: fraction of the data to hold out as the test set, from 0. to 1.
31 | """
32 | self.train += self.valid + self.test
33 |
34 | split = int(len(self.train) * test_ratio)
35 |
36 | self.random.shuffle(self.train)
37 |
38 | self.test = self.train[:split]
39 | self.train = self.train[split:]
40 | self.valid = []
41 |
--------------------------------------------------------------------------------
/deeppavlov/dataset_readers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/dataset_readers/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/dataset_readers/faq_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Dict
16 |
17 | from pandas import read_csv
18 |
19 | from deeppavlov.core.common.registry import register
20 | from deeppavlov.core.data.dataset_reader import DatasetReader
21 |
22 |
23 | @register('faq_reader')
24 | class FaqDatasetReader(DatasetReader):
25 | """Reader for FAQ dataset"""
26 |
27 | def read(self, data_path: str = None, data_url: str = None, x_col_name: str = 'x', y_col_name: str = 'y') -> Dict:
28 | """
29 | Read FAQ dataset from specified csv file or remote url
30 |
31 | Parameters:
32 | data_path: path to csv file of FAQ
33 | data_url: url to csv file of FAQ
34 | x_col_name: name of Question column in csv file
35 | y_col_name: name of Answer column in csv file
36 |
37 | Returns:
38 | A dictionary containing training, validation and test parts of the dataset obtainable via
39 | ``train``, ``valid`` and ``test`` keys.
40 | """
41 |
42 | if data_url is not None:
43 | data = read_csv(data_url)
44 | elif data_path is not None:
45 | data = read_csv(data_path)
46 | else:
47 | raise ValueError("Please specify data_path or data_url parameter")
48 |
49 | x = data[x_col_name]
50 | y = data[y_col_name]
51 |
52 | train_xy_tuples = [(x[i].strip(), y[i].strip()) for i in range(len(x))]
53 |
54 | dataset = dict()
55 | dataset["train"] = train_xy_tuples
56 | dataset["valid"] = []
57 | dataset["test"] = []
58 |
59 | return dataset
60 |
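Note: read() puts every row into "train" and leaves "valid" and "test" empty, which matches the fit-only FAQ configs. A usage sketch (the file and column names are illustrative):

    from deeppavlov.dataset_readers.faq_reader import FaqDatasetReader

    data = FaqDatasetReader().read(data_path='faq.csv',
                                   x_col_name='Question', y_col_name='Answer')
    print(len(data['train']), data['valid'], data['test'])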
--------------------------------------------------------------------------------
/deeppavlov/dataset_readers/line_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Dict
16 |
17 | from deeppavlov.core.common.registry import register
18 | from deeppavlov.core.data.dataset_reader import DatasetReader
19 |
20 |
21 | @register('line_reader')
22 | class LineReader(DatasetReader):
23 | """Read txt file by lines"""
24 |
25 | def read(self, data_path: str = None, *args, **kwargs) -> Dict:
26 | """Read lines from txt file
27 |
28 | Args:
29 | data_path: path to txt file
30 |
31 | Returns:
32 | A dictionary containing training, validation and test parts of the dataset obtainable via ``train``, ``valid`` and ``test`` keys.
33 | """
34 |
35 | with open(data_path) as f:
36 | content = f.readlines()
37 |
38 | dataset = dict()
39 | dataset["train"] = [(line,) for line in content]
40 | dataset["valid"] = []
41 | dataset["test"] = []
42 |
43 | return dataset
44 |
--------------------------------------------------------------------------------
/deeppavlov/dataset_readers/multitask_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import copy
16 | from logging import getLogger
17 | from typing import Dict
18 |
19 | from deeppavlov.core.common.registry import get_model, register
20 | from deeppavlov.core.data.dataset_reader import DatasetReader
21 |
22 | log = getLogger(__name__)
23 |
24 |
25 | @register('multitask_reader')
26 | class MultiTaskReader(DatasetReader):
27 | """Class to read several datasets simultaneously."""
28 |
29 | def read(self, tasks: Dict[str, Dict[str, dict]], task_defaults: dict = None, **kwargs):
30 | """Creates dataset readers for tasks and returns what task dataset readers `read()` methods return.
31 |
32 | Args:
33 | tasks: dictionary which keys are task names and values are dictionaries with param name - value pairs for
34 | nested dataset readers initialization. If task has key-value pair ``'use_task_defaults': False``,
35 | task_defaults for this task dataset reader will be ignored.
36 | task_defaults: default task parameters.
37 |
38 | Returns:
39 | dictionary which keys are task names and values are what task readers `read()` methods returned.
40 | """
41 | data = dict()
42 | if task_defaults is None:
43 | task_defaults = dict()
44 | for task_name, task_params in tasks.items():
45 | if task_params.pop('use_task_defaults', True) is True:
46 | task_config = copy.deepcopy(task_defaults)
47 | task_config.update(task_params)
48 | else:
49 | task_config = task_params
50 | reader = get_model(task_config.pop('class_name'))()
51 | data[task_name] = reader.read(**task_config)
52 | return data
53 |
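Note: task_defaults is deep-copied into each task's parameters unless the task opts out with 'use_task_defaults': False; after merging, each task must end up with a 'class_name' for get_model to resolve. A sketch reusing readers defined in this repository (the file paths are illustrative):

    from deeppavlov.dataset_readers.multitask_reader import MultiTaskReader

    data = MultiTaskReader().read(
        task_defaults={'class_name': 'line_reader'},
        tasks={
            'lines_a': {'data_path': 'a.txt'},
            'lines_b': {'data_path': 'b.txt'},
            'faq': {'use_task_defaults': False, 'class_name': 'faq_reader',
                    'data_path': 'faq.csv'},
        },
    )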
--------------------------------------------------------------------------------
/deeppavlov/dataset_readers/paraphraser_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import xml.etree.ElementTree as ET
16 | from pathlib import Path
17 | from typing import Dict, List, Tuple
18 |
19 | from deeppavlov.core.commands.utils import expand_path
20 | from deeppavlov.core.common.registry import register
21 | from deeppavlov.core.data.dataset_reader import DatasetReader
22 |
23 |
24 | @register('paraphraser_reader')
25 | class ParaphraserReader(DatasetReader):
26 | """The class to read the paraphraser.ru dataset from files.
27 |
28 | Please, see https://paraphraser.ru.
29 | """
30 |
31 | def read(self,
32 | data_path: str,
33 | do_lower_case: bool = True,
34 | *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
35 | """Read the paraphraser.ru dataset from files.
36 |
37 | Args:
38 | data_path: A path to a folder with dataset files.
39 |             do_lower_case: whether to lowercase all texts
40 | """
41 |
42 | data_path = expand_path(data_path)
43 | train_fname = data_path / 'paraphrases.xml'
44 | test_fname = data_path / 'paraphrases_gold.xml'
45 |
46 | train_data = self._build_data(train_fname, do_lower_case)
47 | test_data = self._build_data(test_fname, do_lower_case)
48 | return {"train": train_data, "valid": [], "test": test_data}
49 |
50 | @staticmethod
51 | def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:
52 | root = ET.fromstring(data_path.read_text(encoding='utf8'))
53 | data = {}
54 | for paraphrase in root.findall('corpus/paraphrase'):
55 | key = (paraphrase.find('value[@name="text_1"]').text,
56 | paraphrase.find('value[@name="text_2"]').text)
57 | if do_lower_case:
58 | key = tuple([t.lower() for t in key])
59 |
60 | data[key] = 1 if int(paraphrase.find('value[@name="class"]').text) >= 0 else 0
61 | return list(data.items())
62 |
--------------------------------------------------------------------------------
/deeppavlov/dataset_readers/rel_ranking_reader.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import xml.etree.ElementTree as ET
16 | from pathlib import Path
17 | from typing import Dict, List, Tuple
18 |
19 | from deeppavlov.core.commands.utils import expand_path
20 | from deeppavlov.core.common.registry import register
21 | from deeppavlov.core.data.dataset_reader import DatasetReader
22 |
23 |
24 | @register('rel_ranking_reader')
25 | class RelRankingReader(DatasetReader):
26 |     """The class to read relation ranking data stored in the paraphraser.ru XML format.
27 | 
28 |     Please, see https://paraphraser.ru.
29 |     """
30 |
31 | def read(self,
32 | data_path: str,
33 | do_lower_case: bool = True,
34 | *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
35 | """Read the paraphraser.ru dataset from files.
36 |
37 | Args:
38 | data_path: A path to a folder with dataset files.
39 | do_lower_case: Do you want to lowercase all texts
40 | """
41 |
42 | data_path = expand_path(data_path)
43 | train_fname = data_path / 'paraphrases.xml'
44 | test_fname = data_path / 'paraphrases_gold.xml'
45 |
46 | train_data = self._build_data(train_fname, do_lower_case)
47 | test_data = self._build_data(test_fname, do_lower_case)
48 | return {"train": train_data, "valid": [], "test": test_data}
49 |
50 | @staticmethod
51 | def _build_data(data_path: Path, do_lower_case: bool) -> List[Tuple[Tuple[str, str], int]]:
52 | root = ET.fromstring(data_path.read_text(encoding='utf8'))
53 | data = []
54 | for paraphrase in root.findall('corpus/paraphrase'):
55 | key = (paraphrase.find('value[@name="text_1"]').text,
56 | paraphrase.find('value[@name="text_2"]').text)
57 | if do_lower_case:
58 | key = tuple([t.lower() for t in key])
59 |
60 | pos_or_neg = int(paraphrase.find('value[@name="class"]').text)
61 | data.append((key, pos_or_neg))
62 | return data
63 |
--------------------------------------------------------------------------------
/deeppavlov/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/metrics/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/metrics/correlation.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from scipy.stats import pearsonr, spearmanr
16 | from sklearn.metrics import matthews_corrcoef
17 |
18 | from deeppavlov.core.common.metrics_registry import register_metric
19 |
20 |
21 | @register_metric('pearson_correlation')
22 | def pearson_correlation(y_true, y_predicted) -> float:
23 | return pearsonr(y_predicted, y_true)[0]
24 |
25 |
26 | @register_metric('spearman_correlation')
27 | def spearman_correlation(y_true, y_predicted) -> float:
28 | return spearmanr(y_predicted, y_true)[0]
29 |
30 |
31 | @register_metric('matthews_correlation')
32 | def matthews_correlation(y_true, y_predicted) -> float:
33 | return matthews_corrcoef(y_true, y_predicted)
34 |
--------------------------------------------------------------------------------
/deeppavlov/metrics/elmo_metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List
16 |
17 | import numpy as np
18 |
19 | from deeppavlov.core.common.metrics_registry import register_metric
20 |
21 |
22 | @register_metric('elmo_loss2ppl')
23 | def elmo_loss2ppl(losses: List[np.ndarray]) -> float:
24 |     """ Calculates perplexity from model losses
25 |
26 | Args:
27 | losses: list of numpy arrays of model losses
28 |
29 | Returns:
30 | perplexity : float
31 | """
32 | avg_loss = np.mean(losses)
33 | return float(np.exp(avg_loss))
34 |
--------------------------------------------------------------------------------
/deeppavlov/metrics/log_loss.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from typing import List, Union
17 |
18 | import numpy as np
19 | from sklearn.metrics import log_loss
20 |
21 | from deeppavlov.core.common.metrics_registry import register_metric
22 |
23 |
24 | @register_metric('log_loss')
25 | def sk_log_loss(y_true: Union[List[List[float]], List[List[int]], np.ndarray],
26 | y_predicted: Union[List[List[float]], List[List[int]], np.ndarray]) -> float:
27 | """
28 | Calculates log loss.
29 |
30 | Args:
31 | y_true: list or array of true values
32 | y_predicted: list or array of predicted values
33 |
34 | Returns:
35 | Log loss
36 |
37 | Alias:
38 | log_loss
39 | """
40 | return log_loss(y_true, y_predicted)
41 |
--------------------------------------------------------------------------------
/deeppavlov/metrics/mse.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 | from sklearn.metrics import mean_squared_error
17 | from typing import Union
18 |
19 | from deeppavlov.core.common.metrics_registry import register_metric
20 |
21 |
22 | @register_metric('mean_squared_error')
23 | def mse(y_true: Union[np.ndarray, list],
24 |         y_predicted: Union[np.ndarray, list],
25 | *args,
26 | **kwargs) -> float:
27 | """
28 | Calculates mean squared error.
29 | Args:
30 | y_true: list of true values
31 | y_predicted: list of predicted values
32 | Returns:
33 | float: Mean squared error
34 | """
35 | for value in [y_true, y_predicted]:
36 | assert (np.isfinite(value).all())
37 | return mean_squared_error(y_true, y_predicted, *args, **kwargs)
38 |
--------------------------------------------------------------------------------
/deeppavlov/metrics/recall_at_k.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from typing import List
17 |
18 | import numpy as np
19 |
20 | from deeppavlov.core.common.metrics_registry import register_metric
21 |
22 |
23 | def recall_at_k(y_true: List[int], y_pred: List[List[np.ndarray]], k: int):
24 | """
25 | Calculates recall at k ranking metric.
26 |
27 | Args:
28 | y_true: Labels. Not used in the calculation of the metric.
29 |         y_pred: Predictions.
30 |             Each prediction contains the ranking scores of all candidates for a particular data sample.
31 |             It is assumed that the ranking score for the true candidate comes first in the prediction.
32 |
33 | Returns:
34 | Recall at k
35 | """
36 | num_examples = float(len(y_pred))
37 | predictions = np.array(y_pred)
38 | predictions = np.flip(np.argsort(predictions, -1), -1)[:, :k]
39 | num_correct = 0
40 | for el in predictions:
41 | if 0 in el:
42 | num_correct += 1
43 | return float(num_correct) / num_examples
44 |
45 |
46 | @register_metric('r@1')
47 | def r_at_1(y_true, y_pred):
48 | return recall_at_k(y_true, y_pred, k=1)
49 |
50 |
51 | @register_metric('r@2')
52 | def r_at_2(y_true, y_pred):
53 | return recall_at_k(y_true, y_pred, k=2)
54 |
55 |
56 | @register_metric('r@5')
57 | def r_at_5(labels, predictions):
58 | return recall_at_k(labels, predictions, k=5)
59 |
60 |
61 | @register_metric('r@10')
62 | def r_at_10(labels, predictions):
63 | return recall_at_k(labels, predictions, k=10)
64 |
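A toy check (not part of the file) of the convention documented above, that index 0 holds the true candidate's score; only the first sample ranks it on top:

from deeppavlov.metrics.recall_at_k import r_at_1, r_at_2

y_pred = [
    [0.9, 0.3, 0.1],  # true candidate (index 0) ranked first
    [0.5, 0.8, 0.2],  # true candidate ranked second
]
print(r_at_1(None, y_pred))  # 0.5
print(r_at_2(None, y_pred))  # 1.0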
--------------------------------------------------------------------------------
/deeppavlov/metrics/roc_auc_score.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | from typing import List, Union
17 |
18 | import numpy as np
19 | import sklearn.metrics
20 |
21 | from deeppavlov.core.common.metrics_registry import register_metric
22 |
23 |
24 | @register_metric('roc_auc')
25 | def roc_auc_score(y_true: Union[List[List[float]], List[List[int]], np.ndarray],
26 | y_pred: Union[List[List[float]], List[List[int]], np.ndarray]) -> float:
27 | """
28 | Compute Area Under the Curve (AUC) from prediction scores.
29 |
30 | Args:
31 | y_true: true binary labels
32 |         y_pred: target scores; can be probability estimates of the positive class, confidence values, or non-thresholded decision values
33 |
34 | Returns:
35 | Area Under the Curve (AUC) from prediction scores
36 |
37 | Alias:
38 | roc_auc
39 | """
40 | try:
41 | return sklearn.metrics.roc_auc_score(np.squeeze(np.array(y_true)),
42 | np.squeeze(np.array(y_pred)), average="macro")
43 | except ValueError:
44 | return 0.
45 |
--------------------------------------------------------------------------------
/deeppavlov/models/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 | import nltk
18 |
19 | from deeppavlov.core.common.prints import RedirectedPrints
20 |
21 | if not os.environ.get('DP_SKIP_NLTK_DOWNLOAD'):
22 | with RedirectedPrints():
23 | nltk.download('punkt', quiet=True)
24 | nltk.download('stopwords', quiet=True)
25 | nltk.download('perluniprops', quiet=True)
26 | nltk.download('nonbreaking_prefixes', quiet=True)
27 |
--------------------------------------------------------------------------------
/deeppavlov/models/api_requester/__init__.py:
--------------------------------------------------------------------------------
1 | from .api_requester import *
2 |
--------------------------------------------------------------------------------
/deeppavlov/models/api_requester/api_router.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import concurrent
16 | from concurrent.futures import ProcessPoolExecutor
17 | from logging import getLogger
18 | from typing import List
19 |
20 | from deeppavlov.core.common.registry import register
21 | from deeppavlov.core.models.component import Component
22 | from deeppavlov.models.api_requester import ApiRequester
23 |
24 | logger = getLogger(__name__)
25 |
26 |
27 | @register("api_router")
28 | class ApiRouter(Component):
29 | """A helper class for running multiple API requesters on the same data in parallel
30 |
31 | Args:
32 | api_requesters: list of ApiRequester objects
33 | n_workers: The maximum number of subprocesses to run
34 |
35 | Attributes:
36 | api_requesters: list of ApiRequester objects
37 | n_workers: The maximum number of subprocesses to run
38 | """
39 |
40 | def __init__(self, api_requesters: List[ApiRequester], n_workers: int = 1, *args, **kwargs):
41 | self.api_requesters = api_requesters
42 | self.n_workers = n_workers
43 |
44 | def __call__(self, *args):
45 | """
46 |
47 | Args:
48 | *args: list of arguments to forward to the API requesters
49 |
50 | Returns:
51 | results of the requests
52 | """
53 | with ProcessPoolExecutor(self.n_workers) as executor:
54 | futures = [executor.submit(api_requester, *args) for api_requester
55 | in
56 | self.api_requesters]
57 |
58 | concurrent.futures.wait(futures)
59 | results = []
60 | for future, api_requester in zip(futures, self.api_requesters):
61 | result = future.result()
62 | if api_requester.out_count > 1:
63 | results += result
64 | else:
65 | results.append(result)
66 |
67 | return results
68 |
--------------------------------------------------------------------------------
/deeppavlov/models/classifiers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/classifiers/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/doc_retrieval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/doc_retrieval/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/doc_retrieval/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Any, List
16 |
17 | import nltk
18 |
19 | from deeppavlov.core.common.registry import register
20 |
21 |
22 | @register('concat_lists')
23 | def concat_lists(list_a: List[List[Any]], list_b: List[List[Any]]):
24 | list_u = []
25 | for element_a, element_b in zip(list_a, list_b):
26 | list_u.append(element_a + element_b)
27 | return list_u
28 |
29 |
30 | def find_answer_sentence(answer_pos: int, context: str) -> str:
31 | answer_sentence = ""
32 | context_sentences = nltk.sent_tokenize(context)
33 | start = 0
34 | context_sentences_offsets = []
35 | for sentence in context_sentences:
36 | end = start + len(sentence)
37 | context_sentences_offsets.append((start, end))
38 | start = end + 1
39 |
40 | for sentence, (start_offset, end_offset) in zip(context_sentences, context_sentences_offsets):
41 | if start_offset < answer_pos < end_offset:
42 | answer_sentence = sentence
43 | break
44 |
45 | return answer_sentence
46 |
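A small usage sketch (invented context; relies on the NLTK 'punkt' data that the package downloads on import) locating the sentence that contains an answer's character offset:

from deeppavlov.models.doc_retrieval.utils import find_answer_sentence

context = "Paris is the capital of France. It lies on the Seine."
answer_pos = context.find("Seine")
print(find_answer_sentence(answer_pos, context))  # It lies on the Seine.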
--------------------------------------------------------------------------------
/deeppavlov/models/embedders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/embedders/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/embedders/fasttext_embedder.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 | from typing import Iterator
17 |
18 | import fasttext
19 |
20 | import numpy as np
21 |
22 | from deeppavlov.core.common.registry import register
23 | from deeppavlov.models.embedders.abstract_embedder import Embedder
24 |
25 | log = getLogger(__name__)
26 |
27 |
28 | @register('fasttext')
29 | class FasttextEmbedder(Embedder):
30 | """
31 | Class implements fastText embedding model
32 |
33 | Args:
34 | load_path: path where to load pre-trained embedding model from
35 | pad_zero: whether to pad samples or not
36 |
37 | Attributes:
38 | model: fastText model instance
39 | tok2emb: dictionary with already embedded tokens
40 | dim: dimension of embeddings
41 | pad_zero: whether to pad sequence of tokens with zeros or not
42 | load_path: path with pre-trained fastText binary model
43 | """
44 |
45 | def _get_word_vector(self, w: str) -> np.ndarray:
46 | return self.model.get_word_vector(w)
47 |
48 | def load(self) -> None:
49 | """
50 | Load fastText binary model from self.load_path
51 | """
52 | log.debug(f"[loading fastText embeddings from `{self.load_path}`]")
53 | self.model = fasttext.load_model(str(self.load_path))
54 | self.dim = self.model.get_dimension()
55 |
56 | def __iter__(self) -> Iterator[str]:
57 | """
58 | Iterate over all words from fastText model vocabulary
59 |
60 | Returns:
61 | iterator
62 | """
63 | yield from self.model.get_words()
64 |
--------------------------------------------------------------------------------
/deeppavlov/models/entity_extraction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/entity_extraction/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/kbqa/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/kbqa/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/morpho_syntax_parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/morpho_syntax_parser/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/morpho_syntax_parser/dependency_decoding.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List
16 |
17 | import numpy as np
18 | from ufal.chu_liu_edmonds import chu_liu_edmonds
19 |
20 | from deeppavlov.core.common.registry import register
21 | from deeppavlov.core.models.component import Component
22 |
23 |
24 | @register('chu_liu_edmonds_transformer')
25 | class ChuLiuEdmonds(Component):
26 | """
27 | A wrapper for Chu-Liu-Edmonds algorithm for maximum spanning tree
28 | """
29 |
30 | def __init__(self, min_edge_prob=1e-6, **kwargs):
31 | self.min_edge_prob = min_edge_prob
32 |
33 | def __call__(self, probs: List[np.ndarray]) -> List[List[int]]:
34 | """Applies Chu-Liu-Edmonds algorithm to the matrix of head probabilities.
35 | probs: a 3D-array of probabilities of shape B*L*(L+1)
36 | """
37 | answer = []
38 | for elem in probs:
39 | m, n = elem.shape
40 | if n == m + 1:
41 | elem = np.log10(np.maximum(self.min_edge_prob, elem)) - np.log10(self.min_edge_prob)
42 | elem = np.concatenate([np.zeros_like(elem[:1, :]), elem], axis=0)
43 |                 # this makes it impossible to create multiple edges 0->i
44 | elem[1:, 0] += np.log10(self.min_edge_prob) * len(elem)
45 | heads, _ = chu_liu_edmonds(elem.astype("float64"))
46 | answer.append(heads[1:])
47 | else:
48 | raise ValueError("First and second axis lengths m, n of probs should satisfy the condition n == m + 1")
49 | return answer
50 |
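A toy run (not from the repository) on a single two-token "sentence"; probs[i][j] is the probability that token i+1 attaches to head j, with column 0 reserved for the artificial root. Requires the ufal.chu-liu-edmonds package:

import numpy as np

from deeppavlov.models.morpho_syntax_parser.dependency_decoding import ChuLiuEdmonds

decoder = ChuLiuEdmonds()
probs = np.array([[0.9, 0.05, 0.05],   # token 1: the root is the most likely head
                  [0.1, 0.8, 0.1]])    # token 2: token 1 is the most likely head
print(decoder([probs]))  # expected heads: [[0, 1]] (0 = root)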
--------------------------------------------------------------------------------
/deeppavlov/models/morpho_syntax_parser/spacy_lemmatizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2019 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List
16 |
17 | import spacy
18 |
19 | from deeppavlov.core.common.registry import register
20 | from deeppavlov.core.models.component import Component
21 |
22 |
23 | @register('spacy_lemmatizer')
24 | class SpacyLemmatizer(Component):
25 | def __init__(self, model: str, **kwargs):
26 | self.nlp = spacy.load(model)
27 |
28 | def __call__(self, words_batch: List[List[str]]):
29 | return [[self.nlp(word)[0].lemma_ for word in words_list] for words_list in words_batch]
30 |
--------------------------------------------------------------------------------
/deeppavlov/models/preprocessors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/preprocessors/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/preprocessors/dnnc_preprocessor.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 | from typing import List, Tuple
17 |
18 | import numpy as np
19 |
20 | from deeppavlov.core.common.registry import register
21 | from deeppavlov.core.models.component import Component
22 |
23 | log = getLogger(__name__)
24 |
25 |
26 | @register('dnnc_pair_generator')
27 | class PairGenerator(Component):
28 | """
29 |     Generates all possible ordered (premise, hypothesis) pairs from 'texts' and 'dataset'
30 |
31 | Args:
32 | bidirectional: adds pairs in reverse order
33 | """
34 |
35 | def __init__(self, bidirectional: bool = False, **kwargs) -> None:
36 | self.bidirectional = bidirectional
37 |
38 | def __call__(self,
39 | texts: List[str],
40 | dataset: List[List[str]],
41 | ) -> Tuple[List[str], List[str], List[str], List[str]]:
42 | hypotesis_batch = []
43 | premise_batch = []
44 | hypotesis_labels_batch = []
45 | for [premise, [hypotesis, hypotesis_labels]] in zip(texts * len(dataset),
46 | np.repeat(dataset, len(texts), axis=0)):
47 | premise_batch.append(premise)
48 | hypotesis_batch.append(hypotesis)
49 | hypotesis_labels_batch.append(hypotesis_labels)
50 |
51 | if self.bidirectional:
52 | premise_batch.append(hypotesis)
53 | hypotesis_batch.append(premise)
54 | hypotesis_labels_batch.append(hypotesis_labels)
55 | return texts, hypotesis_batch, premise_batch, hypotesis_labels_batch
56 |
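A worked example (invented intents): every input text is paired with every (example, label) entry of the support dataset:

from deeppavlov.models.preprocessors.dnnc_preprocessor import PairGenerator

gen = PairGenerator()
texts = ["how do I reset my password"]
dataset = [["forgot my password", "password_reset"],
           ["play some jazz", "play_music"]]
texts_out, hypotheses, premises, labels = gen(texts, dataset)
print(premises)    # ['how do I reset my password', 'how do I reset my password']
print(hypotheses)  # ['forgot my password', 'play some jazz']
print(labels)      # ['password_reset', 'play_music']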
--------------------------------------------------------------------------------
/deeppavlov/models/preprocessors/mask.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 |
17 | from deeppavlov.core.common.registry import register
18 | from deeppavlov.core.models.component import Component
19 |
20 |
21 | @register('mask')
22 | class Mask(Component):
23 | """Takes a batch of tokens and returns the masks of corresponding length"""
24 | def __init__(self, *args, **kwargs):
25 | pass
26 |
27 | @staticmethod
28 | def __call__(tokens_batch, **kwargs):
29 | batch_size = len(tokens_batch)
30 | max_len = max(len(utt) for utt in tokens_batch)
31 | mask = np.zeros([batch_size, max_len], dtype=np.float32)
32 | for n, utterance in enumerate(tokens_batch):
33 | mask[n, :len(utterance)] = 1
34 |
35 | return mask
36 |
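A quick illustration (invented batch): shorter utterances get right-padded with zeros:

from deeppavlov.models.preprocessors.mask import Mask

mask = Mask()
print(mask([["hello", "world"], ["hi"]]))
# [[1. 1.]
#  [1. 0.]]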
--------------------------------------------------------------------------------
/deeppavlov/models/preprocessors/sentseg_preprocessor.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from deeppavlov.core.common.registry import register
4 |
5 |
6 | @register("sentseg_restore_sent")
7 | def SentSegRestoreSent(batch_words: List[List[str]], batch_tags: List[List[str]]) -> List[str]:
8 | ret = []
9 | for words, tags in zip(batch_words, batch_tags):
10 | if len(tags) == 0:
11 | ret.append("")
12 | continue
13 | sent = words[0]
14 | punct = "" if tags[0] == "O" else tags[0][-1]
15 | for word, tag in zip(words[1:], tags[1:]):
16 | if tag != "O":
17 | sent += punct
18 | punct = tag[-1]
19 | sent += " " + word
20 | sent += punct
21 | ret.append(sent)
22 |
23 | return ret
24 |
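Judging from the loop above, a non-"O" tag marks a word that begins a sentence, and the tag's last character is the punctuation that will eventually close that sentence. A sketch with invented input:

from deeppavlov.models.preprocessors.sentseg_preprocessor import SentSegRestoreSent

words = [["hello", "how", "are", "you", "i", "am", "fine"]]
tags = [["B-?", "O", "O", "O", "B-.", "O", "O"]]
print(SentSegRestoreSent(words, tags))  # ['hello how are you? i am fine.']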
--------------------------------------------------------------------------------
/deeppavlov/models/preprocessors/str_lower.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Union
16 |
17 | from deeppavlov.core.common.registry import register
18 |
19 |
20 | @register('str_lower')
21 | def str_lower(batch: Union[str, list, tuple]):
22 | """Recursively search for strings in a list and convert them to lowercase
23 |
24 | Args:
25 | batch: a string or a list containing strings at some level of nesting
26 |
27 | Returns:
28 | the same structure where all strings are converted to lowercase
29 | """
30 | if isinstance(batch, str):
31 | return batch.lower()
32 | else:
33 | return list(map(str_lower, batch))
34 |
--------------------------------------------------------------------------------
/deeppavlov/models/ranking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/ranking/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/ranking/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import numpy as np
16 |
17 | from deeppavlov.core.common.metrics_registry import register_metric
18 |
19 |
20 | @register_metric('rank_response')
21 | def rank_response(y_true, y_pred):
22 | num_examples = float(len(y_pred))
23 | predictions = np.array(y_pred)
24 | predictions = np.flip(np.argsort(predictions, -1), -1)
25 | rank_tot = 0
26 | for el in predictions:
27 | for i, x in enumerate(el):
28 | if x == 0:
29 | rank_tot += i
30 | break
31 | return float(rank_tot) / num_examples
32 |
33 |
34 | @register_metric('r@1_insQA')
35 | def r_at_1_insQA(y_true, y_pred):
36 | return recall_at_k_insQA(y_true, y_pred, k=1)
37 |
38 |
39 | def recall_at_k_insQA(y_true, y_pred, k):
40 | labels = np.repeat(np.expand_dims(np.asarray(y_true), axis=1), k, axis=1)
41 | predictions = np.array(y_pred)
42 | predictions = np.flip(np.argsort(predictions, -1), -1)[:, :k]
43 | flags = np.zeros_like(predictions)
44 | for i in range(predictions.shape[0]):
45 | for j in range(predictions.shape[1]):
46 | if predictions[i][j] in np.arange(labels[i][j]):
47 | flags[i][j] = 1.
48 | return np.mean((np.sum(flags, -1) >= 1.).astype(float))
49 |
--------------------------------------------------------------------------------
/deeppavlov/models/relation_extraction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/relation_extraction/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/relation_extraction/losses.py:
--------------------------------------------------------------------------------
1 | """
2 | This code is copied from ATLOP algorithm (https://github.com/wzhouad/ATLOP/blob/main/losses.py)
3 | """
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch import Tensor
9 |
10 |
11 | class ATLoss(nn.Module):
12 | def __init__(self):
13 | super().__init__()
14 |
15 | def forward(self, logits: Tensor, labels: Tensor) -> float:
16 | """
17 | Args:
18 | logits: predicted probabilities (shape: batch size x num classes)
19 | labels: one-hot encoded true labels (shape: batch size x num classes)
20 | """
21 |
22 | # TH label
23 | th_label = torch.zeros_like(labels, dtype=torch.float).to(labels)
24 | th_label[:, 0] = 1.0
25 | labels[:, 0] = 0.0
26 |
27 | p_mask = labels + th_label # = 1 for the gold labels + for 0 (negative) class, 0 otherwise
28 | n_mask = 1 - labels # = 0 for the gold labels, 1 otherwise
29 |
30 | # Rank positive classes to TH
31 |         logit1 = logits - (1 - p_mask) * 1e30  # org logits remain for gold labels + 0 class, others are reduced by 1e30
32 | loss1 = -(F.log_softmax(logit1, dim=-1) * labels).sum(1)
33 |
34 | # Rank TH to negative classes
35 |         logit2 = logits - (1 - n_mask) * 1e30  # org logits remain for not gold and not 0-class, others are reduced by 1e30
36 | loss2 = -(F.log_softmax(logit2, dim=-1) * th_label).sum(1)
37 |
38 | # Sum two parts
39 | loss = loss1 + loss2
40 | loss = loss.mean()
41 | return loss
42 |
43 | def get_label(self, logits: Tensor, num_labels: int = -1, threshold: float = None) -> Tensor:
44 |         """ Calculates the labels """
45 | if threshold:
46 | th_logit = torch.full((len(logits), 1), threshold)
47 | else:
48 | th_logit = logits[:, 0].unsqueeze(1) # vector of predicted probabilities for class 0 (negative class)
49 | output = torch.zeros_like(logits).to(logits)
50 | mask = (logits > th_logit) # for each sample: True, if prob for a class > prob for neg class, False otherwise
51 | if num_labels > 0:
52 |             top_v, _ = torch.topk(logits, num_labels, dim=1)  # num_labels max elements per sample; sorted
53 |             top_v = top_v[:, -1]  # the smallest prob among the top num_labels for each sample
54 |             mask = (logits >= top_v.unsqueeze(1)) & mask  # additionally require logits to be at least the top-k minimum
55 | output[mask] = 1.0
56 | output[:, 0] = (output.sum(1) == 0.).to(logits) # no relation if no label matched
57 | return output
58 |
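A toy decoding example for the adaptive threshold: a class is predicted only when its logit exceeds the logit of class 0 (the threshold class); when nothing does, class 0 ("no relation") is returned:

import torch

from deeppavlov.models.relation_extraction.losses import ATLoss

loss_fn = ATLoss()
logits = torch.tensor([[0.0, 2.0, -1.0],    # class 1 is above the class-0 threshold
                       [0.0, -0.5, -1.0]])  # nothing is above the threshold
print(loss_fn.get_label(logits))
# tensor([[0., 1., 0.],
#         [1., 0., 0.]])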
--------------------------------------------------------------------------------
/deeppavlov/models/sklearn/__init__.py:
--------------------------------------------------------------------------------
1 | from .sklearn_component import *
2 |
--------------------------------------------------------------------------------
/deeppavlov/models/spelling_correction/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/spelling_correction/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/spelling_correction/brillmoore/__init__.py:
--------------------------------------------------------------------------------
1 | from .error_model import ErrorModel
2 |
--------------------------------------------------------------------------------
/deeppavlov/models/spelling_correction/electors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/spelling_correction/electors/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/spelling_correction/electors/top1_elector.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 | from typing import List, Tuple
17 |
18 | from deeppavlov.core.common.registry import register
19 | from deeppavlov.core.models.component import Component
20 |
21 | logger = getLogger(__name__)
22 |
23 |
24 | @register('top1_elector')
25 | class TopOneElector(Component):
26 |     """Component that chooses the candidate with the highest base probability for every token
27 |
28 | """
29 |
30 | def __init__(self, *args, **kwargs):
31 | pass
32 |
33 | def __call__(self, batch: List[List[List[Tuple[float, str]]]]) -> List[List[str]]:
34 | """Choose the best candidate for every token
35 |
36 | Args:
37 | batch: batch of probabilities and string values of candidates for every token in a sentence
38 |
39 | Returns:
40 | batch of corrected tokenized sentences
41 | """
42 | return [[max(sublist)[1] for sublist in candidates] for candidates in batch]
43 |
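A small sketch of the expected input shape: for every token, a list of (probability, candidate) tuples; the highest-probability string wins:

from deeppavlov.models.spelling_correction.electors.top1_elector import TopOneElector

elector = TopOneElector()
batch = [[[(0.2, "helo"), (0.8, "hello")],
          [(0.6, "world"), (0.4, "word")]]]
print(elector(batch))  # [['hello', 'world']]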
--------------------------------------------------------------------------------
/deeppavlov/models/spelling_correction/levenshtein/__init__.py:
--------------------------------------------------------------------------------
1 | from .searcher_component import LevenshteinSearcherComponent
2 |
--------------------------------------------------------------------------------
/deeppavlov/models/tokenizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/tokenizers/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/tokenizers/lazy_tokenizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 |
17 | from nltk import word_tokenize
18 |
19 | from deeppavlov.core.common.registry import register
20 |
21 | log = getLogger(__name__)
22 |
23 |
24 | @register('lazy_tokenizer')
25 | def lazy_tokenizer(batch):
26 | """Tokenizes if there is something to tokenize."""
27 |
28 | if len(batch) > 0 and isinstance(batch[0], str):
29 | batch = [word_tokenize(utt) for utt in batch]
30 | return batch
31 |
--------------------------------------------------------------------------------
/deeppavlov/models/tokenizers/nltk_moses_tokenizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import Union, List
15 |
16 | from sacremoses import MosesDetokenizer, MosesTokenizer
17 |
18 | from deeppavlov.core.common.registry import register
19 | from deeppavlov.core.models.component import Component
20 |
21 |
22 | @register("nltk_moses_tokenizer")
23 | class NLTKMosesTokenizer(Component):
24 |     """Class for splitting texts into tokens using a wrapper over MosesTokenizer
25 | 
26 |     Attributes:
27 |         escape: whether to escape characters for use in HTML markup
28 |         tokenizer: tokenizer instance from sacremoses
29 |         detokenizer: detokenizer instance from sacremoses
30 | 
31 |     Args:
32 |         escape: whether to escape characters for use in HTML markup
33 |     """
34 |
35 | def __init__(self, escape: bool = False, *args, **kwargs):
36 | self.escape = escape
37 | self.tokenizer = MosesTokenizer()
38 | self.detokenizer = MosesDetokenizer()
39 |
40 | def __call__(self, batch: List[Union[str, List[str]]]) -> List[Union[List[str], str]]:
41 | """Tokenize given batch of strings or detokenize given batch of lists of tokens
42 |
43 | Args:
44 | batch: list of text samples or list of lists of tokens
45 |
46 | Returns:
47 | list of lists of tokens or list of text samples
48 | """
49 | if isinstance(batch[0], str):
50 | return [self.tokenizer.tokenize(line, escape=self.escape) for line in batch]
51 | else:
52 | return [self.detokenizer.detokenize(line, return_str=True, unescape=self.escape)
53 | for line in batch]
54 |
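A round-trip sketch: string input is tokenized, token-list input is detokenized:

from deeppavlov.models.tokenizers.nltk_moses_tokenizer import NLTKMosesTokenizer

tokenizer = NLTKMosesTokenizer()
tokens = tokenizer(["Hello, world!"])
print(tokens)             # [['Hello', ',', 'world', '!']]
print(tokenizer(tokens))  # ['Hello, world!']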
--------------------------------------------------------------------------------
/deeppavlov/models/tokenizers/nltk_tokenizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List
16 |
17 | import nltk
18 |
19 | from deeppavlov.core.common.registry import register
20 | from deeppavlov.core.models.component import Component
21 |
22 |
23 | @register("nltk_tokenizer")
24 | class NLTKTokenizer(Component):
25 |     """Class for splitting texts into tokens using NLTK
26 |
27 | Args:
28 | tokenizer: tokenization mode for `nltk.tokenize`
29 | download: whether to download nltk data
30 |
31 | Attributes:
32 | tokenizer: tokenizer instance from nltk.tokenizers
33 | """
34 |
35 | def __init__(self, tokenizer: str = "wordpunct_tokenize", download: bool = False,
36 | *args, **kwargs):
37 | if download:
38 | nltk.download()
39 | self.tokenizer = getattr(nltk.tokenize, tokenizer, None)
40 | if not callable(self.tokenizer):
41 |             raise AttributeError("Tokenizer {} is not defined in nltk.tokenize".format(tokenizer))
42 |
43 | def __call__(self, batch: List[str]) -> List[List[str]]:
44 | """Tokenize given batch
45 |
46 | Args:
47 | batch: list of text samples
48 |
49 | Returns:
50 | list of lists of tokens
51 | """
52 | return [self.tokenizer(sent) for sent in batch]
53 |
--------------------------------------------------------------------------------
/deeppavlov/models/tokenizers/split_tokenizer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import List
16 |
17 | from deeppavlov.core.common.registry import register
18 | from deeppavlov.core.models.component import Component
19 |
20 |
21 | @register("split_tokenizer")
22 | class SplitTokenizer(Component):
23 | """
24 | Generates utterance's tokens by mere python's ``str.split()``.
25 |
26 | Doesn't have any parameters.
27 | """
28 |
29 | def __init__(self, **kwargs) -> None:
30 | pass
31 |
32 | def __call__(self, batch: List[str]) -> List[List[str]]:
33 | """
34 | Tokenize given batch
35 |
36 | Args:
37 | batch: list of texts to tokenize
38 |
39 | Returns:
40 | tokenized batch
41 | """
42 | if isinstance(batch, (list, tuple)):
43 | return [sample.split() for sample in batch]
44 | else:
45 | raise NotImplementedError('not implemented for types other than'
46 | ' list or tuple')
47 |
--------------------------------------------------------------------------------
/deeppavlov/models/tokenizers/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import re
16 | from typing import List, Generator, Any
17 |
18 |
19 | def detokenize(tokens):
20 | """
21 | Detokenizing a text undoes the tokenizing operation, restores
22 | punctuation and spaces to the places that people expect them to be.
23 | Ideally, `detokenize(tokenize(text))` should be identical to `text`,
24 | except for line breaks.
25 | """
26 | text = ' '.join(tokens)
27 | step0 = text.replace('. . .', '...')
28 | step1 = step0.replace("`` ", '"').replace(" ''", '"')
29 | step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
30 | step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
31 | step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
32 | step5 = step4.replace(" '", "'").replace(" n't", "n't") \
33 | .replace(" nt", "nt").replace("can not", "cannot")
34 | step6 = step5.replace(" ` ", " '")
35 | return step6.strip()
36 |
37 |
38 | def ngramize(items: List[str], ngram_range=(1, 1), doc: str = None) -> Generator[List[str], Any, None]:
39 | """
40 | Make ngrams from a list of tokens/lemmas
41 | :param items: list of tokens, lemmas or other strings to form ngrams
42 |     :param ngram_range: range for producing ngrams, e.g. (1, 2) for unigrams + bigrams, (2, 2) for bigrams only
43 |     :param doc: optional document text; if given, only ngrams that occur in it (or its lowercased form) are kept
44 |     :return: ngrams (as strings) generator
45 | """
46 |
47 | ngrams = []
48 | ranges = [(0, i) for i in range(ngram_range[0], ngram_range[1] + 1)]
49 | for r in ranges:
50 | ngrams += list(zip(*[items[j:] for j in range(*r)]))
51 |
52 | formatted_ngrams = [' '.join(item) for item in ngrams]
53 | if doc is not None:
54 | doc_lower = doc.lower()
55 | formatted_ngrams = [ngram for ngram in formatted_ngrams if (ngram in doc or ngram in doc_lower)]
56 |
57 | yield formatted_ngrams
58 |
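A worked example: the generator yields a single list containing all requested ngrams in order:

from deeppavlov.models.tokenizers.utils import ngramize

print(next(ngramize(["new", "york", "city"], ngram_range=(1, 2))))
# ['new', 'york', 'city', 'new york', 'york city']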
--------------------------------------------------------------------------------
/deeppavlov/models/torch_bert/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/models/torch_bert/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/models/torch_bert/crf.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from torch import nn
4 | from torchcrf import CRF as CRFbase
5 |
6 |
7 | class CRF(CRFbase):
8 | """Class with Conditional Random Field from PyTorch-CRF library
9 | with modified training function
10 | """
11 |
12 | def __init__(self, num_tags: int, batch_first: bool = False) -> None:
13 | super().__init__(num_tags=num_tags, batch_first=batch_first)
14 | nn.init.zeros_(self.transitions)
15 | nn.init.zeros_(self.start_transitions)
16 | nn.init.zeros_(self.end_transitions)
17 | self.stats = torch.zeros((num_tags, num_tags), dtype=torch.float)
18 | self.zeros = torch.zeros((num_tags, num_tags), dtype=torch.float)
19 | self.neg = torch.full((num_tags, num_tags), -1000.0)
20 |
21 |     def forward(self, tags_batch: torch.LongTensor, y_masks: np.ndarray):
22 |         seq_lengths = np.sum(y_masks, axis=1)  # number of non-padding tokens per sequence
23 |         for seq_len, tags_list in zip(seq_lengths, tags_batch):
24 |             if seq_len > 1:
25 |                 for i in range(seq_len - 1):
26 |                     self.stats[int(tags_list[i])][int(tags_list[i + 1])] += 1.0  # count observed tag transitions
27 |         with torch.no_grad():
28 |             self.transitions.copy_(torch.where(self.stats > 0, self.zeros, self.neg))  # allow seen transitions, heavily penalize unseen ones
29 |
--------------------------------------------------------------------------------
/deeppavlov/models/vectorizers/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/datasets.txt:
--------------------------------------------------------------------------------
1 | datasets>=1.16.0,<2.5.0;python_version<="3.10"
2 | datasets==2.2.*;python_version=="3.11.*"
3 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/dependency_decoding.txt:
--------------------------------------------------------------------------------
1 | ufal.chu-liu-edmonds
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/en_core_web_sm.txt:
--------------------------------------------------------------------------------
1 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl
2 | spacy
3 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/faiss.txt:
--------------------------------------------------------------------------------
1 | faiss-cpu==1.7.2;python_version<="3.10"
2 | faiss-cpu==1.7.4;python_version=="3.11.*"
3 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/fasttext.txt:
--------------------------------------------------------------------------------
1 | fasttext==0.9.*
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/hdt.txt:
--------------------------------------------------------------------------------
1 | hdt==2.3
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/kenlm.txt:
--------------------------------------------------------------------------------
1 | pypi-kenlm==0.1.20220713;python_version<="3.10"
2 | kenlm==0.2.*;python_version=="3.11.*"
3 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/lxml.txt:
--------------------------------------------------------------------------------
1 | lxml==4.9.*
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/opt_einsum.txt:
--------------------------------------------------------------------------------
1 | opt-einsum==3.3.*
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/protobuf.txt:
--------------------------------------------------------------------------------
1 | protobuf<=3.20
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/pytorch.txt:
--------------------------------------------------------------------------------
1 | torch>=1.6.0,<1.14.0
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/rapidfuzz.txt:
--------------------------------------------------------------------------------
1 | rapidfuzz==2.1.*
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/razdel.txt:
--------------------------------------------------------------------------------
1 | razdel==0.5.0
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/ru_core_news_sm.txt:
--------------------------------------------------------------------------------
1 | https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.5.0/ru_core_news_sm-3.5.0-py3-none-any.whl
2 | spacy
3 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/sacremoses.txt:
--------------------------------------------------------------------------------
1 | sacremoses==0.0.53
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/sentencepiece.txt:
--------------------------------------------------------------------------------
1 | sentencepiece==0.2.0
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/slovnet.txt:
--------------------------------------------------------------------------------
1 | slovnet==0.5.*
2 | navec
3 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/sortedcontainers.txt:
--------------------------------------------------------------------------------
1 | sortedcontainers==2.4.*
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/torchcrf.txt:
--------------------------------------------------------------------------------
1 | pytorch-crf==0.7.*
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/transformers.txt:
--------------------------------------------------------------------------------
1 | transformers>=4.13.0,<4.25.0;python_version<"3.8"
2 | transformers==4.30.0;python_version>="3.8"
3 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/udapi.txt:
--------------------------------------------------------------------------------
1 | udapi==0.3.*
2 |
--------------------------------------------------------------------------------
/deeppavlov/requirements/whapi.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | whapi==0.6.*
3 |
--------------------------------------------------------------------------------
/deeppavlov/settings.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import argparse
16 |
17 | from deeppavlov.core.common.paths import get_settings_path, populate_settings_dir
18 |
19 | parser = argparse.ArgumentParser()
20 |
21 | parser.add_argument("-d", "--default", action="store_true", help="return to defaults")
22 |
23 |
24 | def main():
25 | """DeepPavlov console configuration utility."""
26 | args = parser.parse_args()
27 | path = get_settings_path()
28 |
29 | if args.default:
30 | if populate_settings_dir(force=True):
31 | print(f'Populated {path} with default settings files')
32 | else:
33 | print(f'{path} is already a default settings directory')
34 | else:
35 | print(f'Current DeepPavlov settings path: {path}')
36 |
37 |
38 | if __name__ == "__main__":
39 | main()
40 |
--------------------------------------------------------------------------------
/deeppavlov/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/utils/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/utils/benchmarks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/utils/benchmarks/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/utils/connector/__init__.py:
--------------------------------------------------------------------------------
1 | from .dialog_logger import DialogLogger
2 |
--------------------------------------------------------------------------------
/deeppavlov/utils/pip_wrapper/__init__.py:
--------------------------------------------------------------------------------
1 | from .pip_wrapper import *
2 |
--------------------------------------------------------------------------------
/deeppavlov/utils/server/__init__.py:
--------------------------------------------------------------------------------
1 | from .server import get_server_params, get_ssl_params, redirect_root_to_docs, start_model_server
2 |
--------------------------------------------------------------------------------
/deeppavlov/utils/server/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import time
16 | from typing import Tuple
17 |
18 | from prometheus_client import CONTENT_TYPE_LATEST, REGISTRY, generate_latest
19 | from prometheus_client import Counter, Gauge, Histogram
20 | from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
21 | from starlette.requests import Request
22 | from starlette.responses import Response
23 | from starlette.types import ASGIApp
24 |
25 | REQUESTS_COUNT = Counter('http_requests_count', 'Number of processed requests', ['endpoint', 'status_code'])
26 | REQUESTS_LATENCY = Histogram('http_requests_latency_seconds', 'Request latency histogram', ['endpoint'])
27 | REQUESTS_IN_PROGRESS = Gauge('http_requests_in_progress', 'Number of requests currently being processed', ['endpoint'])
28 |
29 |
30 | def metrics(request: Request) -> Response:
31 | return Response(generate_latest(REGISTRY), media_type=CONTENT_TYPE_LATEST)
32 |
33 |
34 | class PrometheusMiddleware(BaseHTTPMiddleware):
35 | def __init__(self, app: ASGIApp, ignore_paths: Tuple = ()) -> None:
36 | super().__init__(app)
37 | self.ignore_paths = ignore_paths
38 |
39 | async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
40 | endpoint = request.url.path
41 |
42 | if endpoint in self.ignore_paths:
43 | return await call_next(request)
44 |
45 | REQUESTS_IN_PROGRESS.labels(endpoint=endpoint).inc()
46 |
47 | start_time = time.perf_counter()
48 | status_code = 500
49 |
50 | try:
51 | response = await call_next(request)
52 | status_code = response.status_code
53 | finally:
54 | if status_code == 200:
55 | duration = time.perf_counter() - start_time
56 | REQUESTS_LATENCY.labels(endpoint=endpoint).observe(duration)
57 | REQUESTS_COUNT.labels(endpoint=endpoint, status_code=status_code).inc()
58 | REQUESTS_IN_PROGRESS.labels(endpoint=endpoint).dec()
59 |
60 | return response
61 |
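62 | # A usage sketch (illustrative, not part of the original module): mount the middleware
63 | # on a FastAPI/Starlette app and expose the scrape endpoint, for example:
64 | #
65 | #     app.add_middleware(PrometheusMiddleware, ignore_paths=('/metrics',))
66 | #     app.add_route('/metrics', metrics)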
--------------------------------------------------------------------------------
/deeppavlov/utils/settings/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/utils/settings/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/utils/settings/dialog_logger_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "enabled": false,
3 | "logger_name": "default",
4 | "log_path": "~/.deeppavlov/dialog_logs",
5 | "logfile_max_size_kb": 10240,
6 | "ensure_ascii": false
7 | }
--------------------------------------------------------------------------------
/deeppavlov/utils/settings/log_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "disable_existing_loggers": false,
4 | "loggers": {
5 | "deeppavlov": {
6 | "level": "INFO",
7 | "handlers": [
8 | "stderr"
9 | ],
10 | "propagate": true
11 | },
12 | "uvicorn.access": {
13 | "level": "INFO",
14 | "handlers": [
15 | "uvicorn_handler"
16 | ],
17 | "propagate": true
18 | },
19 | "uvicorn.error": {
20 | "level": "INFO",
21 | "handlers": [
22 | "uvicorn_handler"
23 | ],
24 | "propagate": true
25 | },
26 | "train_report": {
27 | "level": "INFO",
28 | "handlers": [
29 | "train_handler"
30 | ],
31 | "propagate": true
32 | },
33 | "filelock": {
34 | "level": "WARNING",
35 | "handlers": [
36 | "stdout"
37 | ],
38 | "propagate": true
39 | }
40 | },
41 | "formatters": {
42 | "default": {
43 | "format": "%(asctime)s.%(msecs)d %(levelname)s in '%(name)s'['%(module)s'] at line %(lineno)d: %(message)s",
44 | "datefmt": "%Y-%m-%d %H:%M:%S"
45 | },
46 | "uvicorn_fmt": {
47 | "format": "%(asctime)s %(message)s",
48 | "datefmt": "%Y-%m-%d %H:%M:%S"
49 | },
50 | "message": {
51 | "format": "%(message)s"
52 | }
53 | },
54 | "handlers": {
55 | "file": {
56 | "class": "logging.FileHandler",
57 | "level": "DEBUG",
58 | "formatter": "default",
59 | "filename": "~/.deeppavlov/log.log"
60 | },
61 | "stdout": {
62 | "class": "logging.StreamHandler",
63 | "level": "DEBUG",
64 | "formatter": "default",
65 | "stream": "ext://sys.stdout"
66 | },
67 | "stderr": {
68 | "class": "logging.StreamHandler",
69 | "level": "DEBUG",
70 | "formatter": "default",
71 | "stream": "ext://sys.stderr"
72 | },
73 | "uvicorn_handler": {
74 | "class": "logging.StreamHandler",
75 | "level": "INFO",
76 | "formatter": "uvicorn_fmt",
77 | "stream": "ext://sys.stdout",
78 | "filters": ["probeFilter"]
79 | },
80 | "train_handler": {
81 | "class": "logging.StreamHandler",
82 | "level": "INFO",
83 | "formatter": "message",
84 | "stream": "ext://sys.stdout"
85 | }
86 | },
87 | "filters": {
88 | "probeFilter": {
89 | "()": "deeppavlov.core.common.log.ProbeFilter"
90 | }
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/deeppavlov/utils/settings/server_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "common_defaults": {
3 | "host": "0.0.0.0",
4 | "port": 5000,
5 | "model_args_names": [],
6 | "https": false,
7 | "https_cert_path": "",
8 | "https_key_path": "",
9 | "socket_type": "TCP",
10 | "unix_socket_file": "/tmp/deeppavlov_socket.s",
11 | "socket_launch_message": "launching socket server at"
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/deeppavlov/utils/socket/__init__.py:
--------------------------------------------------------------------------------
1 | from .socket import encode, start_socket_server
2 |
--------------------------------------------------------------------------------
/deeppavlov/vocabs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/deeppavlov/vocabs/__init__.py
--------------------------------------------------------------------------------
/deeppavlov/vocabs/wiki_sqlite.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from logging import getLogger
16 | from typing import List, Any, Optional, Union
17 |
18 | from deeppavlov.core.common.registry import register
19 | from deeppavlov.core.models.component import Component
20 | from deeppavlov.dataset_iterators.sqlite_iterator import SQLiteDataIterator
21 |
22 | logger = getLogger(__name__)
23 |
24 |
25 | @register('wiki_sqlite_vocab')
26 | class WikiSQLiteVocab(SQLiteDataIterator, Component):
27 | """Get content from SQLite database by document ids.
28 |
29 | Args:
30 | load_path: a path to local DB file
31 | join_docs: whether to join extracted docs with ' ' or not
32 | shuffle: whether to shuffle data or not
33 |
34 | Attributes:
35 | join_docs: whether to join extracted docs with ' ' or not
36 |
37 | """
38 |
39 | def __init__(self, load_path: str, join_docs: bool = True, shuffle: bool = False, **kwargs) -> None:
40 | SQLiteDataIterator.__init__(self, load_path=load_path, shuffle=shuffle)
41 | self.join_docs = join_docs
42 |
43 | def __call__(self, doc_ids: Optional[List[List[Any]]] = None, *args, **kwargs) -> List[Union[str, List[str]]]:
44 | """Get the contents of files, stacked by space or as they are.
45 |
46 | Args:
47 | doc_ids: a batch of lists of ids to get contents for
48 |
49 | Returns:
50 | a list of contents / list of lists of contents
51 | """
52 | all_contents = []
53 | if not doc_ids:
54 | logger.warning('No doc_ids are provided in WikiSqliteVocab, return all docs')
55 | doc_ids = [self.get_doc_ids()]
56 |
57 | for ids in doc_ids:
58 | contents = [self.get_doc_content(doc_id) for doc_id in ids]
59 | if self.join_docs:
60 | contents = ' '.join(contents)
61 | all_contents.append(contents)
62 |
63 | return all_contents
64 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS = -WT
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = DeepPavlov
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/01_login_to_aws.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/01_login_to_aws.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/02_choose_ubuntu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/02_choose_ubuntu.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/03_select_instance_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/03_select_instance_type.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/04_add_storage.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/04_add_storage.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/05_review_instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/05_review_instance.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/06_go_to_running_instances.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/06_go_to_running_instances.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/07_wait_init.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/07_wait_init.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/08_01_set_sec_group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/08_01_set_sec_group.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/08_02_set_inbound.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/08_02_set_inbound.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/09_01_select_connect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/09_01_select_connect.png
--------------------------------------------------------------------------------
/docs/_static/aws_ec2/09_02_connection_info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/aws_ec2/09_02_connection_info.png
--------------------------------------------------------------------------------
/docs/_static/deeppavlov.css:
--------------------------------------------------------------------------------
1 | .wy-side-nav-search {
2 | background-color: #0176bd;
3 | }
4 |
5 | .wy-nav-content {
6 | max-width: 1000px;
7 | }
8 |
9 | .wy-side-nav-search>div.version {
10 | color: #ffffff;
11 | }
--------------------------------------------------------------------------------
/docs/_static/deeppavlov.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/deeppavlov.png
--------------------------------------------------------------------------------
/docs/_static/deeppavlov_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/deeppavlov_logo.png
--------------------------------------------------------------------------------
/docs/_static/dp_agnt_diag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/dp_agnt_diag.png
--------------------------------------------------------------------------------
/docs/_static/gobot_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/gobot_diagram.png
--------------------------------------------------------------------------------
/docs/_static/ipavlov_footer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/ipavlov_footer.png
--------------------------------------------------------------------------------
/docs/_static/kvret_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/kvret_diagram.png
--------------------------------------------------------------------------------
/docs/_static/my_blocks.css:
--------------------------------------------------------------------------------
1 | button.copybtn svg {
2 | width: 1.3em;
3 | height: 1.3em;
4 | padding: 0.1em;
5 | }
6 |
7 | button.copybtn {
8 | top: 0.2em;
9 | width: 1.4em;
10 | height: 1.4em;
11 | }
12 |
13 | .rst-content .linenodiv pre, .rst-content div[class^=highlight] pre, .rst-content pre.literal-block {
14 | font-size: 13px;
15 | line-height: 1.4;
16 | }
17 |
--------------------------------------------------------------------------------
/docs/_static/social/Medium_Monogram.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/Medium_Monogram.svg
--------------------------------------------------------------------------------
/docs/_static/social/Twitter_Social_Icon_Circle_Color.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/Twitter_Social_Icon_Circle_Color.svg
--------------------------------------------------------------------------------
/docs/_static/social/telegram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/telegram.png
--------------------------------------------------------------------------------
/docs/_static/social/youtube_social_circle_red.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/social/youtube_social_circle_red.png
--------------------------------------------------------------------------------
/docs/_static/tree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/docs/_static/tree.png
--------------------------------------------------------------------------------
/docs/apiref/core.rst:
--------------------------------------------------------------------------------
1 | core
2 | ====
3 | DeepPavlov Core
4 |
5 | .. automodule:: deeppavlov.core
6 | :members:
7 |
8 | .. toctree::
9 | :glob:
10 | :caption: Core
11 |
12 | core/*
13 |
--------------------------------------------------------------------------------
/docs/apiref/core/commands.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.core.commands
2 | ========================
3 | Basic training and inference functions.
4 |
5 | .. automodule:: deeppavlov.core.commands.infer
6 | :members:
7 |
8 | .. automodule:: deeppavlov.core.commands.train
9 | :members:
10 |
--------------------------------------------------------------------------------
/docs/apiref/core/common.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.core.common
2 | ======================
3 | Registration and class initialization functionality, class method decorators.
4 |
5 | .. autoclass:: deeppavlov.core.common.chainer.Chainer
6 | :members:
7 |
8 | .. automethod:: __call__
9 |
10 | .. autoclass:: deeppavlov.core.common.base.Element
11 |
12 | .. automethod:: __init__
13 |
14 | .. autoclass:: deeppavlov.core.common.base.Model
15 |
16 | .. automethod:: __init__
17 |
18 | .. automodule:: deeppavlov.core.common.metrics_registry
19 | :members:
20 |
21 | .. automodule:: deeppavlov.core.common.params
22 | :members:
23 |
24 | .. automodule:: deeppavlov.core.common.registry
25 | :members:
26 |
--------------------------------------------------------------------------------
/docs/apiref/core/data.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.core.data
2 | ====================
3 | DatasetReader, Vocab, DataLearningIterator and DataFittingIterator classes.
4 |
5 | .. autoclass:: deeppavlov.core.data.dataset_reader.DatasetReader
6 |
7 | .. autoclass:: deeppavlov.core.data.data_fitting_iterator.DataFittingIterator
8 |
9 | .. autoclass:: deeppavlov.core.data.data_learning_iterator.DataLearningIterator
10 |
11 | .. autoclass:: deeppavlov.core.data.simple_vocab.SimpleVocabulary
12 |
--------------------------------------------------------------------------------
/docs/apiref/core/models.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.core.models
2 | ======================
3 | Abstract model classes and interfaces.
4 |
5 | .. autoclass:: deeppavlov.core.models.component.Component
6 |
7 | .. autoclass:: deeppavlov.core.models.serializable.Serializable
8 |
9 | .. autoclass:: deeppavlov.core.models.estimator.Estimator
10 |
11 | .. autoclass:: deeppavlov.core.models.nn_model.NNModel
12 |
13 | .. autoclass:: deeppavlov.core.models.torch_model.TorchModel
14 |
--------------------------------------------------------------------------------
/docs/apiref/core/trainers.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.core.trainers
2 | ========================
3 | Trainer classes.
4 |
5 | .. autoclass:: deeppavlov.core.trainers.FitTrainer
6 | :members:
7 |
8 | .. autoclass:: deeppavlov.core.trainers.NNTrainer
9 | :members:
10 | :inherited-members:
11 |
--------------------------------------------------------------------------------
/docs/apiref/dataset_iterators.rst:
--------------------------------------------------------------------------------
1 | dataset_iterators
2 | =================
3 | Concrete DatasetIterator classes.
4 |
5 | .. autoclass:: deeppavlov.dataset_iterators.basic_classification_iterator.BasicClassificationDatasetIterator
6 | :members:
7 |
8 | .. autoclass:: deeppavlov.dataset_iterators.siamese_iterator.SiameseIterator
9 |
10 | .. autoclass:: deeppavlov.dataset_iterators.sqlite_iterator.SQLiteDataIterator
11 |
12 | .. autoclass:: deeppavlov.dataset_iterators.squad_iterator.SquadIterator
13 |
14 | .. automodule:: deeppavlov.dataset_iterators.typos_iterator
15 | :members:
16 |
17 | .. automodule:: deeppavlov.dataset_iterators.multitask_iterator
18 | :members:
19 |
--------------------------------------------------------------------------------
/docs/apiref/dataset_readers.rst:
--------------------------------------------------------------------------------
1 | dataset_readers
2 | ===============
3 | Concrete DatasetReader classes.
4 |
5 | .. autoclass:: deeppavlov.dataset_readers.basic_classification_reader.BasicClassificationDatasetReader
6 | :members:
7 |
8 | .. autoclass:: deeppavlov.dataset_readers.conll2003_reader.Conll2003DatasetReader
9 |
10 | .. autoclass:: deeppavlov.dataset_readers.faq_reader.FaqDatasetReader
11 | :members:
12 |
13 | .. autoclass:: deeppavlov.dataset_readers.line_reader.LineReader
14 | :members:
15 |
16 | .. autoclass:: deeppavlov.dataset_readers.paraphraser_reader.ParaphraserReader
17 |
18 | .. autoclass:: deeppavlov.dataset_readers.squad_dataset_reader.SquadDatasetReader
19 | :members:
20 |
21 | .. automodule:: deeppavlov.dataset_readers.typos_reader
22 | :members:
23 |
24 | .. automodule:: deeppavlov.dataset_readers.ubuntu_v2_reader
25 | :members:
26 |
27 | .. automodule:: deeppavlov.dataset_readers.multitask_reader
28 | :members:
29 |
--------------------------------------------------------------------------------
/docs/apiref/metrics.rst:
--------------------------------------------------------------------------------
1 | metrics
2 | =======
3 | Different Metric functions.
4 |
5 | .. automodule:: deeppavlov.metrics
6 | :members:
7 |
8 | .. autofunction:: deeppavlov.metrics.accuracy.sets_accuracy
9 |
10 | .. autofunction:: deeppavlov.metrics.fmeasure.round_f1
11 |
12 | .. autofunction:: deeppavlov.metrics.fmeasure.round_f1_macro
13 |
14 | .. autofunction:: deeppavlov.metrics.fmeasure.round_f1_weighted
15 |
16 | .. autofunction:: deeppavlov.metrics.fmeasure.ner_f1
17 |
18 | .. autofunction:: deeppavlov.metrics.fmeasure.ner_token_f1
19 |
20 | .. autofunction:: deeppavlov.metrics.log_loss.sk_log_loss
21 |
22 | .. autofunction:: deeppavlov.metrics.roc_auc_score.roc_auc_score
23 |
--------------------------------------------------------------------------------
/docs/apiref/models.rst:
--------------------------------------------------------------------------------
1 | models
2 | ======
3 | Concrete Model classes.
4 |
5 | .. automodule:: deeppavlov.models
6 | :members:
7 |
8 | .. toctree::
9 | :glob:
10 | :caption: Models
11 |
12 | models/*
--------------------------------------------------------------------------------
/docs/apiref/models/api_requester.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.api_requester
2 | ===============================
3 |
4 | .. automodule:: deeppavlov.models.api_requester
5 | :members:
6 |
7 | .. autoclass:: deeppavlov.models.api_requester.api_requester.ApiRequester
8 |
9 | .. automethod:: __call__
10 | .. automethod:: get_async_response
11 |
12 |
13 | .. autoclass:: deeppavlov.models.api_requester.api_router.ApiRouter
14 |
15 | .. automethod:: __call__
16 |
--------------------------------------------------------------------------------
/docs/apiref/models/classifiers.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.classifiers
2 | =============================
3 |
4 | .. automodule:: deeppavlov.models.classifiers
5 | :members:
6 |
7 | .. autoclass:: deeppavlov.models.classifiers.torch_classification_model.TorchTextClassificationModel
8 | :members:
9 |
10 | .. automethod:: __call__
11 |
12 | .. autoclass:: deeppavlov.models.classifiers.cos_sim_classifier.CosineSimilarityClassifier
13 | :members:
14 |
15 | .. automethod:: __call__
16 |
17 | .. autoclass:: deeppavlov.models.classifiers.proba2labels.Proba2Labels
18 | :members:
19 |
20 | .. automethod:: __call__
21 |
--------------------------------------------------------------------------------
/docs/apiref/models/doc_retrieval.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.doc_retrieval
2 | ===============================
3 |
4 | Document retrieval classes.
5 |
6 | .. automodule:: deeppavlov.models.doc_retrieval
7 |
8 | .. autoclass:: deeppavlov.models.doc_retrieval.tfidf_ranker.TfidfRanker
9 | :members:
10 |
11 | .. automethod:: __call__
12 |
13 | .. autoclass:: deeppavlov.models.doc_retrieval.logit_ranker.LogitRanker
14 | :members:
15 |
16 | .. automethod:: __call__
17 |
18 | .. autoclass:: deeppavlov.models.doc_retrieval.pop_ranker.PopRanker
19 | :members:
20 |
21 | .. automethod:: __call__
--------------------------------------------------------------------------------
/docs/apiref/models/embedders.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.embedders
2 | ============================
3 |
4 | .. autoclass:: deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder
5 |
6 | .. automethod:: __call__
7 | .. automethod:: __iter__
8 |
9 | .. autoclass:: deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder
10 |
11 | .. automethod:: __call__
12 |
13 | .. autoclass:: deeppavlov.models.embedders.transformers_embedder.TransformersBertEmbedder
14 |
15 | .. automethod:: __call__
16 |
--------------------------------------------------------------------------------
/docs/apiref/models/entity_extraction.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.entity_extraction
2 | ===================================
3 |
4 | .. autoclass:: deeppavlov.models.entity_extraction.ner_chunker.NerChunker
5 |
6 | .. automethod:: __init__
7 | .. automethod:: __call__
8 |
9 | .. autoclass:: deeppavlov.models.entity_extraction.entity_linking.EntityLinker
10 |
11 | .. automethod:: __init__
12 | .. automethod:: __call__
13 |
14 | .. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.EntityDetectionParser
15 |
16 | .. automethod:: __init__
17 | .. automethod:: __call__
18 |
19 | .. autoclass:: deeppavlov.models.entity_extraction.entity_detection_parser.QuestionSignChecker
20 |
--------------------------------------------------------------------------------
/docs/apiref/models/kbqa.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.kbqa
2 | ======================
3 |
4 | .. automodule:: deeppavlov.models.kbqa
5 |
6 | .. autoclass:: deeppavlov.models.kbqa.type_define.AnswerTypesExtractor
7 |
8 | .. automethod:: __init__
9 | .. automethod:: __call__
10 |
11 | .. autoclass:: deeppavlov.models.kbqa.query_generator.QueryGenerator
12 |
13 | .. automethod:: __init__
14 | .. automethod:: __call__
15 |
16 | .. autoclass:: deeppavlov.models.kbqa.query_generator_base.QueryGeneratorBase
17 |
18 | .. automethod:: __init__
19 | .. automethod:: __call__
20 |
21 | .. autoclass:: deeppavlov.models.kbqa.rel_ranking_infer.RelRankerInfer
22 |
23 | .. automethod:: __init__
24 | .. automethod:: __call__
25 |
26 | .. autoclass:: deeppavlov.models.kbqa.template_matcher.TemplateMatcher
27 |
28 | .. automethod:: __init__
29 | .. automethod:: __call__
30 |
31 | .. autoclass:: deeppavlov.models.kbqa.ru_adj_to_noun.RuAdjToNoun
32 |
33 | .. automethod:: __init__
34 | .. automethod:: __call__
35 |
36 | .. autoclass:: deeppavlov.models.kbqa.tree_to_sparql.TreeToSparql
37 |
38 | .. automethod:: __init__
39 | .. automethod:: __call__
40 |
41 | .. autoclass:: deeppavlov.models.kbqa.wiki_parser.WikiParser
42 |
43 | .. automethod:: __init__
44 | .. automethod:: __call__
45 |
--------------------------------------------------------------------------------
/docs/apiref/models/preprocessors.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.preprocessors
2 | ===============================
3 |
4 | .. autoclass:: deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor
5 |
6 | .. automethod:: __call__
7 |
8 | .. autoclass:: deeppavlov.models.preprocessors.mask.Mask
9 |
10 | .. autoclass:: deeppavlov.models.preprocessors.one_hotter.OneHotter
11 |
12 | .. autoclass:: deeppavlov.models.preprocessors.sanitizer.Sanitizer
13 |
14 | .. autofunction:: deeppavlov.models.preprocessors.str_lower.str_lower
15 |
16 | .. autoclass:: deeppavlov.models.preprocessors.str_token_reverser.StrTokenReverser
17 |
18 | .. automethod:: __call__
19 |
20 | .. autoclass:: deeppavlov.models.preprocessors.str_utf8_encoder.StrUTF8Encoder
21 |
22 | .. automethod:: __call__
23 |
24 | .. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.DocumentChunker
25 |
26 | .. automethod:: __call__
27 |
28 | .. autoclass:: deeppavlov.models.preprocessors.odqa_preprocessors.StringMultiplier
29 |
30 | .. automethod:: __call__
31 |
--------------------------------------------------------------------------------
/docs/apiref/models/relation_extraction.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.relation_extraction
2 | =====================================
3 |
4 | .. autoclass:: deeppavlov.models.relation_extraction.relation_extraction_bert.REBertModel
5 |
6 | .. automethod:: __init__
7 | .. automethod:: __call__
8 | .. automethod:: train_on_batch
9 |
--------------------------------------------------------------------------------
/docs/apiref/models/sklearn.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.sklearn
2 | =============================
3 |
4 | .. automodule:: deeppavlov.models.sklearn
5 | :members:
6 |
7 | .. autoclass:: deeppavlov.models.sklearn.sklearn_component.SklearnComponent
8 |
9 | .. automethod:: __call__
10 | .. automethod:: fit
11 | .. automethod:: init_from_scratch
12 | .. automethod:: load
13 | .. automethod:: save
14 | .. automethod:: compose_input_data
15 | .. automethod:: get_class_attributes
16 | .. automethod:: get_function_params
17 |
--------------------------------------------------------------------------------
/docs/apiref/models/spelling_correction.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.spelling_correction
2 | =====================================
3 |
4 | .. autoclass:: deeppavlov.models.spelling_correction.brillmoore.ErrorModel
5 |
6 | .. automethod:: __call__
7 | .. automethod:: fit
8 | .. automethod:: save
9 | .. automethod:: load
10 |
11 | .. autoclass:: deeppavlov.models.spelling_correction.levenshtein.LevenshteinSearcherComponent
12 |
13 | .. automethod:: __call__
14 |
15 |
16 | .. autoclass:: deeppavlov.models.spelling_correction.electors.top1_elector.TopOneElector
17 |
18 | .. automethod:: __call__
19 |
20 | .. autoclass:: deeppavlov.models.spelling_correction.electors.kenlm_elector.KenlmElector
21 |
22 | .. automethod:: __call__
23 |
--------------------------------------------------------------------------------
/docs/apiref/models/tokenizers.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.tokenizers
2 | ============================
3 |
4 | .. autoclass:: deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer
5 |
6 | .. automethod:: __call__
7 |
8 | .. autoclass:: deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer
9 |
10 | .. automethod:: __call__
11 |
12 | .. autoclass:: deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer
13 |
14 | .. autoclass:: deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer
15 |
16 | .. automethod:: __call__
--------------------------------------------------------------------------------
/docs/apiref/models/torch_bert.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.torch_bert
2 | ============================
3 |
4 | .. automodule:: deeppavlov.models.torch_bert
5 | :members:
6 |
7 | .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersPreprocessor
8 |
9 | .. automethod:: __call__
10 |
11 | .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchTransformersNerPreprocessor
12 |
13 | .. automethod:: __call__
14 |
15 | .. autoclass:: deeppavlov.models.preprocessors.torch_transformers_preprocessor.TorchBertRankerPreprocessor
16 |
17 | .. automethod:: __call__
18 |
19 | .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_classifier.TorchTransformersClassifierModel
20 |
21 | .. automethod:: __call__
22 | .. automethod:: train_on_batch
23 |
24 | .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_sequence_tagger.TorchTransformersSequenceTagger
25 |
26 | .. automethod:: __call__
27 | .. automethod:: train_on_batch
28 |
29 | .. autoclass:: deeppavlov.models.torch_bert.torch_transformers_squad.TorchTransformersSquad
30 |
31 | .. automethod:: __call__
32 | .. automethod:: train_on_batch
33 |
34 | .. autoclass:: deeppavlov.models.torch_bert.torch_bert_ranker.TorchBertRankerModel
35 |
36 | .. automethod:: __call__
37 | .. automethod:: train_on_batch
38 |
--------------------------------------------------------------------------------
/docs/apiref/models/vectorizers.rst:
--------------------------------------------------------------------------------
1 | deeppavlov.models.vectorizers
2 | =============================
3 |
4 |
5 | .. autoclass:: deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer
6 | :members:
7 |
8 | .. automethod:: __call__
9 |
--------------------------------------------------------------------------------
/docs/apiref/vocabs.rst:
--------------------------------------------------------------------------------
1 | vocabs
2 | ======
3 | Concrete Vocab classes.
4 |
5 | .. automodule:: deeppavlov.vocabs
6 | :members:
7 |
8 | .. autoclass:: deeppavlov.vocabs.wiki_sqlite.WikiSQLiteVocab
9 | :members:
10 |
11 | .. automethod:: __call__
12 |
13 | .. automodule:: deeppavlov.vocabs.typos
14 | :members:
15 |
--------------------------------------------------------------------------------
/docs/devguides/registry.rst:
--------------------------------------------------------------------------------
1 | Register your model
2 | ===================
3 |
4 | In order to extend the library, you need to register your classes and functions; this is done in two steps.
5 |
6 | 1. Decorate your :class:`~deeppavlov.core.models.component.Component`
7 | (or :class:`~deeppavlov.core.data.dataset_reader.DatasetReader`,
8 | or :class:`~deeppavlov.core.data.data_learning_iterator.DataLearningIterator`,
9 | or :class:`~deeppavlov.core.data.data_fitting_iterator.DataFittingIterator`)
10 | using :func:`~deeppavlov.core.common.registry.register` and/or metrics function
11 | using :func:`~deeppavlov.core.common.metrics_registry.register_metric`.
12 |
13 | 2. Rebuild the registry by running, from the DeepPavlov root directory:
14 |
15 | ::
16 |
17 | python -m utils.prepare.registry
18 |
19 | This script imports all the modules in the deeppavlov package, builds the registry from them, and writes it to a file.
20 |
21 |
22 | However, it is possible to use some classes and functions inside configuration files without registering them explicitly.
23 | There are two options available here:
24 |
25 | - instead of ``{"class_name": "registered_component_name"}`` in the config file, use a key-value pair similar to
26 | ``{"class_name": "my_package.my_module:MyClass"}``
27 |
28 | - if your classes/functions are properly decorated but not included in the registry, use the ``"metadata"`` section of
29 | your config file specifying imports as ``"metadata": {"imports": ["my_local_package.my_module", "global_package.module"]}``;
30 | then the second step described above will be unnecessary (local packages are imported from the current working
31 | directory).
32 |
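33 | For reference, a minimal sketch of the decorating step described above (``MyComponent``
34 | and ``my_metric`` are hypothetical names used only for illustration):
35 |
36 | .. code:: python
37 |
38 |     from deeppavlov.core.common.metrics_registry import register_metric
39 |     from deeppavlov.core.common.registry import register
40 |     from deeppavlov.core.models.component import Component
41 |
42 |
43 |     @register('my_component')
44 |     class MyComponent(Component):
45 |         """A trivial component that upper-cases a batch of strings."""
46 |
47 |         def __call__(self, batch):
48 |             return [utterance.upper() for utterance in batch]
49 |
50 |
51 |     @register_metric('my_metric')
52 |     def my_metric(y_true, y_predicted):
53 |         """Fraction of exact matches between true and predicted values."""
54 |         return sum(t == p for t, p in zip(y_true, y_predicted)) / max(len(y_true), 1)
55 |
56 | After rebuilding the registry (or listing the module in ``"metadata": {"imports": [...]}``),
57 | ``{"class_name": "my_component"}`` becomes usable in a config's pipe.
58 |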
--------------------------------------------------------------------------------
/docs/features/hypersearch.rst:
--------------------------------------------------------------------------------
1 | Hyperparameters optimization
2 | ============================
3 |
4 | You can search for the best hyperparameters of your model in DeepPavlov by means of cross-validation.
5 |
6 | Cross-validation
7 | ~~~~~~~~~~~~~~~~
8 |
9 | You can run cross-validation in DeepPavlov to select the best parameters of your model.
10 | For this purpose, run the special command ``paramsearch``, for example:
11 |
12 | .. code:: bash
13 |
14 | python -m deeppavlov.paramsearch path_to_json_config.json --folds 5
15 |
16 |
17 | Parameters
18 | ----------
19 |
20 | The cross-validation command has several parameters:
21 |
22 | - ``config_path``:
23 |   The path to the config file of your model.
24 | - ``--folds``:
25 |   The number of folds to use in cross-validation.
26 |   To use leave-one-out cross-validation instead of a fixed number of folds,
27 |   specify ``--folds loo``.
28 |   To skip cross-validation, simply omit this parameter.
29 | - ``--search_type``:
30 |   This parameter is optional; the default value is "grid" (grid search).
31 |
32 |
33 | .. note::
34 |
35 |    Folds will be created automatically from the union of the train and validation datasets.
36 |
37 |
38 | Special parameters in config
39 | ----------------------------
40 | The model's config file should contain parameter ranges for the search.
41 | For example, if you want to optimize the regularization coefficient of a model,
42 | add an additional parameter to the config with the suffix '_range'.
43 | Let's see an example for a logistic regression model:
44 |
45 | .. code:: python
46 |
47 | {
48 | "class_name": "faq_logreg_model",
49 | "in": "q_vect",
50 | "fit_on": ["q_vect", "y"],
51 | "c": {"search_choice": [1, 10, 100, 1000]},
52 | "out": ["answer", "score"]
53 | }
54 |
55 | In this example, the parameter "c" is described as a search_choice; the values for grid search are:
56 |
57 | .. code:: python
58 |
59 | {"search_choice": [value_0, ..., value_n]}
60 |
61 |
62 | Results
63 | -------
64 | As a result, you'll get a new JSON config with the best model parameters.
65 | It will be stored in the same directory as the original config file, with the suffix '_cvbest.json'.
66 | You'll also see final log messages about the best model:
67 |
68 | .. code:: bash
69 |
70 | INFO in '__main__'['paramsearch'] at line 169: Best model params: {'C': 10000, 'penalty': 'l1', 'accuracy': 0.81466}
71 | INFO in '__main__'['paramsearch'] at line 184: Best model saved in json-file: path_to_model_config_cvbest.json
72 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to DeepPavlov's documentation!
2 | ======================================
3 |
4 | .. toctree::
5 | :glob:
6 | :maxdepth: 1
7 |
8 | Installation
9 | QuickStart
10 | General concepts
11 | Configuration file
12 | Python pipelines
13 | Models overview
14 |
15 |
16 | .. toctree::
17 | :glob:
18 | :maxdepth: 2
19 | :caption: Features
20 |
21 | Pre-trained embeddings
22 | AutoML
23 |
24 |
25 | .. toctree::
26 | :glob:
27 | :maxdepth: 1
28 | :caption: Models
29 |
30 | Multitask BERT
31 | Context Question Answering
32 | Classification
33 | Few-shot Classification
34 | Named Entity Recognition
35 | Entity Extraction
36 | BERT-based models
37 | Morphological Tagging
38 | Neural Ranking
39 | Spelling Correction
40 | Syntactic Parsing
41 | TF-IDF Ranking
42 | Popularity Ranking
43 | Knowledge Base Question answering
44 | Relation Extraction
45 | SuperGLUE Submission
46 | Open-Domain Question Answering
47 |
48 |
49 | .. toctree::
50 | :glob:
51 | :maxdepth: 3
52 | :caption: Integrations
53 |
54 | REST API
55 | Socket API
56 | Amazon AWS deployment
57 | DeepPavlov settings
58 |
59 |
60 | .. toctree::
61 | :glob:
62 | :maxdepth: 3
63 | :caption: Developer Guides
64 |
65 | Contribution guide
66 | Register your model
67 |
68 |
69 | .. toctree::
70 | :glob:
71 | :maxdepth: 3
72 | :caption: Internships
73 |
74 | Internships
75 |
76 |
77 | .. toctree::
78 | :glob:
79 | :maxdepth: 3
80 | :caption: Package Reference
81 |
82 | apiref/*
83 |
84 |
85 | Indices and tables
86 | ==================
87 |
88 | * :ref:`genindex`
89 | * :ref:`modindex`
90 |
--------------------------------------------------------------------------------
/docs/integrations/settings.rst:
--------------------------------------------------------------------------------
1 | DeepPavlov settings
2 | ===================
3 |
4 | DeepPavlov provides some tools to facilitate its usage (e.g. dialog logging, settings management). This document guides you through them.
5 |
6 | 1. Settings files access and management
7 | ---------------------------------------
8 |
9 | Most DeepPavlov settings are located in settings files, which in turn reside in a settings folder. The default settings folder location is ``deeppavlov/utils/settings``.
10 |
11 | You can override the settings directory path by setting the ``DP_SETTINGS_PATH`` environment variable. Missing files will be added automatically when running any deeppavlov script.
12 |
13 | You can get the current full path to the settings directory with ``python -m deeppavlov.settings``.
14 | To reset the settings in the current settings directory, use ``python -m deeppavlov.settings -d``.
15 |
16 | 2. Dialog logging
17 | -----------------
18 |
19 | DeepPavlov supports logging of inferred utterances and DeepPavlov model responses. You can manage dialog logging by
20 | editing the ``dialog_logger_config.json`` file in the settings directory.
21 |
22 | The following dialog logging settings are available:
23 |
24 | 1. **enabled** (default: ``false``): turns on/off dialog logging for DeepPavlov instance;
25 | 2. **log_path** (default: ``~/.deeppavlov/dialog_logs``): sets directory where dialog logs are stored;
26 | 3. **logger_name** (default: ``default``): sets subdirectory name for storing dialog logs;
27 | 4. **logfile_max_size_kb** (default: ``10240``): sets logfile maximum size in kilobytes. If exceeded, new log file is created;
28 | 5. **ensure_ascii** (default: ``false``): If ``true``, converts all non-ASCII symbols in logged content to Unicode code points.
29 |
30 | 3. Environment variables
31 | ------------------------
32 |
33 | - **DP_SETTINGS_PATH** — custom path to a directory that contains settings files. It's automatically populated with missing files when running any deeppavlov script.
34 | - **DP_SKIP_NLTK_DOWNLOAD** — set to ``TRUE`` to prevent automatic downloading of **nltk** packages (``punkt``, ``stopwords``, ``perluniprops``, ``nonbreaking_prefixes``).
35 |
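36 | For example, a minimal shell session combining the tools above (the directory path is
37 | illustrative):
38 |
39 | .. code:: bash
40 |
41 |     # keep settings outside the package, e.g. so they survive reinstalls
42 |     export DP_SETTINGS_PATH=/data/dp_settings
43 |
44 |     # print the effective settings directory (missing files are added automatically)
45 |     python -m deeppavlov.settings
46 |
47 |     # restore the default settings files in that directory
48 |     python -m deeppavlov.settings -d
49 |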
--------------------------------------------------------------------------------
/docs/internships/internships.rst:
--------------------------------------------------------------------------------
1 |
2 | Internships
3 | ===========
4 |
5 | Do you have ideas on how to improve dialog systems for everyone? Are you ready to make an impact across the world?
6 | Great, then join us!
7 |
8 | Let’s shape the future of Conversational AI together. An internship is for aspiring graduate and undergraduate students
9 | who are passionate about Conversational AI technology and offer diverse perspectives.
10 |
11 | As an intern, you will work on some of the most ambitious technical problems, develop new ML solutions that will impact
12 | future DeepPavlov products and make the lives of DeepPavlov users easier.
13 |
14 | All interns are paired with a mentor and will participate directly in DeepPavlov's groundbreaking work.
15 | There are no restrictions on publications based on internships. International candidates are welcome to apply.
16 |
17 | Each of our research teams has specific test assignments for interested candidates, so please familiarize yourself
18 | with our `projects `_ that best match your skills and interests.
19 |
20 | `Apply now at our website `_.
21 |
--------------------------------------------------------------------------------
/docs/intro/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | DeepPavlov supports **Linux**, **Windows 10+** (through WSL/WSL2), and **macOS** (Big Sur+) platforms with **Python 3.6-3.11**.
5 | Depending on the model used, you may need 4 to 16 GB of RAM.
6 |
7 | Install with pip
8 | ~~~~~~~~~~~~~~~~
9 |
10 | You should install DeepPavlov in a `virtual environment `_. If you’re
11 | unfamiliar with Python virtual environments, take a look at this
12 | `guide `_. A virtual
13 | environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.
14 |
15 | #. Create a virtual environment:
16 |
17 | .. code:: bash
18 |
19 | python -m venv env
20 |
21 | #. Activate the virtual environment on Linux (`source` could be replaced with `.`):
22 |
23 | .. code:: bash
24 |
25 | source env/bin/activate
26 |
27 | #. Install DeepPavlov inside this virtual environment:
28 |
29 | .. code:: bash
30 |
31 | pip install deeppavlov
32 |
33 | Install from source
34 | ~~~~~~~~~~~~~~~~~~~
35 |
36 | Install DeepPavlov **dev** branch from source with the following command:
37 |
38 | .. code:: bash
39 |
40 | pip install git+http://github.com/deeppavlov/DeepPavlov@dev
41 |
42 | This command installs the bleeding-edge dev version rather than the latest release version. The dev version is useful
43 | for staying up-to-date with the latest developments, for instance, when a bug has been fixed since the last release but
44 | a new release hasn't been rolled out yet. However, this means the dev version may not always be stable.
45 |
46 | Editable install
47 | ~~~~~~~~~~~~~~~~
48 |
49 | You will need an editable install if you want to make changes in the DeepPavlov source code that immediately take place
50 | without requiring a new installation.
51 |
52 | Clone the repository and install DeepPavlov with the following commands:
53 |
54 | .. code:: bash
55 |
56 | git clone http://github.com/deeppavlov/DeepPavlov.git
57 | pip install -e DeepPavlov
58 |
59 | Docker Images
60 | ~~~~~~~~~~~~~
61 |
62 | We have built several DeepPavlov based Docker images, which include:
63 |
64 | * DeepPavlov based Jupyter notebook Docker image;
65 | * Docker images which serve some of our models and allow accessing them
66 | via REST API (:doc:`riseapi ` mode).
67 |
68 | Here is our `DockerHub repository `_ with
69 | images and deployment instructions.
70 |
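71 | Whichever installation method you used, a quick way to verify the install is to import the
72 | package and print its version (a minimal check; the first import may take a while):
73 |
74 | .. code:: bash
75 |
76 |     python -c "import deeppavlov; print(deeppavlov.__version__)"
77 |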
--------------------------------------------------------------------------------
/docs/intro/overview.rst:
--------------------------------------------------------------------------------
1 | Conceptual overview
2 | ===================
3 |
4 | Our goal is to provide AI-application developers and researchers with:
5 |
6 | - A set of pre-trained NLP models, pre-defined dialog system components
7 | (ML/DL/Rule-based), and pipeline templates;
8 | - A framework for implementing and testing their own dialog models;
9 | - Tools for application integration with adjacent infrastructure
10 | (messengers, helpdesk software, etc.);
11 | - Benchmarking environments for conversational models and uniform access
12 | to relevant datasets.
13 |
14 | .. image:: ../_static/dp_agnt_diag.png
15 |
16 |
17 | Key Concepts
18 | ------------
19 |
20 | - A ``Model`` is any NLP model that doesn't necessarily communicate
21 |   with the user in natural language.
22 | - A ``Component`` is a reusable functional part of a ``Model``.
23 | - ``Rule-based Models`` cannot be trained.
24 | - ``Machine Learning Models`` can only be trained standalone.
25 | - ``Deep Learning Models`` can be trained independently or in an
26 |   end-to-end mode when joined in a chain.
27 | - A ``Chainer`` builds a model pipeline from heterogeneous
28 | components (Rule-based/ML/DL). It allows one to train and infer models in
29 | a pipeline as a whole.
30 |
31 | The smallest building block of the library is a ``Component``.
32 | A ``Component`` stands for any kind of function in an NLP pipeline. It can
33 | be implemented as a neural network, a non-neural ML model, or a
34 | rule-based system.
35 |
36 | ``Component``\ s can be joined into a ``Model``. A ``Model``
37 | solves a larger NLP task than a ``Component``. However, in terms of
38 | implementation, ``Model``\ s are not different from ``Component``\ s.
39 |
40 | Most DeepPavlov models are built on top of `PyTorch `__.
41 | Other external libraries can be used to build basic components.
42 |
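43 | As a minimal illustration of these concepts, building a model from a config assembles a
44 | ``Chainer`` pipeline of components that is then called as a whole (the sketch below assumes
45 | the bundled ``insults_kaggle_bert`` config; ``download=True`` fetches its pre-trained files):
46 |
47 | .. code:: python
48 |
49 |     from deeppavlov import build_model
50 |
51 |     # build_model reads the config and chains its components into a single pipeline
52 |     model = build_model('insults_kaggle_bert', download=True)
53 |
54 |     # the whole pipeline is invoked as one callable on a batch of inputs
55 |     print(model(['What a wonderful day!', 'You are so dumb.']))
56 |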
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi>=0.47.0,<=0.89.1
2 | filelock>=3.0.0,<3.10.0
3 | nltk>=3.2.4,<3.10.0
4 | numpy<1.24
5 | pandas>=1.0.0,<1.6.0
6 | prometheus-client>=0.13.0,<=1.16.0
7 | pydantic<2
8 | pybind11==2.10.3
9 | requests>=2.19.0,<3.0.0
10 | scikit-learn>=0.24,<1.1.0;python_version<="3.10"
11 | scikit-learn==1.4.0;python_version=="3.11.*"
12 | tqdm>=4.42.0,<4.65.0
13 | uvicorn>=0.13.0,<0.19.0
14 | wheel
15 | scipy<1.10.0;python_version<"3.8"
16 | scipy==1.10.0;python_version>="3.8"
17 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_configs/doc_retrieval/en_ranker_tfidf_wiki_test.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "odqa_reader",
4 | "data_path": "{DOWNLOADS_PATH}/odqa/enwiki_test",
5 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db",
6 | "dataset_format": "txt"
7 | },
8 | "dataset_iterator": {
9 | "class_name": "sqlite_iterator",
10 | "shuffle": false,
11 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test.db"
12 | },
13 | "chainer": {
14 | "in": [
15 | "docs"
16 | ],
17 | "in_y": [
18 | "doc_ids",
19 | "doc_nums"
20 | ],
21 | "out": [
22 | "tfidf_doc_ids"
23 | ],
24 | "pipe": [
25 | {
26 | "class_name": "hashing_tfidf_vectorizer",
27 | "id": "vectorizer",
28 | "fit_on": [
29 | "docs",
30 | "doc_ids",
31 | "doc_nums"
32 | ],
33 | "save_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz",
34 | "load_path": "{DOWNLOADS_PATH}/odqa/enwiki_test_tfidf.npz",
35 | "tokenizer": {
36 | "class_name": "stream_spacy_tokenizer",
37 | "lemmas": true,
38 | "ngram_range": [
39 | 1,
40 | 2
41 | ]
42 | }
43 | },
44 | {
45 | "class_name": "tfidf_ranker",
46 | "top_n": 20,
47 | "in": [
48 | "docs"
49 | ],
50 | "out": [
51 | "tfidf_doc_ids",
52 | "tfidf_doc_scores"
53 | ],
54 | "vectorizer": "#vectorizer"
55 | }
56 | ]
57 | },
58 | "train": {
59 | "batch_size": 2,
60 | "evaluation_targets": [],
61 | "class_name": "fit_trainer"
62 | },
63 | "metadata": {
64 | "variables": {
65 | "ROOT_PATH": "~/.deeppavlov",
66 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
67 | "MODELS_PATH": "{ROOT_PATH}/models"
68 | },
69 | "download": [
70 | {
71 | "url": "http://files.deeppavlov.ai/datasets/wikipedia/enwiki_test.tar.gz",
72 | "subdir": "{DOWNLOADS_PATH}/odqa"
73 | }
74 | ]
75 | }
76 | }
--------------------------------------------------------------------------------
/tests/test_configs/doc_retrieval/ru_ranker_tfidf_wiki_test.json:
--------------------------------------------------------------------------------
1 | {
2 | "dataset_reader": {
3 | "class_name": "odqa_reader",
4 | "data_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test",
5 | "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test.db",
6 | "dataset_format": "txt"
7 | },
8 | "dataset_iterator": {
9 | "class_name": "sqlite_iterator",
10 | "shuffle": false,
11 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test.db"
12 | },
13 | "chainer": {
14 | "in": [
15 | "docs"
16 | ],
17 | "in_y": [
18 | "doc_ids",
19 | "doc_nums"
20 | ],
21 | "out": [
22 | "tfidf_doc_ids"
23 | ],
24 | "pipe": [
25 | {
26 | "class_name": "hashing_tfidf_vectorizer",
27 | "id": "vectorizer",
28 | "fit_on": [
29 | "docs",
30 | "doc_ids",
31 | "doc_nums"
32 | ],
33 | "save_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz",
34 | "load_path": "{DOWNLOADS_PATH}/odqa/ruwiki_test_tfidf.npz",
35 | "tokenizer": {
36 | "class_name": "stream_spacy_tokenizer",
37 | "spacy_model": "ru_core_news_sm",
38 | "lemmas": true,
39 | "lowercase": true,
40 | "filter_stopwords": true,
41 | "ngram_range": [
42 | 1,
43 | 2
44 | ]
45 | }
46 | },
47 | {
48 | "class_name": "tfidf_ranker",
49 | "top_n": 20,
50 | "in": [
51 | "docs"
52 | ],
53 | "out": [
54 | "tfidf_doc_ids",
55 | "tfidf_doc_scores"
56 | ],
57 | "vectorizer": "#vectorizer"
58 | }
59 | ]
60 | },
61 | "train": {
62 | "batch_size": 2,
63 | "evaluation_targets": [],
64 | "class_name": "fit_trainer"
65 | },
66 | "metadata": {
67 | "variables": {
68 | "ROOT_PATH": "~/.deeppavlov",
69 | "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
70 | "MODELS_PATH": "{ROOT_PATH}/models"
71 | },
72 | "download": [
73 | {
74 | "url": "http://files.deeppavlov.ai/datasets/wikipedia/ruwiki_test.tar.gz",
75 | "subdir": "{DOWNLOADS_PATH}/odqa"
76 | }
77 | ]
78 | }
79 | }
--------------------------------------------------------------------------------
/utils/Docker/Dockerfile:
--------------------------------------------------------------------------------
1 | ARG BASE_IMAGE
2 |
3 | FROM $BASE_IMAGE
4 |
5 | SHELL ["/bin/bash", "-c"]
6 |
7 | ENV DP_PYTEST_API_PORT=5000
8 | ENV DP_PYTEST_NO_CACHE=True
9 | ENV LANG='en_US.UTF-8'
10 |
11 | ARG DEBIAN_FRONTEND=noninteractive
12 | ARG PYTHON_VERSION
13 |
14 | RUN rm -f /etc/apt/sources.list.d/cuda*.list && \
15 | apt update && \
16 | apt install -y --no-install-recommends \
17 | build-essential \
18 | dpkg-dev \
19 | gcc \
20 | git \
21 | libbz2-dev \
22 | libc6-dev \
23 | libexpat1-dev \
24 | libffi-dev \
25 | libgdbm-dev \
26 | liblzma-dev \
27 | libncursesw5-dev \
28 | libreadline-dev \
29 | libsqlite3-dev \
30 | libssl-dev \
31 | libxslt-dev \
32 | locales \
33 | make \
34 | pandoc \
35 | tk-dev \
36 | wget \
37 | xz-utils \
38 | zlib1g-dev && \
39 | locale-gen en_US.UTF-8 && \
40 | wget --no-check-certificate -O python.tar.xz https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tar.xz && \
41 | mkdir -p /usr/src/python && \
42 | tar -xC /usr/src/python --strip-components=1 -f python.tar.xz && \
43 | rm python.tar.xz && \
44 | cd /usr/src/python && \
45 | ./configure && \
46 | make -j "$(nproc)" altinstall && \
47 | ln -s /usr/local/bin/python${PYTHON_VERSION%.*} /usr/local/bin/python && \
48 | ln -s /usr/local/bin/pip${PYTHON_VERSION%.*} /usr/local/bin/pip && \
49 | pip install --upgrade pip && \
50 | pip install pybind11==2.2.4 && \
51 | rm -rf /usr/src/python /var/lib/apt/lists/*
52 |
53 | WORKDIR /app
54 |
55 | # changing the EPOCH build-arg invalidates the cache for all following layers
56 | ARG EPOCH
57 | ENV EPOCH=$EPOCH
58 |
59 | COPY . .
60 |
61 | CMD utils/Docker/cmd.sh
62 |
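
Note: the image compiles the requested Python from source on top of a CUDA base image, and the ARG EPOCH / ENV EPOCH pair is a cache-busting device: passing a new EPOCH value invalidates every layer after it, so COPY . . always picks up the current checkout instead of a cached one. A sketch of a build invocation, with BASE_IMAGE and PYTHON_VERSION values taken from the docker-compose file below for illustration:

    import subprocess, time

    # Example build; a fresh EPOCH forces the COPY layer to rerun.
    subprocess.run(
        ['docker', 'build', '-f', 'utils/Docker/Dockerfile',
         '--build-arg', 'BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04',
         '--build-arg', 'PYTHON_VERSION=3.10.9',
         '--build-arg', f'EPOCH={int(time.time())}',
         '.'],
        check=True)
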
--------------------------------------------------------------------------------
/utils/Docker/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/utils/Docker/README.md
--------------------------------------------------------------------------------
/utils/Docker/cmd.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -e
4 |
5 | pip install .[tests,docs]
6 |
7 | rm -rf `find . -mindepth 1 -maxdepth 1 ! -name tests ! -name Jenkinsfile ! -name docs`
8 |
9 | cd docs
10 | make clean
11 | make html
12 | cd ..
13 |
14 | flake8 `python -c 'import deeppavlov; print(deeppavlov.__path__[0])'` --count --select=E9,F63,F7,F82 --show-source --statistics
15 |
16 | pytest -v --disable-warnings --instafail $PYTEST_ARGS
17 |
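
Note: this is the container test entrypoint. It installs the package with the tests and docs extras, deletes everything in the working directory except tests, Jenkinsfile, and docs (so the suite exercises the installed package rather than the source tree), builds the documentation, runs a fatal-errors-only flake8 pass, and finally runs pytest. The --select list restricts flake8 to syntax errors (E9), invalid comparisons (F63), statement problems such as break outside a loop (F7), and undefined names (F82). A sketch for reproducing the lint step locally against the installed package:

    import subprocess
    import deeppavlov

    # Same fatal-errors-only lint pass as the CI entrypoint.
    subprocess.run(
        ['flake8', deeppavlov.__path__[0], '--count',
         '--select=E9,F63,F7,F82', '--show-source', '--statistics'],
        check=True)
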
--------------------------------------------------------------------------------
/utils/Docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 | services:
3 | py36:
4 | build:
5 | context: ../../
6 | dockerfile: utils/Docker/Dockerfile
7 | args:
8 | - EPOCH=$EPOCH
9 | - PYTHON_VERSION=3.6.15
10 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
11 | user: '${UID}:${GID}'
12 | environment:
13 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_0
14 | - PYTEST_ARGS=$PYTEST_ARGS
15 | - DP_PYTEST_NO_CACHE=True
16 | py37:
17 | build:
18 | context: ../../
19 | dockerfile: utils/Docker/Dockerfile
20 | args:
21 | - EPOCH=$EPOCH
22 | - PYTHON_VERSION=3.7.16
23 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
24 | user: '${UID}:${GID}'
25 | environment:
26 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_1
27 | - PYTEST_ARGS=$PYTEST_ARGS
28 | - DP_PYTEST_NO_CACHE=True
29 | py38:
30 | build:
31 | context: ../../
32 | dockerfile: utils/Docker/Dockerfile
33 | args:
34 | - EPOCH=$EPOCH
35 | - PYTHON_VERSION=3.8.16
36 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
37 | user: '${UID}:${GID}'
38 | environment:
39 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_0
40 | - PYTEST_ARGS=$PYTEST_ARGS
41 | - DP_PYTEST_NO_CACHE=True
42 | py39:
43 | build:
44 | context: ../../
45 | dockerfile: utils/Docker/Dockerfile
46 | args:
47 | - EPOCH=$EPOCH
48 | - PYTHON_VERSION=3.9.16
49 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
50 | user: '${UID}:${GID}'
51 | environment:
52 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_1
53 | - PYTEST_ARGS=$PYTEST_ARGS
54 | - DP_PYTEST_NO_CACHE=True
55 | py310:
56 | build:
57 | context: ../../
58 | dockerfile: utils/Docker/Dockerfile
59 | args:
60 | - EPOCH=$EPOCH
61 | - PYTHON_VERSION=3.10.9
62 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
63 | user: '${UID}:${GID}'
64 | environment:
65 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_0
66 | - PYTEST_ARGS=$PYTEST_ARGS
67 | - DP_PYTEST_NO_CACHE=True
68 | py311:
69 | build:
70 | context: ../../
71 | dockerfile: utils/Docker/Dockerfile
72 | args:
73 | - EPOCH=$EPOCH
74 | - PYTHON_VERSION=3.11.6
75 | - BASE_IMAGE=nvidia/cuda:11.5.2-cudnn8-runtime-ubuntu20.04
76 | user: '${UID}:${GID}'
77 | environment:
78 | - CUDA_VISIBLE_DEVICES=$TEST_GPU_1
79 | - PYTEST_ARGS=$PYTEST_ARGS
80 | - DP_PYTEST_NO_CACHE=True
81 |
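
Note: this file defines the CI test matrix, one service per supported Python version (3.6 through 3.11), all built from the same Dockerfile and CUDA 11.5 base image. The services alternate between TEST_GPU_0 and TEST_GPU_1 so the matrix can be split across two devices. A hypothetical driver for one cell of the matrix; the environment values (GPU ids, PYTEST_ARGS filter) are illustrative:

    import os, subprocess, time

    # Rebuild with a fresh EPOCH and run the py310 service on GPU 0.
    env = dict(os.environ,
               EPOCH=str(int(time.time())),
               UID=str(os.getuid()), GID=str(os.getgid()),
               TEST_GPU_0='0', TEST_GPU_1='1',
               PYTEST_ARGS='-k tfidf')
    subprocess.run(
        ['docker', 'compose', '-f', 'utils/Docker/docker-compose.yml',
         'up', '--build', 'py310'],
        env=env, check=True)
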
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/utils/__init__.py
--------------------------------------------------------------------------------
/utils/prepare/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/deeppavlov/DeepPavlov/5f9fbed0c7191466bc7621e604b810f66f254c03/utils/prepare/__init__.py
--------------------------------------------------------------------------------
/utils/prepare/registry.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Neural Networks and Deep Learning lab, MIPT
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import json
16 | import pkgutil
17 | from importlib import import_module, reload
18 |
19 | import deeppavlov
20 | from deeppavlov.core.common.metrics_registry import _registry_path as m_registry_path, _REGISTRY as M_REGISTRY
21 | from deeppavlov.core.common.registry import _registry_path as c_registry_path, _REGISTRY as C_REGISTRY
22 |
23 | if __name__ == '__main__':
24 | C_REGISTRY.clear()
25 | M_REGISTRY.clear()
26 |
27 | for _, pkg_name, _ in pkgutil.walk_packages(deeppavlov.__path__, deeppavlov.__name__ + '.'):
28 | if pkg_name not in ('deeppavlov.core.common.registry', 'deeppavlov.core.common.metrics_registry'):
29 | reload(import_module(pkg_name))
30 |
31 | with c_registry_path.open('w', encoding='utf-8') as f:
32 | json.dump(dict(sorted(C_REGISTRY.items())), f, indent=2)
33 |
34 | with m_registry_path.open('w', encoding='utf-8') as f:
35 | json.dump(dict(sorted(M_REGISTRY.items())), f, indent=2)
36 |
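
Note: this script regenerates the component and metric registry files. It clears both in-memory registries, imports (and reloads) every deeppavlov subpackage so that each register decorator runs, then writes the sorted name-to-class-path mappings back to the JSON files the library loads at import time; run it from the repository root (e.g. python utils/prepare/registry.py) after adding or renaming a component. A toy illustration of what gets picked up (the component name is made up):

    from deeppavlov.core.common.registry import register

    # Hypothetical component: importing the module that defines it adds
    # 'my_toy_component' to the registry that this script serializes.
    @register('my_toy_component')
    class MyToyComponent:
        def __call__(self, batch):
            return batch
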
--------------------------------------------------------------------------------