├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── feature_request.md ├── pull_request_template.md └── workflows │ └── unit-tests.yml ├── .gitignore ├── CITATION.cff ├── LICENSE ├── README.md ├── docs ├── Makefile ├── annotations │ ├── constituency │ │ ├── ctb.md │ │ ├── index.md │ │ ├── npcmj.md │ │ └── ptb.md │ ├── dep │ │ ├── index.md │ │ ├── pmt.md │ │ ├── sd_en.md │ │ ├── sd_zh.md │ │ └── ud.md │ ├── index.md │ ├── ner │ │ ├── index.md │ │ ├── msra.md │ │ ├── ontonotes.md │ │ └── pku.md │ ├── pos │ │ ├── 863.md │ │ ├── ctb.md │ │ ├── index.md │ │ ├── npcmj.md │ │ ├── pku.md │ │ └── ud.md │ ├── sdp │ │ ├── dm.md │ │ ├── index.md │ │ ├── pas.md │ │ ├── psd.md │ │ └── semeval16.md │ ├── srl │ │ ├── cpb.md │ │ ├── index.md │ │ └── propbank.md │ └── tok │ │ ├── ctb.md │ │ ├── index.md │ │ └── msr.md ├── api │ ├── common │ │ ├── configurable.rst │ │ ├── conll.rst │ │ ├── constant.rst │ │ ├── document.rst │ │ └── index.md │ ├── hanlp │ │ ├── common │ │ │ ├── component.rst │ │ │ ├── dataset.md │ │ │ ├── index.md │ │ │ ├── structure.md │ │ │ ├── torch_component.md │ │ │ ├── transform.md │ │ │ └── vocab.md │ │ ├── components │ │ │ ├── classifiers.md │ │ │ ├── eos.md │ │ │ ├── index.md │ │ │ ├── lemmatizer.md │ │ │ ├── mtl │ │ │ │ ├── index.md │ │ │ │ ├── mtl.md │ │ │ │ └── tasks │ │ │ │ │ ├── constituency.md │ │ │ │ │ ├── dep.md │ │ │ │ │ ├── index.md │ │ │ │ │ ├── lem.md │ │ │ │ │ ├── ner │ │ │ │ │ ├── biaffine_ner.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── tag_ner.md │ │ │ │ │ ├── pos.md │ │ │ │ │ ├── sdp.md │ │ │ │ │ ├── srl │ │ │ │ │ ├── bio_srl.md │ │ │ │ │ ├── index.md │ │ │ │ │ └── rank_srl.md │ │ │ │ │ ├── task.md │ │ │ │ │ ├── tok.md │ │ │ │ │ └── ud.md │ │ │ ├── ner │ │ │ │ ├── biaffine_ner.md │ │ │ │ ├── index.md │ │ │ │ ├── rnn_ner.md │ │ │ │ └── transformer_ner.md │ │ │ ├── parsers │ │ │ │ ├── biaffine_dep.md │ │ │ │ ├── biaffine_sdp.md │ │ │ │ ├── crf_constituency_parser.md │ │ │ │ ├── index.md │ │ │ │ └── ud_parser.md │ │ │ ├── pipeline.md │ │ │ ├── srl │ │ │ │ ├── index.md │ │ │ │ ├── span_bio.md │ │ │ │ └── span_rank.md │ │ │ ├── sts.md │ │ │ ├── taggers │ │ │ │ ├── index.md │ │ │ │ ├── rnn_tagger.md │ │ │ │ └── transformer_tagger.md │ │ │ └── tokenizers │ │ │ │ ├── index.md │ │ │ │ ├── multi_criteria.md │ │ │ │ └── transformer.md │ │ ├── datasets │ │ │ ├── constituency │ │ │ │ ├── constituency_dataset.md │ │ │ │ ├── index.md │ │ │ │ └── resources.md │ │ │ ├── dep │ │ │ │ ├── conll_dataset.md │ │ │ │ ├── index.md │ │ │ │ └── resources.md │ │ │ ├── eos │ │ │ │ ├── eos.md │ │ │ │ ├── index.md │ │ │ │ └── resources.md │ │ │ ├── index.md │ │ │ ├── ner │ │ │ │ ├── index.md │ │ │ │ ├── json.md │ │ │ │ ├── resources.md │ │ │ │ └── tsv.md │ │ │ ├── pos │ │ │ │ ├── index.md │ │ │ │ └── resources.md │ │ │ ├── srl │ │ │ │ ├── conll2012_dataset.md │ │ │ │ ├── index.md │ │ │ │ └── resources.md │ │ │ └── tok │ │ │ │ ├── index.md │ │ │ │ ├── mcws_dataset.md │ │ │ │ ├── resources.md │ │ │ │ └── txt.md │ │ ├── hanlp.rst │ │ ├── index.md │ │ ├── layers │ │ │ ├── decoders │ │ │ │ ├── biaffine_ner.md │ │ │ │ ├── index.md │ │ │ │ └── linear_crf.md │ │ │ ├── embeddings │ │ │ │ ├── char_cnn.md │ │ │ │ ├── char_rnn.md │ │ │ │ ├── embedding.md │ │ │ │ ├── fasttext.md │ │ │ │ ├── index.md │ │ │ │ ├── transformer.md │ │ │ │ └── word2vec.md │ │ │ ├── index.md │ │ │ └── transformers │ │ │ │ ├── encoder.md │ │ │ │ ├── index.md │ │ │ │ └── tokenizer.md │ │ ├── pretrained │ │ │ ├── amr.md │ │ │ ├── amr2text.md │ │ │ ├── constituency.md │ │ │ ├── dep.md │ │ │ ├── eos.md │ │ │ ├── 
fasttext.md │ │ │ ├── glove.md │ │ │ ├── index.md │ │ │ ├── mlm.md │ │ │ ├── mtl.md │ │ │ ├── ner.md │ │ │ ├── pos.md │ │ │ ├── sdp.md │ │ │ ├── srl.md │ │ │ ├── sts.md │ │ │ ├── tok.md │ │ │ └── word2vec.md │ │ └── utils │ │ │ ├── index.md │ │ │ └── io_util.md │ ├── restful.rst │ ├── restful_golang.md │ ├── restful_java.md │ └── trie │ │ ├── dictionary.md │ │ ├── index.md │ │ └── trie.md ├── conf.py ├── configure.md ├── contributing.md ├── data_format.md ├── index.md ├── install.md ├── references.bib ├── references.rst └── tutorial.md ├── hanlp ├── __init__.py ├── callbacks │ ├── __init__.py │ └── fine_csv_logger.py ├── common │ ├── __init__.py │ ├── component.py │ ├── dataset.py │ ├── keras_component.py │ ├── structure.py │ ├── torch_component.py │ ├── transform.py │ ├── transform_tf.py │ ├── vocab.py │ └── vocab_tf.py ├── components │ ├── __init__.py │ ├── amr │ │ ├── __init__.py │ │ ├── amrbart │ │ │ ├── __init__.py │ │ │ ├── bart_amr_generation.py │ │ │ ├── bart_amr_parser.py │ │ │ ├── common │ │ │ │ ├── __init__.py │ │ │ │ ├── constant.py │ │ │ │ ├── penman_interface.py │ │ │ │ └── postprocessing.py │ │ │ ├── data_interface │ │ │ │ ├── __init__.py │ │ │ │ └── dataset.py │ │ │ ├── model_interface │ │ │ │ ├── __init__.py │ │ │ │ ├── modeling_bart.py │ │ │ │ └── tokenization_bart.py │ │ │ └── preprocess │ │ │ │ ├── __init__.py │ │ │ │ ├── amr_io.py │ │ │ │ ├── penman_interface.py │ │ │ │ └── read_and_process.py │ │ └── seq2seq │ │ │ ├── __init__.py │ │ │ ├── dataset │ │ │ ├── IO.py │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ ├── linearization.py │ │ │ ├── penman.py │ │ │ ├── postprocessing.py │ │ │ ├── tokenization_bart.py │ │ │ └── tokenization_t5.py │ │ │ ├── evaluation.py │ │ │ ├── optim.py │ │ │ └── seq2seq_amr_parser.py │ ├── classifiers │ │ ├── __init__.py │ │ ├── fasttext_classifier.py │ │ ├── transformer_classifier.py │ │ ├── transformer_classifier_hf.py │ │ ├── transformer_classifier_tf.py │ │ └── transformer_regression_hf.py │ ├── distillation │ │ ├── __init__.py │ │ ├── distillable_component.py │ │ ├── losses.py │ │ └── schedulers.py │ ├── eos │ │ ├── __init__.py │ │ └── ngram.py │ ├── lambda_wrapper.py │ ├── lemmatizer.py │ ├── lm │ │ ├── __init__.py │ │ └── mlm.py │ ├── mtl │ │ ├── __init__.py │ │ ├── multi_task_learning.py │ │ └── tasks │ │ │ ├── __init__.py │ │ │ ├── amr.py │ │ │ ├── constituency.py │ │ │ ├── dep.py │ │ │ ├── dep_2nd.py │ │ │ ├── lem.py │ │ │ ├── ner │ │ │ ├── __init__.py │ │ │ ├── biaffine_ner.py │ │ │ └── tag_ner.py │ │ │ ├── pos.py │ │ │ ├── sdp.py │ │ │ ├── srl │ │ │ ├── __init__.py │ │ │ ├── bio_srl.py │ │ │ └── rank_srl.py │ │ │ ├── tok │ │ │ ├── __init__.py │ │ │ ├── reg_tok.py │ │ │ └── tag_tok.py │ │ │ └── ud.py │ ├── ner │ │ ├── __init__.py │ │ ├── biaffine_ner │ │ │ ├── __init__.py │ │ │ ├── biaffine_ner.py │ │ │ └── biaffine_ner_model.py │ │ ├── ner_tf.py │ │ ├── rnn_ner.py │ │ └── transformer_ner.py │ ├── parsers │ │ ├── __init__.py │ │ ├── alg.py │ │ ├── alg_tf.py │ │ ├── biaffine │ │ │ ├── __init__.py │ │ │ ├── biaffine.py │ │ │ ├── biaffine_2nd_dep.py │ │ │ ├── biaffine_dep.py │ │ │ ├── biaffine_model.py │ │ │ ├── biaffine_sdp.py │ │ │ ├── mlp.py │ │ │ ├── structual_attention.py │ │ │ └── variationalbilstm.py │ │ ├── biaffine_parser_tf.py │ │ ├── biaffine_tf │ │ │ ├── __init__.py │ │ │ ├── alg.py │ │ │ ├── layers.py │ │ │ └── model.py │ │ ├── chu_liu_edmonds.py │ │ ├── conll.py │ │ ├── constituency │ │ │ ├── __init__.py │ │ │ ├── crf_constituency_model.py │ │ │ ├── crf_constituency_parser.py │ │ │ └── treecrf.py │ │ ├── parse_alg.py │ 
│ └── ud │ │ │ ├── __init__.py │ │ │ ├── lemma_edit.py │ │ │ ├── tag_decoder.py │ │ │ ├── ud_model.py │ │ │ ├── ud_parser.py │ │ │ ├── udify_util.py │ │ │ └── util.py │ ├── pipeline.py │ ├── rnn_language_model_tf.py │ ├── srl │ │ ├── __init__.py │ │ ├── span_bio │ │ │ ├── __init__.py │ │ │ ├── baffine_tagging.py │ │ │ └── span_bio.py │ │ └── span_rank │ │ │ ├── __init__.py │ │ │ ├── highway_variational_lstm.py │ │ │ ├── inference_utils.py │ │ │ ├── layer.py │ │ │ ├── span_rank.py │ │ │ ├── span_ranking_srl_model.py │ │ │ ├── srl_eval_utils.py │ │ │ └── util.py │ ├── sts │ │ ├── __init__.py │ │ └── transformer_sts.py │ ├── taggers │ │ ├── __init__.py │ │ ├── cnn_tagger_tf.py │ │ ├── ngram_conv │ │ │ ├── __init__.py │ │ │ └── ngram_conv_tagger.py │ │ ├── pos_tf.py │ │ ├── rnn │ │ │ ├── __init__.py │ │ │ └── rnntaggingmodel.py │ │ ├── rnn_tagger.py │ │ ├── rnn_tagger_tf.py │ │ ├── tagger.py │ │ ├── tagger_tf.py │ │ ├── transformers │ │ │ ├── __init__.py │ │ │ ├── metrics_tf.py │ │ │ ├── transformer_tagger.py │ │ │ ├── transformer_tagger_tf.py │ │ │ └── transformer_transform_tf.py │ │ └── util.py │ └── tokenizers │ │ ├── __init__.py │ │ ├── multi_criteria_cws_transformer.py │ │ ├── tok.py │ │ ├── tok_tf.py │ │ └── transformer.py ├── datasets │ ├── __init__.py │ ├── classification │ │ ├── __init__.py │ │ └── sentiment.py │ ├── coref │ │ ├── __init__.py │ │ └── loaders │ │ │ ├── __init__.py │ │ │ └── conll12coref.py │ ├── eos │ │ ├── __init__.py │ │ ├── eos.py │ │ └── loaders │ │ │ ├── __init__.py │ │ │ └── nn_eos.py │ ├── lm │ │ ├── __init__.py │ │ └── loaders │ │ │ ├── __init__.py │ │ │ └── lm_dataset.py │ ├── lu │ │ ├── __init__.py │ │ └── glue.py │ ├── ner │ │ ├── __init__.py │ │ ├── conll03.py │ │ ├── loaders │ │ │ ├── __init__.py │ │ │ ├── json_ner.py │ │ │ └── tsv.py │ │ ├── msra.py │ │ ├── resume.py │ │ └── weibo.py │ ├── parsing │ │ ├── __init__.py │ │ ├── amr.py │ │ ├── ctb5.py │ │ ├── ctb7.py │ │ ├── ctb8.py │ │ ├── ctb9.py │ │ ├── loaders │ │ │ ├── __init__.py │ │ │ ├── _ctb_utils.py │ │ │ ├── conll_dataset.py │ │ │ └── constituency_dataset.py │ │ ├── pmt1.py │ │ ├── ptb.py │ │ ├── semeval15.py │ │ ├── semeval16.py │ │ └── ud │ │ │ ├── __init__.py │ │ │ ├── ud210.py │ │ │ ├── ud210m.py │ │ │ ├── ud23.py │ │ │ ├── ud23m.py │ │ │ ├── ud27.py │ │ │ └── ud27m.py │ ├── pos │ │ ├── __init__.py │ │ └── ctb5.py │ ├── qa │ │ ├── __init__.py │ │ └── hotpotqa.py │ ├── srl │ │ ├── __init__.py │ │ ├── loaders │ │ │ ├── __init__.py │ │ │ ├── conll2012.py │ │ │ └── ontonotes_loader.py │ │ └── ontonotes5 │ │ │ ├── __init__.py │ │ │ ├── _utils.py │ │ │ ├── chinese.py │ │ │ └── english.py │ ├── sts │ │ ├── __init__.py │ │ └── stsb.py │ └── tokenization │ │ ├── __init__.py │ │ ├── ctb6.py │ │ ├── loaders │ │ ├── __init__.py │ │ ├── chunking_dataset.py │ │ ├── multi_criteria_cws │ │ │ ├── __init__.py │ │ │ └── mcws_dataset.py │ │ └── txt.py │ │ └── sighan2005 │ │ ├── __init__.py │ │ ├── as_.py │ │ ├── cityu.py │ │ ├── msr.py │ │ └── pku.py ├── layers │ ├── __init__.py │ ├── cnn_encoder.py │ ├── crf │ │ ├── __init__.py │ │ ├── crf.py │ │ ├── crf_layer_tf.py │ │ └── crf_tf.py │ ├── dropout.py │ ├── embeddings │ │ ├── __init__.py │ │ ├── char_cnn.py │ │ ├── char_cnn_tf.py │ │ ├── char_rnn.py │ │ ├── char_rnn_tf.py │ │ ├── concat_embedding.py │ │ ├── contextual_string_embedding.py │ │ ├── contextual_string_embedding_tf.py │ │ ├── contextual_word_embedding.py │ │ ├── embedding.py │ │ ├── fast_text.py │ │ ├── fast_text_tf.py │ │ ├── util.py │ │ ├── util_tf.py │ │ ├── word2vec.py │ │ └── word2vec_tf.py │ ├── 
feed_forward.py │ ├── feedforward.py │ ├── scalar_mix.py │ ├── time_distributed.py │ ├── transformers │ │ ├── __init__.py │ │ ├── encoder.py │ │ ├── loader_tf.py │ │ ├── pt_imports.py │ │ ├── relative_transformer.py │ │ ├── resource.py │ │ ├── tf_imports.py │ │ ├── utils.py │ │ └── utils_tf.py │ └── weight_normalization.py ├── losses │ ├── __init__.py │ └── sparse_categorical_crossentropy.py ├── metrics │ ├── __init__.py │ ├── accuracy.py │ ├── amr │ │ ├── __init__.py │ │ └── smatch_eval.py │ ├── chunking │ │ ├── __init__.py │ │ ├── binary_chunking_f1.py │ │ ├── bmes_tf.py │ │ ├── chunking_f1.py │ │ ├── chunking_f1_tf.py │ │ ├── conlleval.py │ │ ├── iobes_tf.py │ │ └── sequence_labeling.py │ ├── f1.py │ ├── metric.py │ ├── mtl.py │ ├── parsing │ │ ├── __init__.py │ │ ├── attachmentscore.py │ │ ├── conllx_eval.py │ │ ├── labeled_f1.py │ │ ├── labeled_f1_tf.py │ │ ├── labeled_score.py │ │ ├── semdep_eval.py │ │ └── span.py │ ├── spearman_correlation.py │ └── srl │ │ ├── __init__.py │ │ └── srlconll.py ├── optimizers │ ├── __init__.py │ └── adamw │ │ ├── __init__.py │ │ └── optimization.py ├── pretrained │ ├── __init__.py │ ├── amr.py │ ├── amr2text.py │ ├── classifiers.py │ ├── constituency.py │ ├── dep.py │ ├── eos.py │ ├── fasttext.py │ ├── glove.py │ ├── mtl.py │ ├── ner.py │ ├── pos.py │ ├── rnnlm.py │ ├── sdp.py │ ├── srl.py │ ├── sts.py │ ├── tok.py │ └── word2vec.py ├── transform │ ├── __init__.py │ ├── conll_tf.py │ ├── glue_tf.py │ ├── table_tf.py │ ├── tacred_tf.py │ ├── text_tf.py │ ├── transformer_tokenizer.py │ ├── tsv_tf.py │ └── txt_tf.py ├── utils │ ├── __init__.py │ ├── component_util.py │ ├── file_read_backwards │ │ ├── __init__.py │ │ ├── buffer_work_space.py │ │ └── file_read_backwards.py │ ├── init_util.py │ ├── io_util.py │ ├── lang │ │ ├── __init__.py │ │ ├── en │ │ │ ├── __init__.py │ │ │ └── english_tokenizer.py │ │ ├── ja │ │ │ ├── __init__.py │ │ │ └── bert_tok.py │ │ └── zh │ │ │ ├── __init__.py │ │ │ ├── char_table.py │ │ │ └── localization.py │ ├── log_util.py │ ├── rules.py │ ├── span_util.py │ ├── string_util.py │ ├── tf_util.py │ ├── time_util.py │ └── torch_util.py └── version.py ├── plugins ├── README.md ├── hanlp_common │ ├── README.md │ ├── __init__.py │ ├── hanlp_common │ │ ├── __init__.py │ │ ├── amr.py │ │ ├── configurable.py │ │ ├── conll.py │ │ ├── constant.py │ │ ├── document.py │ │ ├── io.py │ │ ├── reflection.py │ │ ├── structure.py │ │ ├── util.py │ │ └── visualization.py │ └── setup.py ├── hanlp_demo │ ├── README.md │ ├── hanlp_demo │ │ ├── __init__.py │ │ ├── block_windows.py │ │ ├── en │ │ │ ├── __init__.py │ │ │ ├── demo_amr.py │ │ │ ├── demo_dep.py │ │ │ ├── demo_lm.py │ │ │ ├── demo_ner.py │ │ │ ├── demo_pipeline.py │ │ │ ├── demo_pos.py │ │ │ ├── demo_sdp.py │ │ │ ├── demo_sentiment_analysis.py │ │ │ ├── demo_tok.py │ │ │ └── train_sst2_albert_base.py │ │ ├── ja │ │ │ ├── __init__.py │ │ │ └── demo_mtl.py │ │ ├── mul │ │ │ ├── __init__.py │ │ │ ├── demo_lid.py │ │ │ ├── demo_lid_restful.py │ │ │ ├── demo_mtl.py │ │ │ └── train │ │ │ │ ├── __init__.py │ │ │ │ └── mul_base.py │ │ ├── sent_split.py │ │ └── zh │ │ │ ├── __init__.py │ │ │ ├── abstractive_summarization_restful.ipynb │ │ │ ├── amr_restful.ipynb │ │ │ ├── amr_stl.ipynb │ │ │ ├── classification_restful.ipynb │ │ │ ├── con_mtl.ipynb │ │ │ ├── con_restful.ipynb │ │ │ ├── con_stl.ipynb │ │ │ ├── cor_restful.ipynb │ │ │ ├── demo_amr.py │ │ │ ├── demo_custom_dict.py │ │ │ ├── demo_custom_dict_stl.py │ │ │ ├── demo_del_tasks.py │ │ │ ├── demo_document.py │ │ │ ├── demo_mlm.py │ │ │ ├── 
demo_mtl.py │ │ │ ├── demo_ner_dict.py │ │ │ ├── demo_parse_constituency.py │ │ │ ├── demo_pipeline.py │ │ │ ├── demo_pos_dict.py │ │ │ ├── demo_sts.py │ │ │ ├── demo_word2vec.py │ │ │ ├── dep_mtl.ipynb │ │ │ ├── dep_restful.ipynb │ │ │ ├── dep_stl.ipynb │ │ │ ├── extractive_summarization_restful.ipynb │ │ │ ├── gec_restful.ipynb │ │ │ ├── keyphrase_restful.ipynb │ │ │ ├── lid_restful.ipynb │ │ │ ├── lid_stl.ipynb │ │ │ ├── ner_mtl.ipynb │ │ │ ├── ner_restful.ipynb │ │ │ ├── ner_stl.ipynb │ │ │ ├── pos_mtl.ipynb │ │ │ ├── pos_restful.ipynb │ │ │ ├── pos_stl.ipynb │ │ │ ├── sdp_mtl.ipynb │ │ │ ├── sdp_restful.ipynb │ │ │ ├── sdp_stl.ipynb │ │ │ ├── sentiment_restful.ipynb │ │ │ ├── srl_mtl.ipynb │ │ │ ├── srl_restful.ipynb │ │ │ ├── srl_stl.ipynb │ │ │ ├── sts_restful.ipynb │ │ │ ├── sts_stl.ipynb │ │ │ ├── tf │ │ │ ├── __init__.py │ │ │ ├── demo_classifier.py │ │ │ ├── demo_client.py │ │ │ ├── demo_cws.py │ │ │ ├── demo_cws_trie.py │ │ │ ├── demo_dep.py │ │ │ ├── demo_fasttext.py │ │ │ ├── demo_multiprocess.py │ │ │ ├── demo_ner.py │ │ │ ├── demo_pipeline.py │ │ │ ├── demo_pos.py │ │ │ ├── demo_sdp.py │ │ │ ├── demo_serving.py │ │ │ └── train │ │ │ │ ├── __init__.py │ │ │ │ ├── cws │ │ │ │ ├── __init__.py │ │ │ │ ├── train_ctb6_cws_albert.py │ │ │ │ ├── train_ctb6_cws_bert.py │ │ │ │ ├── train_ctb6_cws_convseg.py │ │ │ │ ├── train_large_bert_cws.py │ │ │ │ ├── train_large_conv_cws.py │ │ │ │ ├── train_large_cws_albert.py │ │ │ │ ├── train_large_cws_electra.py │ │ │ │ ├── train_large_rnn_cws.py │ │ │ │ ├── train_msr_cws_albert.py │ │ │ │ ├── train_msr_cws_bert.py │ │ │ │ ├── train_msr_cws_ngram_conv.py │ │ │ │ ├── train_msr_cws_ngram_conv_embed.py │ │ │ │ ├── train_pku980106_conv_cws.py │ │ │ │ ├── train_pku980106_rnn_cws.py │ │ │ │ └── train_pku_conv_cws.py │ │ │ │ ├── finetune_msra_ner_albert.py │ │ │ │ ├── train_chnsenticorp_bert.py │ │ │ │ ├── train_conll03_ner_bert.py │ │ │ │ ├── train_conll03_ner_flair.py │ │ │ │ ├── train_ctb5_dep.py │ │ │ │ ├── train_ctb5_pos_rnn.py │ │ │ │ ├── train_ctb7_dep.py │ │ │ │ ├── train_ctb9_pos_albert.py │ │ │ │ ├── train_ctb9_pos_electra.py │ │ │ │ ├── train_msra_ner_albert.py │ │ │ │ ├── train_msra_ner_bert.py │ │ │ │ ├── train_msra_ner_electra.py │ │ │ │ ├── train_msra_ner_ngram_conv.py │ │ │ │ ├── train_msra_ner_rnn.py │ │ │ │ ├── train_ptb_dep_biaffine_albert.py │ │ │ │ ├── train_ptb_dep_biaffine_bert.py │ │ │ │ ├── train_ptb_dep_biaffine_bert_96.6.py │ │ │ │ ├── train_ptb_dep_biaffine_bert_positional.py │ │ │ │ ├── train_ptb_dep_sa_albert.py │ │ │ │ ├── train_ptb_dep_sa_albert_topk.py │ │ │ │ ├── train_ptb_dep_sa_bert.py │ │ │ │ ├── train_ptb_dep_sa_pos_bert.py │ │ │ │ ├── train_ptb_pos_rnn_fasttext.py │ │ │ │ ├── train_semeval15_dm.py │ │ │ │ ├── train_semeval15_pas.py │ │ │ │ ├── train_semeval15_psd.py │ │ │ │ ├── train_semeval16_news.py │ │ │ │ └── train_semeval16_text.py │ │ │ ├── tok_mtl.ipynb │ │ │ ├── tok_restful.ipynb │ │ │ ├── tok_stl.ipynb │ │ │ ├── train │ │ │ ├── __init__.py │ │ │ ├── finetune_ner.py │ │ │ ├── open_base.py │ │ │ └── open_small.py │ │ │ ├── train_sota_bert_pku.py │ │ │ ├── tst_restful.ipynb │ │ │ └── tutorial.ipynb │ └── setup.py ├── hanlp_restful │ ├── README.md │ ├── hanlp_restful │ │ └── __init__.py │ ├── setup.py │ └── tests │ │ ├── __init__.py │ │ └── test_client.py ├── hanlp_restful_golang │ └── README.md ├── hanlp_restful_java │ ├── pom.xml │ └── src │ │ ├── main │ │ └── java │ │ │ └── com │ │ │ └── hankcs │ │ │ └── hanlp │ │ │ └── restful │ │ │ ├── BaseInput.java │ │ │ ├── CoreferenceResolutionOutput.java │ │ │ 
├── DocumentInput.java │ │ │ ├── HanLPClient.java │ │ │ ├── SentenceInput.java │ │ │ ├── Span.java │ │ │ ├── TokenInput.java │ │ │ └── mrp │ │ │ ├── Anchor.java │ │ │ ├── Edge.java │ │ │ ├── MeaningRepresentation.java │ │ │ └── Node.java │ │ └── test │ │ └── java │ │ └── com │ │ └── hankcs │ │ └── hanlp │ │ └── restful │ │ ├── HanLPClientTest.java │ │ └── MeaningRepresentationTest.java └── hanlp_trie │ ├── README.md │ ├── hanlp_trie │ ├── __init__.py │ ├── dictionary.py │ └── trie.py │ ├── setup.py │ └── tests │ ├── __init__.py │ ├── test_trie.py │ └── test_trie_dict.py ├── setup.py └── tests ├── __init__.py ├── test_config_tracker.py ├── test_mtl.py ├── test_pipeline.py ├── test_rules.py └── test_string_util.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🐛 Found a bug 3 | about: Please include your version number, code to reproduce, and error logs 4 | title: '' 5 | labels: bug 6 | assignees: hankcs 7 | 8 | --- 9 | 10 | 13 | 14 | **Describe the bug** 15 | A clear and concise description of what the bug is. 16 | 17 | **Code to reproduce the issue** 18 | Provide a reproducible test case that is the bare minimum necessary to generate the problem. 19 | 20 | ```python 21 | ``` 22 | 23 | **Describe the current behavior** 24 | A clear and concise description of what happened. 25 | 26 | **Expected behavior** 27 | A clear and concise description of what you expected to happen. 28 | 29 | **System information** 30 | - OS Platform and Distribution (e.g., Linux Ubuntu 16.04): 31 | - Python version: 32 | - HanLP version: 33 | 34 | **Other info / logs** 35 | Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached. 36 | 37 | * [ ] I've completed this form and searched the web for solutions. 38 | 39 | 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: ⁉️ For questions and help, please use the forum 4 | url: https://bbs.hankcs.com/ 5 | about: You are welcome to ask for help on the Butterfly Effect forum 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🚀 Feature request 3 | about: Suggest adding a new feature 4 | title: '' 5 | labels: feature request 6 | assignees: hankcs 7 | 8 | --- 9 | 10 | 17 | 18 | **Describe the feature and the current behavior/state.** 19 | 20 | **Will this change the current API? How?** 21 | 22 | **Who will benefit from this feature?** 23 | 24 | **Are you willing to contribute it (Yes/No):** 25 | 26 | **System information** 27 | - OS Platform and Distribution (e.g., Linux Ubuntu 16.04): 28 | - Python version: 29 | - HanLP version: 30 | 31 | **Any other info** 32 | 33 | * [ ] I've carefully completed this form. 34 | 35 | 36 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/annotations/constituency/index.md: -------------------------------------------------------------------------------- 1 | # Constituency Parsing 2 | 3 | ## Chinese 4 | ```{toctree} 5 | ctb 6 | ``` 7 | 8 | ## English 9 | ```{toctree} 10 | ptb 11 | ``` 12 | 13 | ## Japanese 14 | ```{toctree} 15 | npcmj 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /docs/annotations/dep/index.md: -------------------------------------------------------------------------------- 1 | # Dependency Parsing 2 | 3 | ## Chinese 4 | 5 | ```{toctree} 6 | sd_zh 7 | pmt 8 | ``` 9 | 10 | ## English 11 | 12 | ```{toctree} 13 | sd_en 14 | ``` 15 | 16 | ## Multilingual 17 | 18 | ```{toctree} 19 | ud 20 | ``` 21 | -------------------------------------------------------------------------------- /docs/annotations/index.md: -------------------------------------------------------------------------------- 1 | # Annotations 2 | 3 | 4 | ```{toctree} 5 | tok/index 6 | pos/index 7 | ner/index 8 | dep/index 9 | sdp/index 10 | srl/index 11 | constituency/index 12 | ``` 13 | 14 | -------------------------------------------------------------------------------- /docs/annotations/ner/index.md: -------------------------------------------------------------------------------- 1 | # Named Entity Recognition 2 | 3 | ## Chinese 4 | 5 | ```{toctree} 6 | pku 7 | msra 8 | ``` 9 | 10 | ## Multilingual 11 | 12 | ```{toctree} 13 | ontonotes 14 | ``` 15 | -------------------------------------------------------------------------------- /docs/annotations/pos/index.md: -------------------------------------------------------------------------------- 1 | # Part-of-Speech Tagging 2 | 3 | ## Chinese 4 | ```{toctree} 5 | ctb 6 | pku 7 | 863 8 | ``` 9 | 10 | ## Japanese 11 | ```{toctree} 12 | npcmj 13 | ``` 14 | 15 | ## Multilingual 16 | 17 | ```{toctree} 18 | ud 19 | ``` 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/annotations/sdp/dm.md: -------------------------------------------------------------------------------- 1 | # The Reduction of Minimal Recursion Semantics 2 | 3 | Please refer to [Minimal Recursion Semantics: An Introduction](https://www.cl.cam.ac.uk/~aac10/papers/mrs.pdf). 4 | -------------------------------------------------------------------------------- /docs/annotations/sdp/index.md: -------------------------------------------------------------------------------- 1 | # Semantic Dependency Parsing 2 | 3 | ## Chinese 4 | 5 | ```{toctree} 6 | semeval16 7 | ``` 8 | 9 | ## English 10 | 11 | ```{toctree} 12 | dm 13 | pas 14 | psd 15 | ``` 16 | 17 | -------------------------------------------------------------------------------- /docs/annotations/sdp/pas.md: -------------------------------------------------------------------------------- 1 | # Predicate-Argument Structures 2 | 3 | Please refer to [Probabilistic disambiguation models for wide-coverage HPSG parsing](https://www.aclweb.org/anthology/P05-1011.pdf). 
4 | -------------------------------------------------------------------------------- /docs/annotations/sdp/psd.md: -------------------------------------------------------------------------------- 1 | # Prague Czech-English Dependency Treebank 2 | 3 | Please refer to [Prague Czech-English Dependency Treebank](http://ufal.mff.cuni.cz/pcedt2.0/en/index.html). 4 | -------------------------------------------------------------------------------- /docs/annotations/srl/index.md: -------------------------------------------------------------------------------- 1 | # Semantic Role Labeling 2 | 3 | ## Chinese 4 | ```{toctree} 5 | cpb 6 | ``` 7 | 8 | ## English 9 | ```{toctree} 10 | propbank 11 | ``` 12 | 13 | -------------------------------------------------------------------------------- /docs/annotations/tok/index.md: -------------------------------------------------------------------------------- 1 | # Tokenization 2 | 3 | ## Chinese 4 | ```{toctree} 5 | ctb 6 | msr 7 | ``` -------------------------------------------------------------------------------- /docs/api/common/configurable.rst: -------------------------------------------------------------------------------- 1 | .. _api/configurable: 2 | 3 | configurable 4 | ==================== 5 | 6 | 7 | .. autoclass:: hanlp_common.configurable.Configurable 8 | :members: 9 | 10 | .. autoclass:: hanlp_common.configurable.AutoConfigurable 11 | :members: 12 | -------------------------------------------------------------------------------- /docs/api/common/conll.rst: -------------------------------------------------------------------------------- 1 | .. _api/conll: 2 | 3 | conll 4 | ==================== 5 | 6 | 7 | .. autoclass:: hanlp_common.conll.CoNLLWord 8 | :members: 9 | 10 | .. autoclass:: hanlp_common.conll.CoNLLUWord 11 | :members: 12 | 13 | .. autoclass:: hanlp_common.conll.CoNLLSentence 14 | :members: -------------------------------------------------------------------------------- /docs/api/common/constant.rst: -------------------------------------------------------------------------------- 1 | constant 2 | ==================== 3 | 4 | 5 | .. automodule:: hanlp_common.constant 6 | :members: 7 | -------------------------------------------------------------------------------- /docs/api/common/document.rst: -------------------------------------------------------------------------------- 1 | .. _api/document: 2 | 3 | document 4 | ==================== 5 | 6 | .. currentmodule:: hanlp_common 7 | 8 | .. autoclass:: hanlp_common.document.Document 9 | :members: 10 | -------------------------------------------------------------------------------- /docs/api/common/index.md: -------------------------------------------------------------------------------- 1 | # hanlp_common 2 | 3 | Common APIs shared between `hanlp` and `restful`. 4 | 5 | ```{toctree} 6 | document 7 | conll 8 | configurable 9 | constant 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/common/component.rst: -------------------------------------------------------------------------------- 1 | component 2 | ================= 3 | 4 | .. currentmodule:: hanlp.common 5 | 6 | .. autoclass:: hanlp.common.component.Component 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/api/hanlp/common/index.md: -------------------------------------------------------------------------------- 1 | # common 2 | 3 | Common base classes. 
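These base classes give every component the same lifecycle: load a pretrained model with `hanlp.load`, then call the resulting component like a function. A minimal sketch follows; the tokenizer constant `COARSE_ELECTRA_SMALL_ZH` and the sample output are assumptions that may differ across HanLP versions.

```python
# Minimal sketch of the shared component interface, assuming
# hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH exists in your version.
import hanlp

tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)

# A loaded component derives from hanlp.common.component.Component,
# whose __call__ delegates to predict(), so it can be invoked directly.
print(tok('商品和服务'))  # e.g. ['商品', '和', '服务']
```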
4 | 5 | ```{toctree} 6 | structure 7 | vocab 8 | transform 9 | dataset 10 | component 11 | torch_component 12 | ``` 13 | 14 | -------------------------------------------------------------------------------- /docs/api/hanlp/common/structure.md: -------------------------------------------------------------------------------- 1 | # structure 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.common 5 | 6 | .. autoclass:: hanlp.common.structure.ConfigTracker 7 | :members: 8 | 9 | .. autoclass:: hanlp.common.structure.History 10 | :members: 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/common/torch_component.md: -------------------------------------------------------------------------------- 1 | # torch_component 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.common.torch_component 5 | 6 | .. autoclass:: hanlp.common.torch_component.TorchComponent 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/common/transform.md: -------------------------------------------------------------------------------- 1 | # transform 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.common 5 | 6 | .. autoclass:: hanlp.common.transform.VocabDict 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/common/vocab.md: -------------------------------------------------------------------------------- 1 | # vocab 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.common 5 | 6 | .. autoclass:: hanlp.common.transform.Vocab 7 | :members: 8 | :special-members: 9 | :exclude-members: __init__, __repr__, __call__, __str__ 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/classifiers.md: -------------------------------------------------------------------------------- 1 | # classifiers 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.components.classifiers 5 | 6 | .. autoclass:: hanlp.components.classifiers.transformer_classifier.TransformerClassifier 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/eos.md: -------------------------------------------------------------------------------- 1 | # eos 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.components.eos 5 | 6 | .. autoclass:: hanlp.components.eos.ngram.NgramSentenceBoundaryDetector 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/index.md: -------------------------------------------------------------------------------- 1 | # components 2 | 3 | NLP components. 4 | 5 | ```{toctree} 6 | mtl/index 7 | classifiers 8 | eos 9 | tokenizers/index 10 | lemmatizer 11 | taggers/index 12 | ner/index 13 | parsers/index 14 | srl/index 15 | pipeline 16 | sts 17 | ``` 18 | 19 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/lemmatizer.md: -------------------------------------------------------------------------------- 1 | # lemmatizer 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.components.lemmatizer 5 | 6 | .. 
autoclass:: TransformerLemmatizer 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/index.md: -------------------------------------------------------------------------------- 1 | # mtl 2 | 3 | Multi-Task Learning (MTL) framework. 4 | 5 | ```{toctree} 6 | mtl 7 | tasks/index 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/mtl.md: -------------------------------------------------------------------------------- 1 | # MultiTaskLearning 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.components.mtl 5 | 6 | .. autoclass:: hanlp.components.mtl.multi_task_learning.MultiTaskLearning 7 | :members: 8 | :special-members: 9 | :exclude-members: __init__, __repr__ 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/constituency.md: -------------------------------------------------------------------------------- 1 | # con 2 | 3 | Constituency parsing. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.constituency.CRFConstituencyParsing 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/dep.md: -------------------------------------------------------------------------------- 1 | # dep 2 | 3 | Dependency parsing. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.dep.BiaffineDependencyParsing 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/index.md: -------------------------------------------------------------------------------- 1 | # tasks 2 | 3 | Multi-Task Learning (MTL) tasks. 4 | 5 | ```{toctree} 6 | task 7 | constituency 8 | dep 9 | sdp 10 | ud 11 | lem 12 | pos 13 | tok 14 | ner/index 15 | srl/index 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/lem.md: -------------------------------------------------------------------------------- 1 | # lem 2 | 3 | Lemmatization. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.lem.TransformerLemmatization 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/ner/biaffine_ner.md: -------------------------------------------------------------------------------- 1 | # biaffine_ner 2 | 3 | Biaffine Named Entity Recognition. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.ner.biaffine_ner.BiaffineNamedEntityRecognition 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/ner/index.md: -------------------------------------------------------------------------------- 1 | # ner 2 | 3 | Named Entity Recognition. 
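Within the MTL framework, NER runs as one task on top of a shared encoder and can be invoked on its own. A minimal sketch follows; the pretrained constant, the `tasks='ner*'` wildcard, and the `ner/msra` output key are assumptions that may vary across HanLP versions.

```python
# Minimal sketch of running only the NER task of a multi-task model,
# assuming the pretrained constant below exists in your HanLP version.
import hanlp

HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)

# Restrict prediction to NER tasks; prerequisite tasks such as tok still run.
doc = HanLP('阿婆主来到北京立方庭参观自然语义科技公司', tasks='ner*')

# Each entity is a (text, type, begin, end) span over the tokens.
print(doc['ner/msra'])
```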
4 | 5 | ```{toctree} 6 | tag_ner 7 | biaffine_ner 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/ner/tag_ner.md: -------------------------------------------------------------------------------- 1 | # tag_ner 2 | 3 | Tagging based Named Entity Recognition. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.ner.tag_ner.TaggingNamedEntityRecognition 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/pos.md: -------------------------------------------------------------------------------- 1 | # pos 2 | 3 | Part-of-speech tagging. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.pos.TransformerTagging 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/sdp.md: -------------------------------------------------------------------------------- 1 | # sdp 2 | 3 | Semantic Dependency Parsing. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.sdp.BiaffineSemanticDependencyParsing 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/srl/bio_srl.md: -------------------------------------------------------------------------------- 1 | # bio_srl 2 | 3 | BIO Tagging based Semantic Role Labeling. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.srl.bio_srl.SpanBIOSemanticRoleLabeling 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/srl/index.md: -------------------------------------------------------------------------------- 1 | # srl 2 | 3 | Semantic Role Labeling. 4 | 5 | ```{toctree} 6 | bio_srl 7 | rank_srl 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/srl/rank_srl.md: -------------------------------------------------------------------------------- 1 | # rank_srl 2 | 3 | Span Ranking Semantic Role Labeling. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.srl.rank_srl.SpanRankingSemanticRoleLabeling 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/task.md: -------------------------------------------------------------------------------- 1 | # Task 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.components.mtl 5 | 6 | .. 
autoclass:: hanlp.components.mtl.tasks.Task 7 | :members: 8 | :exclude-members: execute_training_loop, fit_dataloader 9 | 10 | ``` 11 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/tok.md: -------------------------------------------------------------------------------- 1 | # tok 2 | 3 | Tokenization. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.tok.tag_tok.TaggingTokenization 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/mtl/tasks/ud.md: -------------------------------------------------------------------------------- 1 | # ud 2 | 3 | Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing). 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.mtl 7 | 8 | .. autoclass:: hanlp.components.mtl.tasks.ud.UniversalDependenciesParsing 9 | :members: 10 | :exclude-members: execute_training_loop, fit_dataloader 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/ner/biaffine_ner.md: -------------------------------------------------------------------------------- 1 | # biaffine_ner 2 | 3 | Biaffine Named Entity Recognition. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.ner.biaffine_ner 7 | 8 | .. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner.BiaffineNamedEntityRecognizer 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/ner/index.md: -------------------------------------------------------------------------------- 1 | # ner 2 | 3 | Named Entity Recognition. 4 | 5 | ```{toctree} 6 | transformer_ner 7 | rnn_ner 8 | biaffine_ner 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/ner/rnn_ner.md: -------------------------------------------------------------------------------- 1 | # rnn_ner 2 | 3 | Tagging based Named Entity Recognition. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.ner.rnn_ner 7 | 8 | .. autoclass:: hanlp.components.ner.rnn_ner.RNNNamedEntityRecognizer 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/ner/transformer_ner.md: -------------------------------------------------------------------------------- 1 | # transformer_ner 2 | 3 | Tagging based Named Entity Recognition. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.ner.transformer_ner 7 | 8 | .. autoclass:: hanlp.components.ner.transformer_ner.TransformerNamedEntityRecognizer 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/parsers/biaffine_dep.md: -------------------------------------------------------------------------------- 1 | # biaffine_dep 2 | 3 | Biaffine dependency parser. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components 7 | 8 | .. 
autoclass:: hanlp.components.parsers.biaffine.biaffine_dep.BiaffineDependencyParser 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/parsers/biaffine_sdp.md: -------------------------------------------------------------------------------- 1 | # biaffine_sdp 2 | 3 | Biaffine semantic dependency parser. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components 7 | 8 | .. autoclass:: hanlp.components.parsers.biaffine.biaffine_sdp.BiaffineSemanticDependencyParser 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/parsers/crf_constituency_parser.md: -------------------------------------------------------------------------------- 1 | # crf_constituency_parser 2 | 3 | CRF constituency parser. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components 7 | 8 | .. autoclass:: hanlp.components.parsers.constituency.crf_constituency_parser.CRFConstituencyParser 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/parsers/index.md: -------------------------------------------------------------------------------- 1 | # parsers 2 | 3 | Parsers. 4 | 5 | ```{toctree} 6 | biaffine_dep 7 | biaffine_sdp 8 | ud_parser 9 | crf_constituency_parser 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/parsers/ud_parser.md: -------------------------------------------------------------------------------- 1 | # ud_parser 2 | 3 | Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing). 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components 7 | 8 | .. autoclass:: hanlp.components.parsers.ud.ud_parser.UniversalDependenciesParser 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/pipeline.md: -------------------------------------------------------------------------------- 1 | # pipeline 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.components.pipeline 5 | 6 | .. autoclass:: hanlp.components.pipeline.Pipe 7 | :members: 8 | 9 | .. autoclass:: hanlp.components.pipeline.Pipeline 10 | :members: 11 | 12 | ``` 13 | 14 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/srl/index.md: -------------------------------------------------------------------------------- 1 | # srl 2 | 3 | Semantic Role Labelers. 4 | 5 | ```{toctree} 6 | span_rank 7 | span_bio 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/srl/span_bio.md: -------------------------------------------------------------------------------- 1 | # span_bio 2 | 3 | Span BIO tagging based SRL. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.srl.span_bio.span_bio 7 | 8 | .. autoclass:: SpanBIOSemanticRoleLabeler 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/srl/span_rank.md: -------------------------------------------------------------------------------- 1 | # span_rank 2 | 3 | Span Rank based SRL. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.srl.span_rank.span_rank 7 | 8 | .. 
autoclass:: SpanRankingSemanticRoleLabeler 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/sts.md: -------------------------------------------------------------------------------- 1 | # sts 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.components.sts 5 | 6 | .. autoclass:: hanlp.components.sts.transformer_sts.TransformerSemanticTextualSimilarity 7 | :members: 8 | 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/taggers/index.md: -------------------------------------------------------------------------------- 1 | # taggers 2 | 3 | Taggers. 4 | 5 | ```{toctree} 6 | transformer_tagger 7 | rnn_tagger 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/taggers/rnn_tagger.md: -------------------------------------------------------------------------------- 1 | # rnn_tagger 2 | 3 | RNN based tagger. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components 7 | 8 | .. autoclass:: hanlp.components.taggers.rnn_tagger.RNNTagger 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/taggers/transformer_tagger.md: -------------------------------------------------------------------------------- 1 | # transformer_tagger 2 | 3 | Transformer based tagger. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components 7 | 8 | .. autoclass:: hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/tokenizers/index.md: -------------------------------------------------------------------------------- 1 | # tokenizers 2 | 3 | Tokenizers. 4 | 5 | ```{toctree} 6 | transformer 7 | multi_criteria 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/tokenizers/multi_criteria.md: -------------------------------------------------------------------------------- 1 | # multi_criteria 2 | 3 | Transformer based Multi-Criteria Word tokenizer. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.tokenizers.multi_criteria_cws_transformer 7 | 8 | .. autoclass:: hanlp.components.tokenizers.multi_criteria_cws_transformer.MultiCriteriaTransformerTaggingTokenizer 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/components/tokenizers/transformer.md: -------------------------------------------------------------------------------- 1 | # transformer 2 | 3 | Transformer based tokenizer. 4 | 5 | ```{eval-rst} 6 | .. currentmodule:: hanlp.components.tokenizers.transformer 7 | 8 | .. autoclass:: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/constituency/constituency_dataset.md: -------------------------------------------------------------------------------- 1 | # constituency_dataset 2 | 3 | ```{eval-rst} 4 | 5 | .. 
autoclass:: hanlp.datasets.parsing.loaders.constituency_dataset.ConstituencyDataset 6 | :members: 7 | 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/constituency/index.md: -------------------------------------------------------------------------------- 1 | # con 2 | 3 | Constituency parsing datasets. 4 | 5 | ```{toctree} 6 | constituency_dataset 7 | resources 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/constituency/resources.md: -------------------------------------------------------------------------------- 1 | # resources 2 | 3 | ## Chinese Treebank 4 | 5 | 6 | ### CTB8 7 | 8 | 9 | 10 | ````{margin} **Discussion** 11 | ```{seealso} 12 | See the discussion of our data split on [our forum](https://bbs.hankcs.com/t/topic/3024). 13 | ``` 14 | ```` 15 | 16 | ```{eval-rst} 17 | 18 | 19 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TRAIN 20 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_DEV 21 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TEST 22 | 23 | ``` 24 | 25 | ### CTB9 26 | 27 | ````{margin} **Discussion** 28 | ```{seealso} 29 | See the discussion of our data split on [our forum](https://bbs.hankcs.com/t/topic/3024). 30 | ``` 31 | ```` 32 | 33 | ```{eval-rst} 34 | 35 | 36 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TRAIN 37 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_DEV 38 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TEST 39 | 40 | ``` 41 | 42 | ## English Treebank 43 | 44 | ### PTB 45 | 46 | ```{eval-rst} 47 | 48 | .. autodata:: hanlp.datasets.parsing.ptb.PTB_TRAIN 49 | .. autodata:: hanlp.datasets.parsing.ptb.PTB_DEV 50 | .. autodata:: hanlp.datasets.parsing.ptb.PTB_TEST 51 | 52 | ``` 53 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/dep/conll_dataset.md: -------------------------------------------------------------------------------- 1 | # conll 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.datasets.parsing.loaders.conll_dataset 5 | 6 | 7 | .. autoclass:: CoNLLParsingDataset 8 | :members: 9 | 10 | ``` 11 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/dep/index.md: -------------------------------------------------------------------------------- 1 | # dep 2 | 3 | Dependency parsing datasets. 4 | 5 | ```{toctree} 6 | conll_dataset 7 | resources 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/eos/eos.md: -------------------------------------------------------------------------------- 1 | # eos 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.datasets.eos.eos 5 | 6 | .. autoclass:: SentenceBoundaryDetectionDataset 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/eos/index.md: -------------------------------------------------------------------------------- 1 | # eos 2 | 3 | Sentence boundary detection datasets. 4 | 5 | ```{toctree} 6 | eos 7 | resources 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/eos/resources.md: -------------------------------------------------------------------------------- 1 | # resources 2 | 3 | ## nn_eos 4 | 5 | ```{eval-rst} 6 | 7 | .. 
automodule:: hanlp.datasets.eos.loaders.nn_eos 8 | :members: 9 | 10 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/index.md: -------------------------------------------------------------------------------- 1 | # datasets 2 | 3 | ```{eval-rst} 4 | NLP datasets grouped by tasks. For each task, we provide at least one ``torch.utils.data.Dataset`` compatible class 5 | and several open-source resources. Their file formats and descriptions can be found in their ``Dataset.load_file`` 6 | documentation. Their contents are split into ``TRAIN``, ``DEV`` and ``TEST`` portions, each of which is stored in 7 | a Python constant which can be fetched using :meth:`~hanlp.utils.io_util.get_resource`. 8 | ``` 9 | 10 | ````{margin} **Professionals use Linux** 11 | ```{note} 12 | Many preprocessing scripts written by professionals make heavy use of Linux/Unix toolchains like shell, perl, gcc, 13 | etc., which are not available or are buggy on Windows. You may need a *nix environment to run these scripts. 14 | ``` 15 | ```` 16 | 17 | ```{toctree} 18 | eos/index 19 | tok/index 20 | pos/index 21 | ner/index 22 | dep/index 23 | srl/index 24 | constituency/index 25 | ``` 26 | 27 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/ner/index.md: -------------------------------------------------------------------------------- 1 | # ner 2 | 3 | NER datasets. 4 | 5 | ```{toctree} 6 | tsv 7 | json 8 | resources 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/ner/json.md: -------------------------------------------------------------------------------- 1 | # json 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.datasets.ner.loaders.json_ner 5 | 6 | .. autoclass:: JsonNERDataset 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/ner/resources.md: -------------------------------------------------------------------------------- 1 | # resources 2 | 3 | ## CoNLL 2003 4 | 5 | ```{eval-rst} 6 | 7 | .. automodule:: hanlp.datasets.ner.conll03 8 | :members: 9 | 10 | ``` 11 | 12 | ## MSRA 13 | 14 | ```{eval-rst} 15 | 16 | .. automodule:: hanlp.datasets.ner.msra 17 | :members: 18 | 19 | ``` 20 | 21 | ## OntoNotes5 22 | 23 | ```{eval-rst} 24 | 25 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN 26 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV 27 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST 28 | 29 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TRAIN 30 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_DEV 31 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TEST 32 | 33 | ``` 34 | 35 | ## Resume 36 | 37 | ```{eval-rst} 38 | 39 | .. automodule:: hanlp.datasets.ner.resume 40 | :members: 41 | ``` 42 | 43 | ## Weibo 44 | 45 | 46 | ```{eval-rst} 47 | 48 | .. automodule:: hanlp.datasets.ner.weibo 49 | :members: 50 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/ner/tsv.md: -------------------------------------------------------------------------------- 1 | # tsv 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.datasets.ner.loaders.tsv 5 | 6 | .. 
autoclass:: TSVTaggingDataset 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/pos/index.md: -------------------------------------------------------------------------------- 1 | # pos 2 | 3 | PoS datasets. 4 | 5 | ```{eval-rst} 6 | PoS is a normal tagging task which uses :class:`hanlp.datasets.ner.loaders.tsv.TSVTaggingDataset` for loading. 7 | ``` 8 | 9 | ```{toctree} 10 | resources 11 | ``` 12 | 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/pos/resources.md: -------------------------------------------------------------------------------- 1 | # resources 2 | 3 | ## CTB5 4 | 5 | ```{eval-rst} 6 | 7 | .. automodule:: hanlp.datasets.pos.ctb5 8 | :members: 9 | 10 | ``` 11 | 12 | ## CTB8 13 | 14 | ```{eval-rst} 15 | 16 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TRAIN 17 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_DEV 18 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TEST 19 | 20 | ``` 21 | 22 | ## CTB9 23 | 24 | 25 | ```{eval-rst} 26 | 27 | 28 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TRAIN 29 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_DEV 30 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TEST 31 | 32 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/srl/conll2012_dataset.md: -------------------------------------------------------------------------------- 1 | # conll2012_dataset 2 | 3 | ```{eval-rst} 4 | 5 | .. autoclass:: hanlp.datasets.srl.loaders.conll2012.CoNLL2012SRLDataset 6 | :members: 7 | 8 | ``` 9 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/srl/index.md: -------------------------------------------------------------------------------- 1 | # srl 2 | 3 | Semantic Role Labeling datasets. 4 | 5 | ```{toctree} 6 | conll2012_dataset 7 | resources 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/srl/resources.md: -------------------------------------------------------------------------------- 1 | # resources 2 | 3 | ## OntoNotes 5 4 | 5 | ### Chinese 6 | 7 | ```{eval-rst} 8 | 9 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN 10 | :noindex: 11 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV 12 | :noindex: 13 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST 14 | :noindex: 15 | 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/tok/index.md: -------------------------------------------------------------------------------- 1 | # tok 2 | 3 | Tokenization datasets. 4 | 5 | ```{toctree} 6 | txt 7 | mcws_dataset 8 | resources 9 | ``` 10 | 11 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/tok/mcws_dataset.md: -------------------------------------------------------------------------------- 1 | # mcws_dataset 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.datasets.tokenization.loaders.multi_criteria_cws.mcws_dataset 5 | 6 | .. 
autoclass:: MultiCriteriaTextTokenizingDataset 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/tok/resources.md: -------------------------------------------------------------------------------- 1 | # resources 2 | 3 | ## sighan2005 4 | 5 | [The Second International Chinese Word Segmentation Bakeoff](http://sighan.cs.uchicago.edu/bakeoff2005/) took place over the summer of 2005. 6 | 7 | ### pku 8 | 9 | ```{eval-rst} 10 | 11 | .. automodule:: hanlp.datasets.tokenization.sighan2005.pku 12 | :members: 13 | 14 | ``` 15 | 16 | ### msr 17 | 18 | ```{eval-rst} 19 | 20 | .. automodule:: hanlp.datasets.tokenization.sighan2005.msr 21 | :members: 22 | 23 | ``` 24 | 25 | ### as 26 | 27 | ```{eval-rst} 28 | 29 | .. automodule:: hanlp.datasets.tokenization.sighan2005.as_ 30 | :members: 31 | 32 | ``` 33 | 34 | ### cityu 35 | 36 | ```{eval-rst} 37 | 38 | .. automodule:: hanlp.datasets.tokenization.sighan2005.cityu 39 | :members: 40 | 41 | ``` 42 | 43 | ## CTB6 44 | 45 | ```{eval-rst} 46 | 47 | .. automodule:: hanlp.datasets.tokenization.ctb6 48 | :members: 49 | 50 | ``` 51 | 52 | ## CTB8 53 | 54 | 55 | ```{eval-rst} 56 | 57 | .. automodule:: hanlp.datasets.parsing.ctb8 58 | 59 | .. autodata:: CTB8_CWS_TRAIN 60 | .. autodata:: CTB8_CWS_DEV 61 | .. autodata:: CTB8_CWS_TEST 62 | 63 | ``` 64 | 65 | ## CTB9 66 | 67 | 68 | ```{eval-rst} 69 | 70 | .. automodule:: hanlp.datasets.parsing.ctb9 71 | 72 | .. autodata:: CTB9_CWS_TRAIN 73 | .. autodata:: CTB9_CWS_DEV 74 | .. autodata:: CTB9_CWS_TEST 75 | 76 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/datasets/tok/txt.md: -------------------------------------------------------------------------------- 1 | # txt 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp.datasets.tokenization.loaders.txt 5 | 6 | .. autoclass:: TextTokenizingDataset 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/hanlp.rst: -------------------------------------------------------------------------------- 1 | .. _api/main: 2 | 3 | hanlp 4 | ========== 5 | 6 | .. currentmodule:: hanlp 7 | 8 | .. autofunction:: load 9 | 10 | .. autofunction:: pipeline -------------------------------------------------------------------------------- /docs/api/hanlp/index.md: -------------------------------------------------------------------------------- 1 | # hanlp 2 | 3 | Core APIs for `hanlp`. 4 | 5 | ```{toctree} 6 | hanlp 7 | common/index 8 | components/index 9 | pretrained/index 10 | datasets/index 11 | utils/index 12 | layers/index 13 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/layers/decoders/biaffine_ner.md: -------------------------------------------------------------------------------- 1 | # biaffine_ner 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. 
autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner_model.BiaffineNamedEntityRecognitionDecoder 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/decoders/index.md: -------------------------------------------------------------------------------- 1 | # decoders 2 | 3 | ```{toctree} 4 | linear_crf 5 | biaffine_ner 6 | ``` 7 | 8 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/decoders/linear_crf.md: -------------------------------------------------------------------------------- 1 | # linear_crf 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. autoclass:: hanlp.components.mtl.tasks.pos.LinearCRFDecoder 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/embeddings/char_cnn.md: -------------------------------------------------------------------------------- 1 | # char_cnn 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNN 7 | :members: 8 | 9 | .. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNNEmbedding 10 | :members: 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/embeddings/char_rnn.md: -------------------------------------------------------------------------------- 1 | # char_rnn 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNN 7 | :members: 8 | 9 | .. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNNEmbedding 10 | :members: 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/embeddings/embedding.md: -------------------------------------------------------------------------------- 1 | # embedding 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. autoclass:: hanlp.layers.embeddings.embedding.Embedding 7 | :members: 8 | 9 | .. autoclass:: hanlp.layers.embeddings.embedding.ConcatModuleList 10 | :members: 11 | 12 | .. autoclass:: hanlp.layers.embeddings.embedding.EmbeddingList 13 | :members: 14 | 15 | ``` 16 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/embeddings/fasttext.md: -------------------------------------------------------------------------------- 1 | # fasttext 2 | 3 | ```{eval-rst} 4 | 5 | .. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbedding 6 | :members: 7 | 8 | .. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbeddingModule 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/embeddings/index.md: -------------------------------------------------------------------------------- 1 | # embeddings 2 | 3 | ```{toctree} 4 | embedding 5 | word2vec 6 | fasttext 7 | char_cnn 8 | char_rnn 9 | transformer 10 | ``` 11 | 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/embeddings/transformer.md: -------------------------------------------------------------------------------- 1 | # transformer 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbedding 7 | :members: 8 | 9 | .. 
autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule 10 | :members: 11 | 12 | ``` 13 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/embeddings/word2vec.md: -------------------------------------------------------------------------------- 1 | # word2vec 2 | 3 | ```{eval-rst} 4 | 5 | .. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbedding 6 | :members: 7 | 8 | .. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbeddingModule 9 | :members: 10 | 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/index.md: -------------------------------------------------------------------------------- 1 | # layers 2 | 3 | ```{toctree} 4 | embeddings/index 5 | transformers/index 6 | decoders/index 7 | ``` 8 | 9 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/transformers/encoder.md: -------------------------------------------------------------------------------- 1 | # encoder 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. autoclass:: hanlp.layers.transformers.encoder.TransformerEncoder 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/transformers/index.md: -------------------------------------------------------------------------------- 1 | # transformers 2 | 3 | ```{toctree} 4 | encoder 5 | tokenizer 6 | ``` 7 | 8 | -------------------------------------------------------------------------------- /docs/api/hanlp/layers/transformers/tokenizer.md: -------------------------------------------------------------------------------- 1 | # tokenizer 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. autoclass:: hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer 7 | :members: 8 | 9 | ``` 10 | -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/amr.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: ipynb,md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | format_version: '0.8' 8 | jupytext_version: 1.4.2 9 | kernelspec: 10 | display_name: Python 3 11 | language: python 12 | name: python3 13 | --- 14 | # amr 15 | 16 | AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts). 17 | Before loading an AMR model, make sure to install HanLP with the `amr` dependencies: 18 | 19 | ```shell 20 | pip install hanlp[amr] -U 21 | ``` 22 | 23 | To parse a raw sentence into AMR: 24 | 25 | ```{eval-rst} 26 | .. margin:: Batching is Faster 27 | 28 | .. Hint:: Parse multiple sentences at once for faster speed! 29 | ``` 30 | 31 | 32 | ```{code-cell} ipython3 33 | :tags: [output_scroll] 34 | import hanlp 35 | 36 | amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE) 37 | amr = amr_parser('The boy wants the girl to believe him.') 38 | print(amr) 39 | ``` 40 | 41 | All the pre-trained parsers and their scores are listed below. 42 | 43 | ```{eval-rst} 44 | 45 | .. 
automodule:: hanlp.pretrained.amr 46 | :members: 47 | 48 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/dep.md: -------------------------------------------------------------------------------- 1 | # dep 2 | 3 | ```{eval-rst} 4 | 5 | .. automodule:: hanlp.pretrained.dep 6 | :members: 7 | 8 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/eos.md: -------------------------------------------------------------------------------- 1 | # eos 2 | 3 | 4 | ```{eval-rst} 5 | 6 | .. automodule:: hanlp.pretrained.eos 7 | :members: 8 | 9 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/fasttext.md: -------------------------------------------------------------------------------- 1 | # fasttext 2 | 3 | ```{eval-rst} 4 | 5 | .. automodule:: hanlp.pretrained.fasttext 6 | :members: 7 | 8 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/glove.md: -------------------------------------------------------------------------------- 1 | # glove 2 | 3 | ```{eval-rst} 4 | 5 | .. automodule:: hanlp.pretrained.glove 6 | :members: 7 | 8 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/index.md: -------------------------------------------------------------------------------- 1 | # pretrained 2 | 3 | ```{eval-rst} 4 | NLP components grouped by tasks. For each task, we provide at least one :class:`~hanlp.common.component.Component` 5 | compatible class and several pretrained models. Each of them is stored in a Python constant which can be loaded 6 | via :func:`hanlp.load`. 7 | ``` 8 | 9 | ```{toctree} 10 | mtl 11 | eos 12 | tok 13 | pos 14 | ner 15 | dep 16 | constituency 17 | srl 18 | sdp 19 | amr 20 | amr2text 21 | sts 22 | word2vec 23 | glove 24 | fasttext 25 | mlm 26 | ``` 27 | 28 | -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/mtl.md: -------------------------------------------------------------------------------- 1 | # mtl 2 | 3 | ```{eval-rst} 4 | 5 | .. automodule:: hanlp.pretrained.mtl 6 | :members: 7 | 8 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/ner.md: -------------------------------------------------------------------------------- 1 | # ner 2 | 3 | ```{eval-rst} 4 | 5 | .. automodule:: hanlp.pretrained.ner 6 | :members: 7 | 8 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/pos.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: ipynb,md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | format_version: '0.8' 8 | jupytext_version: 1.4.2 9 | kernelspec: 10 | display_name: Python 3 11 | language: python 12 | name: python3 13 | --- 14 | 15 | # pos 16 | 17 | The process of classifying words into their **parts of speech** and labeling them accordingly is known as **part-of-speech tagging**, **POS-tagging**, or simply **tagging**. 18 | 19 | To tag a tokenized sentence: 20 | 21 | ````{margin} Batching is Faster 22 | ```{hint} 23 | Tag multiple sentences at once for faster speed!
24 | ``` 25 | ```` 26 | 27 | 28 | ```{code-cell} ipython3 29 | :tags: [output_scroll] 30 | import hanlp 31 | 32 | pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL) 33 | pos(['我', '的', '希望', '是', '希望', '世界', '和平']) 34 | ``` 35 | 36 | ````{margin} Custom Dictionary Supported 37 | ```{seealso} 38 | See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py) for custom dictionary support. 39 | ``` 40 | ```` 41 | 42 | All the pre-trained taggers and their details are listed below. 43 | 44 | ```{eval-rst} 45 | 46 | .. automodule:: hanlp.pretrained.pos 47 | :members: 48 | 49 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/sdp.md: -------------------------------------------------------------------------------- 1 | # sdp 2 | 3 | ```{eval-rst} 4 | 5 | .. automodule:: hanlp.pretrained.sdp 6 | :members: 7 | 8 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/srl.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: ipynb,md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | format_version: '0.8' 8 | jupytext_version: 1.4.2 9 | kernelspec: 10 | display_name: Python 3 11 | language: python 12 | name: python3 13 | --- 14 | 15 | # srl 16 | 17 | Semantic Role Labeling (SRL) is a shallow semantic parsing task that produces predicate-argument structures, i.e., semantic roles (or participants) such as agent, patient, and theme associated with verbs. 18 | 19 | Inputs to SRL are tokenized sentences: 20 | 21 | ````{margin} Batching is Faster 22 | ```{hint} 23 | Feed in multiple sentences at once for faster speed! 24 | ``` 25 | ```` 26 | 27 | 28 | ```{code-cell} ipython3 29 | :tags: [output_scroll] 30 | import hanlp 31 | 32 | srl = hanlp.load(hanlp.pretrained.srl.CPB3_SRL_ELECTRA_SMALL) 33 | srl(['男孩', '希望', '女孩', '相信', '他', '。']) 34 | ``` 35 | 36 | All the pre-trained labelers and their details are listed below. 37 | 38 | ```{eval-rst} 39 | 40 | .. automodule:: hanlp.pretrained.srl 41 | :members: 42 | 43 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/sts.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: ipynb,md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | format_version: '0.8' 8 | jupytext_version: 1.4.2 9 | kernelspec: 10 | display_name: Python 3 11 | language: python 12 | name: python3 13 | --- 14 | 15 | # sts 16 | 17 | The `sts` package holds pre-trained Semantic Textual Similarity (STS) models. We surveyed both supervised and unsupervised 18 | models, and we believe that unsupervised models are still immature at the moment. Unsupervised STS works well for IR but 19 | not for NLP, especially on sentence pairs with little lexical overlap. 20 | 21 | 22 | ```{eval-rst} 23 | 24 | ..
automodule:: hanlp.pretrained.sts 25 | :members: 26 | 27 | ``` 28 | 29 | ```{code-cell} ipython3 30 | import hanlp 31 | 32 | sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH) 33 | sim([ 34 | ['看图猜一电影名', '看图猜电影'], 35 | ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'], 36 | ['北京到上海的动车票', '上海到北京的动车票'], 37 | ]) 38 | ``` -------------------------------------------------------------------------------- /docs/api/hanlp/pretrained/tok.md: -------------------------------------------------------------------------------- 1 | --- 2 | jupytext: 3 | formats: ipynb,md:myst 4 | text_representation: 5 | extension: .md 6 | format_name: myst 7 | format_version: '0.8' 8 | jupytext_version: 1.4.2 9 | kernelspec: 10 | display_name: Python 3 11 | language: python 12 | name: python3 13 | --- 14 | 15 | # tok 16 | 17 | Tokenization is a way of separating a sentence into smaller units called tokens. In lexical analysis, tokens usually refer to words. 18 | 19 | ````{margin} Batching is Faster 20 | ```{hint} 21 | Tokenize multiple sentences at once for faster speed! 22 | ``` 23 | ```` 24 | ````{margin} Custom Dictionary Supported 25 | ```{seealso} 26 | See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py) for custom dictionary support. 27 | ``` 28 | ```` 29 | 30 | To tokenize raw sentences: 31 | 32 | 33 | ```{code-cell} ipython3 34 | :tags: [output_scroll] 35 | import hanlp 36 | 37 | tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH) 38 | tok(['商品和服务。', '晓美焰来到北京立方庭参观自然语义科技公司']) 39 | ``` 40 | 41 | All the pre-trained tokenizers and their details are listed below. 42 | 43 | 44 | ```{eval-rst} 45 | 46 | .. automodule:: hanlp.pretrained.tok 47 | :members: 48 | 49 | ``` 50 | 51 | -------------------------------------------------------------------------------- /docs/api/hanlp/utils/index.md: -------------------------------------------------------------------------------- 1 | # utils 2 | 3 | Utilities. 4 | 5 | ```{toctree} 6 | io_util 7 | ``` 8 | -------------------------------------------------------------------------------- /docs/api/hanlp/utils/io_util.md: -------------------------------------------------------------------------------- 1 | # io_util 2 | 3 | ```{eval-rst} 4 | 5 | .. currentmodule:: hanlp.utils 6 | 7 | .. automodule:: hanlp.utils.io_util 8 | :members: 9 | 10 | ``` 11 | -------------------------------------------------------------------------------- /docs/api/restful.rst: -------------------------------------------------------------------------------- 1 | .. _api/hanlp_restful: 2 | 3 | hanlp_restful 4 | ==================== 5 | 6 | .. currentmodule:: hanlp_restful 7 | 8 | .. autoclass:: HanLPClient 9 | :members: 10 | :special-members: 11 | :exclude-members: __init__, __repr__, __weakref__ -------------------------------------------------------------------------------- /docs/api/restful_golang.md: -------------------------------------------------------------------------------- 1 | # Golang RESTful API 2 | 3 | ## Install 4 | 5 | ```shell script 6 | go get -u github.com/hankcs/gohanlp@main 7 | ``` 8 | 9 | ## Quick Start 10 | 11 | Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then instantiate a `HanLPClient` and call its `Parse` interface.
12 | 13 | ```go 14 | package main 15 | 16 | import ( 17 | "fmt" 18 | "github.com/hankcs/gohanlp/hanlp" 19 | ) 20 | 21 | func main() { 22 | client := hanlp.HanLPClient(hanlp.WithAuth("The auth you applied for")) // anonymous users can skip auth 23 | s, _ := client.Parse("In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.", hanlp.WithLanguage("mul")) 24 | fmt.Println(s) 25 | } 26 | ``` 27 | 28 | Refer to our [testcases](https://github.com/hankcs/gohanlp/blob/main/main_test.go) and [data format](../data_format) for more details. 29 | 30 | -------------------------------------------------------------------------------- /docs/api/restful_java.md: -------------------------------------------------------------------------------- 1 | # Java RESTful API 2 | 3 | Add the following dependency into the `pom.xml` file of your project. 4 | 5 | ```xml 6 | <dependency> 7 |   <groupId>com.hankcs.hanlp.restful</groupId> 8 |   <artifactId>hanlp-restful</artifactId> 9 |   <version>0.0.15</version> 10 | </dependency> 11 | ``` 12 | 13 | Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then instantiate a `HanLPClient` and call its `parse` interface. 14 | 15 | ```java 16 | HanLPClient client = new HanLPClient("https://hanlp.hankcs.com/api", null); // Replace null with your auth 17 | System.out.println(client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。")); 18 | ``` 19 | 20 | Refer to our [testcases](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java) and [data format](../data_format) for more details. 21 | 22 | -------------------------------------------------------------------------------- /docs/api/trie/dictionary.md: -------------------------------------------------------------------------------- 1 | # dictionary 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp_trie 5 | 6 | .. autoclass:: hanlp_trie.dictionary.DictInterface 7 | :members: 8 | 9 | .. autoclass:: hanlp_trie.dictionary.TrieDict 10 | :members: 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/api/trie/index.md: -------------------------------------------------------------------------------- 1 | # hanlp_trie 2 | 3 | HanLP trie/dictionary interface and reference implementation. 4 | 5 | ```{toctree} 6 | trie 7 | dictionary 8 | ``` 9 | 10 | -------------------------------------------------------------------------------- /docs/api/trie/trie.md: -------------------------------------------------------------------------------- 1 | # trie 2 | 3 | ```{eval-rst} 4 | .. currentmodule:: hanlp_trie 5 | 6 | .. autoclass:: hanlp_trie.trie.Node 7 | :members: 8 | 9 | .. autoclass:: hanlp_trie.trie.Trie 10 | :members: 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/references.rst: -------------------------------------------------------------------------------- 1 | References 2 | ================== 3 | 4 | ..
bibliography:: references.bib 5 | :cited: 6 | :style: astrostyle -------------------------------------------------------------------------------- /hanlp/callbacks/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-05 02:10 -------------------------------------------------------------------------------- /hanlp/common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-08-26 14:45 4 | -------------------------------------------------------------------------------- /hanlp/common/component.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-08-26 14:45 4 | import inspect 5 | from abc import ABC, abstractmethod 6 | from typing import Any 7 | 8 | from hanlp_common.configurable import Configurable 9 | 10 | 11 | class Component(Configurable, ABC): 12 | @abstractmethod 13 | def predict(self, *args, **kwargs): 14 | """Predict on data. Every component, whether rule-based or statistical, implements this method. 15 | 16 | Args: 17 | *args: Any type of data subject to sub-classes 18 | **kwargs: Additional arguments 19 | 20 | Returns: Any predicted annotations. 21 | 22 | """ 23 | raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3])) 24 | 25 | def __call__(self, *args, **kwargs): 26 | """ 27 | A shortcut for :meth:`~hanlp.common.component.Component.predict`. 28 | 29 | Args: 30 | *args: Any type of data subject to sub-classes 31 | **kwargs: Additional arguments 32 | 33 | Returns: Any predicted annotations. 34 | 35 | """ 36 | return self.predict(*args, **kwargs) 37 |
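# A minimal subclass sketch (hypothetical; not part of HanLP): a rule-based
# component only needs to implement predict, since __call__ above already
# delegates to it. from_config mirrors LambdaComponent's signature, assuming
# Configurable imposes no further abstract methods.
class WhitespaceTokenizer(Component):
    def predict(self, text: str, **kwargs):
        # Stand-in for real rule-based logic.
        return text.split()

    @staticmethod
    def from_config(meta: dict, **kwargs):
        return WhitespaceTokenizer()

# >>> WhitespaceTokenizer()('commodities and services')
# ['commodities', 'and', 'services']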
-------------------------------------------------------------------------------- /hanlp/components/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-08-26 16:10 4 | from .pipeline import Pipeline -------------------------------------------------------------------------------- /hanlp/components/amr/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-20 17:35 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/amrbart/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-12-05 17:53 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/amrbart/common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-12-05 17:53 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/amrbart/data_interface/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-12-07 14:36 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/amrbart/model_interface/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-12-03 20:33 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/amrbart/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-12-03 20:33 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-04-27 19:24 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/seq2seq/dataset/IO.py: -------------------------------------------------------------------------------- 1 | import glob 2 | from typing import List, Union, Iterable 3 | from pathlib import Path 4 | from .penman import pm_load as pm_load 5 | 6 | 7 | def read_raw_amr_data( 8 | paths: List[Union[str, Path]], 9 | use_recategorization=False, 10 | dereify=True, 11 | remove_wiki=False, 12 | ): 13 | assert paths 14 | 15 | if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable): # a bare str is itself Iterable, so wrap singletons explicitly 16 | paths = [paths] 17 | 18 | graphs = [] 19 | for path_ in paths: 20 | for path in glob.glob(str(path_)): 21 | path = Path(path) 22 | assert path.exists(), f'{path} does not exist' 23 | graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki)) 24 | 25 | assert graphs, 'No graphs loaded' 26 | 27 | if use_recategorization: 28 | for g in graphs: 29 | metadata = g.metadata 30 | metadata['snt_orig'] = metadata['snt'] 31 | tokens = eval(metadata['tokens']) 32 | metadata['snt'] = ' '.join( 33 | [t for t in tokens if not ((t.startswith('-L') or t.startswith('-R')) and t.endswith('-'))]) 34 | 35 | return graphs 36 | -------------------------------------------------------------------------------- /hanlp/components/amr/seq2seq/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-04-27 19:29 4 | -------------------------------------------------------------------------------- /hanlp/components/amr/seq2seq/evaluation.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import penman 4 | 5 | 6 | def write_predictions(predictions_path, tokenizer, graphs): 7 | pieces = [penman.encode(g) for g in graphs] 8 | text = '\n\n'.join(pieces) 9 | if tokenizer: 10 | text = text.replace(tokenizer.INIT, '') 11 | Path(predictions_path).write_text(text) 12 | return predictions_path 13 | 14 | 15 | def compute_smatch(pred, gold): 16 | from perin_parser.thirdparty.mtool import smatch 17 | with Path(pred).open() as p, Path(gold).open() as g: 18 | score = next(smatch.score_amr_pairs(p, g)) 19 | return score[2] 20 | 21 | 22 | def compute_bleu(gold_sentences, pred_sentences): 23 | from sacrebleu import corpus_bleu 24 | return corpus_bleu(pred_sentences, [gold_sentences]) 25 | -------------------------------------------------------------------------------- /hanlp/components/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-11-10 13:18 -------------------------------------------------------------------------------- /hanlp/components/distillation/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs 3 | # Date: 2020-10-17 20:29 4 | -------------------------------------------------------------------------------- /hanlp/components/eos/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-26 20:19 -------------------------------------------------------------------------------- /hanlp/components/lambda_wrapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-31 18:36 4 | from typing import Callable, Any 5 | 6 | from hanlp.common.component import Component 7 | from hanlp_common.reflection import classpath_of, object_from_classpath, str_to_type 8 | 9 | 10 | class LambdaComponent(Component): 11 | def __init__(self, function: Callable) -> None: 12 | super().__init__() 13 | self.config = {} 14 | self.function = function 15 | self.config['function'] = classpath_of(function) 16 | self.config['classpath'] = classpath_of(self) 17 | 18 | def predict(self, data: Any, **kwargs): 19 | unpack = kwargs.pop('_hanlp_unpack', None) 20 | if unpack: 21 | return self.function(*data, **kwargs) 22 | return self.function(data, **kwargs) 23 | 24 | @staticmethod 25 | def from_config(meta: dict, **kwargs): 26 | cls = str_to_type(meta['classpath']) 27 | function = meta['function'] 28 | function = object_from_classpath(function) 29 | return cls(function) 30 | -------------------------------------------------------------------------------- /hanlp/components/lm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-01-29 21:07 4 | -------------------------------------------------------------------------------- /hanlp/components/mtl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-20 19:54 -------------------------------------------------------------------------------- /hanlp/components/mtl/tasks/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-03 14:34 4 | -------------------------------------------------------------------------------- /hanlp/components/mtl/tasks/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-04 16:49 4 | -------------------------------------------------------------------------------- /hanlp/components/mtl/tasks/tok/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-11 16:34 -------------------------------------------------------------------------------- /hanlp/components/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-21 17:22 -------------------------------------------------------------------------------- /hanlp/components/ner/biaffine_ner/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-21 18:41 -------------------------------------------------------------------------------- 
/hanlp/components/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-22 12:46 -------------------------------------------------------------------------------- /hanlp/components/parsers/biaffine/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-08 20:43 4 | -------------------------------------------------------------------------------- /hanlp/components/parsers/biaffine_tf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-26 23:03 -------------------------------------------------------------------------------- /hanlp/components/parsers/constituency/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-11-28 19:26 4 | -------------------------------------------------------------------------------- /hanlp/components/parsers/ud/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-14 20:34 4 | -------------------------------------------------------------------------------- /hanlp/components/parsers/ud/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-14 20:44 4 | from hanlp_common.constant import ROOT 5 | from hanlp.components.parsers.ud.lemma_edit import gen_lemma_rule 6 | 7 | 8 | def generate_lemma_rule(sample: dict): 9 | if 'LEMMA' in sample: 10 | sample['lemma'] = [gen_lemma_rule(word, lemma) if lemma != "_" else "_" for word, lemma in 11 | zip(sample['FORM'], sample['LEMMA'])] 12 | return sample 13 | 14 | 15 | def append_bos(sample: dict): 16 | if 'FORM' in sample: 17 | sample['token'] = [ROOT] + sample['FORM'] 18 | if 'UPOS' in sample: 19 | sample['pos'] = sample['UPOS'][:1] + sample['UPOS'] 20 | sample['arc'] = [0] + sample['HEAD'] 21 | sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL'] 22 | sample['lemma'] = sample['lemma'][:1] + sample['lemma'] 23 | sample['feat'] = sample['FEATS'][:1] + sample['FEATS'] 24 | return sample 25 | 26 | 27 | def sample_form_missing(sample: dict): 28 | return all(t == '_' for t in sample['FORM']) 29 | -------------------------------------------------------------------------------- /hanlp/components/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-22 20:50 -------------------------------------------------------------------------------- /hanlp/components/srl/span_bio/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-04 13:59 4 | -------------------------------------------------------------------------------- /hanlp/components/srl/span_rank/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-19 22:22 -------------------------------------------------------------------------------- /hanlp/components/srl/span_rank/util.py: 
-------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL 2 | import torch 3 | 4 | 5 | def block_orth_normal_initializer(input_size, output_size): 6 | weight = [] 7 | for o in output_size: 8 | for i in input_size: 9 | param = torch.FloatTensor(o, i) 10 | torch.nn.init.orthogonal_(param) 11 | weight.append(param) 12 | return torch.cat(weight) 13 | -------------------------------------------------------------------------------- /hanlp/components/sts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-05-20 17:02 4 | -------------------------------------------------------------------------------- /hanlp/components/taggers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-08-28 15:39 -------------------------------------------------------------------------------- /hanlp/components/taggers/ngram_conv/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-29 22:18 -------------------------------------------------------------------------------- /hanlp/components/taggers/pos_tf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-05 23:05 4 | from hanlp.components.taggers.cnn_tagger_tf import CNNTaggerTF 5 | from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF 6 | 7 | 8 | class CNNPartOfSpeechTaggerTF(CNNTaggerTF): 9 | pass 10 | 11 | 12 | class RNNPartOfSpeechTaggerTF(RNNTaggerTF): 13 | pass 14 | -------------------------------------------------------------------------------- /hanlp/components/taggers/rnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-19 15:41 -------------------------------------------------------------------------------- /hanlp/components/taggers/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-29 13:57 -------------------------------------------------------------------------------- /hanlp/components/taggers/transformers/metrics_tf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-30 16:33 4 | import tensorflow as tf 5 | 6 | 7 | class Accuracy(tf.keras.metrics.SparseCategoricalAccuracy): 8 | 9 | def __init__(self, name='sparse_categorical_accuracy', dtype=None, mask_value=0): 10 | super().__init__(name, dtype) 11 | self.mask_value = mask_value 12 | 13 | def update_state(self, y_true, y_pred, sample_weight=None): 14 | sample_weight = tf.not_equal(y_true, self.mask_value) 15 | return super().update_state(y_true, y_pred, sample_weight) 16 | -------------------------------------------------------------------------------- /hanlp/components/taggers/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-01 00:31 4 | from typing import List, Tuple 5 | from hanlp.utils.span_util import allowed_transitions 6 | 
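# Behavior sketch of the helpers below (the labels are hypothetical examples):
# guess_tagging_scheme(['B-PER', 'I-PER', 'O']) -> 'BIO', because the label
# prefixes {B, I, O} exactly match one of the known schemes, and CWS-style
# labels ['B', 'M', 'E', 'S'] -> 'BMES'. Unrecognized label sets yield None,
# in which case guess_allowed_transitions also returns None and no CRF
# transition constraints are imposed. IOBES labels are first rewritten to
# BIOUL (E- -> L-, S- -> U-) before allowed_transitions builds the constraints.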
7 | 8 | def guess_tagging_scheme(labels: List[str]) -> str: 9 | tagset = set(y.split('-')[0] for y in labels) 10 | for scheme in "BIO", "BIOUL", "BMES", 'IOBES': 11 | if tagset == set(list(scheme)): 12 | return scheme 13 | 14 | 15 | def guess_allowed_transitions(labels) -> List[Tuple[int, int]]: 16 | scheme = guess_tagging_scheme(labels) 17 | if not scheme: 18 | return None 19 | if scheme == 'IOBES': 20 | scheme = 'BIOUL' 21 | labels = [y.replace('E-', 'L-').replace('S-', 'U-') for y in labels] 22 | return allowed_transitions(scheme, dict(enumerate(labels))) 23 | -------------------------------------------------------------------------------- /hanlp/components/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-11 02:48 -------------------------------------------------------------------------------- /hanlp/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-06-13 18:15 4 | -------------------------------------------------------------------------------- /hanlp/datasets/classification/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-11-10 11:49 -------------------------------------------------------------------------------- /hanlp/datasets/classification/sentiment.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-30 21:03 4 | _ERNIE_TASK_DATA = 'https://ernie.bj.bcebos.com/task_data_zh.tgz#' 5 | 6 | CHNSENTICORP_ERNIE_TRAIN = _ERNIE_TASK_DATA + 'chnsenticorp/train.tsv' 7 | CHNSENTICORP_ERNIE_DEV = _ERNIE_TASK_DATA + 'chnsenticorp/dev.tsv' 8 | CHNSENTICORP_ERNIE_TEST = _ERNIE_TASK_DATA + 'chnsenticorp/test.tsv' 9 | -------------------------------------------------------------------------------- /hanlp/datasets/coref/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-04 13:39 -------------------------------------------------------------------------------- /hanlp/datasets/coref/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:03 4 | -------------------------------------------------------------------------------- /hanlp/datasets/eos/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-26 18:11 -------------------------------------------------------------------------------- /hanlp/datasets/eos/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:03 4 | -------------------------------------------------------------------------------- /hanlp/datasets/eos/loaders/nn_eos.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-24 22:51 4 | _SETIMES2_EN_HR_SENTENCES_HOME = 'https://schweter.eu/cloud/nn_eos/SETIMES2.en-hr.sentences.tar.xz' 5 | SETIMES2_EN_HR_HR_SENTENCES_TRAIN = 
_SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.train' 6 | '''Training set of SETimes corpus.''' 7 | SETIMES2_EN_HR_HR_SENTENCES_DEV = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.dev' 8 | '''Dev set of SETimes corpus.''' 9 | SETIMES2_EN_HR_HR_SENTENCES_TEST = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.test' 10 | '''Test set of SETimes corpus.''' 11 | _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME = 'http://schweter.eu/cloud/nn_eos/europarl-v7.de-en.en.sentences.tar.xz' 12 | EUROPARL_V7_DE_EN_EN_SENTENCES_TRAIN = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.train' 13 | '''Training set of Europarl corpus (:cite:`koehn2005europarl`).''' 14 | EUROPARL_V7_DE_EN_EN_SENTENCES_DEV = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.dev' 15 | '''Dev set of Europarl corpus (:cite:`koehn2005europarl`).''' 16 | EUROPARL_V7_DE_EN_EN_SENTENCES_TEST = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.test' 17 | '''Test set of Europarl corpus (:cite:`koehn2005europarl`).''' 18 | -------------------------------------------------------------------------------- /hanlp/datasets/lm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-05 21:41 4 | 5 | _PTB_HOME = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz#' 6 | PTB_TOKEN_TRAIN = _PTB_HOME + 'data/ptb.train.txt' 7 | PTB_TOKEN_DEV = _PTB_HOME + 'data/ptb.valid.txt' 8 | PTB_TOKEN_TEST = _PTB_HOME + 'data/ptb.test.txt' 9 | 10 | PTB_CHAR_TRAIN = _PTB_HOME + 'data/ptb.char.train.txt' 11 | PTB_CHAR_DEV = _PTB_HOME + 'data/ptb.char.valid.txt' 12 | PTB_CHAR_TEST = _PTB_HOME + 'data/ptb.char.test.txt' 13 | -------------------------------------------------------------------------------- /hanlp/datasets/lm/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:04 4 | -------------------------------------------------------------------------------- /hanlp/datasets/lu/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:08 4 | -------------------------------------------------------------------------------- /hanlp/datasets/lu/glue.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-11-10 11:47 4 | from hanlp.common.dataset import TableDataset 5 | 6 | STANFORD_SENTIMENT_TREEBANK_2_TRAIN = 'http://file.hankcs.com/corpus/SST2.zip#train.tsv' 7 | STANFORD_SENTIMENT_TREEBANK_2_DEV = 'http://file.hankcs.com/corpus/SST2.zip#dev.tsv' 8 | STANFORD_SENTIMENT_TREEBANK_2_TEST = 'http://file.hankcs.com/corpus/SST2.zip#test.tsv' 9 | 10 | MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TRAIN = 'http://file.hankcs.com/corpus/mrpc.zip#train.tsv' 11 | MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV = 'http://file.hankcs.com/corpus/mrpc.zip#dev.tsv' 12 | MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TEST = 'http://file.hankcs.com/corpus/mrpc.zip#test.tsv' 13 | 14 | 15 | class SST2Dataset(TableDataset): 16 | pass 17 | 18 | 19 | def main(): 20 | dataset = SST2Dataset(STANFORD_SENTIMENT_TREEBANK_2_TEST) 21 | print(dataset) 22 | 23 | 24 | if __name__ == '__main__': 25 | main() 26 | -------------------------------------------------------------------------------- 
/hanlp/datasets/ner/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-06 15:32 -------------------------------------------------------------------------------- /hanlp/datasets/ner/conll03.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-06 15:31 4 | 5 | 6 | CONLL03_EN_TRAIN = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.train.tsv' 7 | '''Training set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)''' 8 | CONLL03_EN_DEV = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.dev.tsv' 9 | '''Dev set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)''' 10 | CONLL03_EN_TEST = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.test.tsv' 11 | '''Test set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)''' 12 | -------------------------------------------------------------------------------- /hanlp/datasets/ner/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:04 4 | -------------------------------------------------------------------------------- /hanlp/datasets/ner/resume.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-08 12:10 4 | from hanlp.common.dataset import TransformableDataset 5 | 6 | from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv 7 | 8 | _RESUME_NER_HOME = 'https://github.com/jiesutd/LatticeLSTM/archive/master.zip#' 9 | 10 | RESUME_NER_TRAIN = _RESUME_NER_HOME + 'ResumeNER/train.char.bmes' 11 | '''Training set of Resume in char level.''' 12 | RESUME_NER_DEV = _RESUME_NER_HOME + 'ResumeNER/dev.char.bmes' 13 | '''Dev set of Resume in char level.''' 14 | RESUME_NER_TEST = _RESUME_NER_HOME + 'ResumeNER/test.char.bmes' 15 | '''Test set of Resume in char level.''' 16 | 17 | -------------------------------------------------------------------------------- /hanlp/datasets/ner/weibo.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-03 23:33 4 | from hanlp.common.dataset import TransformableDataset 5 | 6 | from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv 7 | 8 | _WEIBO_NER_HOME = 'https://github.com/hltcoe/golden-horse/archive/master.zip#data/' 9 | 10 | WEIBO_NER_TRAIN = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.train' 11 | '''Training set of Weibo in char level.''' 12 | WEIBO_NER_DEV = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.dev' 13 | '''Dev set of Weibo in char level.''' 14 | WEIBO_NER_TEST = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.test' 15 | '''Test set of Weibo in char level.''' 16 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 00:51 4 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/ctb5.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 18:44 4 | from 
hanlp_common.constant import HANLP_URL 5 | 6 | _CTB_HOME = HANLP_URL + 'embeddings/SUDA-LA-CIP_20200109_021624.zip#' 7 | 8 | _CTB5_DEP_HOME = _CTB_HOME + 'BPNN/data/ctb5/' 9 | 10 | CTB5_DEP_TRAIN = _CTB5_DEP_HOME + 'train.conll' 11 | '''Training set for ctb5 dependency parsing.''' 12 | CTB5_DEP_DEV = _CTB5_DEP_HOME + 'dev.conll' 13 | '''Dev set for ctb5 dependency parsing.''' 14 | CTB5_DEP_TEST = _CTB5_DEP_HOME + 'test.conll' 15 | '''Test set for ctb5 dependency parsing.''' 16 | 17 | CIP_W2V_100_CN = _CTB_HOME + 'BPNN/data/embed.txt' 18 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/ctb7.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 18:44 4 | from hanlp.datasets.parsing.ctb5 import _CTB_HOME 5 | 6 | _CTB7_HOME = _CTB_HOME + 'BPNN/data/ctb7/' 7 | 8 | CTB7_DEP_TRAIN = _CTB7_HOME + 'train.conll' 9 | '''Training set for ctb7 dependency parsing.''' 10 | CTB7_DEP_DEV = _CTB7_HOME + 'dev.conll' 11 | '''Dev set for ctb7 dependency parsing.''' 12 | CTB7_DEP_TEST = _CTB7_HOME + 'test.conll' 13 | '''Test set for ctb7 dependency parsing.''' 14 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:04 4 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/ud/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-07 21:45 4 | import os 5 | import shutil 6 | 7 | from hanlp.components.parsers.ud.udify_util import get_ud_treebank_files 8 | from hanlp.utils.io_util import get_resource 9 | from hanlp.utils.log_util import flash 10 | 11 | 12 | def concat_treebanks(home, version): 13 | ud_home = get_resource(home) 14 | treebanks = get_ud_treebank_files(ud_home) 15 | output_dir = os.path.abspath(os.path.join(ud_home, os.path.pardir, os.path.pardir, f'ud-multilingual-v{version}')) 16 | if os.path.isdir(output_dir): 17 | return output_dir 18 | os.makedirs(output_dir) 19 | train, dev, test = list(zip(*[treebanks[k] for k in treebanks])) 20 | 21 | for treebank, name in zip([train, dev, test], ["train.conllu", "dev.conllu", "test.conllu"]): 22 | flash(f'Concatenating {len(train)} treebanks into {name} [blink][yellow]...[/yellow][/blink]') 23 | with open(os.path.join(output_dir, name), 'w') as write: 24 | for t in treebank: 25 | if not t: 26 | continue 27 | with open(t, 'r') as read: 28 | shutil.copyfileobj(read, write) 29 | flash('') 30 | return output_dir 31 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/ud/ud210m.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-21 20:39 4 | import os 5 | 6 | from hanlp.datasets.parsing.ud import concat_treebanks 7 | from hanlp.datasets.parsing.ud.ud210 import _UD_210_HOME 8 | 9 | _UD_210_MULTILINGUAL_HOME = concat_treebanks(_UD_210_HOME, '2.10') 10 | UD_210_MULTILINGUAL_TRAIN = os.path.join(_UD_210_MULTILINGUAL_HOME, 'train.conllu') 11 | "Training set of multilingual UD_210 obtained by concatenating all training sets." 
12 | UD_210_MULTILINGUAL_DEV = os.path.join(_UD_210_MULTILINGUAL_HOME, 'dev.conllu') 13 | "Dev set of multilingual UD_210 obtained by concatenating all dev sets." 14 | UD_210_MULTILINGUAL_TEST = os.path.join(_UD_210_MULTILINGUAL_HOME, 'test.conllu') 15 | "Test set of multilingual UD_210 obtained by concatenating all test sets." 16 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/ud/ud23m.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-21 20:39 4 | import os 5 | 6 | from hanlp.datasets.parsing.ud import concat_treebanks 7 | from .ud23 import _UD_23_HOME 8 | 9 | _UD_23_MULTILINGUAL_HOME = concat_treebanks(_UD_23_HOME, '2.3') 10 | UD_23_MULTILINGUAL_TRAIN = os.path.join(_UD_23_MULTILINGUAL_HOME, 'train.conllu') 11 | UD_23_MULTILINGUAL_DEV = os.path.join(_UD_23_MULTILINGUAL_HOME, 'dev.conllu') 12 | UD_23_MULTILINGUAL_TEST = os.path.join(_UD_23_MULTILINGUAL_HOME, 'test.conllu') 13 | -------------------------------------------------------------------------------- /hanlp/datasets/parsing/ud/ud27m.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-21 20:39 4 | import os 5 | 6 | from hanlp.datasets.parsing.ud import concat_treebanks 7 | from hanlp.datasets.parsing.ud.ud27 import _UD_27_HOME 8 | 9 | _UD_27_MULTILINGUAL_HOME = concat_treebanks(_UD_27_HOME, '2.7') 10 | UD_27_MULTILINGUAL_TRAIN = os.path.join(_UD_27_MULTILINGUAL_HOME, 'train.conllu') 11 | "Training set of multilingual UD_27 obtained by concatenating all training sets." 12 | UD_27_MULTILINGUAL_DEV = os.path.join(_UD_27_MULTILINGUAL_HOME, 'dev.conllu') 13 | "Dev set of multilingual UD_27 obtained by concatenating all dev sets." 14 | UD_27_MULTILINGUAL_TEST = os.path.join(_UD_27_MULTILINGUAL_HOME, 'test.conllu') 15 | "Test set of multilingual UD_27 obtained by concatenating all test sets." 
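# A minimal usage sketch (hypothetical; importing this module triggers
# concat_treebanks, which downloads and concatenates all UD 2.7 treebanks on
# first use, so it is slow and disk-hungry):
#
#   from hanlp.datasets.parsing.ud.ud27m import UD_27_MULTILINGUAL_TRAIN
#   with open(UD_27_MULTILINGUAL_TRAIN) as src:
#       print(src.readline())  # the constants are plain local file paths
#
# The same pattern applies to the ud210m and ud23m modules above.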
16 | -------------------------------------------------------------------------------- /hanlp/datasets/pos/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:50 -------------------------------------------------------------------------------- /hanlp/datasets/pos/ctb5.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:51 4 | 5 | _CTB5_POS_HOME = 'http://file.hankcs.com/corpus/ctb5.1-pos.zip' 6 | 7 | CTB5_POS_TRAIN = f'{_CTB5_POS_HOME}#train.tsv' 8 | '''PoS training set for CTB5.''' 9 | CTB5_POS_DEV = f'{_CTB5_POS_HOME}#dev.tsv' 10 | '''PoS dev set for CTB5.''' 11 | CTB5_POS_TEST = f'{_CTB5_POS_HOME}#test.tsv' 12 | '''PoS test set for CTB5.''' 13 | -------------------------------------------------------------------------------- /hanlp/datasets/qa/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-03-20 19:17 -------------------------------------------------------------------------------- /hanlp/datasets/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-22 19:15 4 | 5 | 6 | -------------------------------------------------------------------------------- /hanlp/datasets/srl/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:05 4 | -------------------------------------------------------------------------------- /hanlp/datasets/srl/ontonotes5/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-11-26 16:07 4 | ONTONOTES5_HOME = 'https://catalog.ldc.upenn.edu/LDC2013T19/LDC2013T19.tgz#/ontonotes-release-5.0/data/' 5 | CONLL12_HOME = ONTONOTES5_HOME + '../conll-2012/' 6 | -------------------------------------------------------------------------------- /hanlp/datasets/sts/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-05-20 16:25 4 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-01 12:33 -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/ctb6.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:19 4 | 5 | _CTB6_CWS_HOME = 'http://file.hankcs.com/corpus/ctb6_cws.zip' 6 | 7 | CTB6_CWS_TRAIN = _CTB6_CWS_HOME + '#train.txt' 8 | '''CTB6 training set.''' 9 | CTB6_CWS_DEV = _CTB6_CWS_HOME + '#dev.txt' 10 | '''CTB6 dev set.''' 11 | CTB6_CWS_TEST = _CTB6_CWS_HOME + '#test.txt' 12 | '''CTB6 test set.''' 13 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/loaders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # 
Author: hankcs 3 | # Date: 2021-12-28 19:06 4 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/loaders/multi_criteria_cws/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-11 20:35 4 | 5 | _HOME = 'https://github.com/hankcs/multi-criteria-cws/archive/naive-mix.zip#data/raw/' 6 | 7 | CNC_TRAIN_ALL = _HOME + 'cnc/train-all.txt' 8 | CNC_TRAIN = _HOME + 'cnc/train.txt' 9 | CNC_DEV = _HOME + 'cnc/dev.txt' 10 | CNC_TEST = _HOME + 'cnc/test.txt' 11 | 12 | CTB_TRAIN_ALL = _HOME + 'ctb/train-all.txt' 13 | CTB_TRAIN = _HOME + 'ctb/train.txt' 14 | CTB_DEV = _HOME + 'ctb/dev.txt' 15 | CTB_TEST = _HOME + 'ctb/test.txt' 16 | 17 | SXU_TRAIN_ALL = _HOME + 'sxu/train-all.txt' 18 | SXU_TRAIN = _HOME + 'sxu/train.txt' 19 | SXU_DEV = _HOME + 'sxu/dev.txt' 20 | SXU_TEST = _HOME + 'sxu/test.txt' 21 | 22 | UDC_TRAIN_ALL = _HOME + 'udc/train-all.txt' 23 | UDC_TRAIN = _HOME + 'udc/train.txt' 24 | UDC_DEV = _HOME + 'udc/dev.txt' 25 | UDC_TEST = _HOME + 'udc/test.txt' 26 | 27 | WTB_TRAIN_ALL = _HOME + 'wtb/train-all.txt' 28 | WTB_TRAIN = _HOME + 'wtb/train.txt' 29 | WTB_DEV = _HOME + 'wtb/dev.txt' 30 | WTB_TEST = _HOME + 'wtb/test.txt' 31 | 32 | ZX_TRAIN_ALL = _HOME + 'zx/train-all.txt' 33 | ZX_TRAIN = _HOME + 'zx/train.txt' 34 | ZX_DEV = _HOME + 'zx/dev.txt' 35 | ZX_TEST = _HOME + 'zx/test.txt' 36 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/sighan2005/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:42 4 | import os 5 | 6 | from hanlp.utils.io_util import get_resource, split_file 7 | from hanlp.utils.log_util import logger 8 | 9 | SIGHAN2005 = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip' 10 | 11 | 12 | def make(train): 13 | root = get_resource(SIGHAN2005) 14 | train = os.path.join(root, train.split('#')[-1]) 15 | if not os.path.isfile(train): 16 | full = train.replace('_90.txt', '.utf8') 17 | logger.info(f'Splitting {full} into training set and valid set with 9:1 proportion') 18 | valid = train.replace('90.txt', '10.txt') 19 | split_file(full, train=0.9, dev=0.1, test=0, names={'train': train, 'dev': valid}) 20 | assert os.path.isfile(train), f'Failed to make {train}' 21 | assert os.path.isfile(valid), f'Failed to make {valid}' 22 | logger.info(f'Successfully made {train} {valid}') 23 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/sighan2005/as_.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:42 4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make 5 | 6 | SIGHAN2005_AS_DICT = SIGHAN2005 + "#" + "gold/as_training_words.utf8" 7 | '''Dictionary built on the training set.''' 8 | SIGHAN2005_AS_TRAIN_ALL = SIGHAN2005 + "#" + "training/as_training.utf8" 9 | '''Full training set.''' 10 | SIGHAN2005_AS_TRAIN = SIGHAN2005 + "#" + "training/as_training_90.txt" 11 | '''Training set (first 90% of the full official training set).''' 12 | SIGHAN2005_AS_DEV = SIGHAN2005 + "#" + "training/as_training_10.txt" 13 | '''Dev set (last 10% of the full official training set).''' 14 | SIGHAN2005_AS_TEST_INPUT = SIGHAN2005 + "#" + "testing/as_testing.utf8" 15 | '''Test input.''' 16
| SIGHAN2005_AS_TEST = SIGHAN2005 + "#" + "gold/as_testing_gold.utf8" 17 | '''Test set.''' 18 | 19 | make(SIGHAN2005_AS_TRAIN) 20 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/sighan2005/cityu.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:42 4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make 5 | 6 | SIGHAN2005_CITYU_DICT = SIGHAN2005 + "#" + "gold/cityu_training_words.utf8" 7 | '''Dictionary built on the training set.''' 8 | SIGHAN2005_CITYU_TRAIN_ALL = SIGHAN2005 + "#" + "training/cityu_training.utf8" 9 | '''Full training set.''' 10 | SIGHAN2005_CITYU_TRAIN = SIGHAN2005 + "#" + "training/cityu_training_90.txt" 11 | '''Training set (first 90% of the full official training set).''' 12 | SIGHAN2005_CITYU_DEV = SIGHAN2005 + "#" + "training/cityu_training_10.txt" 13 | '''Dev set (last 10% of the full official training set).''' 14 | SIGHAN2005_CITYU_TEST_INPUT = SIGHAN2005 + "#" + "testing/cityu_test.utf8" 15 | '''Test input.''' 16 | SIGHAN2005_CITYU_TEST = SIGHAN2005 + "#" + "gold/cityu_test_gold.utf8" 17 | '''Test set.''' 18 | 19 | make(SIGHAN2005_CITYU_TRAIN) 20 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/sighan2005/msr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:42 4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make 5 | 6 | SIGHAN2005_MSR_DICT = SIGHAN2005 + "#" + "gold/msr_training_words.utf8" 7 | '''Dictionary built on the training set.''' 8 | SIGHAN2005_MSR_TRAIN_ALL = SIGHAN2005 + "#" + "training/msr_training.utf8" 9 | '''Full training set.''' 10 | SIGHAN2005_MSR_TRAIN = SIGHAN2005 + "#" + "training/msr_training_90.txt" 11 | '''Training set (first 90% of the full official training set).''' 12 | SIGHAN2005_MSR_DEV = SIGHAN2005 + "#" + "training/msr_training_10.txt" 13 | '''Dev set (last 10% of the full official training set).''' 14 | SIGHAN2005_MSR_TEST_INPUT = SIGHAN2005 + "#" + "testing/msr_test.utf8" 15 | '''Test input.''' 16 | SIGHAN2005_MSR_TEST = SIGHAN2005 + "#" + "gold/msr_test_gold.utf8" 17 | '''Test set.''' 18 | 19 | make(SIGHAN2005_MSR_TRAIN) 20 | -------------------------------------------------------------------------------- /hanlp/datasets/tokenization/sighan2005/pku.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:42 4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make 5 | 6 | SIGHAN2005_PKU_DICT = SIGHAN2005 + "#" + "gold/pku_training_words.utf8" 7 | '''Dictionary built on the training set.''' 8 | SIGHAN2005_PKU_TRAIN_ALL = SIGHAN2005 + "#" + "training/pku_training.utf8" 9 | '''Full training set.''' 10 | SIGHAN2005_PKU_TRAIN = SIGHAN2005 + "#" + "training/pku_training_90.txt" 11 | '''Training set (first 90% of the full official training set).''' 12 | SIGHAN2005_PKU_DEV = SIGHAN2005 + "#" + "training/pku_training_10.txt" 13 | '''Dev set (last 10% of the full official training set).''' 14 | SIGHAN2005_PKU_TEST_INPUT = SIGHAN2005 + "#" + "testing/pku_test.utf8" 15 | '''Test input.''' 16 | SIGHAN2005_PKU_TEST = SIGHAN2005 + "#" + "gold/pku_test_gold.utf8" 17 | '''Test set.''' 18 | 19 | make(SIGHAN2005_PKU_TRAIN) 20 |
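The `make()` helper in `sighan2005/__init__.py` above lazily materializes the `*_90.txt`/`*_10.txt` files that all four `SIGHAN2005_*_TRAIN`/`_DEV` constants point to. A rough stdlib equivalent of the 9:1 split it requests from `split_file` (a sketch only; the real helper in `hanlp.utils.io_util` may differ in details such as rounding):

```python
def split_90_10(full_path: str, train_path: str, dev_path: str) -> None:
    # SIGHAN2005 .utf8 corpora are one pre-segmented sentence per line; the first
    # 90% of lines become the training set and the remaining 10% the dev set.
    with open(full_path, encoding='utf-8') as src:
        lines = src.readlines()
    cut = int(len(lines) * 0.9)
    with open(train_path, 'w', encoding='utf-8') as out:
        out.writelines(lines[:cut])
    with open(dev_path, 'w', encoding='utf-8') as out:
        out.writelines(lines[cut:])
```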
-------------------------------------------------------------------------------- /hanlp/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-10-26 00:50 -------------------------------------------------------------------------------- /hanlp/layers/crf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-18 22:55 -------------------------------------------------------------------------------- /hanlp/layers/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-08-24 21:48 4 | -------------------------------------------------------------------------------- /hanlp/layers/feed_forward.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-06 14:37 4 | from typing import Union, List 5 | 6 | from alnlp.modules import feedforward 7 | 8 | from hanlp.common.structure import ConfigTracker 9 | 10 | 11 | class FeedForward(feedforward.FeedForward, ConfigTracker): 12 | def __init__(self, input_dim: int, num_layers: int, hidden_dims: Union[int, List[int]], 13 | activations: Union[str, List[str]], dropout: Union[float, List[float]] = 0.0) -> None: 14 | super().__init__(input_dim, num_layers, hidden_dims, activations, dropout) 15 | ConfigTracker.__init__(self, locals()) 16 | -------------------------------------------------------------------------------- /hanlp/layers/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-29 15:17 4 | # mute transformers 5 | import logging 6 | 7 | logging.getLogger('transformers.file_utils').setLevel(logging.ERROR) 8 | logging.getLogger('transformers.filelock').setLevel(logging.ERROR) 9 | logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR) 10 | logging.getLogger('transformers.configuration_utils').setLevel(logging.ERROR) 11 | logging.getLogger('transformers.modeling_tf_utils').setLevel(logging.ERROR) 12 | logging.getLogger('transformers.modeling_utils').setLevel(logging.ERROR) 13 | logging.getLogger('transformers.tokenization_utils_base').setLevel(logging.ERROR) 14 | -------------------------------------------------------------------------------- /hanlp/layers/transformers/loader_tf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-04 06:05 4 | import tensorflow as tf 5 | from transformers import TFAutoModel 6 | 7 | from hanlp.layers.transformers.pt_imports import AutoTokenizer_, AutoModel_ 8 | 9 | 10 | def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False): 11 | tokenizer = AutoTokenizer_.from_pretrained(transformer) 12 | if tokenizer_only: 13 | return tokenizer 14 | l_bert = TFAutoModel.from_pretrained(transformer) 15 | l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids") 16 | l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids") 17 | l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids") 18 | output = l_bert(input_ids=l_input_ids,
token_type_ids=l_token_type_ids, attention_mask=l_mask_ids).last_hidden_state 19 | if not tagging: 20 | output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output) 21 | logits = tf.keras.layers.Dense(num_labels)(output) 22 | model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits) 23 | model.build(input_shape=(None, max_seq_length)) 24 | return model, tokenizer 25 | -------------------------------------------------------------------------------- /hanlp/layers/transformers/tf_imports.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-08 21:57 4 | from transformers import BertTokenizer, BertConfig, PretrainedConfig, TFAutoModel, \ 5 | AutoConfig, AutoTokenizer, PreTrainedTokenizer, TFPreTrainedModel, TFAlbertModel, TFAutoModelWithLMHead, \ 6 | BertTokenizerFast, TFAlbertForMaskedLM, AlbertConfig, TFBertModel 7 | -------------------------------------------------------------------------------- /hanlp/losses/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-20 01:28 -------------------------------------------------------------------------------- /hanlp/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-09-14 21:55 -------------------------------------------------------------------------------- /hanlp/metrics/amr/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-24 12:47 -------------------------------------------------------------------------------- /hanlp/metrics/chunking/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 03:49 -------------------------------------------------------------------------------- /hanlp/metrics/chunking/binary_chunking_f1.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-02 14:27 4 | from collections import defaultdict 5 | from typing import List, Union 6 | 7 | import torch 8 | 9 | from hanlp.metrics.f1 import F1 10 | 11 | 12 | class BinaryChunkingF1(F1): 13 | def __call__(self, pred_tags: torch.LongTensor, gold_tags: torch.LongTensor, lens: List[int] = None): 14 | if lens is None: 15 | lens = [gold_tags.size(1)] * gold_tags.size(0) 16 | self.update(self.decode_spans(pred_tags, lens), self.decode_spans(gold_tags, lens)) 17 | 18 | def update(self, pred_tags, gold_tags): 19 | for pred, gold in zip(pred_tags, gold_tags): 20 | super().__call__(set(pred), set(gold)) 21 | 22 | @staticmethod 23 | def decode_spans(pred_tags: torch.LongTensor, lens: Union[List[int], torch.LongTensor]): 24 | if isinstance(lens, torch.Tensor): 25 | lens = lens.tolist() 26 | batch_pred = defaultdict(list) 27 | for batch, offset in pred_tags.nonzero(as_tuple=False).tolist(): 28 | batch_pred[batch].append(offset) 29 | batch_pred_spans = [[(0, l)] for l in lens] 30 | for batch, offsets in batch_pred.items(): 31 | l = lens[batch] 32 | batch_pred_spans[batch] = list(zip(offsets, offsets[1:] + [l])) 33 | return batch_pred_spans 34 | -------------------------------------------------------------------------------- 
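A toy check may make the contract of `BinaryChunkingF1.decode_spans` above clearer: nonzero entries mark segment-starting offsets, and each batch element decodes into left-closed spans that end at the sequence length (the expected output below follows directly from the code shown, not from any official test; note that an unmarked offset 0 means the leading prefix is not emitted as a span):

```python
import torch

from hanlp.metrics.chunking.binary_chunking_f1 import BinaryChunkingF1

tags = torch.tensor([[0, 0, 1, 0, 1, 0],   # boundaries at offsets 2 and 4
                     [0, 0, 0, 0, 0, 0]])  # no boundary: whole sequence is one span
print(BinaryChunkingF1.decode_spans(tags, lens=[6, 6]))
# [[(2, 4), (4, 6)], [(0, 6)]]
```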
/hanlp/metrics/chunking/iobes_tf.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-09-14 21:55 4 | 5 | from hanlp.common.vocab_tf import VocabTF 6 | from hanlp.metrics.chunking.conlleval import SpanF1 7 | from hanlp.metrics.chunking.chunking_f1_tf import ChunkingF1_TF 8 | 9 | 10 | class IOBES_F1_TF(ChunkingF1_TF): 11 | 12 | def __init__(self, tag_vocab: VocabTF, from_logits=True, name='f1', dtype=None, **kwargs): 13 | super().__init__(tag_vocab, from_logits, name, dtype, **kwargs) 14 | self.state = SpanF1() 15 | 16 | def update_tags(self, true_tags, pred_tags): 17 | # true_tags = list(itertools.chain.from_iterable(true_tags)) 18 | # pred_tags = list(itertools.chain.from_iterable(pred_tags)) 19 | # self.state.update_state(true_tags, pred_tags) 20 | for gold, pred in zip(true_tags, pred_tags): 21 | self.state.update_state(gold, pred) 22 | return self.result() 23 | 24 | def result(self): 25 | return self.state.result(full=False, verbose=False).fscore 26 | 27 | def reset_states(self): 28 | self.state.reset_state() 29 | -------------------------------------------------------------------------------- /hanlp/metrics/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-06-03 11:35 4 | from abc import ABC, abstractmethod 5 | 6 | 7 | class Metric(ABC): 8 | 9 | def __lt__(self, other): 10 | return self.score < other 11 | 12 | def __le__(self, other): 13 | return self.score <= other 14 | 15 | def __eq__(self, other): 16 | return self.score == other 17 | 18 | def __ge__(self, other): 19 | return self.score >= other 20 | 21 | def __gt__(self, other): 22 | return self.score > other 23 | 24 | def __ne__(self, other): 25 | return self.score != other 26 | 27 | @property 28 | @abstractmethod 29 | def score(self): 30 | pass 31 | 32 | @abstractmethod 33 | def __call__(self, pred, gold, mask=None): 34 | pass 35 | 36 | def __repr__(self) -> str: 37 | return f'{self.score:.4f}' 38 | 39 | def __float__(self): 40 | return self.score 41 | 42 | @abstractmethod 43 | def reset(self): 44 | pass 45 | -------------------------------------------------------------------------------- /hanlp/metrics/parsing/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-27 00:48 -------------------------------------------------------------------------------- /hanlp/metrics/srl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-07-16 18:44 -------------------------------------------------------------------------------- /hanlp/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-11-11 18:44 -------------------------------------------------------------------------------- /hanlp/pretrained/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 19:10 4 | from hanlp.pretrained import tok 5 | from hanlp.pretrained import dep 6 | from hanlp.pretrained import sdp 7 | from hanlp.pretrained import glove 8 | from hanlp.pretrained import pos 9 | from hanlp.pretrained import rnnlm 10 | from hanlp.pretrained import
word2vec 11 | from hanlp.pretrained import ner 12 | from hanlp.pretrained import classifiers 13 | from hanlp.pretrained import fasttext 14 | from hanlp.pretrained import mtl 15 | from hanlp.pretrained import eos 16 | from hanlp.pretrained import sts 17 | from hanlp.pretrained import constituency 18 | from hanlp.pretrained import amr 19 | from hanlp.pretrained import amr2text 20 | from hanlp.pretrained import srl 21 | 22 | # Will be filled up during runtime 23 | ALL = {} 24 | -------------------------------------------------------------------------------- /hanlp/pretrained/amr2text.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-12-07 15:19 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | AMR3_GRAPH_PRETRAIN_GENERATION = HANLP_URL + 'amr2text/amr3_graph_pretrain_generation_20221207_153535.zip' 7 | '''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large AMR2Text generator trained on 8 | Abstract Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`). 9 | Its Sacre-BLEU is ``50.38`` according to their official repository. 10 | ''' 11 | 12 | # Will be filled up during runtime 13 | ALL = {} 14 | -------------------------------------------------------------------------------- /hanlp/pretrained/classifiers.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 03:51 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip' 7 | SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip' 8 | 9 | LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin' 10 | ''' 11 | 126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes. 12 | ''' 13 | LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz' 14 | ''' 15 | 917kB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes. 16 | ''' 17 | 18 | ALL = {} 19 | -------------------------------------------------------------------------------- /hanlp/pretrained/constituency.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-01-18 10:34 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | CTB9_CON_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_con_electra_small_20220215_230116.zip' 7 | 'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with major categories. ' \ 8 | 'Its performance is UCM=39.06% LCM=34.99% UP=90.05% UR=90.01% UF=90.03% LP=87.02% LR=86.98% LF=87.00%.' 9 | 10 | CTB9_CON_FULL_TAG_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip' 11 | 'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \ 12 | 'Its performance is UCM=38.29% LCM=28.95% UP=90.16% UR=90.13% UF=90.15% LP=83.46% LR=83.43% LF=83.45%.'
13 | 14 | CTB9_CON_FULL_TAG_ERNIE_GRAM = 'http://download.hanlp.com/constituency/extra/ctb9_full_tag_con_ernie_20220331_121430.zip' 15 | 'ERNIE-GRAM (:cite:`xiao-etal-2021-ernie`) base tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \ 16 | 'Its performance is UCM=42.04% LCM=31.72% UP=91.33% UR=91.53% UF=91.43% LP=85.31% LR=85.49% LF=85.40%.' 17 | 18 | # Will be filled up during runtime 19 | ALL = {} 20 | -------------------------------------------------------------------------------- /hanlp/pretrained/dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-29 02:55 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | CTB5_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb5_20191229_025833.zip' 7 | 'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB5.' 8 | CTB7_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb7_20200109_022431.zip' 9 | 'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB7.' 10 | CTB9_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/ctb9_dep_electra_small_20220216_100306.zip' 11 | 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-SD330. ' \ 12 | 'Performance is UAS=87.68% LAS=83.54%.' 13 | PMT1_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/pmt_dep_electra_small_20220218_134518.zip' 14 | 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on PKU ' \ 15 | 'Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`). Performance is UAS=91.21% LAS=88.65%.' 16 | CTB9_UDC_ELECTRA_SMALL = HANLP_URL + 'dep/udc_dep_electra_small_20220218_095452.zip' 17 | 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-UD420. ' \ 18 | 'Performance is UAS=85.92% LAS=81.13%.' 19 | 20 | PTB_BIAFFINE_DEP_EN = HANLP_URL + 'dep/ptb_dep_biaffine_20200101_174624.zip' 21 | 'Biaffine LSTM model (:cite:`dozat:17a`) trained on PTB.' 22 | 23 | ALL = {} 24 | -------------------------------------------------------------------------------- /hanlp/pretrained/eos.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-22 13:22 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | UD_CTB_EOS_MUL = HANLP_URL + 'eos/eos_ud_ctb_mul_20201222_133543.zip' 7 | 'EOS model (:cite:`Schweter:Ahmed:2019`) trained on concatenated UD2.3 and CTB9.' 8 | 9 | # Will be filled up during runtime 10 | ALL = {} 11 | -------------------------------------------------------------------------------- /hanlp/pretrained/fasttext.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-30 18:57 4 | FASTTEXT_DEBUG_EMBEDDING_EN = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext.debug.bin.zip' 5 | FASTTEXT_CC_300_EN = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz' 6 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Common Crawl.' 7 | FASTTEXT_WIKI_NYT_AMAZON_FRIENDS_200_EN \ 8 | = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext-200-wikipedia-nytimes-amazon-friends-20191107.bin' 9 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Wikipedia, NYTimes, Amazon and Friends.'
10 | 11 | FASTTEXT_WIKI_300_ZH = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.zip#wiki.zh.bin' 12 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Chinese Wikipedia.' 13 | FASTTEXT_WIKI_300_ZH_CLASSICAL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.zip#wiki.zh_classical.bin' 14 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Classical Chinese Wikipedia.' 15 | 16 | ALL = {} 17 | -------------------------------------------------------------------------------- /hanlp/pretrained/glove.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-08-27 20:42 4 | 5 | _GLOVE_6B_ROOT = 'http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip' 6 | 7 | GLOVE_6B_50D = _GLOVE_6B_ROOT + '#' + 'glove.6B.50d.txt' 8 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 50d trained on 6B tokens.' 9 | GLOVE_6B_100D = _GLOVE_6B_ROOT + '#' + 'glove.6B.100d.txt' 10 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 100d trained on 6B tokens.' 11 | GLOVE_6B_200D = _GLOVE_6B_ROOT + '#' + 'glove.6B.200d.txt' 12 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 200d trained on 6B tokens.' 13 | GLOVE_6B_300D = _GLOVE_6B_ROOT + '#' + 'glove.6B.300d.txt' 14 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 6B tokens.' 15 | 16 | GLOVE_840B_300D = 'http://nlp.stanford.edu/data/glove.840B.300d.zip' 17 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 840B tokens.' 18 | 19 | ALL = {} 20 | -------------------------------------------------------------------------------- /hanlp/pretrained/ner.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-30 20:07 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | MSRA_NER_BERT_BASE_ZH = HANLP_URL + 'ner/ner_bert_base_msra_20211227_114712.zip' 7 | 'BERT model (:cite:`devlin-etal-2019-bert`) trained on MSRA with 3 entity types.' 8 | MSRA_NER_ALBERT_BASE_ZH = HANLP_URL + 'ner/msra_ner_albert_base_20211228_173323.zip' 9 | 'ALBERT model (:cite:`Lan2020ALBERT:`) trained on MSRA with 3 entity types.' 10 | MSRA_NER_ELECTRA_SMALL_ZH = HANLP_URL + 'ner/msra_ner_electra_small_20220215_205503.zip' 11 | 'Electra small model (:cite:`clark2020electra`) trained on MSRA with 26 entity types. F1 = `95.16`' 12 | CONLL03_NER_BERT_BASE_CASED_EN = HANLP_URL + 'ner/ner_conll03_bert_base_cased_en_20211227_121443.zip' 13 | 'BERT model (:cite:`devlin-etal-2019-bert`) trained on CoNLL03.' 14 | 15 | ALL = {} 16 | -------------------------------------------------------------------------------- /hanlp/pretrained/rnnlm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-19 03:47 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | FLAIR_LM_FW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_fw_wmt11_en' 7 | 'The forward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).' 8 | FLAIR_LM_BW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_bw_wmt11_en' 9 | 'The backward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
10 | FLAIR_LM_WMT11_EN = HANLP_URL + 'lm/flair_lm_wmt11_en_20200601_205350.zip' 11 | 'The BiLSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).' 12 | 13 | ALL = {} 14 | -------------------------------------------------------------------------------- /hanlp/pretrained/sdp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-31 23:54 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | SEMEVAL16_NEWS_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-news-biaffine_20191231_235407.zip' 7 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 news data.' 8 | SEMEVAL16_TEXT_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-text-biaffine_20200101_002257.zip' 9 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text data.' 10 | 11 | SEMEVAL16_ALL_ELECTRA_SMALL_ZH = HANLP_URL + 'sdp/semeval16_sdp_electra_small_20220719_171433.zip' 12 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text and news data. Performance: ``UF: 83.03% LF: 72.58%``' 13 | 14 | SEMEVAL15_PAS_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_pas_20200103_152405.zip' 15 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PAS data.' 16 | SEMEVAL15_PSD_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_psd_20200106_123009.zip' 17 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PSD data.' 18 | SEMEVAL15_DM_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_dm_20200106_122808.zip' 19 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 DM data.' 20 | 21 | ALL = {} 22 | -------------------------------------------------------------------------------- /hanlp/pretrained/srl.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-08-07 19:07 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | CPB3_SRL_ELECTRA_SMALL = HANLP_URL + 'srl/cpb3_electra_small_crf_has_transform_20220218_135910.zip' 7 | 'Electra small model (:cite:`clark2020electra`) trained on CPB3. P=75.87% R=76.24% F1=76.05%.' 8 | 9 | ALL = {} 10 | -------------------------------------------------------------------------------- /hanlp/pretrained/sts.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-05-24 12:51 4 | from hanlp_common.constant import HANLP_URL 5 | 6 | STS_ELECTRA_BASE_ZH = HANLP_URL + 'sts/sts_electra_base_zh_20210530_200109.zip' 7 | 'A naive regression model trained on concatenated STS corpora.' 8 | 9 | # Will be filled up during runtime 10 | ALL = {} 11 | -------------------------------------------------------------------------------- /hanlp/transform/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-29 22:24 -------------------------------------------------------------------------------- /hanlp/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-08-24 22:12 4 | from . 
import rules 5 | 6 | 7 | def ls_resource_in_module(root) -> dict: 8 | res = dict() 9 | for k, v in root.__dict__.items(): 10 | if k.startswith('_') or v == root: 11 | continue 12 | if isinstance(v, str): 13 | if v.startswith('http') and not v.endswith('/') and not v.endswith('#') and not v.startswith('_'): 14 | res[k] = v 15 | elif type(v).__name__ == 'module': 16 | res.update(ls_resource_in_module(v)) 17 | if 'ALL' in root.__dict__ and isinstance(root.__dict__['ALL'], dict): 18 | root.__dict__['ALL'].update(res) 19 | return res 20 | -------------------------------------------------------------------------------- /hanlp/utils/file_read_backwards/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .file_read_backwards import FileReadBackwards # noqa: F401 4 | 5 | __author__ = """Robin Robin""" 6 | __email__ = 'robinsquare42@gmail.com' 7 | __version__ = '2.0.0' 8 | -------------------------------------------------------------------------------- /hanlp/utils/init_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-05-27 13:25 4 | import math 5 | 6 | import torch 7 | from torch import nn 8 | import functools 9 | 10 | 11 | def embedding_uniform(tensor: torch.Tensor, seed=233): 12 | gen = torch.Generator().manual_seed(seed) 13 | with torch.no_grad(): 14 | fan_out = tensor.size(-1) 15 | bound = math.sqrt(3.0 / fan_out) 16 | return tensor.uniform_(-bound, bound, generator=gen) 17 | -------------------------------------------------------------------------------- /hanlp/utils/lang/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-09 18:46 4 | 5 | __doc__ = ''' 6 | This package holds misc utils for specific languages. 7 | ''' 8 | -------------------------------------------------------------------------------- /hanlp/utils/lang/en/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 19:28 4 | -------------------------------------------------------------------------------- /hanlp/utils/lang/ja/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-05-13 13:24 4 | -------------------------------------------------------------------------------- /hanlp/utils/lang/zh/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-09 18:47 -------------------------------------------------------------------------------- /hanlp/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 19:26 4 | 5 | __version__ = '2.1.0-beta.64' 6 | """HanLP version""" 7 | 8 | 9 | class NotCompatible(Exception): 10 | pass 11 | -------------------------------------------------------------------------------- /plugins/README.md: -------------------------------------------------------------------------------- 1 | # Plugins for HanLP 2 | 3 | This directory contains modules shared across several individual packages or non-core APIs. 4 | If you plan to submit any plugins, please put them here too.
5 | 6 | For developers, run the following setup: 7 | 8 | ```bash 9 | pip install -e hanlp_trie 10 | pip install -e hanlp_common 11 | pip install -e hanlp_restful 12 | ``` -------------------------------------------------------------------------------- /plugins/hanlp_common/README.md: -------------------------------------------------------------------------------- 1 | # Common utilities and structures for HanLP 2 | 3 | [中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker) 4 | 5 | The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcomed on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch. 6 | 7 | 8 | ## Installation 9 | 10 | ```bash 11 | pip install hanlp 12 | ``` 13 | 14 | ## License 15 | 16 | HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website. 17 | 18 | -------------------------------------------------------------------------------- /plugins/hanlp_common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-16 22:20 4 | -------------------------------------------------------------------------------- /plugins/hanlp_common/hanlp_common/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-16 22:21 4 | -------------------------------------------------------------------------------- /plugins/hanlp_common/hanlp_common/constant.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-06-13 22:41 4 | import os 5 | 6 | PAD = '<pad>' 7 | '''Padding token.''' 8 | UNK = '<unk>' 9 | '''Unknown token.''' 10 | CLS = '[CLS]' 11 | BOS = '<bos>' 12 | EOS = '<eos>' 13 | ROOT = BOS 14 | IDX = '_idx_' 15 | '''Key for index.''' 16 | HANLP_URL = os.getenv('HANLP_URL', 'https://file.hankcs.com/hanlp/') 17 | '''Resource URL.''' 18 | HANLP_VERBOSE = os.environ.get('HANLP_VERBOSE', '1').lower() in ('1', 'true', 'yes') 19 | '''Enable verbose or not.''' 20 | NULL = '<null>' 21 | PRED = 'PRED' 22 | 23 | IPYTHON = os.environ.get('HANLP_IPYTHON', '1').lower() in ('1', 'true', 'yes') # Allow the user to disable IPYTHON 24 | if IPYTHON: 25 | try: 26 | # noinspection PyUnresolvedReferences,PyStatementEffect 27 | get_ipython 28 | except NameError: 29 | IPYTHON = False 30 | -------------------------------------------------------------------------------- /plugins/hanlp_common/hanlp_common/io.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-16 22:38 4 | import json 5 | import os 6 | import pickle 7 | import sys 8 | from typing
import Union 9 | 10 | 11 | def save_pickle(item, path): 12 | with open(path, 'wb') as f: 13 | pickle.dump(item, f) 14 | 15 | 16 | def load_pickle(path): 17 | with open(path, 'rb') as f: 18 | return pickle.load(f) 19 | 20 | 21 | def save_json(item: Union[dict, list, str, int, float], path: str, ensure_ascii=False, cls=None, 22 | default=lambda o: repr(o), indent=2): 23 | dirname = os.path.dirname(path) 24 | if dirname: 25 | os.makedirs(dirname, exist_ok=True) 26 | with open(path, 'w', encoding='utf-8') as out: 27 | json.dump(item, out, ensure_ascii=ensure_ascii, indent=indent, cls=cls, default=default) 28 | 29 | 30 | def load_json(path): 31 | with open(path, encoding='utf-8') as src: 32 | return json.load(src) 33 | 34 | 35 | def filename_is_json(filename): 36 | filename, file_extension = os.path.splitext(filename) 37 | return file_extension in ['.json', '.jsonl'] 38 | 39 | 40 | def eprint(*args, **kwargs): 41 | print(*args, file=sys.stderr, **kwargs) 42 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/README.md: -------------------------------------------------------------------------------- 1 | # Demos and examples for HanLP 2 | 3 | This package is intended for demonstration purposes and won't be released to PyPI. **Training requires a fair understanding of Linux and Python, which might not be the case for everybody.** 4 | 5 | You need a Linux/macOS system with Internet access because some corpora and bash scripts will be downloaded during training. Training on Windows might work if you are an expert, but we believe such cases are very rare. 6 | 7 | Your `python` command needs to be Python2 while `python3` needs to be Python3. 8 | 9 | You need to install this package and run it from the **root** folder of HanLP. 10 | 11 | ```bash 12 | pip install -e plugins/hanlp_demo 13 | python3 plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py 14 | ``` 15 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-11-29 17:48 4 | 5 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/block_windows.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-07-28 21:38 4 | from hanlp.utils.io_util import windows 5 | 6 | assert not windows(), 'Windows is not supported for this script. Please run it on Linux systems.'
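Returning to the `hanlp_common.io` helpers just above, they round-trip straightforwardly; a small usage sketch (the target path is hypothetical):

```python
from hanlp_common.io import load_json, save_json

doc = {'tok': ['商品', '和', '服务']}
save_json(doc, 'tmp/doc.json')  # the parent directory is created on demand
assert load_json('tmp/doc.json') == doc  # non-ASCII survives thanks to ensure_ascii=False
```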
7 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 17:55 -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_amr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-01-25 19:09 4 | import hanlp 5 | 6 | amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE) 7 | amr = amr_parser('The boy wants the girl to believe him.') 8 | print(amr) 9 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 17:55 4 | import hanlp 5 | 6 | syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN) 7 | sent = [('Is', 'VBZ'), 8 | ('this', 'DT'), 9 | ('the', 'DT'), 10 | ('future', 'NN'), 11 | ('of', 'IN'), 12 | ('chamber', 'NN'), 13 | ('music', 'NN'), 14 | ('?', '.')] 15 | tree = syntactic_parser(sent) 16 | print(tree) 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_lm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-02-11 09:14 4 | import hanlp 5 | 6 | lm = hanlp.load(hanlp.pretrained.rnnlm.FLAIR_LM_FW_WMT11_EN_TF) 7 | print(''.join(lm.generate_text(list('hello')))) 8 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_ner.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-03 22:50 4 | import hanlp 5 | 6 | recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_CASED_EN) 7 | print(recognizer(["President", "Obama", "is", "speaking", "at", "the", "White", "House", "."])) 8 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-04 21:05 4 | import hanlp 5 | from hanlp.utils.lang.en.english_tokenizer import tokenize_english 6 | 7 | tokenizer = tokenize_english 8 | tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN) 9 | syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN) 10 | semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN) 11 | 12 | pipeline = hanlp.pipeline() \ 13 | .append(hanlp.utils.rules.split_sentence, output_key='sentences') \ 14 | .append(tokenizer, output_key='tokens') \ 15 | .append(tagger, output_key='part_of_speech_tags') \ 16 | .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', 17 | conll=False) \ 18 | .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', 19 | conll=False) 20 | print(pipeline) 21 | 22 | text = '''Jobs and Wozniak co-founded Apple in 1976 to sell Wozniak's Apple I personal computer. 
23 | Together the duo gained fame and wealth a year later with the Apple II. 24 | ''' 25 | 26 | doc = pipeline(text) 27 | print(doc) 28 | 29 | # You can save the config to disk for deploying or sharing. 30 | pipeline.save('en.json') 31 | # Then load it smoothly. 32 | deployed = hanlp.load('en.json') 33 | print(deployed) 34 | print(deployed(text)) 35 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_pos.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-03 22:16 4 | import hanlp 5 | 6 | tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN) 7 | print(tagger([['I', 'banked', '2', 'dollars', 'in', 'a', 'bank', '.'], 8 | ['Is', 'this', 'the', 'future', 'of', 'chamber', 'music', '?']])) 9 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_sdp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-03 15:26 4 | import hanlp 5 | from hanlp_common.conll import CoNLLSentence 6 | 7 | # semeval15 offers three independent annotations over the Penn Treebank (PTB) 8 | semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN) 9 | # semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_DM_BIAFFINE_EN) 10 | # semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PSD_BIAFFINE_EN) 11 | sent = [('Is', 'VBZ'), 12 | ('this', 'DT'), 13 | ('the', 'DT'), 14 | ('future', 'NN'), 15 | ('of', 'IN'), 16 | ('chamber', 'NN'), 17 | ('music', 'NN'), 18 | ('?', '.')] 19 | tree = semantic_parser(sent) # type:CoNLLSentence 20 | print(tree) 21 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 03:52 4 | import hanlp 5 | 6 | classifier = hanlp.load('SST2_ALBERT_BASE_EN') 7 | print(classifier.predict('I feel lucky')) 8 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/demo_tok.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-02 19:41 4 | from hanlp.utils.lang.en.english_tokenizer import tokenize_english 5 | 6 | text = """\ 7 | Don't go gentle into that good night. 
8 | """ 9 | print(tokenize_english(text)) 10 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/en/train_sst2_albert_base.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-11-10 17:41 4 | import os 5 | 6 | from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF 7 | 8 | from tests import cdroot 9 | 10 | from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_DEV, STANFORD_SENTIMENT_TREEBANK_2_TRAIN, \ 11 | STANFORD_SENTIMENT_TREEBANK_2_TEST 12 | 13 | cdroot() 14 | save_dir = os.path.join('data', 'model', 'sst', 'sst2_albert_base') 15 | classifier = TransformerClassifierTF() 16 | classifier.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN, STANFORD_SENTIMENT_TREEBANK_2_DEV, save_dir, 17 | transformer='albert-base-v2') 18 | classifier.load(save_dir) 19 | print(classifier('it\' s a charming and often affecting journey')) 20 | classifier.evaluate(STANFORD_SENTIMENT_TREEBANK_2_TEST, save_dir=save_dir) 21 | print(f'Model saved in {save_dir}') 22 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/ja/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-05-17 22:30 4 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/ja/demo_mtl.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-05-17 22:30 4 | import hanlp 5 | from hanlp_common.document import Document 6 | 7 | HanLP = hanlp.load(hanlp.pretrained.mtl.NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA) 8 | doc: Document = HanLP([ 9 | '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', 10 | '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。', 11 | ]) 12 | print(doc) 13 | doc.pretty_print() 14 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/mul/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-31 22:25 4 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/mul/demo_lid.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-09-28 16:49 4 | import hanlp 5 | 6 | lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE) 7 | 8 | print(lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.')) 9 | lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True) 10 | print(f'{lang} language identified with probability {prob:.3%}') 11 | print(lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2)) 12 | 13 | # For a combination of languages, predict top-k languages with probabilities: 14 | text = ''' 15 | 2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。 16 | In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments. 
17 | ''' 18 | 19 | print(lid(text, topk=3, prob=True)) 20 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/mul/demo_lid_restful.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-09-28 16:49 4 | from hanlp_restful import HanLPClient 5 | 6 | HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul') 7 | 8 | print(HanLP.language_identification([ 9 | 'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.', 10 | '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', 11 | '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', 12 | ])) 13 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/mul/demo_mtl.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-31 13:51 4 | import hanlp 5 | from hanlp_common.document import Document 6 | 7 | HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE) 8 | doc: Document = HanLP([ 9 | 'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.', 10 | '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', 11 | '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', 12 | ]) 13 | print(doc) 14 | doc.pretty_print() 15 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/mul/train/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2023-02-21 19:40 4 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/sent_split.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-31 14:23 4 | import hanlp 5 | 6 | split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL) 7 | output = split_sent('3.14 is pi. “你好!!!”——他说。劇場版「Fate/stay night [HF]」最終章公開カウントダウン!') 8 | print('\n'.join(output)) 9 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/eos.html 10 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-31 13:51 4 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_amr.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-04-12 22:19 4 | import hanlp 5 | 6 | parser = hanlp.load(hanlp.pretrained.amr.MRP2020_AMR_ENG_ZHO_XLM_BASE) 7 | 8 | # For Chinese: 9 | print(parser(["男孩", "希望", "女孩", "相信", "他", "。"])) 10 | print(parser(["男孩", "希望", "女孩", "相信", "他", "。"], output_amr=False)) 11 | 12 | # For English: 13 | print(parser(['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.'], language='eng')) 14 | # It's suggested to also feed the lemma for stabler performance. 
15 | print(parser([('The', 'the'), ('boy', 'boy'), ('wants', 'want'), ('the', 'the'), ('girl', 'girl'), ('to', 'to'), 16 | ('believe', 'believe'), ('him', 'he'), ('.', '.')], language='eng')) 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-15 22:26 4 | import hanlp 5 | from hanlp.components.mtl.multi_task_learning import MultiTaskLearning 6 | from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization 7 | 8 | # Load the multi-task model 9 | HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) 10 | # Get the tokenization task (every task whose name starts with "tok" is a tokenization task; the fine-grained standard is used here) 11 | tok: TaggingTokenization = HanLP['tok/fine'] 12 | 13 | tok.dict_force = tok.dict_combine = None 14 | print(f'不挂词典:\n{HanLP("商品和服务项目")["tok/fine"]}') 15 | 16 | tok.dict_force = {'和服', '服务项目'} 17 | print(f'强制模式:\n{HanLP("商品和服务项目")["tok/fine"]}') # Use with caution; see Chapter 2 of "Introduction to Natural Language Processing" 18 | 19 | tok.dict_force = {'和服务': ['和', '服务']} 20 | print(f'强制校正:\n{HanLP("正向匹配商品和服务、任何和服务必按上述切分")["tok/fine"]}') 21 | 22 | tok.dict_force = None 23 | tok.dict_combine = {'和服', '服务项目'} 24 | print(f'合并模式:\n{HanLP("商品和服务项目")["tok/fine"]}') 25 | 26 | # Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php 27 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html 28 | 29 | # Words containing spaces, tabs, etc. (characters that the Transformer tokenizer strips) need to be provided as tuples 30 | tok.dict_combine = {('iPad', 'Pro'), '2个空格'} 31 | print(f'空格匹配:\n{HanLP("如何评价iPad Pro ?iPad Pro有2个空格", tasks="tok/fine")["tok/fine"]}') 32 | # Smart users, read on: a string in the tuple dictionary is effectively equivalent to all of its possible segmentations 33 | print(f'词典内容:\n{dict(tok.dict_combine.config["dictionary"]).keys()}') 34 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict_stl.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-15 22:26 4 | import hanlp 5 | from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer 6 | 7 | # Load an old single-task model to demonstrate a segmentation error (already fixed in the latest version): 8 | tok: TransformerTaggingTokenizer = hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip') 9 | 10 | tok.dict_force = tok.dict_combine = None 11 | print(f'不挂词典:\n{tok("首相和川普通电话")}') 12 | 13 | tok.dict_force = {'川普'} 14 | print(f'强制模式:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}') # Use with caution; see Chapter 2 of "Introduction to Natural Language Processing" 15 | 16 | tok.dict_force = {'川普通电话': ['川普', '通', '电话']} 17 | print(f'强制校正:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}') 18 | 19 | tok.dict_force = None 20 | tok.dict_combine = {'美国总统'} 21 | print(f'合并模式:\n{tok("首相和川普通电话,川普是美国总统。")}') 22 | 23 | # Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php 24 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html 25 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-02-03 13:28 4 | import hanlp 5 | from hanlp.components.mtl.multi_task_learning import MultiTaskLearning 6 | from hanlp_common.document import Document 7 | 8 | HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) 9 |
tasks = list(HanLP.tasks.keys()) 10 | print(tasks) # Pick what you need from what we have 11 | for task in tasks: 12 | if task not in ('tok', 'pos'): 13 | del HanLP[task] 14 | # You can save it as a new component 15 | # HanLP.save('path/to/new/component') 16 | # HanLP.load('path/to/new/component') 17 | print(HanLP.tasks.keys()) 18 | doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', 'up主来到北京立方庭参观自然语义科技公司。']) 19 | print(doc) 20 | doc.pretty_print() 21 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_mlm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-01-29 21:11 4 | from hanlp.components.lm.mlm import MaskedLanguageModel 5 | 6 | mlm = MaskedLanguageModel() 7 | mlm.load('bert-base-chinese') 8 | print(mlm('生活的真谛是[MASK]。')) 9 | 10 | # Batching is always faster 11 | print(mlm(['生活的真谛是[MASK]。', '巴黎是[MASK][MASK]的首都。'])) 12 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-31 13:51 4 | import hanlp 5 | from hanlp_common.document import Document 6 | 7 | # CLOSE is the closed-source corpus annotated by 自然语义; BASE is the medium-sized model; ZH means Chinese 8 | HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH) 9 | # All tasks are executed by default 10 | doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。']) 11 | # The return type Document is a subclass of dict, and its printed form is JSON-compatible 12 | print(doc) 13 | # Instant visualization; maximize your window to prevent line wrapping; calling this in a Jupyter Notebook is recommended 14 | doc.pretty_print() 15 | # Visualize NER in the OntoNotes standard instead 16 | # doc.pretty_print(ner='ner/ontonotes', pos='pku') 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_ner_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-04-29 11:06 4 | import hanlp 5 | from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition 6 | from hanlp.utils.io_util import get_resource 7 | 8 | HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH) 9 | ner: TaggingNamedEntityRecognition = HanLP['ner/msra'] 10 | ner.dict_whitelist = {'午饭后': 'TIME'} 11 | doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra') 12 | doc.pretty_print() 13 | print(doc['ner/msra']) 14 | 15 | ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')} 16 | HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print() 17 | 18 | # HanLP.save(get_resource(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH)) 19 | 20 | # Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php 21 | # See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html 22 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-28 20:47 4 | import hanlp 5 | 6 | # Pipeline allows blending multiple callable functions, no matter whether each is a rule, a TensorFlow component or a PyTorch 7 | # one. However, it's slower than the MTL framework.
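# Each pipe below reads its input_key from the shared Document and writes its result under output_key;
# when input_key is omitted, the previous pipe's output is used.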
8 | # pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE) # In case both tf and torch are used, load tf first. 9 | 10 | HanLP = hanlp.pipeline() \ 11 | .append(hanlp.utils.rules.split_sentence, output_key='sentences') \ 12 | .append(hanlp.load('CTB9_TOK_ELECTRA_SMALL'), output_key='tok') \ 13 | .append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \ 14 | .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \ 15 | .append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=False), output_key='dep', input_key='tok') \ 16 | .append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok') 17 | 18 | doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。') 19 | print(doc) 20 | doc.pretty_print() 21 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-15 22:26 4 | import hanlp 5 | from hanlp.components.mtl.multi_task_learning import MultiTaskLearning 6 | from hanlp.components.mtl.tasks.pos import TransformerTagging 7 | from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization 8 | from tests import cdroot 9 | 10 | cdroot() 11 | HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH) 12 | 13 | # Demonstrates custom dict in part-of-speech tagging 14 | pos: TransformerTagging = HanLP['pos/ctb'] 15 | 16 | print('自定义单个词性:') 17 | pos.dict_tags = {'HanLP': 'state-of-the-art-tool'} 18 | HanLP("HanLP为生产环境带来次世代最先进的多语种NLP技术。", tasks='pos/ctb').pretty_print() 19 | 20 | print('根据上下文自定义词性:') 21 | pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'} 22 | HanLP("我的希望是希望张晚霞的背影被晚霞映红。", tasks='pos/ctb').pretty_print() 23 | 24 | # Understanding this requires some background in algorithms; beginners may refer to http://nlp.hankcs.com/book.php 25 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/taggers/transformer_tagger.html 26 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_sts.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-05-24 13:15 4 | import hanlp 5 | 6 | sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH) 7 | print(sim([ 8 | ['看图猜一电影名', '看图猜电影'], 9 | ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'], 10 | ['北京到上海的动车票', '上海到北京的动车票'], 11 | ])) 12 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/demo_word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-12 18:33 4 | import hanlp 5 | import torch 6 | 7 | word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU) 8 | vec = word2vec('先进') 9 | print(vec) 10 | 11 | print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('优秀'), dim=0)) 12 | print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('水果'), dim=0)) 13 | 14 | print('获取语义最相似的词语:') 15 | print(word2vec.most_similar('上海')) 16 | # print(word2vec.most_similar(['上海', '寒冷'])) # batching is faster 17 | 18 | print('非常寒冷是OOV所以无法获取:') 19 | print(word2vec.most_similar('非常寒冷')) 20 | print('但是在doc2vec模式下OOV也可以进行相似度计算:') 21 | print(word2vec.most_similar('非常寒冷', doc2vec=True)) 22 | print('甚至可以处理短文本:') 23 |
print(word2vec.most_similar('国家图书馆推出2022年春节主题活动', doc2vec=True)) 24 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-31 20:36 4 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_classifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 03:52 4 | from hanlp.datasets.classification.sentiment import CHNSENTICORP_ERNIE_TEST 5 | 6 | import hanlp 7 | 8 | classifier = hanlp.load('CHNSENTICORP_BERT_BASE_ZH') 9 | print(classifier.predict('前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!')) 10 | 11 | # predict a whole file in batch mode 12 | outputs = classifier.predict(classifier.transform.file_to_inputs(CHNSENTICORP_ERNIE_TEST), gold=True) 13 | print(outputs[:5]) 14 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_client.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-08 04:43 4 | # pip3 install tensorflow-serving-api-gpu 5 | import grpc 6 | import tensorflow as tf 7 | from tensorflow_core.python.framework import tensor_util 8 | from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc 9 | import hanlp 10 | from hanlp.common.keras_component import KerasComponent 11 | 12 | tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN, transform_only=True) 13 | transform = tagger.transform 14 | del tagger 15 | 16 | inputs = [['商品', '和', '服务'], 17 | ['我', '的', '希望', '是', '希望', '和平']] 18 | 19 | samples = next(iter(transform.inputs_to_dataset(inputs)))[0] 20 | print(samples) 21 | 22 | channel = grpc.insecure_channel('{host}:{port}'.format(host='localhost', port=8500)) 23 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel) 24 | request = predict_pb2.PredictRequest() 25 | request.model_spec.name = 'ctb5_pos_rnn_20191229_015325' 26 | request.model_spec.signature_name = 'serving_default' 27 | request.inputs['embedding_input'].CopyFrom( 28 | tf.make_tensor_proto(samples, dtype=tf.float32)) 29 | result = stub.Predict(request, 10.0) # 10 secs timeout 30 | print(result) 31 | prediction = tensor_util.MakeNdarray(result.outputs['dense']) 32 | print(prediction) 33 | 34 | print(list(transform.Y_to_outputs(prediction, inputs=inputs))) 35 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_cws.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 21:25 4 | import hanlp 5 | 6 | tokenizer = hanlp.load(hanlp.pretrained.tok.LARGE_ALBERT_BASE) 7 | print(tokenizer('商品和服务')) 8 | print(tokenizer(['萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。', 9 | '上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。', 10 | 'HanLP支援臺灣正體、香港繁體,具有新詞辨識能力的中文斷詞系統'])) 11 | 12 | text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。' 13 | print(tokenizer(text)) 14 | 15 | dic = {'自定义词典': 'custom_dict', '聪明人': 'smart'} 16 | 17 | 18 | def split_by_dic(text: str): 19 | # We use regular expression for the sake of simplicity. 
20 | # However, you should use a trie for production 21 | import re 22 | p = re.compile('(' + '|'.join(dic.keys()) + ')') 23 | sents, offset, words = [], 0, [] 24 | for m in p.finditer(text): 25 | if offset < m.start(): 26 | sents.append(text[offset: m.start()]) 27 | words.append((m.group(), dic[m.group()])) 28 | offset = m.end() 29 | if offset < len(text): 30 | sents.append(text[offset:]) 31 | words.append((None, None)) 32 | flat = [] 33 | for pred, (word, tag) in zip(tokenizer(sents), words): 34 | flat.extend(pred) 35 | if word: 36 | flat.append((word, tag)) 37 | return flat 38 | 39 | 40 | print(split_by_dic(text)) 41 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 21:25 4 | import hanlp 5 | 6 | syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH) 7 | sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')] 8 | tree = syntactic_parser(sent) 9 | print(tree) 10 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_fasttext.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-12 18:33 4 | import hanlp 5 | import torch 6 | 7 | # fasttext is a `torch.nn.Module`. Unless you know how to code in 8 | # PyTorch, don't bother to use this. 9 | fasttext = hanlp.load(hanlp.pretrained.fasttext.FASTTEXT_WIKI_300_ZH) 10 | 11 | vec = fasttext('单词') 12 | print(vec) 13 | 14 | print(torch.nn.functional.cosine_similarity(fasttext('单词'), fasttext('词语'), dim=0)) 15 | print(torch.nn.functional.cosine_similarity(fasttext('单词'), fasttext('今天'), dim=0)) 16 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_multiprocess.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-02-15 11:30 4 | import multiprocessing 5 | import hanlp 6 | 7 | tokenizer = hanlp.load(hanlp.pretrained.tok.LARGE_ALBERT_BASE) 8 | 9 | 10 | def worker(job): 11 | print(job) 12 | print(tokenizer(job)) 13 | 14 | 15 | if __name__ == '__main__': 16 | num_proc = 2 17 | # Important! The Python multiprocessing package defaults to calling fork when creating a child process. 18 | # This cannot work when the child process calls async code (i.e., TensorFlow is multithreaded).
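# Note that with 'spawn', each worker process re-imports this module and loads its own copy of the model,
# so memory usage grows with num_proc.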
19 | # See https://github.com/tensorflow/tensorflow/issues/8220#issuecomment-302826884 20 | # See https://sefiks.com/2019/03/20/tips-and-tricks-for-gpu-and-multiprocessing-in-tensorflow/ 21 | multiprocessing.set_start_method('spawn', force=True) # only spawn works with TensorFlow 22 | with multiprocessing.Pool(num_proc) as pool: 23 | pool.map(worker, [f'给{i}号进程的任务' for i in range(num_proc)]) 24 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_ner.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-30 19:52 4 | import hanlp 5 | 6 | recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH) 7 | print(recognizer.predict([list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'), 8 | list('萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。')])) 9 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_pipeline.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-31 03:24 4 | 5 | import hanlp 6 | 7 | tokenizer = hanlp.load('LARGE_ALBERT_BASE') 8 | tagger = hanlp.load('CTB9_POS_ALBERT_BASE') 9 | syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH') 10 | semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH') 11 | 12 | pipeline = hanlp.pipeline() \ 13 | .append(hanlp.utils.rules.split_sentence, output_key='sentences') \ 14 | .append(tokenizer, output_key='tokens') \ 15 | .append(tagger, output_key='part_of_speech_tags') \ 16 | .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', conll=False) \ 17 | .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', conll=False) 18 | print(pipeline) 19 | 20 | text = '''HanLP是一系列模型与算法组成的自然语言处理工具包,目标是普及自然语言处理在生产环境中的应用。 21 | HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。 22 | 内部算法经过工业界和学术界考验,配套书籍《自然语言处理入门》已经出版。 23 | ''' 24 | 25 | doc = pipeline(text) 26 | print(doc) 27 | # By default the doc is JSON-serializable; this holds as long as your pipes output JSON-serializable objects too. 28 | # print(json.dumps(doc, ensure_ascii=False, indent=2)) 29 | 30 | # You can save the config to disk for deploying or sharing. 31 | pipeline.save('zh.json') 32 | # Then load it smoothly.
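# Note: zh.json presumably stores only the pipeline configuration (pipe order, keys and model
# identifiers) rather than model weights, so loading re-assembles the pipeline by fetching each component.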
33 | deployed = hanlp.load('zh.json') 34 | print(deployed) 35 | print(deployed(text)) 36 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_pos.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 21:25 4 | import hanlp 5 | from hanlp.pretrained.pos import CTB9_POS_ALBERT_BASE 6 | 7 | tagger = hanlp.load(CTB9_POS_ALBERT_BASE) 8 | print(tagger.predict(['我', '的', '希望', '是', '希望', '世界', '和平'])) 9 | print(tagger.predict([['支持', '批处理', '地', '预测'], ['速度', '更', '快']])) 10 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_sdp.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-31 23:55 4 | import hanlp 5 | 6 | semantic_parser = hanlp.load('SEMEVAL16_NEWS_BIAFFINE_ZH') 7 | sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')] 8 | print(semantic_parser(sent)) 9 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/demo_serving.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-06 20:23 4 | import hanlp 5 | from hanlp.common.keras_component import KerasComponent 6 | 7 | tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN) 8 | print(tagger('商品 和 服务'.split())) 9 | tagger.serve() 10 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2021-12-26 23:25 4 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 20:55 -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_albert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:22 4 | 5 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF 6 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST 7 | from tests import cdroot 8 | 9 | cdroot() 10 | tokenizer = TransformerTokenizerTF() 11 | save_dir = 'data/model/cws_bert_albert_ctb6' 12 | tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir, 13 | transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2', 14 | metrics='f1', learning_rate=5e-5, epochs=3) 15 | tokenizer.load(save_dir) 16 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 17 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir) 18 | print(f'Model saved in {save_dir}') 19 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:22 4 | 5 | 
from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF 6 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST 7 | from tests import cdroot 8 | 9 | cdroot() 10 | tokenizer = TransformerTokenizerTF() 11 | save_dir = 'data/model/cws_bert_base_ctb6' 12 | tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir, transformer='chinese_L-12_H-768_A-12', 13 | metrics='f1') 14 | tokenizer.load(save_dir) 15 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 16 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir) 17 | print(f'Model saved in {save_dir}') 18 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_convseg.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:22 4 | import tensorflow as tf 5 | 6 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF 7 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST 8 | from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR 9 | from tests import cdroot 10 | 11 | cdroot() 12 | tokenizer = NgramConvTokenizerTF() 13 | save_dir = 'data/model/cws/ctb6_cws' 14 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, 15 | epsilon=1e-8, clipnorm=5) 16 | tokenizer.fit(CTB6_CWS_TRAIN, 17 | CTB6_CWS_DEV, 18 | save_dir, 19 | word_embed={'class_name': 'HanLP>Word2VecEmbedding', 20 | 'config': { 21 | 'trainable': True, 22 | 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR, 23 | 'expand_vocab': False, 24 | 'lowercase': False, 25 | }}, 26 | optimizer=optimizer, 27 | window_size=0, 28 | weight_norm=True) 29 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False) 30 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 31 | print(f'Model saved in {save_dir}') 32 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_bert_cws.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:39 4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF 5 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | tokenizer = TransformerTokenizerTF() 10 | save_dir = 'data/model/cws_bert_base_100million' 11 | tokenizer.fit('data/cws/large/all.txt', CTB6_CWS_DEV, save_dir, transformer='bert-base-chinese', 12 | metrics='accuracy', batch_size=32) 13 | tokenizer.load(save_dir, metrics='f1') 14 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 15 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir) 16 | print(f'Model saved in {save_dir}') 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_conv_cws.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-29 21:58 4 | 5 | import tensorflow as tf 6 | 7 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF 8 | from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST 9 | from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR 10 | from tests import cdroot 11 | 12 | cdroot() 13 | tokenizer = 
NgramConvTokenizerTF() 14 | save_dir = 'data/model/cws/ctb6_cws' 15 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, 16 | epsilon=1e-8, clipnorm=5) 17 | tokenizer.fit(CTB6_CWS_TRAIN, 18 | CTB6_CWS_DEV, 19 | save_dir, 20 | word_embed={'class_name': 'HanLP>Word2VecEmbedding', 21 | 'config': { 22 | 'trainable': True, 23 | 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR, 24 | 'expand_vocab': False, 25 | 'lowercase': False, 26 | }}, 27 | optimizer=optimizer, 28 | window_size=0, 29 | weight_norm=True) 30 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False) 31 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 32 | print(f'Model saved in {save_dir}') 33 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_cws_albert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:22 4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF 5 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | tokenizer = TransformerTokenizerTF() 10 | save_dir = 'data/model/large_corpus_cws_albert_base' 11 | tokenizer.fit('data/cws/large/all.txt', 12 | CTB6_CWS_DEV, save_dir, 13 | transformer='uer/albert-base-chinese-cluecorpussmall', 14 | max_seq_length=128, 15 | metrics='accuracy', learning_rate=5e-5, epochs=3) 16 | tokenizer.load(save_dir, metrics='f1') 17 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 18 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir) 19 | print(f'Model saved in {save_dir}') 20 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_cws_electra.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:22 4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF 5 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | tokenizer = TransformerTokenizerTF() 10 | save_dir = 'data/model/large_corpus_cws_electra_small' 11 | tokenizer.fit('data/cws/large/all.txt', 12 | CTB6_CWS_DEV, save_dir, 13 | transformer='hfl/chinese-electra-small-discriminator', 14 | max_seq_length=128, 15 | metrics='accuracy', learning_rate=5e-5, epochs=10) 16 | tokenizer.load(save_dir, metrics='f1') 17 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 18 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir) 19 | print(f'Model saved in {save_dir}') 20 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_albert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:22 4 | 5 | from hanlp.components.tokenizers.tok import TransformerTokenizer 6 | from hanlp.datasets.cws.ctb import CTB6_CWS_TEST 7 | from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_VALID, SIGHAN2005_MSR_TRAIN 8 | from tests import cdroot 9 | 10 | cdroot() 11 | tokenizer = TransformerTokenizer() 12 | save_dir = 'data/model/msr_cws_albert_base' 13 | tokenizer.fit(SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_VALID, save_dir, 14 | transformer='albert_base_zh', 15 | 
max_seq_length=150, 16 | metrics='f1', learning_rate=5e-5, epochs=10) 17 | tokenizer.load(save_dir) 18 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 19 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir) 20 | print(f'Model saved in {save_dir}') 21 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:39 4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF 5 | from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | tokenizer = TransformerTokenizerTF() 10 | save_dir = 'data/model/cws_bert_base_msra' 11 | tokenizer.fit(SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, save_dir, transformer='bert-base-chinese', 12 | metrics='f1') 13 | # tagger.load(save_dir) 14 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 15 | tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir) 16 | print(f'Model saved in {save_dir}') 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_ngram_conv.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:39 4 | import tensorflow as tf 5 | 6 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF 7 | from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST 8 | from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR 9 | from tests import cdroot 10 | 11 | cdroot() 12 | tokenizer = NgramConvTokenizerTF() 13 | save_dir = 'data/model/cws/convseg-msr-nocrf-noembed' 14 | tokenizer.fit(SIGHAN2005_MSR_TRAIN, 15 | SIGHAN2005_MSR_DEV, 16 | save_dir, 17 | word_embed={'class_name': 'HanLP>Word2VecEmbedding', 18 | 'config': { 19 | 'trainable': True, 20 | 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR, 21 | 'expand_vocab': False, 22 | 'lowercase': False, 23 | }}, 24 | optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, 25 | epsilon=1e-8, clipnorm=5), 26 | epochs=100, 27 | window_size=0, 28 | metrics='f1', 29 | weight_norm=True) 30 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 31 | tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir) 32 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_pku980106_conv_cws.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:39 4 | import tensorflow as tf 5 | 6 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF 7 | from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100 8 | from tests import cdroot 9 | 10 | cdroot() 11 | 12 | tokenizer = NgramConvTokenizerTF() 13 | save_dir = 'data/model/cws/pku98_6m_conv_ngram' 14 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, 15 | epsilon=1e-8, clipnorm=5) 16 | tokenizer.fit('data/cws/pku98/199801-06-seg.txt', 17 | 'data/cws/pku98/test_pku98_name_merged.txt', 18 | save_dir, 19 | word_embed={'class_name': 'HanLP>Word2VecEmbedding', 20 | 'config': { 21 | 'trainable': False, 22 | 'filepath': 
RADICAL_CHAR_EMBEDDING_100, 23 | 'expand_vocab': True, 24 | 'lowercase': False, 25 | }}, 26 | optimizer=optimizer, 27 | window_size=0, 28 | weight_norm=True) 29 | tokenizer.evaluate('data/cws/pku98/test_pku98_name_merged.txt', save_dir=save_dir, output=False) 30 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 31 | print(f'Model saved in {save_dir}') 32 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_pku980106_rnn_cws.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-21 15:39 4 | import tensorflow as tf 5 | 6 | from hanlp.components.tokenizers.tok_tf import RNNTokenizerTF 7 | from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100 8 | from tests import cdroot 9 | 10 | cdroot() 11 | 12 | tokenizer = RNNTokenizerTF() 13 | save_dir = 'data/model/cws/pku_6m_rnn_cws' 14 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, 15 | epsilon=1e-8, clipnorm=5) 16 | tokenizer.fit('data/cws/pku98/199801-06-seg.txt', 17 | 'data/cws/pku98/pku98_test.txt', 18 | save_dir, 19 | embeddings={'class_name': 'HanLP>Word2VecEmbedding', 20 | 'config': { 21 | 'trainable': False, 22 | 'filepath': RADICAL_CHAR_EMBEDDING_100, 23 | 'expand_vocab': True, 24 | 'lowercase': False, 25 | }} 26 | ) 27 | tokenizer.evaluate('data/cws/pku98/pku98_test.txt', save_dir=save_dir, output=False) 28 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务'])) 29 | print(f'Model saved in {save_dir}') 30 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/finetune_msra_ner_albert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 23:15 4 | import hanlp 5 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF 6 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST 7 | from tests import cdroot 8 | 9 | cdroot() 10 | recognizer = TransformerNamedEntityRecognizerTF() 11 | save_dir = 'data/model/ner/finetune_ner_albert_base_zh_msra' 12 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='albert_base_zh', 13 | finetune=hanlp.pretrained.ner.MSRA_NER_ALBERT_BASE_ZH) 14 | recognizer.load(save_dir) 15 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'))) 16 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir) 17 | print(f'Model saved in {save_dir}') 18 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_chnsenticorp_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-30 21:01 4 | from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF, TransformerTextTransform 5 | from hanlp.datasets.classification.sentiment import CHNSENTICORP_ERNIE_TRAIN, CHNSENTICORP_ERNIE_TEST, \ 6 | CHNSENTICORP_ERNIE_DEV 7 | from tests import cdroot 8 | 9 | cdroot() 10 | save_dir = 'data/model/classification/chnsenticorp_bert_base' 11 | classifier = TransformerClassifierTF(TransformerTextTransform(y_column=0)) 12 | classifier.fit(CHNSENTICORP_ERNIE_TRAIN, CHNSENTICORP_ERNIE_DEV, save_dir, 13 | 
transformer='bert-base-chinese') 14 | classifier.load(save_dir) 15 | print(classifier.predict('前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!')) 16 | classifier.evaluate(CHNSENTICORP_ERNIE_TEST, save_dir=save_dir) 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_conll03_ner_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-10-25 21:34 4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF 5 | from hanlp.datasets.ner.conll03 import CONLL03_EN_TRAIN, CONLL03_EN_DEV, CONLL03_EN_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | tagger = TransformerNamedEntityRecognizerTF() 10 | save_dir = 'data/model/ner/ner_conll03_bert_base_cased_en' 11 | tagger.fit(CONLL03_EN_TRAIN, CONLL03_EN_DEV, save_dir, transformer='bert-base-cased', 12 | metrics='accuracy') 13 | tagger.load(save_dir, metrics='f1') 14 | print(tagger.predict('West Indian all-rounder Phil Simmons eats apple .'.split())) 15 | tagger.evaluate(CONLL03_EN_TEST, save_dir=save_dir, output=False, batch_size=32) 16 | print(f'Model saved in {save_dir}') 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb5_dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 18:33 4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineDependencyParserTF 5 | from hanlp.datasets.parsing.ctb5 import CTB5_DEP_TRAIN, CTB5_DEP_DEV, CTB5_DEP_TEST 6 | from hanlp.pretrained.word2vec import CTB5_FASTTEXT_300_CN 7 | from tests import cdroot 8 | 9 | cdroot() 10 | save_dir = 'data/model/dep/biaffine_ctb' 11 | parser = BiaffineDependencyParserTF() 12 | parser.fit(CTB5_DEP_TRAIN, CTB5_DEP_DEV, save_dir, 13 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 14 | 'config': { 15 | 'trainable': False, 16 | 'embeddings_initializer': 'zero', 17 | 'filepath': CTB5_FASTTEXT_300_CN, 18 | 'expand_vocab': True, 19 | 'lowercase': True, 20 | 'normalize': True, 21 | }}, 22 | ) 23 | parser.load(save_dir) 24 | sentence = [('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'), ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'), ('逾', 'VV'), 25 | ('三十万', 'CD'), ('家', 'M')] 26 | print(parser.predict(sentence)) 27 | parser.evaluate(CTB5_DEP_TEST, save_dir) 28 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb5_pos_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 22:46 4 | from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF 5 | from hanlp.datasets.pos.ctb5 import CTB5_POS_TRAIN, CTB5_POS_DEV, CTB5_POS_TEST 6 | from hanlp.pretrained.fasttext import FASTTEXT_WIKI_300_ZH 7 | from tests import cdroot 8 | 9 | cdroot() 10 | tagger = RNNPartOfSpeechTaggerTF() 11 | save_dir = 'data/model/pos/ctb5_pos_rnn_fasttext' 12 | tagger.fit(CTB5_POS_TRAIN, CTB5_POS_DEV, save_dir, embeddings={'class_name': 'HanLP>FastTextEmbedding', 13 | 'config': {'filepath': FASTTEXT_WIKI_300_ZH}}, ) 14 | tagger.evaluate(CTB5_POS_TEST, save_dir=save_dir) 15 | print(f'Model saved in {save_dir}') 16 | -------------------------------------------------------------------------------- 
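The TF training scripts in this directory all share one lifecycle: fit on train/dev, load the best checkpoint, then predict/evaluate. A minimal sketch of reusing such a saved model afterwards; the save_dir below is an assumption mirroring train_ctb5_pos_rnn.py above:

# -*- coding:utf-8 -*-
# A hedged sketch, not a file from this repo.
from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF

tagger = RNNPartOfSpeechTaggerTF()
# Assumed path: whatever 'Model saved in ...' reported during training
tagger.load('data/model/pos/ctb5_pos_rnn_fasttext')
print(tagger.predict(['商品', '和', '服务']))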
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb7_dep.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 18:33 4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineDependencyParserTF 5 | from hanlp.datasets.parsing.ctb5 import CIP_W2V_100_CN 6 | from hanlp.datasets.parsing.ctb7 import CTB7_DEP_TRAIN, CTB7_DEP_DEV, CTB7_DEP_TEST 7 | from tests import cdroot 8 | 9 | cdroot() 10 | save_dir = 'data/model/dep/biaffine_ctb7' 11 | parser = BiaffineDependencyParserTF() 12 | parser.fit(CTB7_DEP_TRAIN, CTB7_DEP_DEV, save_dir, 13 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 14 | 'config': { 15 | 'trainable': False, 16 | 'embeddings_initializer': 'zero', 17 | 'filepath': CIP_W2V_100_CN, 18 | 'expand_vocab': True, 19 | 'lowercase': True, 20 | 'normalize': True, 21 | }}, 22 | ) 23 | parser.load(save_dir) 24 | sentence = [('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'), ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'), ('逾', 'VV'), 25 | ('三十万', 'CD'), ('家', 'M')] 26 | print(parser.predict(sentence)) 27 | parser.evaluate(CTB7_DEP_TEST, save_dir) 28 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_albert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 23:15 4 | from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF 5 | from tests import cdroot 6 | 7 | cdroot() 8 | tagger = TransformerTaggerTF() 9 | save_dir = 'data/model/pos/ctb9_albert_base' 10 | tagger.fit('data/pos/ctb9/train.tsv', 11 | 'data/pos/ctb9/test.tsv', 12 | save_dir, 13 | transformer='uer/albert-base-chinese-cluecorpussmall', 14 | max_seq_length=130, 15 | warmup_steps_ratio=0.1, 16 | epochs=20, 17 | learning_rate=5e-5) 18 | tagger.load(save_dir) 19 | print(tagger(['我', '的', '希望', '是', '希望', '和平'])) 20 | tagger.evaluate('data/pos/ctb9/test.tsv', save_dir=save_dir) 21 | print(f'Model saved in {save_dir}') 22 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_electra.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 23:15 4 | from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF 5 | from tests import cdroot 6 | 7 | cdroot() 8 | tagger = TransformerTaggerTF() 9 | save_dir = 'data/model/pos/ctb9_electra_small_zh_epoch_20' 10 | tagger.fit('data/pos/ctb9/train.tsv', 11 | 'data/pos/ctb9/test.tsv', 12 | save_dir, 13 | transformer='hfl/chinese-electra-small-discriminator', 14 | max_seq_length=130, 15 | warmup_steps_ratio=0.1, 16 | epochs=20, 17 | learning_rate=5e-5) 18 | tagger.load(save_dir) 19 | print(tagger(['我', '的', '希望', '是', '希望', '和平'])) 20 | tagger.evaluate('data/pos/ctb9/test.tsv', save_dir=save_dir) 21 | print(f'Model saved in {save_dir}') 22 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_albert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 23:15 4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF 5 | from hanlp.datasets.ner.msra 
import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | recognizer = TransformerNamedEntityRecognizerTF() 10 | save_dir = 'data/model/ner/msra_ner_albert_base' 11 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, 12 | transformer='uer/albert-base-chinese-cluecorpussmall', 13 | learning_rate=5e-5, 14 | metrics='accuracy') # Use accuracy to speed up training 15 | recognizer.load(save_dir, metrics='f1') 16 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'))) 17 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir) 18 | print(f'Model saved in {save_dir}') 19 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 23:15 4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF 5 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | recognizer = TransformerNamedEntityRecognizerTF() 10 | save_dir = 'data/model/ner/ner_bert_base_msra_1' 11 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='bert-base-chinese', 12 | metrics='accuracy') # accuracy is faster 13 | recognizer.load(save_dir, metrics='f1') 14 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'))) 15 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir) 16 | print(f'Model saved in {save_dir}') 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_electra.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 23:15 4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF 5 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST 6 | from tests import cdroot 7 | 8 | cdroot() 9 | recognizer = TransformerNamedEntityRecognizerTF() 10 | save_dir = 'data/model/ner/ner_electra_small_zh_msra_sparse_categorical_crossentropy' 11 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, 12 | transformer='hfl/chinese-electra-small-discriminator', 13 | learning_rate=5e-5, 14 | metrics='accuracy') # Use accuracy to speed up training 15 | recognizer.load(save_dir, metrics='f1') 16 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'))) 17 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir) 18 | print(f'Model saved in {save_dir}') 19 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 23:15 4 | from hanlp.components.ner.ner_tf import RNNNamedEntityRecognizerTF 5 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST 6 | from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100 7 | from tests import cdroot 8 | 9 | cdroot() 10 | 
recognizer = RNNNamedEntityRecognizerTF() 11 | save_dir = 'data/model/ner/msra_ner_rnn' 12 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, 13 | embeddings=RADICAL_CHAR_EMBEDDING_100, 14 | embedding_trainable=True, 15 | epochs=100) 16 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir) 17 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_albert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-03-07 23:48 4 | from hanlp.metrics.parsing import conllx_eval 5 | 6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING 7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF 8 | from tests import cdroot 9 | 10 | cdroot() 11 | save_dir = 'data/model/dep/ptb_albert3' 12 | parser = BiaffineTransformerDependencyParserTF() 13 | parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 14 | 'albert-xxlarge-v2', 15 | batch_size=256, 16 | warmup_steps_ratio=.1, 17 | token_mapping=PTB_TOKEN_MAPPING, 18 | samples_per_batch=150, 19 | transformer_dropout=.33, 20 | learning_rate=2e-3, 21 | learning_rate_transformer=1e-5, 22 | # early_stopping_patience=10, 23 | ) 24 | parser.load(save_dir) 25 | # output = f'{save_dir}/test.predict.conll' 26 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) 27 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) 28 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') 29 | print(f'Model saved in {save_dir}') 30 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-03-07 23:48 4 | from hanlp.metrics.parsing import conllx_eval 5 | 6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING 7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF 8 | from tests import cdroot 9 | 10 | cdroot() 11 | save_dir = 'data/model/dep/ptb_bert_1e-5' 12 | parser = BiaffineTransformerDependencyParserTF() 13 | # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', 14 | # batch_size=3000, 15 | # warmup_steps_ratio=.1, 16 | # token_mapping=PTB_TOKEN_MAPPING, 17 | # samples_per_batch=150, 18 | # transformer_dropout=.33, 19 | # learning_rate=2e-3, 20 | # learning_rate_transformer=1e-5, 21 | # # early_stopping_patience=10, 22 | # ) 23 | parser.load(save_dir, tree='tarjan') 24 | # output = f'{save_dir}/test.predict.conll' 25 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) 26 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) 27 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') 28 | print(f'Model saved in {save_dir}') 29 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_96.6.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-03-07 23:48 4 | 5 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING 6 | from 
hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF 7 | from tests import cdroot 8 | from hanlp.metrics.parsing import conllx_eval 9 | 10 | cdroot() 11 | save_dir = 'data/model/dep/ptb_bert_96.61' 12 | parser = BiaffineTransformerDependencyParserTF() 13 | # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', 14 | # batch_size=3000, 15 | # warmup_steps_ratio=.1, 16 | # token_mapping=PTB_TOKEN_MAPPING, 17 | # samples_per_batch=150, 18 | # ) 19 | parser.load(save_dir) 20 | output = f'{save_dir}/test.predict.conll' 21 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False, output=output) 22 | uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) 23 | print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') 24 | print(f'Model saved in {save_dir}') 25 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_positional.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-03-07 23:48 4 | from hanlp.metrics.parsing import conllx_eval 5 | 6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING 7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF 8 | from tests import cdroot 9 | 10 | cdroot() 11 | save_dir = 'data/model/dep/ptb_bert_positional_diff_lr' 12 | parser = BiaffineTransformerDependencyParserTF() 13 | parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', 14 | batch_size=3000, 15 | warmup_steps_ratio=.1, 16 | token_mapping=PTB_TOKEN_MAPPING, 17 | samples_per_batch=150, 18 | transformer_dropout=.33, 19 | learning_rate=1e-4, 20 | learning_rate_transformer=1e-5, 21 | d_positional=128, 22 | # early_stopping_patience=10, 23 | ) 24 | # parser.load(save_dir) 25 | # output = f'{save_dir}/test.predict.conll' 26 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) 27 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) 28 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') 29 | # print(f'Model saved in {save_dir}') 30 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_albert_topk.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-03-07 23:48 4 | from hanlp.metrics.parsing import conllx_eval 5 | 6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING 7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \ 8 | StructuralAttentionDependencyParserTF 9 | from hanlp.pretrained.glove import GLOVE_840B_300D 10 | from tests import cdroot 11 | 12 | cdroot() 13 | save_dir = 'data/model/dep/ptb_sa_topk' 14 | parser = StructuralAttentionDependencyParserTF() 15 | parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased', 16 | batch_size=3000, 17 | warmup_steps_ratio=.1, 18 | token_mapping=PTB_TOKEN_MAPPING, 19 | samples_per_batch=150, 20 | transformer_dropout=.33, 21 | masked_lm_dropout=.33, 22 | learning_rate=2e-3, 23 | learning_rate_transformer=1e-5, 24 | 25 | # alpha=1, 26 | # early_stopping_patience=10, 27 | # num_decoder_layers=2, 28 | ) 29 | parser.load(save_dir) 30 | # output = f'{save_dir}/test.predict.conll' 31 | 
parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False) 32 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output) 33 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}') 34 | print(f'Model saved in {save_dir}') 35 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_pos_rnn_fasttext.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-10-25 21:34 4 | 5 | import tensorflow as tf 6 | 7 | from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF 8 | from hanlp.pretrained.fasttext import FASTTEXT_CC_300_EN 9 | from tests import cdroot 10 | 11 | cdroot() 12 | tagger = RNNPartOfSpeechTaggerTF() 13 | save_dir = 'data/model/pos/ptb_pos_rnn_fasttext' 14 | optimizer = tf.keras.optimizers.SGD(lr=0.015) 15 | # optimizer = 'adam' 16 | tagger.fit('data/ptb-pos/train.tsv', 17 | 'data/ptb-pos/dev.tsv', 18 | batch_size=10, 19 | save_dir=save_dir, 20 | embeddings={'class_name': 'HanLP>FastTextEmbedding', 21 | 'config': {'filepath': FASTTEXT_CC_300_EN}}, 22 | optimizer=optimizer, 23 | lr_decay_per_epoch=0.05, 24 | rnn_units=100, 25 | rnn_input_dropout=0.5, 26 | rnn_output_dropout=0.5, 27 | epochs=100, 28 | verbose=True) 29 | tagger.load(save_dir) 30 | tagger.evaluate('data/ptb-pos/test.tsv', save_dir=save_dir, output=False) 31 | print(tagger.predict(['This', 'time', 'is', 'for', 'dinner'])) 32 | print(tagger.predict([['This', 'is', 'an', 'old', 'story'], 33 | ['Not', 'this', 'year', '.']])) 34 | print(f'Model saved in {save_dir}') 35 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_dm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 18:26 4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF 5 | from hanlp.pretrained.glove import GLOVE_6B_100D 6 | from tests import cdroot 7 | 8 | cdroot() 9 | save_dir = 'data/model/sdp/semeval15_biaffine_dm' 10 | parser = BiaffineSemanticDependencyParserTF() 11 | parser.fit('data/semeval15/en.dm.train.conll', 'data/semeval15/en.dm.dev.conll', save_dir, 12 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 13 | 'config': { 14 | 'trainable': False, 15 | 'embeddings_initializer': 'zero', 16 | 'filepath': GLOVE_6B_100D, 17 | 'expand_vocab': True, 18 | 'lowercase': True, 19 | 'normalize': True, 20 | }}, 21 | ) 22 | parser.load(save_dir) # disable variational dropout during evaluation so as to use CudaLSTM 23 | sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), 24 | ('music', 'NN'), ('?', '.')] 25 | print(parser.predict(sentence)) 26 | parser.evaluate('data/semeval15/en.id.dm.auto.conllu', save_dir) 27 | parser.evaluate('data/semeval15/en.ood.dm.auto.conllu', save_dir) 28 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_pas.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 18:26 4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF 5 | from hanlp.pretrained.glove import GLOVE_6B_100D 6 | from tests import cdroot 7 | 8 | cdroot() 9 | save_dir = 
'data/model/sdp/semeval15_biaffine_pas' 10 | parser = BiaffineSemanticDependencyParserTF() 11 | parser.fit('data/semeval15/en.pas.train.conll', 'data/semeval15/en.pas.dev.conll', save_dir, 12 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 13 | 'config': { 14 | 'trainable': False, 15 | 'embeddings_initializer': 'zero', 16 | 'filepath': GLOVE_6B_100D, 17 | 'expand_vocab': True, 18 | 'lowercase': True, 19 | 'normalize': True, 20 | }}, 21 | ) 22 | parser.load(save_dir) # disable variational dropout during evaluation so as to use CudaLSTM 23 | sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), 24 | ('music', 'NN'), ('?', '.')] 25 | print(parser.predict(sentence)) 26 | parser.evaluate('data/semeval15/en.id.pas.conll', save_dir) 27 | parser.evaluate('data/semeval15/en.ood.pas.conll', save_dir) 28 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_psd.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-01-01 18:26 4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF 5 | from hanlp.pretrained.glove import GLOVE_6B_100D 6 | from tests import cdroot 7 | 8 | cdroot() 9 | save_dir = 'data/model/sdp/semeval15_biaffine_psd' 10 | parser = BiaffineSemanticDependencyParserTF() 11 | parser.fit('data/semeval15/en.psd.train.conll', 'data/semeval15/en.psd.dev.conll', save_dir, 12 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding', 13 | 'config': { 14 | 'trainable': False, 15 | 'embeddings_initializer': 'zero', 16 | 'filepath': GLOVE_6B_100D, 17 | 'expand_vocab': True, 18 | 'lowercase': True, 19 | 'normalize': True, 20 | }}, 21 | ) 22 | parser.load(save_dir) # disable variational dropout during evaluation so as to use CudaLSTM 23 | sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'), 24 | ('music', 'NN'), ('?', '.')] 25 | print(parser.predict(sentence)) 26 | parser.evaluate('data/semeval15/en.id.psd.conll', save_dir) 27 | parser.evaluate('data/semeval15/en.ood.psd.conll', save_dir) 28 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/train/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-12-31 20:12 4 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/hanlp_demo/zh/train_sota_bert_pku.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-08-11 02:47 4 | from hanlp.common.dataset import SortingSamplerBuilder 5 | from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer 6 | from hanlp.datasets.tokenization.sighan2005.pku import SIGHAN2005_PKU_TRAIN_ALL, SIGHAN2005_PKU_TEST 7 | from tests import cdroot 8 | 9 | cdroot() 10 | tokenizer = TransformerTaggingTokenizer() 11 | save_dir = 'data/model/cws/sighan2005_pku_bert_base_96.7' 12 | tokenizer.fit( 13 | SIGHAN2005_PKU_TRAIN_ALL, 14 | SIGHAN2005_PKU_TEST, # Conventionally, no devset is used. See Tian et al. (2020). 
15 | save_dir, 16 | 'bert-base-chinese', 17 | max_seq_len=300, 18 | char_level=True, 19 | hard_constraint=True, 20 | sampler_builder=SortingSamplerBuilder(batch_size=32), 21 | epochs=3, 22 | adam_epsilon=1e-6, 23 | warmup_steps=0.1, 24 | weight_decay=0.01, 25 | word_dropout=0.1, 26 | seed=1660853059, 27 | ) 28 | tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir) 29 | print(f'Model saved in {save_dir}') 30 | -------------------------------------------------------------------------------- /plugins/hanlp_demo/setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 19:26 4 | from os.path import abspath, join, dirname 5 | from setuptools import find_packages, setup 6 | 7 | this_dir = abspath(dirname(__file__)) 8 | with open(join(this_dir, 'README.md'), encoding='utf-8') as file: 9 | long_description = file.read() 10 | 11 | setup( 12 | name='hanlp_demo', 13 | version='0.0.1', 14 | description='HanLP: Han Language Processing', 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url='https://github.com/hankcs/HanLP', 18 | author='hankcs', 19 | author_email='hankcshe@gmail.com', 20 | license='Apache License 2.0', 21 | classifiers=[ 22 | 'Intended Audience :: Science/Research', 23 | 'Intended Audience :: Developers', 24 | "Development Status :: 3 - Alpha", 25 | 'Operating System :: OS Independent', 26 | "License :: OSI Approved :: Apache Software License", 27 | 'Programming Language :: Python :: 3 :: Only', 28 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 29 | "Topic :: Text Processing :: Linguistic" 30 | ], 31 | keywords='corpus,machine-learning,NLU,NLP', 32 | packages=find_packages(exclude=['docs', 'tests*']), 33 | include_package_data=True, 34 | install_requires=[ 35 | 'hanlp_common' 36 | ], 37 | python_requires='>=3.6', 38 | ) 39 | -------------------------------------------------------------------------------- /plugins/hanlp_restful/README.md: -------------------------------------------------------------------------------- 1 | # RESTful API Client for HanLP 2 | 3 | [中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker) 4 | 5 | The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcomed on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch. 6 | 7 | 8 | ## Installation 9 | 10 | ```bash 11 | pip install hanlp-restful 12 | ``` 13 | 14 | ## License 15 | 16 | HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website.
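
## Usage

A minimal sketch of calling the service from Python. The endpoint below is the public demo API used in HanLP's documentation; `auth=None` falls back to an anonymous, rate-limited quota, so apply for an auth key before any serious workload:

```python
# -*- coding:utf-8 -*-
from hanlp_restful import HanLPClient

# Anonymous calls are rate-limited; pass your API key via `auth` in production.
HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh')

# Parse raw text into a Document; by default every task (tok, pos, ner, dep, ...) runs.
doc = HanLP.parse('HanLP为生产环境带来次世代最先进的多语种NLP技术。')
print(doc)
```

Pass `tasks='tok'` (or a list of task names) to `parse` to run only the tasks you need and save quota.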
17 | 18 | -------------------------------------------------------------------------------- /plugins/hanlp_restful/setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 19:26 4 | from os.path import abspath, join, dirname 5 | from setuptools import find_packages, setup 6 | 7 | this_dir = abspath(dirname(__file__)) 8 | with open(join(this_dir, 'README.md'), encoding='utf-8') as file: 9 | long_description = file.read() 10 | 11 | setup( 12 | name='hanlp_restful', 13 | version='0.0.23', 14 | description='HanLP: Han Language Processing', 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url='https://github.com/hankcs/HanLP', 18 | author='hankcs', 19 | author_email='hankcshe@gmail.com', 20 | license='Apache License 2.0', 21 | classifiers=[ 22 | 'Intended Audience :: Science/Research', 23 | 'Intended Audience :: Developers', 24 | "Development Status :: 3 - Alpha", 25 | 'Operating System :: OS Independent', 26 | "License :: OSI Approved :: Apache Software License", 27 | 'Programming Language :: Python :: 3 :: Only', 28 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 29 | "Topic :: Text Processing :: Linguistic" 30 | ], 31 | keywords='corpus,machine-learning,NLU,NLP', 32 | packages=find_packages(exclude=['docs', 'tests*']), 33 | include_package_data=True, 34 | install_requires=[ 35 | 'hanlp_common' 36 | ], 37 | python_requires='>=3.6', 38 | ) 39 | -------------------------------------------------------------------------------- /plugins/hanlp_restful/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-11-29 18:05 4 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_golang/README.md: -------------------------------------------------------------------------------- 1 | # gohanlp 2 | 3 | Golang RESTful Client for HanLP 4 | 5 | We have moved to https://github.com/hankcs/gohanlp -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/BaseInput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2020-12-27 12:07 AM 5 | * 6 | * 7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 9 | * 10 | */ 11 | package com.hankcs.hanlp.restful; 12 | 13 | /** 14 | * @author hankcs 15 | */ 16 | public class BaseInput 17 | { 18 | public String[] tasks; 19 | public String[] skip_tasks; 20 | public String language; 21 | 22 | public BaseInput(String[] tasks, String[] skipTasks, String language) 23 | { 24 | this.tasks = tasks; 25 | this.skip_tasks = skipTasks; 26 | this.language = language; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/CoreferenceResolutionOutput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2021-10-16 4:43 PM 5 | * 6 | * 7 | * Copyright (c) 2021, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 
9 | * 10 | */ 11 | package com.hankcs.hanlp.restful; 12 | 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | import java.util.Set; 16 | 17 | /** 18 | * A data class for coreference resolution 19 | * 20 | * @author hankcs 21 | */ 22 | public class CoreferenceResolutionOutput 23 | { 24 | public List<Set<String>> clusters; 25 | public ArrayList<String> tokens; 26 | 27 | public CoreferenceResolutionOutput(List<Set<String>> clusters, ArrayList<String> tokens) 28 | { 29 | this.clusters = clusters; 30 | this.tokens = tokens; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/DocumentInput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2020-12-27 12:09 AM 5 | * 6 | * 7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 9 | * 10 | */ 11 | package com.hankcs.hanlp.restful; 12 | 13 | /** 14 | * @author hankcs 15 | */ 16 | public class DocumentInput extends BaseInput 17 | { 18 | public String text; 19 | 20 | public DocumentInput(String text, String[] tasks, String[] skipTasks, String language) 21 | { 22 | super(tasks, skipTasks, language); 23 | this.text = text; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/SentenceInput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2020-12-27 12:09 AM 5 | * 6 | * 7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 9 | * 10 | */ 11 | package com.hankcs.hanlp.restful; 12 | 13 | /** 14 | * @author hankcs 15 | */ 16 | public class SentenceInput extends BaseInput 17 | { 18 | public String[] text; 19 | 20 | public SentenceInput(String[] text, String[] tasks, String[] skipTasks, String language) 21 | { 22 | super(tasks, skipTasks, language); 23 | this.text = text; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/TokenInput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2020-12-27 12:09 AM 5 | * 6 | * 7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 9 | * 10 | */ 11 | package com.hankcs.hanlp.restful; 12 | 13 | /** 14 | * @author hankcs 15 | */ 16 | public class TokenInput extends BaseInput 17 | { 18 | public String[][] tokens; 19 | 20 | public TokenInput(String[][] tokens, String[] tasks, String[] skipTasks, String language) 21 | { 22 | super(tasks, skipTasks, language); 23 | this.tokens = tokens; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Anchor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2022-04-13 8:58 AM 5 | * 6 | * 7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information.
9 | * 10 | */ 11 | package com.hankcs.hanlp.restful.mrp; 12 | 13 | /** 14 | * @author hankcs 15 | */ 16 | public class Anchor 17 | { 18 | public String from; 19 | public String to; 20 | } 21 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Edge.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2022-04-13 9:01 AM 5 | * 6 | * 7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 9 | * 10 | */ 11 | package com.hankcs.hanlp.restful.mrp; 12 | 13 | /** 14 | * @author hankcs 15 | */ 16 | public class Edge 17 | { 18 | public int source; 19 | public int target; 20 | public String label; 21 | } 22 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/MeaningRepresentation.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2022-04-13 8:57 AM 5 | * 6 | * 7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 9 | * 10 | */ 11 | package com.hankcs.hanlp.restful.mrp; 12 | 13 | /** 14 | * Graph-based meaning representation. 15 | * 16 | * @author hankcs 17 | */ 18 | public class MeaningRepresentation 19 | { 20 | public String id; 21 | public String input; 22 | public Node[] nodes; 23 | public Edge[] edges; 24 | public String[] tops; 25 | public String framework; 26 | } 27 | -------------------------------------------------------------------------------- /plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Node.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Han He 3 | * me@hankcs.com 4 | * 2022-04-13 8:57 AM 5 | * 6 | * 7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/ 8 | * See LICENSE file in the project root for full license information. 9 | * 10 | */ 11 | package com.hankcs.hanlp.restful.mrp; 12 | 13 | /** 14 | * @author hankcs 15 | */ 16 | public class Node 17 | { 18 | public int id; 19 | public String label; 20 | public String[] properties; 21 | public String[] values; 22 | public Anchor[] anchors; 23 | } 24 | -------------------------------------------------------------------------------- /plugins/hanlp_trie/README.md: -------------------------------------------------------------------------------- 1 | # Trie interface and implementation for HanLP 2 | 3 | [中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker) 4 | 5 | The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcomed on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. 
For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch. 6 | 7 | ## Installation 8 | 9 | ```bash 10 | pip install hanlp_trie 11 | ``` 12 | 13 | 14 | ## License 15 | 16 | HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website. 17 | 18 | -------------------------------------------------------------------------------- /plugins/hanlp_trie/hanlp_trie/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-11-29 17:48 4 | from .trie import Trie 5 | from .dictionary import DictInterface, TrieDict 6 | -------------------------------------------------------------------------------- /plugins/hanlp_trie/setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-12-28 19:26 4 | from os.path import abspath, join, dirname 5 | from setuptools import find_packages, setup 6 | 7 | this_dir = abspath(dirname(__file__)) 8 | with open(join(this_dir, 'README.md'), encoding='utf-8') as file: 9 | long_description = file.read() 10 | 11 | setup( 12 | name='hanlp_trie', 13 | version='0.0.5', 14 | description='HanLP: Han Language Processing', 15 | long_description=long_description, 16 | long_description_content_type="text/markdown", 17 | url='https://github.com/hankcs/HanLP', 18 | author='hankcs', 19 | author_email='hankcshe@gmail.com', 20 | license='Apache License 2.0', 21 | classifiers=[ 22 | 'Intended Audience :: Science/Research', 23 | 'Intended Audience :: Developers', 24 | "Development Status :: 3 - Alpha", 25 | 'Operating System :: OS Independent', 26 | "License :: OSI Approved :: Apache Software License", 27 | 'Programming Language :: Python :: 3 :: Only', 28 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 29 | "Topic :: Text Processing :: Linguistic" 30 | ], 31 | keywords='corpus,machine-learning,NLU,NLP', 32 | packages=find_packages(exclude=['docs', 'tests*']), 33 | include_package_data=True, 34 | install_requires=[ 35 | 'hanlp_common' 36 | ], 37 | python_requires='>=3.6', 38 | ) 39 | -------------------------------------------------------------------------------- /plugins/hanlp_trie/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2020-11-29 18:05 4 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2019-06-13 23:43 4 | import os 5 | 6 | root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) 7 | 8 | 9 | def cdroot(): 10 | """ 11 | cd to project root, so models are saved in the root folder 12 | """ 13 | os.chdir(root) 14 | -------------------------------------------------------------------------------- /tests/test_config_tracker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from hanlp.common.structure import ConfigTracker 4 | 5 | 6 | class MyClass(ConfigTracker): 7 | def __init__(self, i_need_this='yes') -> None: 8 | super().__init__(locals()) 9 | 10 | 11 | class TestConfigTracker(unittest.TestCase): 12 | def test_init(self): 13 | obj = MyClass() 14 |
self.assertEqual(obj.config.get('i_need_this', None), 'yes') 15 | 16 | 17 | if __name__ == '__main__': 18 | unittest.main() 19 | -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import hanlp 3 | 4 | 5 | class TestPipeLine(unittest.TestCase): 6 | def test_copy(self): 7 | pipe = hanlp.pipeline().append(hanlp.utils.rules.split_sentence) 8 | copied_pipe = pipe.copy() 9 | test_text = "今天天气真好。我要去散步。" 10 | assert pipe is not copied_pipe 11 | copied_pipe.append(lambda sent: "".join(sent)) 12 | assert pipe(test_text) != copied_pipe(test_text) 13 | 14 | if __name__ == '__main__': 15 | unittest.main() 16 | -------------------------------------------------------------------------------- /tests/test_rules.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-03-22 17:17 4 | import unittest 5 | 6 | from hanlp.utils.rules import split_sentence 7 | 8 | 9 | class TestRules(unittest.TestCase): 10 | def test_eos(self): 11 | self.assertListEqual(list(split_sentence('叶')), ['叶']) 12 | self.assertListEqual(list(split_sentence('他说:“加油。”谢谢')), ['他说:“加油。”', '谢谢']) 13 | self.assertListEqual(list(split_sentence('Go to hankcs.com. Yes.')), ['Go to hankcs.com.', 'Yes.']) 14 | 15 | 16 | if __name__ == '__main__': 17 | unittest.main() 18 | -------------------------------------------------------------------------------- /tests/test_string_util.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # Author: hankcs 3 | # Date: 2022-03-22 17:17 4 | import unittest 5 | 6 | from hanlp.utils.string_util import possible_tokenization 7 | 8 | 9 | class TestStringUtility(unittest.TestCase): 10 | def test_enumerate_tokenization(self): 11 | text = '商品和服务' 12 | toks = possible_tokenization(text) 13 | assert len(set(toks)) == 2 ** (len(text) - 1) 14 | for each in toks: 15 | assert ''.join(each) == text 16 | 17 | 18 | if __name__ == '__main__': 19 | unittest.main() 20 | --------------------------------------------------------------------------------