├── .github
├── ISSUE_TEMPLATE
│ ├── bug_report.md
│ ├── config.yml
│ └── feature_request.md
├── pull_request_template.md
└── workflows
│ └── unit-tests.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── README.md
├── docs
├── Makefile
├── annotations
│ ├── constituency
│ │ ├── ctb.md
│ │ ├── index.md
│ │ ├── npcmj.md
│ │ └── ptb.md
│ ├── dep
│ │ ├── index.md
│ │ ├── pmt.md
│ │ ├── sd_en.md
│ │ ├── sd_zh.md
│ │ └── ud.md
│ ├── index.md
│ ├── ner
│ │ ├── index.md
│ │ ├── msra.md
│ │ ├── ontonotes.md
│ │ └── pku.md
│ ├── pos
│ │ ├── 863.md
│ │ ├── ctb.md
│ │ ├── index.md
│ │ ├── npcmj.md
│ │ ├── pku.md
│ │ └── ud.md
│ ├── sdp
│ │ ├── dm.md
│ │ ├── index.md
│ │ ├── pas.md
│ │ ├── psd.md
│ │ └── semeval16.md
│ ├── srl
│ │ ├── cpb.md
│ │ ├── index.md
│ │ └── propbank.md
│ └── tok
│ │ ├── ctb.md
│ │ ├── index.md
│ │ └── msr.md
├── api
│ ├── common
│ │ ├── configurable.rst
│ │ ├── conll.rst
│ │ ├── constant.rst
│ │ ├── document.rst
│ │ └── index.md
│ ├── hanlp
│ │ ├── common
│ │ │ ├── component.rst
│ │ │ ├── dataset.md
│ │ │ ├── index.md
│ │ │ ├── structure.md
│ │ │ ├── torch_component.md
│ │ │ ├── transform.md
│ │ │ └── vocab.md
│ │ ├── components
│ │ │ ├── classifiers.md
│ │ │ ├── eos.md
│ │ │ ├── index.md
│ │ │ ├── lemmatizer.md
│ │ │ ├── mtl
│ │ │ │ ├── index.md
│ │ │ │ ├── mtl.md
│ │ │ │ └── tasks
│ │ │ │ │ ├── constituency.md
│ │ │ │ │ ├── dep.md
│ │ │ │ │ ├── index.md
│ │ │ │ │ ├── lem.md
│ │ │ │ │ ├── ner
│ │ │ │ │ │ ├── biaffine_ner.md
│ │ │ │ │ │ ├── index.md
│ │ │ │ │ │ └── tag_ner.md
│ │ │ │ │ ├── pos.md
│ │ │ │ │ ├── sdp.md
│ │ │ │ │ ├── srl
│ │ │ │ │ │ ├── bio_srl.md
│ │ │ │ │ │ ├── index.md
│ │ │ │ │ │ └── rank_srl.md
│ │ │ │ │ ├── task.md
│ │ │ │ │ ├── tok.md
│ │ │ │ │ └── ud.md
│ │ │ ├── ner
│ │ │ │ ├── biaffine_ner.md
│ │ │ │ ├── index.md
│ │ │ │ ├── rnn_ner.md
│ │ │ │ └── transformer_ner.md
│ │ │ ├── parsers
│ │ │ │ ├── biaffine_dep.md
│ │ │ │ ├── biaffine_sdp.md
│ │ │ │ ├── crf_constituency_parser.md
│ │ │ │ ├── index.md
│ │ │ │ └── ud_parser.md
│ │ │ ├── pipeline.md
│ │ │ ├── srl
│ │ │ │ ├── index.md
│ │ │ │ ├── span_bio.md
│ │ │ │ └── span_rank.md
│ │ │ ├── sts.md
│ │ │ ├── taggers
│ │ │ │ ├── index.md
│ │ │ │ ├── rnn_tagger.md
│ │ │ │ └── transformer_tagger.md
│ │ │ └── tokenizers
│ │ │ │ ├── index.md
│ │ │ │ ├── multi_criteria.md
│ │ │ │ └── transformer.md
│ │ ├── datasets
│ │ │ ├── constituency
│ │ │ │ ├── constituency_dataset.md
│ │ │ │ ├── index.md
│ │ │ │ └── resources.md
│ │ │ ├── dep
│ │ │ │ ├── conll_dataset.md
│ │ │ │ ├── index.md
│ │ │ │ └── resources.md
│ │ │ ├── eos
│ │ │ │ ├── eos.md
│ │ │ │ ├── index.md
│ │ │ │ └── resources.md
│ │ │ ├── index.md
│ │ │ ├── ner
│ │ │ │ ├── index.md
│ │ │ │ ├── json.md
│ │ │ │ ├── resources.md
│ │ │ │ └── tsv.md
│ │ │ ├── pos
│ │ │ │ ├── index.md
│ │ │ │ └── resources.md
│ │ │ ├── srl
│ │ │ │ ├── conll2012_dataset.md
│ │ │ │ ├── index.md
│ │ │ │ └── resources.md
│ │ │ └── tok
│ │ │ │ ├── index.md
│ │ │ │ ├── mcws_dataset.md
│ │ │ │ ├── resources.md
│ │ │ │ └── txt.md
│ │ ├── hanlp.rst
│ │ ├── index.md
│ │ ├── layers
│ │ │ ├── decoders
│ │ │ │ ├── biaffine_ner.md
│ │ │ │ ├── index.md
│ │ │ │ └── linear_crf.md
│ │ │ ├── embeddings
│ │ │ │ ├── char_cnn.md
│ │ │ │ ├── char_rnn.md
│ │ │ │ ├── embedding.md
│ │ │ │ ├── fasttext.md
│ │ │ │ ├── index.md
│ │ │ │ ├── transformer.md
│ │ │ │ └── word2vec.md
│ │ │ ├── index.md
│ │ │ └── transformers
│ │ │ │ ├── encoder.md
│ │ │ │ ├── index.md
│ │ │ │ └── tokenizer.md
│ │ ├── pretrained
│ │ │ ├── amr.md
│ │ │ ├── amr2text.md
│ │ │ ├── constituency.md
│ │ │ ├── dep.md
│ │ │ ├── eos.md
│ │ │ ├── fasttext.md
│ │ │ ├── glove.md
│ │ │ ├── index.md
│ │ │ ├── mlm.md
│ │ │ ├── mtl.md
│ │ │ ├── ner.md
│ │ │ ├── pos.md
│ │ │ ├── sdp.md
│ │ │ ├── srl.md
│ │ │ ├── sts.md
│ │ │ ├── tok.md
│ │ │ └── word2vec.md
│ │ └── utils
│ │ │ ├── index.md
│ │ │ └── io_util.md
│ ├── restful.rst
│ ├── restful_golang.md
│ ├── restful_java.md
│ └── trie
│ │ ├── dictionary.md
│ │ ├── index.md
│ │ └── trie.md
├── conf.py
├── configure.md
├── contributing.md
├── data_format.md
├── index.md
├── install.md
├── references.bib
├── references.rst
└── tutorial.md
├── hanlp
├── __init__.py
├── callbacks
│ ├── __init__.py
│ └── fine_csv_logger.py
├── common
│ ├── __init__.py
│ ├── component.py
│ ├── dataset.py
│ ├── keras_component.py
│ ├── structure.py
│ ├── torch_component.py
│ ├── transform.py
│ ├── transform_tf.py
│ ├── vocab.py
│ └── vocab_tf.py
├── components
│ ├── __init__.py
│ ├── amr
│ │ ├── __init__.py
│ │ ├── amrbart
│ │ │ ├── __init__.py
│ │ │ ├── bart_amr_generation.py
│ │ │ ├── bart_amr_parser.py
│ │ │ ├── common
│ │ │ │ ├── __init__.py
│ │ │ │ ├── constant.py
│ │ │ │ ├── penman_interface.py
│ │ │ │ └── postprocessing.py
│ │ │ ├── data_interface
│ │ │ │ ├── __init__.py
│ │ │ │ └── dataset.py
│ │ │ ├── model_interface
│ │ │ │ ├── __init__.py
│ │ │ │ ├── modeling_bart.py
│ │ │ │ └── tokenization_bart.py
│ │ │ └── preprocess
│ │ │ │ ├── __init__.py
│ │ │ │ ├── amr_io.py
│ │ │ │ ├── penman_interface.py
│ │ │ │ └── read_and_process.py
│ │ └── seq2seq
│ │ │ ├── __init__.py
│ │ │ ├── dataset
│ │ │ │ ├── IO.py
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── linearization.py
│ │ │ │ ├── penman.py
│ │ │ │ ├── postprocessing.py
│ │ │ │ ├── tokenization_bart.py
│ │ │ │ └── tokenization_t5.py
│ │ │ ├── evaluation.py
│ │ │ ├── optim.py
│ │ │ └── seq2seq_amr_parser.py
│ ├── classifiers
│ │ ├── __init__.py
│ │ ├── fasttext_classifier.py
│ │ ├── transformer_classifier.py
│ │ ├── transformer_classifier_hf.py
│ │ ├── transformer_classifier_tf.py
│ │ └── transformer_regression_hf.py
│ ├── distillation
│ │ ├── __init__.py
│ │ ├── distillable_component.py
│ │ ├── losses.py
│ │ └── schedulers.py
│ ├── eos
│ │ ├── __init__.py
│ │ └── ngram.py
│ ├── lambda_wrapper.py
│ ├── lemmatizer.py
│ ├── lm
│ │ ├── __init__.py
│ │ └── mlm.py
│ ├── mtl
│ │ ├── __init__.py
│ │ ├── multi_task_learning.py
│ │ └── tasks
│ │ │ ├── __init__.py
│ │ │ ├── amr.py
│ │ │ ├── constituency.py
│ │ │ ├── dep.py
│ │ │ ├── dep_2nd.py
│ │ │ ├── lem.py
│ │ │ ├── ner
│ │ │ │ ├── __init__.py
│ │ │ │ ├── biaffine_ner.py
│ │ │ │ └── tag_ner.py
│ │ │ ├── pos.py
│ │ │ ├── sdp.py
│ │ │ ├── srl
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bio_srl.py
│ │ │ │ └── rank_srl.py
│ │ │ ├── tok
│ │ │ │ ├── __init__.py
│ │ │ │ ├── reg_tok.py
│ │ │ │ └── tag_tok.py
│ │ │ └── ud.py
│ ├── ner
│ │ ├── __init__.py
│ │ ├── biaffine_ner
│ │ │ ├── __init__.py
│ │ │ ├── biaffine_ner.py
│ │ │ └── biaffine_ner_model.py
│ │ ├── ner_tf.py
│ │ ├── rnn_ner.py
│ │ └── transformer_ner.py
│ ├── parsers
│ │ ├── __init__.py
│ │ ├── alg.py
│ │ ├── alg_tf.py
│ │ ├── biaffine
│ │ │ ├── __init__.py
│ │ │ ├── biaffine.py
│ │ │ ├── biaffine_2nd_dep.py
│ │ │ ├── biaffine_dep.py
│ │ │ ├── biaffine_model.py
│ │ │ ├── biaffine_sdp.py
│ │ │ ├── mlp.py
│ │ │ ├── structual_attention.py
│ │ │ └── variationalbilstm.py
│ │ ├── biaffine_parser_tf.py
│ │ ├── biaffine_tf
│ │ │ ├── __init__.py
│ │ │ ├── alg.py
│ │ │ ├── layers.py
│ │ │ └── model.py
│ │ ├── chu_liu_edmonds.py
│ │ ├── conll.py
│ │ ├── constituency
│ │ │ ├── __init__.py
│ │ │ ├── crf_constituency_model.py
│ │ │ ├── crf_constituency_parser.py
│ │ │ └── treecrf.py
│ │ ├── parse_alg.py
│ │ └── ud
│ │ │ ├── __init__.py
│ │ │ ├── lemma_edit.py
│ │ │ ├── tag_decoder.py
│ │ │ ├── ud_model.py
│ │ │ ├── ud_parser.py
│ │ │ ├── udify_util.py
│ │ │ └── util.py
│ ├── pipeline.py
│ ├── rnn_language_model_tf.py
│ ├── srl
│ │ ├── __init__.py
│ │ ├── span_bio
│ │ │ ├── __init__.py
│ │ │ ├── baffine_tagging.py
│ │ │ └── span_bio.py
│ │ └── span_rank
│ │ │ ├── __init__.py
│ │ │ ├── highway_variational_lstm.py
│ │ │ ├── inference_utils.py
│ │ │ ├── layer.py
│ │ │ ├── span_rank.py
│ │ │ ├── span_ranking_srl_model.py
│ │ │ ├── srl_eval_utils.py
│ │ │ └── util.py
│ ├── sts
│ │ ├── __init__.py
│ │ └── transformer_sts.py
│ ├── taggers
│ │ ├── __init__.py
│ │ ├── cnn_tagger_tf.py
│ │ ├── ngram_conv
│ │ │ ├── __init__.py
│ │ │ └── ngram_conv_tagger.py
│ │ ├── pos_tf.py
│ │ ├── rnn
│ │ │ ├── __init__.py
│ │ │ └── rnntaggingmodel.py
│ │ ├── rnn_tagger.py
│ │ ├── rnn_tagger_tf.py
│ │ ├── tagger.py
│ │ ├── tagger_tf.py
│ │ ├── transformers
│ │ │ ├── __init__.py
│ │ │ ├── metrics_tf.py
│ │ │ ├── transformer_tagger.py
│ │ │ ├── transformer_tagger_tf.py
│ │ │ └── transformer_transform_tf.py
│ │ └── util.py
│ └── tokenizers
│ │ ├── __init__.py
│ │ ├── multi_criteria_cws_transformer.py
│ │ ├── tok.py
│ │ ├── tok_tf.py
│ │ └── transformer.py
├── datasets
│ ├── __init__.py
│ ├── classification
│ │ ├── __init__.py
│ │ └── sentiment.py
│ ├── coref
│ │ ├── __init__.py
│ │ └── loaders
│ │ │ ├── __init__.py
│ │ │ └── conll12coref.py
│ ├── eos
│ │ ├── __init__.py
│ │ ├── eos.py
│ │ └── loaders
│ │ │ ├── __init__.py
│ │ │ └── nn_eos.py
│ ├── lm
│ │ ├── __init__.py
│ │ └── loaders
│ │ │ ├── __init__.py
│ │ │ └── lm_dataset.py
│ ├── lu
│ │ ├── __init__.py
│ │ └── glue.py
│ ├── ner
│ │ ├── __init__.py
│ │ ├── conll03.py
│ │ ├── loaders
│ │ │ ├── __init__.py
│ │ │ ├── json_ner.py
│ │ │ └── tsv.py
│ │ ├── msra.py
│ │ ├── resume.py
│ │ └── weibo.py
│ ├── parsing
│ │ ├── __init__.py
│ │ ├── amr.py
│ │ ├── ctb5.py
│ │ ├── ctb7.py
│ │ ├── ctb8.py
│ │ ├── ctb9.py
│ │ ├── loaders
│ │ │ ├── __init__.py
│ │ │ ├── _ctb_utils.py
│ │ │ ├── conll_dataset.py
│ │ │ └── constituency_dataset.py
│ │ ├── pmt1.py
│ │ ├── ptb.py
│ │ ├── semeval15.py
│ │ ├── semeval16.py
│ │ └── ud
│ │ │ ├── __init__.py
│ │ │ ├── ud210.py
│ │ │ ├── ud210m.py
│ │ │ ├── ud23.py
│ │ │ ├── ud23m.py
│ │ │ ├── ud27.py
│ │ │ └── ud27m.py
│ ├── pos
│ │ ├── __init__.py
│ │ └── ctb5.py
│ ├── qa
│ │ ├── __init__.py
│ │ └── hotpotqa.py
│ ├── srl
│ │ ├── __init__.py
│ │ ├── loaders
│ │ │ ├── __init__.py
│ │ │ ├── conll2012.py
│ │ │ └── ontonotes_loader.py
│ │ └── ontonotes5
│ │ │ ├── __init__.py
│ │ │ ├── _utils.py
│ │ │ ├── chinese.py
│ │ │ └── english.py
│ ├── sts
│ │ ├── __init__.py
│ │ └── stsb.py
│ └── tokenization
│ │ ├── __init__.py
│ │ ├── ctb6.py
│ │ ├── loaders
│ │ │ ├── __init__.py
│ │ │ ├── chunking_dataset.py
│ │ │ ├── multi_criteria_cws
│ │ │ │ ├── __init__.py
│ │ │ │ └── mcws_dataset.py
│ │ │ └── txt.py
│ │ └── sighan2005
│ │ │ ├── __init__.py
│ │ │ ├── as_.py
│ │ │ ├── cityu.py
│ │ │ ├── msr.py
│ │ │ └── pku.py
├── layers
│ ├── __init__.py
│ ├── cnn_encoder.py
│ ├── crf
│ │ ├── __init__.py
│ │ ├── crf.py
│ │ ├── crf_layer_tf.py
│ │ └── crf_tf.py
│ ├── dropout.py
│ ├── embeddings
│ │ ├── __init__.py
│ │ ├── char_cnn.py
│ │ ├── char_cnn_tf.py
│ │ ├── char_rnn.py
│ │ ├── char_rnn_tf.py
│ │ ├── concat_embedding.py
│ │ ├── contextual_string_embedding.py
│ │ ├── contextual_string_embedding_tf.py
│ │ ├── contextual_word_embedding.py
│ │ ├── embedding.py
│ │ ├── fast_text.py
│ │ ├── fast_text_tf.py
│ │ ├── util.py
│ │ ├── util_tf.py
│ │ ├── word2vec.py
│ │ └── word2vec_tf.py
│ ├── feed_forward.py
│ ├── feedforward.py
│ ├── scalar_mix.py
│ ├── time_distributed.py
│ ├── transformers
│ │ ├── __init__.py
│ │ ├── encoder.py
│ │ ├── loader_tf.py
│ │ ├── pt_imports.py
│ │ ├── relative_transformer.py
│ │ ├── resource.py
│ │ ├── tf_imports.py
│ │ ├── utils.py
│ │ └── utils_tf.py
│ └── weight_normalization.py
├── losses
│ ├── __init__.py
│ └── sparse_categorical_crossentropy.py
├── metrics
│ ├── __init__.py
│ ├── accuracy.py
│ ├── amr
│ │ ├── __init__.py
│ │ └── smatch_eval.py
│ ├── chunking
│ │ ├── __init__.py
│ │ ├── binary_chunking_f1.py
│ │ ├── bmes_tf.py
│ │ ├── chunking_f1.py
│ │ ├── chunking_f1_tf.py
│ │ ├── conlleval.py
│ │ ├── iobes_tf.py
│ │ └── sequence_labeling.py
│ ├── f1.py
│ ├── metric.py
│ ├── mtl.py
│ ├── parsing
│ │ ├── __init__.py
│ │ ├── attachmentscore.py
│ │ ├── conllx_eval.py
│ │ ├── labeled_f1.py
│ │ ├── labeled_f1_tf.py
│ │ ├── labeled_score.py
│ │ ├── semdep_eval.py
│ │ └── span.py
│ ├── spearman_correlation.py
│ └── srl
│ │ ├── __init__.py
│ │ └── srlconll.py
├── optimizers
│ ├── __init__.py
│ └── adamw
│ │ ├── __init__.py
│ │ └── optimization.py
├── pretrained
│ ├── __init__.py
│ ├── amr.py
│ ├── amr2text.py
│ ├── classifiers.py
│ ├── constituency.py
│ ├── dep.py
│ ├── eos.py
│ ├── fasttext.py
│ ├── glove.py
│ ├── mtl.py
│ ├── ner.py
│ ├── pos.py
│ ├── rnnlm.py
│ ├── sdp.py
│ ├── srl.py
│ ├── sts.py
│ ├── tok.py
│ └── word2vec.py
├── transform
│ ├── __init__.py
│ ├── conll_tf.py
│ ├── glue_tf.py
│ ├── table_tf.py
│ ├── tacred_tf.py
│ ├── text_tf.py
│ ├── transformer_tokenizer.py
│ ├── tsv_tf.py
│ └── txt_tf.py
├── utils
│ ├── __init__.py
│ ├── component_util.py
│ ├── file_read_backwards
│ │ ├── __init__.py
│ │ ├── buffer_work_space.py
│ │ └── file_read_backwards.py
│ ├── init_util.py
│ ├── io_util.py
│ ├── lang
│ │ ├── __init__.py
│ │ ├── en
│ │ │ ├── __init__.py
│ │ │ └── english_tokenizer.py
│ │ ├── ja
│ │ │ ├── __init__.py
│ │ │ └── bert_tok.py
│ │ └── zh
│ │ │ ├── __init__.py
│ │ │ ├── char_table.py
│ │ │ └── localization.py
│ ├── log_util.py
│ ├── rules.py
│ ├── span_util.py
│ ├── string_util.py
│ ├── tf_util.py
│ ├── time_util.py
│ └── torch_util.py
└── version.py
├── plugins
├── README.md
├── hanlp_common
│ ├── README.md
│ ├── __init__.py
│ ├── hanlp_common
│ │ ├── __init__.py
│ │ ├── amr.py
│ │ ├── configurable.py
│ │ ├── conll.py
│ │ ├── constant.py
│ │ ├── document.py
│ │ ├── io.py
│ │ ├── reflection.py
│ │ ├── structure.py
│ │ ├── util.py
│ │ └── visualization.py
│ └── setup.py
├── hanlp_demo
│ ├── README.md
│ ├── hanlp_demo
│ │ ├── __init__.py
│ │ ├── block_windows.py
│ │ ├── en
│ │ │ ├── __init__.py
│ │ │ ├── demo_amr.py
│ │ │ ├── demo_dep.py
│ │ │ ├── demo_lm.py
│ │ │ ├── demo_ner.py
│ │ │ ├── demo_pipeline.py
│ │ │ ├── demo_pos.py
│ │ │ ├── demo_sdp.py
│ │ │ ├── demo_sentiment_analysis.py
│ │ │ ├── demo_tok.py
│ │ │ └── train_sst2_albert_base.py
│ │ ├── ja
│ │ │ ├── __init__.py
│ │ │ └── demo_mtl.py
│ │ ├── mul
│ │ │ ├── __init__.py
│ │ │ ├── demo_lid.py
│ │ │ ├── demo_lid_restful.py
│ │ │ ├── demo_mtl.py
│ │ │ └── train
│ │ │ │ ├── __init__.py
│ │ │ │ └── mul_base.py
│ │ ├── sent_split.py
│ │ └── zh
│ │ │ ├── __init__.py
│ │ │ ├── abstractive_summarization_restful.ipynb
│ │ │ ├── amr_restful.ipynb
│ │ │ ├── amr_stl.ipynb
│ │ │ ├── classification_restful.ipynb
│ │ │ ├── con_mtl.ipynb
│ │ │ ├── con_restful.ipynb
│ │ │ ├── con_stl.ipynb
│ │ │ ├── cor_restful.ipynb
│ │ │ ├── demo_amr.py
│ │ │ ├── demo_custom_dict.py
│ │ │ ├── demo_custom_dict_stl.py
│ │ │ ├── demo_del_tasks.py
│ │ │ ├── demo_document.py
│ │ │ ├── demo_mlm.py
│ │ │ ├── demo_mtl.py
│ │ │ ├── demo_ner_dict.py
│ │ │ ├── demo_parse_constituency.py
│ │ │ ├── demo_pipeline.py
│ │ │ ├── demo_pos_dict.py
│ │ │ ├── demo_sts.py
│ │ │ ├── demo_word2vec.py
│ │ │ ├── dep_mtl.ipynb
│ │ │ ├── dep_restful.ipynb
│ │ │ ├── dep_stl.ipynb
│ │ │ ├── extractive_summarization_restful.ipynb
│ │ │ ├── gec_restful.ipynb
│ │ │ ├── keyphrase_restful.ipynb
│ │ │ ├── lid_restful.ipynb
│ │ │ ├── lid_stl.ipynb
│ │ │ ├── ner_mtl.ipynb
│ │ │ ├── ner_restful.ipynb
│ │ │ ├── ner_stl.ipynb
│ │ │ ├── pos_mtl.ipynb
│ │ │ ├── pos_restful.ipynb
│ │ │ ├── pos_stl.ipynb
│ │ │ ├── sdp_mtl.ipynb
│ │ │ ├── sdp_restful.ipynb
│ │ │ ├── sdp_stl.ipynb
│ │ │ ├── sentiment_restful.ipynb
│ │ │ ├── srl_mtl.ipynb
│ │ │ ├── srl_restful.ipynb
│ │ │ ├── srl_stl.ipynb
│ │ │ ├── sts_restful.ipynb
│ │ │ ├── sts_stl.ipynb
│ │ │ ├── tf
│ │ │ │ ├── __init__.py
│ │ │ │ ├── demo_classifier.py
│ │ │ │ ├── demo_client.py
│ │ │ │ ├── demo_cws.py
│ │ │ │ ├── demo_cws_trie.py
│ │ │ │ ├── demo_dep.py
│ │ │ │ ├── demo_fasttext.py
│ │ │ │ ├── demo_multiprocess.py
│ │ │ │ ├── demo_ner.py
│ │ │ │ ├── demo_pipeline.py
│ │ │ │ ├── demo_pos.py
│ │ │ │ ├── demo_sdp.py
│ │ │ │ ├── demo_serving.py
│ │ │ │ └── train
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cws
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── train_ctb6_cws_albert.py
│ │ │ │ │ │ ├── train_ctb6_cws_bert.py
│ │ │ │ │ │ ├── train_ctb6_cws_convseg.py
│ │ │ │ │ │ ├── train_large_bert_cws.py
│ │ │ │ │ │ ├── train_large_conv_cws.py
│ │ │ │ │ │ ├── train_large_cws_albert.py
│ │ │ │ │ │ ├── train_large_cws_electra.py
│ │ │ │ │ │ ├── train_large_rnn_cws.py
│ │ │ │ │ │ ├── train_msr_cws_albert.py
│ │ │ │ │ │ ├── train_msr_cws_bert.py
│ │ │ │ │ │ ├── train_msr_cws_ngram_conv.py
│ │ │ │ │ │ ├── train_msr_cws_ngram_conv_embed.py
│ │ │ │ │ │ ├── train_pku980106_conv_cws.py
│ │ │ │ │ │ ├── train_pku980106_rnn_cws.py
│ │ │ │ │ │ └── train_pku_conv_cws.py
│ │ │ │ │ ├── finetune_msra_ner_albert.py
│ │ │ │ │ ├── train_chnsenticorp_bert.py
│ │ │ │ │ ├── train_conll03_ner_bert.py
│ │ │ │ │ ├── train_conll03_ner_flair.py
│ │ │ │ │ ├── train_ctb5_dep.py
│ │ │ │ │ ├── train_ctb5_pos_rnn.py
│ │ │ │ │ ├── train_ctb7_dep.py
│ │ │ │ │ ├── train_ctb9_pos_albert.py
│ │ │ │ │ ├── train_ctb9_pos_electra.py
│ │ │ │ │ ├── train_msra_ner_albert.py
│ │ │ │ │ ├── train_msra_ner_bert.py
│ │ │ │ │ ├── train_msra_ner_electra.py
│ │ │ │ │ ├── train_msra_ner_ngram_conv.py
│ │ │ │ │ ├── train_msra_ner_rnn.py
│ │ │ │ │ ├── train_ptb_dep_biaffine_albert.py
│ │ │ │ │ ├── train_ptb_dep_biaffine_bert.py
│ │ │ │ │ ├── train_ptb_dep_biaffine_bert_96.6.py
│ │ │ │ │ ├── train_ptb_dep_biaffine_bert_positional.py
│ │ │ │ │ ├── train_ptb_dep_sa_albert.py
│ │ │ │ │ ├── train_ptb_dep_sa_albert_topk.py
│ │ │ │ │ ├── train_ptb_dep_sa_bert.py
│ │ │ │ │ ├── train_ptb_dep_sa_pos_bert.py
│ │ │ │ │ ├── train_ptb_pos_rnn_fasttext.py
│ │ │ │ │ ├── train_semeval15_dm.py
│ │ │ │ │ ├── train_semeval15_pas.py
│ │ │ │ │ ├── train_semeval15_psd.py
│ │ │ │ │ ├── train_semeval16_news.py
│ │ │ │ │ └── train_semeval16_text.py
│ │ │ ├── tok_mtl.ipynb
│ │ │ ├── tok_restful.ipynb
│ │ │ ├── tok_stl.ipynb
│ │ │ ├── train
│ │ │ │ ├── __init__.py
│ │ │ │ ├── finetune_ner.py
│ │ │ │ ├── open_base.py
│ │ │ │ └── open_small.py
│ │ │ ├── train_sota_bert_pku.py
│ │ │ ├── tst_restful.ipynb
│ │ │ └── tutorial.ipynb
│ └── setup.py
├── hanlp_restful
│ ├── README.md
│ ├── hanlp_restful
│ │ └── __init__.py
│ ├── setup.py
│ └── tests
│ │ ├── __init__.py
│ │ └── test_client.py
├── hanlp_restful_golang
│ └── README.md
├── hanlp_restful_java
│ ├── pom.xml
│ └── src
│ │ ├── main
│ │ │ └── java
│ │ │ │ └── com
│ │ │ │ │ └── hankcs
│ │ │ │ │ │ └── hanlp
│ │ │ │ │ │ │ └── restful
│ │ │ │ │ │ │ │ ├── BaseInput.java
│ │ │ │ │ │ │ │ ├── CoreferenceResolutionOutput.java
│ │ │ │ │ │ │ │ ├── DocumentInput.java
│ │ │ │ │ │ │ │ ├── HanLPClient.java
│ │ │ │ │ │ │ │ ├── SentenceInput.java
│ │ │ │ │ │ │ │ ├── Span.java
│ │ │ │ │ │ │ │ ├── TokenInput.java
│ │ │ │ │ │ │ │ └── mrp
│ │ │ │ │ │ │ │ │ ├── Anchor.java
│ │ │ │ │ │ │ │ │ ├── Edge.java
│ │ │ │ │ │ │ │ │ ├── MeaningRepresentation.java
│ │ │ │ │ │ │ │ │ └── Node.java
│ │ └── test
│ │ │ └── java
│ │ │ │ └── com
│ │ │ │ │ └── hankcs
│ │ │ │ │ │ └── hanlp
│ │ │ │ │ │ │ └── restful
│ │ │ │ │ │ │ │ ├── HanLPClientTest.java
│ │ │ │ │ │ │ │ └── MeaningRepresentationTest.java
└── hanlp_trie
│ ├── README.md
│ ├── hanlp_trie
│ │ ├── __init__.py
│ │ ├── dictionary.py
│ │ └── trie.py
│ ├── setup.py
│ └── tests
│ │ ├── __init__.py
│ │ ├── test_trie.py
│ │ └── test_trie_dict.py
├── setup.py
└── tests
├── __init__.py
├── test_config_tracker.py
├── test_mtl.py
├── test_pipeline.py
├── test_rules.py
└── test_string_util.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐛Found a bug
3 | about: Please submit the version number, the code that triggers the bug, and the error log
4 | title: ''
5 | labels: bug
6 | assignees: hankcs
7 |
8 | ---
9 |
10 |
13 |
14 | **Describe the bug**
15 | A clear and concise description of what the bug is.
16 |
17 | **Code to reproduce the issue**
18 | Provide a reproducible test case that is the bare minimum necessary to generate the problem.
19 |
20 | ```python
21 | ```
22 |
23 | **Describe the current behavior**
24 | A clear and concise description of what happened.
25 |
26 | **Expected behavior**
27 | A clear and concise description of what you expected to happen.
28 |
29 | **System information**
30 | - OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
31 | - Python version:
32 | - HanLP version:
33 |
34 | **Other info / logs**
35 | Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
36 |
37 | * [ ] I've completed this form and searched the web for solutions.
38 |
39 |
40 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: ⁉️ For questions and help, please use the forum
4 | url: https://bbs.hankcs.com/
5 | about: You are welcome to seek help on the Butterfly Effect forum
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🚀Feature request
3 | about: Suggest a new feature
4 | title: ''
5 | labels: feature request
6 | assignees: hankcs
7 |
8 | ---
9 |
10 |
17 |
18 | **Describe the feature and the current behavior/state.**
19 |
20 | **Will this change the current api? How?**
21 |
22 | **Who will benefit from this feature?**
23 |
24 | **Are you willing to contribute it (Yes/No):**
25 |
26 | **System information**
27 | - OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
28 | - Python version:
29 | - HanLP version:
30 |
31 | **Any other info**
32 |
33 | * [ ] I've carefully completed this form.
34 |
35 |
36 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/annotations/constituency/index.md:
--------------------------------------------------------------------------------
1 | # Constituency Parsing
2 |
3 | ## Chinese
4 | ```{toctree}
5 | ctb
6 | ```
7 |
8 | ## English
9 | ```{toctree}
10 | ptb
11 | ```
12 |
13 | ## Japanese
14 | ```{toctree}
15 | npcmj
16 | ```
17 |
18 |
--------------------------------------------------------------------------------
/docs/annotations/dep/index.md:
--------------------------------------------------------------------------------
1 | # Dependency Parsing
2 |
3 | ## Chinese
4 |
5 | ```{toctree}
6 | sd_zh
7 | pmt
8 | ```
9 |
10 | ## English
11 |
12 | ```{toctree}
13 | sd_en
14 | ```
15 |
16 | ## Multilingual
17 |
18 | ```{toctree}
19 | ud
20 | ```
21 |
--------------------------------------------------------------------------------
/docs/annotations/index.md:
--------------------------------------------------------------------------------
1 | # Annotations
2 |
3 |
4 | ```{toctree}
5 | tok/index
6 | pos/index
7 | ner/index
8 | dep/index
9 | sdp/index
10 | srl/index
11 | constituency/index
12 | ```
13 |
14 |
--------------------------------------------------------------------------------
/docs/annotations/ner/index.md:
--------------------------------------------------------------------------------
1 | # Named Entity Recognition
2 |
3 | ## Chinese
4 |
5 | ```{toctree}
6 | pku
7 | msra
8 | ```
9 |
10 | ## Multilingual
11 |
12 | ```{toctree}
13 | ontonotes
14 | ```
15 |
--------------------------------------------------------------------------------
/docs/annotations/pos/index.md:
--------------------------------------------------------------------------------
1 | # Part-of-Speech Tagging
2 |
3 | ## Chinese
4 | ```{toctree}
5 | ctb
6 | pku
7 | 863
8 | ```
9 |
10 | ## Japanese
11 | ```{toctree}
12 | npcmj
13 | ```
14 |
15 | ## Multilingual
16 |
17 | ```{toctree}
18 | ud
19 | ```
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docs/annotations/sdp/dm.md:
--------------------------------------------------------------------------------
1 | # DM: The Reduction of Minimal Recursion Semantics
2 |
3 | Please refer to [Minimal Recursion Semantics An Introduction](https://www.cl.cam.ac.uk/~aac10/papers/mrs.pdf).
4 |
--------------------------------------------------------------------------------
/docs/annotations/sdp/index.md:
--------------------------------------------------------------------------------
1 | # Semantic Dependency Parsing
2 |
3 | ## Chinese
4 |
5 | ```{toctree}
6 | semeval16
7 | ```
8 |
9 | ## English
10 |
11 | ```{toctree}
12 | dm
13 | pas
14 | psd
15 | ```
16 |
17 |
--------------------------------------------------------------------------------
/docs/annotations/sdp/pas.md:
--------------------------------------------------------------------------------
1 | # Predicate-Argument Structures
2 |
3 | Please refer to [Probabilistic disambiguation models for wide-coverage HPSG parsing](https://www.aclweb.org/anthology/P05-1011.pdf).
4 |
--------------------------------------------------------------------------------
/docs/annotations/sdp/psd.md:
--------------------------------------------------------------------------------
1 | # Prague Czech-English Dependency Treebank
2 |
3 | Please refer to [Prague Czech-English Dependency Treebank](http://ufal.mff.cuni.cz/pcedt2.0/en/index.html).
4 |
--------------------------------------------------------------------------------
/docs/annotations/srl/index.md:
--------------------------------------------------------------------------------
1 | # Semantic Role Labeling
2 |
3 | ## Chinese
4 | ```{toctree}
5 | cpb
6 | ```
7 |
8 | ## English
9 | ```{toctree}
10 | propbank
11 | ```
12 |
13 |
--------------------------------------------------------------------------------
/docs/annotations/tok/index.md:
--------------------------------------------------------------------------------
1 | # Tokenization
2 |
3 | ## Chinese
4 | ```{toctree}
5 | ctb
6 | msr
7 | ```
--------------------------------------------------------------------------------
/docs/api/common/configurable.rst:
--------------------------------------------------------------------------------
1 | .. _api/configurable:
2 |
3 | configurable
4 | ====================
5 |
6 |
7 | .. autoclass:: hanlp_common.configurable.Configurable
8 | :members:
9 |
10 | .. autoclass:: hanlp_common.configurable.AutoConfigurable
11 | :members:
12 |
--------------------------------------------------------------------------------
/docs/api/common/conll.rst:
--------------------------------------------------------------------------------
1 | .. _api/conll:
2 |
3 | conll
4 | ====================
5 |
6 |
7 | .. autoclass:: hanlp_common.conll.CoNLLWord
8 | :members:
9 |
10 | .. autoclass:: hanlp_common.conll.CoNLLUWord
11 | :members:
12 |
13 | .. autoclass:: hanlp_common.conll.CoNLLSentence
14 | :members:
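15 |
16 | A quick sketch of reading a CoNLL-X block; the ``from_str`` constructor and the ``CoNLLWord`` attributes are assumed from the members documented above:
17 |
18 | .. code-block:: python
19 |
20 |     from hanlp_common.conll import CoNLLSentence
21 |
22 |     sent = CoNLLSentence.from_str('''
23 |     1	HanLP	_	NNP	NNP	_	2	nsubj	_	_
24 |     2	works	_	VBZ	VBZ	_	0	root	_	_
25 |     ''')
26 |     for word in sent:  # a CoNLLSentence iterates over CoNLLWord objects
27 |         print(word.id, word.form, word.head, word.deprel)
28 |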
--------------------------------------------------------------------------------
/docs/api/common/constant.rst:
--------------------------------------------------------------------------------
1 | constant
2 | ====================
3 |
4 |
5 | .. automodule:: hanlp_common.constant
6 | :members:
7 |
--------------------------------------------------------------------------------
/docs/api/common/document.rst:
--------------------------------------------------------------------------------
1 | .. _api/document:
2 |
3 | document
4 | ====================
5 |
6 | .. currentmodule:: hanlp_common
7 |
8 | .. autoclass:: hanlp_common.document.Document
9 | :members:
10 |
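11 | A minimal usage sketch: ``Document`` subclasses ``dict``, so annotations are plain key-value pairs keyed by task name; ``pretty_print`` is assumed from the members documented above:
12 |
13 | .. code-block:: python
14 |
15 |     from hanlp_common.document import Document
16 |
17 |     # Keys are task names; values are per-sentence annotations.
18 |     doc = Document(tok=[['商品', '和', '服务']], pos=[['NN', 'CC', 'NN']])
19 |     print(doc['tok'])   # plain dict access
20 |     doc.pretty_print()  # render the annotations side by side in the terminal
21 |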
--------------------------------------------------------------------------------
/docs/api/common/index.md:
--------------------------------------------------------------------------------
1 | # hanlp_common
2 |
3 | Common APIs shared between `hanlp` and `restful`.
4 |
5 | ```{toctree}
6 | document
7 | conll
8 | configurable
9 | constant
10 | ```
11 |
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/common/component.rst:
--------------------------------------------------------------------------------
1 | component
2 | =================
3 |
4 | .. currentmodule:: hanlp.common
5 |
6 | .. autoclass:: hanlp.common.component.Component
7 | :members:
8 |
--------------------------------------------------------------------------------
/docs/api/hanlp/common/index.md:
--------------------------------------------------------------------------------
1 | # common
2 |
3 | Common base classes.
4 |
5 | ```{toctree}
6 | structure
7 | vocab
8 | transform
9 | dataset
10 | component
11 | torch_component
12 | ```
13 |
14 |
--------------------------------------------------------------------------------
/docs/api/hanlp/common/structure.md:
--------------------------------------------------------------------------------
1 | # structure
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.common
5 |
6 | .. autoclass:: hanlp.common.structure.ConfigTracker
7 | :members:
8 |
9 | .. autoclass:: hanlp.common.structure.History
10 | :members:
11 |
12 | ```
13 |
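14 | A subclass that passes `locals()` to `ConfigTracker` gets its constructor arguments recorded in `config`, which is how components persist their hyperparameters. A sketch mirroring the class docstring:
15 |
16 | ```python
17 | from hanlp.common.structure import ConfigTracker
18 |
19 | class MyClass(ConfigTracker):
20 |     def __init__(self, i_need_this='yes') -> None:
21 |         super().__init__(locals())  # records every constructor argument
22 |
23 | print(MyClass().config)  # {'i_need_this': 'yes'}
24 | ```
25 |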
--------------------------------------------------------------------------------
/docs/api/hanlp/common/torch_component.md:
--------------------------------------------------------------------------------
1 | # torch_component
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.common.torch_component
5 |
6 | .. autoclass:: hanlp.common.torch_component.TorchComponent
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/common/transform.md:
--------------------------------------------------------------------------------
1 | # transform
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.common
5 |
6 | .. autoclass:: hanlp.common.transform.VocabDict
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/common/vocab.md:
--------------------------------------------------------------------------------
1 | # vocab
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.common
5 |
6 | .. autoclass:: hanlp.common.transform.Vocab
7 | :members:
8 | :special-members:
9 | :exclude-members: __init__, __repr__, __call__, __str__
10 |
11 | ```
12 |
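13 | A minimal sketch of building a vocabulary; the method names `add`, `get_idx` and `lock` are assumed from the members documented above:
14 |
15 | ```python
16 | from hanlp.common.vocab import Vocab
17 |
18 | vocab = Vocab()   # mutable by default, with <pad> and <unk> reserved
19 | vocab.add('商品')
20 | vocab.add('服务')
21 | vocab.lock()      # freeze before prediction
22 | print(vocab.get_idx('商品'))      # a stable integer id
23 | print(vocab.get_idx('未登录词'))  # unseen tokens fall back to <unk> once locked
24 | ```
25 |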
--------------------------------------------------------------------------------
/docs/api/hanlp/components/classifiers.md:
--------------------------------------------------------------------------------
1 | # classifiers
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.components.classifiers
5 |
6 | .. autoclass:: hanlp.components.classifiers.transformer_classifier.TransformerClassifier
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/eos.md:
--------------------------------------------------------------------------------
1 | # eos
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.components.eos
5 |
6 | .. autoclass:: hanlp.components.eos.ngram.NgramSentenceBoundaryDetector
7 | :members:
8 |
9 | ```
10 |
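11 | A usage sketch; the identifier below comes from `hanlp.pretrained.eos` and may differ across versions:
12 |
13 | ```python
14 | import hanlp
15 |
16 | eos = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
17 | print(eos('首先,我要感谢大家。谢谢!Thank you. Merci beaucoup!'))
18 | # -> a list of sentences split at the detected boundaries
19 | ```
20 |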
--------------------------------------------------------------------------------
/docs/api/hanlp/components/index.md:
--------------------------------------------------------------------------------
1 | # components
2 |
3 | NLP components.
4 |
5 | ```{toctree}
6 | mtl/index
7 | classifiers
8 | eos
9 | tokenizers/index
10 | lemmatizer
11 | taggers/index
12 | ner/index
13 | parsers/index
14 | srl/index
15 | pipeline
16 | sts
17 | ```
18 |
19 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/lemmatizer.md:
--------------------------------------------------------------------------------
1 | # lemmatizer
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.components.lemmatizer
5 |
6 | .. autoclass:: TransformerLemmatizer
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/index.md:
--------------------------------------------------------------------------------
1 | # mtl
2 |
3 | Multi-Task Learning (MTL) framework.
4 |
5 | ```{toctree}
6 | mtl
7 | tasks/index
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/mtl.md:
--------------------------------------------------------------------------------
1 | # MultiTaskLearning
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.components.mtl
5 |
6 | .. autoclass:: hanlp.components.mtl.multi_task_learning.MultiTaskLearning
7 | :members:
8 | :special-members:
9 | :exclude-members: __init__, __repr__
10 |
11 | ```
12 |
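13 | A usage sketch: load a pretrained multi-task model and call it like a function. The identifier below comes from `hanlp.pretrained.mtl` and may differ across versions:
14 |
15 | ```python
16 | import hanlp
17 |
18 | HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
19 | doc = HanLP('晓美焰来到北京立方庭参观自然语义科技公司。')  # pass tasks='tok' to run a subset
20 | doc.pretty_print()  # the returned Document holds one key per executed task
21 | ```
22 |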
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/constituency.md:
--------------------------------------------------------------------------------
1 | # con
2 |
3 | Constituency parsing.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.constituency.CRFConstituencyParsing
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/dep.md:
--------------------------------------------------------------------------------
1 | # dep
2 |
3 | Dependency parsing.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.dep.BiaffineDependencyParsing
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/index.md:
--------------------------------------------------------------------------------
1 | # tasks
2 |
3 | Multi-Task Learning (MTL) tasks.
4 |
5 | ```{toctree}
6 | task
7 | constituency
8 | dep
9 | sdp
10 | ud
11 | lem
12 | pos
13 | tok
14 | ner/index
15 | srl/index
16 | ```
17 |
18 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/lem.md:
--------------------------------------------------------------------------------
1 | # lem
2 |
3 | Lemmatization.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.lem.TransformerLemmatization
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/ner/biaffine_ner.md:
--------------------------------------------------------------------------------
1 | # biaffine_ner
2 |
3 | Biaffine Named Entity Recognition.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.ner.biaffine_ner.BiaffineNamedEntityRecognition
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/ner/index.md:
--------------------------------------------------------------------------------
1 | # ner
2 |
3 | Named Entity Recognition.
4 |
5 | ```{toctree}
6 | tag_ner
7 | biaffine_ner
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.md:
--------------------------------------------------------------------------------
1 | # tag_ner
2 |
3 | Tagging based Named Entity Recognition.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.ner.tag_ner.TaggingNamedEntityRecognition
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/pos.md:
--------------------------------------------------------------------------------
1 | # pos
2 |
3 | Part-of-speech tagging.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.pos.TransformerTagging
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/sdp.md:
--------------------------------------------------------------------------------
1 | # sdp
2 |
3 | Semantic Dependency Parsing.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.sdp.BiaffineSemanticDependencyParsing
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/srl/bio_srl.md:
--------------------------------------------------------------------------------
1 | # bio_srl
2 |
3 | BIO Tagging based Semantic Role Labeling.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.srl.bio_srl.SpanBIOSemanticRoleLabeling
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/srl/index.md:
--------------------------------------------------------------------------------
1 | # srl
2 |
3 | Semantic Role Labeling.
4 |
5 | ```{toctree}
6 | bio_srl
7 | rank_srl
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/srl/rank_srl.md:
--------------------------------------------------------------------------------
1 | # rank_srl
2 |
3 | Span Ranking Semantic Role Labeling.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.srl.rank_srl.SpanRankingSemanticRoleLabeling
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/task.md:
--------------------------------------------------------------------------------
1 | # Task
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.components.mtl
5 |
6 | .. autoclass:: hanlp.components.mtl.tasks.Task
7 | :members:
8 | :exclude-members: execute_training_loop, fit_dataloader
9 |
10 | ```
11 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/tok.md:
--------------------------------------------------------------------------------
1 | # tok
2 |
3 | Tokenization.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.tok.tag_tok.TaggingTokenization
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/mtl/tasks/ud.md:
--------------------------------------------------------------------------------
1 | # ud
2 |
3 | Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing).
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.mtl
7 |
8 | .. autoclass:: hanlp.components.mtl.tasks.ud.UniversalDependenciesParsing
9 | :members:
10 | :exclude-members: execute_training_loop, fit_dataloader
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/ner/biaffine_ner.md:
--------------------------------------------------------------------------------
1 | # biaffine_ner
2 |
3 | Biaffine Named Entity Recognition.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.ner.biaffine_ner.biaffine_ner
7 |
8 | .. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner.BiaffineNamedEntityRecognizer
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/ner/index.md:
--------------------------------------------------------------------------------
1 | # ner
2 |
3 | Named Entity Recognition.
4 |
5 | ```{toctree}
6 | transformer_ner
7 | rnn_ner
8 | biaffine_ner
9 | ```
10 |
11 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/ner/rnn_ner.md:
--------------------------------------------------------------------------------
1 | # rnn_ner
2 |
3 | Tagging based Named Entity Recognition.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.ner.rnn_ner
7 |
8 | .. autoclass:: hanlp.components.ner.rnn_ner.RNNNamedEntityRecognizer
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/ner/transformer_ner.md:
--------------------------------------------------------------------------------
1 | # transformer_ner
2 |
3 | Tagging based Named Entity Recognition.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.ner.transformer_ner
7 |
8 | .. autoclass:: hanlp.components.ner.transformer_ner.TransformerNamedEntityRecognizer
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/parsers/biaffine_dep.md:
--------------------------------------------------------------------------------
1 | # biaffine_dep
2 |
3 | Biaffine dependency parser.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components
7 |
8 | .. autoclass:: hanlp.components.parsers.biaffine.biaffine_dep.BiaffineDependencyParser
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/parsers/biaffine_sdp.md:
--------------------------------------------------------------------------------
1 | # biaffine_sdp
2 |
3 | Biaffine semantic dependency parser.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components
7 |
8 | .. autoclass:: hanlp.components.parsers.biaffine.biaffine_sdp.BiaffineSemanticDependencyParser
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/parsers/crf_constituency_parser.md:
--------------------------------------------------------------------------------
1 | # crf_constituency_parser
2 |
3 | CRF constituency parser.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components
7 |
8 | .. autoclass:: hanlp.components.parsers.constituency.crf_constituency_parser.CRFConstituencyParser
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/parsers/index.md:
--------------------------------------------------------------------------------
1 | # parsers
2 |
3 | Parsers.
4 |
5 | ```{toctree}
6 | biaffine_dep
7 | biaffine_sdp
8 | ud_parser
9 | crf_constituency_parser
10 | ```
11 |
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/parsers/ud_parser.md:
--------------------------------------------------------------------------------
1 | # ud_parser
2 |
3 | Universal Dependencies Parsing (lemmatization, features, PoS tagging and dependency parsing).
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components
7 |
8 | .. autoclass:: hanlp.components.parsers.ud.ud_parser.UniversalDependenciesParser
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/pipeline.md:
--------------------------------------------------------------------------------
1 | # pipeline
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.components.pipeline
5 |
6 | .. autoclass:: hanlp.components.pipeline.Pipe
7 | :members:
8 |
9 | .. autoclass:: hanlp.components.pipeline.Pipeline
10 | :members:
11 |
12 | ```
13 |
14 |
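15 | A minimal two-stage pipeline, chaining rule-based sentence splitting into neural tokenization; the tokenizer identifier comes from `hanlp.pretrained.tok` and may differ across versions:
16 |
17 | ```python
18 | import hanlp
19 | from hanlp.utils.rules import split_sentence
20 |
21 | tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
22 | pipeline = hanlp.pipeline() \
23 |     .append(split_sentence, output_key='sentences') \
24 |     .append(tok, input_key='sentences', output_key='tok')
25 | print(pipeline('商品和服务。晓美焰来到北京立方庭。'))
26 | ```
27 |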
--------------------------------------------------------------------------------
/docs/api/hanlp/components/srl/index.md:
--------------------------------------------------------------------------------
1 | # srl
2 |
3 | Semantic Role Labelers.
4 |
5 | ```{toctree}
6 | span_rank
7 | span_bio
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/srl/span_bio.md:
--------------------------------------------------------------------------------
1 | # span_bio
2 |
3 | Span BIO tagging based SRL.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.srl.span_bio.span_bio
7 |
8 | .. autoclass:: SpanBIOSemanticRoleLabeler
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/srl/span_rank.md:
--------------------------------------------------------------------------------
1 | # span_rank
2 |
3 | Span Rank based SRL.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.srl.span_rank.span_rank
7 |
8 | .. autoclass:: SpanRankingSemanticRoleLabeler
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/sts.md:
--------------------------------------------------------------------------------
1 | # sts
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.components.sts
5 |
6 | .. autoclass:: hanlp.components.sts.transformer_sts.TransformerSemanticTextualSimilarity
7 | :members:
8 |
9 | ```
10 |
11 |
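12 | A usage sketch: the component scores pairs of sentences. The identifier below comes from `hanlp.pretrained.sts` and may differ across versions:
13 |
14 | ```python
15 | import hanlp
16 |
17 | sts = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)
18 | print(sts([('看图猜一电影名', '看图猜电影'),
19 |            ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用')]))
20 | # -> one similarity score per pair
21 | ```
22 |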
--------------------------------------------------------------------------------
/docs/api/hanlp/components/taggers/index.md:
--------------------------------------------------------------------------------
1 | # taggers
2 |
3 | Taggers.
4 |
5 | ```{toctree}
6 | transformer_tagger
7 | rnn_tagger
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/taggers/rnn_tagger.md:
--------------------------------------------------------------------------------
1 | # rnn_tagger
2 |
3 | RNN based tagger.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components
7 |
8 | .. autoclass:: hanlp.components.taggers.rnn_tagger.RNNTagger
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/taggers/transformer_tagger.md:
--------------------------------------------------------------------------------
1 | # transformer_tagger
2 |
3 | Transformer based tagger.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components
7 |
8 | .. autoclass:: hanlp.components.taggers.transformers.transformer_tagger.TransformerTagger
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/tokenizers/index.md:
--------------------------------------------------------------------------------
1 | # tokenizers
2 |
3 | Tokenizers.
4 |
5 | ```{toctree}
6 | transformer
7 | multi_criteria
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/tokenizers/multi_criteria.md:
--------------------------------------------------------------------------------
1 | # multi_criteria
2 |
3 | Transformer based multi-criteria Chinese word segmentation tokenizer.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.tokenizers.multi_criteria_cws_transformer
7 |
8 | .. autoclass:: hanlp.components.tokenizers.multi_criteria_cws_transformer.MultiCriteriaTransformerTaggingTokenizer
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/components/tokenizers/transformer.md:
--------------------------------------------------------------------------------
1 | # transformer
2 |
3 | Transformer based tokenizer.
4 |
5 | ```{eval-rst}
6 | .. currentmodule:: hanlp.components.tokenizers.transformer
7 |
8 | .. autoclass:: hanlp.components.tokenizers.transformer.TransformerTaggingTokenizer
9 | :members:
10 |
11 | ```
12 |
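13 | A usage sketch; the identifier comes from `hanlp.pretrained.tok`, and batching several sentences in one call is faster than looping:
14 |
15 | ```python
16 | import hanlp
17 |
18 | tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
19 | print(tok('商品和服务'))  # -> ['商品', '和', '服务']
20 | print(tok(['商品和服务', '晓美焰来到北京立方庭。']))  # batched input
21 | ```
22 |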
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/constituency/constituency_dataset.md:
--------------------------------------------------------------------------------
1 | # constituency_dataset
2 |
3 | ```{eval-rst}
4 |
5 | .. autoclass:: hanlp.datasets.parsing.loaders.constituency_dataset.ConstituencyDataset
6 | :members:
7 |
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/constituency/index.md:
--------------------------------------------------------------------------------
1 | # con
2 |
3 | Constituency parsing datasets.
4 |
5 | ```{toctree}
6 | constituency_dataset
7 | resources
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/constituency/resources.md:
--------------------------------------------------------------------------------
1 | # resources
2 |
3 | ## Chinese Treebank
4 |
5 |
6 | ### CTB8
7 |
8 |
9 |
10 | ````{margin} **Discussion**
11 | ```{seealso}
12 | About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024).
13 | ```
14 | ````
15 |
16 | ```{eval-rst}
17 |
18 |
19 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TRAIN
20 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_DEV
21 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_BRACKET_LINE_NOEC_TEST
22 |
23 | ```
24 |
25 | ### CTB9
26 |
27 | ````{margin} **Discussion**
28 | ```{seealso}
29 | About our data split on [our forum](https://bbs.hankcs.com/t/topic/3024).
30 | ```
31 | ````
32 |
33 | ```{eval-rst}
34 |
35 |
36 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TRAIN
37 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_DEV
38 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_BRACKET_LINE_NOEC_TEST
39 |
40 | ```
41 |
42 | ## English Treebank
43 |
44 | ### PTB
45 |
46 | ```{eval-rst}
47 |
48 | .. autodata:: hanlp.datasets.parsing.ptb.PTB_TRAIN
49 | .. autodata:: hanlp.datasets.parsing.ptb.PTB_DEV
50 | .. autodata:: hanlp.datasets.parsing.ptb.PTB_TEST
51 |
52 | ```
53 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/dep/conll_dataset.md:
--------------------------------------------------------------------------------
1 | # conll
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.datasets.parsing.loaders.conll_dataset
5 |
6 |
7 | .. autoclass:: CoNLLParsingDataset
8 | :members:
9 |
10 | ```
11 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/dep/index.md:
--------------------------------------------------------------------------------
1 | # dep
2 |
3 | Dependency parsing datasets.
4 |
5 | ```{toctree}
6 | conll_dataset
7 | resources
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/eos/eos.md:
--------------------------------------------------------------------------------
1 | # eos
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.datasets.eos.eos
5 |
6 | .. autoclass:: SentenceBoundaryDetectionDataset
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/eos/index.md:
--------------------------------------------------------------------------------
1 | # eos
2 |
3 | Sentence boundary detection datasets.
4 |
5 | ```{toctree}
6 | eos
7 | resources
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/eos/resources.md:
--------------------------------------------------------------------------------
1 | # resources
2 |
3 | ## nn_eos
4 |
5 | ```{eval-rst}
6 |
7 | .. automodule:: hanlp.datasets.eos.loaders.nn_eos
8 | :members:
9 |
10 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/index.md:
--------------------------------------------------------------------------------
1 | # datasets
2 |
3 | ```{eval-rst}
4 | NLP datasets grouped by task. For each task, we provide at least one ``torch.utils.data.Dataset`` compatible class
5 | and several open-source resources. Their file formats and descriptions can be found in the corresponding
6 | ``Dataset.load_file`` documentation. Their contents are split into ``TRAIN``, ``DEV`` and ``TEST`` portions, each of
7 | which is stored in a Python constant that can be fetched using :meth:`~hanlp.utils.io_util.get_resource`.
8 | ```
9 |
10 | ````{margin} **Professionals use Linux**
11 | ```{note}
12 | Many preprocessing scripts written by professionals make heavy use of Linux/Unix tool chains like shell, perl, gcc,
13 | etc., which are unavailable or buggy on Windows. You may need a *nix environment to run these scripts.
14 | ```
15 | ````
16 |
17 | ```{toctree}
18 | eos/index
19 | tok/index
20 | pos/index
21 | ner/index
22 | dep/index
23 | srl/index
24 | constituency/index
25 | ```
26 |
27 |
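28 | For example, resolving one such constant to a local path; the `MSRA_NER_TRAIN` constant name is assumed here, and any `TRAIN`/`DEV`/`TEST` constant works the same way:
29 |
30 | ```python
31 | from hanlp.datasets.ner.msra import MSRA_NER_TRAIN
32 | from hanlp.utils.io_util import get_resource
33 |
34 | path = get_resource(MSRA_NER_TRAIN)  # downloads and caches on first use
35 | print(path)                          # a local path ready for Dataset.load_file
36 | ```
37 |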
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/ner/index.md:
--------------------------------------------------------------------------------
1 | # ner
2 |
3 | NER datasets.
4 |
5 | ```{toctree}
6 | tsv
7 | json
8 | resources
9 | ```
10 |
11 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/ner/json.md:
--------------------------------------------------------------------------------
1 | # json
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.datasets.ner.loaders.json_ner
5 |
6 | .. autoclass:: JsonNERDataset
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/ner/resources.md:
--------------------------------------------------------------------------------
1 | # resources
2 |
3 | ## CoNLL 2003
4 |
5 | ```{eval-rst}
6 |
7 | .. automodule:: hanlp.datasets.ner.conll03
8 | :members:
9 |
10 | ```
11 |
12 | ## MSRA
13 |
14 | ```{eval-rst}
15 |
16 | .. automodule:: hanlp.datasets.ner.msra
17 | :members:
18 |
19 | ```
20 |
21 | ## OntoNotes5
22 |
23 | ```{eval-rst}
24 |
25 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN
26 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV
27 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST
28 |
29 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TRAIN
30 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_DEV
31 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_NER_CHINESE_TEST
32 |
33 | ```
34 |
35 | ## Resume
36 |
37 | ```{eval-rst}
38 |
39 | .. automodule:: hanlp.datasets.ner.resume
40 | :members:
41 | ```
42 |
43 | ## Weibo
44 |
45 |
46 | ```{eval-rst}
47 |
48 | .. automodule:: hanlp.datasets.ner.weibo
49 | :members:
50 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/ner/tsv.md:
--------------------------------------------------------------------------------
1 | # tsv
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.datasets.ner.loaders.tsv
5 |
6 | .. autoclass:: TSVTaggingDataset
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/pos/index.md:
--------------------------------------------------------------------------------
1 | # pos
2 |
3 | PoS datasets.
4 |
5 | ```{eval-rst}
6 | PoS tagging is a standard tagging task, which reuses :class:`hanlp.datasets.ner.loaders.tsv.TSVTaggingDataset` for loading.
7 | ```
8 |
9 | ```{toctree}
10 | resources
11 | ```
12 |
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/pos/resources.md:
--------------------------------------------------------------------------------
1 | # resources
2 |
3 | ## CTB5
4 |
5 | ```{eval-rst}
6 |
7 | .. automodule:: hanlp.datasets.pos.ctb5
8 | :members:
9 |
10 | ```
11 |
12 | ## CTB8
13 |
14 | ```{eval-rst}
15 |
16 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TRAIN
17 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_DEV
18 | .. autodata:: hanlp.datasets.parsing.ctb8.CTB8_POS_TEST
19 |
20 | ```
21 |
22 | ## CTB9
23 |
24 |
25 | ```{eval-rst}
26 |
27 |
28 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TRAIN
29 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_DEV
30 | .. autodata:: hanlp.datasets.parsing.ctb9.CTB9_POS_TEST
31 |
32 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/srl/conll2012_dataset.md:
--------------------------------------------------------------------------------
1 | # conll2012_dataset
2 |
3 | ```{eval-rst}
4 |
5 | .. autoclass:: hanlp.datasets.srl.loaders.conll2012.CoNLL2012SRLDataset
6 | :members:
7 |
8 | ```
9 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/srl/index.md:
--------------------------------------------------------------------------------
1 | # srl
2 |
3 | Semantic Role Labeling datasets.
4 |
5 | ```{toctree}
6 | conll2012_dataset
7 | resources
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/srl/resources.md:
--------------------------------------------------------------------------------
1 | # resources
2 |
3 | ## OntoNotes 5
4 |
5 | ### Chinese
6 |
7 | ```{eval-rst}
8 |
9 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TRAIN
10 | :noindex:
11 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_DEV
12 | :noindex:
13 | .. autodata:: hanlp.datasets.srl.ontonotes5.chinese.ONTONOTES5_CONLL12_CHINESE_TEST
14 | :noindex:
15 |
16 | ```
17 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/tok/index.md:
--------------------------------------------------------------------------------
1 | # tok
2 |
3 | Tokenization datasets.
4 |
5 | ```{toctree}
6 | txt
7 | mcws_dataset
8 | resources
9 | ```
10 |
11 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/tok/mcws_dataset.md:
--------------------------------------------------------------------------------
1 | # mcws_dataset
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.datasets.tokenization.loaders.multi_criteria_cws.mcws_dataset
5 |
6 | .. autoclass:: MultiCriteriaTextTokenizingDataset
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/tok/resources.md:
--------------------------------------------------------------------------------
1 | # resources
2 |
3 | ## sighan2005
4 |
5 | [The Second International Chinese Word Segmentation Bakeoff](http://sighan.cs.uchicago.edu/bakeoff2005/) took place over the summer of 2005.
6 |
7 | ### pku
8 |
9 | ```{eval-rst}
10 |
11 | .. automodule:: hanlp.datasets.tokenization.sighan2005.pku
12 | :members:
13 |
14 | ```
15 |
16 | ### msr
17 |
18 | ```{eval-rst}
19 |
20 | .. automodule:: hanlp.datasets.tokenization.sighan2005.msr
21 | :members:
22 |
23 | ```
24 |
25 | ### as
26 |
27 | ```{eval-rst}
28 |
29 | .. automodule:: hanlp.datasets.tokenization.sighan2005.as_
30 | :members:
31 |
32 | ```
33 |
34 | ### cityu
35 |
36 | ```{eval-rst}
37 |
38 | .. automodule:: hanlp.datasets.tokenization.sighan2005.cityu
39 | :members:
40 |
41 | ```
42 |
43 | ## CTB6
44 |
45 | ```{eval-rst}
46 |
47 | .. automodule:: hanlp.datasets.tokenization.ctb6
48 | :members:
49 |
50 | ```
51 |
52 | ## CTB8
53 |
54 |
55 | ```{eval-rst}
56 |
57 | .. automodule:: hanlp.datasets.parsing.ctb8
58 |
59 | .. autodata:: CTB8_CWS_TRAIN
60 | .. autodata:: CTB8_CWS_DEV
61 | .. autodata:: CTB8_CWS_TEST
62 |
63 | ```
64 |
65 | ## CTB9
66 |
67 |
68 | ```{eval-rst}
69 |
70 | .. automodule:: hanlp.datasets.parsing.ctb9
71 |
72 | .. autodata:: CTB9_CWS_TRAIN
73 | .. autodata:: CTB9_CWS_DEV
74 | .. autodata:: CTB9_CWS_TEST
75 |
76 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/datasets/tok/txt.md:
--------------------------------------------------------------------------------
1 | # txt
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp.datasets.tokenization.loaders.txt
5 |
6 | .. autoclass:: TextTokenizingDataset
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/hanlp.rst:
--------------------------------------------------------------------------------
1 | .. _api/main:
2 |
3 | hanlp
4 | ==========
5 |
6 | .. currentmodule:: hanlp
7 |
8 | .. autofunction:: load
9 |
10 | .. autofunction:: pipeline
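11 | 
12 | A minimal usage sketch, assuming the ``COARSE_ELECTRA_SMALL_ZH`` tokenizer constant documented under ``hanlp.pretrained.tok``:
13 | 
14 | .. code-block:: python
15 | 
16 |     import hanlp
17 | 
18 |     # Fetch a pretrained component by its identifier constant.
19 |     tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
20 |     # Chain components into a pipeline; each output is stored under its key.
21 |     pipe = hanlp.pipeline().append(tok, output_key='tok')
22 |     print(pipe('商品和服务'))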
--------------------------------------------------------------------------------
/docs/api/hanlp/index.md:
--------------------------------------------------------------------------------
1 | # hanlp
2 |
3 | Core APIs for `hanlp`.
4 |
5 | ```{toctree}
6 | hanlp
7 | common/index
8 | components/index
9 | pretrained/index
10 | datasets/index
11 | utils/index
12 | layers/index
13 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/decoders/biaffine_ner.md:
--------------------------------------------------------------------------------
1 | # biaffine_ner
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.components.ner.biaffine_ner.biaffine_ner_model.BiaffineNamedEntityRecognitionDecoder
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/decoders/index.md:
--------------------------------------------------------------------------------
1 | # decoders
2 |
3 | ```{toctree}
4 | linear_crf
5 | biaffine_ner
6 | ```
7 |
8 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/decoders/linear_crf.md:
--------------------------------------------------------------------------------
1 | # linear_crf
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.components.mtl.tasks.pos.LinearCRFDecoder
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/embeddings/char_cnn.md:
--------------------------------------------------------------------------------
1 | # char_cnn
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNN
7 | :members:
8 |
9 | .. autoclass:: hanlp.layers.embeddings.char_cnn.CharCNNEmbedding
10 | :members:
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/embeddings/char_rnn.md:
--------------------------------------------------------------------------------
1 | # char_rnn
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNN
7 | :members:
8 |
9 | .. autoclass:: hanlp.layers.embeddings.char_rnn.CharRNNEmbedding
10 | :members:
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/embeddings/embedding.md:
--------------------------------------------------------------------------------
1 | # embedding
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.layers.embeddings.embedding.Embedding
7 | :members:
8 |
9 | .. autoclass:: hanlp.layers.embeddings.embedding.ConcatModuleList
10 | :members:
11 |
12 | .. autoclass:: hanlp.layers.embeddings.embedding.EmbeddingList
13 | :members:
14 |
15 | ```
16 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/embeddings/fasttext.md:
--------------------------------------------------------------------------------
1 | # fasttext
2 |
3 | ```{eval-rst}
4 |
5 | .. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbedding
6 | :members:
7 |
8 | .. autoclass:: hanlp.layers.embeddings.fast_text.FastTextEmbeddingModule
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/embeddings/index.md:
--------------------------------------------------------------------------------
1 | # embeddings
2 |
3 | ```{toctree}
4 | embedding
5 | word2vec
6 | fasttext
7 | char_cnn
8 | char_rnn
9 | transformer
10 | ```
11 |
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/embeddings/transformer.md:
--------------------------------------------------------------------------------
1 | # transformer
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbedding
7 | :members:
8 |
9 | .. autoclass:: hanlp.layers.embeddings.contextual_word_embedding.ContextualWordEmbeddingModule
10 | :members:
11 |
12 | ```
13 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/embeddings/word2vec.md:
--------------------------------------------------------------------------------
1 | # word2vec
2 |
3 | ```{eval-rst}
4 |
5 | .. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbedding
6 | :members:
7 |
8 | .. autoclass:: hanlp.layers.embeddings.word2vec.Word2VecEmbeddingModule
9 | :members:
10 |
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/index.md:
--------------------------------------------------------------------------------
1 | # layers
2 |
3 | ```{toctree}
4 | embeddings/index
5 | transformers/index
6 | decoders/index
7 | ```
8 |
9 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/transformers/encoder.md:
--------------------------------------------------------------------------------
1 | # encoder
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.layers.transformers.encoder.TransformerEncoder
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/transformers/index.md:
--------------------------------------------------------------------------------
1 | # transformers
2 |
3 | ```{toctree}
4 | encoder
5 | tokenizer
6 | ```
7 |
8 |
--------------------------------------------------------------------------------
/docs/api/hanlp/layers/transformers/tokenizer.md:
--------------------------------------------------------------------------------
1 | # tokenizer
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. autoclass:: hanlp.transform.transformer_tokenizer.TransformerSequenceTokenizer
7 | :members:
8 |
9 | ```
10 |
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/amr.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: ipynb,md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: '0.8'
8 | jupytext_version: 1.4.2
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 | # amr
15 |
16 | AMR captures “who is doing what to whom” in a sentence. Each sentence is represented as a rooted, directed, acyclic graph with labels on edges (relations) and leaves (concepts).
17 | Before loading an AMR model, make sure to install HanLP with the `amr` dependencies:
18 |
19 | ```shell
20 | pip install hanlp[amr] -U
21 | ```
22 |
23 | To parse a raw sentence into AMR:
24 |
25 | ```{eval-rst}
26 | .. margin:: Batching is Faster
27 |
28 | .. Hint:: Parse multiple sentences at once for faster speed!
29 | ```
30 |
31 |
32 | ```{code-cell} ipython3
33 | :tags: [output_scroll]
34 | import hanlp
35 |
36 | amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE)
37 | amr = amr_parser('The boy wants the girl to believe him.')
38 | print(amr)
39 | ```
40 |
41 | All the pre-trained parsers and their scores are listed below.
42 |
43 | ```{eval-rst}
44 |
45 | .. automodule:: hanlp.pretrained.amr
46 | :members:
47 |
48 | ```
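49 | 
50 | As the hint above suggests, batching several sentences in one call is faster. A minimal sketch, assuming the parser loaded earlier returns one graph per input sentence:
51 | 
52 | ```python
53 | # A list input is parsed as one batch.
54 | graphs = amr_parser([
55 |     'The boy wants the girl to believe him.',
56 |     'The girl does not believe him.',
57 | ])
58 | for graph in graphs:
59 |     print(graph)
60 | ```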
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/dep.md:
--------------------------------------------------------------------------------
1 | # dep
2 |
3 | ```{eval-rst}
4 |
5 | .. automodule:: hanlp.pretrained.dep
6 | :members:
7 |
8 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/eos.md:
--------------------------------------------------------------------------------
1 | # eos
2 |
3 |
4 | ```{eval-rst}
5 |
6 | .. automodule:: hanlp.pretrained.eos
7 | :members:
8 |
9 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/fasttext.md:
--------------------------------------------------------------------------------
1 | # fasttext
2 |
3 | ```{eval-rst}
4 |
5 | .. automodule:: hanlp.pretrained.fasttext
6 | :members:
7 |
8 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/glove.md:
--------------------------------------------------------------------------------
1 | # glove
2 |
3 | ```{eval-rst}
4 |
5 | .. automodule:: hanlp.pretrained.glove
6 | :members:
7 |
8 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/index.md:
--------------------------------------------------------------------------------
1 | # pretrained
2 |
3 | ```{eval-rst}
4 | NLP components grouped by tasks. For each task, we provide at least one class compatible with
5 | :class:`~hanlp.common.component.Component`, plus several pretrained models. Each model is stored in a Python
6 | constant that can be fetched with :func:`hanlp.load`; a fetching sketch follows the toctree below.
7 | ```
8 |
9 | ```{toctree}
10 | mtl
11 | eos
12 | tok
13 | pos
14 | ner
15 | dep
16 | constituency
17 | srl
18 | sdp
19 | amr
20 | amr2text
21 | sts
22 | word2vec
23 | glove
24 | fasttext
25 | mlm
26 | ```
27 |
28 |
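29 | For example, a tokenizer can be fetched through its identifier constant; a minimal sketch assuming the `COARSE_ELECTRA_SMALL_ZH` constant documented on the [tok](tok) page:
30 | 
31 | ```python
32 | import hanlp
33 | 
34 | # The constant is a URL string; hanlp.load downloads and caches the model behind it.
35 | tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
36 | print(tok('商品和服务'))
37 | ```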
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/mtl.md:
--------------------------------------------------------------------------------
1 | # mtl
2 |
3 | ```{eval-rst}
4 |
5 | .. automodule:: hanlp.pretrained.mtl
6 | :members:
7 |
8 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/ner.md:
--------------------------------------------------------------------------------
1 | # ner
2 |
3 | ```{eval-rst}
4 |
5 | .. automodule:: hanlp.pretrained.ner
6 | :members:
7 |
8 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/pos.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: ipynb,md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: '0.8'
8 | jupytext_version: 1.4.2
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 |
15 | # pos
16 |
17 | The process of classifying words into their **parts of speech** and labeling them accordingly is known as **part-of-speech tagging**, **POS-tagging**, or simply **tagging**.
18 |
19 | To tag a tokenized sentence:
20 |
21 | ````{margin} Batching is Faster
22 | ```{hint}
23 | Tag multiple sentences at once for faster speed!
24 | ```
25 | ````
26 |
27 |
28 | ```{code-cell} ipython3
29 | :tags: [output_scroll]
30 | import hanlp
31 |
32 | pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
33 | pos(['我', '的', '希望', '是', '希望', '世界', '和平'])
34 | ```
35 |
36 | ````{margin} Custom Dictionary Supported
37 | ```{seealso}
38 | See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py) for custom dictionary.
39 | ```
40 | ````
41 |
42 | All the pre-trained taggers and their details are listed below.
43 |
44 | ```{eval-rst}
45 |
46 | .. automodule:: hanlp.pretrained.pos
47 | :members:
48 |
49 | ```
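50 | 
51 | As the batching hint suggests, multiple tokenized sentences can be tagged in one call. A minimal sketch reusing the tagger loaded above, assuming one tag list is returned per sentence:
52 | 
53 | ```python
54 | # A list of token lists is tagged as one batch.
55 | tags = pos([
56 |     ['我', '的', '希望'],
57 |     ['世界', '和平'],
58 | ])
59 | print(tags)
60 | ```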
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/sdp.md:
--------------------------------------------------------------------------------
1 | # sdp
2 |
3 | ```{eval-rst}
4 |
5 | .. automodule:: hanlp.pretrained.sdp
6 | :members:
7 |
8 | ```
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/srl.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: ipynb,md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: '0.8'
8 | jupytext_version: 1.4.2
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 |
15 | # srl
16 |
17 | Semantic Role Labeling (SRL) is a shallow semantic parsing task that produces predicate-argument structures, i.e., semantic roles (or participants) such as agent, patient, and theme associated with verbs.
18 |
19 | Inputs to SRL are tokenized sentences:
20 |
21 | ````{margin} Batching is Faster
22 | ```{hint}
23 | Feed in multiple sentences at once for faster speed!
24 | ```
25 | ````
26 |
27 |
28 | ```{code-cell} ipython3
29 | :tags: [output_scroll]
30 | import hanlp
31 |
32 | srl = hanlp.load(hanlp.pretrained.srl.CPB3_SRL_ELECTRA_SMALL)
33 | srl(['男孩', '希望', '女孩', '相信', '他', '。'])
34 | ```
35 |
36 | All the pre-trained labelers and their details are listed below.
37 |
38 | ```{eval-rst}
39 |
40 | .. automodule:: hanlp.pretrained.srl
41 | :members:
42 |
43 | ```
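44 | 
45 | As the batching hint suggests, several tokenized sentences can be labeled in one call. A minimal sketch reusing the labeler loaded above, assuming one list of predicate-argument structures is returned per sentence:
46 | 
47 | ```python
48 | # A list of token lists is labeled as one batch.
49 | structures = srl([
50 |     ['男孩', '希望', '女孩', '相信', '他', '。'],
51 |     ['女孩', '相信', '他', '。'],
52 | ])
53 | for pas in structures:
54 |     print(pas)
55 | ```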
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/sts.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: ipynb,md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: '0.8'
8 | jupytext_version: 1.4.2
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 |
15 | # sts
16 |
17 | The `sts` package holds pre-trained Semantic Textual Similarity (STS) models. We surveyed both supervised and
18 | unsupervised models, and we believe that unsupervised models are still immature at this moment. Unsupervised STS
19 | works well for IR but not for NLP, especially on sentences with little lexical overlap.
20 |
21 |
22 | ```{eval-rst}
23 |
24 | .. automodule:: hanlp.pretrained.sts
25 | :members:
26 |
27 | ```
28 |
29 | ```{code-cell} ipython3
30 | import hanlp
31 |
32 | sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)
33 | sim([
34 | ['看图猜一电影名', '看图猜电影'],
35 | ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'],
36 | ['北京到上海的动车票', '上海到北京的动车票'],
37 | ])
38 | ```
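39 | 
40 | The scores come back in input order, so they can be zipped onto the pairs. A minimal sketch, assuming one float score per pair:
41 | 
42 | ```python
43 | pairs = [
44 |     ['看图猜一电影名', '看图猜电影'],
45 |     ['北京到上海的动车票', '上海到北京的动车票'],
46 | ]
47 | # Pair each input with its predicted similarity score.
48 | for (a, b), score in zip(pairs, sim(pairs)):
49 |     print(a, b, score)
50 | ```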
--------------------------------------------------------------------------------
/docs/api/hanlp/pretrained/tok.md:
--------------------------------------------------------------------------------
1 | ---
2 | jupytext:
3 | formats: ipynb,md:myst
4 | text_representation:
5 | extension: .md
6 | format_name: myst
7 | format_version: '0.8'
8 | jupytext_version: 1.4.2
9 | kernelspec:
10 | display_name: Python 3
11 | language: python
12 | name: python3
13 | ---
14 |
15 | # tok
16 |
17 | Tokenization is a way of separating a sentence into smaller units called tokens. In lexical analysis, tokens usually refer to words.
18 |
19 | ````{margin} Batching is Faster
20 | ```{hint}
21 | Tokenize multiple sentences at once for faster speed!
22 | ```
23 | ````
24 | ````{margin} Custom Dictionary Supported
25 | ```{seealso}
26 | See [this tutorial](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py) for custom dictionary.
27 | ```
28 | ````
29 |
30 | To tokenize raw sentences:
31 |
32 |
33 | ```{code-cell} ipython3
34 | :tags: [output_scroll]
35 | import hanlp
36 |
37 | tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
38 | tok(['商品和服务。', '晓美焰来到北京立方庭参观自然语义科技公司'])
39 | ```
40 |
41 | All the pre-trained tokenizers and their details are listed below.
42 |
43 |
44 | ```{eval-rst}
45 |
46 | .. automodule:: hanlp.pretrained.tok
47 | :members:
48 |
49 | ```
50 |
51 |
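52 | As the margin note mentions, custom dictionaries are supported. A minimal sketch, assuming the `dict_force` attribute used in the linked tutorial:
53 | 
54 | ```python
55 | # Force these words to always be segmented as single tokens.
56 | tok.dict_force = {'立方庭', '自然语义'}
57 | print(tok('晓美焰来到北京立方庭参观自然语义科技公司'))
58 | ```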
--------------------------------------------------------------------------------
/docs/api/hanlp/utils/index.md:
--------------------------------------------------------------------------------
1 | # utils
2 |
3 | Utilities.
4 |
5 | ```{toctree}
6 | io_util
7 | ```
8 |
--------------------------------------------------------------------------------
/docs/api/hanlp/utils/io_util.md:
--------------------------------------------------------------------------------
1 | # io_util
2 |
3 | ```{eval-rst}
4 |
5 | .. currentmodule:: hanlp.utils
6 |
7 | .. automodule:: hanlp.utils.io_util
8 | :members:
9 |
10 | ```
11 |
--------------------------------------------------------------------------------
/docs/api/restful.rst:
--------------------------------------------------------------------------------
1 | .. _api/hanlp_restful:
2 |
3 | hanlp_restful
4 | ====================
5 |
6 | .. currentmodule:: hanlp_restful
7 |
8 | .. autoclass:: HanLPClient
9 | :members:
10 | :special-members:
11 | :exclude-members: __init__, __repr__, __weakref__
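12 | 
13 | A minimal usage sketch; anonymous users may pass ``None`` as the auth, subject to a rate limit:
14 | 
15 | .. code-block:: python
16 | 
17 |     from hanlp_restful import HanLPClient
18 | 
19 |     # Replace None with your auth key to lift the anonymous rate limit.
20 |     HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='zh')
21 |     print(HanLP.parse('晓美焰来到北京立方庭参观自然语义科技公司。'))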
--------------------------------------------------------------------------------
/docs/api/restful_golang.md:
--------------------------------------------------------------------------------
1 | # Golang RESTful API
2 |
3 | ## Install
4 |
5 | ```shell
6 | go get -u github.com/hankcs/gohanlp@main
7 | ```
8 |
9 | ## Quick Start
10 |
11 | Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initialize a `HanLPClient` and call its `Parse` interface.
12 |
13 | ```go
14 | package main
15 | 
16 | import (
17 |     "fmt"
18 |     "github.com/hankcs/gohanlp/hanlp"
19 | )
20 | 
21 | func main() {
22 |     client := hanlp.HanLPClient(hanlp.WithAuth("The auth you applied for")) // anonymous users can skip auth
23 |     s, _ := client.Parse("In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.", hanlp.WithLanguage("mul"))
24 |     fmt.Println(s)
25 | }
26 | ```
27 |
28 | Refer to our [testcases](https://github.com/hankcs/gohanlp/blob/main/main_test.go) and [data format](../data_format) for more details.
29 |
30 |
--------------------------------------------------------------------------------
/docs/api/restful_java.md:
--------------------------------------------------------------------------------
1 | # Java RESTful API
2 |
3 | Add the following dependency into the `pom.xml` file of your project.
4 |
5 | ```xml
6 | <dependency>
7 |     <groupId>com.hankcs.hanlp.restful</groupId>
8 |     <artifactId>hanlp-restful</artifactId>
9 |     <version>0.0.15</version>
10 | </dependency>
11 | ```
12 |
13 | Obtain an `auth` from any compatible service provider like our [free service](https://bbs.hankcs.com/t/apply-for-free-hanlp-restful-apis/3178), then initialize a `HanLPClient` and call its `parse` interface.
14 |
15 | ```java
16 | HanLPClient client = new HanLPClient("https://hanlp.hankcs.com/api", null); // Replace null with your auth
17 | System.out.println(client.parse("2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。晓美焰来到北京立方庭参观自然语义科技公司。"));
18 | ```
19 |
20 | Refer to our [testcases](https://github.com/hankcs/HanLP/blob/master/plugins/hanlp_restful_java/src/test/java/com/hankcs/hanlp/restful/HanLPClientTest.java) and [data format](../data_format) for more details.
21 |
22 |
--------------------------------------------------------------------------------
/docs/api/trie/dictionary.md:
--------------------------------------------------------------------------------
1 | # dictionary
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp_trie
5 |
6 | .. autoclass:: hanlp_trie.dictionary.DictInterface
7 | :members:
8 |
9 | .. autoclass:: hanlp_trie.dictionary.TrieDict
10 | :members:
11 | ```
12 |
--------------------------------------------------------------------------------
/docs/api/trie/index.md:
--------------------------------------------------------------------------------
1 | # hanlp_trie
2 |
3 | HanLP trie/dictionary interfaces and their reference implementations.
4 |
5 | ```{toctree}
6 | trie
7 | dictionary
8 | ```
9 |
10 |
--------------------------------------------------------------------------------
/docs/api/trie/trie.md:
--------------------------------------------------------------------------------
1 | # trie
2 |
3 | ```{eval-rst}
4 | .. currentmodule:: hanlp_trie
5 |
6 | .. autoclass:: hanlp_trie.trie.Node
7 | :members:
8 |
9 | .. autoclass:: hanlp_trie.trie.Trie
10 | :members:
11 | ```
12 |
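13 | A minimal usage sketch, assuming `Trie` supports dict-style insertion and longest matching via `parse_longest`:
14 | 
15 | ```python
16 | from hanlp_trie import Trie
17 | 
18 | trie = Trie()
19 | trie['自然'] = 'nature'
20 | trie['自然语言'] = 'NLP'
21 | # parse_longest returns (begin, end, value) triples for the longest matches.
22 | print(trie.parse_longest('自然语言处理很自然'))
23 | ```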
--------------------------------------------------------------------------------
/docs/references.rst:
--------------------------------------------------------------------------------
1 | References
2 | ==================
3 |
4 | .. bibliography:: references.bib
5 | :cited:
6 | :style: astrostyle
--------------------------------------------------------------------------------
/hanlp/callbacks/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-05 02:10
--------------------------------------------------------------------------------
/hanlp/common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-08-26 14:45
4 |
--------------------------------------------------------------------------------
/hanlp/common/component.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-08-26 14:45
4 | import inspect
5 | from abc import ABC, abstractmethod
6 | from typing import Any
7 |
8 | from hanlp_common.configurable import Configurable
9 |
10 |
11 | class Component(Configurable, ABC):
12 | @abstractmethod
13 | def predict(self, *args, **kwargs):
14 |         """Predict on data. Component is the base class for all components, including rule-based and statistical ones.
15 |
16 | Args:
17 | *args: Any type of data subject to sub-classes
18 | **kwargs: Additional arguments
19 |
20 | Returns: Any predicted annotations.
21 |
22 | """
23 | raise NotImplementedError('%s.%s()' % (self.__class__.__name__, inspect.stack()[0][3]))
24 |
25 | def __call__(self, *args, **kwargs):
26 | """
27 |         A shortcut for :meth:`~hanlp.common.component.Component.predict`.
28 |
29 | Args:
30 | *args: Any type of data subject to sub-classes
31 | **kwargs: Additional arguments
32 |
33 | Returns: Any predicted annotations.
34 |
35 | """
36 | return self.predict(*args, **kwargs)
37 |
--------------------------------------------------------------------------------
/hanlp/components/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-08-26 16:10
4 | from .pipeline import Pipeline
--------------------------------------------------------------------------------
/hanlp/components/amr/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-20 17:35
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/amrbart/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-12-05 17:53
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/amrbart/common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-12-05 17:53
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/amrbart/data_interface/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-12-07 14:36
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/amrbart/model_interface/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-12-03 20:33
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/amrbart/preprocess/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-12-03 20:33
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/seq2seq/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-04-27 19:24
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/seq2seq/dataset/IO.py:
--------------------------------------------------------------------------------
1 | import glob
2 | from typing import List, Union, Iterable
3 | from pathlib import Path
4 | from .penman import pm_load as pm_load
5 |
6 |
7 | def read_raw_amr_data(
8 | paths: List[Union[str, Path]],
9 | use_recategorization=False,
10 | dereify=True,
11 | remove_wiki=False,
12 | ):
13 | assert paths
14 |
15 |     if isinstance(paths, (str, Path)) or not isinstance(paths, Iterable):  # a lone str/Path is itself iterable
16 | paths = [paths]
17 |
18 | graphs = []
19 | for path_ in paths:
20 | for path in glob.glob(str(path_)):
21 | path = Path(path)
22 |             assert path.exists(), f'{path} does not exist'
23 | graphs.extend(pm_load(path, dereify=dereify, remove_wiki=remove_wiki))
24 |
25 | assert graphs, 'No graphs loaded'
26 |
27 | if use_recategorization:
28 | for g in graphs:
29 | metadata = g.metadata
30 | metadata['snt_orig'] = metadata['snt']
31 | tokens = eval(metadata['tokens'])
32 | metadata['snt'] = ' '.join(
33 | [t for t in tokens if not ((t.startswith('-L') or t.startswith('-R')) and t.endswith('-'))])
34 |
35 | return graphs
36 |
--------------------------------------------------------------------------------
/hanlp/components/amr/seq2seq/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-04-27 19:29
4 |
--------------------------------------------------------------------------------
/hanlp/components/amr/seq2seq/evaluation.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import penman
4 |
5 |
6 | def write_predictions(predictions_path, tokenizer, graphs):
7 | pieces = [penman.encode(g) for g in graphs]
8 | text = '\n\n'.join(pieces)
9 | if tokenizer:
10 | text = text.replace(tokenizer.INIT, '')
11 | Path(predictions_path).write_text(text)
12 | return predictions_path
13 |
14 |
15 | def compute_smatch(pred, gold):
16 | from perin_parser.thirdparty.mtool import smatch
17 | with Path(pred).open() as p, Path(gold).open() as g:
18 | score = next(smatch.score_amr_pairs(p, g))
19 | return score[2]
20 |
21 |
22 | def compute_bleu(gold_sentences, pred_sentences):
23 | from sacrebleu import corpus_bleu
24 | return corpus_bleu(pred_sentences, [gold_sentences])
25 |
--------------------------------------------------------------------------------
/hanlp/components/classifiers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-11-10 13:18
--------------------------------------------------------------------------------
/hanlp/components/distillation/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-10-17 20:29
4 |
--------------------------------------------------------------------------------
/hanlp/components/eos/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-26 20:19
--------------------------------------------------------------------------------
/hanlp/components/lambda_wrapper.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-31 18:36
4 | from typing import Callable, Any
5 |
6 | from hanlp.common.component import Component
7 | from hanlp_common.reflection import classpath_of, object_from_classpath, str_to_type
8 |
9 |
10 | class LambdaComponent(Component):
11 | def __init__(self, function: Callable) -> None:
12 | super().__init__()
13 | self.config = {}
14 | self.function = function
15 | self.config['function'] = classpath_of(function)
16 | self.config['classpath'] = classpath_of(self)
17 |
18 | def predict(self, data: Any, **kwargs):
19 | unpack = kwargs.pop('_hanlp_unpack', None)
20 | if unpack:
21 | return self.function(*data, **kwargs)
22 | return self.function(data, **kwargs)
23 |
24 | @staticmethod
25 | def from_config(meta: dict, **kwargs):
26 | cls = str_to_type(meta['classpath'])
27 | function = meta['function']
28 | function = object_from_classpath(function)
29 | return cls(function)
30 |
--------------------------------------------------------------------------------
/hanlp/components/lm/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-01-29 21:07
4 |
--------------------------------------------------------------------------------
/hanlp/components/mtl/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-20 19:54
--------------------------------------------------------------------------------
/hanlp/components/mtl/tasks/ner/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-03 14:34
4 |
--------------------------------------------------------------------------------
/hanlp/components/mtl/tasks/srl/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-04 16:49
4 |
--------------------------------------------------------------------------------
/hanlp/components/mtl/tasks/tok/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-11 16:34
--------------------------------------------------------------------------------
/hanlp/components/ner/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-21 17:22
--------------------------------------------------------------------------------
/hanlp/components/ner/biaffine_ner/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-21 18:41
--------------------------------------------------------------------------------
/hanlp/components/parsers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-22 12:46
--------------------------------------------------------------------------------
/hanlp/components/parsers/biaffine/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-05-08 20:43
4 |
--------------------------------------------------------------------------------
/hanlp/components/parsers/biaffine_tf/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-26 23:03
--------------------------------------------------------------------------------
/hanlp/components/parsers/constituency/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-11-28 19:26
4 |
--------------------------------------------------------------------------------
/hanlp/components/parsers/ud/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-14 20:34
4 |
--------------------------------------------------------------------------------
/hanlp/components/parsers/ud/util.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-14 20:44
4 | from hanlp_common.constant import ROOT
5 | from hanlp.components.parsers.ud.lemma_edit import gen_lemma_rule
6 |
7 |
8 | def generate_lemma_rule(sample: dict):
9 | if 'LEMMA' in sample:
10 | sample['lemma'] = [gen_lemma_rule(word, lemma) if lemma != "_" else "_" for word, lemma in
11 | zip(sample['FORM'], sample['LEMMA'])]
12 | return sample
13 |
14 |
15 | def append_bos(sample: dict):
16 | if 'FORM' in sample:
17 | sample['token'] = [ROOT] + sample['FORM']
18 | if 'UPOS' in sample:
19 | sample['pos'] = sample['UPOS'][:1] + sample['UPOS']
20 | sample['arc'] = [0] + sample['HEAD']
21 | sample['rel'] = sample['DEPREL'][:1] + sample['DEPREL']
22 | sample['lemma'] = sample['lemma'][:1] + sample['lemma']
23 | sample['feat'] = sample['FEATS'][:1] + sample['FEATS']
24 | return sample
25 |
26 |
27 | def sample_form_missing(sample: dict):
28 | return all(t == '_' for t in sample['FORM'])
29 |
--------------------------------------------------------------------------------
/hanlp/components/srl/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-22 20:50
--------------------------------------------------------------------------------
/hanlp/components/srl/span_bio/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-04 13:59
4 |
--------------------------------------------------------------------------------
/hanlp/components/srl/span_rank/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-19 22:22
--------------------------------------------------------------------------------
/hanlp/components/srl/span_rank/util.py:
--------------------------------------------------------------------------------
1 | # Adopted from https://github.com/KiroSummer/A_Syntax-aware_MTL_Framework_for_Chinese_SRL
2 | import torch
3 |
4 |
5 | def block_orth_normal_initializer(input_size, output_size):
6 | weight = []
7 | for o in output_size:
8 | for i in input_size:
9 | param = torch.FloatTensor(o, i)
10 | torch.nn.init.orthogonal_(param)
11 | weight.append(param)
12 | return torch.cat(weight)
13 |
--------------------------------------------------------------------------------
/hanlp/components/sts/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-05-20 17:02
4 |
--------------------------------------------------------------------------------
/hanlp/components/taggers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-08-28 15:39
--------------------------------------------------------------------------------
/hanlp/components/taggers/ngram_conv/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-29 22:18
--------------------------------------------------------------------------------
/hanlp/components/taggers/pos_tf.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-05 23:05
4 | from hanlp.components.taggers.cnn_tagger_tf import CNNTaggerTF
5 | from hanlp.components.taggers.rnn_tagger_tf import RNNTaggerTF
6 |
7 |
8 | class CNNPartOfSpeechTaggerTF(CNNTaggerTF):
9 | pass
10 |
11 |
12 | class RNNPartOfSpeechTaggerTF(RNNTaggerTF):
13 | pass
14 |
--------------------------------------------------------------------------------
/hanlp/components/taggers/rnn/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-05-19 15:41
--------------------------------------------------------------------------------
/hanlp/components/taggers/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-29 13:57
--------------------------------------------------------------------------------
/hanlp/components/taggers/transformers/metrics_tf.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-30 16:33
4 | import tensorflow as tf
5 |
6 |
7 | class Accuracy(tf.keras.metrics.SparseCategoricalAccuracy):
8 |
9 | def __init__(self, name='sparse_categorical_accuracy', dtype=None, mask_value=0):
10 | super().__init__(name, dtype)
11 | self.mask_value = mask_value
12 |
13 | def update_state(self, y_true, y_pred, sample_weight=None):
14 | sample_weight = tf.not_equal(y_true, self.mask_value)
15 | return super().update_state(y_true, y_pred, sample_weight)
16 |
--------------------------------------------------------------------------------
/hanlp/components/taggers/util.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-01 00:31
4 | from typing import List, Tuple
5 | from hanlp.utils.span_util import allowed_transitions
6 |
7 |
8 | def guess_tagging_scheme(labels: List[str]) -> str:
9 | tagset = set(y.split('-')[0] for y in labels)
10 | for scheme in "BIO", "BIOUL", "BMES", 'IOBES':
11 | if tagset == set(list(scheme)):
12 | return scheme
13 |
14 |
15 | def guess_allowed_transitions(labels) -> List[Tuple[int, int]]:
16 | scheme = guess_tagging_scheme(labels)
17 | if not scheme:
18 | return None
19 | if scheme == 'IOBES':
20 | scheme = 'BIOUL'
21 | labels = [y.replace('E-', 'L-').replace('S-', 'U-') for y in labels]
22 | return allowed_transitions(scheme, dict(enumerate(labels)))
23 |
--------------------------------------------------------------------------------
/hanlp/components/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-11 02:48
--------------------------------------------------------------------------------
/hanlp/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-06-13 18:15
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/classification/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-11-10 11:49
--------------------------------------------------------------------------------
/hanlp/datasets/classification/sentiment.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-30 21:03
4 | _ERNIE_TASK_DATA = 'https://ernie.bj.bcebos.com/task_data_zh.tgz#'
5 |
6 | CHNSENTICORP_ERNIE_TRAIN = _ERNIE_TASK_DATA + 'chnsenticorp/train.tsv'
7 | CHNSENTICORP_ERNIE_DEV = _ERNIE_TASK_DATA + 'chnsenticorp/dev.tsv'
8 | CHNSENTICORP_ERNIE_TEST = _ERNIE_TASK_DATA + 'chnsenticorp/test.tsv'
9 |
--------------------------------------------------------------------------------
/hanlp/datasets/coref/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-04 13:39
--------------------------------------------------------------------------------
/hanlp/datasets/coref/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:03
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/eos/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-26 18:11
--------------------------------------------------------------------------------
/hanlp/datasets/eos/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:03
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/eos/loaders/nn_eos.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-24 22:51
4 | _SETIMES2_EN_HR_SENTENCES_HOME = 'https://schweter.eu/cloud/nn_eos/SETIMES2.en-hr.sentences.tar.xz'
5 | SETIMES2_EN_HR_HR_SENTENCES_TRAIN = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.train'
6 | '''Training set of SETimes corpus.'''
7 | SETIMES2_EN_HR_HR_SENTENCES_DEV = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.dev'
8 | '''Dev set of SETimes corpus.'''
9 | SETIMES2_EN_HR_HR_SENTENCES_TEST = _SETIMES2_EN_HR_SENTENCES_HOME + '#SETIMES2.en-hr.hr.sentences.test'
10 | '''Test set of SETimes corpus.'''
11 | _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME = 'http://schweter.eu/cloud/nn_eos/europarl-v7.de-en.en.sentences.tar.xz'
12 | EUROPARL_V7_DE_EN_EN_SENTENCES_TRAIN = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.train'
13 | '''Training set of Europarl corpus (:cite:`koehn2005europarl`).'''
14 | EUROPARL_V7_DE_EN_EN_SENTENCES_DEV = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.dev'
15 | '''Dev set of Europarl corpus (:cite:`koehn2005europarl`).'''
16 | EUROPARL_V7_DE_EN_EN_SENTENCES_TEST = _EUROPARL_V7_DE_EN_EN_SENTENCES_HOME + '#europarl-v7.de-en.en.sentences.test'
17 | '''Test set of Europarl corpus (:cite:`koehn2005europarl`).'''
18 |
--------------------------------------------------------------------------------
/hanlp/datasets/lm/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-05 21:41
4 |
5 | _PTB_HOME = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz#'
6 | PTB_TOKEN_TRAIN = _PTB_HOME + 'data/ptb.train.txt'
7 | PTB_TOKEN_DEV = _PTB_HOME + 'data/ptb.valid.txt'
8 | PTB_TOKEN_TEST = _PTB_HOME + 'data/ptb.test.txt'
9 |
10 | PTB_CHAR_TRAIN = _PTB_HOME + 'data/ptb.char.train.txt'
11 | PTB_CHAR_DEV = _PTB_HOME + 'data/ptb.char.valid.txt'
12 | PTB_CHAR_TEST = _PTB_HOME + 'data/ptb.char.test.txt'
13 |
--------------------------------------------------------------------------------
/hanlp/datasets/lm/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:04
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/lu/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:08
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/lu/glue.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-11-10 11:47
4 | from hanlp.common.dataset import TableDataset
5 |
6 | STANFORD_SENTIMENT_TREEBANK_2_TRAIN = 'http://file.hankcs.com/corpus/SST2.zip#train.tsv'
7 | STANFORD_SENTIMENT_TREEBANK_2_DEV = 'http://file.hankcs.com/corpus/SST2.zip#dev.tsv'
8 | STANFORD_SENTIMENT_TREEBANK_2_TEST = 'http://file.hankcs.com/corpus/SST2.zip#test.tsv'
9 |
10 | MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TRAIN = 'http://file.hankcs.com/corpus/mrpc.zip#train.tsv'
11 | MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_DEV = 'http://file.hankcs.com/corpus/mrpc.zip#dev.tsv'
12 | MICROSOFT_RESEARCH_PARAPHRASE_CORPUS_TEST = 'http://file.hankcs.com/corpus/mrpc.zip#test.tsv'
13 |
14 |
15 | class SST2Dataset(TableDataset):
16 | pass
17 |
18 |
19 | def main():
20 | dataset = SST2Dataset(STANFORD_SENTIMENT_TREEBANK_2_TEST)
21 | print(dataset)
22 |
23 |
24 | if __name__ == '__main__':
25 | main()
26 |
--------------------------------------------------------------------------------
/hanlp/datasets/ner/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-06 15:32
--------------------------------------------------------------------------------
/hanlp/datasets/ner/conll03.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-06 15:31
4 |
5 |
6 | CONLL03_EN_TRAIN = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.train.tsv'
7 | '''Training set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
8 | CONLL03_EN_DEV = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.dev.tsv'
9 | '''Dev set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
10 | CONLL03_EN_TEST = 'https://file.hankcs.com/corpus/conll03_en_iobes.zip#eng.test.tsv'
11 | '''Test set of CoNLL03 (:cite:`tjong-kim-sang-de-meulder-2003-introduction`)'''
12 |
--------------------------------------------------------------------------------
/hanlp/datasets/ner/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:04
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/ner/resume.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-08 12:10
4 | from hanlp.common.dataset import TransformableDataset
5 |
6 | from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv
7 |
8 | _RESUME_NER_HOME = 'https://github.com/jiesutd/LatticeLSTM/archive/master.zip#'
9 |
10 | RESUME_NER_TRAIN = _RESUME_NER_HOME + 'ResumeNER/train.char.bmes'
11 | '''Training set of Resume in char level.'''
12 | RESUME_NER_DEV = _RESUME_NER_HOME + 'ResumeNER/dev.char.bmes'
13 | '''Dev set of Resume in char level.'''
14 | RESUME_NER_TEST = _RESUME_NER_HOME + 'ResumeNER/test.char.bmes'
15 | '''Test set of Resume in char level.'''
16 |
17 |
--------------------------------------------------------------------------------
/hanlp/datasets/ner/weibo.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-03 23:33
4 | from hanlp.common.dataset import TransformableDataset
5 |
6 | from hanlp.utils.io_util import get_resource, generate_words_tags_from_tsv
7 |
8 | _WEIBO_NER_HOME = 'https://github.com/hltcoe/golden-horse/archive/master.zip#data/'
9 |
10 | WEIBO_NER_TRAIN = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.train'
11 | '''Training set of Weibo in char level.'''
12 | WEIBO_NER_DEV = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.dev'
13 | '''Dev set of Weibo in char level.'''
14 | WEIBO_NER_TEST = _WEIBO_NER_HOME + 'weiboNER_2nd_conll.test'
15 | '''Test set of Weibo in char level.'''
16 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 00:51
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/ctb5.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 18:44
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | _CTB_HOME = HANLP_URL + 'embeddings/SUDA-LA-CIP_20200109_021624.zip#'
7 |
8 | _CTB5_DEP_HOME = _CTB_HOME + 'BPNN/data/ctb5/'
9 |
10 | CTB5_DEP_TRAIN = _CTB5_DEP_HOME + 'train.conll'
11 | '''Training set for ctb5 dependency parsing.'''
12 | CTB5_DEP_DEV = _CTB5_DEP_HOME + 'dev.conll'
13 | '''Dev set for ctb5 dependency parsing.'''
14 | CTB5_DEP_TEST = _CTB5_DEP_HOME + 'test.conll'
15 | '''Test set for ctb5 dependency parsing.'''
16 |
17 | CIP_W2V_100_CN = _CTB_HOME + 'BPNN/data/embed.txt'
18 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/ctb7.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 18:44
4 | from hanlp.datasets.parsing.ctb5 import _CTB_HOME
5 |
6 | _CTB7_HOME = _CTB_HOME + 'BPNN/data/ctb7/'
7 |
8 | CTB7_DEP_TRAIN = _CTB7_HOME + 'train.conll'
9 | '''Training set for ctb7 dependency parsing.'''
10 | CTB7_DEP_DEV = _CTB7_HOME + 'dev.conll'
11 | '''Dev set for ctb7 dependency parsing.'''
12 | CTB7_DEP_TEST = _CTB7_HOME + 'test.conll'
13 | '''Test set for ctb7 dependency parsing.'''
14 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:04
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/ud/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-07 21:45
4 | import os
5 | import shutil
6 |
7 | from hanlp.components.parsers.ud.udify_util import get_ud_treebank_files
8 | from hanlp.utils.io_util import get_resource
9 | from hanlp.utils.log_util import flash
10 |
11 |
12 | def concat_treebanks(home, version):
13 | ud_home = get_resource(home)
14 | treebanks = get_ud_treebank_files(ud_home)
15 | output_dir = os.path.abspath(os.path.join(ud_home, os.path.pardir, os.path.pardir, f'ud-multilingual-v{version}'))
16 | if os.path.isdir(output_dir):
17 | return output_dir
18 | os.makedirs(output_dir)
19 | train, dev, test = list(zip(*[treebanks[k] for k in treebanks]))
20 |
21 | for treebank, name in zip([train, dev, test], ["train.conllu", "dev.conllu", "test.conllu"]):
22 | flash(f'Concatenating {len(train)} treebanks into {name} [blink][yellow]...[/yellow][/blink]')
23 | with open(os.path.join(output_dir, name), 'w') as write:
24 | for t in treebank:
25 | if not t:
26 | continue
27 | with open(t, 'r') as read:
28 | shutil.copyfileobj(read, write)
29 | flash('')
30 | return output_dir
31 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/ud/ud210m.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-05-21 20:39
4 | import os
5 |
6 | from hanlp.datasets.parsing.ud import concat_treebanks
7 | from hanlp.datasets.parsing.ud.ud210 import _UD_210_HOME
8 |
9 | _UD_210_MULTILINGUAL_HOME = concat_treebanks(_UD_210_HOME, '2.10')
10 | UD_210_MULTILINGUAL_TRAIN = os.path.join(_UD_210_MULTILINGUAL_HOME, 'train.conllu')
11 | "Training set of multilingual UD_210 obtained by concatenating all training sets."
12 | UD_210_MULTILINGUAL_DEV = os.path.join(_UD_210_MULTILINGUAL_HOME, 'dev.conllu')
13 | "Dev set of multilingual UD_210 obtained by concatenating all dev sets."
14 | UD_210_MULTILINGUAL_TEST = os.path.join(_UD_210_MULTILINGUAL_HOME, 'test.conllu')
15 | "Test set of multilingual UD_210 obtained by concatenating all test sets."
16 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/ud/ud23m.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-05-21 20:39
4 | import os
5 |
6 | from hanlp.datasets.parsing.ud import concat_treebanks
7 | from .ud23 import _UD_23_HOME
8 |
9 | _UD_23_MULTILINGUAL_HOME = concat_treebanks(_UD_23_HOME, '2.3')
10 | UD_23_MULTILINGUAL_TRAIN = os.path.join(_UD_23_MULTILINGUAL_HOME, 'train.conllu')
11 | "Training set of multilingual UD_23 obtained by concatenating all training sets."
12 | UD_23_MULTILINGUAL_DEV = os.path.join(_UD_23_MULTILINGUAL_HOME, 'dev.conllu')
13 | "Dev set of multilingual UD_23 obtained by concatenating all dev sets."
14 | UD_23_MULTILINGUAL_TEST = os.path.join(_UD_23_MULTILINGUAL_HOME, 'test.conllu')
15 | "Test set of multilingual UD_23 obtained by concatenating all test sets."
16 |
--------------------------------------------------------------------------------
/hanlp/datasets/parsing/ud/ud27m.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-05-21 20:39
4 | import os
5 |
6 | from hanlp.datasets.parsing.ud import concat_treebanks
7 | from hanlp.datasets.parsing.ud.ud27 import _UD_27_HOME
8 |
9 | _UD_27_MULTILINGUAL_HOME = concat_treebanks(_UD_27_HOME, '2.7')
10 | UD_27_MULTILINGUAL_TRAIN = os.path.join(_UD_27_MULTILINGUAL_HOME, 'train.conllu')
11 | "Training set of multilingual UD_27 obtained by concatenating all training sets."
12 | UD_27_MULTILINGUAL_DEV = os.path.join(_UD_27_MULTILINGUAL_HOME, 'dev.conllu')
13 | "Dev set of multilingual UD_27 obtained by concatenating all dev sets."
14 | UD_27_MULTILINGUAL_TEST = os.path.join(_UD_27_MULTILINGUAL_HOME, 'test.conllu')
15 | "Test set of multilingual UD_27 obtained by concatenating all test sets."
16 |
--------------------------------------------------------------------------------
/hanlp/datasets/pos/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:50
--------------------------------------------------------------------------------
/hanlp/datasets/pos/ctb5.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:51
4 |
5 | _CTB5_POS_HOME = 'http://file.hankcs.com/corpus/ctb5.1-pos.zip'
6 |
7 | CTB5_POS_TRAIN = f'{_CTB5_POS_HOME}#train.tsv'
8 | '''PoS training set for CTB5.'''
9 | CTB5_POS_DEV = f'{_CTB5_POS_HOME}#dev.tsv'
10 | '''PoS dev set for CTB5.'''
11 | CTB5_POS_TEST = f'{_CTB5_POS_HOME}#test.tsv'
12 | '''PoS test set for CTB5.'''
13 |
--------------------------------------------------------------------------------
/hanlp/datasets/qa/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-03-20 19:17
--------------------------------------------------------------------------------
/hanlp/datasets/srl/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-22 19:15
4 |
5 |
6 |
--------------------------------------------------------------------------------
/hanlp/datasets/srl/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:05
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/srl/ontonotes5/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-11-26 16:07
4 | ONTONOTES5_HOME = 'https://catalog.ldc.upenn.edu/LDC2013T19/LDC2013T19.tgz#/ontonotes-release-5.0/data/'
5 | CONLL12_HOME = ONTONOTES5_HOME + '../conll-2012/'
6 |
--------------------------------------------------------------------------------
/hanlp/datasets/sts/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-05-20 16:25
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-01 12:33
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/ctb6.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:19
4 |
5 | _CTB6_CWS_HOME = 'http://file.hankcs.com/corpus/ctb6_cws.zip'
6 |
7 | CTB6_CWS_TRAIN = _CTB6_CWS_HOME + '#train.txt'
8 | '''CTB6 training set.'''
9 | CTB6_CWS_DEV = _CTB6_CWS_HOME + '#dev.txt'
10 | '''CTB6 dev set.'''
11 | CTB6_CWS_TEST = _CTB6_CWS_HOME + '#test.txt'
12 | '''CTB6 test set.'''
13 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/loaders/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:06
4 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/loaders/multi_criteria_cws/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-11 20:35
4 |
5 | _HOME = 'https://github.com/hankcs/multi-criteria-cws/archive/naive-mix.zip#data/raw/'
6 |
7 | CNC_TRAIN_ALL = _HOME + 'cnc/train-all.txt'
8 | CNC_TRAIN = _HOME + 'cnc/train.txt'
9 | CNC_DEV = _HOME + 'cnc/dev.txt'
10 | CNC_TEST = _HOME + 'cnc/test.txt'
11 |
12 | CTB_TRAIN_ALL = _HOME + 'ctb/train-all.txt'
13 | CTB_TRAIN = _HOME + 'ctb/train.txt'
14 | CTB_DEV = _HOME + 'ctb/dev.txt'
15 | CTB_TEST = _HOME + 'ctb/test.txt'
16 |
17 | SXU_TRAIN_ALL = _HOME + 'sxu/train-all.txt'
18 | SXU_TRAIN = _HOME + 'sxu/train.txt'
19 | SXU_DEV = _HOME + 'sxu/dev.txt'
20 | SXU_TEST = _HOME + 'sxu/test.txt'
21 |
22 | UDC_TRAIN_ALL = _HOME + 'udc/train-all.txt'
23 | UDC_TRAIN = _HOME + 'udc/train.txt'
24 | UDC_DEV = _HOME + 'udc/dev.txt'
25 | UDC_TEST = _HOME + 'udc/test.txt'
26 |
27 | WTB_TRAIN_ALL = _HOME + 'wtb/train-all.txt'
28 | WTB_TRAIN = _HOME + 'wtb/train.txt'
29 | WTB_DEV = _HOME + 'wtb/dev.txt'
30 | WTB_TEST = _HOME + 'wtb/test.txt'
31 |
32 | ZX_TRAIN_ALL = _HOME + 'zx/train-all.txt'
33 | ZX_TRAIN = _HOME + 'zx/train.txt'
34 | ZX_DEV = _HOME + 'zx/dev.txt'
35 | ZX_TEST = _HOME + 'zx/test.txt'
36 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/sighan2005/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:42
4 | import os
5 |
6 | from hanlp.utils.io_util import get_resource, split_file
7 | from hanlp.utils.log_util import logger
8 |
9 | SIGHAN2005 = 'http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip'
10 |
11 |
12 | def make(train):
13 | root = get_resource(SIGHAN2005)
14 | train = os.path.join(root, train.split('#')[-1])
15 | if not os.path.isfile(train):
16 | full = train.replace('_90.txt', '.utf8')
17 | logger.info(f'Splitting {full} into training set and valid set with 9:1 proportion')
18 | valid = train.replace('90.txt', '10.txt')
19 | split_file(full, train=0.9, dev=0.1, test=0, names={'train': train, 'dev': valid})
20 | assert os.path.isfile(train), f'Failed to make {train}'
21 | assert os.path.isfile(valid), f'Failed to make {valid}'
22 | logger.info(f'Successfully made {train} {valid}')
23 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/sighan2005/as_.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:42
4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
5 |
6 | SIGHAN2005_AS_DICT = SIGHAN2005 + "#" + "gold/as_training_words.utf8"
7 | '''Dictionary built on the training set.'''
8 | SIGHAN2005_AS_TRAIN_ALL = SIGHAN2005 + "#" + "training/as_training.utf8"
9 | '''Full training set.'''
10 | SIGHAN2005_AS_TRAIN = SIGHAN2005 + "#" + "training/as_training_90.txt"
11 | '''Training set (first 90% of the full official training set).'''
12 | SIGHAN2005_AS_DEV = SIGHAN2005 + "#" + "training/as_training_10.txt"
13 | '''Dev set (last 10% of the full official training set).'''
14 | SIGHAN2005_AS_TEST_INPUT = SIGHAN2005 + "#" + "testing/as_testing.utf8"
15 | '''Test input.'''
16 | SIGHAN2005_AS_TEST = SIGHAN2005 + "#" + "gold/as_testing_gold.utf8"
17 | '''Test set.'''
18 |
19 | make(SIGHAN2005_AS_TRAIN)
20 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/sighan2005/cityu.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:42
4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
5 |
6 | SIGHAN2005_CITYU_DICT = SIGHAN2005 + "#" + "gold/cityu_training_words.utf8"
7 | '''Dictionary built on the training set.'''
8 | SIGHAN2005_CITYU_TRAIN_ALL = SIGHAN2005 + "#" + "training/cityu_training.utf8"
9 | '''Full training set.'''
10 | SIGHAN2005_CITYU_TRAIN = SIGHAN2005 + "#" + "training/cityu_training_90.txt"
11 | '''Training set (first 90% of the full official training set).'''
12 | SIGHAN2005_CITYU_DEV = SIGHAN2005 + "#" + "training/cityu_training_10.txt"
13 | '''Dev set (last 10% of the full official training set).'''
14 | SIGHAN2005_CITYU_TEST_INPUT = SIGHAN2005 + "#" + "testing/cityu_test.utf8"
15 | '''Test input.'''
16 | SIGHAN2005_CITYU_TEST = SIGHAN2005 + "#" + "gold/cityu_test_gold.utf8"
17 | '''Test set.'''
18 |
19 | make(SIGHAN2005_CITYU_TRAIN)
20 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/sighan2005/msr.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:42
4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
5 |
6 | SIGHAN2005_MSR_DICT = SIGHAN2005 + "#" + "gold/msr_training_words.utf8"
7 | '''Dictionary built on the training set.'''
8 | SIGHAN2005_MSR_TRAIN_ALL = SIGHAN2005 + "#" + "training/msr_training.utf8"
9 | '''Full training set.'''
10 | SIGHAN2005_MSR_TRAIN = SIGHAN2005 + "#" + "training/msr_training_90.txt"
11 | '''Training set (first 90% of the full official training set).'''
12 | SIGHAN2005_MSR_DEV = SIGHAN2005 + "#" + "training/msr_training_10.txt"
13 | '''Dev set (last 10% of the full official training set).'''
14 | SIGHAN2005_MSR_TEST_INPUT = SIGHAN2005 + "#" + "testing/msr_test.utf8"
15 | '''Test input.'''
16 | SIGHAN2005_MSR_TEST = SIGHAN2005 + "#" + "gold/msr_test_gold.utf8"
17 | '''Test set.'''
18 |
19 | make(SIGHAN2005_MSR_TRAIN)
20 |
--------------------------------------------------------------------------------
/hanlp/datasets/tokenization/sighan2005/pku.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:42
4 | from hanlp.datasets.tokenization.sighan2005 import SIGHAN2005, make
5 |
6 | SIGHAN2005_PKU_DICT = SIGHAN2005 + "#" + "gold/pku_training_words.utf8"
7 | '''Dictionary built on the training set.'''
8 | SIGHAN2005_PKU_TRAIN_ALL = SIGHAN2005 + "#" + "training/pku_training.utf8"
9 | '''Full training set.'''
10 | SIGHAN2005_PKU_TRAIN = SIGHAN2005 + "#" + "training/pku_training_90.txt"
11 | '''Training set (first 90% of the full official training set).'''
12 | SIGHAN2005_PKU_DEV = SIGHAN2005 + "#" + "training/pku_training_10.txt"
13 | '''Dev set (last 10% of the full official training set).'''
14 | SIGHAN2005_PKU_TEST_INPUT = SIGHAN2005 + "#" + "testing/pku_test.utf8"
15 | '''Test input.'''
16 | SIGHAN2005_PKU_TEST = SIGHAN2005 + "#" + "gold/pku_test_gold.utf8"
17 | '''Test set.'''
18 |
19 | make(SIGHAN2005_PKU_TRAIN)
20 |
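21 | if __name__ == '__main__':
22 |     # A minimal sketch (illustration only): resolving the fragment URL downloads
23 |     # icwb2-data.zip once; make() above has already carved the official training
24 |     # file into the 90/10 train/dev splits declared here.
25 |     from hanlp.utils.io_util import get_resource
26 |     print(get_resource(SIGHAN2005_PKU_TRAIN))  # local path to pku_training_90.txt
27 |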
--------------------------------------------------------------------------------
/hanlp/layers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-10-26 00:50
--------------------------------------------------------------------------------
/hanlp/layers/crf/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-18 22:55
--------------------------------------------------------------------------------
/hanlp/layers/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-08-24 21:48
4 |
--------------------------------------------------------------------------------
/hanlp/layers/feed_forward.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-06 14:37
4 | from typing import Union, List
5 |
6 | from hanlp.layers import feedforward
7 |
8 | from hanlp.common.structure import ConfigTracker
9 |
10 |
11 | class FeedForward(feedforward.FeedForward, ConfigTracker):
12 | def __init__(self, input_dim: int, num_layers: int, hidden_dims: Union[int, List[int]],
13 | activations: Union[str, List[str]], dropout: Union[float, List[float]] = 0.0) -> None:
14 | super().__init__(input_dim, num_layers, hidden_dims, activations, dropout)
15 | ConfigTracker.__init__(self, locals())
16 |
--------------------------------------------------------------------------------
/hanlp/layers/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-29 15:17
4 | # mute transformers
5 | import logging
6 |
7 | logging.getLogger('transformers.file_utils').setLevel(logging.ERROR)
8 | logging.getLogger('transformers.filelock').setLevel(logging.ERROR)
9 | logging.getLogger('transformers.tokenization_utils').setLevel(logging.ERROR)
10 | logging.getLogger('transformers.configuration_utils').setLevel(logging.ERROR)
11 | logging.getLogger('transformers.modeling_tf_utils').setLevel(logging.ERROR)
12 | logging.getLogger('transformers.modeling_utils').setLevel(logging.ERROR)
13 | logging.getLogger('transformers.tokenization_utils_base').setLevel(logging.ERROR)
14 |
--------------------------------------------------------------------------------
/hanlp/layers/transformers/loader_tf.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-04 06:05
4 | import tensorflow as tf
5 | from transformers import TFAutoModel
6 |
7 | from hanlp.layers.transformers.pt_imports import AutoTokenizer_
8 |
9 |
10 | def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False):
11 | tokenizer = AutoTokenizer_.from_pretrained(transformer)
12 | if tokenizer_only:
13 | return tokenizer
14 | l_bert = TFAutoModel.from_pretrained(transformer)
15 | l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids")
16 | l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids")
17 | l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids")
18 | output = l_bert(input_ids=l_input_ids, token_type_ids=l_token_type_ids, attention_mask=l_mask_ids).last_hidden_state
19 | if not tagging:
20 | output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
21 | logits = tf.keras.layers.Dense(num_labels)(output)
22 | model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits)
23 | model.build(input_shape=(None, max_seq_length))
24 | return model, tokenizer
25 |
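26 | if __name__ == '__main__':
27 |     # A minimal sketch (illustration only): assumes TensorFlow, transformers and
28 |     # network access; 'bert-base-chinese' is merely an example checkpoint.
29 |     model, tokenizer = build_transformer('bert-base-chinese', max_seq_length=128, num_labels=4)
30 |     model.summary()
31 |     print(tokenizer.tokenize('商品和服务'))
32 |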
--------------------------------------------------------------------------------
/hanlp/layers/transformers/tf_imports.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-05-08 21:57
4 | from transformers import BertTokenizer, BertConfig, PretrainedConfig, TFAutoModel, \
5 | AutoConfig, AutoTokenizer, PreTrainedTokenizer, TFPreTrainedModel, TFAlbertModel, TFAutoModelWithLMHead, \
6 | BertTokenizerFast, TFAlbertForMaskedLM, AlbertConfig, TFBertModel
7 |
--------------------------------------------------------------------------------
/hanlp/losses/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-20 01:28
--------------------------------------------------------------------------------
/hanlp/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-09-14 21:55
--------------------------------------------------------------------------------
/hanlp/metrics/amr/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-24 12:47
--------------------------------------------------------------------------------
/hanlp/metrics/chunking/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 03:49
--------------------------------------------------------------------------------
/hanlp/metrics/chunking/binary_chunking_f1.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-02 14:27
4 | from collections import defaultdict
5 | from typing import List, Union
6 |
7 | import torch
8 |
9 | from hanlp.metrics.f1 import F1
10 |
11 |
12 | class BinaryChunkingF1(F1):
13 | def __call__(self, pred_tags: torch.LongTensor, gold_tags: torch.LongTensor, lens: List[int] = None):
14 | if lens is None:
15 | lens = [gold_tags.size(1)] * gold_tags.size(0)
16 | self.update(self.decode_spans(pred_tags, lens), self.decode_spans(gold_tags, lens))
17 |
18 | def update(self, pred_tags, gold_tags):
19 | for pred, gold in zip(pred_tags, gold_tags):
20 | super().__call__(set(pred), set(gold))
21 |
22 | @staticmethod
23 | def decode_spans(pred_tags: torch.LongTensor, lens: Union[List[int], torch.LongTensor]):
24 | if isinstance(lens, torch.Tensor):
25 | lens = lens.tolist()
26 | batch_pred = defaultdict(list)
27 | for batch, offset in pred_tags.nonzero(as_tuple=False).tolist():
28 | batch_pred[batch].append(offset)
29 | batch_pred_spans = [[(0, l)] for l in lens]
30 | for batch, offsets in batch_pred.items():
31 | l = lens[batch]
32 | batch_pred_spans[batch] = list(zip(offsets, offsets[1:] + [l]))
33 | return batch_pred_spans
34 |
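35 |
36 | if __name__ == '__main__':
37 |     # A minimal sketch (illustration only): a 1 marks the first token of a chunk,
38 |     # and decode_spans turns those marks into half-open (start, end) spans.
39 |     tags = torch.tensor([[1, 0, 1, 0, 0]])
40 |     print(BinaryChunkingF1.decode_spans(tags, [5]))  # [[(0, 2), (2, 5)]]
41 |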
--------------------------------------------------------------------------------
/hanlp/metrics/chunking/iobes_tf.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-09-14 21:55
4 |
5 | from hanlp.common.vocab_tf import VocabTF
6 | from hanlp.metrics.chunking.conlleval import SpanF1
7 | from hanlp.metrics.chunking.chunking_f1_tf import ChunkingF1_TF
8 |
9 |
10 | class IOBES_F1_TF(ChunkingF1_TF):
11 |
12 | def __init__(self, tag_vocab: VocabTF, from_logits=True, name='f1', dtype=None, **kwargs):
13 | super().__init__(tag_vocab, from_logits, name, dtype, **kwargs)
14 | self.state = SpanF1()
15 |
16 | def update_tags(self, true_tags, pred_tags):
17 | # true_tags = list(itertools.chain.from_iterable(true_tags))
18 | # pred_tags = list(itertools.chain.from_iterable(pred_tags))
19 | # self.state.update_state(true_tags, pred_tags)
20 | for gold, pred in zip(true_tags, pred_tags):
21 | self.state.update_state(gold, pred)
22 | return self.result()
23 |
24 | def result(self):
25 | return self.state.result(full=False, verbose=False).fscore
26 |
27 | def reset_states(self):
28 | self.state.reset_state()
29 |
--------------------------------------------------------------------------------
/hanlp/metrics/metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-06-03 11:35
4 | from abc import ABC, abstractmethod
5 |
6 |
7 | class Metric(ABC):
8 |
9 | def __lt__(self, other):
10 | return self.score < other
11 |
12 | def __le__(self, other):
13 | return self.score <= other
14 |
15 | def __eq__(self, other):
16 | return self.score == other
17 |
18 | def __ge__(self, other):
19 | return self.score >= other
20 |
21 | def __gt__(self, other):
22 | return self.score > other
23 |
24 | def __ne__(self, other):
25 | return self.score != other
26 |
27 | @property
28 | @abstractmethod
29 | def score(self):
30 | pass
31 |
32 | @abstractmethod
33 | def __call__(self, pred, gold, mask=None):
34 | pass
35 |
36 | def __repr__(self) -> str:
37 |         return f'{self.score:.4f}'
38 |
39 | def __float__(self):
40 | return self.score
41 |
42 | @abstractmethod
43 | def reset(self):
44 | pass
45 |
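46 |
47 | if __name__ == '__main__':
48 |     # A hypothetical toy subclass (illustration only): implement score, __call__
49 |     # and reset, and the comparison operators above come for free.
50 |     class Accuracy(Metric):
51 |         def __init__(self):
52 |             self.correct = self.total = 0
53 |
54 |         @property
55 |         def score(self):
56 |             return self.correct / max(self.total, 1)
57 |
58 |         def __call__(self, pred, gold, mask=None):
59 |             for p, g in zip(pred, gold):
60 |                 self.correct += int(p == g)
61 |                 self.total += 1
62 |
63 |         def reset(self):
64 |             self.correct = self.total = 0
65 |
66 |     acc = Accuracy()
67 |     acc(['B', 'I', 'O'], ['B', 'O', 'O'])
68 |     print(acc, acc > 0.5)  # 0.6667 True
69 |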
--------------------------------------------------------------------------------
/hanlp/metrics/parsing/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-27 00:48
--------------------------------------------------------------------------------
/hanlp/metrics/srl/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-07-16 18:44
--------------------------------------------------------------------------------
/hanlp/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-11-11 18:44
--------------------------------------------------------------------------------
/hanlp/pretrained/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 19:10
4 | from hanlp.pretrained import tok
5 | from hanlp.pretrained import dep
6 | from hanlp.pretrained import sdp
7 | from hanlp.pretrained import glove
8 | from hanlp.pretrained import pos
9 | from hanlp.pretrained import rnnlm
10 | from hanlp.pretrained import word2vec
11 | from hanlp.pretrained import ner
12 | from hanlp.pretrained import classifiers
13 | from hanlp.pretrained import fasttext
14 | from hanlp.pretrained import mtl
15 | from hanlp.pretrained import eos
16 | from hanlp.pretrained import sts
17 | from hanlp.pretrained import constituency
18 | from hanlp.pretrained import amr
19 | from hanlp.pretrained import amr2text
20 | from hanlp.pretrained import srl
21 |
22 | # Will be filled up during runtime
23 | ALL = {}
24 |
--------------------------------------------------------------------------------
/hanlp/pretrained/amr2text.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-12-07 15:19
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | AMR3_GRAPH_PRETRAIN_GENERATION = HANLP_URL + 'amr2text/amr3_graph_pretrain_generation_20221207_153535.zip'
7 | '''A seq2seq (:cite:`bevilacqua-etal-2021-one`) BART (:cite:`lewis-etal-2020-bart`) large AMR2Text generator trained on
8 | Abstract Meaning Representation 3.0 (:cite:`knight2014abstract`) with graph pre-training (:cite:`bai-etal-2022-graph`).
9 | Its sacreBLEU score is ``50.38`` according to their official repository.
10 | '''
11 |
12 | # Will be filled up during runtime
13 | ALL = {}
14 |
--------------------------------------------------------------------------------
/hanlp/pretrained/classifiers.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 03:51
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | CHNSENTICORP_BERT_BASE_ZH = HANLP_URL + 'classification/chnsenticorp_bert_base_20211228_163210.zip'
7 | SST2_ALBERT_BASE_EN = HANLP_URL + 'classification/sst2_albert_base_20211228_164917.zip'
8 |
9 | LID_176_FASTTEXT_BASE = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
10 | '''
11 | 126MB FastText model for language identification trained on data from Wikipedia, Tatoeba and SETimes.
12 | '''
13 | LID_176_FASTTEXT_SMALL = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz'
14 | '''
15 | 917kB compressed version of the model above, trading a little accuracy for a much smaller file.
16 | '''
17 |
18 | ALL = {}
19 |
--------------------------------------------------------------------------------
/hanlp/pretrained/constituency.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-01-18 10:34
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | CTB9_CON_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_con_electra_small_20220215_230116.zip'
7 | 'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with major categories. ' \
8 | 'Its performance is UCM=39.06% LCM=34.99% UP=90.05% UR=90.01% UF=90.03% LP=87.02% LR=86.98% LF=87.00%.'
9 |
10 | CTB9_CON_FULL_TAG_ELECTRA_SMALL = HANLP_URL + 'constituency/ctb9_full_tag_con_electra_small_20220118_103119.zip'
11 | 'Electra (:cite:`clark2020electra`) small tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \
12 | 'Its performance is UCM=38.29% LCM=28.95% UP=90.16% UR=90.13% UF=90.15% LP=83.46% LR=83.43% LF=83.45%.'
13 |
14 | CTB9_CON_FULL_TAG_ERNIE_GRAM = 'http://download.hanlp.com/constituency/extra/ctb9_full_tag_con_ernie_20220331_121430.zip'
15 | 'ERNIE-GRAM (:cite:`xiao-etal-2021-ernie`) base tree CRF model (:cite:`ijcai2020-560`) trained on CTB9 with full subcategories. ' \
16 | 'Its performance is UCM=42.04% LCM=31.72% UP=91.33% UR=91.53% UF=91.43% LP=85.31% LR=85.49% LF=85.40%.'
17 |
18 | # Will be filled up during runtime
19 | ALL = {}
20 |
--------------------------------------------------------------------------------
/hanlp/pretrained/dep.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-29 02:55
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | CTB5_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb5_20191229_025833.zip'
7 | 'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB5.'
8 | CTB7_BIAFFINE_DEP_ZH = HANLP_URL + 'dep/biaffine_ctb7_20200109_022431.zip'
9 | 'Biaffine LSTM model (:cite:`dozat:17a`) trained on CTB7.'
10 | CTB9_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/ctb9_dep_electra_small_20220216_100306.zip'
11 | 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-SD330. ' \
12 | 'Performance is UAS=87.68% LAS=83.54%.'
13 | PMT1_DEP_ELECTRA_SMALL = HANLP_URL + 'dep/pmt_dep_electra_small_20220218_134518.zip'
14 | 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on PKU ' \
15 | 'Multi-view Chinese Treebank (PMT) 1.0 (:cite:`qiu-etal-2014-multi`). Performance is UAS=91.21% LAS=88.65%.'
16 | CTB9_UDC_ELECTRA_SMALL = HANLP_URL + 'dep/udc_dep_electra_small_20220218_095452.zip'
17 | 'Electra small encoder (:cite:`clark2020electra`) with Biaffine decoder (:cite:`dozat:17a`) trained on CTB9-UD420. ' \
18 | 'Performance is UAS=85.92% LAS=81.13% .'
19 |
20 | PTB_BIAFFINE_DEP_EN = HANLP_URL + 'dep/ptb_dep_biaffine_20200101_174624.zip'
21 | 'Biaffine LSTM model (:cite:`dozat:17a`) trained on PTB.'
22 |
23 | ALL = {}
24 |
--------------------------------------------------------------------------------
/hanlp/pretrained/eos.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-22 13:22
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | UD_CTB_EOS_MUL = HANLP_URL + 'eos/eos_ud_ctb_mul_20201222_133543.zip'
7 | 'EOS model (:cite:`Schweter:Ahmed:2019`) trained on concatenated UD2.3 and CTB9.'
8 |
9 | # Will be filled up during runtime
10 | ALL = {}
11 |
--------------------------------------------------------------------------------
/hanlp/pretrained/fasttext.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-30 18:57
4 | FASTTEXT_DEBUG_EMBEDDING_EN = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext.debug.bin.zip'
5 | FASTTEXT_CC_300_EN = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz'
6 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Common Crawl.'
7 | FASTTEXT_WIKI_NYT_AMAZON_FRIENDS_200_EN \
8 | = 'https://elit-models.s3-us-west-2.amazonaws.com/fasttext-200-wikipedia-nytimes-amazon-friends-20191107.bin'
9 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on wikipedia, nytimes and friends.'
10 |
11 | FASTTEXT_WIKI_300_ZH = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh.zip#wiki.zh.bin'
12 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on Chinese Wikipedia.'
13 | FASTTEXT_WIKI_300_ZH_CLASSICAL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.zh_classical.zip#wiki.zh_classical.bin'
14 | 'FastText (:cite:`bojanowski2017enriching`) embeddings trained on traditional Chinese wikipedia.'
15 |
16 | ALL = {}
17 |
--------------------------------------------------------------------------------
/hanlp/pretrained/glove.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-08-27 20:42
4 |
5 | _GLOVE_6B_ROOT = 'http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip'
6 |
7 | GLOVE_6B_50D = _GLOVE_6B_ROOT + '#' + 'glove.6B.50d.txt'
8 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 50d trained on 6B tokens.'
9 | GLOVE_6B_100D = _GLOVE_6B_ROOT + '#' + 'glove.6B.100d.txt'
10 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 100d trained on 6B tokens.'
11 | GLOVE_6B_200D = _GLOVE_6B_ROOT + '#' + 'glove.6B.200d.txt'
12 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 200d trained on 6B tokens.'
13 | GLOVE_6B_300D = _GLOVE_6B_ROOT + '#' + 'glove.6B.300d.txt'
14 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 6B tokens.'
15 |
16 | GLOVE_840B_300D = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
17 | 'Global Vectors for Word Representation (:cite:`pennington-etal-2014-glove`) 300d trained on 840B tokens.'
18 |
19 | ALL = {}
20 |
--------------------------------------------------------------------------------
/hanlp/pretrained/ner.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-30 20:07
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | MSRA_NER_BERT_BASE_ZH = HANLP_URL + 'ner/ner_bert_base_msra_20211227_114712.zip'
7 | 'BERT model (:cite:`devlin-etal-2019-bert`) trained on MSRA with 3 entity types.'
8 | MSRA_NER_ALBERT_BASE_ZH = HANLP_URL + 'ner/msra_ner_albert_base_20211228_173323.zip'
9 | 'ALBERT model (:cite:`Lan2020ALBERT:`) trained on MSRA with 3 entity types.'
10 | MSRA_NER_ELECTRA_SMALL_ZH = HANLP_URL + 'ner/msra_ner_electra_small_20220215_205503.zip'
11 | 'Electra small model (:cite:`clark2020electra`) trained on MSRA with 26 entity types. F1 = `95.16`'
12 | CONLL03_NER_BERT_BASE_CASED_EN = HANLP_URL + 'ner/ner_conll03_bert_base_cased_en_20211227_121443.zip'
13 | 'BERT model (:cite:`devlin-etal-2019-bert`) trained on CoNLL03.'
14 |
15 | ALL = {}
16 |
--------------------------------------------------------------------------------
/hanlp/pretrained/rnnlm.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-19 03:47
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | FLAIR_LM_FW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_fw_wmt11_en'
7 | 'The forward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
8 | FLAIR_LM_BW_WMT11_EN_TF = HANLP_URL + 'lm/flair_lm_wmt11_en_20200211_091932.zip#flair_lm_bw_wmt11_en'
9 | 'The backward LSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
10 | FLAIR_LM_WMT11_EN = HANLP_URL + 'lm/flair_lm_wmt11_en_20200601_205350.zip'
11 | 'The BiLSTM of Contextual String Embedding (:cite:`akbik-etal-2018-contextual`).'
12 |
13 | ALL = {}
14 |
--------------------------------------------------------------------------------
/hanlp/pretrained/sdp.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-31 23:54
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | SEMEVAL16_NEWS_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-news-biaffine_20191231_235407.zip'
7 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 news data.'
8 | SEMEVAL16_TEXT_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-text-biaffine_20200101_002257.zip'
9 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text data.'
10 |
11 | SEMEVAL16_ALL_ELECTRA_SMALL_ZH = HANLP_URL + 'sdp/semeval16_sdp_electra_small_20220719_171433.zip'
12 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text and news data. Performance: ``UF: 83.03% LF: 72.58%``'
13 |
14 | SEMEVAL15_PAS_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_pas_20200103_152405.zip'
15 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PAS data.'
16 | SEMEVAL15_PSD_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_psd_20200106_123009.zip'
17 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PSD data.'
18 | SEMEVAL15_DM_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_dm_20200106_122808.zip'
19 | 'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 DM data.'
20 |
21 | ALL = {}
22 |
--------------------------------------------------------------------------------
/hanlp/pretrained/srl.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-08-07 19:07
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | CPB3_SRL_ELECTRA_SMALL = HANLP_URL + 'srl/cpb3_electra_small_crf_has_transform_20220218_135910.zip'
7 | 'Electra small model (:cite:`clark2020electra`) trained on CPB3. P=75.87% R=76.24% F1=76.05%.'
8 |
9 | ALL = {}
10 |
--------------------------------------------------------------------------------
/hanlp/pretrained/sts.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-05-24 12:51
4 | from hanlp_common.constant import HANLP_URL
5 |
6 | STS_ELECTRA_BASE_ZH = HANLP_URL + 'sts/sts_electra_base_zh_20210530_200109.zip'
7 | 'A naive regression model trained on concatenated STS corpora.'
8 |
9 | # Will be filled up during runtime
10 | ALL = {}
11 |
--------------------------------------------------------------------------------
/hanlp/transform/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-29 22:24
--------------------------------------------------------------------------------
/hanlp/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-08-24 22:12
4 | from . import rules
5 |
6 |
7 | def ls_resource_in_module(root) -> dict:
8 | res = dict()
9 | for k, v in root.__dict__.items():
10 | if k.startswith('_') or v == root:
11 | continue
12 | if isinstance(v, str):
13 | if v.startswith('http') and not v.endswith('/') and not v.endswith('#') and not v.startswith('_'):
14 | res[k] = v
15 | elif type(v).__name__ == 'module':
16 | res.update(ls_resource_in_module(v))
17 | if 'ALL' in root.__dict__ and isinstance(root.__dict__['ALL'], dict):
18 | root.__dict__['ALL'].update(res)
19 | return res
20 |
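21 |
22 | if __name__ == '__main__':
23 |     # A minimal sketch (illustration only): collect every constant in a pretrained
24 |     # module whose value looks like a downloadable URL, registering it in ALL.
25 |     import hanlp.pretrained.glove
26 |     print(ls_resource_in_module(hanlp.pretrained.glove)['GLOVE_6B_50D'])
27 |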
--------------------------------------------------------------------------------
/hanlp/utils/file_read_backwards/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .file_read_backwards import FileReadBackwards # noqa: F401
4 |
5 | __author__ = """Robin Robin"""
6 | __email__ = 'robinsquare42@gmail.com'
7 | __version__ = '2.0.0'
8 |
--------------------------------------------------------------------------------
/hanlp/utils/init_util.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-05-27 13:25
4 | import math
5 |
6 | import torch
7 |
8 |
9 | def embedding_uniform(tensor: torch.Tensor, seed=233):
10 |     gen = torch.Generator().manual_seed(seed)
11 |     with torch.no_grad():
12 |         fan_out = tensor.size(-1)
13 |         bound = math.sqrt(3.0 / fan_out)
14 |         return tensor.uniform_(-bound, bound, generator=gen)
15 |
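16 |
17 | if __name__ == '__main__':
18 |     # A minimal sketch (illustration only): the fixed seed makes the in-place init
19 |     # reproducible, and every entry stays within the bound sqrt(3 / fan_out).
20 |     weight = embedding_uniform(torch.empty(10, 100))
21 |     print(weight.abs().max().item() <= math.sqrt(3.0 / 100))  # True
22 |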
--------------------------------------------------------------------------------
/hanlp/utils/lang/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-09 18:46
4 |
5 | __doc__ = '''
6 | This package holds misc utils for specific languages.
7 | '''
8 |
--------------------------------------------------------------------------------
/hanlp/utils/lang/en/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 19:28
4 |
--------------------------------------------------------------------------------
/hanlp/utils/lang/ja/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-05-13 13:24
4 |
--------------------------------------------------------------------------------
/hanlp/utils/lang/zh/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-09 18:47
--------------------------------------------------------------------------------
/hanlp/version.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 19:26
4 |
5 | __version__ = '2.1.0-beta.64'
6 | """HanLP version"""
7 |
8 |
9 | class NotCompatible(Exception):
10 | pass
11 |
--------------------------------------------------------------------------------
/plugins/README.md:
--------------------------------------------------------------------------------
1 | # Plugins for HanLP
2 |
3 | This directory contains modules shared across several individual packages, as well as non-core APIs.
4 | If you plan to submit any plugins, please put them here too.
5 |
6 | For development, run the following set-up:
7 |
8 | ```bash
9 | pip install -e hanlp_trie
10 | pip install -e hanlp_common
11 | pip install -e hanlp_restful
12 | ```
--------------------------------------------------------------------------------
/plugins/hanlp_common/README.md:
--------------------------------------------------------------------------------
1 | # Common utilities and structures for HanLP
2 |
3 | [中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)
4 |
5 | The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly and extendable. It comes with pretrained models for various human languages including English, Chinese and many others. Currently, HanLP 2.0 is in alpha stage with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.
6 |
7 |
8 | ## Installation
9 |
10 | ```bash
11 | pip install hanlp
12 | ```
13 |
14 | ## License
15 |
16 | HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you add a link to HanLP on your website.
17 |
18 |
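19 | ## Quick Start
20 |
21 | As a quick sanity check after installing, load a pretrained model and parse a sentence (a minimal sketch; the identifier below names one of the small Chinese multi-task models shipped with HanLP):
22 |
23 | ```python
24 | import hanlp
25 |
26 | HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
27 | print(HanLP('商品和服务'))
28 | ```
29 |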
--------------------------------------------------------------------------------
/plugins/hanlp_common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-16 22:20
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_common/hanlp_common/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-16 22:21
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_common/hanlp_common/constant.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-06-13 22:41
4 | import os
5 |
6 | PAD = '<pad>'
7 | '''Padding token.'''
8 | UNK = '<unk>'
9 | '''Unknown token.'''
10 | CLS = '[CLS]'
11 | BOS = '<bos>'
12 | EOS = '<eos>'
13 | ROOT = BOS
14 | IDX = '_idx_'
15 | '''Key for index.'''
16 | HANLP_URL = os.getenv('HANLP_URL', 'https://file.hankcs.com/hanlp/')
17 | '''Resource URL.'''
18 | HANLP_VERBOSE = os.environ.get('HANLP_VERBOSE', '1').lower() in ('1', 'true', 'yes')
19 | '''Enable verbose or not.'''
20 | NULL = '<null>'
21 | PRED = 'PRED'
22 |
23 | IPYTHON = os.environ.get('HANLP_IPYTHON', '1').lower() in ('1', 'true', 'yes') # Allow the user to disable IPYTHON
24 | if IPYTHON:
25 | try:
26 | # noinspection PyUnresolvedReferences,PyStatementEffect
27 | get_ipython
28 | except NameError:
29 | IPYTHON = False
30 |
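31 |
32 | if __name__ == '__main__':
33 |     # A minimal sketch (illustration only): HANLP_URL and HANLP_VERBOSE honor the
34 |     # environment, so e.g. `HANLP_URL=https://example.com/hanlp/ python ...` (a
35 |     # hypothetical mirror URL) redirects resource downloads.
36 |     print(f'Resources are fetched from {HANLP_URL}')
37 |     print(f'Verbose: {HANLP_VERBOSE}, IPython detected: {IPYTHON}')
38 |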
--------------------------------------------------------------------------------
/plugins/hanlp_common/hanlp_common/io.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-16 22:38
4 | import json
5 | import os
6 | import pickle
7 | import sys
8 | from typing import Union
9 |
10 |
11 | def save_pickle(item, path):
12 | with open(path, 'wb') as f:
13 | pickle.dump(item, f)
14 |
15 |
16 | def load_pickle(path):
17 | with open(path, 'rb') as f:
18 | return pickle.load(f)
19 |
20 |
21 | def save_json(item: Union[dict, list, str, int, float], path: str, ensure_ascii=False, cls=None,
22 | default=lambda o: repr(o), indent=2):
23 | dirname = os.path.dirname(path)
24 | if dirname:
25 | os.makedirs(dirname, exist_ok=True)
26 | with open(path, 'w', encoding='utf-8') as out:
27 | json.dump(item, out, ensure_ascii=ensure_ascii, indent=indent, cls=cls, default=default)
28 |
29 |
30 | def load_json(path):
31 | with open(path, encoding='utf-8') as src:
32 | return json.load(src)
33 |
34 |
35 | def filename_is_json(filename):
36 | filename, file_extension = os.path.splitext(filename)
37 | return file_extension in ['.json', '.jsonl']
38 |
39 |
40 | def eprint(*args, **kwargs):
41 | print(*args, file=sys.stderr, **kwargs)
42 |
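43 | if __name__ == '__main__':
44 |     # A minimal round-trip sketch (illustration only): save_json creates the parent
45 |     # directory on demand and keeps non-ASCII characters human-readable on disk.
46 |     save_json({'greeting': '你好'}, 'tmp/demo.json')
47 |     assert load_json('tmp/demo.json') == {'greeting': '你好'}
48 |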
--------------------------------------------------------------------------------
/plugins/hanlp_demo/README.md:
--------------------------------------------------------------------------------
1 | # Demos and examples for HanLP
2 |
3 | This package is intended for demonstration purposes and won't be released to PyPI. **Training requires a fair understanding of Linux and Python, which might not be the case for everybody.**
4 |
5 | You need a Linux/macOS system with Internet access, because some corpora and bash scripts will be downloaded during training. Training on Windows might work if you are an expert, but we believe such cases are very rare.
6 |
7 | Both your `python` and `python3` commands need to point to Python 3.
8 |
9 | You need to install this package and run it from the **root** folder of HanLP.
10 |
11 | ```bash
12 | pip install -e plugins/hanlp_demo
13 | python3 plugins/hanlp_demo/hanlp_demo/zh/train/open_small.py
14 | ```
15 |
16 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-11-29 17:48
4 |
5 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/block_windows.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-07-28 21:38
4 | from hanlp.utils.io_util import windows
5 |
6 | assert not windows(), 'Windows is not supported for this script. Please run it on Linux systems.'
7 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 17:55
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_amr.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-01-25 19:09
4 | import hanlp
5 |
6 | amr_parser = hanlp.load(hanlp.pretrained.amr.AMR3_SEQ2SEQ_BART_LARGE)
7 | amr = amr_parser('The boy wants the girl to believe him.')
8 | print(amr)
9 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_dep.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 17:55
4 | import hanlp
5 |
6 | syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
7 | sent = [('Is', 'VBZ'),
8 | ('this', 'DT'),
9 | ('the', 'DT'),
10 | ('future', 'NN'),
11 | ('of', 'IN'),
12 | ('chamber', 'NN'),
13 | ('music', 'NN'),
14 | ('?', '.')]
15 | tree = syntactic_parser(sent)
16 | print(tree)
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_lm.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-02-11 09:14
4 | import hanlp
5 |
6 | lm = hanlp.load(hanlp.pretrained.rnnlm.FLAIR_LM_FW_WMT11_EN_TF)
7 | print(''.join(lm.generate_text(list('hello'))))
8 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_ner.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-03 22:50
4 | import hanlp
5 |
6 | recognizer = hanlp.load(hanlp.pretrained.ner.CONLL03_NER_BERT_BASE_CASED_EN)
7 | print(recognizer(["President", "Obama", "is", "speaking", "at", "the", "White", "House", "."]))
8 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-04 21:05
4 | import hanlp
5 | from hanlp.utils.lang.en.english_tokenizer import tokenize_english
6 |
7 | tokenizer = tokenize_english
8 | tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
9 | syntactic_parser = hanlp.load(hanlp.pretrained.dep.PTB_BIAFFINE_DEP_EN)
10 | semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)
11 |
12 | pipeline = hanlp.pipeline() \
13 | .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
14 | .append(tokenizer, output_key='tokens') \
15 | .append(tagger, output_key='part_of_speech_tags') \
16 | .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies',
17 | conll=False) \
18 | .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies',
19 | conll=False)
20 | print(pipeline)
21 |
22 | text = '''Jobs and Wozniak co-founded Apple in 1976 to sell Wozniak's Apple I personal computer.
23 | Together the duo gained fame and wealth a year later with the Apple II.
24 | '''
25 |
26 | doc = pipeline(text)
27 | print(doc)
28 |
29 | # You can save the config to disk for deploying or sharing.
30 | pipeline.save('en.json')
31 | # Then load it smoothly.
32 | deployed = hanlp.load('en.json')
33 | print(deployed)
34 | print(deployed(text))
35 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_pos.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-03 22:16
4 | import hanlp
5 |
6 | tagger = hanlp.load(hanlp.pretrained.pos.PTB_POS_RNN_FASTTEXT_EN)
7 | print(tagger([['I', 'banked', '2', 'dollars', 'in', 'a', 'bank', '.'],
8 | ['Is', 'this', 'the', 'future', 'of', 'chamber', 'music', '?']]))
9 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_sdp.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-03 15:26
4 | import hanlp
5 | from hanlp_common.conll import CoNLLSentence
6 |
7 | # SemEval15 offers three independent annotations over the Penn Treebank (PTB).
8 | semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PAS_BIAFFINE_EN)
9 | # semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_DM_BIAFFINE_EN)
10 | # semantic_parser = hanlp.load(hanlp.pretrained.sdp.SEMEVAL15_PSD_BIAFFINE_EN)
11 | sent = [('Is', 'VBZ'),
12 | ('this', 'DT'),
13 | ('the', 'DT'),
14 | ('future', 'NN'),
15 | ('of', 'IN'),
16 | ('chamber', 'NN'),
17 | ('music', 'NN'),
18 | ('?', '.')]
19 | tree = semantic_parser(sent) # type:CoNLLSentence
20 | print(tree)
21 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_sentiment_analysis.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 03:52
4 | import hanlp
5 |
6 | classifier = hanlp.load('SST2_ALBERT_BASE_EN')
7 | print(classifier.predict('I feel lucky'))
8 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/demo_tok.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-02 19:41
4 | from hanlp.utils.lang.en.english_tokenizer import tokenize_english
5 |
6 | text = """\
7 | Don't go gentle into that good night.
8 | """
9 | print(tokenize_english(text))
10 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/en/train_sst2_albert_base.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-11-10 17:41
4 | import os
5 |
6 | from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF
7 |
8 | from tests import cdroot
9 |
10 | from hanlp.datasets.glu.glue import STANFORD_SENTIMENT_TREEBANK_2_DEV, STANFORD_SENTIMENT_TREEBANK_2_TRAIN, \
11 | STANFORD_SENTIMENT_TREEBANK_2_TEST
12 |
13 | cdroot()
14 | save_dir = os.path.join('data', 'model', 'sst', 'sst2_albert_base')
15 | classifier = TransformerClassifierTF()
16 | classifier.fit(STANFORD_SENTIMENT_TREEBANK_2_TRAIN, STANFORD_SENTIMENT_TREEBANK_2_DEV, save_dir,
17 | transformer='albert-base-v2')
18 | classifier.load(save_dir)
19 | print(classifier("it 's a charming and often affecting journey"))
20 | classifier.evaluate(STANFORD_SENTIMENT_TREEBANK_2_TEST, save_dir=save_dir)
21 | print(f'Model saved in {save_dir}')
22 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/ja/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-05-17 22:30
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/ja/demo_mtl.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-05-17 22:30
4 | import hanlp
5 | from hanlp_common.document import Document
6 |
7 | HanLP = hanlp.load(hanlp.pretrained.mtl.NPCMJ_UD_KYOTO_TOK_POS_CON_BERT_BASE_CHAR_JA)
8 | doc: Document = HanLP([
9 | '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
10 | '奈須きのこは1973年11月28日に千葉県円空山で生まれ、ゲーム制作会社「ノーツ」の設立者だ。',
11 | ])
12 | print(doc)
13 | doc.pretty_print()
14 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/mul/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-31 22:25
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/mul/demo_lid.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-09-28 16:49
4 | import hanlp
5 |
6 | lid = hanlp.load(hanlp.pretrained.classifiers.LID_176_FASTTEXT_BASE)
7 |
8 | print(lid('In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.'))
9 | lang, prob = lid('2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。', prob=True)
10 | print(f'{lang} language identified with probability {prob:.3%}')
11 | print(lid('2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', topk=2))
12 |
13 | # For a combination of languages, predict top-k languages with probabilities:
14 | text = '''
15 | 2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。
16 | In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environments.
17 | '''
18 |
19 | print(lid(text, topk=3, prob=True))
20 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/mul/demo_lid_restful.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-09-28 16:49
4 | from hanlp_restful import HanLPClient
5 |
6 | HanLP = HanLPClient('https://hanlp.hankcs.com/api', auth=None, language='mul')
7 |
8 | print(HanLP.language_identification([
9 | 'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
10 | '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
11 | '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
12 | ]))
13 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/mul/demo_mtl.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-31 13:51
4 | import hanlp
5 | from hanlp_common.document import Document
6 |
7 | HanLP = hanlp.load(hanlp.pretrained.mtl.UD_ONTONOTES_TOK_POS_LEM_FEA_NER_SRL_DEP_SDP_CON_XLMR_BASE)
8 | doc: Document = HanLP([
9 | 'In 2021, HanLPv2.1 delivers state-of-the-art multilingual NLP techniques to production environment.',
10 | '2021年、HanLPv2.1は次世代の最先端多言語NLP技術を本番環境に導入します。',
11 | '2021年 HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。',
12 | ])
13 | print(doc)
14 | doc.pretty_print()
15 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/mul/train/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2023-02-21 19:40
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/sent_split.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-31 14:23
4 | import hanlp
5 |
6 | split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
7 | output = split_sent('3.14 is pi. “你好!!!”——他说。劇場版「Fate/stay night [HF]」最終章公開カウントダウン!')
8 | print('\n'.join(output))
9 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/eos.html
10 |
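For contrast, a naive rule-based splitter shows why a learned EOS model is used here. This is only an illustrative sketch, not how UD_CTB_EOS_MUL works:

    import re

    def naive_split(text: str):
        # Cut after sentence-ending punctuation. This wrongly splits '3.14'
        # after the decimal point and stumbles over the quoted '!!!' run,
        # which is exactly what the neural model above handles gracefully.
        return [s for s in re.split(r'(?<=[。!?!?.])', text) if s.strip()]

    print(naive_split('3.14 is pi. “你好!!!”——他说。'))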
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-31 13:51
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_amr.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-04-12 22:19
4 | import hanlp
5 |
6 | parser = hanlp.load(hanlp.pretrained.amr.MRP2020_AMR_ENG_ZHO_XLM_BASE)
7 |
8 | # For Chinese:
9 | print(parser(["男孩", "希望", "女孩", "相信", "他", "。"]))
10 | print(parser(["男孩", "希望", "女孩", "相信", "他", "。"], output_amr=False))
11 |
12 | # For English:
13 | print(parser(['The', 'boy', 'wants', 'the', 'girl', 'to', 'believe', 'him', '.'], language='eng'))
14 | # Feeding the lemma as well is recommended for more stable performance.
15 | print(parser([('The', 'the'), ('boy', 'boy'), ('wants', 'want'), ('the', 'the'), ('girl', 'girl'), ('to', 'to'),
16 | ('believe', 'believe'), ('him', 'he'), ('.', '.')], language='eng'))
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-15 22:26
4 | import hanlp
5 | from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
6 | from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
7 |
8 | # Load the multi-task model
9 | HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
10 | # Get the tokenization task (every task whose name starts with "tok" is a tokenization task; the fine-grained standard is used here)
11 | tok: TaggingTokenization = HanLP['tok/fine']
12 |
13 | tok.dict_force = tok.dict_combine = None
14 | print(f'Without dictionary:\n{HanLP("商品和服务项目")["tok/fine"]}')
15 |
16 | tok.dict_force = {'和服', '服务项目'}
17 | print(f'Force mode:\n{HanLP("商品和服务项目")["tok/fine"]}') # Use with caution; see Chapter 2 of 《自然语言处理入门》
18 |
19 | tok.dict_force = {'和服务': ['和', '服务']}
20 | print(f'Forced correction:\n{HanLP("正向匹配商品和服务、任何和服务必按上述切分")["tok/fine"]}')
21 |
22 | tok.dict_force = None
23 | tok.dict_combine = {'和服', '服务项目'}
24 | print(f'Merge mode:\n{HanLP("商品和服务项目")["tok/fine"]}')
25 |
26 | # Understanding this requires some algorithm background; beginners may refer to http://nlp.hankcs.com/book.php
27 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html
28 |
29 | # Words containing spaces, tabs, etc. (characters the Transformer tokenizer strips) must be provided as tuples
30 | tok.dict_combine = {('iPad', 'Pro'), '2个空格'}
31 | print(f'Whitespace matching:\n{HanLP("如何评价iPad Pro ?iPad Pro有2个空格", tasks="tok/fine")["tok/fine"]}')
32 | # Smart users, read on: a string in a tuple dictionary is effectively equivalent to all of its possible splittings
33 | print(f'Dictionary content:\n{dict(tok.dict_combine.config["dictionary"]).keys()}')
34 |
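To make the last comment concrete, here is a tiny sketch (illustrative only, not HanLP's actual matching code) that enumerates every contiguous splitting of a string, which is what a plain-string entry in a tuple dictionary effectively stands for:

    from itertools import combinations

    def all_splittings(s: str):
        # Choose any subset of the len(s) - 1 gaps between characters as cut points.
        n = len(s)
        for k in range(n):
            for cuts in combinations(range(1, n), k):
                bounds = (0,) + cuts + (n,)
                yield tuple(s[i:j] for i, j in zip(bounds, bounds[1:]))

    print(list(all_splittings('2个空格')))  # 8 splittings for a 4-character string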
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_custom_dict_stl.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-15 22:26
4 | import hanlp
5 | from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
6 |
7 | # Load an old single-task model to demonstrate a segmentation error (fixed in the latest version):
8 | tok: TransformerTaggingTokenizer = hanlp.load('https://file.hankcs.com/hanlp/tok/coarse_electra_small_20220220_013548.zip')
9 |
10 | tok.dict_force = tok.dict_combine = None
11 | print(f'Without dictionary:\n{tok("首相和川普通电话")}')
12 |
13 | tok.dict_force = {'川普'}
14 | print(f'Force mode:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}') # Use with caution; see Chapter 2 of 《自然语言处理入门》
15 |
16 | tok.dict_force = {'川普通电话': ['川普', '通', '电话']}
17 | print(f'Forced correction:\n{tok(["首相和川普通电话", "银川普通人与川普通电话讲四川普通话"])}')
18 |
19 | tok.dict_force = None
20 | tok.dict_combine = {'美国总统'}
21 | print(f'Merge mode:\n{tok("首相和川普通电话,川普是美国总统。")}')
22 |
23 | # Understanding this requires some algorithm background; beginners may refer to http://nlp.hankcs.com/book.php
24 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/tokenizers/transformer.html
25 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_del_tasks.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-02-03 13:28
4 | import hanlp
5 | from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
6 | from hanlp_common.document import Document
7 |
8 | HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.OPEN_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
9 | tasks = list(HanLP.tasks.keys())
10 | print(tasks) # Pick what you need from what we have
11 | for task in tasks:
12 | if task not in ('tok', 'pos'):
13 | del HanLP[task]
14 | # You can save it as a new component
15 | # HanLP.save('path/to/new/component')
16 | # HanLP.load('path/to/new/component')
17 | print(HanLP.tasks.keys())
18 | doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', 'up主来到北京立方庭参观自然语义科技公司。'])
19 | print(doc)
20 | doc.pretty_print()
21 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_mlm.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-01-29 21:11
4 | from hanlp.components.lm.mlm import MaskedLanguageModel
5 |
6 | mlm = MaskedLanguageModel()
7 | mlm.load('bert-base-chinese')
8 | print(mlm('生活的真谛是[MASK]。'))
9 |
10 | # Batching is always faster
11 | print(mlm(['生活的真谛是[MASK]。', '巴黎是[MASK][MASK]的首都。']))
12 |
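A quick way to see the batching speed-up claimed above (a sketch; absolute timings depend on your hardware):

    import time

    from hanlp.components.lm.mlm import MaskedLanguageModel

    mlm = MaskedLanguageModel()
    mlm.load('bert-base-chinese')
    sentences = ['生活的真谛是[MASK]。'] * 32

    start = time.time()
    for sent in sentences:
        mlm(sent)  # one forward pass per sentence
    print(f'one by one: {time.time() - start:.2f}s')

    start = time.time()
    mlm(sentences)  # a single batched forward pass
    print(f'batched:    {time.time() - start:.2f}s')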
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_mtl.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-31 13:51
4 | import hanlp
5 | from hanlp_common.document import Document
6 |
7 | # CLOSE is the closed-source corpus annotated by 自然语义, BASE the medium-sized model, ZH Chinese
8 | HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)
9 | # All tasks are executed by default
10 | doc: Document = HanLP(['2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。', '阿婆主来到北京立方庭参观自然语义科技公司。'])
11 | # The returned Document is a subclass of dict, and its printout is JSON compatible
12 | print(doc)
13 | # Instant visualization; maximize the window to prevent line wrapping, best used in Jupyter Notebook
14 | doc.pretty_print()
15 | # Visualize NER in the OntoNotes standard instead
16 | # doc.pretty_print(ner='ner/ontonotes', pos='pku')
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_ner_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-04-29 11:06
4 | import hanlp
5 | from hanlp.components.mtl.tasks.ner.tag_ner import TaggingNamedEntityRecognition
6 | from hanlp.utils.io_util import get_resource
7 |
8 | HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH)
9 | ner: TaggingNamedEntityRecognition = HanLP['ner/msra']
10 | ner.dict_whitelist = {'午饭后': 'TIME'}
11 | doc = HanLP('2021年测试高血压是138,时间是午饭后2点45,低血压是44', tasks='ner/msra')
12 | doc.pretty_print()
13 | print(doc['ner/msra'])
14 |
15 | ner.dict_tags = {('名字', '叫', '金华'): ('O', 'O', 'S-PERSON')}
16 | HanLP('他在浙江金华出生,他的名字叫金华。', tasks='ner/msra').pretty_print()
17 |
18 | # HanLP.save(get_resource(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ERNIE_GRAM_ZH))
19 |
20 | # Understanding this requires some algorithm background; beginners may refer to http://nlp.hankcs.com/book.php
21 | # See https://hanlp.hankcs.com/docs/api/hanlp/components/mtl/tasks/ner/tag_ner.html
22 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-28 20:47
4 | import hanlp
5 |
6 | # A pipeline blends multiple callables, whether they are rules, TensorFlow components or PyTorch
7 | # ones. However, it's slower than the MTL framework.
8 | # pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ALBERT_BASE) # In case both tf and torch are used, load tf first.
9 |
10 | HanLP = hanlp.pipeline() \
11 | .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
12 | .append(hanlp.load('CTB9_TOK_ELECTRA_SMALL'), output_key='tok') \
13 | .append(hanlp.load('CTB9_POS_ELECTRA_SMALL'), output_key='pos') \
14 | .append(hanlp.load('MSRA_NER_ELECTRA_SMALL_ZH'), output_key='ner', input_key='tok') \
15 | .append(hanlp.load('CTB9_DEP_ELECTRA_SMALL', conll=False), output_key='dep', input_key='tok') \
16 | .append(hanlp.load('CTB9_CON_ELECTRA_SMALL'), output_key='con', input_key='tok')
17 |
18 | doc = HanLP('2021年HanLPv2.1为生产环境带来次世代最先进的多语种NLP技术。阿婆主来到北京立方庭参观自然语义科技公司。')
19 | print(doc)
20 | doc.pretty_print()
21 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_pos_dict.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-15 22:26
4 | import hanlp
5 | from hanlp.components.mtl.multi_task_learning import MultiTaskLearning
6 | from hanlp.components.mtl.tasks.pos import TransformerTagging
7 | from hanlp.components.mtl.tasks.tok.tag_tok import TaggingTokenization
8 | from tests import cdroot
9 |
10 | cdroot()
11 | HanLP: MultiTaskLearning = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_SMALL_ZH)
12 |
13 | # Demonstrates custom dict in part-of-speech tagging
14 | pos: TransformerTagging = HanLP['pos/ctb']
15 |
16 | print('Customize the tag of a single word:')
17 | pos.dict_tags = {'HanLP': 'state-of-the-art-tool'}
18 | HanLP("HanLP为生产环境带来次世代最先进的多语种NLP技术。", tasks='pos/ctb').pretty_print()
19 |
20 | print('Customize tags based on context:')
21 | pos.dict_tags = {('的', '希望'): ('补语成分', '名词'), '希望': '动词'}
22 | HanLP("我的希望是希望张晚霞的背影被晚霞映红。", tasks='pos/ctb').pretty_print()
23 |
24 | # Understanding this requires some algorithm background; beginners may refer to http://nlp.hankcs.com/book.php
25 | # See also https://hanlp.hankcs.com/docs/api/hanlp/components/taggers/transformer_tagger.html
26 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_sts.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-05-24 13:15
4 | import hanlp
5 |
6 | sim = hanlp.load(hanlp.pretrained.sts.STS_ELECTRA_BASE_ZH)
7 | print(sim([
8 | ['看图猜一电影名', '看图猜电影'],
9 | ['无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'],
10 | ['北京到上海的动车票', '上海到北京的动车票'],
11 | ]))
12 |
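The model returns one similarity score per pair. Continuing the script above, a sketch that turns scores into binary paraphrase decisions; the 0.5 cut-off is an assumption to be tuned on your own data:

    pairs = [
        ['看图猜一电影名', '看图猜电影'],
        ['北京到上海的动车票', '上海到北京的动车票'],
    ]
    for pair, score in zip(pairs, sim(pairs)):
        print(pair, 'paraphrase' if score > 0.5 else 'different', f'({score:.3f})')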
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/demo_word2vec.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-12 18:33
4 | import hanlp
5 | import torch
6 |
7 | word2vec = hanlp.load(hanlp.pretrained.word2vec.CONVSEG_W2V_NEWS_TENSITE_WORD_PKU)
8 | vec = word2vec('先进')
9 | print(vec)
10 |
11 | print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('优秀'), dim=0))
12 | print(torch.nn.functional.cosine_similarity(word2vec('先进'), word2vec('水果'), dim=0))
13 |
14 | print('Get the most semantically similar words:')
15 | print(word2vec.most_similar('上海'))
16 | # print(word2vec.most_similar(['上海', '寒冷'])) # batching is faster
17 |
18 | print('非常寒冷 is OOV, so no neighbors can be found:')
19 | print(word2vec.most_similar('非常寒冷'))
20 | print('But in doc2vec mode, similarity can still be computed for OOV inputs:')
21 | print(word2vec.most_similar('非常寒冷', doc2vec=True))
22 | print('It can even handle short texts:')
23 | print(word2vec.most_similar('国家图书馆推出2022年春节主题活动', doc2vec=True))
23 | print(word2vec.most_similar('国家图书馆推出2022年春节主题活动', doc2vec=True))
24 |
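For reference, the cosine similarity used above is just the normalized dot product. Continuing the script, a hand-rolled version:

    import torch

    def cosine(a: torch.Tensor, b: torch.Tensor) -> float:
        # cos(a, b) = a·b / (‖a‖ ‖b‖)
        return (a @ b / (a.norm() * b.norm())).item()

    print(cosine(word2vec('先进'), word2vec('优秀')))  # should match the cosine_similarity call above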
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-31 20:36
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_classifier.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 03:52
4 | from hanlp.datasets.classification.sentiment import CHNSENTICORP_ERNIE_TEST
5 |
6 | import hanlp
7 |
8 | classifier = hanlp.load('CHNSENTICORP_BERT_BASE_ZH')
9 | print(classifier.predict('前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!'))
10 |
11 | # predict a whole file in batch mode
12 | outputs = classifier.predict(classifier.transform.file_to_inputs(CHNSENTICORP_ERNIE_TEST), gold=True)
13 | print(outputs[:5])
14 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_client.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-08 04:43
4 | # pip3 install tensorflow-serving-api-gpu
5 | import grpc
6 | import tensorflow as tf
7 | from tensorflow_core.python.framework import tensor_util
8 | from tensorflow_serving.apis import predict_pb2, prediction_service_pb2_grpc
9 | import hanlp
10 | from hanlp.common.keras_component import KerasComponent
11 |
12 | tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN, transform_only=True)
13 | transform = tagger.transform
14 | del tagger
15 |
16 | inputs = [['商品', '和', '服务'],
17 | ['我', '的', '希望', '是', '希望', '和平']]
18 |
19 | samples = next(iter(transform.inputs_to_dataset(inputs)))[0]
20 | print(samples)
21 |
22 | channel = grpc.insecure_channel('{host}:{port}'.format(host='localhost', port=8500))
23 | stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
24 | request = predict_pb2.PredictRequest()
25 | request.model_spec.name = 'ctb5_pos_rnn_20191229_015325'
26 | request.model_spec.signature_name = 'serving_default'
27 | request.inputs['embedding_input'].CopyFrom(
28 | tf.make_tensor_proto(samples, dtype=tf.float32))
29 | result = stub.Predict(request, 10.0) # 10 secs timeout
30 | print(result)
31 | prediction = tensor_util.MakeNdarray(result.outputs['dense'])
32 | print(prediction)
33 |
34 | print(list(transform.Y_to_outputs(prediction, inputs=inputs)))
35 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_cws.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 21:25
4 | import hanlp
5 |
6 | tokenizer = hanlp.load(hanlp.pretrained.tok.LARGE_ALBERT_BASE)
7 | print(tokenizer('商品和服务'))
8 | print(tokenizer(['萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。',
9 | '上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。',
10 | 'HanLP支援臺灣正體、香港繁體,具有新詞辨識能力的中文斷詞系統']))
11 |
12 | text = 'NLP统计模型没有加规则,聪明人知道自己加。英文、数字、自定义词典统统都是规则。'
13 | print(tokenizer(text))
14 |
15 | dic = {'自定义词典': 'custom_dict', '聪明人': 'smart'}
16 |
17 |
18 | def split_by_dic(text: str):
19 | # We use a regular expression for the sake of simplicity. However, you should
20 | # use a trie for production (a minimal sketch follows this script).
21 | import re
22 | p = re.compile('(' + '|'.join(dic.keys()) + ')')
23 | sents, offset, words = [], 0, []
24 | for m in p.finditer(text):
25 | if offset < m.start():
26 | sents.append(text[offset: m.start()])
27 | words.append((m.group(), dic[m.group()]))
28 | offset = m.end()
29 | if offset < len(text):
30 | sents.append(text[offset:])
31 | words.append((None, None))
32 | flat = []
33 | for pred, (word, tag) in zip(tokenizer(sents), words):
34 | flat.extend(pred)
35 | if word:
36 | flat.append((word, tag))
37 | return flat
38 |
39 |
40 | print(split_by_dic(text))
41 |
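As promised in the comment above, a minimal longest-match trie sketch. It is illustrative only: it reuses `dic` and `text` from the script and merely locates dictionary hits, leaving everything between them to the tokenizer:

    def build_trie(dic):
        root = {}
        for word, tag in dic.items():
            node = root
            for ch in word:
                node = node.setdefault(ch, {})
            node['$'] = (word, tag)  # sentinel marking the end of an entry
        return root

    def longest_matches(text, root):
        # Yield the longest dictionary entry starting at each position,
        # skipping characters that begin no entry.
        i = 0
        while i < len(text):
            node, j, match = root, i, None
            while j < len(text) and text[j] in node:
                node = node[text[j]]
                j += 1
                if '$' in node:
                    match = (node['$'], j)
            if match:
                yield match[0]
                i = match[1]
            else:
                i += 1

    print(list(longest_matches(text, build_trie(dic))))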
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_dep.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 21:25
4 | import hanlp
5 |
6 | syntactic_parser = hanlp.load(hanlp.pretrained.dep.CTB7_BIAFFINE_DEP_ZH)
7 | sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
8 | tree = syntactic_parser(sent)
9 | print(tree)
10 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_fasttext.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-12 18:33
4 | import hanlp
5 | import torch
6 |
7 | # fasttext is a `torch.nn.Module`. Unless you know how to code in
8 | # PyTorch, don't bother using this.
9 | fasttext = hanlp.load(hanlp.pretrained.fasttext.FASTTEXT_WIKI_300_ZH)
10 |
11 | vec = fasttext('单词')
12 | print(vec)
13 |
14 | print(torch.nn.functional.cosine_similarity(fasttext('单词'), fasttext('词语'), dim=0))
15 | print(torch.nn.functional.cosine_similarity(fasttext('单词'), fasttext('今天'), dim=0))
16 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_multiprocess.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-02-15 11:30
4 | import multiprocessing
5 | import hanlp
6 |
7 | tokenizer = hanlp.load(hanlp.pretrained.tok.LARGE_ALBERT_BASE)
8 |
9 |
10 | def worker(job):
11 | print(job)
12 | print(tokenizer(job))
13 |
14 |
15 | if __name__ == '__main__':
16 | num_proc = 2
17 | # Important! The Python multiprocessing package defaults to calling fork when creating a child process.
18 | # This cannot work when the child process runs multithreaded code (e.g., TensorFlow is multithreaded).
19 | # See https://github.com/tensorflow/tensorflow/issues/8220#issuecomment-302826884
20 | # See https://sefiks.com/2019/03/20/tips-and-tricks-for-gpu-and-multiprocessing-in-tensorflow/
21 | multiprocessing.set_start_method('spawn', force=True) # only spawn works with TensorFlow
22 | with multiprocessing.Pool(num_proc) as pool:
23 | pool.map(worker, [f'给{i}号进程的任务' for i in range(num_proc)])
24 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_ner.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-30 19:52
4 | import hanlp
5 |
6 | recognizer = hanlp.load(hanlp.pretrained.ner.MSRA_NER_BERT_BASE_ZH)
7 | print(recognizer.predict([list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。'),
8 | list('萨哈夫说,伊拉克将同联合国销毁伊拉克大规模杀伤性武器特别委员会继续保持合作。')]))
9 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_pipeline.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-31 03:24
4 |
5 | import hanlp
6 |
7 | tokenizer = hanlp.load('LARGE_ALBERT_BASE')
8 | tagger = hanlp.load('CTB9_POS_ALBERT_BASE')
9 | syntactic_parser = hanlp.load('CTB7_BIAFFINE_DEP_ZH')
10 | semantic_parser = hanlp.load('SEMEVAL16_TEXT_BIAFFINE_ZH')
11 |
12 | pipeline = hanlp.pipeline() \
13 | .append(hanlp.utils.rules.split_sentence, output_key='sentences') \
14 | .append(tokenizer, output_key='tokens') \
15 | .append(tagger, output_key='part_of_speech_tags') \
16 | .append(syntactic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='syntactic_dependencies', conll=False) \
17 | .append(semantic_parser, input_key=('tokens', 'part_of_speech_tags'), output_key='semantic_dependencies', conll=False)
18 | print(pipeline)
19 |
20 | text = '''HanLP是一系列模型与算法组成的自然语言处理工具包,目标是普及自然语言处理在生产环境中的应用。
21 | HanLP具备功能完善、性能高效、架构清晰、语料时新、可自定义的特点。
22 | 内部算法经过工业界和学术界考验,配套书籍《自然语言处理入门》已经出版。
23 | '''
24 |
25 | doc = pipeline(text)
26 | print(doc)
27 | # By default the doc is JSON serializable; this holds true as long as your pipes output JSON-serializable objects too.
28 | # print(json.dumps(doc, ensure_ascii=False, indent=2))
29 |
30 | # You can save the config to disk for deploying or sharing.
31 | pipeline.save('zh.json')
32 | # Then load it smoothly.
33 | deployed = hanlp.load('zh.json')
34 | print(deployed)
35 | print(deployed(text))
36 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_pos.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 21:25
4 | import hanlp
5 | from hanlp.pretrained.pos import CTB9_POS_ALBERT_BASE
6 |
7 | tagger = hanlp.load(CTB9_POS_ALBERT_BASE)
8 | print(tagger.predict(['我', '的', '希望', '是', '希望', '世界', '和平']))
9 | print(tagger.predict([['支持', '批处理', '地', '预测'], ['速度', '更', '快']]))
10 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_sdp.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-31 23:55
4 | import hanlp
5 |
6 | semantic_parser = hanlp.load('SEMEVAL16_NEWS_BIAFFINE_ZH')
7 | sent = [('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]
8 | print(semantic_parser(sent))
9 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/demo_serving.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-06 20:23
4 | import hanlp
5 | from hanlp.common.keras_component import KerasComponent
6 |
7 | tagger: KerasComponent = hanlp.load(hanlp.pretrained.pos.CTB5_POS_RNN)
8 | print(tagger('商品 和 服务'.split()))
9 | tagger.serve()
10 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2021-12-26 23:25
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 20:55
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_albert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:22
4 |
5 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
6 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
7 | from tests import cdroot
8 |
9 | cdroot()
10 | tokenizer = TransformerTokenizerTF()
11 | save_dir = 'data/model/cws_bert_albert_ctb6'
12 | tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir,
13 | transformer='/home/ubuntu/hankcs/laser/data/transformer/albert_base_tf2',
14 | metrics='f1', learning_rate=5e-5, epochs=3)
15 | tokenizer.load(save_dir)
16 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
17 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
18 | print(f'Model saved in {save_dir}')
19 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:22
4 |
5 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
6 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
7 | from tests import cdroot
8 |
9 | cdroot()
10 | tokenizer = TransformerTokenizerTF()
11 | save_dir = 'data/model/cws_bert_base_ctb6'
12 | tokenizer.fit(CTB6_CWS_TRAIN, CTB6_CWS_DEV, save_dir, transformer='chinese_L-12_H-768_A-12',
13 | metrics='f1')
14 | tokenizer.load(save_dir)
15 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
16 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
17 | print(f'Model saved in {save_dir}')
18 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_ctb6_cws_convseg.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:22
4 | import tensorflow as tf
5 |
6 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
7 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
8 | from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
9 | from tests import cdroot
10 |
11 | cdroot()
12 | tokenizer = NgramConvTokenizerTF()
13 | save_dir = 'data/model/cws/ctb6_cws'
14 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
15 | epsilon=1e-8, clipnorm=5)
16 | tokenizer.fit(CTB6_CWS_TRAIN,
17 | CTB6_CWS_DEV,
18 | save_dir,
19 | word_embed={'class_name': 'HanLP>Word2VecEmbedding',
20 | 'config': {
21 | 'trainable': True,
22 | 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
23 | 'expand_vocab': False,
24 | 'lowercase': False,
25 | }},
26 | optimizer=optimizer,
27 | window_size=0,
28 | weight_norm=True)
29 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
30 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
31 | print(f'Model saved in {save_dir}')
32 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_bert_cws.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:39
4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
5 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | tokenizer = TransformerTokenizerTF()
10 | save_dir = 'data/model/cws_bert_base_100million'
11 | tokenizer.fit('data/cws/large/all.txt', CTB6_CWS_DEV, save_dir, transformer='bert-base-chinese',
12 | metrics='accuracy', batch_size=32)
13 | tokenizer.load(save_dir, metrics='f1')
14 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
15 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
16 | print(f'Model saved in {save_dir}')
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_conv_cws.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-29 21:58
4 |
5 | import tensorflow as tf
6 |
7 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
8 | from hanlp.datasets.cws.ctb import CTB6_CWS_TRAIN, CTB6_CWS_DEV, CTB6_CWS_TEST
9 | from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
10 | from tests import cdroot
11 |
12 | cdroot()
13 | tokenizer = NgramConvTokenizerTF()
14 | save_dir = 'data/model/cws/ctb6_cws'
15 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
16 | epsilon=1e-8, clipnorm=5)
17 | tokenizer.fit(CTB6_CWS_TRAIN,
18 | CTB6_CWS_DEV,
19 | save_dir,
20 | word_embed={'class_name': 'HanLP>Word2VecEmbedding',
21 | 'config': {
22 | 'trainable': True,
23 | 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
24 | 'expand_vocab': False,
25 | 'lowercase': False,
26 | }},
27 | optimizer=optimizer,
28 | window_size=0,
29 | weight_norm=True)
30 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir, output=False)
31 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
32 | print(f'Model saved in {save_dir}')
33 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_cws_albert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:22
4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
5 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | tokenizer = TransformerTokenizerTF()
10 | save_dir = 'data/model/large_corpus_cws_albert_base'
11 | tokenizer.fit('data/cws/large/all.txt',
12 | CTB6_CWS_DEV, save_dir,
13 | transformer='uer/albert-base-chinese-cluecorpussmall',
14 | max_seq_length=128,
15 | metrics='accuracy', learning_rate=5e-5, epochs=3)
16 | tokenizer.load(save_dir, metrics='f1')
17 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
18 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
19 | print(f'Model saved in {save_dir}')
20 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_large_cws_electra.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:22
4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
5 | from hanlp.datasets.tokenization.ctb6 import CTB6_CWS_DEV, CTB6_CWS_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | tokenizer = TransformerTokenizerTF()
10 | save_dir = 'data/model/large_corpus_cws_electra_small'
11 | tokenizer.fit('data/cws/large/all.txt',
12 | CTB6_CWS_DEV, save_dir,
13 | transformer='hfl/chinese-electra-small-discriminator',
14 | max_seq_length=128,
15 | metrics='accuracy', learning_rate=5e-5, epochs=10)
16 | tokenizer.load(save_dir, metrics='f1')
17 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
18 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
19 | print(f'Model saved in {save_dir}')
20 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_albert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:22
4 |
5 | from hanlp.components.tokenizers.tok import TransformerTokenizer
6 | from hanlp.datasets.cws.ctb import CTB6_CWS_TEST
7 | from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_VALID, SIGHAN2005_MSR_TRAIN
8 | from tests import cdroot
9 |
10 | cdroot()
11 | tokenizer = TransformerTokenizer()
12 | save_dir = 'data/model/msr_cws_albert_base'
13 | tokenizer.fit(SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_VALID, save_dir,
14 | transformer='albert_base_zh',
15 | max_seq_length=150,
16 | metrics='f1', learning_rate=5e-5, epochs=10)
17 | tokenizer.load(save_dir)
18 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
19 | tokenizer.evaluate(CTB6_CWS_TEST, save_dir=save_dir)
20 | print(f'Model saved in {save_dir}')
21 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:39
4 | from hanlp.components.tokenizers.tok_tf import TransformerTokenizerTF
5 | from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | tokenizer = TransformerTokenizerTF()
10 | save_dir = 'data/model/cws_bert_base_msra'
11 | tokenizer.fit(SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, save_dir, transformer='bert-base-chinese',
12 | metrics='f1')
13 | # tokenizer.load(save_dir)
14 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
15 | tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
16 | print(f'Model saved in {save_dir}')
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_msr_cws_ngram_conv.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:39
4 | import tensorflow as tf
5 |
6 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
7 | from hanlp.datasets.tokenization.sighan2005.msr import SIGHAN2005_MSR_TRAIN, SIGHAN2005_MSR_DEV, SIGHAN2005_MSR_TEST
8 | from hanlp.pretrained.word2vec import CONVSEG_W2V_NEWS_TENSITE_CHAR
9 | from tests import cdroot
10 |
11 | cdroot()
12 | tokenizer = NgramConvTokenizerTF()
13 | save_dir = 'data/model/cws/convseg-msr-nocrf-noembed'
14 | tokenizer.fit(SIGHAN2005_MSR_TRAIN,
15 | SIGHAN2005_MSR_DEV,
16 | save_dir,
17 | word_embed={'class_name': 'HanLP>Word2VecEmbedding',
18 | 'config': {
19 | 'trainable': True,
20 | 'filepath': CONVSEG_W2V_NEWS_TENSITE_CHAR,
21 | 'expand_vocab': False,
22 | 'lowercase': False,
23 | }},
24 | optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
25 | epsilon=1e-8, clipnorm=5),
26 | epochs=100,
27 | window_size=0,
28 | metrics='f1',
29 | weight_norm=True)
30 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
31 | tokenizer.evaluate(SIGHAN2005_MSR_TEST, save_dir=save_dir)
32 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_pku980106_conv_cws.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:39
4 | import tensorflow as tf
5 |
6 | from hanlp.components.tokenizers.tok_tf import NgramConvTokenizerTF
7 | from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
8 | from tests import cdroot
9 |
10 | cdroot()
11 |
12 | tokenizer = NgramConvTokenizerTF()
13 | save_dir = 'data/model/cws/pku98_6m_conv_ngram'
14 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
15 | epsilon=1e-8, clipnorm=5)
16 | tokenizer.fit('data/cws/pku98/199801-06-seg.txt',
17 | 'data/cws/pku98/test_pku98_name_merged.txt',
18 | save_dir,
19 | word_embed={'class_name': 'HanLP>Word2VecEmbedding',
20 | 'config': {
21 | 'trainable': False,
22 | 'filepath': RADICAL_CHAR_EMBEDDING_100,
23 | 'expand_vocab': True,
24 | 'lowercase': False,
25 | }},
26 | optimizer=optimizer,
27 | window_size=0,
28 | weight_norm=True)
29 | tokenizer.evaluate('data/cws/pku98/test_pku98_name_merged.txt', save_dir=save_dir, output=False)
30 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
31 | print(f'Model saved in {save_dir}')
32 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/cws/train_pku980106_rnn_cws.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-21 15:39
4 | import tensorflow as tf
5 |
6 | from hanlp.components.tokenizers.tok_tf import RNNTokenizerTF
7 | from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
8 | from tests import cdroot
9 |
10 | cdroot()
11 |
12 | tokenizer = RNNTokenizerTF()
13 | save_dir = 'data/model/cws/pku_6m_rnn_cws'
14 | optimizer = tf.keras.optimizers.Adam(learning_rate=0.001,
15 | epsilon=1e-8, clipnorm=5)
16 | tokenizer.fit('data/cws/pku98/199801-06-seg.txt',
17 | 'data/cws/pku98/pku98_test.txt',
18 | save_dir,
19 | embeddings={'class_name': 'HanLP>Word2VecEmbedding',
20 | 'config': {
21 | 'trainable': False,
22 | 'filepath': RADICAL_CHAR_EMBEDDING_100,
23 | 'expand_vocab': True,
24 | 'lowercase': False,
25 | }}
26 | )
27 | tokenizer.evaluate('data/cws/pku98/pku98_test.txt', save_dir=save_dir, output=False)
28 | print(tokenizer.predict(['中央民族乐团离开北京前往维也纳', '商品和服务']))
29 | print(f'Model saved in {save_dir}')
30 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/finetune_msra_ner_albert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 23:15
4 | import hanlp
5 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
6 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
7 | from tests import cdroot
8 |
9 | cdroot()
10 | recognizer = TransformerNamedEntityRecognizerTF()
11 | save_dir = 'data/model/ner/finetune_ner_albert_base_zh_msra'
12 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='albert_base_zh',
13 | finetune=hanlp.pretrained.ner.MSRA_NER_ALBERT_BASE_ZH)
14 | recognizer.load(save_dir)
15 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')))
16 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
17 | print(f'Model saved in {save_dir}')
18 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_chnsenticorp_bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-30 21:01
4 | from hanlp.components.classifiers.transformer_classifier_tf import TransformerClassifierTF, TransformerTextTransform
5 | from hanlp.datasets.classification.sentiment import CHNSENTICORP_ERNIE_TRAIN, CHNSENTICORP_ERNIE_TEST, \
6 | CHNSENTICORP_ERNIE_DEV
7 | from tests import cdroot
8 |
9 | cdroot()
10 | save_dir = 'data/model/classification/chnsenticorp_bert_base'
11 | classifier = TransformerClassifierTF(TransformerTextTransform(y_column=0))
12 | classifier.fit(CHNSENTICORP_ERNIE_TRAIN, CHNSENTICORP_ERNIE_DEV, save_dir,
13 | transformer='bert-base-chinese')
14 | classifier.load(save_dir)
15 | print(classifier.predict('前台客房服务态度非常好!早餐很丰富,房价很干净。再接再厉!'))
16 | classifier.evaluate(CHNSENTICORP_ERNIE_TEST, save_dir=save_dir)
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_conll03_ner_bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-10-25 21:34
4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
5 | from hanlp.datasets.ner.conll03 import CONLL03_EN_TRAIN, CONLL03_EN_DEV, CONLL03_EN_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | tagger = TransformerNamedEntityRecognizerTF()
10 | save_dir = 'data/model/ner/ner_conll03_bert_base_cased_en'
11 | tagger.fit(CONLL03_EN_TRAIN, CONLL03_EN_DEV, save_dir, transformer='bert-base-cased',
12 | metrics='accuracy')
13 | tagger.load(save_dir, metrics='f1')
14 | print(tagger.predict('West Indian all-rounder Phil Simmons eats apple .'.split()))
15 | tagger.evaluate(CONLL03_EN_TEST, save_dir=save_dir, output=False, batch_size=32)
16 | print(f'Model saved in {save_dir}')
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb5_dep.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 18:33
4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineDependencyParserTF
5 | from hanlp.datasets.parsing.ctb5 import CTB5_DEP_TRAIN, CTB5_DEP_DEV, CTB5_DEP_TEST
6 | from hanlp.pretrained.word2vec import CTB5_FASTTEXT_300_CN
7 | from tests import cdroot
8 |
9 | cdroot()
10 | save_dir = 'data/model/dep/biaffine_ctb'
11 | parser = BiaffineDependencyParserTF()
12 | parser.fit(CTB5_DEP_TRAIN, CTB5_DEP_DEV, save_dir,
13 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding',
14 | 'config': {
15 | 'trainable': False,
16 | 'embeddings_initializer': 'zero',
17 | 'filepath': CTB5_FASTTEXT_300_CN,
18 | 'expand_vocab': True,
19 | 'lowercase': True,
20 | 'normalize': True,
21 | }},
22 | )
23 | parser.load(save_dir)
24 | sentence = [('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'), ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'), ('逾', 'VV'),
25 | ('三十万', 'CD'), ('家', 'M')]
26 | print(parser.predict(sentence))
27 | parser.evaluate(CTB5_DEP_TEST, save_dir)
28 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb5_pos_rnn.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 22:46
4 | from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF
5 | from hanlp.datasets.pos.ctb5 import CTB5_POS_TRAIN, CTB5_POS_DEV, CTB5_POS_TEST
6 | from hanlp.pretrained.fasttext import FASTTEXT_WIKI_300_ZH
7 | from tests import cdroot
8 |
9 | cdroot()
10 | tagger = RNNPartOfSpeechTaggerTF()
11 | save_dir = 'data/model/pos/ctb5_pos_rnn_fasttext'
12 | tagger.fit(CTB5_POS_TRAIN, CTB5_POS_DEV, save_dir, embeddings={'class_name': 'HanLP>FastTextEmbedding',
13 | 'config': {'filepath': FASTTEXT_WIKI_300_ZH}}, )
14 | tagger.evaluate(CTB5_POS_TEST, save_dir=save_dir)
15 | print(f'Model saved in {save_dir}')
16 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb7_dep.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 18:33
4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineDependencyParserTF
5 | from hanlp.datasets.parsing.ctb5 import CIP_W2V_100_CN
6 | from hanlp.datasets.parsing.ctb7 import CTB7_DEP_TRAIN, CTB7_DEP_DEV, CTB7_DEP_TEST
7 | from tests import cdroot
8 |
9 | cdroot()
10 | save_dir = 'data/model/dep/biaffine_ctb7'
11 | parser = BiaffineDependencyParserTF()
12 | parser.fit(CTB7_DEP_TRAIN, CTB7_DEP_DEV, save_dir,
13 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding',
14 | 'config': {
15 | 'trainable': False,
16 | 'embeddings_initializer': 'zero',
17 | 'filepath': CIP_W2V_100_CN,
18 | 'expand_vocab': True,
19 | 'lowercase': True,
20 | 'normalize': True,
21 | }},
22 | )
23 | parser.load(save_dir)
24 | sentence = [('中国', 'NR'), ('批准', 'VV'), ('设立', 'VV'), ('外商', 'NN'), ('投资', 'NN'), ('企业', 'NN'), ('逾', 'VV'),
25 | ('三十万', 'CD'), ('家', 'M')]
26 | print(parser.predict(sentence))
27 | parser.evaluate(CTB7_DEP_TEST, save_dir)
28 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_albert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 23:15
4 | from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
5 | from tests import cdroot
6 |
7 | cdroot()
8 | tagger = TransformerTaggerTF()
9 | save_dir = 'data/model/pos/ctb9_albert_base'
10 | tagger.fit('data/pos/ctb9/train.tsv',
11 | 'data/pos/ctb9/test.tsv',
12 | save_dir,
13 | transformer='uer/albert-base-chinese-cluecorpussmall',
14 | max_seq_length=130,
15 | warmup_steps_ratio=0.1,
16 | epochs=20,
17 | learning_rate=5e-5)
18 | tagger.load(save_dir)
19 | print(tagger(['我', '的', '希望', '是', '希望', '和平']))
20 | tagger.evaluate('data/pos/ctb9/test.tsv', save_dir=save_dir)
21 | print(f'Model saved in {save_dir}')
22 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ctb9_pos_electra.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 23:15
4 | from hanlp.components.taggers.transformers.transformer_tagger_tf import TransformerTaggerTF
5 | from tests import cdroot
6 |
7 | cdroot()
8 | tagger = TransformerTaggerTF()
9 | save_dir = 'data/model/pos/ctb9_electra_small_zh_epoch_20'
10 | tagger.fit('data/pos/ctb9/train.tsv',
11 | 'data/pos/ctb9/test.tsv',
12 | save_dir,
13 | transformer='hfl/chinese-electra-small-discriminator',
14 | max_seq_length=130,
15 | warmup_steps_ratio=0.1,
16 | epochs=20,
17 | learning_rate=5e-5)
18 | tagger.load(save_dir)
19 | print(tagger(['我', '的', '希望', '是', '希望', '和平']))
20 | tagger.evaluate('data/pos/ctb9/test.tsv', save_dir=save_dir)
21 | print(f'Model saved in {save_dir}')
22 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_albert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 23:15
4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
5 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | recognizer = TransformerNamedEntityRecognizerTF()
10 | save_dir = 'data/model/ner/msra_ner_albert_base'
11 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir,
12 | transformer='uer/albert-base-chinese-cluecorpussmall',
13 | learning_rate=5e-5,
14 | metrics='accuracy') # Use accuracy to speed up training
15 | recognizer.load(save_dir, metrics='f1')
16 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')))
17 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
18 | print(f'Model saved in {save_dir}')
19 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 23:15
4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
5 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | recognizer = TransformerNamedEntityRecognizerTF()
10 | save_dir = 'data/model/ner/ner_bert_base_msra_1'
11 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir, transformer='bert-base-chinese',
12 | metrics='accuracy') # accuracy is faster
13 | recognizer.load(save_dir, metrics='f1')
14 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')))
15 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
16 | print(f'Model saved in {save_dir}')
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_electra.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 23:15
4 | from hanlp.components.ner.ner_tf import TransformerNamedEntityRecognizerTF
5 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
6 | from tests import cdroot
7 |
8 | cdroot()
9 | recognizer = TransformerNamedEntityRecognizerTF()
10 | save_dir = 'data/model/ner/ner_electra_small_zh_msra_sparse_categorical_crossentropy'
11 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir,
12 | transformer='hfl/chinese-electra-small-discriminator',
13 | learning_rate=5e-5,
14 | metrics='accuracy') # Use accuracy to speed up training
15 | recognizer.load(save_dir, metrics='f1')
16 | print(recognizer.predict(list('上海华安工业(集团)公司董事长谭旭光和秘书张晚霞来到美国纽约现代艺术博物馆参观。')))
17 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir=save_dir)
18 | print(f'Model saved in {save_dir}')
19 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_msra_ner_rnn.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 23:15
4 | from hanlp.components.ner.ner_tf import RNNNamedEntityRecognizerTF
5 | from hanlp.datasets.ner.msra import MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, MSRA_NER_CHAR_LEVEL_TEST
6 | from hanlp.pretrained.word2vec import RADICAL_CHAR_EMBEDDING_100
7 | from tests import cdroot
8 |
9 | cdroot()
10 | recognizer = RNNNamedEntityRecognizerTF()
11 | save_dir = 'data/model/ner/msra_ner_rnn'
12 | recognizer.fit(MSRA_NER_CHAR_LEVEL_TRAIN, MSRA_NER_CHAR_LEVEL_DEV, save_dir,
13 | embeddings=RADICAL_CHAR_EMBEDDING_100,
14 | embedding_trainable=True,
15 | epochs=100)
16 | recognizer.evaluate(MSRA_NER_CHAR_LEVEL_TEST, save_dir)
17 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_albert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-03-07 23:48
4 | from hanlp.metrics.parsing import conllx_eval
5 |
6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
8 | from tests import cdroot
9 |
10 | cdroot()
11 | save_dir = 'data/model/dep/ptb_albert3'
12 | parser = BiaffineTransformerDependencyParserTF()
13 | parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir,
14 | 'albert-xxlarge-v2',
15 | batch_size=256,
16 | warmup_steps_ratio=.1,
17 | token_mapping=PTB_TOKEN_MAPPING,
18 | samples_per_batch=150,
19 | transformer_dropout=.33,
20 | learning_rate=2e-3,
21 | learning_rate_transformer=1e-5,
22 | # early_stopping_patience=10,
23 | )
24 | parser.load(save_dir)
25 | # output = f'{save_dir}/test.predict.conll'
26 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
27 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
28 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
29 | print(f'Model saved in {save_dir}')
30 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-03-07 23:48
4 | from hanlp.metrics.parsing import conllx_eval
5 |
6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
8 | from tests import cdroot
9 |
10 | cdroot()
11 | save_dir = 'data/model/dep/ptb_bert_1e-5'
12 | parser = BiaffineTransformerDependencyParserTF()
13 | # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
14 | # batch_size=3000,
15 | # warmup_steps_ratio=.1,
16 | # token_mapping=PTB_TOKEN_MAPPING,
17 | # samples_per_batch=150,
18 | # transformer_dropout=.33,
19 | # learning_rate=2e-3,
20 | # learning_rate_transformer=1e-5,
21 | # # early_stopping_patience=10,
22 | # )
23 | parser.load(save_dir, tree='tarjan')
24 | # output = f'{save_dir}/test.predict.conll'
25 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
26 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
27 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
28 | print(f'Model saved in {save_dir}')
29 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_96.6.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-03-07 23:48
4 |
5 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
6 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
7 | from tests import cdroot
8 | from hanlp.metrics.parsing import conllx_eval
9 |
10 | cdroot()
11 | save_dir = 'data/model/dep/ptb_bert_96.61'
12 | parser = BiaffineTransformerDependencyParserTF()
13 | # parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
14 | # batch_size=3000,
15 | # warmup_steps_ratio=.1,
16 | # token_mapping=PTB_TOKEN_MAPPING,
17 | # samples_per_batch=150,
18 | # )
19 | parser.load(save_dir)
20 | output = f'{save_dir}/test.predict.conll'
21 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False, output=output)
22 | uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
23 | print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
24 | print(f'Model saved in {save_dir}')
25 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_biaffine_bert_positional.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-03-07 23:48
4 | from hanlp.metrics.parsing import conllx_eval
5 |
6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF
8 | from tests import cdroot
9 |
10 | cdroot()
11 | save_dir = 'data/model/dep/ptb_bert_positional_diff_lr'
12 | parser = BiaffineTransformerDependencyParserTF()
13 | parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
14 | batch_size=3000,
15 | warmup_steps_ratio=.1,
16 | token_mapping=PTB_TOKEN_MAPPING,
17 | samples_per_batch=150,
18 | transformer_dropout=.33,
19 | learning_rate=1e-4,
20 | learning_rate_transformer=1e-5,
21 | d_positional=128,
22 | # early_stopping_patience=10,
23 | )
24 | # parser.load(save_dir)
25 | # output = f'{save_dir}/test.predict.conll'
26 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
27 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
28 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
29 | # print(f'Model saved in {save_dir}')
30 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_dep_sa_albert_topk.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-03-07 23:48
4 | from hanlp.metrics.parsing import conllx_eval
5 |
6 | from hanlp.datasets.parsing.ptb import PTB_SD330_DEV, PTB_SD330_TRAIN, PTB_SD330_TEST, PTB_TOKEN_MAPPING
7 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineTransformerDependencyParserTF, \
8 | StructuralAttentionDependencyParserTF
9 | from hanlp.pretrained.glove import GLOVE_840B_300D
10 | from tests import cdroot
11 |
12 | cdroot()
13 | save_dir = 'data/model/dep/ptb_sa_topk'
14 | parser = StructuralAttentionDependencyParserTF()
15 | parser.fit(PTB_SD330_TRAIN, PTB_SD330_DEV, save_dir, 'bert-base-uncased',
16 | batch_size=3000,
17 | warmup_steps_ratio=.1,
18 | token_mapping=PTB_TOKEN_MAPPING,
19 | samples_per_batch=150,
20 | transformer_dropout=.33,
21 | masked_lm_dropout=.33,
22 | learning_rate=2e-3,
23 | learning_rate_transformer=1e-5,
24 |
25 | # alpha=1,
26 | # early_stopping_patience=10,
27 | # num_decoder_layers=2,
28 | )
29 | parser.load(save_dir)
30 | # output = f'{save_dir}/test.predict.conll'
31 | parser.evaluate(PTB_SD330_TEST, save_dir, warm_up=False)
32 | # uas, las = conllx_eval.evaluate(PTB_SD330_TEST, output)
33 | # print(f'Official UAS: {uas:.4f} LAS: {las:.4f}')
34 | print(f'Model saved in {save_dir}')
35 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_ptb_pos_rnn_fasttext.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-10-25 21:34
4 |
5 | import tensorflow as tf
6 |
7 | from hanlp.components.taggers.pos_tf import RNNPartOfSpeechTaggerTF
8 | from hanlp.pretrained.fasttext import FASTTEXT_CC_300_EN
9 | from tests import cdroot
10 |
11 | cdroot()
12 | tagger = RNNPartOfSpeechTaggerTF()
13 | save_dir = 'data/model/pos/ptb_pos_rnn_fasttext'
14 | optimizer = tf.keras.optimizers.SGD(learning_rate=0.015)  # `lr` is a deprecated alias in TF2
15 | # optimizer = 'adam'
16 | tagger.fit('data/ptb-pos/train.tsv',
17 | 'data/ptb-pos/dev.tsv',
18 | batch_size=10,
19 | save_dir=save_dir,
20 | embeddings={'class_name': 'HanLP>FastTextEmbedding',
21 | 'config': {'filepath': FASTTEXT_CC_300_EN}},
22 | optimizer=optimizer,
23 | lr_decay_per_epoch=0.05,
24 | rnn_units=100,
25 | rnn_input_dropout=0.5,
26 | rnn_output_dropout=0.5,
27 | epochs=100,
28 | verbose=True)
29 | tagger.load(save_dir)
30 | tagger.evaluate('data/ptb-pos/test.tsv', save_dir=save_dir, output=False)
31 | print(tagger.predict(['This', 'time', 'is', 'for', 'dinner']))
32 | print(tagger.predict([['This', 'is', 'an', 'old', 'story'],
33 | ['Not', 'this', 'year', '.']]))
34 | print(f'Model saved in {save_dir}')
35 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_dm.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 18:26
4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
5 | from hanlp.pretrained.glove import GLOVE_6B_100D
6 | from tests import cdroot
7 |
8 | cdroot()
9 | save_dir = 'data/model/sdp/semeval15_biaffine_dm'
10 | parser = BiaffineSemanticDependencyParserTF()
11 | parser.fit('data/semeval15/en.dm.train.conll', 'data/semeval15/en.dm.dev.conll', save_dir,
12 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding',
13 | 'config': {
14 | 'trainable': False,
15 | 'embeddings_initializer': 'zero',
16 | 'filepath': GLOVE_6B_100D,
17 | 'expand_vocab': True,
18 | 'lowercase': True,
19 | 'normalize': True,
20 | }},
21 | )
22 | parser.load(save_dir)  # disable variational dropout during evaluation so that the cuDNN LSTM kernel can be used
23 | sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'),
24 | ('music', 'NN'), ('?', '.')]
25 | print(parser.predict(sentence))
26 | parser.evaluate('data/semeval15/en.id.dm.auto.conllu', save_dir)
27 | parser.evaluate('data/semeval15/en.ood.dm.auto.conllu', save_dir)
28 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_pas.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 18:26
4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
5 | from hanlp.pretrained.glove import GLOVE_6B_100D
6 | from tests import cdroot
7 |
8 | cdroot()
9 | save_dir = 'data/model/sdp/semeval15_biaffine_pas'
10 | parser = BiaffineSemanticDependencyParserTF()
11 | parser.fit('data/semeval15/en.pas.train.conll', 'data/semeval15/en.pas.dev.conll', save_dir,
12 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding',
13 | 'config': {
14 | 'trainable': False,
15 | 'embeddings_initializer': 'zero',
16 | 'filepath': GLOVE_6B_100D,
17 | 'expand_vocab': True,
18 | 'lowercase': True,
19 | 'normalize': True,
20 | }},
21 | )
22 | parser.load(save_dir)  # disable variational dropout during evaluation so that the cuDNN LSTM kernel can be used
23 | sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'),
24 | ('music', 'NN'), ('?', '.')]
25 | print(parser.predict(sentence))
26 | parser.evaluate('data/semeval15/en.id.pas.conll', save_dir)
27 | parser.evaluate('data/semeval15/en.ood.pas.conll', save_dir)
28 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/tf/train/train_semeval15_psd.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-01-01 18:26
4 | from hanlp.components.parsers.biaffine_parser_tf import BiaffineSemanticDependencyParserTF
5 | from hanlp.pretrained.glove import GLOVE_6B_100D
6 | from tests import cdroot
7 |
8 | cdroot()
9 | save_dir = 'data/model/sdp/semeval15_biaffine_psd'
10 | parser = BiaffineSemanticDependencyParserTF()
11 | parser.fit('data/semeval15/en.psd.train.conll', 'data/semeval15/en.psd.dev.conll', save_dir,
12 | pretrained_embed={'class_name': 'HanLP>Word2VecEmbedding',
13 | 'config': {
14 | 'trainable': False,
15 | 'embeddings_initializer': 'zero',
16 | 'filepath': GLOVE_6B_100D,
17 | 'expand_vocab': True,
18 | 'lowercase': True,
19 | 'normalize': True,
20 | }},
21 | )
22 | parser.load(save_dir)  # disable variational dropout during evaluation so that the cuDNN LSTM kernel can be used
23 | sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'),
24 | ('music', 'NN'), ('?', '.')]
25 | print(parser.predict(sentence))
26 | parser.evaluate('data/semeval15/en.id.psd.conll', save_dir)
27 | parser.evaluate('data/semeval15/en.ood.psd.conll', save_dir)
28 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/train/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-12-31 20:12
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/hanlp_demo/zh/train_sota_bert_pku.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-08-11 02:47
4 | from hanlp.common.dataset import SortingSamplerBuilder
5 | from hanlp.components.tokenizers.transformer import TransformerTaggingTokenizer
6 | from hanlp.datasets.tokenization.sighan2005.pku import SIGHAN2005_PKU_TRAIN_ALL, SIGHAN2005_PKU_TEST
7 | from tests import cdroot
8 |
9 | cdroot()
10 | tokenizer = TransformerTaggingTokenizer()
11 | save_dir = 'data/model/cws/sighan2005_pku_bert_base_96.7'
12 | tokenizer.fit(
13 | SIGHAN2005_PKU_TRAIN_ALL,
14 | SIGHAN2005_PKU_TEST, # Conventionally, no devset is used. See Tian et al. (2020).
15 | save_dir,
16 | 'bert-base-chinese',
17 | max_seq_len=300,
18 | char_level=True,  # CWS tags each character, so encode input at the character level
19 | hard_constraint=True,  # treat max_seq_len as a hard limit when splitting long sentences
20 | sampler_builder=SortingSamplerBuilder(batch_size=32),
21 | epochs=3,
22 | adam_epsilon=1e-6,
23 | warmup_steps=0.1,
24 | weight_decay=0.01,
25 | word_dropout=0.1,
26 | seed=1660853059,  # fix the random seed for reproducibility
27 | )
28 | tokenizer.evaluate(SIGHAN2005_PKU_TEST, save_dir)
29 | print(f'Model saved in {save_dir}')
30 |
--------------------------------------------------------------------------------
/plugins/hanlp_demo/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 19:26
4 | from os.path import abspath, join, dirname
5 | from setuptools import find_packages, setup
6 |
7 | this_dir = abspath(dirname(__file__))
8 | with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
9 | long_description = file.read()
10 |
11 | setup(
12 | name='hanlp_demo',
13 | version='0.0.1',
14 | description='HanLP: Han Language Processing',
15 | long_description=long_description,
16 | long_description_content_type="text/markdown",
17 | url='https://github.com/hankcs/HanLP',
18 | author='hankcs',
19 | author_email='hankcshe@gmail.com',
20 | license='Apache License 2.0',
21 | classifiers=[
22 | 'Intended Audience :: Science/Research',
23 | 'Intended Audience :: Developers',
24 | "Development Status :: 3 - Alpha",
25 | 'Operating System :: OS Independent',
26 | "License :: OSI Approved :: Apache Software License",
27 | 'Programming Language :: Python :: 3 :: Only',
28 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
29 | "Topic :: Text Processing :: Linguistic"
30 | ],
31 | keywords='corpus,machine-learning,NLU,NLP',
32 | packages=find_packages(exclude=['docs', 'tests*']),
33 | include_package_data=True,
34 | install_requires=[
35 | 'hanlp_common'
36 | ],
37 | python_requires='>=3.6',
38 | )
39 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful/README.md:
--------------------------------------------------------------------------------
1 | # RESTful API Client for HanLP
2 |
3 | [中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)
4 |
5 | The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly, and extensible. It comes with pretrained models for many human languages, including English and Chinese. Currently, HanLP 2.0 is in its alpha stage, with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.
6 |
7 |
8 | ## Installation
9 |
10 | ```bash
11 | pip install hanlp-restful
12 | ```
13 |
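14 | ## Usage
15 | 
16 | A minimal sketch of calling the client, assuming the public demo endpoint `https://www.hanlp.com/api` with anonymous access (`auth=None`), which is rate-limited; apply for an auth key for production use:
17 | 
18 | ```python
19 | from hanlp_restful import HanLPClient
20 | 
21 | # language='zh' selects the Chinese pipeline on the server side.
22 | HanLP = HanLPClient('https://www.hanlp.com/api', auth=None, language='zh')
23 | doc = HanLP.parse('HanLP为生产环境带来次世代最先进的多语种NLP技术。')
24 | print(doc)  # The returned Document behaves like a dict keyed by task name and pretty-prints as JSON.
25 | ```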
26 | 
27 | ## License
28 | 
29 | HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you added a link to HanLP on your website.
30 | 
31 | 
--------------------------------------------------------------------------------
/plugins/hanlp_restful/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 19:26
4 | from os.path import abspath, join, dirname
5 | from setuptools import find_packages, setup
6 |
7 | this_dir = abspath(dirname(__file__))
8 | with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
9 | long_description = file.read()
10 |
11 | setup(
12 | name='hanlp_restful',
13 | version='0.0.23',
14 | description='HanLP: Han Language Processing',
15 | long_description=long_description,
16 | long_description_content_type="text/markdown",
17 | url='https://github.com/hankcs/HanLP',
18 | author='hankcs',
19 | author_email='hankcshe@gmail.com',
20 | license='Apache License 2.0',
21 | classifiers=[
22 | 'Intended Audience :: Science/Research',
23 | 'Intended Audience :: Developers',
24 | "Development Status :: 3 - Alpha",
25 | 'Operating System :: OS Independent',
26 | "License :: OSI Approved :: Apache Software License",
27 | 'Programming Language :: Python :: 3 :: Only',
28 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
29 | "Topic :: Text Processing :: Linguistic"
30 | ],
31 | keywords='corpus,machine-learning,NLU,NLP',
32 | packages=find_packages(exclude=['docs', 'tests*']),
33 | include_package_data=True,
34 | install_requires=[
35 | 'hanlp_common'
36 | ],
37 | python_requires='>=3.6',
38 | )
39 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-11-29 18:05
4 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_golang/README.md:
--------------------------------------------------------------------------------
1 | # gohanlp
2 |
3 | Golang RESTful Client for HanLP
4 |
5 | We have moved to https://github.com/hankcs/gohanlp
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/BaseInput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2020-12-27 12:07 AM
5 | *
6 | *
7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful;
12 |
13 | /**
14 | * @author hankcs
15 | */
16 | public class BaseInput
17 | {
18 | public String[] tasks;
19 | public String[] skip_tasks;
20 | public String language;
21 |
22 | public BaseInput(String[] tasks, String[] skipTasks, String language)
23 | {
24 | this.tasks = tasks;
25 | this.skip_tasks = skipTasks;
26 | this.language = language;
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/CoreferenceResolutionOutput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2021-10-16 4:43 PM
5 | *
6 | *
7 | * Copyright (c) 2021, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful;
12 |
13 | import java.util.ArrayList;
14 | import java.util.List;
15 | import java.util.Set;
16 |
17 | /**
18 | * A data class for coreference resolution
19 | *
20 | * @author hankcs
21 | */
22 | public class CoreferenceResolutionOutput
23 | {
24 | public List<Set<String>> clusters;
25 | public ArrayList<String> tokens;
26 | 
27 | public CoreferenceResolutionOutput(List<Set<String>> clusters, ArrayList<String> tokens)
28 | {
29 | this.clusters = clusters;
30 | this.tokens = tokens;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/DocumentInput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2020-12-27 12:09 AM
5 | *
6 | *
7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful;
12 |
13 | /**
14 | * @author hankcs
15 | */
16 | public class DocumentInput extends BaseInput
17 | {
18 | public String text;
19 |
20 | public DocumentInput(String text, String[] tasks, String[] skipTasks, String language)
21 | {
22 | super(tasks, skipTasks, language);
23 | this.text = text;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/SentenceInput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2020-12-27 12:09 AM
5 | *
6 | *
7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful;
12 |
13 | /**
14 | * @author hankcs
15 | */
16 | public class SentenceInput extends BaseInput
17 | {
18 | public String[] text;
19 |
20 | public SentenceInput(String[] text, String[] tasks, String[] skipTasks, String language)
21 | {
22 | super(tasks, skipTasks, language);
23 | this.text = text;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/TokenInput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2020-12-27 12:09 AM
5 | *
6 | *
7 | * Copyright (c) 2020, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful;
12 |
13 | /**
14 | * @author hankcs
15 | */
16 | public class TokenInput extends BaseInput
17 | {
18 | public String[][] tokens;
19 |
20 | public TokenInput(String[][] tokens, String[] tasks, String[] skipTasks, String language)
21 | {
22 | super(tasks, skipTasks, language);
23 | this.tokens = tokens;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Anchor.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2022-04-13 8:58 AM
5 | *
6 | *
7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful.mrp;
12 |
13 | /**
14 | * @author hankcs
15 | */
16 | public class Anchor
17 | {
18 | public String from;
19 | public String to;
20 | }
21 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Edge.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2022-04-13 9:01 AM
5 | *
6 | *
7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful.mrp;
12 |
13 | /**
14 | * @author hankcs
15 | */
16 | public class Edge
17 | {
18 | public int source;
19 | public int target;
20 | public String label;
21 | }
22 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/MeaningRepresentation.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2022-04-13 8:57 AM
5 | *
6 | *
7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful.mrp;
12 |
13 | /**
14 | * Graph-based meaning representation.
15 | *
16 | * @author hankcs
17 | */
18 | public class MeaningRepresentation
19 | {
20 | public String id;
21 | public String input;
22 | public Node[] nodes;
23 | public Edge[] edges;
24 | public String[] tops;
25 | public String framework;
26 | }
27 |
--------------------------------------------------------------------------------
/plugins/hanlp_restful_java/src/main/java/com/hankcs/hanlp/restful/mrp/Node.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Han He
3 | * me@hankcs.com
4 | * 2022-04-13 8:57 AM
5 | *
6 | *
7 | * Copyright (c) 2022, Han He. All Rights Reserved, http://www.hankcs.com/
8 | * See LICENSE file in the project root for full license information.
9 | *
10 | */
11 | package com.hankcs.hanlp.restful.mrp;
12 |
13 | /**
14 | * @author hankcs
15 | */
16 | public class Node
17 | {
18 | public int id;
19 | public String label;
20 | public String[] properties;
21 | public String[] values;
22 | public Anchor[] anchors;
23 | }
24 |
--------------------------------------------------------------------------------
/plugins/hanlp_trie/README.md:
--------------------------------------------------------------------------------
1 | # Trie interface and implementation for HanLP
2 |
3 | [中文](https://github.com/hankcs/HanLP/tree/doc-zh) | [1.x](https://github.com/hankcs/HanLP/tree/1.x) | [forum](https://bbs.hankcs.com/) | [docker](https://github.com/WalterInSH/hanlp-jupyter-docker)
4 |
5 | The multilingual NLP library for researchers and companies, built on PyTorch and TensorFlow 2.x, for advancing state-of-the-art deep learning techniques in both academia and industry. HanLP was designed from day one to be efficient, user-friendly, and extensible. It comes with pretrained models for many human languages, including English and Chinese. Currently, HanLP 2.0 is in its alpha stage, with more killer features on the roadmap. Discussions are welcome on our [forum](https://bbs.hankcs.com/), while bug reports and feature requests are reserved for GitHub issues. For Java users, please check out the [1.x](https://github.com/hankcs/HanLP/tree/1.x) branch.
6 |
7 | ## Installation
8 |
9 | ```bash
10 | pip install hanlp-trie
11 | ```
12 |
13 |
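14 | ## Usage
15 | 
16 | A minimal sketch of the dict-like `Trie` interface, assuming `parse_longest` returns `(begin, end, value)` triples for the longest matches in a text:
17 | 
18 | ```python
19 | from hanlp_trie import Trie
20 | 
21 | # Map surface forms to arbitrary values.
22 | trie = Trie()
23 | trie['自然'] = 'nature'
24 | trie['自然语言处理'] = 'NLP'
25 | 
26 | # Scan the text for non-overlapping longest matches.
27 | for begin, end, value in trie.parse_longest('自然语言处理入门'):
28 |     print(begin, end, value)  # e.g., 0 6 NLP
29 | ```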
30 | 
31 | ## License
32 | 
33 | HanLP is licensed under **Apache License 2.0**. You can use HanLP in your commercial products for free. We would appreciate it if you added a link to HanLP on your website.
34 | 
35 | 
--------------------------------------------------------------------------------
/plugins/hanlp_trie/hanlp_trie/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-11-29 17:48
4 | from .trie import Trie
5 | from .dictionary import DictInterface, TrieDict
6 |
--------------------------------------------------------------------------------
/plugins/hanlp_trie/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-12-28 19:26
4 | from os.path import abspath, join, dirname
5 | from setuptools import find_packages, setup
6 |
7 | this_dir = abspath(dirname(__file__))
8 | with open(join(this_dir, 'README.md'), encoding='utf-8') as file:
9 | long_description = file.read()
10 |
11 | setup(
12 | name='hanlp_trie',
13 | version='0.0.5',
14 | description='HanLP: Han Language Processing',
15 | long_description=long_description,
16 | long_description_content_type="text/markdown",
17 | url='https://github.com/hankcs/HanLP',
18 | author='hankcs',
19 | author_email='hankcshe@gmail.com',
20 | license='Apache License 2.0',
21 | classifiers=[
22 | 'Intended Audience :: Science/Research',
23 | 'Intended Audience :: Developers',
24 | "Development Status :: 3 - Alpha",
25 | 'Operating System :: OS Independent',
26 | "License :: OSI Approved :: Apache Software License",
27 | 'Programming Language :: Python :: 3 :: Only',
28 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
29 | "Topic :: Text Processing :: Linguistic"
30 | ],
31 | keywords='corpus,machine-learning,NLU,NLP',
32 | packages=find_packages(exclude=['docs', 'tests*']),
33 | include_package_data=True,
34 | install_requires=[
35 | 'hanlp_common'
36 | ],
37 | python_requires='>=3.6',
38 | )
39 |
--------------------------------------------------------------------------------
/plugins/hanlp_trie/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2020-11-29 18:05
4 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2019-06-13 23:43
4 | import os
5 |
6 | root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
7 |
8 |
9 | def cdroot():
10 | """
11 | cd to project root, so models are saved in the root folder
12 | """
13 | os.chdir(root)
14 |
--------------------------------------------------------------------------------
/tests/test_config_tracker.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from hanlp.common.structure import ConfigTracker
4 |
5 |
6 | class MyClass(ConfigTracker):
7 | def __init__(self, i_need_this='yes') -> None:
8 | super().__init__(locals())
9 |
10 |
11 | class TestConfigTracker(unittest.TestCase):
12 | def test_init(self):
13 | obj = MyClass()
14 | self.assertEqual(obj.config.get('i_need_this', None), 'yes')
15 |
16 |
17 | if __name__ == '__main__':
18 | unittest.main()
19 |
--------------------------------------------------------------------------------
/tests/test_pipeline.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import hanlp
3 |
4 |
5 | class TestPipeLine(unittest.TestCase):
6 | def test_copy(self):
7 | pipe = hanlp.pipeline().append(hanlp.utils.rules.split_sentence)
8 | copied_pipe = pipe.copy()
9 | test_text = "今天天气真好。我要去散步。"
10 | assert pipe is not copied_pipe
11 | copied_pipe.append(lambda sent: "".join(sent))
12 | assert pipe(test_text) != copied_pipe(test_text)
13 |
14 | if __name__ == '__main__':
15 | unittest.main()
16 |
--------------------------------------------------------------------------------
/tests/test_rules.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-03-22 17:17
4 | import unittest
5 |
6 | from hanlp.utils.rules import split_sentence
7 |
8 |
9 | class TestRules(unittest.TestCase):
10 | def test_eos(self):
11 | self.assertListEqual(list(split_sentence('叶')), ['叶'])
12 | self.assertListEqual(list(split_sentence('他说:“加油。”谢谢')), ['他说:“加油。”', '谢谢'])
13 | self.assertListEqual(list(split_sentence('Go to hankcs.com. Yes.')), ['Go to hankcs.com.', 'Yes.'])
14 |
15 |
16 | if __name__ == '__main__':
17 | unittest.main()
18 |
--------------------------------------------------------------------------------
/tests/test_string_util.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # Author: hankcs
3 | # Date: 2022-03-22 17:17
4 | import unittest
5 |
6 | from hanlp.utils.string_util import possible_tokenization
7 |
8 |
9 | class TestStringUtility(unittest.TestCase):
10 | def test_enumerate_tokenization(self):
11 | text = '商品和服务'
12 | toks = possible_tokenization(text)
13 | assert len(set(toks)) == 2 ** (len(text) - 1)
14 | for each in toks:
15 | assert ''.join(each) == text
16 |
17 |
18 | if __name__ == '__main__':
19 | unittest.main()
20 |
--------------------------------------------------------------------------------