├── train_ner.py ├── corpus └── .gitkeep ├── third-part └── .gitkeep ├── requirements_dev.txt ├── spacy-dev-resources ├── POS_depedency_model └── .gitignore ├── train_requirements.txt ├── requirements.txt ├── create_wikipedia_corpus.bash ├── extract_UD_Chinese-GSD_corpus.bash ├── .images ├── ner_of_doc.png ├── attributes_of_doc.png ├── dependency_of_doc.png ├── attributes_of_doc.html ├── temp.html └── dependency_of_doc.svg ├── merge_all_text_files.bash ├── move_wikipedia_corpus.bash ├── create_jsonl_corpus.bash ├── init_model.bash ├── download_UD_Chinese-GSD_corpus.bash ├── download_and_compile_brown_cluster.bash ├── train.bash ├── create_model_package.bash ├── create_init_model.bash ├── onto_to_spacy_json.bash ├── train_ner.bash ├── compute_brown_cluster.bash ├── compute_plain_word_vec.bash ├── compute_words_freq.bash ├── train_model.bash ├── .gitmodules ├── .idea ├── vcs.xml ├── modules.xml ├── misc.xml └── Chinese_models_for_SpaCy.iml ├── convert_UD_Chinese-GSD_corpus.bash ├── format_convertor.bash ├── update_model_meta.py ├── test_init_model.py ├── merge_all_text_files.py ├── all_in_one.bash ├── test_dependency_model.py ├── meta.json ├── test.py ├── test_as_model_dir.py ├── test_ner.py ├── test_load.py ├── LICENSE.md ├── merge_submodel.py ├── plain_word_vectors.py ├── README.md ├── workflow.md ├── README.en-US.md ├── .gitignore ├── onto_to_spacy_json.py ├── create_jsonl_vocabulary.py └── notebooks └── demo.ipynb /train_ner.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /corpus/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /third-part/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | jieba 2 | -------------------------------------------------------------------------------- /spacy-dev-resources: -------------------------------------------------------------------------------- 1 | ../spacy-dev-resources -------------------------------------------------------------------------------- /POS_depedency_model/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | **/* 3 | !.gitignore -------------------------------------------------------------------------------- /train_requirements.txt: -------------------------------------------------------------------------------- 1 | -r ./spacy-dev-resources/requirements.txt 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | plac 2 | spacy 3 | pandas 4 | jieba 5 | ftfy 6 | validators 7 | -------------------------------------------------------------------------------- /create_wikipedia_corpus.bash: -------------------------------------------------------------------------------- 1 | cd chinese-wikipedia-corpus-creator 2 | bash ./allinone_process.bash 3 | -------------------------------------------------------------------------------- /extract_UD_Chinese-GSD_corpus.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | unzip 
./corpus/UD_Chinese-GSD.zip -d ./corpus 4 | -------------------------------------------------------------------------------- /.images/ner_of_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/ner_of_doc.png -------------------------------------------------------------------------------- /merge_all_text_files.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python ./merge_all_text_files.py ./token_cleaned_plain_files ./WORDS.txt 4 | -------------------------------------------------------------------------------- /move_wikipedia_corpus.bash: -------------------------------------------------------------------------------- 1 | cp -r chinese-wikipedia-corpus-creator/token_cleaned_plain_files token_cleaned_plain_files 2 | -------------------------------------------------------------------------------- /.images/attributes_of_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/attributes_of_doc.png -------------------------------------------------------------------------------- /.images/dependency_of_doc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/dependency_of_doc.png -------------------------------------------------------------------------------- /create_jsonl_corpus.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python ./create_jsonl_vocabulary.py zh spacy_corpus.jsonl WORDS_FREQ.txt ./WORDS-c1000-p1.out/paths 4 | -------------------------------------------------------------------------------- /init_model.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python -m spacy init-model zh zh_model/ WORDS_FREQ.txt -c WORDS-c1000-p1.out/paths -v WORDS_VECS.txt 4 | -------------------------------------------------------------------------------- /download_UD_Chinese-GSD_corpus.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget -c https://github.com/UniversalDependencies/UD_Chinese-GSD/archive/master.zip -O corpus/UD_Chinese-GSD.zip 4 | -------------------------------------------------------------------------------- /download_and_compile_brown_cluster.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd third-part 4 | 5 | git clone https://github.com/percyliang/brown-cluster.git 6 | 7 | cd brown-cluster 8 | make -------------------------------------------------------------------------------- /train.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ../MITIE/tools/wordrep/build/wordrep --count-words 800000 --word-vects --basic-morph --cca-morph ../wikipedia-corpus-creator/token_cleaned_plain_files 4 | -------------------------------------------------------------------------------- /create_model_package.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m spacy package spacy_models/final_model spacy_package --force 4 | 5 | cd spacy_package/zh_core_web_sm-0.1.0 6 | python ./setup.py sdist 
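After `spacy package` and `python ./setup.py sdist`, the installable tarball ends up under `dist/` inside the package directory. Below is a minimal smoke test for the built package, as a sketch that assumes the sdist has already been installed with pip (the exact tarball name depends on the version in `meta.json`):

```python
# Sketch of a post-packaging check; assumes the sdist built above was installed,
# e.g. `pip install dist/zh_core_web_sm-0.1.0.tar.gz` (file name depends on meta.json).
import spacy

nlp = spacy.load("zh_core_web_sm")  # load the installed package by name
doc = nlp("王小明在北京的清华大学读书")

print(nlp.meta["name"], nlp.meta["version"])         # e.g. core_web_sm 0.1.0
print(nlp.pipe_names)                                # expected: ['tagger', 'parser', 'ner']
print([(ent.text, ent.label_) for ent in doc.ents])  # named entities from the merged model
```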
7 | -------------------------------------------------------------------------------- /create_init_model.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m spacy init-model zh spacy_models/base_model --jsonl-loc ./spacy_corpus.jsonl --vectors-loc WORDS_VECS.txt --vectors-name zh_core_web_sm.vectors 4 | -------------------------------------------------------------------------------- /onto_to_spacy_json.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python onto_to_spacy_json.py -i "./ontonotes-release-5.0/data/files/data/chinese/annotations/" -t "china_ner_train.json" -e "china_ner_eval.json" -v 0.05 4 | -------------------------------------------------------------------------------- /train_ner.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m spacy train zh spacy_models/ner_model ./china_ner_train.json ./china_ner_eval.json --pipeline ner -m meta.json -v ./spacy_models/dependency_model/model-best -n 1 4 | -------------------------------------------------------------------------------- /compute_brown_cluster.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cpu_count=`nproc --all` 4 | process_count=$(expr $cpu_count - 1) 5 | 6 | ./third-part/brown-cluster/wcluster --text WORDS.txt --c 1000 --threads ${process_count} 7 | -------------------------------------------------------------------------------- /compute_plain_word_vec.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cpu_count=`nproc --all` 4 | process_count=$(expr $cpu_count - 1) 5 | 6 | python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ./WORDS.txt WORDS_VECS.txt 7 | -------------------------------------------------------------------------------- /compute_words_freq.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cpu_count=`nproc --all` 4 | process_count=$(expr $cpu_count - 1) 5 | 6 | python ./spacy-dev-resources/training/plain_word_freqs.py -n ${process_count} token_cleaned_plain_files WORDS_FREQ.txt 7 | -------------------------------------------------------------------------------- /train_model.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m spacy train zh spacy_models/dependency_model corpus/spacy/zh-simplified-ud-train.json corpus/spacy/zh-simplified-ud-dev.json --pipeline tagger,parser -v spacy_models/base_model -m meta.json -V 0.1.0 -n 1 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "spacy-dev-resources"] 2 | path = spacy-dev-resources 3 | url = https://github.com/howl-anderson/spacy-dev-resources.git 4 | [submodule "third-part/brown-cluster"] 5 | path = third-part/brown-cluster 6 | url = https://github.com/howl-anderson/brown-cluster.git 7 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /convert_UD_Chinese-GSD_corpus.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd corpus/UD_Chinese-GSD-master 4 | 5 | opencc -i zh_gsd-ud-train.conllu -o zh-simplified-ud-train.conllu -c t2s.json 6 | opencc -i zh_gsd-ud-dev.conllu -o zh-simplified-ud-dev.conllu -c t2s.json 7 | opencc -i zh_gsd-ud-test.conllu -o zh-simplified-ud-test.conllu -c t2s.json 8 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /format_convertor.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p corpus/spacy 4 | 5 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-train.conllu corpus/spacy 6 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-dev.conllu corpus/spacy 7 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-test.conllu corpus/spacy 8 | -------------------------------------------------------------------------------- /update_model_meta.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def main(): 5 |     with open("./spacy_models/base_model/meta.json") as fd: 6 |         data = json.load(fd) 7 | 8 |     data["name"] = "core_web_sm" 9 | 10 |     with open("./spacy_models/base_model/meta.json", "wt") as fd: 11 |         json.dump(data, fd) 12 | 13 | 14 | if __name__ == "__main__": 15 |     main() 16 | -------------------------------------------------------------------------------- /test_init_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from spacy import displacy 4 | import spacy 5 | 6 | nlp = spacy.load('zh_model/') 7 | 8 | 9 | def main(): 10 |     doc = nlp("王小明在北京的清华大学读书") 11 |     for token in doc: 12 |         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 13 |               token.shape_, token.is_alpha, token.is_stop, token.has_vector, 14 |               token.vector_norm, token.is_oov) 15 | 16 |     displacy.serve(doc) 17 | 18 | 19 | if __name__ == "__main__": 20 |     main() 21 | -------------------------------------------------------------------------------- /merge_all_text_files.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import sys 3 | 4 | input_dir = sys.argv[1] 5 | input_path = pathlib.Path(input_dir) 6 | input_files = input_path.glob("*") 7 | 8 | output_file = sys.argv[2] 9 | output_path = pathlib.Path(output_file) 10 | 11 | 12 | with output_path.open("wt") as outfile: 13 |     for fname in input_files: 14 |         with fname.open("rt") as infile: 15 |             for line in infile: 16 |                 if not line.endswith("\n"): 17 |                     line = line + "\n" 18 |                 outfile.write(line) 19 | -------------------------------------------------------------------------------- /all_in_one.bash: -------------------------------------------------------------------------------- 1 | ./create_wikipedia_corpus.bash 2 | ./move_wikipedia_corpus.bash 3 | ./compute_words_freq.bash 4 | ./merge_all_text_files.bash 5 | ./download_and_compile_brown_cluster.bash 6 | ./compute_plain_word_vec.bash 7
| ./create_init_model.bash 8 | ./update_model_meta.py 9 | ./download_UD_Chinese-GSD_corpus.bash 10 | ./extract_UD_Chinese-GSD_corpus.bash 11 | ./convert_UD_Chinese-GSD_corpus.bash 12 | ./format_convertor.bash 13 | ./init_model.bash 14 | ./train_model.bash 15 | ./onto_to_spacy_json.bash 16 | ./train_ner.bash 17 | ./merge_submodel.py 18 | -------------------------------------------------------------------------------- /test_dependency_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from spacy import displacy 4 | import spacy 5 | 6 | nlp = spacy.load('depedency_model/model-final/') 7 | 8 | 9 | def main(): 10 | doc = nlp("王小明在北京的清华大学读书") 11 | for token in doc: 12 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 13 | token.shape_, token.is_alpha, token.is_stop, token.has_vector, 14 | token.vector_norm, token.is_oov) 15 | 16 | displacy.serve(doc) 17 | 18 | 19 | if __name__ == "__main__": 20 | main() 21 | -------------------------------------------------------------------------------- /meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "core_web_sm", 3 | "version": "0.1.0", 4 | "license": "CC BY-SA 3.0", 5 | "author": "Xiaoquan Kong", 6 | "url": "https://xiaoquankong.ai", 7 | "sources": [ 8 | "OntoNotes 5", 9 | "Common Crawl", 10 | "Universal Dependencies" 11 | ], 12 | "email": "u1mail2me@gmail.com", 13 | "description": "Chinese multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities." 14 | } 15 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from spacy import displacy 4 | 5 | import zh_core_web_sm 6 | 7 | nlp = zh_core_web_sm.load() 8 | 9 | 10 | def main(): 11 | doc = nlp("王小明在北京的清华大学读书") 12 | for token in doc: 13 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 14 | token.shape_, token.is_alpha, token.is_stop, token.has_vector, 15 | token.ent_iob_, token.ent_type_, 16 | token.vector_norm, token.is_oov) 17 | 18 | # displacy.serve(doc) 19 | 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /test_as_model_dir.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from spacy import displacy 4 | import spacy 5 | 6 | nlp = spacy.load("./spacy_models/final_model") 7 | 8 | 9 | def main(): 10 | doc = nlp("王小明在北京的清华大学读书") 11 | for token in doc: 12 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 13 | token.shape_, token.is_alpha, token.is_stop, token.has_vector, 14 | token.ent_iob_, token.ent_type_, 15 | token.vector_norm, token.is_oov) 16 | 17 | # displacy.serve(doc) 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /test_ner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from spacy import displacy 4 | import spacy 5 | 6 | nlp = spacy.load('ner_model/model-final') 7 | 8 | 9 | def main(): 10 | doc = nlp("王小明在北京的清华大学读书") 11 | for token in doc: 12 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 13 | 
token.shape_, token.is_alpha, token.is_stop, token.has_vector, 14 | token.ent_iob_, token.ent_type_, 15 | token.vector_norm, token.is_oov) 16 | 17 | displacy.serve(doc, style='ent') 18 | 19 | 20 | if __name__ == "__main__": 21 | main() 22 | -------------------------------------------------------------------------------- /test_load.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from spacy import displacy 4 | 5 | import spacy 6 | 7 | nlp = spacy.load('package_templete/zh_core_web_sm-2.0.5/zh_core_web_sm/zh_core_web_sm-2.0.5') 8 | 9 | 10 | def main(): 11 | doc = nlp("王小明在北京的清华大学读书") 12 | for token in doc: 13 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, 14 | token.shape_, token.is_alpha, token.is_stop, token.has_vector, 15 | token.ent_iob_, token.ent_type_, 16 | token.vector_norm, token.is_oov) 17 | 18 | # displacy.serve(doc) 19 | 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Xiaoquan Kong 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.idea/Chinese_models_for_SpaCy.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 21 | -------------------------------------------------------------------------------- /merge_submodel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import shutil 4 | import json 5 | from pathlib import Path 6 | 7 | 8 | def read_pipeline(meta_file): 9 | with open(meta_file) as fd: 10 | data = json.load(fd) 11 | return data["pipeline"] 12 | 13 | 14 | def update_pipeline(meta_file, pipeline): 15 | with open(meta_file) as fd: 16 | data = json.load(fd) 17 | 18 | data["pipeline"] = pipeline 19 | 20 | with open(meta_file, "w") as fd: 21 | json.dump(data, fd) 22 | 23 | 24 | def copy_tree(src: Path, dst: Path, folder: str): 25 | shutil.copytree(src / folder, dst / folder) 26 | 27 | 28 | def main(): 29 | target_dir = Path("./spacy_models/final_model") 30 | target_dir.mkdir(exist_ok=True) 31 | 32 | pipeline = [] 33 | 34 | source_dir = Path("./spacy_models/dependency_model/model-best") 35 | copy_tree(source_dir, target_dir, "parser") 36 | copy_tree(source_dir, target_dir, "tagger") 37 | copy_tree(source_dir, target_dir, "vocab") 38 | 39 | pipeline.extend(read_pipeline(source_dir / "meta.json")) 40 | 41 | source_dir = Path("./spacy_models/ner_model/model-best") 42 | copy_tree(source_dir, target_dir, "ner") 43 | shutil.copy(source_dir / "meta.json", target_dir / "meta.json") 44 | 45 | pipeline.extend(read_pipeline(source_dir / "meta.json")) 46 | 47 | update_pipeline(target_dir / "meta.json", pipeline) 48 | 49 | 50 | if __name__ == "__main__": 51 | main() 52 | -------------------------------------------------------------------------------- /plain_word_vectors.py: -------------------------------------------------------------------------------- 1 | import plac 2 | import gensim 3 | from gensim import utils 4 | 5 | 6 | class Corpus: 7 | def __init__(self, corpus_file): 8 | self.corpus_file = corpus_file 9 | 10 | def __iter__(self): 11 | with open(self.corpus_file) as fd: 12 | for line in fd: 13 | yield utils.simple_preprocess(line) 14 | 15 | 16 | @plac.annotations( 17 | in_dir=("Location of input directory"), 18 | out_loc=("Location of output file"), 19 | n_workers=("Number of workers", "option", "n", int), 20 | size=("Dimension of the word vectors", "option", "d", int), 21 | window=("Context window size", "option", "w", int), 22 | min_count=("Min count", "option", "m", int), 23 | negative=("Number of negative samples", "option", "g", int), 24 | nr_iter=("Number of iterations", "option", "i", int), 25 | ) 26 | def main( 27 | in_dir, 28 | out_loc, 29 | negative=5, 30 | n_workers=4, 31 | window=5, 32 | size=128, 33 | min_count=10, 34 | nr_iter=2, 35 | ): 36 | sentences = Corpus(in_dir) 37 | model = gensim.models.Word2Vec( 38 | sentences=sentences, 39 | size=size, 40 | window=window, 41 | min_count=min_count, 42 | workers=n_workers, 43 | sample=1e-5, 44 | negative=negative, 45 | iter=nr_iter, 46 | ) 47 | model.wv.save_word2vec_format(out_loc, binary=False) 48 | 49 | 50 | if __name__ == "__main__": 51 | plac.call(main) 52 | -------------------------------------------------------------------------------- /.images/attributes_of_doc.html: -------------------------------------------------------------------------------- 1 |
2 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 |
textlemma_pos_tag_dep_shape_is_alphais_stophas_vectorvector_normis_oov
0王小明王小明XNNPnsubjxxxTrueFalseTrue0.392991True
1XVVaclxTrueFalseTrue7.318524False
2北京北京XNNPdetxxTrueFalseTrue10.940736False
3XDECcase:decxTrueFalseTrue6.201293False
4清华大学清华大学XNNPobjxxxxTrueFalseTrue12.044737False
5读书读书XVVROOTxxTrueFalseTrue11.602811False
119 |
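The table above is the rendered output of a pandas `DataFrame` built from the token attributes of the demo sentence; a rough sketch of how such a table can be regenerated, assuming the packaged `zh_core_web_sm` model and `pandas` are installed:

```python
# Sketch: rebuild the token-attribute table shown above
# (assumes the zh_core_web_sm package and pandas are installed).
import pandas as pd
import zh_core_web_sm

nlp = zh_core_web_sm.load()
doc = nlp("王小明在北京的清华大学读书")

columns = ["text", "lemma_", "pos_", "tag_", "dep_", "shape_",
           "is_alpha", "is_stop", "has_vector", "vector_norm", "is_oov"]
rows = [[getattr(token, name) for name in columns] for token in doc]
df = pd.DataFrame(rows, columns=columns)
print(df.to_html())  # produces an HTML table similar to this file
```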
120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [README written in English](README.en-US.md) 2 | ------------------------------ 3 | # SpaCy 官方中文模型已经上线(https://spacy.io/models/zh), 参考了本项目,具有相同的特性。本项目『推动 SpaCy 中文模型开发』的使命已经完成,本项目将进入维护状态,后续更新将只进行 bug 修复,感谢各位用户长期的关注和支持。 4 | 5 | # SpaCy 中文模型 6 | 7 | 为 SpaCy 提供的中文数据模型. 模型目前还处于 beta 公开测试的状态 。 8 | 9 | ## 在线演示 10 | 11 | 基于 Jupyter notebook 的在线演示在 [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/howl-anderson/Chinese_models_for_SpaCy/master?filepath=notebooks%2Fdemo.ipynb)。 12 | 13 | ### 特性 14 | 部分 `王小明在北京的清华大学读书` 这个 `Doc` 对象的属性信息: 15 | 16 | ![attributes_of_doc](.images/attributes_of_doc.png) 17 | 18 | ### NER (**New!**) 19 | 部分 `王小明在北京的清华大学读书` 这个 `Doc` 对象的 NER 信息: 20 | 21 | ![ner_of_doc](.images/ner_of_doc.png) 22 | 23 | ## 开始使用 24 | 25 | 模型用二进制文件的形式进行分发, 用户应该具备基础的 SpaCy (version > 2) 的基础知识. 26 | 27 | ### 系统要求 28 | 29 | Python 3 (也许支持 python2, 但未经过良好测试) 30 | 31 | ### 安装 32 | 33 | #### 下载模型 34 | 从 [releases](https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases) 页面下载模型 (**New!** 为中国地区的用户提供了加速下载的链接)。假设所下载的模型名为 `zh_core_web_sm-2.x.x.tar.gz`。 35 | 36 | #### 安装模型 37 | 38 | ``` 39 | pip install zh_core_web_sm-2.x.x.tar.gz 40 | ``` 41 | 42 | 为了方便后续在 Rasa NLU 等框架中使用,需要再为这个模型建立一个链接,by 执行以下命令: 43 | 44 | ```bash 45 | spacy link zh_core_web_sm zh 46 | ``` 47 | 48 | 运行完成后就可以使用 zh 这个别名来访问这个模型了。 49 | 50 | ## 运行 Demo 代码 51 | 52 | Demo 代码位于 `test.py`. 在安装好模型后,用户下载或者克隆本仓库的代码,然后可以直接执行 53 | 54 | ```bash 55 | python3 ./test.py 56 | ``` 57 | 58 | 打开地址 `http://127.0.0.1:5000`, 将看到如下: 59 | 60 | ![Dependency of doc](.images/dependency_of_doc.png) 61 | 62 | ## 如何从零构造这个模型 63 | 64 | 见 [workflow](workflow.md) 65 | 66 | ## 语料库 67 | 本项目使用的语料库是 OntoNotes 5.0。 68 | 69 | 由于 OntoNotes 5.0 是 LDC ([Linguistic Data Consortium](https://www.ldc.upenn.edu/)) 的版权材料,无法直接包含在本项目中。好消息是,OntoNotes 5.0 对于 团体用户(包含企业和学术组织)是完全免费的。用户可以建立一个企业或者学术组织账号,然后免费获取 OntoNotes 5.0。 70 | 71 | ## TODO list 72 | 73 | * 属性 `pos_` 不正确. 这个和 SpaCy 中中文语言 Class 相关。 74 | * 属性 `shape_` and `is_alpha` 似乎对中文并无意义, 但需要权威信息确认一下. 75 | * 属性 `is_stop` 不正确. 这个和 SpaCy 中中文语言 Class 相关。 76 | * 属性 `vector` 似乎没有训练的很好。 77 | * 属性 `is_oov` 完全错误. 第一优先级修复。 78 | * NER 模型,因为缺少 LDC 语料库,目前不可用. 正在解决中正在训练中。 79 | * 将训练中所用的中间结果 release 出来, 方便用户自行定制模型 80 | 81 | ## 使用的组件 82 | 83 | * TODO 84 | 85 | ## 如何贡献 86 | 87 | 请阅读 [CONTRIBUTING.md](https://gist.github.com/PurpleBooth/b24679402957c63ec426) , 然后提交 pull requests 给我们. 88 | 89 | ## 版本化控制 90 | 91 | 我们使用 [SemVer](http://semver.org/) 做版本化的标准. 查看 `tags` 以了解所有的版本. 92 | 93 | ## 作者 94 | 95 | * **Xiaoquan Kong** - *Initial work* - [howl-anderson](https://github.com/howl-anderson) 96 | 97 | 更多贡献者信息,请参考 `contributors`. 98 | 99 | ## 版权 100 | 101 | MIT License - 详见 [LICENSE.md](LICENSE.md) 102 | 103 | ## 致谢 104 | 105 | * TODO 106 | -------------------------------------------------------------------------------- /.images/temp.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 |
9 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 |
textlemma_pos_tag_dep_shape_is_alphais_stophas_vectorvector_normis_oov
0王小明王小明XNNPnsubjxxxTrueFalseTrue0.392991True
1XVVaclxTrueFalseTrue7.318524False
2北京北京XNNPdetxxTrueFalseTrue10.940736False
3XDECcase:decxTrueFalseTrue6.201293False
4清华大学清华大学XNNPobjxxxxTrueFalseTrue12.044737False
5读书读书XVVROOTxxTrueFalseTrue11.602811False
126 |
127 | 128 | -------------------------------------------------------------------------------- /workflow.md: -------------------------------------------------------------------------------- 1 | # SpaCy Chinese model training workflow 2 | 3 | ## get preprocessed Chinese Wikipedia corpus 4 | see project [chinese-wikipedia-corpus-creator](https://github.com/howl-anderson/chinese-wikipedia-corpus-creator) for more details. 5 | 6 | ### produce wikipedia corpus ### 7 | * input: - 8 | * output: `token_cleaned_plain_files/` 9 | * script: `create_wikipedia_corpus.bash` 10 | 11 | ### copy corpus to workspace ### 12 | * input: `chinese-wikipedia-corpus-creator/token_cleaned_plain_files/`` 13 | * output: `token_cleaned_plain_files/` 14 | * script: `move_wikipedia_corpus.bash` 15 | 16 | ## computing word frequency 17 | * input: `token_cleaned_plain_files/*` 18 | * output: `WORDS_FREQ.txt` 19 | * script: `compute_words_freq.bash` 20 | 21 | ## merge all files into one 22 | * input: `token_cleaned_plain_files/*` 23 | * output: `WORDS.txt` 24 | * script: `merge_all_text_files.bash` 25 | 26 | ## compute brown cluster 27 | ### brown cluster computing software 28 | Official software is [brown-cluster](https://github.com/percyliang/brown-cluster). 29 | 30 | ### install 31 | * input: - 32 | * output: `` 33 | * script: `download_and_compile_brown_cluster.bash` 34 | 35 | ### computing 36 | * input: `WORDS.txt` 37 | * output: `WORDS-c1000-p1.out/*` 38 | * script: `compute_brown_cluster.bash` 39 | 40 | ## compute word vector 41 | * input: `token_cleaned_plain_files/*` 42 | * output: `WORDS_VECS.txt` 43 | * script: `compute_plain_word_vec.bash` 44 | 45 | ## initial SpaCy model 46 | 47 | ### build base model 48 | * input: `./WORDS-c1000-p1.out/paths WORDS_VECS.txt WORDS_FREQ.txt` 49 | * output: `spacy_models/base_model/**/*` 50 | * script: `create_init_model.bash` 51 | 52 | ### modify model name 53 | * input: `spacy_models/base_model/meta.json` 54 | * output: `spacy_models/base_model/meta.json` 55 | * script: `update_model_meta.py` 56 | 57 | 58 | ## getting UD_Chinese-GSD corpus 59 | 60 | ### download 61 | * input: - 62 | * output: `corpus/UD_Chinese-GS.zip` 63 | * script: `download_UD_Chinese-GSD_corpus.bash` 64 | 65 | ### extracting 66 | * input: `corpus/UD_Chinese-GSd.zip` 67 | * output: `corpus/UD_Chinese-GSd` 68 | * script: `extract_UD_Chinese-GSD_corpus.bash` 69 | 70 | ### convert to simplified Chinese 71 | * input: `corpus/UD_Chinese-GSd/zh-ud-*.conllu` 72 | * output: `corpus/UD_Chinese-GSd/zh-simplified-ud-*.conllu` 73 | * script: `convert_UD_Chinese-GSD_corpus.bash` 74 | 75 | ## convert UD corpus format 76 | * input: `.corpus/UD_Chinese-GSD/zh-simplified-ud-*.conllu` 77 | * output: `corpus/spacy/zh-simplified-ud-*.conllu` 78 | * script: `format_convertor.bash` 79 | 80 | ## init spacy model with word vector & word cluster & word frquence 81 | * input: `WORDS_FREQ.txt`, `WORDS-c1000-p1.out/paths`, `WORDS_VECS.txt` 82 | * output: `zh_model/*` 83 | * script: `init_model.bash` 84 | 85 | ## train SpaCy model for POS and dependency parser 86 | * input: `zh_model corpus/spacy/zh-simplified-ud-*.conllu` 87 | * output: `dependency_model` 88 | * script: `train_model.bash` 89 | 90 | ## translate onotNote 5 to spacy json file 91 | * input: `TODO` 92 | * output: `TODO` 93 | * script: `onto_to_spacy_json.bash` 94 | 95 | ## train SpaCy model for NER parser 96 | * input: `zh_model china_ner_train.json china_ner_eval.json` 97 | * output: `ner_model` 98 | * script: `train_ner.bash` 99 | 100 | ## merge sub-model 101 | * input: 
`spacy_models/dependency_model`, `spacy_models/ner_model` 102 | * output: `spacy_models/final_model` 103 | * script: `merge_submodel.py` 104 | 105 | ## create package 106 | * input: `spacy_models/final_model/` 107 | * output: `spacy_package/` 108 | * script: `./create_model_package.bash` 109 | -------------------------------------------------------------------------------- /README.en-US.md: -------------------------------------------------------------------------------- 1 | [Chinese version of this README](README.zh-Hans.md) 2 | ------------------------------ 3 | 4 | # The official Chinese model for SpaCy is now available at (https://spacy.io/models/zh). It was developed with reference to this project and shares the same features. As the goal of this project — “promoting the development of the SpaCy Chinese model” — has been achieved, this repository will enter maintenance mode. Future updates will focus only on bug fixes. We would like to thank all users for their long-term attention and support. 5 | 6 | # Chinese models for SpaCy 7 | 8 | SpaCy (version > 2) models for the Chinese language. These models are rough and still a **work in progress**, but "something is better than nothing". 9 | 10 | ## Online demo 11 | 12 | An online Jupyter notebook demo is provided at [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/howl-anderson/Chinese_models_for_SpaCy/master?filepath=notebooks%2Fdemo.ipynb). 13 | 14 | ### Features 15 | 16 | Partial attributes of a `Doc` object for `王小明在北京的清华大学读书`: 17 | 18 | ![attributes_of_doc](.images/attributes_of_doc.png) 19 | 20 | ### NER (**New!**) 21 | NER of a `Doc` object for `王小明在北京的清华大学读书`: 22 | 23 | ![ner_of_doc](.images/ner_of_doc.png) 24 | 25 | ## Getting Started 26 | 27 | Models are released as binary files; users should have a basic knowledge of SpaCy version 2+. 28 | 29 | ### Prerequisites 30 | 31 | Python 3 (Python 2 may work, but it is currently not well tested) 32 | 33 | ### Installing 34 | 35 | Download the released model from the `releases` page. 36 | 37 | ``` 38 | wget -c https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases/download/v2.0.4/zh_core_web_sm-2.0.4.tar.gz 39 | ``` 40 | 41 | then install the model: 42 | 43 | ``` 44 | pip install zh_core_web_sm-2.0.4.tar.gz 45 | ``` 46 | 47 | 48 | ## Running demo code 49 | 50 | `test.py` contains the demo code. After installing the model, download or clone this repo and then execute: 51 | 52 | ```bash 53 | python3 ./test.py 54 | ``` 55 | 56 | then open a web browser at `http://127.0.0.1:5000`; you will see an image similar to this: 57 | 58 | ![Dependency of doc](.images/dependency_of_doc.png) 59 | 60 | ## How to reproduce the model 61 | 62 | See [workflow](workflow.md) 63 | 64 | ## Corpus Data 65 | The corpus data used in this project is OntoNotes 5.0. 66 | 67 | Since OntoNotes 5.0 is copyrighted material of the LDC ([Linguistic Data Consortium](https://www.ldc.upenn.edu/)), this project cannot include the data directly. The good news is that OntoNotes 5.0 is free for organizational users (companies and academic institutions): you can set up an account for your organization and then obtain OntoNotes 5.0 at no cost. 68 | 69 | 70 | ## TODO list 71 | 72 | * Attribute `pos_` is not working correctly. This is related to the Language class in SpaCy. 73 | * Attributes `shape_` and `is_alpha` seem meaningless for Chinese; this needs to be confirmed. 74 | * Attribute `is_stop` is not working correctly. This is related to the Language class in SpaCy. 75 | * Attribute `vector` does not seem to be well trained. 76 | * Attribute `is_oov` is totally incorrect. First priority.
77 | * NER model is not available due to lacking of LDC corpus. I am working on it. 78 | * Release all the intermediate material to help user build own model 79 | 80 | ## Built With 81 | 82 | * TODO 83 | 84 | ## Contributing 85 | 86 | Please read [CONTRIBUTING.md](https://gist.github.com/PurpleBooth/b24679402957c63ec426) for details on our code of conduct, and the process for submitting pull requests to us. 87 | 88 | ## Versioning 89 | 90 | We use [SemVer](http://semver.org/) for versioning. For the versions available, see the `tags` on this repository. 91 | 92 | ## Authors 93 | 94 | * **Xiaoquan Kong** - *Initial work* - [howl-anderson](https://github.com/howl-anderson) 95 | 96 | See also the list of `contributors` who participated in this project. 97 | 98 | ## License 99 | 100 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details 101 | 102 | ## Acknowledgments 103 | 104 | * TODO 105 | -------------------------------------------------------------------------------- /.images/dependency_of_doc.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 王小明 5 | X 6 | 7 | 8 | 9 | 10 | X 11 | 12 | 13 | 14 | 北京 15 | X 16 | 17 | 18 | 19 | 20 | X 21 | 22 | 23 | 24 | 清华大学 25 | X 26 | 27 | 28 | 29 | 读书 30 | X 31 | 32 | 33 | 34 | 35 | 36 | nsubj 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | acl 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | det 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | case:dec 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | obj 69 | 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | # General 106 | .DS_Store 107 | .AppleDouble 108 | .LSOverride 109 | 110 | # Icon must end with two \r 111 | Icon 112 | 113 | 114 | # Thumbnails 115 | ._* 116 | 117 | # Files that might appear in the root of a volume 118 | .DocumentRevisions-V100 119 | .fseventsd 120 | .Spotlight-V100 121 | .TemporaryItems 122 | .Trashes 123 | .VolumeIcon.icns 124 | .com.apple.timemachine.donotpresent 125 | 126 | # Directories potentially created on remote AFP share 127 | .AppleDB 128 | .AppleDesktop 129 | Network Trash Folder 130 | Temporary Items 131 | .apdisk 132 | # -*- mode: gitignore; -*- 133 | *~ 134 | \#*\# 135 | /.emacs.desktop 136 | /.emacs.desktop.lock 137 | *.elc 138 | auto-save-list 139 | tramp 140 | .\#* 141 | 142 | # Org-mode 143 | .org-id-locations 144 | *_archive 145 | 146 | # flymake-mode 147 | *_flymake.* 148 | 149 | # eshell files 150 | /eshell/history 151 | /eshell/lastdir 152 | 153 | # elpa packages 154 | /elpa/ 155 | 156 | # reftex files 157 | *.rel 158 | 159 | # AUCTeX auto folder 160 | /auto/ 161 | 162 | # cask packages 163 | .cask/ 164 | dist/ 165 | 166 | # Flycheck 167 | flycheck_*.el 168 | 169 | # server auth directory 170 | /server/ 171 | 172 | # projectiles files 173 | .projectile 174 | 175 | # directory configuration 176 | .dir-locals.el 177 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 178 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 179 | 180 | # User-specific stuff 181 | .idea/**/workspace.xml 182 | .idea/**/tasks.xml 183 | .idea/**/dictionaries 184 | .idea/**/shelf 185 | 186 | # Sensitive or high-churn files 187 | .idea/**/dataSources/ 188 | .idea/**/dataSources.ids 189 | .idea/**/dataSources.local.xml 190 | .idea/**/sqlDataSources.xml 191 | .idea/**/dynamic.xml 192 | .idea/**/uiDesigner.xml 193 | 194 | # Gradle 195 | .idea/**/gradle.xml 196 | .idea/**/libraries 197 | 198 | # CMake 199 | cmake-build-debug/ 200 | cmake-build-release/ 201 | 202 | # Mongo Explorer plugin 203 | .idea/**/mongoSettings.xml 204 | 205 | # File-based project format 206 | *.iws 207 | 208 | # IntelliJ 209 | out/ 210 | 211 | # mpeltonen/sbt-idea plugin 212 | .idea_modules/ 213 | 214 | # JIRA plugin 215 | atlassian-ide-plugin.xml 216 | 217 | # Cursive Clojure plugin 218 | .idea/replstate.xml 219 | 220 | # Crashlytics plugin (for Android Studio and IntelliJ) 221 | 
com_crashlytics_export_strings.xml 222 | crashlytics.properties 223 | crashlytics-build.properties 224 | fabric.properties 225 | 226 | # Editor-based Rest Client 227 | .idea/httpRequests 228 | 229 | corpus/* 230 | 231 | zh_wiki_core_sm/* 232 | zh_wiki_core/* 233 | zh_model/ 234 | 235 | WORDS.* 236 | WORDS_* 237 | WORDS-*/ 238 | package_templete/ 239 | 240 | -------------------------------------------------------------------------------- /onto_to_spacy_json.py: -------------------------------------------------------------------------------- 1 | import json # for tuple support 2 | import plac 3 | import os 4 | import re 5 | from spacy.gold import biluo_tags_from_offsets 6 | import spacy 7 | 8 | nlp = spacy.blank("xx") 9 | from tqdm import tqdm 10 | import random 11 | 12 | 13 | def get_root_filename(onto_dir): 14 | name_files = [] 15 | for dirpath, subdirs, files in os.walk(onto_dir): 16 | for fname in files: 17 | if bool(re.search(".name", fname)): 18 | fn = os.path.join(dirpath, fname) 19 | fn = re.sub("\.name", "", fn) 20 | name_files.append(fn) 21 | return name_files 22 | 23 | 24 | def split_sentence(text): 25 | text = text.strip().split("\n")[1:-1] 26 | return text 27 | 28 | 29 | def split_doc(text): 30 | text_list = text.strip().split("\s', t)[0] for t in text_list] 32 | text_list = [re.sub('', "", t).strip() for t in text_list] 33 | return ids, text_list 34 | 35 | 36 | def clean_ent(ent): 37 | tag = re.findall('TYPE="(.+?)"', ent)[0] 38 | text = re.findall(">(.+)", ent)[0] 39 | text = re.sub("\$", "\$", text) 40 | return (text, tag) 41 | 42 | 43 | def raw_text(text): 44 | """Remove entity tags""" 45 | text = re.sub("", "", text) 46 | text = re.sub("", "", text) 47 | return text 48 | 49 | 50 | def ent_position(ents, text): 51 | search_point = 0 52 | spacy_ents = [] 53 | for ent in ents: 54 | remain_text = text[search_point:] 55 | ma = re.search(ent[0], remain_text) 56 | ent_tup = (ma.start() + search_point, ma.end() + search_point, ent[1]) 57 | spacy_ents.append(ent_tup) 58 | 59 | # update search point to prevent same word in different entity, 60 | # it will cause bug which hard to debug 61 | search_point = search_point + ma.end() 62 | return spacy_ents 63 | 64 | 65 | def text_to_spacy(markup): 66 | raw_ents = re.findall("", markup) 67 | ents = [clean_ent(raw_ent) for raw_ent in raw_ents] 68 | text = raw_text(markup) 69 | spacy_ents = ent_position(ents, text) 70 | final = (text, {"entities": spacy_ents}) 71 | return final 72 | 73 | 74 | def onf_to_raw(onf_file): 75 | """ 76 | Take in a path to a .onf Ontonotes file. Return the raw text (as much as possible). 77 | The quotes are usually quite messed up, so this is not going to look like real input text. 
78 | """ 79 | with open(onf_file, "r") as f: 80 | onf = f.read() 81 | sentences = re.findall( 82 | "Plain sentence\:\n\-+?\n(.+?)Treebanked sentence", onf, re.DOTALL 83 | ) 84 | sentences = [re.sub("\n+?\s*", " ", i).strip() for i in sentences] 85 | paragraph = " ".join(sentences) 86 | return paragraph 87 | 88 | 89 | def name_to_sentences(ner_filename): 90 | """ 91 | Take a .name file and return a sentence list of the kind described here: 92 | https://github.com/explosion/spacy/blob/master/examples/training/training-data.json 93 | """ 94 | with open(ner_filename, "r") as f: 95 | doc = f.read() 96 | 97 | sentences = [] 98 | onto_sents = split_sentence(doc) 99 | for sent in onto_sents: 100 | offsets = text_to_spacy(sent) 101 | doc = nlp(offsets[0]) 102 | tags = biluo_tags_from_offsets(doc, offsets[1]["entities"]) 103 | ner_info = list(zip(doc, tags)) 104 | tokens = [] 105 | for n, i in enumerate(ner_info): 106 | token = { 107 | "head": 0, 108 | "dep": "", 109 | "tag": "", 110 | "orth": i[0].string, 111 | "ner": i[1], 112 | "id": n, 113 | } 114 | tokens.append(token) 115 | sentences.append({"tokens": tokens}) 116 | return sentences 117 | 118 | 119 | def dir_to_annotation(onto_dir): 120 | fns = get_root_filename(onto_dir) 121 | all_annotations = [] 122 | 123 | for fn in tqdm(fns): 124 | ner_filename = fn + ".name" 125 | onf_filename = fn + ".onf" 126 | 127 | try: 128 | raw = onf_to_raw(onf_filename) 129 | sentences = name_to_sentences(ner_filename) 130 | final = {"id": "fake", "paragraphs": [{"raw": raw, "sentences": sentences}]} 131 | all_annotations.append(final) 132 | except Exception as e: 133 | print("Error formatting ", fn, e) 134 | return all_annotations 135 | 136 | 137 | @plac.annotations( 138 | onto_dir=("Directory of OntoNotes data to traverse", "option", "i", str), 139 | train_file=("File to write training spaCy JSON out to", "option", "t", str), 140 | val_file=("File to write validation spaCy JSON out to", "option", "e", str), 141 | val_split=("Percentage to use for evaluation", "option", "v", float), 142 | ) 143 | def main(onto_dir, train_file, val_file, val_split=0.75): 144 | print("Reading and formatting annotations") 145 | all_annotations = dir_to_annotation(onto_dir) 146 | random.shuffle(all_annotations) 147 | cutpoint = round(val_split * len(all_annotations)) 148 | val = all_annotations[:cutpoint] 149 | train = all_annotations[cutpoint:] 150 | 151 | print( 152 | "Saving {0} training examples and {1} validation examples".format( 153 | len(train), len(val) 154 | ) 155 | ) 156 | with open(train_file, "w") as f: 157 | json.dump(train, f, ensure_ascii=False, indent=4) 158 | with open(val_file, "w") as f: 159 | json.dump(val, f, ensure_ascii=False, indent=4) 160 | 161 | 162 | if __name__ == "__main__": 163 | plac.call(main) 164 | -------------------------------------------------------------------------------- /create_jsonl_vocabulary.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import string 4 | from ast import literal_eval 5 | from pathlib import Path 6 | 7 | import ftfy 8 | import jsonlines 9 | import plac 10 | import validators 11 | from preshed.counter import PreshCounter 12 | from spacy.lang.en import stop_words as en_stop_words 13 | from spacy.lang.zh import stop_words as zh_stop_words 14 | from tqdm import tqdm 15 | 16 | 17 | class Word: 18 | counter = -1 19 | 20 | def __init__(self, word_str, cluster, probs): 21 | self._word = word_str 22 | self._cluster = cluster 23 | self._probs = probs 24 | 25 | 
chinese_punct = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." 26 | self._punct_list = list(set(string.punctuation + chinese_punct)) 27 | 28 | chinese_whitespace = "" 29 | self._whitespace_list = list(set(string.whitespace + chinese_whitespace)) 30 | 31 | english_stopword = en_stop_words.STOP_WORDS 32 | chinese_stopword = zh_stop_words.STOP_WORDS 33 | self._stopword_list = {*english_stopword, *chinese_stopword} 34 | 35 | chinese_quote = "“”‘’" 36 | english_quote = "\"'" 37 | self._qute_list = list(set(english_quote + chinese_quote)) 38 | 39 | chinese_left_punct = "<([{" 40 | english_left_punct = "<([「『【〔〖〘〚{" 41 | self._left_punct_list = list(set(english_left_punct + chinese_left_punct)) 42 | 43 | chinese_right_punct = ">)]}" 44 | english_right_punct = ">)]」』】〕〗〙〛}" 45 | self._right_punct_list = list(set(english_right_punct + chinese_right_punct)) 46 | 47 | @property 48 | def orth(self): 49 | return self._word 50 | 51 | @property 52 | def id(self): 53 | self.__class__.counter += 1 54 | 55 | return self.__class__.counter 56 | 57 | @property 58 | def lower(self): 59 | return self._word.lower() 60 | 61 | @property 62 | def norm(self): 63 | return self._word 64 | 65 | @property 66 | def shape(self): 67 | return "".join(map(lambda x: "X" if x.isupper() else "x", self._word)) 68 | 69 | @property 70 | def prefix(self): 71 | return self._word[0] 72 | 73 | @property 74 | def suffix(self): 75 | return self._word[-1] 76 | 77 | @property 78 | def length(self): 79 | return len(self._word) 80 | 81 | @property 82 | def cluster(self): 83 | return self._cluster 84 | 85 | @property 86 | def prob(self): 87 | return self._probs.get(self, 0) 88 | 89 | @property 90 | def is_alpha(self): 91 | return self._word.isalpha() 92 | 93 | @property 94 | def is_ascii(self): 95 | # only for py 3.7 96 | # return self._word.isascii() 97 | try: 98 | self._word.encode('ascii') 99 | except UnicodeEncodeError: 100 | return False 101 | 102 | return True 103 | 104 | @property 105 | def is_digit(self): 106 | return self._word.isdigit() 107 | 108 | @property 109 | def is_lower(self): 110 | return self._word.islower() 111 | 112 | @property 113 | def is_punct(self): 114 | return self._word in self._punct_list 115 | 116 | @property 117 | def is_space(self): 118 | return self._word in self._whitespace_list 119 | 120 | @property 121 | def is_title(self): 122 | return self._word.istitle() 123 | 124 | @property 125 | def is_upper(self): 126 | return self._word.isupper() 127 | 128 | @property 129 | def like_url(self): 130 | return bool(validators.url(self._word)) 131 | 132 | @property 133 | def like_num(self): 134 | # TODO(howl-anderson): fix it later 135 | return False 136 | 137 | @property 138 | def like_email(self): 139 | return bool(validators.email(self._word)) 140 | 141 | @property 142 | def is_stop(self): 143 | return self._word in self._stopword_list 144 | 145 | @property 146 | def is_oov(self): 147 | return not self._word in self._probs 148 | 149 | @property 150 | def is_quote(self): 151 | return self._word in self._qute_list 152 | 153 | @property 154 | def is_left_punct(self): 155 | return self._word in self._left_punct_list 156 | 157 | @property 158 | def is_right_punct(self): 159 | return self._word in self._right_punct_list 160 | 161 | 162 | def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50): 163 | print("Counting frequencies...") 164 | counts = PreshCounter() 165 | total = 0 166 | with freqs_loc.open() as f: 167 | for i, line in enumerate(f): 168 | freq, doc_freq, key = 
line.rstrip().split("\t", 2) 169 | freq = int(freq) 170 | counts.inc(i + 1, freq) 171 | total += freq 172 | counts.smooth() 173 | log_total = math.log(total) 174 | probs = {} 175 | with freqs_loc.open() as f: 176 | for line in tqdm(f): 177 | freq, doc_freq, key = line.rstrip().split("\t", 2) 178 | doc_freq = int(doc_freq) 179 | freq = int(freq) 180 | if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: 181 | word = literal_eval(key) 182 | smooth_count = counts.smoother(int(freq)) 183 | probs[word] = math.log(smooth_count) - log_total 184 | oov_prob = math.log(counts.smoother(0)) - log_total 185 | return probs, oov_prob 186 | 187 | 188 | def read_clusters(clusters_loc): 189 | print("Reading clusters...") 190 | clusters = {} 191 | with clusters_loc.open() as f: 192 | for line in tqdm(f): 193 | try: 194 | cluster, word, freq = line.split() 195 | word = ftfy.fix_text(word) 196 | except ValueError: 197 | continue 198 | # If the clusterer has only seen the word a few times, its 199 | # cluster is unreliable. 200 | if int(freq) >= 3: 201 | clusters[word] = cluster 202 | else: 203 | clusters[word] = "0" 204 | # Expand clusters with re-casing 205 | for word, cluster in list(clusters.items()): 206 | if word.lower() not in clusters: 207 | clusters[word.lower()] = cluster 208 | if word.title() not in clusters: 209 | clusters[word.title()] = cluster 210 | if word.upper() not in clusters: 211 | clusters[word.upper()] = cluster 212 | return clusters 213 | 214 | 215 | @plac.annotations( 216 | lang=("model language", "positional", None, str), 217 | output_loc=("model output directory", "positional", None, str), 218 | freqs_loc=("location of words frequencies file", "positional", None, Path), 219 | clusters_loc=("location of brown clusters data", "positional", None, Path), 220 | ) 221 | def main(lang, output_loc, freqs_loc, clusters_loc): 222 | clusters = read_clusters(clusters_loc) 223 | probs, oov_prob = read_freqs(freqs_loc) 224 | 225 | with jsonlines.open(output_loc, mode="w") as writer: 226 | header = {"lang": lang, "settings": {"oov_prob": oov_prob}} 227 | 228 | writer.write(header) 229 | 230 | for word_str, cluster in clusters.items(): 231 | 232 | if not word_str: 233 | continue 234 | 235 | word = Word(word_str, cluster, probs) 236 | row = { 237 | "orth": word.orth, # the word text 238 | "id": word.id, # can correspond to row in vectors table 239 | "lower": word.lower, 240 | "norm": word.norm, 241 | "shape": word.shape, 242 | "prefix": word.prefix, 243 | "suffix": word.suffix, 244 | "length": word.length, 245 | "cluster": word.cluster, 246 | "prob": word.prob, 247 | "is_alpha": word.is_alpha, 248 | "is_ascii": word.is_ascii, 249 | "is_digit": word.is_digit, 250 | "is_lower": word.is_lower, 251 | "is_punct": word.is_punct, 252 | "is_space": word.is_space, 253 | "is_title": word.is_title, 254 | "is_upper": word.is_upper, 255 | "like_url": word.like_url, 256 | "like_num": word.like_num, 257 | "like_email": word.like_email, 258 | "is_stop": word.is_stop, 259 | "is_oov": word.is_oov, 260 | "is_quote": word.is_quote, 261 | "is_left_punct": word.is_left_punct, 262 | "is_right_punct": word.is_right_punct, 263 | } 264 | 265 | writer.write(row) 266 | 267 | 268 | if __name__ == "__main__": 269 | plac.call(main) 270 | -------------------------------------------------------------------------------- /notebooks/demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 
7 | "## install package" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n", 20 | "Processing ./zh_core_web_sm-2.0.3.tar.gz\n", 21 | "Requirement already satisfied: spacy>=2.0.0a18 in /home/howl/.local/lib/python3.5/site-packages (from zh-core-web-sm==2.0.3) (2.0.12)\n", 22 | "Requirement already satisfied: dill<0.3,>=0.2 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.2.8.2)\n", 23 | "Requirement already satisfied: thinc<6.11.0,>=6.10.3 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (6.10.3)\n", 24 | "Requirement already satisfied: numpy>=1.7 in /usr/local/lib/python3.5/dist-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.14.2)\n", 25 | "Requirement already satisfied: plac<1.0.0,>=0.9.6 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.6)\n", 26 | "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.5/dist-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2.18.4)\n", 27 | "Requirement already satisfied: cymem<1.32,>=1.30 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.31.2)\n", 28 | "Requirement already satisfied: ujson>=1.35 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.35)\n", 29 | "Requirement already satisfied: regex==2017.4.5 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2017.4.5)\n", 30 | "Requirement already satisfied: murmurhash<0.29,>=0.28 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.28.0)\n", 31 | "Requirement already satisfied: preshed<2.0.0,>=1.0.0 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.0.1)\n", 32 | "Requirement already satisfied: msgpack<1.0.0,>=0.5.6 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.5.6)\n", 33 | "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (4.24.0)\n", 34 | "Requirement already satisfied: six<2.0.0,>=1.10.0 in /usr/lib/python3/dist-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.10.0)\n", 35 | "Requirement already satisfied: msgpack-numpy<1.0.0,>=0.4.1 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.4.3.1)\n", 36 | "Requirement already satisfied: wrapt<1.11.0,>=1.10.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.10.11)\n", 37 | "Requirement already satisfied: cytoolz<0.10,>=0.9.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.0.1)\n", 38 | "Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2.6)\n", 39 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.5/dist-packages (from 
requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2018.4.16)\n", 40 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (3.0.4)\n", 41 | "Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.22)\n", 42 | "Requirement already satisfied: toolz>=0.8.0 in /home/howl/.local/lib/python3.5/site-packages (from cytoolz<0.10,>=0.9.0->thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.0)\n", 43 | "Building wheels for collected packages: zh-core-web-sm\n", 44 | " Running setup.py bdist_wheel for zh-core-web-sm ... \u001b[?25ldone\n", 45 | "\u001b[?25h Stored in directory: /home/howl/.cache/pip/wheels/04/63/ec/a66252a0376a1953722cb70a85c50aa4836311eca4d69f75f3\n", 46 | "Successfully built zh-core-web-sm\n", 47 | "Installing collected packages: zh-core-web-sm\n", 48 | " Found existing installation: zh-core-web-sm 2.0.2\n", 49 | " Uninstalling zh-core-web-sm-2.0.2:\n", 50 | " Successfully uninstalled zh-core-web-sm-2.0.2\n", 51 | "Successfully installed zh-core-web-sm-2.0.3\n", 52 | "\u001b[33mYou are using pip version 10.0.1, however version 18.0 is available.\n", 53 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "!pip install -q https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases/download/v2.2.X-0.1.0/zh_core_web_sm-0.1.0.tar.gz" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 1, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "!pip install -q pandas\n", 68 | "!pip install -q jieba\n", 69 | "!pip install -q tabulate" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## import packages" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "from spacy import displacy\n", 86 | "from tabulate import tabulate\n", 87 | "import pandas as pd\n", 88 | "\n", 89 | "import zh_core_web_sm" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "## load models" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 3, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "nlp = zh_core_web_sm.load()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## parse doc" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 4, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stderr", 122 | "output_type": "stream", 123 | "text": [ 124 | "Building prefix dict from the default dictionary ...\n", 125 | "Loading model from cache /tmp/jieba.cache\n", 126 | "Loading model cost 0.435 seconds.\n", 127 | "Prefix dict has been built successfully.\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "doc = nlp(\"王小明在北京的清华大学读书\")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## print doc's attributes" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "metadata": { 146 | "scrolled": false 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/html": [ 152 | "
\n", 153 | "\n", 166 | "\n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | "
textlemma_pos_tag_dep_shape_is_alphais_stophas_vectorent_iob_ent_type_vector_normis_oov
0王小明王小明XNNPnsubjxxxTrueFalseFalseBPERSON0.000000True
1VERBVVcasexTrueTrueTrueO6.573987False
2北京北京XNNPnmodxxTrueFalseTrueBGPE12.769391False
3PARTDECcase:decxTrueTrueTrueO6.886564False
4清华大学清华大学XNNPoblxxxxTrueFalseTrueBORG18.842812False
5读书读书VERBVVROOTxxTrueFalseTrueO18.138533False
\n", 284 | "
" 285 | ], 286 | "text/plain": [ 287 | " text lemma_ pos_ tag_ dep_ shape_ is_alpha is_stop has_vector \\\n", 288 | "0 王小明 王小明 X NNP nsubj xxx True False False \n", 289 | "1 在 在 VERB VV case x True True True \n", 290 | "2 北京 北京 X NNP nmod xx True False True \n", 291 | "3 的 的 PART DEC case:dec x True True True \n", 292 | "4 清华大学 清华大学 X NNP obl xxxx True False True \n", 293 | "5 读书 读书 VERB VV ROOT xx True False True \n", 294 | "\n", 295 | " ent_iob_ ent_type_ vector_norm is_oov \n", 296 | "0 B PERSON 0.000000 True \n", 297 | "1 O 6.573987 False \n", 298 | "2 B GPE 12.769391 False \n", 299 | "3 O 6.886564 False \n", 300 | "4 B ORG 18.842812 False \n", 301 | "5 O 18.138533 False " 302 | ] 303 | }, 304 | "execution_count": 5, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "headers = ['text', 'lemma_', 'pos_', 'tag_', 'dep_',\n", 311 | " 'shape_', 'is_alpha', 'is_stop', 'has_vector',\n", 312 | " 'ent_iob_', 'ent_type_',\n", 313 | " 'vector_norm', 'is_oov']\n", 314 | "\n", 315 | "doc_data = []\n", 316 | "\n", 317 | "for token in doc:\n", 318 | " token_data = [token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n", 319 | " token.shape_, token.is_alpha, token.is_stop, token.has_vector,\n", 320 | " token.ent_iob_, token.ent_type_,\n", 321 | " token.vector_norm, token.is_oov]\n", 322 | " doc_data.append(token_data)\n", 323 | "\n", 324 | "pd.DataFrame(doc_data, columns=headers)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "## draw dependency graph" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 6, 337 | "metadata": { 338 | "scrolled": false 339 | }, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/html": [ 344 | "\n", 345 | "\n", 346 | " 王小明\n", 347 | " X\n", 348 | "\n", 349 | "\n", 350 | "\n", 351 | " \n", 352 | " VERB\n", 353 | "\n", 354 | "\n", 355 | "\n", 356 | " 北京\n", 357 | " X\n", 358 | "\n", 359 | "\n", 360 | "\n", 361 | " \n", 362 | " PART\n", 363 | "\n", 364 | "\n", 365 | "\n", 366 | " 清华大学\n", 367 | " X\n", 368 | "\n", 369 | "\n", 370 | "\n", 371 | " 读书\n", 372 | " VERB\n", 373 | "\n", 374 | "\n", 375 | "\n", 376 | " \n", 377 | " \n", 378 | " nsubj\n", 379 | " \n", 380 | " \n", 381 | "\n", 382 | "\n", 383 | "\n", 384 | " \n", 385 | " \n", 386 | " case\n", 387 | " \n", 388 | " \n", 389 | "\n", 390 | "\n", 391 | "\n", 392 | " \n", 393 | " \n", 394 | " nmod\n", 395 | " \n", 396 | " \n", 397 | "\n", 398 | "\n", 399 | "\n", 400 | " \n", 401 | " \n", 402 | " case:dec\n", 403 | " \n", 404 | " \n", 405 | "\n", 406 | "\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " obl\n", 411 | " \n", 412 | " \n", 413 | "\n", 414 | "" 415 | ], 416 | "text/plain": [ 417 | "" 418 | ] 419 | }, 420 | "metadata": {}, 421 | "output_type": "display_data" 422 | } 423 | ], 424 | "source": [ 425 | "displacy.render(doc, jupyter=True)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 7, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/html": [ 436 | "
\n", 437 | "\n", 438 | " 王小明\n", 439 | " PERSON\n", 440 | "\n", 441 | "在\n", 442 | "\n", 443 | " 北京\n", 444 | " GPE\n", 445 | "\n", 446 | "的\n", 447 | "\n", 448 | " 清华大学\n", 449 | " ORG\n", 450 | "\n", 451 | "读书
" 452 | ], 453 | "text/plain": [ 454 | "" 455 | ] 456 | }, 457 | "metadata": {}, 458 | "output_type": "display_data" 459 | } 460 | ], 461 | "source": [ 462 | "displacy.render(doc, jupyter=True, style='ent')" 463 | ] 464 | } 465 | ], 466 | "metadata": { 467 | "celltoolbar": "Raw Cell Format", 468 | "kernelspec": { 469 | "display_name": "Python 3", 470 | "language": "python", 471 | "name": "python3" 472 | }, 473 | "language_info": { 474 | "codemirror_mode": { 475 | "name": "ipython", 476 | "version": 3 477 | }, 478 | "file_extension": ".py", 479 | "mimetype": "text/x-python", 480 | "name": "python", 481 | "nbconvert_exporter": "python", 482 | "pygments_lexer": "ipython3", 483 | "version": "3.6.9" 484 | } 485 | }, 486 | "nbformat": 4, 487 | "nbformat_minor": 1 488 | } 489 | --------------------------------------------------------------------------------