├── train_ner.py
├── corpus
│   └── .gitkeep
├── third-part
│   └── .gitkeep
├── requirements_dev.txt
├── spacy-dev-resources
├── POS_depedency_model
│   └── .gitignore
├── train_requirements.txt
├── requirements.txt
├── create_wikipedia_corpus.bash
├── extract_UD_Chinese-GSD_corpus.bash
├── .images
│   ├── ner_of_doc.png
│   ├── attributes_of_doc.png
│   ├── dependency_of_doc.png
│   ├── attributes_of_doc.html
│   ├── temp.html
│   └── dependency_of_doc.svg
├── merge_all_text_files.bash
├── move_wikipedia_corpus.bash
├── create_jsonl_corpus.bash
├── init_model.bash
├── download_UD_Chinese-GSD_corpus.bash
├── download_and_compile_brown_cluster.bash
├── train.bash
├── create_model_package.bash
├── create_init_model.bash
├── onto_to_spacy_json.bash
├── train_ner.bash
├── compute_brown_cluster.bash
├── compute_plain_word_vec.bash
├── compute_words_freq.bash
├── train_model.bash
├── .gitmodules
├── .idea
│   ├── vcs.xml
│   ├── modules.xml
│   ├── misc.xml
│   └── Chinese_models_for_SpaCy.iml
├── convert_UD_Chinese-GSD_corpus.bash
├── format_convertor.bash
├── update_model_meta.py
├── test_init_model.py
├── merge_all_text_files.py
├── all_in_one.bash
├── test_dependency_model.py
├── meta.json
├── test.py
├── test_as_model_dir.py
├── test_ner.py
├── test_load.py
├── LICENSE.md
├── merge_submodel.py
├── plain_word_vectors.py
├── README.md
├── workflow.md
├── README.en-US.md
├── .gitignore
├── onto_to_spacy_json.py
├── create_jsonl_vocabulary.py
└── notebooks
    └── demo.ipynb
/train_ner.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/corpus/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/third-part/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | jieba
2 |
--------------------------------------------------------------------------------
/spacy-dev-resources:
--------------------------------------------------------------------------------
1 | ../spacy-dev-resources
--------------------------------------------------------------------------------
/POS_depedency_model/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | **/*
3 | !.gitignore
--------------------------------------------------------------------------------
/train_requirements.txt:
--------------------------------------------------------------------------------
1 | -r ./spacy-dev-resources/requirements.txt
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | plac
2 | spacy
3 | pandas
4 | jieba
5 | ftfy
6 | validators
7 |
--------------------------------------------------------------------------------
/create_wikipedia_corpus.bash:
--------------------------------------------------------------------------------
1 | cd chinese-wikipedia-corpus-creator
2 | bash ./allinone_process.bash
3 |
--------------------------------------------------------------------------------
/extract_UD_Chinese-GSD_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | unzip ./corpus/UD_Chinese-GSD.zip -d ./corpus
4 |
--------------------------------------------------------------------------------
/.images/ner_of_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/ner_of_doc.png
--------------------------------------------------------------------------------
/merge_all_text_files.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python ./merge_all_text_files.py ./token_cleaned_plain_files ./WORDS.txt
4 |
--------------------------------------------------------------------------------
/move_wikipedia_corpus.bash:
--------------------------------------------------------------------------------
1 | cp -r chinese-wikipedia-corpus-creator/token_cleaned_plain_files token_cleaned_plain_files
2 |
--------------------------------------------------------------------------------
/.images/attributes_of_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/attributes_of_doc.png
--------------------------------------------------------------------------------
/.images/dependency_of_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/dependency_of_doc.png
--------------------------------------------------------------------------------
/create_jsonl_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python ./create_jsonl_vocabulary.py zh spacy_corpus.jsonl WORDS_FREQ.txt ./WORDS-c1000-p1.out/paths
4 |
--------------------------------------------------------------------------------
/init_model.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | python -m spacy init-model zh zh_model/ WORDS_FREQ.txt -c WORDS-c1000-p1.out/paths -v WORDS_VECS.txt
4 |
--------------------------------------------------------------------------------
/download_UD_Chinese-GSD_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | wget -c https://github.com/UniversalDependencies/UD_Chinese-GSD/archive/master.zip -O corpus/UD_Chinese-GSD.zip
4 |
--------------------------------------------------------------------------------
/download_and_compile_brown_cluster.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | cd third-part
4 |
5 | git clone https://github.com/percyliang/brown-cluster.git
6 |
7 | cd brown-cluster
8 | make
--------------------------------------------------------------------------------
/train.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ../MITIE/tools/wordrep/build/wordrep --count-words 800000 --word-vects --basic-morph --cca-morph ../wikipedia-corpus-creator/token_cleaned_plain_files
4 |
--------------------------------------------------------------------------------
/create_model_package.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m spacy package spacy_models/final_model spacy_package --force
4 |
5 | cd spacy_package/zh_core_web_sm-0.1.0
6 | python ./setup.py sdist
7 |
--------------------------------------------------------------------------------
/create_init_model.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m spacy init-model zh spacy_models/base_model --jsonl-loc ./spacy_corpus.jsonl --vectors-loc WORDS_VECS.txt --vectors-name zh_core_web_sm.vectors
4 |
--------------------------------------------------------------------------------
/onto_to_spacy_json.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | python onto_to_spacy_json.py -i "./ontonotes-release-5.0/data/files/data/chinese/annotations/" -t "china_ner_train.json" -e "china_ner_eval.json" -v 0.05
4 |
--------------------------------------------------------------------------------
/train_ner.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m spacy train zh spacy_models/ner_model ./china_ner_train.json ./china_ner_eval.json --pipeline ner -m meta.json -v ./spacy_models/dependency_model/model-best -n 1
4 |
--------------------------------------------------------------------------------
/compute_brown_cluster.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cpu_count=`nproc --all`
4 | process_count=$(expr $cpu_count - 1)
5 |
6 | ./third-part/brown-cluster/wcluster --text WORDS.txt --c 1000 --threads ${process_count}
7 |
--------------------------------------------------------------------------------
/compute_plain_word_vec.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cpu_count=`nproc --all`
4 | process_count=$(expr $cpu_count - 1)
5 |
6 | python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ./WORDS.txt WORDS_VECS.txt
7 |
--------------------------------------------------------------------------------
/compute_words_freq.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | cpu_count=`nproc --all`
4 | process_count=$(expr $cpu_count - 1)
5 |
6 | python ./spacy-dev-resources/training/plain_word_freqs.py -n ${process_count} token_cleaned_plain_files WORDS_FREQ.txt
7 |
--------------------------------------------------------------------------------
/train_model.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m spacy train zh spacy_models/dependency_model corpus/spacy/zh-simplified-ud-train.json corpus/spacy/zh-simplified-ud-dev.json --pipeline tagger,parser -v spacy_models/base_model -m meta.json -V 0.1.0 -n 1
4 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "spacy-dev-resources"]
2 | path = spacy-dev-resources
3 | url = https://github.com/howl-anderson/spacy-dev-resources.git
4 | [submodule "third-part/brown-cluster"]
5 | path = third-part/brown-cluster
6 | url = https://github.com/howl-anderson/brown-cluster.git
7 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/convert_UD_Chinese-GSD_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | cd corpus/UD_Chinese-GSD-master
4 |
5 | opencc -i zh_gsd-ud-train.conllu -o zh-simplified-ud-train.conllu -c t2s.json
6 | opencc -i zh_gsd-ud-dev.conllu -o zh-simplified-ud-dev.conllu -c t2s.json
7 | opencc -i zh_gsd-ud-test.conllu -o zh-simplified-ud-test.conllu -c t2s.json
8 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/format_convertor.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p corpus/spacy
4 |
5 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-train.conllu corpus/spacy
6 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-dev.conllu corpus/spacy
7 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-test.conllu corpus/spacy
8 |
--------------------------------------------------------------------------------
/update_model_meta.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | def main():
5 | with open("./spacy_models/base_model/meta.json") as fd:
6 |         data = json.load(fd)
7 |
8 | data["name"] = "core_web_sm"
9 |
10 | with open("./spacy_models/base_model/meta.json", "wt") as fd:
11 | json.dump(data, fd)
12 |
13 |
14 | if __name__ == "__main__":
15 | main()
16 |
--------------------------------------------------------------------------------
/test_init_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from spacy import displacy
4 | import spacy
5 |
6 | nlp = spacy.load('zh_model/')
7 |
8 |
9 | def main():
10 | doc = nlp("王小明在北京的清华大学读书")
11 | for token in doc:
12 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 | token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 | token.vector_norm, token.is_oov)
15 |
16 | displacy.serve(doc)
17 |
18 |
19 | if __name__ == "__main__":
20 | main()
21 |
--------------------------------------------------------------------------------
/merge_all_text_files.py:
--------------------------------------------------------------------------------
1 | import pathlib
2 | import sys
3 |
4 | input_dir = sys.argv[1]
5 | input_path = pathlib.Path(input_dir)
6 | input_files = input_path.glob("*")
7 |
8 | output_file = sys.argv[2]
9 | output_path = pathlib.Path(output_file)
10 |
11 |
12 | with output_path.open("wt") as outfile:
13 | for fname in input_files:
14 | with fname.open("rt") as infile:
15 | for line in infile:
16 | if not line.endswith("\n"):
17 | line = line + "\n"
18 | outfile.write(line)
19 |
--------------------------------------------------------------------------------
/all_in_one.bash:
--------------------------------------------------------------------------------
1 | ./create_wikipedia_corpus.bash
2 | ./move_wikipedia_corpus.bash
3 | ./compute_words_freq.bash
4 | ./merge_all_text_files.bash
5 | ./download_and_compile_brown_cluster.bash
6 | ./compute_plain_word_vec.bash
7 | ./create_init_model.bash
8 | python ./update_model_meta.py
9 | ./download_UD_Chinese-GSD_corpus.bash
10 | ./extract_UD_Chinese-GSD_corpus.bash
11 | ./convert_UD_Chinese-GSD_corpus.bash
12 | ./format_convertor.bash
13 | ./init_model.bash
14 | ./train_model.bash
15 | ./onto_to_spacy_json.bash
16 | ./train_ner.bash
17 | ./merge_submodel.py
18 |
--------------------------------------------------------------------------------
/test_dependency_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from spacy import displacy
4 | import spacy
5 |
6 | nlp = spacy.load('depedency_model/model-final/')
7 |
8 |
9 | def main():
10 | doc = nlp("王小明在北京的清华大学读书")
11 | for token in doc:
12 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 | token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 | token.vector_norm, token.is_oov)
15 |
16 | displacy.serve(doc)
17 |
18 |
19 | if __name__ == "__main__":
20 | main()
21 |
--------------------------------------------------------------------------------
/meta.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "core_web_sm",
3 | "version": "0.1.0",
4 | "license": "CC BY-SA 3.0",
5 | "author": "Xiaoquan Kong",
6 | "url": "https://xiaoquankong.ai",
7 | "sources": [
8 | "OntoNotes 5",
9 | "Common Crawl",
10 | "Universal Dependencies"
11 | ],
12 | "email": "u1mail2me@gmail.com",
13 | "description": "Chinese multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
14 | }
15 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from spacy import displacy
4 |
5 | import zh_core_web_sm
6 |
7 | nlp = zh_core_web_sm.load()
8 |
9 |
10 | def main():
11 | doc = nlp("王小明在北京的清华大学读书")
12 | for token in doc:
13 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
14 | token.shape_, token.is_alpha, token.is_stop, token.has_vector,
15 | token.ent_iob_, token.ent_type_,
16 | token.vector_norm, token.is_oov)
17 |
18 | # displacy.serve(doc)
19 |
20 |
21 | if __name__ == "__main__":
22 | main()
23 |
--------------------------------------------------------------------------------
/test_as_model_dir.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from spacy import displacy
4 | import spacy
5 |
6 | nlp = spacy.load("./spacy_models/final_model")
7 |
8 |
9 | def main():
10 | doc = nlp("王小明在北京的清华大学读书")
11 | for token in doc:
12 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 | token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 | token.ent_iob_, token.ent_type_,
15 | token.vector_norm, token.is_oov)
16 |
17 | # displacy.serve(doc)
18 |
19 |
20 | if __name__ == "__main__":
21 | main()
22 |
--------------------------------------------------------------------------------
/test_ner.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from spacy import displacy
4 | import spacy
5 |
6 | nlp = spacy.load('ner_model/model-final')
7 |
8 |
9 | def main():
10 | doc = nlp("王小明在北京的清华大学读书")
11 | for token in doc:
12 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 | token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 | token.ent_iob_, token.ent_type_,
15 | token.vector_norm, token.is_oov)
16 |
17 | displacy.serve(doc, style='ent')
18 |
19 |
20 | if __name__ == "__main__":
21 | main()
22 |
--------------------------------------------------------------------------------
/test_load.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from spacy import displacy
4 |
5 | import spacy
6 |
7 | nlp = spacy.load('package_templete/zh_core_web_sm-2.0.5/zh_core_web_sm/zh_core_web_sm-2.0.5')
8 |
9 |
10 | def main():
11 | doc = nlp("王小明在北京的清华大学读书")
12 | for token in doc:
13 | print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
14 | token.shape_, token.is_alpha, token.is_stop, token.has_vector,
15 | token.ent_iob_, token.ent_type_,
16 | token.vector_norm, token.is_oov)
17 |
18 | # displacy.serve(doc)
19 |
20 |
21 | if __name__ == "__main__":
22 | main()
23 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Xiaoquan Kong
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.idea/Chinese_models_for_SpaCy.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/merge_submodel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 |
3 | import shutil
4 | import json
5 | from pathlib import Path
6 |
7 |
8 | def read_pipeline(meta_file):
9 | with open(meta_file) as fd:
10 | data = json.load(fd)
11 | return data["pipeline"]
12 |
13 |
14 | def update_pipeline(meta_file, pipeline):
15 | with open(meta_file) as fd:
16 | data = json.load(fd)
17 |
18 | data["pipeline"] = pipeline
19 |
20 | with open(meta_file, "w") as fd:
21 | json.dump(data, fd)
22 |
23 |
24 | def copy_tree(src: Path, dst: Path, folder: str):
25 | shutil.copytree(src / folder, dst / folder)
26 |
27 |
28 | def main():
29 | target_dir = Path("./spacy_models/final_model")
30 | target_dir.mkdir(exist_ok=True)
31 |
32 | pipeline = []
33 |
34 | source_dir = Path("./spacy_models/dependency_model/model-best")
35 | copy_tree(source_dir, target_dir, "parser")
36 | copy_tree(source_dir, target_dir, "tagger")
37 | copy_tree(source_dir, target_dir, "vocab")
38 |
39 | pipeline.extend(read_pipeline(source_dir / "meta.json"))
40 |
41 | source_dir = Path("./spacy_models/ner_model/model-best")
42 | copy_tree(source_dir, target_dir, "ner")
43 | shutil.copy(source_dir / "meta.json", target_dir / "meta.json")
44 |
45 | pipeline.extend(read_pipeline(source_dir / "meta.json"))
46 |
47 | update_pipeline(target_dir / "meta.json", pipeline)
48 |
49 |
50 | if __name__ == "__main__":
51 | main()
52 |
--------------------------------------------------------------------------------
/plain_word_vectors.py:
--------------------------------------------------------------------------------
1 | import plac
2 | import gensim
3 | from gensim import utils
4 |
5 |
6 | class Corpus:
7 | def __init__(self, corpus_file):
8 | self.corpus_file = corpus_file
9 |
10 | def __iter__(self):
11 | with open(self.corpus_file) as fd:
12 | for line in fd:
13 | yield utils.simple_preprocess(line)
14 |
15 |
16 | @plac.annotations(
17 | in_dir=("Location of input directory"),
18 | out_loc=("Location of output file"),
19 | n_workers=("Number of workers", "option", "n", int),
20 | size=("Dimension of the word vectors", "option", "d", int),
21 | window=("Context window size", "option", "w", int),
22 | min_count=("Min count", "option", "m", int),
23 | negative=("Number of negative samples", "option", "g", int),
24 | nr_iter=("Number of iterations", "option", "i", int),
25 | )
26 | def main(
27 | in_dir,
28 | out_loc,
29 | negative=5,
30 | n_workers=4,
31 | window=5,
32 | size=128,
33 | min_count=10,
34 | nr_iter=2,
35 | ):
36 | sentences = Corpus(in_dir)
37 | model = gensim.models.Word2Vec(
38 | sentences=sentences,
39 | size=size,
40 | window=window,
41 | min_count=min_count,
42 | workers=n_workers,
43 | sample=1e-5,
44 | negative=negative,
45 | iter=nr_iter,
46 | )
47 | model.wv.save_word2vec_format(out_loc, binary=False)
48 |
49 |
50 | if __name__ == "__main__":
51 | plac.call(main)
52 |
--------------------------------------------------------------------------------
/.images/attributes_of_doc.html:
--------------------------------------------------------------------------------
|   | text | lemma_ | pos_ | tag_ | dep_ | shape_ | is_alpha | is_stop | has_vector | vector_norm | is_oov |
|---|------|--------|------|------|------|--------|----------|---------|------------|-------------|--------|
| 0 | 王小明 | 王小明 | X | NNP | nsubj | xxx | True | False | True | 0.392991 | True |
| 1 | 在 | 在 | X | VV | acl | x | True | False | True | 7.318524 | False |
| 2 | 北京 | 北京 | X | NNP | det | xx | True | False | True | 10.940736 | False |
| 3 | 的 | 的 | X | DEC | case:dec | x | True | False | True | 6.201293 | False |
| 4 | 清华大学 | 清华大学 | X | NNP | obj | xxxx | True | False | True | 12.044737 | False |
| 5 | 读书 | 读书 | X | VV | ROOT | xx | True | False | True | 11.602811 | False |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [README written in English](README.en-US.md)
2 | ------------------------------
3 | # SpaCy 官方中文模型已经上线(https://spacy.io/models/zh), 参考了本项目,具有相同的特性。本项目『推动 SpaCy 中文模型开发』的使命已经完成,本项目将进入维护状态,后续更新将只进行 bug 修复,感谢各位用户长期的关注和支持。
4 |
5 | # SpaCy 中文模型
6 |
7 | 为 SpaCy 提供的中文数据模型。模型目前还处于 beta 公开测试的状态。
8 |
9 | ## 在线演示
10 |
11 | 基于 Jupyter notebook 的在线演示见 [Binder](https://mybinder.org/v2/gh/howl-anderson/Chinese_models_for_SpaCy/master?filepath=notebooks%2Fdemo.ipynb)。
12 |
13 | ### 特性
14 | `王小明在北京的清华大学读书` 这个 `Doc` 对象的部分属性信息:
15 |
16 | ![attributes_of_doc](.images/attributes_of_doc.png)
17 |
18 | ### NER (**New!**)
19 | `王小明在北京的清华大学读书` 这个 `Doc` 对象的部分 NER 信息:
20 |
21 | ![ner_of_doc](.images/ner_of_doc.png)
22 |
23 | ## 开始使用
24 |
25 | 模型以二进制文件的形式进行分发,用户应该具备 SpaCy (version > 2) 的基础使用知识。
26 |
27 | ### 系统要求
28 |
29 | Python 3 (也许支持 python2, 但未经过良好测试)
30 |
31 | ### 安装
32 |
33 | #### 下载模型
34 | 从 [releases](https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases) 页面下载模型 (**New!** 为中国地区的用户提供了加速下载的链接)。假设所下载的模型名为 `zh_core_web_sm-2.x.x.tar.gz`。
35 |
36 | #### 安装模型
37 |
38 | ```
39 | pip install zh_core_web_sm-2.x.x.tar.gz
40 | ```
41 |
42 | 为了方便后续在 Rasa NLU 等框架中使用,需要再为这个模型建立一个链接,可以通过执行以下命令实现:
43 |
44 | ```bash
45 | spacy link zh_core_web_sm zh
46 | ```
47 |
48 | 运行完成后就可以使用 zh 这个别名来访问这个模型了。
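
下面是一个最小的使用示意(假设模型已按上述方式安装,并已链接为 `zh` 别名):

```python
import spacy

# 通过 spacy link 建立的 zh 别名加载模型;
# 也可以直接 import zh_core_web_sm 然后调用 zh_core_web_sm.load()
nlp = spacy.load("zh")

doc = nlp("王小明在北京的清华大学读书")
for token in doc:
    # 输出分词结果、词性、依存关系以及命名实体标注
    print(token.text, token.pos_, token.dep_, token.ent_type_)
```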
49 |
50 | ## 运行 Demo 代码
51 |
52 | Demo 代码位于 `test.py`。安装好模型后,用户可以下载或克隆本仓库的代码,然后直接执行:
53 |
54 | ```bash
55 | python3 ./test.py
56 | ```
57 |
58 | 打开地址 `http://127.0.0.1:5000`, 将看到如下:
59 |
60 | ![dependency_of_doc](.images/dependency_of_doc.png)
61 |
62 | ## 如何从零构造这个模型
63 |
64 | 见 [workflow](workflow.md)
65 |
66 | ## 语料库
67 | 本项目使用的语料库是 OntoNotes 5.0。
68 |
69 | 由于 OntoNotes 5.0 是 LDC ([Linguistic Data Consortium](https://www.ldc.upenn.edu/)) 的版权材料,无法直接包含在本项目中。好消息是,OntoNotes 5.0 对于 团体用户(包含企业和学术组织)是完全免费的。用户可以建立一个企业或者学术组织账号,然后免费获取 OntoNotes 5.0。
70 |
71 | ## TODO list
72 |
73 | * 属性 `pos_` 不正确. 这个和 SpaCy 中中文语言 Class 相关。
74 | * 属性 `shape_` and `is_alpha` 似乎对中文并无意义, 但需要权威信息确认一下.
75 | * 属性 `is_stop` 不正确. 这个和 SpaCy 中中文语言 Class 相关。
76 | * 属性 `vector` 似乎没有训练的很好。
77 | * 属性 `is_oov` 完全错误. 第一优先级修复。
78 | * NER 模型因为缺少 LDC 语料库目前不可用,正在解决中。
79 | * 将训练中所用的中间结果 release 出来, 方便用户自行定制模型
80 |
81 | ## 使用的组件
82 |
83 | * TODO
84 |
85 | ## 如何贡献
86 |
87 | 请阅读 [CONTRIBUTING.md](https://gist.github.com/PurpleBooth/b24679402957c63ec426) , 然后提交 pull requests 给我们.
88 |
89 | ## 版本化控制
90 |
91 | 我们使用 [SemVer](http://semver.org/) 做版本化的标准. 查看 `tags` 以了解所有的版本.
92 |
93 | ## 作者
94 |
95 | * **Xiaoquan Kong** - *Initial work* - [howl-anderson](https://github.com/howl-anderson)
96 |
97 | 更多贡献者信息,请参考 `contributors`.
98 |
99 | ## 版权
100 |
101 | MIT License - 详见 [LICENSE.md](LICENSE.md)
102 |
103 | ## 致谢
104 |
105 | * TODO
106 |
--------------------------------------------------------------------------------
/.images/temp.html:
--------------------------------------------------------------------------------
|   | text | lemma_ | pos_ | tag_ | dep_ | shape_ | is_alpha | is_stop | has_vector | vector_norm | is_oov |
|---|------|--------|------|------|------|--------|----------|---------|------------|-------------|--------|
| 0 | 王小明 | 王小明 | X | NNP | nsubj | xxx | True | False | True | 0.392991 | True |
| 1 | 在 | 在 | X | VV | acl | x | True | False | True | 7.318524 | False |
| 2 | 北京 | 北京 | X | NNP | det | xx | True | False | True | 10.940736 | False |
| 3 | 的 | 的 | X | DEC | case:dec | x | True | False | True | 6.201293 | False |
| 4 | 清华大学 | 清华大学 | X | NNP | obj | xxxx | True | False | True | 12.044737 | False |
| 5 | 读书 | 读书 | X | VV | ROOT | xx | True | False | True | 11.602811 | False |
--------------------------------------------------------------------------------
/workflow.md:
--------------------------------------------------------------------------------
1 | # SpaCy Chinese model training workflow
2 |
3 | ## get preprocessed Chinese Wikipedia corpus
4 | see project [chinese-wikipedia-corpus-creator](https://github.com/howl-anderson/chinese-wikipedia-corpus-creator) for more details.
5 |
6 | ### produce wikipedia corpus ###
7 | * input: -
8 | * output: `token_cleaned_plain_files/`
9 | * script: `create_wikipedia_corpus.bash`
10 |
11 | ### copy corpus to workspace ###
12 | * input: `chinese-wikipedia-corpus-creator/token_cleaned_plain_files/`
13 | * output: `token_cleaned_plain_files/`
14 | * script: `move_wikipedia_corpus.bash`
15 |
16 | ## computing word frequency
17 | * input: `token_cleaned_plain_files/*`
18 | * output: `WORDS_FREQ.txt`
19 | * script: `compute_words_freq.bash`
20 |
21 | ## merge all files into one
22 | * input: `token_cleaned_plain_files/*`
23 | * output: `WORDS.txt`
24 | * script: `merge_all_text_files.bash`
25 |
26 | ## compute brown cluster
27 | ### brown cluster computing software
28 | Official software is [brown-cluster](https://github.com/percyliang/brown-cluster).
29 |
30 | ### install
31 | * input: -
32 | * output: `third-part/brown-cluster/wcluster`
33 | * script: `download_and_compile_brown_cluster.bash`
34 |
35 | ### computing
36 | * input: `WORDS.txt`
37 | * output: `WORDS-c1000-p1.out/*`
38 | * script: `compute_brown_cluster.bash`
39 |
40 | ## compute word vector
41 | * input: `token_cleaned_plain_files/*`
42 | * output: `WORDS_VECS.txt`
43 | * script: `compute_plain_word_vec.bash`
44 |
45 | ## initialize SpaCy model
46 |
47 | ### build base model
48 | * input: `./WORDS-c1000-p1.out/paths WORDS_VECS.txt WORDS_FREQ.txt`
49 | * output: `spacy_models/base_model/**/*`
50 | * script: `create_init_model.bash`
51 |
52 | ### modify model name
53 | * input: `spacy_models/base_model/meta.json`
54 | * output: `spacy_models/base_model/meta.json`
55 | * script: `update_model_meta.py`
56 |
57 |
58 | ## getting UD_Chinese-GSD corpus
59 |
60 | ### download
61 | * input: -
62 | * output: `corpus/UD_Chinese-GSD.zip`
63 | * script: `download_UD_Chinese-GSD_corpus.bash`
64 |
65 | ### extracting
66 | * input: `corpus/UD_Chinese-GSD.zip`
67 | * output: `corpus/UD_Chinese-GSD-master/`
68 | * script: `extract_UD_Chinese-GSD_corpus.bash`
69 |
70 | ### convert to simplified Chinese
71 | * input: `corpus/UD_Chinese-GSD-master/zh_gsd-ud-*.conllu`
72 | * output: `corpus/UD_Chinese-GSD-master/zh-simplified-ud-*.conllu`
73 | * script: `convert_UD_Chinese-GSD_corpus.bash`
74 |
75 | ## convert UD corpus format
76 | * input: `corpus/UD_Chinese-GSD-master/zh-simplified-ud-*.conllu`
77 | * output: `corpus/spacy/zh-simplified-ud-*.json`
78 | * script: `format_convertor.bash`
79 |
80 | ## init spacy model with word vector & word cluster & word frequency
81 | * input: `WORDS_FREQ.txt`, `WORDS-c1000-p1.out/paths`, `WORDS_VECS.txt`
82 | * output: `zh_model/*`
83 | * script: `init_model.bash`
84 |
85 | ## train SpaCy model for POS and dependency parser
86 | * input: `spacy_models/base_model`, `corpus/spacy/zh-simplified-ud-*.json`
87 | * output: `spacy_models/dependency_model`
88 | * script: `train_model.bash`
89 |
90 | ## translate OntoNotes 5 to spaCy JSON files
91 | * input: `ontonotes-release-5.0/data/files/data/chinese/annotations/`
92 | * output: `china_ner_train.json`, `china_ner_eval.json`
93 | * script: `onto_to_spacy_json.bash`
94 |
95 | ## train SpaCy model for NER parser
96 | * input: `spacy_models/dependency_model/model-best`, `china_ner_train.json`, `china_ner_eval.json`
97 | * output: `spacy_models/ner_model`
98 | * script: `train_ner.bash`
99 |
100 | ## merge sub-model
101 | * input: `spacy_models/dependency_model`, `spacy_models/ner_model`
102 | * output: `spacy_models/final_model`
103 | * script: `merge_submodel.py`
104 |
105 | ## create package
106 | * input: `spacy_models/final_model/`
107 | * output: `spacy_package/`
108 | * script: `./create_model_package.bash`
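
After this step, the built sdist can be installed locally, e.g. `pip install spacy_package/zh_core_web_sm-0.1.0/dist/zh_core_web_sm-0.1.0.tar.gz` (the exact file name depends on the version in `meta.json`).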
109 |
--------------------------------------------------------------------------------
/README.en-US.md:
--------------------------------------------------------------------------------
1 | [README in Chinese](README.zh-Hans.md)
2 | ------------------------------
3 |
4 | # The official Chinese model for SpaCy is now available at (https://spacy.io/models/zh). It was developed with reference to this project and shares the same features. As the goal of this project — “promoting the development of the SpaCy Chinese model” — has been achieved, this repository will enter maintenance mode. Future updates will focus only on bug fixes. We would like to thank all users for their long-term attention and support.
5 |
6 | # Chinese models for SpaCy
7 |
8 | SpaCy (version > 2) models for the Chinese language. These models are rough and still a **work in progress**, but "something is better than nothing".
9 |
10 | ## Online demo
11 |
12 | An online Jupyter notebook demo is provided at [Binder](https://mybinder.org/v2/gh/howl-anderson/Chinese_models_for_SpaCy/master?filepath=notebooks%2Fdemo.ipynb).
13 |
14 | ### Features
15 |
16 | Partial attributes of a `Doc` object for `王小明在北京的清华大学读书`:
17 |
18 | ![attributes_of_doc](.images/attributes_of_doc.png)
19 |
20 | ### NER (**New!**)
21 | NER of a `Doc` object for `王小明在北京的清华大学读书`:
22 |
23 | ![ner_of_doc](.images/ner_of_doc.png)
24 |
25 | ## Getting Started
26 |
27 | Models are released as binary files; users should have basic knowledge of SpaCy (version 2+).
28 |
29 | ### Prerequisites
30 |
31 | Python 3 (Python 2 may work, but it is not well tested)
32 |
33 | ### Installing
34 |
35 | Download the released model from the `releases` page:
36 |
37 | ```
38 | wget -c https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases/download/v2.0.4/zh_core_web_sm-2.0.4.tar.gz
39 | ```
40 |
41 | Then install the model:
42 |
43 | ```
44 | pip install zh_core_web_sm-2.0.4.tar.gz
45 | ```
46 |
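A minimal usage sketch (assuming the model package above has been installed):

```python
import zh_core_web_sm

# load the installed model package directly
nlp = zh_core_web_sm.load()

doc = nlp("王小明在北京的清华大学读书")
for token in doc:
    # print the token text, POS tag, dependency label and NER annotation
    print(token.text, token.pos_, token.dep_, token.ent_iob_, token.ent_type_)
```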
47 |
48 | ## Running demo code
49 |
50 | `test.py` contains the demo code. After installing the model, users can download or clone this repo and then execute:
51 |
52 | ```bash
53 | python3 ./test.py
54 | ```
55 |
56 | Then open a web browser at `http://127.0.0.1:5000`; you will see an image similar to this:
57 |
58 | ![dependency_of_doc](.images/dependency_of_doc.png)
59 |
60 | ## How to reproduce the model
61 |
62 | See [workflow](workflow.md)
63 |
64 | ## Corpus Data
65 | The corpus data used in this project is OntoNotes 5.0.
66 |
67 | Since OntoNotes 5.0 is copyrighted material of the LDC ([Linguistic Data Consortium](https://www.ldc.upenn.edu/)), this project cannot include the data directly. The good news is that OntoNotes 5.0 is free for organizational users: you can set up an account for your company or school and then get OntoNotes 5.0 at no cost.
68 |
69 |
70 | ## TODO list
71 |
72 | * Attribute `pos_` is not working correctly. This is related to the Chinese Language class in SpaCy.
73 | * Attributes `shape_` and `is_alpha` seem meaningless for Chinese; this needs to be confirmed.
74 | * Attribute `is_stop` is not working correctly. This is related to the Chinese Language class in SpaCy.
75 | * Attribute `vector` seems not to be well trained.
76 | * Attribute `is_oov` is totally incorrect. First priority.
77 | * The NER model is not available due to the lack of the LDC corpus. I am working on it.
78 | * Release all the intermediate materials to help users build their own models.
79 |
80 | ## Built With
81 |
82 | * TODO
83 |
84 | ## Contributing
85 |
86 | Please read [CONTRIBUTING.md](https://gist.github.com/PurpleBooth/b24679402957c63ec426) for details on our code of conduct, and the process for submitting pull requests to us.
87 |
88 | ## Versioning
89 |
90 | We use [SemVer](http://semver.org/) for versioning. For the versions available, see the `tags` on this repository.
91 |
92 | ## Authors
93 |
94 | * **Xiaoquan Kong** - *Initial work* - [howl-anderson](https://github.com/howl-anderson)
95 |
96 | See also the list of `contributors` who participated in this project.
97 |
98 | ## License
99 |
100 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
101 |
102 | ## Acknowledgments
103 |
104 | * TODO
105 |
--------------------------------------------------------------------------------
/.images/dependency_of_doc.svg:
--------------------------------------------------------------------------------
1 |
2 |
73 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 | # General
106 | .DS_Store
107 | .AppleDouble
108 | .LSOverride
109 |
110 | # Icon must end with two \r
111 | Icon
112 |
113 |
114 | # Thumbnails
115 | ._*
116 |
117 | # Files that might appear in the root of a volume
118 | .DocumentRevisions-V100
119 | .fseventsd
120 | .Spotlight-V100
121 | .TemporaryItems
122 | .Trashes
123 | .VolumeIcon.icns
124 | .com.apple.timemachine.donotpresent
125 |
126 | # Directories potentially created on remote AFP share
127 | .AppleDB
128 | .AppleDesktop
129 | Network Trash Folder
130 | Temporary Items
131 | .apdisk
132 | # -*- mode: gitignore; -*-
133 | *~
134 | \#*\#
135 | /.emacs.desktop
136 | /.emacs.desktop.lock
137 | *.elc
138 | auto-save-list
139 | tramp
140 | .\#*
141 |
142 | # Org-mode
143 | .org-id-locations
144 | *_archive
145 |
146 | # flymake-mode
147 | *_flymake.*
148 |
149 | # eshell files
150 | /eshell/history
151 | /eshell/lastdir
152 |
153 | # elpa packages
154 | /elpa/
155 |
156 | # reftex files
157 | *.rel
158 |
159 | # AUCTeX auto folder
160 | /auto/
161 |
162 | # cask packages
163 | .cask/
164 | dist/
165 |
166 | # Flycheck
167 | flycheck_*.el
168 |
169 | # server auth directory
170 | /server/
171 |
172 | # projectiles files
173 | .projectile
174 |
175 | # directory configuration
176 | .dir-locals.el
177 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
178 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
179 |
180 | # User-specific stuff
181 | .idea/**/workspace.xml
182 | .idea/**/tasks.xml
183 | .idea/**/dictionaries
184 | .idea/**/shelf
185 |
186 | # Sensitive or high-churn files
187 | .idea/**/dataSources/
188 | .idea/**/dataSources.ids
189 | .idea/**/dataSources.local.xml
190 | .idea/**/sqlDataSources.xml
191 | .idea/**/dynamic.xml
192 | .idea/**/uiDesigner.xml
193 |
194 | # Gradle
195 | .idea/**/gradle.xml
196 | .idea/**/libraries
197 |
198 | # CMake
199 | cmake-build-debug/
200 | cmake-build-release/
201 |
202 | # Mongo Explorer plugin
203 | .idea/**/mongoSettings.xml
204 |
205 | # File-based project format
206 | *.iws
207 |
208 | # IntelliJ
209 | out/
210 |
211 | # mpeltonen/sbt-idea plugin
212 | .idea_modules/
213 |
214 | # JIRA plugin
215 | atlassian-ide-plugin.xml
216 |
217 | # Cursive Clojure plugin
218 | .idea/replstate.xml
219 |
220 | # Crashlytics plugin (for Android Studio and IntelliJ)
221 | com_crashlytics_export_strings.xml
222 | crashlytics.properties
223 | crashlytics-build.properties
224 | fabric.properties
225 |
226 | # Editor-based Rest Client
227 | .idea/httpRequests
228 |
229 | corpus/*
230 |
231 | zh_wiki_core_sm/*
232 | zh_wiki_core/*
233 | zh_model/
234 |
235 | WORDS.*
236 | WORDS_*
237 | WORDS-*/
238 | package_templete/
239 |
240 |
--------------------------------------------------------------------------------
/onto_to_spacy_json.py:
--------------------------------------------------------------------------------
1 | import json # for tuple support
2 | import plac
3 | import os
4 | import re
5 | from spacy.gold import biluo_tags_from_offsets
6 | import spacy
7 |
8 | nlp = spacy.blank("xx")
9 | from tqdm import tqdm
10 | import random
11 |
12 |
13 | def get_root_filename(onto_dir):
14 | name_files = []
15 | for dirpath, subdirs, files in os.walk(onto_dir):
16 | for fname in files:
17 | if bool(re.search(".name", fname)):
18 | fn = os.path.join(dirpath, fname)
19 | fn = re.sub("\.name", "", fn)
20 | name_files.append(fn)
21 | return name_files
22 |
23 |
24 | def split_sentence(text):
25 | text = text.strip().split("\n")[1:-1]
26 | return text
27 |
28 |
29 | def split_doc(text):
30 |     text_list = text.strip().split("</DOC>")[:-1]
31 |     ids = [re.findall('<DOC DOCNO="(.+?)">\s', t)[0] for t in text_list]
32 |     text_list = [re.sub('<DOC DOCNO="(.+?)">', "", t).strip() for t in text_list]
33 |     return ids, text_list
34 |
35 |
36 | def clean_ent(ent):
37 | tag = re.findall('TYPE="(.+?)"', ent)[0]
38 | text = re.findall(">(.+)", ent)[0]
39 | text = re.sub("\$", "\$", text)
40 | return (text, tag)
41 |
42 |
43 | def raw_text(text):
44 | """Remove entity tags"""
45 |     text = re.sub("<ENAMEX .+?>", "", text)
46 |     text = re.sub("</ENAMEX>", "", text)
47 | return text
48 |
49 |
50 | def ent_position(ents, text):
51 | search_point = 0
52 | spacy_ents = []
53 | for ent in ents:
54 | remain_text = text[search_point:]
55 | ma = re.search(ent[0], remain_text)
56 | ent_tup = (ma.start() + search_point, ma.end() + search_point, ent[1])
57 | spacy_ents.append(ent_tup)
58 |
59 | # update search point to prevent same word in different entity,
60 | # it will cause bug which hard to debug
61 | search_point = search_point + ma.end()
62 | return spacy_ents
63 |
64 |
65 | def text_to_spacy(markup):
66 |     raw_ents = re.findall("<ENAMEX (.+?)</ENAMEX>", markup)
67 | ents = [clean_ent(raw_ent) for raw_ent in raw_ents]
68 | text = raw_text(markup)
69 | spacy_ents = ent_position(ents, text)
70 | final = (text, {"entities": spacy_ents})
71 | return final
72 |
73 |
74 | def onf_to_raw(onf_file):
75 | """
76 | Take in a path to a .onf Ontonotes file. Return the raw text (as much as possible).
77 | The quotes are usually quite messed up, so this is not going to look like real input text.
78 | """
79 | with open(onf_file, "r") as f:
80 | onf = f.read()
81 | sentences = re.findall(
82 | "Plain sentence\:\n\-+?\n(.+?)Treebanked sentence", onf, re.DOTALL
83 | )
84 | sentences = [re.sub("\n+?\s*", " ", i).strip() for i in sentences]
85 | paragraph = " ".join(sentences)
86 | return paragraph
87 |
88 |
89 | def name_to_sentences(ner_filename):
90 | """
91 | Take a .name file and return a sentence list of the kind described here:
92 | https://github.com/explosion/spacy/blob/master/examples/training/training-data.json
93 | """
94 | with open(ner_filename, "r") as f:
95 | doc = f.read()
96 |
97 | sentences = []
98 | onto_sents = split_sentence(doc)
99 | for sent in onto_sents:
100 | offsets = text_to_spacy(sent)
101 | doc = nlp(offsets[0])
102 | tags = biluo_tags_from_offsets(doc, offsets[1]["entities"])
103 | ner_info = list(zip(doc, tags))
104 | tokens = []
105 | for n, i in enumerate(ner_info):
106 | token = {
107 | "head": 0,
108 | "dep": "",
109 | "tag": "",
110 | "orth": i[0].string,
111 | "ner": i[1],
112 | "id": n,
113 | }
114 | tokens.append(token)
115 | sentences.append({"tokens": tokens})
116 | return sentences
117 |
118 |
119 | def dir_to_annotation(onto_dir):
120 | fns = get_root_filename(onto_dir)
121 | all_annotations = []
122 |
123 | for fn in tqdm(fns):
124 | ner_filename = fn + ".name"
125 | onf_filename = fn + ".onf"
126 |
127 | try:
128 | raw = onf_to_raw(onf_filename)
129 | sentences = name_to_sentences(ner_filename)
130 | final = {"id": "fake", "paragraphs": [{"raw": raw, "sentences": sentences}]}
131 | all_annotations.append(final)
132 | except Exception as e:
133 | print("Error formatting ", fn, e)
134 | return all_annotations
135 |
136 |
137 | @plac.annotations(
138 | onto_dir=("Directory of OntoNotes data to traverse", "option", "i", str),
139 | train_file=("File to write training spaCy JSON out to", "option", "t", str),
140 | val_file=("File to write validation spaCy JSON out to", "option", "e", str),
141 | val_split=("Percentage to use for evaluation", "option", "v", float),
142 | )
143 | def main(onto_dir, train_file, val_file, val_split=0.75):
144 | print("Reading and formatting annotations")
145 | all_annotations = dir_to_annotation(onto_dir)
146 | random.shuffle(all_annotations)
147 | cutpoint = round(val_split * len(all_annotations))
148 | val = all_annotations[:cutpoint]
149 | train = all_annotations[cutpoint:]
150 |
151 | print(
152 | "Saving {0} training examples and {1} validation examples".format(
153 | len(train), len(val)
154 | )
155 | )
156 | with open(train_file, "w") as f:
157 | json.dump(train, f, ensure_ascii=False, indent=4)
158 | with open(val_file, "w") as f:
159 | json.dump(val, f, ensure_ascii=False, indent=4)
160 |
161 |
162 | if __name__ == "__main__":
163 | plac.call(main)
164 |
--------------------------------------------------------------------------------
/create_jsonl_vocabulary.py:
--------------------------------------------------------------------------------
1 | import json
2 | import math
3 | import string
4 | from ast import literal_eval
5 | from pathlib import Path
6 |
7 | import ftfy
8 | import jsonlines
9 | import plac
10 | import validators
11 | from preshed.counter import PreshCounter
12 | from spacy.lang.en import stop_words as en_stop_words
13 | from spacy.lang.zh import stop_words as zh_stop_words
14 | from tqdm import tqdm
15 |
16 |
17 | class Word:
18 | counter = -1
19 |
20 | def __init__(self, word_str, cluster, probs):
21 | self._word = word_str
22 | self._cluster = cluster
23 | self._probs = probs
24 |
25 | chinese_punct = "!?。。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
26 | self._punct_list = list(set(string.punctuation + chinese_punct))
27 |
28 | chinese_whitespace = ""
29 | self._whitespace_list = list(set(string.whitespace + chinese_whitespace))
30 |
31 | english_stopword = en_stop_words.STOP_WORDS
32 | chinese_stopword = zh_stop_words.STOP_WORDS
33 | self._stopword_list = {*english_stopword, *chinese_stopword}
34 |
35 | chinese_quote = "“”‘’"
36 | english_quote = "\"'"
37 | self._qute_list = list(set(english_quote + chinese_quote))
38 |
39 | chinese_left_punct = "<([{"
40 | english_left_punct = "<([「『【〔〖〘〚{"
41 | self._left_punct_list = list(set(english_left_punct + chinese_left_punct))
42 |
43 | chinese_right_punct = ">)]}"
44 | english_right_punct = ">)]」』】〕〗〙〛}"
45 | self._right_punct_list = list(set(english_right_punct + chinese_right_punct))
46 |
47 | @property
48 | def orth(self):
49 | return self._word
50 |
51 | @property
52 | def id(self):
53 | self.__class__.counter += 1
54 |
55 | return self.__class__.counter
56 |
57 | @property
58 | def lower(self):
59 | return self._word.lower()
60 |
61 | @property
62 | def norm(self):
63 | return self._word
64 |
65 | @property
66 | def shape(self):
67 | return "".join(map(lambda x: "X" if x.isupper() else "x", self._word))
68 |
69 | @property
70 | def prefix(self):
71 | return self._word[0]
72 |
73 | @property
74 | def suffix(self):
75 | return self._word[-1]
76 |
77 | @property
78 | def length(self):
79 | return len(self._word)
80 |
81 | @property
82 | def cluster(self):
83 | return self._cluster
84 |
85 | @property
86 | def prob(self):
87 |         return self._probs.get(self._word, 0)
88 |
89 | @property
90 | def is_alpha(self):
91 | return self._word.isalpha()
92 |
93 | @property
94 | def is_ascii(self):
95 | # only for py 3.7
96 | # return self._word.isascii()
97 | try:
98 | self._word.encode('ascii')
99 | except UnicodeEncodeError:
100 | return False
101 |
102 | return True
103 |
104 | @property
105 | def is_digit(self):
106 | return self._word.isdigit()
107 |
108 | @property
109 | def is_lower(self):
110 | return self._word.islower()
111 |
112 | @property
113 | def is_punct(self):
114 | return self._word in self._punct_list
115 |
116 | @property
117 | def is_space(self):
118 | return self._word in self._whitespace_list
119 |
120 | @property
121 | def is_title(self):
122 | return self._word.istitle()
123 |
124 | @property
125 | def is_upper(self):
126 | return self._word.isupper()
127 |
128 | @property
129 | def like_url(self):
130 | return bool(validators.url(self._word))
131 |
132 | @property
133 | def like_num(self):
134 | # TODO(howl-anderson): fix it later
135 | return False
136 |
137 | @property
138 | def like_email(self):
139 | return bool(validators.email(self._word))
140 |
141 | @property
142 | def is_stop(self):
143 | return self._word in self._stopword_list
144 |
145 | @property
146 | def is_oov(self):
147 |         return self._word not in self._probs
148 |
149 | @property
150 | def is_quote(self):
151 | return self._word in self._qute_list
152 |
153 | @property
154 | def is_left_punct(self):
155 | return self._word in self._left_punct_list
156 |
157 | @property
158 | def is_right_punct(self):
159 | return self._word in self._right_punct_list
160 |
161 |
162 | def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
163 | print("Counting frequencies...")
164 | counts = PreshCounter()
165 | total = 0
166 | with freqs_loc.open() as f:
167 | for i, line in enumerate(f):
168 | freq, doc_freq, key = line.rstrip().split("\t", 2)
169 | freq = int(freq)
170 | counts.inc(i + 1, freq)
171 | total += freq
172 | counts.smooth()
173 | log_total = math.log(total)
174 | probs = {}
175 | with freqs_loc.open() as f:
176 | for line in tqdm(f):
177 | freq, doc_freq, key = line.rstrip().split("\t", 2)
178 | doc_freq = int(doc_freq)
179 | freq = int(freq)
180 | if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
181 | word = literal_eval(key)
182 | smooth_count = counts.smoother(int(freq))
183 | probs[word] = math.log(smooth_count) - log_total
184 | oov_prob = math.log(counts.smoother(0)) - log_total
185 | return probs, oov_prob
186 |
187 |
188 | def read_clusters(clusters_loc):
189 | print("Reading clusters...")
190 | clusters = {}
191 | with clusters_loc.open() as f:
192 | for line in tqdm(f):
193 | try:
194 | cluster, word, freq = line.split()
195 | word = ftfy.fix_text(word)
196 | except ValueError:
197 | continue
198 | # If the clusterer has only seen the word a few times, its
199 | # cluster is unreliable.
200 | if int(freq) >= 3:
201 | clusters[word] = cluster
202 | else:
203 | clusters[word] = "0"
204 | # Expand clusters with re-casing
205 | for word, cluster in list(clusters.items()):
206 | if word.lower() not in clusters:
207 | clusters[word.lower()] = cluster
208 | if word.title() not in clusters:
209 | clusters[word.title()] = cluster
210 | if word.upper() not in clusters:
211 | clusters[word.upper()] = cluster
212 | return clusters
213 |
214 |
215 | @plac.annotations(
216 | lang=("model language", "positional", None, str),
217 | output_loc=("model output directory", "positional", None, str),
218 | freqs_loc=("location of words frequencies file", "positional", None, Path),
219 | clusters_loc=("location of brown clusters data", "positional", None, Path),
220 | )
221 | def main(lang, output_loc, freqs_loc, clusters_loc):
222 | clusters = read_clusters(clusters_loc)
223 | probs, oov_prob = read_freqs(freqs_loc)
224 |
225 | with jsonlines.open(output_loc, mode="w") as writer:
226 | header = {"lang": lang, "settings": {"oov_prob": oov_prob}}
227 |
228 | writer.write(header)
229 |
230 | for word_str, cluster in clusters.items():
231 |
232 | if not word_str:
233 | continue
234 |
235 | word = Word(word_str, cluster, probs)
236 | row = {
237 | "orth": word.orth, # the word text
238 | "id": word.id, # can correspond to row in vectors table
239 | "lower": word.lower,
240 | "norm": word.norm,
241 | "shape": word.shape,
242 | "prefix": word.prefix,
243 | "suffix": word.suffix,
244 | "length": word.length,
245 | "cluster": word.cluster,
246 | "prob": word.prob,
247 | "is_alpha": word.is_alpha,
248 | "is_ascii": word.is_ascii,
249 | "is_digit": word.is_digit,
250 | "is_lower": word.is_lower,
251 | "is_punct": word.is_punct,
252 | "is_space": word.is_space,
253 | "is_title": word.is_title,
254 | "is_upper": word.is_upper,
255 | "like_url": word.like_url,
256 | "like_num": word.like_num,
257 | "like_email": word.like_email,
258 | "is_stop": word.is_stop,
259 | "is_oov": word.is_oov,
260 | "is_quote": word.is_quote,
261 | "is_left_punct": word.is_left_punct,
262 | "is_right_punct": word.is_right_punct,
263 | }
264 |
265 | writer.write(row)
266 |
267 |
268 | if __name__ == "__main__":
269 | plac.call(main)
270 |
--------------------------------------------------------------------------------
/notebooks/demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## install package"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
20 | "Processing ./zh_core_web_sm-2.0.3.tar.gz\n",
21 | "Requirement already satisfied: spacy>=2.0.0a18 in /home/howl/.local/lib/python3.5/site-packages (from zh-core-web-sm==2.0.3) (2.0.12)\n",
22 | "Requirement already satisfied: dill<0.3,>=0.2 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.2.8.2)\n",
23 | "Requirement already satisfied: thinc<6.11.0,>=6.10.3 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (6.10.3)\n",
24 | "Requirement already satisfied: numpy>=1.7 in /usr/local/lib/python3.5/dist-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.14.2)\n",
25 | "Requirement already satisfied: plac<1.0.0,>=0.9.6 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.6)\n",
26 | "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.5/dist-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2.18.4)\n",
27 | "Requirement already satisfied: cymem<1.32,>=1.30 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.31.2)\n",
28 | "Requirement already satisfied: ujson>=1.35 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.35)\n",
29 | "Requirement already satisfied: regex==2017.4.5 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2017.4.5)\n",
30 | "Requirement already satisfied: murmurhash<0.29,>=0.28 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.28.0)\n",
31 | "Requirement already satisfied: preshed<2.0.0,>=1.0.0 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.0.1)\n",
32 | "Requirement already satisfied: msgpack<1.0.0,>=0.5.6 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.5.6)\n",
33 | "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (4.24.0)\n",
34 | "Requirement already satisfied: six<2.0.0,>=1.10.0 in /usr/lib/python3/dist-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.10.0)\n",
35 | "Requirement already satisfied: msgpack-numpy<1.0.0,>=0.4.1 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.4.3.1)\n",
36 | "Requirement already satisfied: wrapt<1.11.0,>=1.10.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.10.11)\n",
37 | "Requirement already satisfied: cytoolz<0.10,>=0.9.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.0.1)\n",
38 | "Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2.6)\n",
39 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2018.4.16)\n",
40 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (3.0.4)\n",
41 | "Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.22)\n",
42 | "Requirement already satisfied: toolz>=0.8.0 in /home/howl/.local/lib/python3.5/site-packages (from cytoolz<0.10,>=0.9.0->thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.0)\n",
43 | "Building wheels for collected packages: zh-core-web-sm\n",
44 | " Running setup.py bdist_wheel for zh-core-web-sm ... \u001b[?25ldone\n",
45 | "\u001b[?25h Stored in directory: /home/howl/.cache/pip/wheels/04/63/ec/a66252a0376a1953722cb70a85c50aa4836311eca4d69f75f3\n",
46 | "Successfully built zh-core-web-sm\n",
47 | "Installing collected packages: zh-core-web-sm\n",
48 | " Found existing installation: zh-core-web-sm 2.0.2\n",
49 | " Uninstalling zh-core-web-sm-2.0.2:\n",
50 | " Successfully uninstalled zh-core-web-sm-2.0.2\n",
51 | "Successfully installed zh-core-web-sm-2.0.3\n",
52 | "\u001b[33mYou are using pip version 10.0.1, however version 18.0 is available.\n",
53 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "!pip install -q https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases/download/v2.2.X-0.1.0/zh_core_web_sm-0.1.0.tar.gz"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 1,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "!pip install -q pandas\n",
68 | "!pip install -q jieba\n",
69 | "!pip install -q tabulate"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "## import packages"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 2,
82 | "metadata": {},
83 | "outputs": [],
84 | "source": [
85 | "from spacy import displacy\n",
86 | "from tabulate import tabulate\n",
87 | "import pandas as pd\n",
88 | "\n",
89 | "import zh_core_web_sm"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "## load models"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 3,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "nlp = zh_core_web_sm.load()"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "## parse doc"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 4,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stderr",
122 | "output_type": "stream",
123 | "text": [
124 | "Building prefix dict from the default dictionary ...\n",
125 | "Loading model from cache /tmp/jieba.cache\n",
126 | "Loading model cost 0.435 seconds.\n",
127 | "Prefix dict has been built successfully.\n"
128 | ]
129 | }
130 | ],
131 | "source": [
132 | "doc = nlp(\"王小明在北京的清华大学读书\")"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | "## print doc's attributes"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 5,
145 | "metadata": {
146 | "scrolled": false
147 | },
148 | "outputs": [
149 | {
150 | "data": {
151 | "text/html": [
152 |        "[pandas DataFrame HTML table: markup stripped in this dump; see the text/plain rendering below]"
285 | ],
286 | "text/plain": [
287 | " text lemma_ pos_ tag_ dep_ shape_ is_alpha is_stop has_vector \\\n",
288 | "0 王小明 王小明 X NNP nsubj xxx True False False \n",
289 | "1 在 在 VERB VV case x True True True \n",
290 | "2 北京 北京 X NNP nmod xx True False True \n",
291 | "3 的 的 PART DEC case:dec x True True True \n",
292 | "4 清华大学 清华大学 X NNP obl xxxx True False True \n",
293 | "5 读书 读书 VERB VV ROOT xx True False True \n",
294 | "\n",
295 | " ent_iob_ ent_type_ vector_norm is_oov \n",
296 | "0 B PERSON 0.000000 True \n",
297 | "1 O 6.573987 False \n",
298 | "2 B GPE 12.769391 False \n",
299 | "3 O 6.886564 False \n",
300 | "4 B ORG 18.842812 False \n",
301 | "5 O 18.138533 False "
302 | ]
303 | },
304 | "execution_count": 5,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "headers = ['text', 'lemma_', 'pos_', 'tag_', 'dep_',\n",
311 | " 'shape_', 'is_alpha', 'is_stop', 'has_vector',\n",
312 | " 'ent_iob_', 'ent_type_',\n",
313 | " 'vector_norm', 'is_oov']\n",
314 | "\n",
315 | "doc_data = []\n",
316 | "\n",
317 | "for token in doc:\n",
318 | " token_data = [token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n",
319 | " token.shape_, token.is_alpha, token.is_stop, token.has_vector,\n",
320 | " token.ent_iob_, token.ent_type_,\n",
321 | " token.vector_norm, token.is_oov]\n",
322 | " doc_data.append(token_data)\n",
323 | "\n",
324 | "pd.DataFrame(doc_data, columns=headers)"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "## draw dependency graph"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 6,
337 | "metadata": {
338 | "scrolled": false
339 | },
340 | "outputs": [
341 | {
342 | "data": {
343 | "text/html": [
344 |        "[displaCy dependency-parse SVG: markup stripped in this dump]"
415 | ],
416 | "text/plain": [
417 | ""
418 | ]
419 | },
420 | "metadata": {},
421 | "output_type": "display_data"
422 | }
423 | ],
424 | "source": [
425 | "displacy.render(doc, jupyter=True)"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": 7,
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "data": {
435 | "text/html": [
436 |        "[displaCy entity visualization: HTML markup stripped in this dump; highlighted entities: 王小明 (PERSON), 北京 (GPE), 清华大学 (ORG)]"
452 | ],
453 | "text/plain": [
454 | ""
455 | ]
456 | },
457 | "metadata": {},
458 | "output_type": "display_data"
459 | }
460 | ],
461 | "source": [
462 | "displacy.render(doc, jupyter=True, style='ent')"
463 | ]
464 | }
465 | ],
466 | "metadata": {
467 | "celltoolbar": "Raw Cell Format",
468 | "kernelspec": {
469 | "display_name": "Python 3",
470 | "language": "python",
471 | "name": "python3"
472 | },
473 | "language_info": {
474 | "codemirror_mode": {
475 | "name": "ipython",
476 | "version": 3
477 | },
478 | "file_extension": ".py",
479 | "mimetype": "text/x-python",
480 | "name": "python",
481 | "nbconvert_exporter": "python",
482 | "pygments_lexer": "ipython3",
483 | "version": "3.6.9"
484 | }
485 | },
486 | "nbformat": 4,
487 | "nbformat_minor": 1
488 | }
489 |
--------------------------------------------------------------------------------
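The notebook above walks through installing the packaged model, loading it, parsing a sentence, tabulating per-token attributes, and rendering the dependency and entity visualizations. For running the same demo outside Jupyter, a minimal script-style sketch follows; it assumes the zh_core_web_sm package built by this project (and its dependencies such as jieba and pandas) is already installed, and the output file names are arbitrary:

    import pandas as pd
    from spacy import displacy

    import zh_core_web_sm

    # Load the packaged Chinese pipeline and parse the demo sentence from the notebook.
    nlp = zh_core_web_sm.load()
    doc = nlp("王小明在北京的清华大学读书")

    # The same per-token attribute table the notebook builds with pandas.
    headers = ["text", "lemma_", "pos_", "tag_", "dep_", "shape_",
               "is_alpha", "is_stop", "has_vector", "ent_iob_", "ent_type_",
               "vector_norm", "is_oov"]
    rows = [[getattr(token, name) for name in headers] for token in doc]
    print(pd.DataFrame(rows, columns=headers))

    # Outside Jupyter, displacy.render returns markup instead of displaying it inline,
    # so write the dependency SVG and the entity HTML to files and open them in a browser.
    with open("dependency_demo.svg", "w", encoding="utf-8") as f:
        f.write(displacy.render(doc, style="dep"))
    with open("entity_demo.html", "w", encoding="utf-8") as f:
        f.write(displacy.render(doc, style="ent", page=True))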