├── train_ner.py
├── corpus
    └── .gitkeep
├── third-part
    └── .gitkeep
├── requirements_dev.txt
├── spacy-dev-resources
├── POS_depedency_model
    └── .gitignore
├── train_requirements.txt
├── requirements.txt
├── create_wikipedia_corpus.bash
├── extract_UD_Chinese-GSD_corpus.bash
├── .images
    ├── ner_of_doc.png
    ├── attributes_of_doc.png
    ├── dependency_of_doc.png
    ├── attributes_of_doc.html
    ├── temp.html
    └── dependency_of_doc.svg
├── merge_all_text_files.bash
├── move_wikipedia_corpus.bash
├── create_jsonl_corpus.bash
├── init_model.bash
├── download_UD_Chinese-GSD_corpus.bash
├── download_and_compile_brown_cluster.bash
├── train.bash
├── create_model_package.bash
├── create_init_model.bash
├── onto_to_spacy_json.bash
├── train_ner.bash
├── compute_brown_cluster.bash
├── compute_plain_word_vec.bash
├── compute_words_freq.bash
├── train_model.bash
├── .gitmodules
├── .idea
    ├── vcs.xml
    ├── modules.xml
    ├── misc.xml
    └── Chinese_models_for_SpaCy.iml
├── convert_UD_Chinese-GSD_corpus.bash
├── format_convertor.bash
├── update_model_meta.py
├── test_init_model.py
├── merge_all_text_files.py
├── all_in_one.bash
├── test_dependency_model.py
├── meta.json
├── test.py
├── test_as_model_dir.py
├── test_ner.py
├── test_load.py
├── LICENSE.md
├── merge_submodel.py
├── plain_word_vectors.py
├── README.md
├── workflow.md
├── README.en-US.md
├── .gitignore
├── onto_to_spacy_json.py
├── create_jsonl_vocabulary.py
└── notebooks
    └── demo.ipynb


/train_ner.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/corpus/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/third-part/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | 


--------------------------------------------------------------------------------
/spacy-dev-resources:
--------------------------------------------------------------------------------
1 | ../spacy-dev-resources


--------------------------------------------------------------------------------
/POS_depedency_model/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | **/*
3 | !.gitignore


--------------------------------------------------------------------------------
/train_requirements.txt:
--------------------------------------------------------------------------------
1 | -r ./spacy-dev-resources/requirements.txt
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | plac
2 | spacy
3 | pandas
4 | jieba
5 | ftfy
6 | validators
7 | 


--------------------------------------------------------------------------------
/create_wikipedia_corpus.bash:
--------------------------------------------------------------------------------
1 | cd chinese-wikipedia-corpus-creator
2 | bash ./allinone_process.bash
3 | 


--------------------------------------------------------------------------------
/extract_UD_Chinese-GSD_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | unzip ./corpus/UD_Chinese-GSD.zip -d ./corpus
4 | 


--------------------------------------------------------------------------------
/.images/ner_of_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/ner_of_doc.png


--------------------------------------------------------------------------------
/merge_all_text_files.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python ./merge_all_text_files.py ./token_cleaned_plain_files ./WORDS.txt
4 | 


--------------------------------------------------------------------------------
/move_wikipedia_corpus.bash:
--------------------------------------------------------------------------------
1 | cp -r chinese-wikipedia-corpus-creator/token_cleaned_plain_files  token_cleaned_plain_files
2 | 


--------------------------------------------------------------------------------
/.images/attributes_of_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/attributes_of_doc.png


--------------------------------------------------------------------------------
/.images/dependency_of_doc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/howl-anderson/Chinese_models_for_SpaCy/HEAD/.images/dependency_of_doc.png


--------------------------------------------------------------------------------
/create_jsonl_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python ./create_jsonl_vocabulary.py zh spacy_corpus.jsonl WORDS_FREQ.txt ./WORDS-c1000-p1.out/paths
4 | 


--------------------------------------------------------------------------------
/init_model.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | python -m spacy init-model zh zh_model/ WORDS_FREQ.txt -c WORDS-c1000-p1.out/paths -v WORDS_VECS.txt
4 | 


--------------------------------------------------------------------------------
/download_UD_Chinese-GSD_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | wget -c https://github.com/UniversalDependencies/UD_Chinese-GSD/archive/master.zip -O corpus/UD_Chinese-GSD.zip
4 | 


--------------------------------------------------------------------------------
/download_and_compile_brown_cluster.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | cd third-part
4 | 
5 | git clone https://github.com/percyliang/brown-cluster.git
6 | 
7 | cd brown-cluster
8 | make


--------------------------------------------------------------------------------
/train.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | ../MITIE/tools/wordrep/build/wordrep --count-words 800000 --word-vects --basic-morph --cca-morph ../wikipedia-corpus-creator/token_cleaned_plain_files
4 | 


--------------------------------------------------------------------------------
/create_model_package.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python -m spacy package spacy_models/final_model spacy_package --force
4 | 
5 | cd spacy_package/zh_core_web_sm-0.1.0
6 | python ./setup.py sdist
7 | 


--------------------------------------------------------------------------------
/create_init_model.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python -m spacy init-model zh spacy_models/base_model --jsonl-loc ./spacy_corpus.jsonl --vectors-loc WORDS_VECS.txt --vectors-name zh_core_web_sm.vectors
4 | 


--------------------------------------------------------------------------------
/onto_to_spacy_json.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | python onto_to_spacy_json.py -i "./ontonotes-release-5.0/data/files/data/chinese/annotations/" -t "china_ner_train.json" -e "china_ner_eval.json" -v 0.05
4 | 


--------------------------------------------------------------------------------
/train_ner.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python -m spacy train zh spacy_models/ner_model ./china_ner_train.json ./china_ner_eval.json --pipeline ner -m meta.json -v ./spacy_models/dependency_model/model-best -n 1
4 | 


--------------------------------------------------------------------------------
/compute_brown_cluster.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cpu_count=`nproc --all`
4 | process_count=$(expr $cpu_count - 1)
5 | 
6 | ./third-part/brown-cluster/wcluster --text WORDS.txt --c 1000 --threads ${process_count}
7 | 


--------------------------------------------------------------------------------
/compute_plain_word_vec.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cpu_count=`nproc --all`
4 | process_count=$(expr $cpu_count - 1)
5 | 
6 | python ./spacy-dev-resources/training/plain_word_vectors.py -i 200 -n ${process_count} ./WORDS.txt WORDS_VECS.txt
7 | 


--------------------------------------------------------------------------------
/compute_words_freq.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | cpu_count=`nproc --all`
4 | process_count=$(expr $cpu_count - 1)
5 | 
6 | python ./spacy-dev-resources/training/plain_word_freqs.py -n ${process_count} token_cleaned_plain_files WORDS_FREQ.txt
7 | 


--------------------------------------------------------------------------------
/train_model.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | python -m spacy train zh spacy_models/dependency_model corpus/spacy/zh-simplified-ud-train.json corpus/spacy/zh-simplified-ud-dev.json --pipeline tagger,parser -v spacy_models/base_model -m meta.json -V 0.1.0 -n 1
4 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "spacy-dev-resources"]
2 | 	path = spacy-dev-resources
3 | 	url = https://github.com/howl-anderson/spacy-dev-resources.git
4 | [submodule "third-part/brown-cluster"]
5 | 	path = third-part/brown-cluster
6 | 	url = https://github.com/howl-anderson/brown-cluster.git
7 | 


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 |     <mapping directory="$PROJECT_DIR$/third-part/brown-cluster" vcs="Git" />
6 |   </component>
7 | </project>


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/Chinese_models_for_SpaCy.iml" filepath="$PROJECT_DIR$/.idea/Chinese_models_for_SpaCy.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/convert_UD_Chinese-GSD_corpus.bash:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | cd corpus/UD_Chinese-GSD-master
4 | 
5 | opencc -i zh_gsd-ud-train.conllu -o zh-simplified-ud-train.conllu -c t2s.json
6 | opencc -i zh_gsd-ud-dev.conllu -o zh-simplified-ud-dev.conllu -c t2s.json
7 | opencc -i zh_gsd-ud-test.conllu -o zh-simplified-ud-test.conllu -c t2s.json
8 | 


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="JavaScriptSettings">
4 |     <option name="languageLevel" value="ES6" />
5 |   </component>
6 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (Chinese_models_for_SpaCy)" project-jdk-type="Python SDK" />
7 | </project>


--------------------------------------------------------------------------------
/format_convertor.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mkdir -p corpus/spacy
4 | 
5 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-train.conllu corpus/spacy
6 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-dev.conllu corpus/spacy
7 | python -m spacy convert corpus/UD_Chinese-GSD-master/zh-simplified-ud-test.conllu corpus/spacy
8 | 


--------------------------------------------------------------------------------
/update_model_meta.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | 
 4 | def main():
 5 |     with open("./spacy_models/base_model/meta.json") as fd:
 6 |         data = json.load(infd)
 7 | 
 8 |     data["name"] = "core_web_sm"
 9 | 
10 |     with open("./spacy_models/base_model/meta.json", "wt") as fd:
11 |         json.dump(data, fd)
12 | 
13 | 
14 | if __name__ == "__main__":
15 |     main()
16 | 


--------------------------------------------------------------------------------
/test_init_model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from spacy import displacy
 4 | import spacy
 5 | 
 6 | nlp = spacy.load('zh_model/')
 7 | 
 8 | 
 9 | def main():
10 |     doc = nlp("王小明在北京的清华大学读书")
11 |     for token in doc:
12 |         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 |               token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 |               token.vector_norm, token.is_oov)
15 | 
16 |     displacy.serve(doc)
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     main()
21 | 


--------------------------------------------------------------------------------
/merge_all_text_files.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import sys
 3 | 
 4 | input_dir = sys.argv[1]
 5 | input_path = pathlib.Path(input_dir)
 6 | input_files = input_path.glob("*")
 7 | 
 8 | output_file = sys.argv[2]
 9 | output_path = pathlib.Path(output_file)
10 | 
11 | 
12 | with output_path.open("wt") as outfile:
13 |     for fname in input_files:
14 |         with fname.open("rt") as infile:
15 |             for line in infile:
16 |                 if not line.endswith("\n"):
17 |                     line = line + "\n"
18 |                 outfile.write(line)
19 | 


--------------------------------------------------------------------------------
/all_in_one.bash:
--------------------------------------------------------------------------------
 1 | ./create_wikipedia_corpus.bash
 2 | ./move_wikipedia_corpus.bash
 3 | ./compute_words_freq.bash
 4 | ./merge_all_text_files.bash
 5 | ./download_and_compile_brown_cluster.bash
 6 | ./compute_plain_word_vec.bash
 7 | ./create_init_model.bash
 8 | ./update_model_meta.py
 9 | ./download_UD_Chinese-GSD_corpus.bash
10 | ./extract_UD_Chinese-GSD_corpus.bash
11 | ./convert_UD_Chinese-GSD_corpus.bash
12 | ./format_convertor.bash
13 | ./init_model.bash
14 | ./train_model.bash
15 | ./onto_to_spacy_json.bash
16 | ./train_ner.bash
17 | ./merge_submodel.py
18 | 


--------------------------------------------------------------------------------
/test_dependency_model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from spacy import displacy
 4 | import spacy
 5 | 
 6 | nlp = spacy.load('depedency_model/model-final/')
 7 | 
 8 | 
 9 | def main():
10 |     doc = nlp("王小明在北京的清华大学读书")
11 |     for token in doc:
12 |         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 |               token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 |               token.vector_norm, token.is_oov)
15 | 
16 |     displacy.serve(doc)
17 | 
18 | 
19 | if __name__ == "__main__":
20 |     main()
21 | 


--------------------------------------------------------------------------------
/meta.json:
--------------------------------------------------------------------------------
 1 |  {
 2 |      "name": "core_web_sm",
 3 |      "version": "0.1.0",
 4 |      "license": "CC BY-SA 3.0",
 5 |      "author": "Xiaoquan Kong",
 6 |      "url": "https://xiaoquankong.ai",
 7 |      "sources": [
 8 |          "OntoNotes 5",
 9 |          "Common Crawl",
10 |          "Universal Dependencies"
11 |      ],
12 |      "email": "u1mail2me@gmail.com",
13 |      "description": "Chinese multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities."
14 |  }
15 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from spacy import displacy
 4 | 
 5 | import zh_core_web_sm
 6 | 
 7 | nlp = zh_core_web_sm.load()
 8 | 
 9 | 
10 | def main():
11 |     doc = nlp("王小明在北京的清华大学读书")
12 |     for token in doc:
13 |         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
14 |               token.shape_, token.is_alpha, token.is_stop, token.has_vector,
15 |               token.ent_iob_, token.ent_type_,
16 |               token.vector_norm, token.is_oov)
17 | 
18 |     # displacy.serve(doc)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 


--------------------------------------------------------------------------------
/test_as_model_dir.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from spacy import displacy
 4 | import spacy
 5 | 
 6 | nlp = spacy.load("./spacy_models/final_model")
 7 | 
 8 | 
 9 | def main():
10 |     doc = nlp("王小明在北京的清华大学读书")
11 |     for token in doc:
12 |         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 |               token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 |               token.ent_iob_, token.ent_type_,
15 |               token.vector_norm, token.is_oov)
16 | 
17 |     # displacy.serve(doc)
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     main()
22 | 


--------------------------------------------------------------------------------
/test_ner.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from spacy import displacy
 4 | import spacy
 5 | 
 6 | nlp = spacy.load('ner_model/model-final')
 7 | 
 8 | 
 9 | def main():
10 |     doc = nlp("王小明在北京的清华大学读书")
11 |     for token in doc:
12 |         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
13 |               token.shape_, token.is_alpha, token.is_stop, token.has_vector,
14 |               token.ent_iob_, token.ent_type_,
15 |               token.vector_norm, token.is_oov)
16 | 
17 |     displacy.serve(doc, style='ent')
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     main()
22 | 


--------------------------------------------------------------------------------
/test_load.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from spacy import displacy
 4 | 
 5 | import spacy
 6 | 
 7 | nlp = spacy.load('package_templete/zh_core_web_sm-2.0.5/zh_core_web_sm/zh_core_web_sm-2.0.5')
 8 | 
 9 | 
10 | def main():
11 |     doc = nlp("王小明在北京的清华大学读书")
12 |     for token in doc:
13 |         print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
14 |               token.shape_, token.is_alpha, token.is_stop, token.has_vector,
15 |               token.ent_iob_, token.ent_type_,
16 |               token.vector_norm, token.is_oov)
17 | 
18 |     # displacy.serve(doc)
19 | 
20 | 
21 | if __name__ == "__main__":
22 |     main()
23 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Xiaoquan Kong
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.idea/Chinese_models_for_SpaCy.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$">
 5 |       <sourceFolder url="file://$MODULE_DIR$/package_templete/zh_core_web_sm-2.0.2" isTestSource="false" />
 6 |       <excludeFolder url="file://$MODULE_DIR$/POS_depedency_model" />
 7 |       <excludeFolder url="file://$MODULE_DIR$/corpus" />
 8 |       <excludeFolder url="file://$MODULE_DIR$/package_templete" />
 9 |       <excludeFolder url="file://$MODULE_DIR$/spacy-dev-resources" />
10 |       <excludeFolder url="file://$MODULE_DIR$/token_cleaned_plain_files" />
11 |       <excludeFolder url="file://$MODULE_DIR$/zh_wiki_core" />
12 |       <excludeFolder url="file://$MODULE_DIR$/zh_wiki_core_sm" />
13 |     </content>
14 |     <orderEntry type="jdk" jdkName="Python 3.6 (Chinese_models_for_SpaCy)" jdkType="Python SDK" />
15 |     <orderEntry type="sourceFolder" forTests="false" />
16 |   </component>
17 |   <component name="TestRunnerService">
18 |     <option name="projectConfiguration" value="py.test" />
19 |     <option name="PROJECT_TEST_RUNNER" value="py.test" />
20 |   </component>
21 | </module>


--------------------------------------------------------------------------------
/merge_submodel.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | import shutil
 4 | import json
 5 | from pathlib import Path
 6 | 
 7 | 
 8 | def read_pipeline(meta_file):
 9 |     with open(meta_file) as fd:
10 |         data = json.load(fd)
11 |         return data["pipeline"]
12 | 
13 | 
14 | def update_pipeline(meta_file, pipeline):
15 |     with open(meta_file) as fd:
16 |         data = json.load(fd)
17 | 
18 |     data["pipeline"] = pipeline
19 | 
20 |     with open(meta_file, "w") as fd:
21 |         json.dump(data, fd)
22 | 
23 | 
24 | def copy_tree(src: Path, dst: Path, folder: str):
25 |     shutil.copytree(src / folder, dst / folder)
26 | 
27 | 
28 | def main():
29 |     target_dir = Path("./spacy_models/final_model")
30 |     target_dir.mkdir(exist_ok=True)
31 | 
32 |     pipeline = []
33 | 
34 |     source_dir = Path("./spacy_models/dependency_model/model-best")
35 |     copy_tree(source_dir, target_dir, "parser")
36 |     copy_tree(source_dir, target_dir, "tagger")
37 |     copy_tree(source_dir, target_dir, "vocab")
38 | 
39 |     pipeline.extend(read_pipeline(source_dir / "meta.json"))
40 | 
41 |     source_dir = Path("./spacy_models/ner_model/model-best")
42 |     copy_tree(source_dir, target_dir, "ner")
43 |     shutil.copy(source_dir / "meta.json", target_dir / "meta.json")
44 | 
45 |     pipeline.extend(read_pipeline(source_dir / "meta.json"))
46 | 
47 |     update_pipeline(target_dir / "meta.json", pipeline)
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     main()
52 | 


--------------------------------------------------------------------------------
/plain_word_vectors.py:
--------------------------------------------------------------------------------
 1 | import plac
 2 | import gensim
 3 | from gensim import utils
 4 | 
 5 | 
 6 | class Corpus:
 7 |     def __init__(self, corpus_file):
 8 |         self.corpus_file = corpus_file
 9 | 
10 |     def __iter__(self):
11 |         with open(self.corpus_file) as fd:
12 |             for line in fd:
13 |                 yield utils.simple_preprocess(line)
14 | 
15 | 
16 | @plac.annotations(
17 |     in_dir=("Location of input directory"),
18 |     out_loc=("Location of output file"),
19 |     n_workers=("Number of workers", "option", "n", int),
20 |     size=("Dimension of the word vectors", "option", "d", int),
21 |     window=("Context window size", "option", "w", int),
22 |     min_count=("Min count", "option", "m", int),
23 |     negative=("Number of negative samples", "option", "g", int),
24 |     nr_iter=("Number of iterations", "option", "i", int),
25 | )
26 | def main(
27 |     in_dir,
28 |     out_loc,
29 |     negative=5,
30 |     n_workers=4,
31 |     window=5,
32 |     size=128,
33 |     min_count=10,
34 |     nr_iter=2,
35 | ):
36 |     sentences = Corpus(in_dir)
37 |     model = gensim.models.Word2Vec(
38 |         sentences=sentences,
39 |         size=size,
40 |         window=window,
41 |         min_count=min_count,
42 |         workers=n_workers,
43 |         sample=1e-5,
44 |         negative=negative,
45 |         iter=nr_iter,
46 |     )
47 |     model.wv.save_word2vec_format(out_loc, binary=False)
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     plac.call(main)
52 | 


--------------------------------------------------------------------------------
/.images/attributes_of_doc.html:
--------------------------------------------------------------------------------
  1 | <div>
  2 | <style scoped="">
  3 |     .dataframe tbody tr th:only-of-type {
  4 |         vertical-align: middle;
  5 |     }
  6 | 
  7 |     .dataframe tbody tr th {
  8 |         vertical-align: top;
  9 |     }
 10 | 
 11 |     .dataframe thead th {
 12 |         text-align: right;
 13 |     }
 14 | </style>
 15 | <table border="1" class="dataframe">
 16 |   <thead>
 17 |     <tr style="text-align: right;">
 18 |       <th></th>
 19 |       <th>text</th>
 20 |       <th>lemma_</th>
 21 |       <th>pos_</th>
 22 |       <th>tag_</th>
 23 |       <th>dep_</th>
 24 |       <th>shape_</th>
 25 |       <th>is_alpha</th>
 26 |       <th>is_stop</th>
 27 |       <th>has_vector</th>
 28 |       <th>vector_norm</th>
 29 |       <th>is_oov</th>
 30 |     </tr>
 31 |   </thead>
 32 |   <tbody>
 33 |     <tr>
 34 |       <th>0</th>
 35 |       <td>王小明</td>
 36 |       <td>王小明</td>
 37 |       <td>X</td>
 38 |       <td>NNP</td>
 39 |       <td>nsubj</td>
 40 |       <td>xxx</td>
 41 |       <td>True</td>
 42 |       <td>False</td>
 43 |       <td>True</td>
 44 |       <td>0.392991</td>
 45 |       <td>True</td>
 46 |     </tr>
 47 |     <tr>
 48 |       <th>1</th>
 49 |       <td>在</td>
 50 |       <td>在</td>
 51 |       <td>X</td>
 52 |       <td>VV</td>
 53 |       <td>acl</td>
 54 |       <td>x</td>
 55 |       <td>True</td>
 56 |       <td>False</td>
 57 |       <td>True</td>
 58 |       <td>7.318524</td>
 59 |       <td>False</td>
 60 |     </tr>
 61 |     <tr>
 62 |       <th>2</th>
 63 |       <td>北京</td>
 64 |       <td>北京</td>
 65 |       <td>X</td>
 66 |       <td>NNP</td>
 67 |       <td>det</td>
 68 |       <td>xx</td>
 69 |       <td>True</td>
 70 |       <td>False</td>
 71 |       <td>True</td>
 72 |       <td>10.940736</td>
 73 |       <td>False</td>
 74 |     </tr>
 75 |     <tr>
 76 |       <th>3</th>
 77 |       <td>的</td>
 78 |       <td>的</td>
 79 |       <td>X</td>
 80 |       <td>DEC</td>
 81 |       <td>case:dec</td>
 82 |       <td>x</td>
 83 |       <td>True</td>
 84 |       <td>False</td>
 85 |       <td>True</td>
 86 |       <td>6.201293</td>
 87 |       <td>False</td>
 88 |     </tr>
 89 |     <tr>
 90 |       <th>4</th>
 91 |       <td>清华大学</td>
 92 |       <td>清华大学</td>
 93 |       <td>X</td>
 94 |       <td>NNP</td>
 95 |       <td>obj</td>
 96 |       <td>xxxx</td>
 97 |       <td>True</td>
 98 |       <td>False</td>
 99 |       <td>True</td>
100 |       <td>12.044737</td>
101 |       <td>False</td>
102 |     </tr>
103 |     <tr>
104 |       <th>5</th>
105 |       <td>读书</td>
106 |       <td>读书</td>
107 |       <td>X</td>
108 |       <td>VV</td>
109 |       <td>ROOT</td>
110 |       <td>xx</td>
111 |       <td>True</td>
112 |       <td>False</td>
113 |       <td>True</td>
114 |       <td>11.602811</td>
115 |       <td>False</td>
116 |     </tr>
117 |   </tbody>
118 | </table>
119 | </div>
120 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [README written in English](README.en-US.md)
  2 | ------------------------------
  3 | # SpaCy 官方中文模型已经上线(https://spacy.io/models/zh), 参考了本项目，具有相同的特性。本项目『推动 SpaCy 中文模型开发』的使命已经完成，本项目将进入维护状态，后续更新将只进行 bug 修复，感谢各位用户长期的关注和支持。
  4 | 
  5 | # SpaCy 中文模型
  6 | 
  7 | 为 SpaCy 提供的中文数据模型. 模型目前还处于 beta 公开测试的状态 。
  8 | 
  9 | ## 在线演示
 10 | 
 11 | 基于 Jupyter notebook 的在线演示在 [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/howl-anderson/Chinese_models_for_SpaCy/master?filepath=notebooks%2Fdemo.ipynb)。
 12 | 
 13 | ### 特性
 14 | 部分 `王小明在北京的清华大学读书` 这个 `Doc` 对象的属性信息:
 15 | 
 16 | ![attributes_of_doc](.images/attributes_of_doc.png)
 17 | 
 18 | ### NER (**New!**)
 19 | 部分 `王小明在北京的清华大学读书` 这个 `Doc` 对象的 NER 信息:
 20 | 
 21 | ![ner_of_doc](.images/ner_of_doc.png)
 22 | 
 23 | ## 开始使用
 24 | 
 25 | 模型用二进制文件的形式进行分发, 用户应该具备基础的 SpaCy （version > 2) 的基础知识.
 26 | 
 27 | ### 系统要求
 28 | 
 29 | Python 3 (也许支持 python2, 但未经过良好测试)
 30 | 
 31 | ### 安装
 32 | 
 33 | #### 下载模型
 34 | 从 [releases](https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases) 页面下载模型 (**New!** 为中国地区的用户提供了加速下载的链接)。假设所下载的模型名为 `zh_core_web_sm-2.x.x.tar.gz`。
 35 | 
 36 | #### 安装模型
 37 | 
 38 | ```
 39 | pip install zh_core_web_sm-2.x.x.tar.gz
 40 | ```
 41 | 
 42 | 为了方便后续在 Rasa NLU 等框架中使用，需要再为这个模型建立一个链接，by 执行以下命令：
 43 | 
 44 | ```bash
 45 | spacy link zh_core_web_sm zh
 46 | ```
 47 | 
 48 | 运行完成后就可以使用 zh 这个别名来访问这个模型了。
 49 | 
 50 | ## 运行 Demo 代码
 51 | 
 52 | Demo 代码位于 `test.py`. 在安装好模型后，用户下载或者克隆本仓库的代码，然后可以直接执行
 53 | 
 54 | ```bash
 55 | python3 ./test.py
 56 | ```
 57 | 
 58 | 打开地址 `http://127.0.0.1:5000`, 将看到如下：
 59 | 
 60 | ![Dependency of doc](.images/dependency_of_doc.png)
 61 | 
 62 | ## 如何从零构造这个模型
 63 | 
 64 | 见 [workflow](workflow.md)
 65 | 
 66 | ## 语料库
 67 | 本项目使用的语料库是 OntoNotes 5.0。
 68 | 
 69 | 由于 OntoNotes 5.0 是 LDC ([Linguistic Data Consortium](https://www.ldc.upenn.edu/)) 的版权材料，无法直接包含在本项目中。好消息是，OntoNotes 5.0 对于 团体用户（包含企业和学术组织）是完全免费的。用户可以建立一个企业或者学术组织账号，然后免费获取 OntoNotes 5.0。
 70 | 
 71 | ## TODO list
 72 | 
 73 | * 属性 `pos_` 不正确. 这个和 SpaCy 中中文语言 Class 相关。
 74 | * 属性 `shape_` and `is_alpha` 似乎对中文并无意义, 但需要权威信息确认一下.
 75 | * 属性 `is_stop` 不正确. 这个和 SpaCy 中中文语言 Class 相关。
 76 | * 属性 `vector` 似乎没有训练的很好。
 77 | * <s>属性 `is_oov` 完全错误. 第一优先级修复。</s>
 78 | * <s>NER 模型，因为缺少 LDC 语料库，目前不可用. 正在解决中正在训练中。</s> 
 79 | * 将训练中所用的中间结果 release 出来, 方便用户自行定制模型
 80 | 
 81 | ## 使用的组件
 82 | 
 83 | * TODO
 84 | 
 85 | ## 如何贡献
 86 | 
 87 | 请阅读 [CONTRIBUTING.md](https://gist.github.com/PurpleBooth/b24679402957c63ec426) , 然后提交 pull requests 给我们.
 88 | 
 89 | ## 版本化控制
 90 | 
 91 | 我们使用 [SemVer](http://semver.org/) 做版本化的标准. 查看 `tags` 以了解所有的版本.
 92 | 
 93 | ## 作者
 94 | 
 95 | * **Xiaoquan Kong** - *Initial work* - [howl-anderson](https://github.com/howl-anderson)
 96 | 
 97 | 更多贡献者信息，请参考 `contributors`.
 98 | 
 99 | ## 版权
100 | 
101 | MIT License - 详见 [LICENSE.md](LICENSE.md)
102 | 
103 | ## 致谢
104 | 
105 | * TODO
106 | 


--------------------------------------------------------------------------------
/.images/temp.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>Title</title>
  6 | </head>
  7 | <body>
  8 | <div>
  9 |     <style scoped="">
 10 |         .dataframe tbody tr th:only-of-type {
 11 |             vertical-align: middle;
 12 |         }
 13 | 
 14 |         .dataframe tbody tr th {
 15 |             vertical-align: top;
 16 |         }
 17 | 
 18 |         .dataframe thead th {
 19 |             text-align: right;
 20 |         }
 21 |     </style>
 22 |     <table border="1" class="dataframe">
 23 |         <thead>
 24 |         <tr style="text-align: right;">
 25 |             <th></th>
 26 |             <th>text</th>
 27 |             <th>lemma_</th>
 28 |             <th>pos_</th>
 29 |             <th>tag_</th>
 30 |             <th>dep_</th>
 31 |             <th>shape_</th>
 32 |             <th>is_alpha</th>
 33 |             <th>is_stop</th>
 34 |             <th>has_vector</th>
 35 |             <th>vector_norm</th>
 36 |             <th>is_oov</th>
 37 |         </tr>
 38 |         </thead>
 39 |         <tbody>
 40 |         <tr>
 41 |             <th>0</th>
 42 |             <td>王小明</td>
 43 |             <td>王小明</td>
 44 |             <td>X</td>
 45 |             <td>NNP</td>
 46 |             <td>nsubj</td>
 47 |             <td>xxx</td>
 48 |             <td>True</td>
 49 |             <td>False</td>
 50 |             <td>True</td>
 51 |             <td>0.392991</td>
 52 |             <td>True</td>
 53 |         </tr>
 54 |         <tr>
 55 |             <th>1</th>
 56 |             <td>在</td>
 57 |             <td>在</td>
 58 |             <td>X</td>
 59 |             <td>VV</td>
 60 |             <td>acl</td>
 61 |             <td>x</td>
 62 |             <td>True</td>
 63 |             <td>False</td>
 64 |             <td>True</td>
 65 |             <td>7.318524</td>
 66 |             <td>False</td>
 67 |         </tr>
 68 |         <tr>
 69 |             <th>2</th>
 70 |             <td>北京</td>
 71 |             <td>北京</td>
 72 |             <td>X</td>
 73 |             <td>NNP</td>
 74 |             <td>det</td>
 75 |             <td>xx</td>
 76 |             <td>True</td>
 77 |             <td>False</td>
 78 |             <td>True</td>
 79 |             <td>10.940736</td>
 80 |             <td>False</td>
 81 |         </tr>
 82 |         <tr>
 83 |             <th>3</th>
 84 |             <td>的</td>
 85 |             <td>的</td>
 86 |             <td>X</td>
 87 |             <td>DEC</td>
 88 |             <td>case:dec</td>
 89 |             <td>x</td>
 90 |             <td>True</td>
 91 |             <td>False</td>
 92 |             <td>True</td>
 93 |             <td>6.201293</td>
 94 |             <td>False</td>
 95 |         </tr>
 96 |         <tr>
 97 |             <th>4</th>
 98 |             <td>清华大学</td>
 99 |             <td>清华大学</td>
100 |             <td>X</td>
101 |             <td>NNP</td>
102 |             <td>obj</td>
103 |             <td>xxxx</td>
104 |             <td>True</td>
105 |             <td>False</td>
106 |             <td>True</td>
107 |             <td>12.044737</td>
108 |             <td>False</td>
109 |         </tr>
110 |         <tr>
111 |             <th>5</th>
112 |             <td>读书</td>
113 |             <td>读书</td>
114 |             <td>X</td>
115 |             <td>VV</td>
116 |             <td>ROOT</td>
117 |             <td>xx</td>
118 |             <td>True</td>
119 |             <td>False</td>
120 |             <td>True</td>
121 |             <td>11.602811</td>
122 |             <td>False</td>
123 |         </tr>
124 |         </tbody>
125 |     </table>
126 | </div>
127 | </body>
128 | </html>


--------------------------------------------------------------------------------
/workflow.md:
--------------------------------------------------------------------------------
  1 | # SpaCy Chinese model training workflow
  2 | 
  3 | ## get preprocessed Chinese Wikipedia corpus
  4 |    see project [chinese-wikipedia-corpus-creator](https://github.com/howl-anderson/chinese-wikipedia-corpus-creator) for more details.
  5 |    
  6 | ### produce wikipedia corpus ###
  7 |    * input: -
  8 |    * output: `token_cleaned_plain_files/`
  9 |    * script: `create_wikipedia_corpus.bash`
 10 | 
 11 | ### copy corpus to workspace ###
 12 |    * input: `chinese-wikipedia-corpus-creator/token_cleaned_plain_files/``
 13 |    * output: `token_cleaned_plain_files/`
 14 |    * script: `move_wikipedia_corpus.bash`
 15 | 
 16 | ## computing word frequency
 17 |    * input: `token_cleaned_plain_files/*`
 18 |    * output: `WORDS_FREQ.txt`
 19 |    * script: `compute_words_freq.bash`
 20 | 
 21 | ## merge all files into one
 22 |    * input: `token_cleaned_plain_files/*`
 23 |    * output: `WORDS.txt`
 24 |    * script: `merge_all_text_files.bash`
 25 | 
 26 | ## compute brown cluster
 27 | ### brown cluster computing software
 28 |    Official software is [brown-cluster](https://github.com/percyliang/brown-cluster).
 29 | 
 30 | ### install
 31 |    * input: -
 32 |    * output: ``
 33 |    * script: `download_and_compile_brown_cluster.bash`
 34 | 
 35 | ### computing
 36 |    * input: `WORDS.txt`
 37 |    * output: `WORDS-c1000-p1.out/*`
 38 |    * script: `compute_brown_cluster.bash`
 39 | 
 40 | ## compute word vector
 41 |    * input: `token_cleaned_plain_files/*`
 42 |    * output: `WORDS_VECS.txt`
 43 |    * script: `compute_plain_word_vec.bash`
 44 | 
 45 | ## initial SpaCy model
 46 | 
 47 | ### build base model
 48 |    * input: `./WORDS-c1000-p1.out/paths  WORDS_VECS.txt  WORDS_FREQ.txt`
 49 |    * output: `spacy_models/base_model/**/*`
 50 |    * script: `create_init_model.bash`
 51 |    
 52 | ### modify model name
 53 |    * input: `spacy_models/base_model/meta.json`
 54 |    * output: `spacy_models/base_model/meta.json`
 55 |    * script: `update_model_meta.py`
 56 |    
 57 | 
 58 | ## getting UD_Chinese-GSD corpus
 59 | 
 60 | ### download
 61 |    * input: -
 62 |    * output: `corpus/UD_Chinese-GS.zip`
 63 |    * script: `download_UD_Chinese-GSD_corpus.bash`
 64 | 
 65 | ### extracting
 66 |    * input: `corpus/UD_Chinese-GSd.zip`
 67 |    * output: `corpus/UD_Chinese-GSd`
 68 |    * script: `extract_UD_Chinese-GSD_corpus.bash`
 69 | 
 70 | ### convert to simplified Chinese
 71 |    * input: `corpus/UD_Chinese-GSd/zh-ud-*.conllu`
 72 |    * output: `corpus/UD_Chinese-GSd/zh-simplified-ud-*.conllu`
 73 |    * script: `convert_UD_Chinese-GSD_corpus.bash`
 74 | 
 75 | ## convert UD corpus format
 76 |    * input: `.corpus/UD_Chinese-GSD/zh-simplified-ud-*.conllu`
 77 |    * output: `corpus/spacy/zh-simplified-ud-*.conllu`
 78 |    * script: `format_convertor.bash`
 79 | 
 80 | ## init spacy model with word vector & word cluster & word frquence
 81 |    * input: `WORDS_FREQ.txt`, `WORDS-c1000-p1.out/paths`, `WORDS_VECS.txt`
 82 |    * output: `zh_model/*`
 83 |    * script: `init_model.bash`
 84 | 
 85 | ## train SpaCy model for POS and dependency parser
 86 |    * input: `zh_model  corpus/spacy/zh-simplified-ud-*.conllu`
 87 |    * output: `dependency_model`
 88 |    * script: `train_model.bash`
 89 | 
 90 | ## translate onotNote 5 to spacy json file
 91 |    * input: `TODO` 
 92 |    * output: `TODO`
 93 |    * script: `onto_to_spacy_json.bash`
 94 | 
 95 | ## train SpaCy model for NER parser
 96 |    * input: `zh_model china_ner_train.json china_ner_eval.json`
 97 |    * output: `ner_model`
 98 |    * script: `train_ner.bash`
 99 | 
100 | ## merge sub-model
101 |    * input: `spacy_models/dependency_model`, `spacy_models/ner_model`
102 |    * output: `spacy_models/final_model`
103 |    * script: `merge_submodel.py`
104 | 
105 | ## create package 
106 |    * input: `spacy_models/final_model/`
107 |    * output: `spacy_package/`
108 |    * script: `./create_model_package.bash`
109 | 


--------------------------------------------------------------------------------
/README.en-US.md:
--------------------------------------------------------------------------------
  1 | [中文版本的 README](README.zh-Hans.md)
  2 | ------------------------------
  3 | 
  4 | # The official Chinese model for SpaCy is now available at (https://spacy.io/models/zh). It was developed with reference to this project and shares the same features. As the goal of this project — “promoting the development of the SpaCy Chinese model” — has been achieved, this repository will enter maintenance mode. Future updates will focus only on bug fixes. We would like to thank all users for their long-term attention and support.
  5 | 
  6 | # Chinese models for SpaCy
  7 | 
  8 | SpaCy (version > 2) models for Chinese language. Those models are rough and still **working in prograss**. But "Something is Better Than Nothing".
  9 | 
 10 | ## Online demo
 11 | 
 12 | An online jupyter notebook / demo is provided at [![Binder](https://mybinder.org/badge.svg)](https://mybinder.org/v2/gh/howl-anderson/Chinese_models_for_SpaCy/master?filepath=notebooks%2Fdemo.ipynb).
 13 | 
 14 | ### Features
 15 | 
 16 | Partial attributes of a `Doc` object for `王小明在北京的清华大学读书`:
 17 | 
 18 | ![attributes_of_doc](.images/attributes_of_doc.png)
 19 | 
 20 | ### NER (**New!**)
 21 | NER of a `Doc` object for `王小明在北京的清华大学读书`:
 22 | 
 23 | ![attributes_of_doc](.images/ner_of_doc.png)
 24 | 
 25 | ## Getting Started
 26 | 
 27 | Models are released as binary file, users should know basic knowledge of using SpaCy version 2+.
 28 | 
 29 | ### Prerequisites
 30 | 
 31 | Python 3 (maybe python2, but currently not well tested)
 32 | 
 33 | ### Installing
 34 | 
 35 | Download relased model from `releases`.
 36 | 
 37 | ```
 38 | wget -c https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases/download/v2.0.4/zh_core_web_sm-2.0.4.tar.gz
 39 | ```
 40 | 
 41 | then install model
 42 | 
 43 | ```
 44 | pip install zh_core_web_sm-2.0.4.tar.gz
 45 | ```
 46 | 
 47 | 
 48 | ## Running demo code
 49 | 
 50 | `test.py` contains demo codes. After install the model, user can download or clone this repo then execute:
 51 | 
 52 | ```bash
 53 | python3 ./test.py
 54 | ```
 55 | 
 56 | then, open web browser to `http://127.0.0.1:5000`, user will see image simllar to this:
 57 | 
 58 | ![Dependency of doc](.images/dependency_of_doc.png)
 59 | 
 60 | ## How to re-produce model
 61 | 
 62 | See [workflow](workflow.md)
 63 | 
 64 | ## Corpus Data
 65 | The corpus data used in this project is OntoNotes 5.0。
 66 | 
 67 | Since OntoNotes 5.0 is copyright material of LDC ([Linguistic Data Consortium](https://www.ldc.upenn.edu/)) . This project can not include the daa directly。Good news is OntoNotes 5.0 is free to organizer user, you can set up a count for your company or school, then you can get the OntoNotes 5.0 at no cost。
 68 | 
 69 | 
 70 | ## TODO list
 71 | 
 72 | * Attribute `pos_` is not working correctly. This related to Language class in SpaCy.
 73 | * Attribute `shape_` and `is_alpha` seems meaningless for Chinese, need make sure of it.
 74 | * Attribute `is_stop` is not working correctly. This related to Language class in SpaCy.
 75 | * Attribute `vector` seems not well trained
 76 | * <s>Attribute `is_oov` is totally incorrect. First priority.</s>
 77 | * <s>NER model is not available due to lacking of LDC corpus. I am working on it.</s>
 78 | * Release all the intermediate material to help user build own model
 79 | 
 80 | ## Built With
 81 | 
 82 | * TODO
 83 | 
 84 | ## Contributing
 85 | 
 86 | Please read [CONTRIBUTING.md](https://gist.github.com/PurpleBooth/b24679402957c63ec426) for details on our code of conduct, and the process for submitting pull requests to us.
 87 | 
 88 | ## Versioning
 89 | 
 90 | We use [SemVer](http://semver.org/) for versioning. For the versions available, see the `tags` on this repository.
 91 | 
 92 | ## Authors
 93 | 
 94 | * **Xiaoquan Kong** - *Initial work* - [howl-anderson](https://github.com/howl-anderson)
 95 | 
 96 | See also the list of `contributors` who participated in this project.
 97 | 
 98 | ## License
 99 | 
100 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details
101 | 
102 | ## Acknowledgments
103 | 
104 | * TODO
105 | 


--------------------------------------------------------------------------------
/.images/dependency_of_doc.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" ?>
 2 | <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" id="0" class="displacy" width="1100" height="574.5" style="max-width: none; height: 574.5px; color: #000000; background: #ffffff; font-family: Arial">
 3 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="484.5">
 4 |     <tspan class="displacy-word" fill="currentColor" x="50">王小明</tspan>
 5 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">X</tspan>
 6 | </text>
 7 | 
 8 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="484.5">
 9 |     <tspan class="displacy-word" fill="currentColor" x="225">在</tspan>
10 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">X</tspan>
11 | </text>
12 | 
13 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="484.5">
14 |     <tspan class="displacy-word" fill="currentColor" x="400">北京</tspan>
15 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="400">X</tspan>
16 | </text>
17 | 
18 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="484.5">
19 |     <tspan class="displacy-word" fill="currentColor" x="575">的</tspan>
20 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="575">X</tspan>
21 | </text>
22 | 
23 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="484.5">
24 |     <tspan class="displacy-word" fill="currentColor" x="750">清华大学</tspan>
25 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="750">X</tspan>
26 | </text>
27 | 
28 | <text class="displacy-token" fill="currentColor" text-anchor="middle" y="484.5">
29 |     <tspan class="displacy-word" fill="currentColor" x="925">读书</tspan>
30 |     <tspan class="displacy-tag" dy="2em" fill="currentColor" x="925">X</tspan>
31 | </text>
32 | 
33 | <g class="displacy-arrow">
34 |     <path class="displacy-arc" id="arrow-0-0" stroke-width="2px" d="M70,439.5 C70,2.0 925.0,2.0 925.0,439.5" fill="none" stroke="currentColor"/>
35 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
36 |         <textPath xlink:href="#arrow-0-0" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">nsubj</textPath>
37 |     </text>
38 |     <path class="displacy-arrowhead" d="M70,441.5 L62,429.5 78,429.5" fill="currentColor"/>
39 | </g>
40 | 
41 | <g class="displacy-arrow">
42 |     <path class="displacy-arc" id="arrow-0-1" stroke-width="2px" d="M245,439.5 C245,89.5 920.0,89.5 920.0,439.5" fill="none" stroke="currentColor"/>
43 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
44 |         <textPath xlink:href="#arrow-0-1" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">acl</textPath>
45 |     </text>
46 |     <path class="displacy-arrowhead" d="M245,441.5 L237,429.5 253,429.5" fill="currentColor"/>
47 | </g>
48 | 
49 | <g class="displacy-arrow">
50 |     <path class="displacy-arc" id="arrow-0-2" stroke-width="2px" d="M420,439.5 C420,264.5 735.0,264.5 735.0,439.5" fill="none" stroke="currentColor"/>
51 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
52 |         <textPath xlink:href="#arrow-0-2" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">det</textPath>
53 |     </text>
54 |     <path class="displacy-arrowhead" d="M420,441.5 L412,429.5 428,429.5" fill="currentColor"/>
55 | </g>
56 | 
57 | <g class="displacy-arrow">
58 |     <path class="displacy-arc" id="arrow-0-3" stroke-width="2px" d="M420,439.5 C420,352.0 555.0,352.0 555.0,439.5" fill="none" stroke="currentColor"/>
59 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
60 |         <textPath xlink:href="#arrow-0-3" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">case:dec</textPath>
61 |     </text>
62 |     <path class="displacy-arrowhead" d="M555.0,441.5 L563.0,429.5 547.0,429.5" fill="currentColor"/>
63 | </g>
64 | 
65 | <g class="displacy-arrow">
66 |     <path class="displacy-arc" id="arrow-0-4" stroke-width="2px" d="M245,439.5 C245,177.0 740.0,177.0 740.0,439.5" fill="none" stroke="currentColor"/>
67 |     <text dy="1.25em" style="font-size: 0.8em; letter-spacing: 1px">
68 |         <textPath xlink:href="#arrow-0-4" class="displacy-label" startOffset="50%" fill="currentColor" text-anchor="middle">obj</textPath>
69 |     </text>
70 |     <path class="displacy-arrowhead" d="M740.0,441.5 L748.0,429.5 732.0,429.5" fill="currentColor"/>
71 | </g>
72 | </svg>
73 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | # General
106 | .DS_Store
107 | .AppleDouble
108 | .LSOverride
109 | 
110 | # Icon must end with two \r
111 | Icon
112 | 
113 | 
114 | # Thumbnails
115 | ._*
116 | 
117 | # Files that might appear in the root of a volume
118 | .DocumentRevisions-V100
119 | .fseventsd
120 | .Spotlight-V100
121 | .TemporaryItems
122 | .Trashes
123 | .VolumeIcon.icns
124 | .com.apple.timemachine.donotpresent
125 | 
126 | # Directories potentially created on remote AFP share
127 | .AppleDB
128 | .AppleDesktop
129 | Network Trash Folder
130 | Temporary Items
131 | .apdisk
132 | # -*- mode: gitignore; -*-
133 | *~
134 | \#*\#
135 | /.emacs.desktop
136 | /.emacs.desktop.lock
137 | *.elc
138 | auto-save-list
139 | tramp
140 | .\#*
141 | 
142 | # Org-mode
143 | .org-id-locations
144 | *_archive
145 | 
146 | # flymake-mode
147 | *_flymake.*
148 | 
149 | # eshell files
150 | /eshell/history
151 | /eshell/lastdir
152 | 
153 | # elpa packages
154 | /elpa/
155 | 
156 | # reftex files
157 | *.rel
158 | 
159 | # AUCTeX auto folder
160 | /auto/
161 | 
162 | # cask packages
163 | .cask/
164 | dist/
165 | 
166 | # Flycheck
167 | flycheck_*.el
168 | 
169 | # server auth directory
170 | /server/
171 | 
172 | # projectiles files
173 | .projectile
174 | 
175 | # directory configuration
176 | .dir-locals.el
177 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
178 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
179 | 
180 | # User-specific stuff
181 | .idea/**/workspace.xml
182 | .idea/**/tasks.xml
183 | .idea/**/dictionaries
184 | .idea/**/shelf
185 | 
186 | # Sensitive or high-churn files
187 | .idea/**/dataSources/
188 | .idea/**/dataSources.ids
189 | .idea/**/dataSources.local.xml
190 | .idea/**/sqlDataSources.xml
191 | .idea/**/dynamic.xml
192 | .idea/**/uiDesigner.xml
193 | 
194 | # Gradle
195 | .idea/**/gradle.xml
196 | .idea/**/libraries
197 | 
198 | # CMake
199 | cmake-build-debug/
200 | cmake-build-release/
201 | 
202 | # Mongo Explorer plugin
203 | .idea/**/mongoSettings.xml
204 | 
205 | # File-based project format
206 | *.iws
207 | 
208 | # IntelliJ
209 | out/
210 | 
211 | # mpeltonen/sbt-idea plugin
212 | .idea_modules/
213 | 
214 | # JIRA plugin
215 | atlassian-ide-plugin.xml
216 | 
217 | # Cursive Clojure plugin
218 | .idea/replstate.xml
219 | 
220 | # Crashlytics plugin (for Android Studio and IntelliJ)
221 | com_crashlytics_export_strings.xml
222 | crashlytics.properties
223 | crashlytics-build.properties
224 | fabric.properties
225 | 
226 | # Editor-based Rest Client
227 | .idea/httpRequests
228 | 
229 | corpus/*
230 | 
231 | zh_wiki_core_sm/*
232 | zh_wiki_core/*
233 | zh_model/
234 | 
235 | WORDS.*
236 | WORDS_*
237 | WORDS-*/
238 | package_templete/
239 | 
240 | 


--------------------------------------------------------------------------------
/onto_to_spacy_json.py:
--------------------------------------------------------------------------------
  1 | import json  # for tuple support
  2 | import plac
  3 | import os
  4 | import re
  5 | from spacy.gold import biluo_tags_from_offsets
  6 | import spacy
  7 | 
  8 | nlp = spacy.blank("xx")
  9 | from tqdm import tqdm
 10 | import random
 11 | 
 12 | 
 13 | def get_root_filename(onto_dir):
 14 |     name_files = []
 15 |     for dirpath, subdirs, files in os.walk(onto_dir):
 16 |         for fname in files:
 17 |             if bool(re.search(".name", fname)):
 18 |                 fn = os.path.join(dirpath, fname)
 19 |                 fn = re.sub("\.name", "", fn)
 20 |                 name_files.append(fn)
 21 |     return name_files
 22 | 
 23 | 
 24 | def split_sentence(text):
 25 |     text = text.strip().split("\n")[1:-1]
 26 |     return text
 27 | 
 28 | 
 29 | def split_doc(text):
 30 |     text_list = text.strip().split("</DOC>\s<DOC")
 31 |     ids = [re.findall('<DOC DOCNO="(.+?)">', t)[0] for t in text_list]
 32 |     text_list = [re.sub('<DOC DOCNO=".+?">', "", t).strip() for t in text_list]
 33 |     return ids, text_list
 34 | 
 35 | 
 36 | def clean_ent(ent):
 37 |     tag = re.findall('TYPE="(.+?)"', ent)[0]
 38 |     text = re.findall(">(.+)", ent)[0]
 39 |     text = re.sub("\$", "\$", text)
 40 |     return (text, tag)
 41 | 
 42 | 
 43 | def raw_text(text):
 44 |     """Remove entity tags"""
 45 |     text = re.sub("<ENAMEX .+?>", "", text)
 46 |     text = re.sub("</ENAMEX>", "", text)
 47 |     return text
 48 | 
 49 | 
 50 | def ent_position(ents, text):
 51 |     search_point = 0
 52 |     spacy_ents = []
 53 |     for ent in ents:
 54 |         remain_text = text[search_point:]
 55 |         ma = re.search(ent[0], remain_text)
 56 |         ent_tup = (ma.start() + search_point, ma.end() + search_point, ent[1])
 57 |         spacy_ents.append(ent_tup)
 58 | 
 59 |         # update search point to prevent same word in different entity,
 60 |         # it will cause bug which hard to debug
 61 |         search_point = search_point + ma.end()
 62 |     return spacy_ents
 63 | 
 64 | 
 65 | def text_to_spacy(markup):
 66 |     raw_ents = re.findall("<ENAMEX(.+?)</ENAMEX>", markup)
 67 |     ents = [clean_ent(raw_ent) for raw_ent in raw_ents]
 68 |     text = raw_text(markup)
 69 |     spacy_ents = ent_position(ents, text)
 70 |     final = (text, {"entities": spacy_ents})
 71 |     return final
 72 | 
 73 | 
 74 | def onf_to_raw(onf_file):
 75 |     """
 76 |     Take in a path to a .onf Ontonotes file. Return the raw text (as much as possible).
 77 |     The quotes are usually quite messed up, so this is not going to look like real input text.
 78 |     """
 79 |     with open(onf_file, "r") as f:
 80 |         onf = f.read()
 81 |     sentences = re.findall(
 82 |         "Plain sentence\:\n\-+?\n(.+?)Treebanked sentence", onf, re.DOTALL
 83 |     )
 84 |     sentences = [re.sub("\n+?\s*", " ", i).strip() for i in sentences]
 85 |     paragraph = " ".join(sentences)
 86 |     return paragraph
 87 | 
 88 | 
 89 | def name_to_sentences(ner_filename):
 90 |     """
 91 |     Take a .name file and return a sentence list of the kind described here:
 92 |     https://github.com/explosion/spacy/blob/master/examples/training/training-data.json
 93 |     """
 94 |     with open(ner_filename, "r") as f:
 95 |         doc = f.read()
 96 | 
 97 |     sentences = []
 98 |     onto_sents = split_sentence(doc)
 99 |     for sent in onto_sents:
100 |         offsets = text_to_spacy(sent)
101 |         doc = nlp(offsets[0])
102 |         tags = biluo_tags_from_offsets(doc, offsets[1]["entities"])
103 |         ner_info = list(zip(doc, tags))
104 |         tokens = []
105 |         for n, i in enumerate(ner_info):
106 |             token = {
107 |                 "head": 0,
108 |                 "dep": "",
109 |                 "tag": "",
110 |                 "orth": i[0].string,
111 |                 "ner": i[1],
112 |                 "id": n,
113 |             }
114 |             tokens.append(token)
115 |         sentences.append({"tokens": tokens})
116 |     return sentences
117 | 
118 | 
119 | def dir_to_annotation(onto_dir):
120 |     fns = get_root_filename(onto_dir)
121 |     all_annotations = []
122 | 
123 |     for fn in tqdm(fns):
124 |         ner_filename = fn + ".name"
125 |         onf_filename = fn + ".onf"
126 | 
127 |         try:
128 |             raw = onf_to_raw(onf_filename)
129 |             sentences = name_to_sentences(ner_filename)
130 |             final = {"id": "fake", "paragraphs": [{"raw": raw, "sentences": sentences}]}
131 |             all_annotations.append(final)
132 |         except Exception as e:
133 |             print("Error formatting ", fn, e)
134 |     return all_annotations
135 | 
136 | 
137 | @plac.annotations(
138 |     onto_dir=("Directory of OntoNotes data to traverse", "option", "i", str),
139 |     train_file=("File to write training spaCy JSON out to", "option", "t", str),
140 |     val_file=("File to write validation spaCy JSON out to", "option", "e", str),
141 |     val_split=("Percentage to use for evaluation", "option", "v", float),
142 | )
143 | def main(onto_dir, train_file, val_file, val_split=0.75):
144 |     print("Reading and formatting annotations")
145 |     all_annotations = dir_to_annotation(onto_dir)
146 |     random.shuffle(all_annotations)
147 |     cutpoint = round(val_split * len(all_annotations))
148 |     val = all_annotations[:cutpoint]
149 |     train = all_annotations[cutpoint:]
150 | 
151 |     print(
152 |         "Saving {0} training examples and {1} validation examples".format(
153 |             len(train), len(val)
154 |         )
155 |     )
156 |     with open(train_file, "w") as f:
157 |         json.dump(train, f, ensure_ascii=False, indent=4)
158 |     with open(val_file, "w") as f:
159 |         json.dump(val, f, ensure_ascii=False, indent=4)
160 | 
161 | 
162 | if __name__ == "__main__":
163 |     plac.call(main)
164 | 


--------------------------------------------------------------------------------
/create_jsonl_vocabulary.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import math
  3 | import string
  4 | from ast import literal_eval
  5 | from pathlib import Path
  6 | 
  7 | import ftfy
  8 | import jsonlines
  9 | import plac
 10 | import validators
 11 | from preshed.counter import PreshCounter
 12 | from spacy.lang.en import stop_words as en_stop_words
 13 | from spacy.lang.zh import stop_words as zh_stop_words
 14 | from tqdm import tqdm
 15 | 
 16 | 
 17 | class Word:
 18 |     counter = -1
 19 | 
 20 |     def __init__(self, word_str, cluster, probs):
 21 |         self._word = word_str
 22 |         self._cluster = cluster
 23 |         self._probs = probs
 24 | 
 25 |         chinese_punct = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
 26 |         self._punct_list = list(set(string.punctuation + chinese_punct))
 27 | 
 28 |         chinese_whitespace = ""
 29 |         self._whitespace_list = list(set(string.whitespace + chinese_whitespace))
 30 | 
 31 |         english_stopword = en_stop_words.STOP_WORDS
 32 |         chinese_stopword = zh_stop_words.STOP_WORDS
 33 |         self._stopword_list = {*english_stopword, *chinese_stopword}
 34 | 
 35 |         chinese_quote = "“”‘’"
 36 |         english_quote = "\"'"
 37 |         self._qute_list = list(set(english_quote + chinese_quote))
 38 | 
 39 |         chinese_left_punct = "<([{"
 40 |         english_left_punct = "＜（［「『【〔〖〘〚｛"
 41 |         self._left_punct_list = list(set(english_left_punct + chinese_left_punct))
 42 | 
 43 |         chinese_right_punct = ">)]}"
 44 |         english_right_punct = "＞）］」』】〕〗〙〛｝"
 45 |         self._right_punct_list = list(set(english_right_punct + chinese_right_punct))
 46 | 
 47 |     @property
 48 |     def orth(self):
 49 |         return self._word
 50 | 
 51 |     @property
 52 |     def id(self):
 53 |         self.__class__.counter += 1
 54 | 
 55 |         return self.__class__.counter
 56 | 
 57 |     @property
 58 |     def lower(self):
 59 |         return self._word.lower()
 60 | 
 61 |     @property
 62 |     def norm(self):
 63 |         return self._word
 64 | 
 65 |     @property
 66 |     def shape(self):
 67 |         return "".join(map(lambda x: "X" if x.isupper() else "x", self._word))
 68 | 
 69 |     @property
 70 |     def prefix(self):
 71 |         return self._word[0]
 72 | 
 73 |     @property
 74 |     def suffix(self):
 75 |         return self._word[-1]
 76 | 
 77 |     @property
 78 |     def length(self):
 79 |         return len(self._word)
 80 | 
 81 |     @property
 82 |     def cluster(self):
 83 |         return self._cluster
 84 | 
 85 |     @property
 86 |     def prob(self):
 87 |         return self._probs.get(self, 0)
 88 | 
 89 |     @property
 90 |     def is_alpha(self):
 91 |         return self._word.isalpha()
 92 | 
 93 |     @property
 94 |     def is_ascii(self):
 95 |         # only for py 3.7
 96 |         # return self._word.isascii()
 97 |         try:
 98 |             self._word.encode('ascii')
 99 |         except UnicodeEncodeError:
100 |             return False
101 | 
102 |         return True
103 | 
104 |     @property
105 |     def is_digit(self):
106 |         return self._word.isdigit()
107 | 
108 |     @property
109 |     def is_lower(self):
110 |         return self._word.islower()
111 | 
112 |     @property
113 |     def is_punct(self):
114 |         return self._word in self._punct_list
115 | 
116 |     @property
117 |     def is_space(self):
118 |         return self._word in self._whitespace_list
119 | 
120 |     @property
121 |     def is_title(self):
122 |         return self._word.istitle()
123 | 
124 |     @property
125 |     def is_upper(self):
126 |         return self._word.isupper()
127 | 
128 |     @property
129 |     def like_url(self):
130 |         return bool(validators.url(self._word))
131 | 
132 |     @property
133 |     def like_num(self):
134 |         # TODO(howl-anderson): fix it later
135 |         return False
136 | 
137 |     @property
138 |     def like_email(self):
139 |         return bool(validators.email(self._word))
140 | 
141 |     @property
142 |     def is_stop(self):
143 |         return self._word in self._stopword_list
144 | 
145 |     @property
146 |     def is_oov(self):
147 |         return not self._word in self._probs
148 | 
149 |     @property
150 |     def is_quote(self):
151 |         return self._word in self._qute_list
152 | 
153 |     @property
154 |     def is_left_punct(self):
155 |         return self._word in self._left_punct_list
156 | 
157 |     @property
158 |     def is_right_punct(self):
159 |         return self._word in self._right_punct_list
160 | 
161 | 
162 | def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
163 |     print("Counting frequencies...")
164 |     counts = PreshCounter()
165 |     total = 0
166 |     with freqs_loc.open() as f:
167 |         for i, line in enumerate(f):
168 |             freq, doc_freq, key = line.rstrip().split("\t", 2)
169 |             freq = int(freq)
170 |             counts.inc(i + 1, freq)
171 |             total += freq
172 |     counts.smooth()
173 |     log_total = math.log(total)
174 |     probs = {}
175 |     with freqs_loc.open() as f:
176 |         for line in tqdm(f):
177 |             freq, doc_freq, key = line.rstrip().split("\t", 2)
178 |             doc_freq = int(doc_freq)
179 |             freq = int(freq)
180 |             if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length:
181 |                 word = literal_eval(key)
182 |                 smooth_count = counts.smoother(int(freq))
183 |                 probs[word] = math.log(smooth_count) - log_total
184 |     oov_prob = math.log(counts.smoother(0)) - log_total
185 |     return probs, oov_prob
186 | 
187 | 
188 | def read_clusters(clusters_loc):
189 |     print("Reading clusters...")
190 |     clusters = {}
191 |     with clusters_loc.open() as f:
192 |         for line in tqdm(f):
193 |             try:
194 |                 cluster, word, freq = line.split()
195 |                 word = ftfy.fix_text(word)
196 |             except ValueError:
197 |                 continue
198 |             # If the clusterer has only seen the word a few times, its
199 |             # cluster is unreliable.
200 |             if int(freq) >= 3:
201 |                 clusters[word] = cluster
202 |             else:
203 |                 clusters[word] = "0"
204 |     # Expand clusters with re-casing
205 |     for word, cluster in list(clusters.items()):
206 |         if word.lower() not in clusters:
207 |             clusters[word.lower()] = cluster
208 |         if word.title() not in clusters:
209 |             clusters[word.title()] = cluster
210 |         if word.upper() not in clusters:
211 |             clusters[word.upper()] = cluster
212 |     return clusters
213 | 
214 | 
215 | @plac.annotations(
216 |     lang=("model language", "positional", None, str),
217 |     output_loc=("model output directory", "positional", None, str),
218 |     freqs_loc=("location of words frequencies file", "positional", None, Path),
219 |     clusters_loc=("location of brown clusters data", "positional", None, Path),
220 | )
221 | def main(lang, output_loc, freqs_loc, clusters_loc):
222 |     clusters = read_clusters(clusters_loc)
223 |     probs, oov_prob = read_freqs(freqs_loc)
224 | 
225 |     with jsonlines.open(output_loc, mode="w") as writer:
226 |         header = {"lang": lang, "settings": {"oov_prob": oov_prob}}
227 | 
228 |         writer.write(header)
229 | 
230 |         for word_str, cluster in clusters.items():
231 | 
232 |             if not word_str:
233 |                 continue
234 | 
235 |             word = Word(word_str, cluster, probs)
236 |             row = {
237 |                 "orth": word.orth,  # the word text
238 |                 "id": word.id,  # can correspond to row in vectors table
239 |                 "lower": word.lower,
240 |                 "norm": word.norm,
241 |                 "shape": word.shape,
242 |                 "prefix": word.prefix,
243 |                 "suffix": word.suffix,
244 |                 "length": word.length,
245 |                 "cluster": word.cluster,
246 |                 "prob": word.prob,
247 |                 "is_alpha": word.is_alpha,
248 |                 "is_ascii": word.is_ascii,
249 |                 "is_digit": word.is_digit,
250 |                 "is_lower": word.is_lower,
251 |                 "is_punct": word.is_punct,
252 |                 "is_space": word.is_space,
253 |                 "is_title": word.is_title,
254 |                 "is_upper": word.is_upper,
255 |                 "like_url": word.like_url,
256 |                 "like_num": word.like_num,
257 |                 "like_email": word.like_email,
258 |                 "is_stop": word.is_stop,
259 |                 "is_oov": word.is_oov,
260 |                 "is_quote": word.is_quote,
261 |                 "is_left_punct": word.is_left_punct,
262 |                 "is_right_punct": word.is_right_punct,
263 |             }
264 | 
265 |             writer.write(row)
266 | 
267 | 
268 | if __name__ == "__main__":
269 |     plac.call(main)
270 | 


--------------------------------------------------------------------------------
/notebooks/demo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## install package"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 2,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stdout",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple\n",
 20 |       "Processing ./zh_core_web_sm-2.0.3.tar.gz\n",
 21 |       "Requirement already satisfied: spacy>=2.0.0a18 in /home/howl/.local/lib/python3.5/site-packages (from zh-core-web-sm==2.0.3) (2.0.12)\n",
 22 |       "Requirement already satisfied: dill<0.3,>=0.2 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.2.8.2)\n",
 23 |       "Requirement already satisfied: thinc<6.11.0,>=6.10.3 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (6.10.3)\n",
 24 |       "Requirement already satisfied: numpy>=1.7 in /usr/local/lib/python3.5/dist-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.14.2)\n",
 25 |       "Requirement already satisfied: plac<1.0.0,>=0.9.6 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.6)\n",
 26 |       "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.5/dist-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2.18.4)\n",
 27 |       "Requirement already satisfied: cymem<1.32,>=1.30 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.31.2)\n",
 28 |       "Requirement already satisfied: ujson>=1.35 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.35)\n",
 29 |       "Requirement already satisfied: regex==2017.4.5 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2017.4.5)\n",
 30 |       "Requirement already satisfied: murmurhash<0.29,>=0.28 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.28.0)\n",
 31 |       "Requirement already satisfied: preshed<2.0.0,>=1.0.0 in /home/howl/.local/lib/python3.5/site-packages (from spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.0.1)\n",
 32 |       "Requirement already satisfied: msgpack<1.0.0,>=0.5.6 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.5.6)\n",
 33 |       "Requirement already satisfied: tqdm<5.0.0,>=4.10.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (4.24.0)\n",
 34 |       "Requirement already satisfied: six<2.0.0,>=1.10.0 in /usr/lib/python3/dist-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.10.0)\n",
 35 |       "Requirement already satisfied: msgpack-numpy<1.0.0,>=0.4.1 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.4.3.1)\n",
 36 |       "Requirement already satisfied: wrapt<1.11.0,>=1.10.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.10.11)\n",
 37 |       "Requirement already satisfied: cytoolz<0.10,>=0.9.0 in /home/howl/.local/lib/python3.5/site-packages (from thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.0.1)\n",
 38 |       "Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2.6)\n",
 39 |       "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (2018.4.16)\n",
 40 |       "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (3.0.4)\n",
 41 |       "Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.5/dist-packages (from requests<3.0.0,>=2.13.0->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (1.22)\n",
 42 |       "Requirement already satisfied: toolz>=0.8.0 in /home/howl/.local/lib/python3.5/site-packages (from cytoolz<0.10,>=0.9.0->thinc<6.11.0,>=6.10.3->spacy>=2.0.0a18->zh-core-web-sm==2.0.3) (0.9.0)\n",
 43 |       "Building wheels for collected packages: zh-core-web-sm\n",
 44 |       "  Running setup.py bdist_wheel for zh-core-web-sm ... \u001b[?25ldone\n",
 45 |       "\u001b[?25h  Stored in directory: /home/howl/.cache/pip/wheels/04/63/ec/a66252a0376a1953722cb70a85c50aa4836311eca4d69f75f3\n",
 46 |       "Successfully built zh-core-web-sm\n",
 47 |       "Installing collected packages: zh-core-web-sm\n",
 48 |       "  Found existing installation: zh-core-web-sm 2.0.2\n",
 49 |       "    Uninstalling zh-core-web-sm-2.0.2:\n",
 50 |       "      Successfully uninstalled zh-core-web-sm-2.0.2\n",
 51 |       "Successfully installed zh-core-web-sm-2.0.3\n",
 52 |       "\u001b[33mYou are using pip version 10.0.1, however version 18.0 is available.\n",
 53 |       "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
 54 |      ]
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "!pip install -q https://github.com/howl-anderson/Chinese_models_for_SpaCy/releases/download/v2.2.X-0.1.0/zh_core_web_sm-0.1.0.tar.gz"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 1,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "!pip install -q pandas\n",
 68 |     "!pip install -q jieba\n",
 69 |     "!pip install -q tabulate"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "## import packages"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 2,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "from spacy import displacy\n",
 86 |     "from tabulate import tabulate\n",
 87 |     "import pandas as pd\n",
 88 |     "\n",
 89 |     "import zh_core_web_sm"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "## load models"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 3,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "nlp = zh_core_web_sm.load()"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "markdown",
110 |    "metadata": {},
111 |    "source": [
112 |     "## parse doc"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 4,
118 |    "metadata": {},
119 |    "outputs": [
120 |     {
121 |      "name": "stderr",
122 |      "output_type": "stream",
123 |      "text": [
124 |       "Building prefix dict from the default dictionary ...\n",
125 |       "Loading model from cache /tmp/jieba.cache\n",
126 |       "Loading model cost 0.435 seconds.\n",
127 |       "Prefix dict has been built successfully.\n"
128 |      ]
129 |     }
130 |    ],
131 |    "source": [
132 |     "doc = nlp(\"王小明在北京的清华大学读书\")"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "## print doc's attributes"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 5,
145 |    "metadata": {
146 |     "scrolled": false
147 |    },
148 |    "outputs": [
149 |     {
150 |      "data": {
151 |       "text/html": [
152 |        "<div>\n",
153 |        "<style scoped>\n",
154 |        "    .dataframe tbody tr th:only-of-type {\n",
155 |        "        vertical-align: middle;\n",
156 |        "    }\n",
157 |        "\n",
158 |        "    .dataframe tbody tr th {\n",
159 |        "        vertical-align: top;\n",
160 |        "    }\n",
161 |        "\n",
162 |        "    .dataframe thead th {\n",
163 |        "        text-align: right;\n",
164 |        "    }\n",
165 |        "</style>\n",
166 |        "<table border=\"1\" class=\"dataframe\">\n",
167 |        "  <thead>\n",
168 |        "    <tr style=\"text-align: right;\">\n",
169 |        "      <th></th>\n",
170 |        "      <th>text</th>\n",
171 |        "      <th>lemma_</th>\n",
172 |        "      <th>pos_</th>\n",
173 |        "      <th>tag_</th>\n",
174 |        "      <th>dep_</th>\n",
175 |        "      <th>shape_</th>\n",
176 |        "      <th>is_alpha</th>\n",
177 |        "      <th>is_stop</th>\n",
178 |        "      <th>has_vector</th>\n",
179 |        "      <th>ent_iob_</th>\n",
180 |        "      <th>ent_type_</th>\n",
181 |        "      <th>vector_norm</th>\n",
182 |        "      <th>is_oov</th>\n",
183 |        "    </tr>\n",
184 |        "  </thead>\n",
185 |        "  <tbody>\n",
186 |        "    <tr>\n",
187 |        "      <th>0</th>\n",
188 |        "      <td>王小明</td>\n",
189 |        "      <td>王小明</td>\n",
190 |        "      <td>X</td>\n",
191 |        "      <td>NNP</td>\n",
192 |        "      <td>nsubj</td>\n",
193 |        "      <td>xxx</td>\n",
194 |        "      <td>True</td>\n",
195 |        "      <td>False</td>\n",
196 |        "      <td>False</td>\n",
197 |        "      <td>B</td>\n",
198 |        "      <td>PERSON</td>\n",
199 |        "      <td>0.000000</td>\n",
200 |        "      <td>True</td>\n",
201 |        "    </tr>\n",
202 |        "    <tr>\n",
203 |        "      <th>1</th>\n",
204 |        "      <td>在</td>\n",
205 |        "      <td>在</td>\n",
206 |        "      <td>VERB</td>\n",
207 |        "      <td>VV</td>\n",
208 |        "      <td>case</td>\n",
209 |        "      <td>x</td>\n",
210 |        "      <td>True</td>\n",
211 |        "      <td>True</td>\n",
212 |        "      <td>True</td>\n",
213 |        "      <td>O</td>\n",
214 |        "      <td></td>\n",
215 |        "      <td>6.573987</td>\n",
216 |        "      <td>False</td>\n",
217 |        "    </tr>\n",
218 |        "    <tr>\n",
219 |        "      <th>2</th>\n",
220 |        "      <td>北京</td>\n",
221 |        "      <td>北京</td>\n",
222 |        "      <td>X</td>\n",
223 |        "      <td>NNP</td>\n",
224 |        "      <td>nmod</td>\n",
225 |        "      <td>xx</td>\n",
226 |        "      <td>True</td>\n",
227 |        "      <td>False</td>\n",
228 |        "      <td>True</td>\n",
229 |        "      <td>B</td>\n",
230 |        "      <td>GPE</td>\n",
231 |        "      <td>12.769391</td>\n",
232 |        "      <td>False</td>\n",
233 |        "    </tr>\n",
234 |        "    <tr>\n",
235 |        "      <th>3</th>\n",
236 |        "      <td>的</td>\n",
237 |        "      <td>的</td>\n",
238 |        "      <td>PART</td>\n",
239 |        "      <td>DEC</td>\n",
240 |        "      <td>case:dec</td>\n",
241 |        "      <td>x</td>\n",
242 |        "      <td>True</td>\n",
243 |        "      <td>True</td>\n",
244 |        "      <td>True</td>\n",
245 |        "      <td>O</td>\n",
246 |        "      <td></td>\n",
247 |        "      <td>6.886564</td>\n",
248 |        "      <td>False</td>\n",
249 |        "    </tr>\n",
250 |        "    <tr>\n",
251 |        "      <th>4</th>\n",
252 |        "      <td>清华大学</td>\n",
253 |        "      <td>清华大学</td>\n",
254 |        "      <td>X</td>\n",
255 |        "      <td>NNP</td>\n",
256 |        "      <td>obl</td>\n",
257 |        "      <td>xxxx</td>\n",
258 |        "      <td>True</td>\n",
259 |        "      <td>False</td>\n",
260 |        "      <td>True</td>\n",
261 |        "      <td>B</td>\n",
262 |        "      <td>ORG</td>\n",
263 |        "      <td>18.842812</td>\n",
264 |        "      <td>False</td>\n",
265 |        "    </tr>\n",
266 |        "    <tr>\n",
267 |        "      <th>5</th>\n",
268 |        "      <td>读书</td>\n",
269 |        "      <td>读书</td>\n",
270 |        "      <td>VERB</td>\n",
271 |        "      <td>VV</td>\n",
272 |        "      <td>ROOT</td>\n",
273 |        "      <td>xx</td>\n",
274 |        "      <td>True</td>\n",
275 |        "      <td>False</td>\n",
276 |        "      <td>True</td>\n",
277 |        "      <td>O</td>\n",
278 |        "      <td></td>\n",
279 |        "      <td>18.138533</td>\n",
280 |        "      <td>False</td>\n",
281 |        "    </tr>\n",
282 |        "  </tbody>\n",
283 |        "</table>\n",
284 |        "</div>"
285 |       ],
286 |       "text/plain": [
287 |        "   text lemma_  pos_ tag_      dep_ shape_  is_alpha  is_stop  has_vector  \\\n",
288 |        "0   王小明    王小明     X  NNP     nsubj    xxx      True    False       False   \n",
289 |        "1     在      在  VERB   VV      case      x      True     True        True   \n",
290 |        "2    北京     北京     X  NNP      nmod     xx      True    False        True   \n",
291 |        "3     的      的  PART  DEC  case:dec      x      True     True        True   \n",
292 |        "4  清华大学   清华大学     X  NNP       obl   xxxx      True    False        True   \n",
293 |        "5    读书     读书  VERB   VV      ROOT     xx      True    False        True   \n",
294 |        "\n",
295 |        "  ent_iob_ ent_type_  vector_norm  is_oov  \n",
296 |        "0        B    PERSON     0.000000    True  \n",
297 |        "1        O               6.573987   False  \n",
298 |        "2        B       GPE    12.769391   False  \n",
299 |        "3        O               6.886564   False  \n",
300 |        "4        B       ORG    18.842812   False  \n",
301 |        "5        O              18.138533   False  "
302 |       ]
303 |      },
304 |      "execution_count": 5,
305 |      "metadata": {},
306 |      "output_type": "execute_result"
307 |     }
308 |    ],
309 |    "source": [
310 |     "headers = ['text', 'lemma_', 'pos_', 'tag_', 'dep_',\n",
311 |     "           'shape_', 'is_alpha', 'is_stop', 'has_vector',\n",
312 |     "           'ent_iob_', 'ent_type_',\n",
313 |     "           'vector_norm', 'is_oov']\n",
314 |     "\n",
315 |     "doc_data = []\n",
316 |     "\n",
317 |     "for token in doc:\n",
318 |     "    token_data = [token.text, token.lemma_, token.pos_, token.tag_, token.dep_,\n",
319 |     "                  token.shape_, token.is_alpha, token.is_stop, token.has_vector,\n",
320 |     "                  token.ent_iob_, token.ent_type_,\n",
321 |     "                  token.vector_norm, token.is_oov]\n",
322 |     "    doc_data.append(token_data)\n",
323 |     "\n",
324 |     "pd.DataFrame(doc_data, columns=headers)"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "markdown",
329 |    "metadata": {},
330 |    "source": [
331 |     "## draw dependency graph"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": 6,
337 |    "metadata": {
338 |     "scrolled": false
339 |    },
340 |    "outputs": [
341 |     {
342 |      "data": {
343 |       "text/html": [
344 |        "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xml:lang=\"zh\" id=\"c3b1235515374dfd9468f0e90464c5a9-0\" class=\"displacy\" width=\"1100\" height=\"487.0\" direction=\"ltr\" style=\"max-width: none; height: 487.0px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr\">\n",
345 |        "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n",
346 |        "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">王小明</tspan>\n",
347 |        "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">X</tspan>\n",
348 |        "</text>\n",
349 |        "\n",
350 |        "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n",
351 |        "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"225\">在</tspan>\n",
352 |        "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"225\">VERB</tspan>\n",
353 |        "</text>\n",
354 |        "\n",
355 |        "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n",
356 |        "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"400\">北京</tspan>\n",
357 |        "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"400\">X</tspan>\n",
358 |        "</text>\n",
359 |        "\n",
360 |        "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n",
361 |        "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"575\">的</tspan>\n",
362 |        "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"575\">PART</tspan>\n",
363 |        "</text>\n",
364 |        "\n",
365 |        "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n",
366 |        "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"750\">清华大学</tspan>\n",
367 |        "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"750\">X</tspan>\n",
368 |        "</text>\n",
369 |        "\n",
370 |        "<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"397.0\">\n",
371 |        "    <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"925\">读书</tspan>\n",
372 |        "    <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"925\">VERB</tspan>\n",
373 |        "</text>\n",
374 |        "\n",
375 |        "<g class=\"displacy-arrow\">\n",
376 |        "    <path class=\"displacy-arc\" id=\"arrow-c3b1235515374dfd9468f0e90464c5a9-0-0\" stroke-width=\"2px\" d=\"M70,352.0 C70,2.0 925.0,2.0 925.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
377 |        "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
378 |        "        <textPath xlink:href=\"#arrow-c3b1235515374dfd9468f0e90464c5a9-0-0\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
379 |        "    </text>\n",
380 |        "    <path class=\"displacy-arrowhead\" d=\"M70,354.0 L62,342.0 78,342.0\" fill=\"currentColor\"/>\n",
381 |        "</g>\n",
382 |        "\n",
383 |        "<g class=\"displacy-arrow\">\n",
384 |        "    <path class=\"displacy-arc\" id=\"arrow-c3b1235515374dfd9468f0e90464c5a9-0-1\" stroke-width=\"2px\" d=\"M245,352.0 C245,89.5 745.0,89.5 745.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
385 |        "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
386 |        "        <textPath xlink:href=\"#arrow-c3b1235515374dfd9468f0e90464c5a9-0-1\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">case</textPath>\n",
387 |        "    </text>\n",
388 |        "    <path class=\"displacy-arrowhead\" d=\"M245,354.0 L237,342.0 253,342.0\" fill=\"currentColor\"/>\n",
389 |        "</g>\n",
390 |        "\n",
391 |        "<g class=\"displacy-arrow\">\n",
392 |        "    <path class=\"displacy-arc\" id=\"arrow-c3b1235515374dfd9468f0e90464c5a9-0-2\" stroke-width=\"2px\" d=\"M420,352.0 C420,177.0 740.0,177.0 740.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
393 |        "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
394 |        "        <textPath xlink:href=\"#arrow-c3b1235515374dfd9468f0e90464c5a9-0-2\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">nmod</textPath>\n",
395 |        "    </text>\n",
396 |        "    <path class=\"displacy-arrowhead\" d=\"M420,354.0 L412,342.0 428,342.0\" fill=\"currentColor\"/>\n",
397 |        "</g>\n",
398 |        "\n",
399 |        "<g class=\"displacy-arrow\">\n",
400 |        "    <path class=\"displacy-arc\" id=\"arrow-c3b1235515374dfd9468f0e90464c5a9-0-3\" stroke-width=\"2px\" d=\"M420,352.0 C420,264.5 560.0,264.5 560.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
401 |        "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
402 |        "        <textPath xlink:href=\"#arrow-c3b1235515374dfd9468f0e90464c5a9-0-3\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">case:dec</textPath>\n",
403 |        "    </text>\n",
404 |        "    <path class=\"displacy-arrowhead\" d=\"M560.0,354.0 L568.0,342.0 552.0,342.0\" fill=\"currentColor\"/>\n",
405 |        "</g>\n",
406 |        "\n",
407 |        "<g class=\"displacy-arrow\">\n",
408 |        "    <path class=\"displacy-arc\" id=\"arrow-c3b1235515374dfd9468f0e90464c5a9-0-4\" stroke-width=\"2px\" d=\"M770,352.0 C770,264.5 910.0,264.5 910.0,352.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
409 |        "    <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
410 |        "        <textPath xlink:href=\"#arrow-c3b1235515374dfd9468f0e90464c5a9-0-4\" class=\"displacy-label\" startOffset=\"50%\" side=\"left\" fill=\"currentColor\" text-anchor=\"middle\">obl</textPath>\n",
411 |        "    </text>\n",
412 |        "    <path class=\"displacy-arrowhead\" d=\"M770,354.0 L762,342.0 778,342.0\" fill=\"currentColor\"/>\n",
413 |        "</g>\n",
414 |        "</svg>"
415 |       ],
416 |       "text/plain": [
417 |        "<IPython.core.display.HTML object>"
418 |       ]
419 |      },
420 |      "metadata": {},
421 |      "output_type": "display_data"
422 |     }
423 |    ],
424 |    "source": [
425 |     "displacy.render(doc, jupyter=True)"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 7,
431 |    "metadata": {},
432 |    "outputs": [
433 |     {
434 |      "data": {
435 |       "text/html": [
436 |        "<div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">\n",
437 |        "<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
438 |        "    王小明\n",
439 |        "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
440 |        "</mark>\n",
441 |        "在\n",
442 |        "<mark class=\"entity\" style=\"background: #feca74; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
443 |        "    北京\n",
444 |        "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">GPE</span>\n",
445 |        "</mark>\n",
446 |        "的\n",
447 |        "<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
448 |        "    清华大学\n",
449 |        "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
450 |        "</mark>\n",
451 |        "读书</div>"
452 |       ],
453 |       "text/plain": [
454 |        "<IPython.core.display.HTML object>"
455 |       ]
456 |      },
457 |      "metadata": {},
458 |      "output_type": "display_data"
459 |     }
460 |    ],
461 |    "source": [
462 |     "displacy.render(doc, jupyter=True, style='ent')"
463 |    ]
464 |   }
465 |  ],
466 |  "metadata": {
467 |   "celltoolbar": "Raw Cell Format",
468 |   "kernelspec": {
469 |    "display_name": "Python 3",
470 |    "language": "python",
471 |    "name": "python3"
472 |   },
473 |   "language_info": {
474 |    "codemirror_mode": {
475 |     "name": "ipython",
476 |     "version": 3
477 |    },
478 |    "file_extension": ".py",
479 |    "mimetype": "text/x-python",
480 |    "name": "python",
481 |    "nbconvert_exporter": "python",
482 |    "pygments_lexer": "ipython3",
483 |    "version": "3.6.9"
484 |   }
485 |  },
486 |  "nbformat": 4,
487 |  "nbformat_minor": 1
488 | }
489 | 


--------------------------------------------------------------------------------