├── README.MD
├── spacy
    └── lang
    │   └── zh
    │       ├── __init__.py
    │       ├── __pycache__
    │           ├── __init__.cpython-36.pyc
    │           ├── lex_attrs.cpython-36.pyc
    │           ├── morph_rules.cpython-36.pyc
    │           ├── stop_words.cpython-36.pyc
    │           ├── syntax_iterators.cpython-36.pyc
    │           └── tag_map.cpython-36.pyc
    │       ├── examples.py
    │       ├── lex_attrs.py
    │       ├── morph_rules.py
    │       ├── stop_words.py
    │       ├── syntax_iterators.py
    │       └── tag_map.py
├── train_intent_parser_cn.py
├── train_ner_cn.py
├── train_new_entity_type_cn.py
├── train_parser_cn.py
├── train_tagger_cn.py
├── vectors_fast_text.py
└── zh_model
    ├── meta.json
    ├── ner
        ├── cfg
        ├── lower_model
        ├── moves
        ├── tok2vec_model
        └── upper_model
    ├── parser
        ├── cfg
        ├── lower_model
        ├── moves
        ├── tok2vec_model
        └── upper_model
    ├── tagger
        ├── cfg
        ├── model
        └── tag_map
    ├── tokenizer
    └── vocab
        ├── key2row
        ├── lexemes.bin
        ├── strings.json
        └── vectors


/README.MD:
--------------------------------------------------------------------------------
1 | 关于spaCy的中文语言和分析模型测试。
2 | 中文vocab使用的是80余万条新浪新闻(news.vec)，通过fasttext生成的300dim的vector。
3 | 所有内容均为配合 https://www.jianshu.com/p/9bfbdb5dc487 系列文章。
4 | 


--------------------------------------------------------------------------------
/spacy/lang/zh/__init__.py:
--------------------------------------------------------------------------------
 1 | # coding: utf8
 2 | from __future__ import unicode_literals
 3 | 
 4 | from .tag_map import TAG_MAP
 5 | from .stop_words import STOP_WORDS
 6 | from .lex_attrs import LEX_ATTRS
 7 | from .morph_rules import MORPH_RULES
 8 | from .syntax_iterators import SYNTAX_ITERATORS
 9 | 
10 | from ..tokenizer_exceptions import BASE_EXCEPTIONS
11 | from ..norm_exceptions import BASE_NORMS
12 | from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
13 | from ..char_classes import UNITS, CURRENCY, QUOTES, PUNCT, HYPHENS, ICONS, LIST_UNITS, LIST_CURRENCY, LIST_QUOTES, LIST_PUNCT, LIST_HYPHENS, LIST_ELLIPSES, LIST_ICONS
14 | 
15 | from ...attrs import LANG, NORM
16 | from ...language import Language
17 | from ...tokens import Doc
18 | from ...util import update_exc, add_lookups
19 | 
20 | 
21 | class ChineseDefaults(Language.Defaults):
22 |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
23 |     lex_attr_getters.update(LEX_ATTRS)
24 |     lex_attr_getters[LANG] = lambda text: 'zh'  # for pickling
25 |     lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM],
26 |                                          BASE_NORMS)
27 |     tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
28 | 
29 |     use_jieba = True
30 |     tag_map = TAG_MAP
31 |     stop_words = STOP_WORDS
32 |     morph_rules = MORPH_RULES
33 |     syntax_iterators = SYNTAX_ITERATORS
34 | 
35 | 
36 | class Chinese(Language):
37 |     lang = 'zh'
38 |     Defaults = ChineseDefaults  # override defaults
39 | 
40 |     def make_doc(self, text):
41 |         if self.Defaults.use_jieba:
42 |             try:
43 |                 import jieba
44 |             except ImportError:
45 |                 msg = ("Jieba not installed. Either set Chinese.use_jieba = False, "
46 |                        "or install it https://github.com/fxsjy/jieba")
47 |                 raise ImportError(msg)
48 |             words = list(jieba.cut(text, cut_all=False))
49 |             words = [x for x in words if x]
50 |             return Doc(self.vocab, words=words, spaces=[False]*len(words))
51 |         else:
52 |             words = []
53 |             spaces = []
54 |             doc = self.tokenizer(text)
55 |             for token in self.tokenizer(text):
56 |                 words.extend(list(token.text))
57 |                 spaces.extend([False]*len(token.text))
58 |                 spaces[-1] = bool(token.whitespace_)
59 |             return Doc(self.vocab, words=words, spaces=spaces)
60 | 
61 | 
62 | __all__ = ['Chinese']
63 | 


--------------------------------------------------------------------------------
/spacy/lang/zh/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/spacy/lang/zh/__pycache__/lex_attrs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/lex_attrs.cpython-36.pyc


--------------------------------------------------------------------------------
/spacy/lang/zh/__pycache__/morph_rules.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/morph_rules.cpython-36.pyc


--------------------------------------------------------------------------------
/spacy/lang/zh/__pycache__/stop_words.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/stop_words.cpython-36.pyc


--------------------------------------------------------------------------------
/spacy/lang/zh/__pycache__/syntax_iterators.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/syntax_iterators.cpython-36.pyc


--------------------------------------------------------------------------------
/spacy/lang/zh/__pycache__/tag_map.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/tag_map.cpython-36.pyc


--------------------------------------------------------------------------------
/spacy/lang/zh/examples.py:
--------------------------------------------------------------------------------
 1 | # coding: utf8
 2 | from __future__ import unicode_literals
 3 | 
 4 | 
 5 | """
 6 | Example sentences to test spaCy and its language models.
 7 | 
 8 | >>> from spacy.lang.zh.examples import sentences
 9 | >>> docs = nlp.pipe(sentences)
10 | """
11 | 
12 | 
13 | sentences = [
14 |     "苹果公司正考虑用一亿元买下英国的新创公司",
15 |     "自动驾驶汽车将保险责任归属转移至制造商",
16 |     "旧金山考虑禁止送货机器人在人行道上行驶",
17 |     "伦敦是英国的大城市"
18 | ]
19 | 


--------------------------------------------------------------------------------
/spacy/lang/zh/lex_attrs.py:
--------------------------------------------------------------------------------
 1 | # coding: utf8
 2 | from __future__ import unicode_literals
 3 | 
 4 | from ...attrs import LIKE_NUM
 5 | 
 6 | 
 7 | _num_words = ['零', '一', '二', '三', '四', '五', '六', '七',
 8 |               '八', '九', '十', '十一', '十二', '十三', '十四',
 9 |               '十五', '十六', '十七', '十八', '十九', '二十',
10 |               '三十', '四十', '五十', '六十', '七十', '八十', '九十',
11 |               '百', '千', '百万', '十亿', '万亿', '百兆',
12 |               'gajillion', 'bazillion']
13 | 
14 | 
15 | def like_num(text):
16 |     text = text.replace(',', '').replace('.', '')
17 |     if text.isdigit():
18 |         return True
19 |     if text.count('/') == 1:
20 |         num, denom = text.split('/')
21 |         if num.isdigit() and denom.isdigit():
22 |             return True
23 |     if text.lower() in _num_words:
24 |         return True
25 |     return False
26 | 
27 | 
28 | LEX_ATTRS = {
29 |     LIKE_NUM: like_num
30 | }
31 | 


--------------------------------------------------------------------------------
/spacy/lang/zh/morph_rules.py:
--------------------------------------------------------------------------------
 1 | # coding: utf8
 2 | from __future__ import unicode_literals
 3 | 
 4 | from ...symbols import LEMMA, PRON_LEMMA
 5 | 
 6 | 
 7 | MORPH_RULES = {
 8 |     "PRP": {
 9 |         "我":           {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing"},
10 |         "你":           {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"},
11 |         "他":           {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc"},
12 |         "她":           {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem"},
13 |         "它":           {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"},
14 |         "我们":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur"},
15 |         "他们":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Gender": "Masc"},
16 |         "她们":         {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Gender": "Fem"},
17 | 
18 |         "我的":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"},
19 |         "他的":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"},
20 |         "她的":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem",  "Poss": "Yes", "Reflex": "Yes"},
21 |         "它的":          {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"},
22 |         "我们的":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
23 |         "你们的":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
24 |         "他们的":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"},
25 |         "她们的":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"},
26 | 
27 |         "我自己":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing",  "Case": "Acc", "Reflex": "Yes"},
28 |         "你自己":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"},
29 |         "他自己":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Masc", "Reflex": "Yes"},
30 |         "她自己":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem",  "Reflex": "Yes"},
31 |         "它自己":        {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Neut", "Reflex": "Yes"},
32 |         "他们自己":      {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"},
33 |         "她们自己":      {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem","Reflex": "Yes"},
34 |     },
35 | 
36 |     "PRP$": {
37 |         "我的":          {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"},
38 |         "你的":          {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
39 |         "他的":          {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"},
40 |         "她的":          {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Fem",  "PronType": "Prs", "Poss": "Yes"},
41 |         "它的":          {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"},
42 |         "我们的":        {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"},
43 |         "他们的":        {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"},
44 |         "她们的":        {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Gender": "Fem", "Poss": "Yes"}
45 |     },
46 | 
47 |     "VBZ": {
48 |         "是":           {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
49 |         "为":           {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"},
50 |     },
51 | }
52 | 
53 | 
54 | for tag, rules in MORPH_RULES.items():
55 |     for key, attrs in dict(rules).items():
56 |         rules[key.title()] = attrs
57 | 


--------------------------------------------------------------------------------
/spacy/lang/zh/stop_words.py:
--------------------------------------------------------------------------------
   1 | # coding: utf8
   2 | from __future__ import unicode_literals
   3 | 
   4 | 
   5 | # Stop words
   6 | 
   7 | STOP_WORDS = set("""
   8 | a about above across after afterwards again against all almost alone along
   9 | already also although always am among amongst amount an and another any anyhow
  10 | anyone anything anyway anywhere are around as at
  11 | 
  12 | back be became because become becomes becoming been before beforehand behind
  13 | being below beside besides between beyond both bottom but by
  14 | 
  15 | call can cannot ca could
  16 | 
  17 | did do does doing done down due during
  18 | 
  19 | each eight either eleven else elsewhere empty enough even ever every
  20 | everyone everything everywhere except
  21 | 
  22 | few fifteen fifty first five for former formerly forty four from front full
  23 | further
  24 | 
  25 | get give go
  26 | 
  27 | had has have he hence her here hereafter hereby herein hereupon hers herself
  28 | him himself his how however hundred
  29 | 
  30 | i if in indeed into is it its itself
  31 | 
  32 | keep
  33 | 
  34 | last latter latterly least less
  35 | 
  36 | just
  37 | 
  38 | made make many may me meanwhile might mine more moreover most mostly move much
  39 | must my myself
  40 | 
  41 | name namely neither never nevertheless next nine no nobody none noone nor not
  42 | nothing now nowhere n't
  43 | 
  44 | of off often on once one only onto or other others otherwise our ours ourselves
  45 | out over own
  46 | 
  47 | part per perhaps please put
  48 | 
  49 | quite
  50 | 
  51 | rather re really regarding
  52 | 
  53 | same say see seem seemed seeming seems serious several she should show side
  54 | since six sixty so some somehow someone something sometime sometimes somewhere
  55 | still such
  56 | 
  57 | take ten than that the their them themselves then thence there thereafter
  58 | thereby therefore therein thereupon these they third this those though three
  59 | through throughout thru thus to together too top toward towards twelve twenty
  60 | two
  61 | 
  62 | under until up unless upon us used using
  63 | 
  64 | various very very via was we well were what whatever when whence whenever where
  65 | whereafter whereas whereby wherein whereupon wherever whether which while
  66 | whither who whoever whole whom whose why will with within without would
  67 | 
  68 | yet you your yours yourself yourselves
  69 | 
  70 | 'd 'll 'm 're 's 've
  71 | 
  72 | 现在
  73 | 嘿嘿
  74 | 还要
  75 | 要
  76 | 嗳
  77 | 赶早不赶晚
  78 | 这些
  79 | 日益
  80 | 倒不如
  81 | 逢
  82 | 立地
  83 | 不只
  84 | 一般
  85 | 豁然
  86 | 将近
  87 | 毫不
  88 | 嗬
  89 | 大张旗鼓
  90 | 嗯
  91 | 不可开交
  92 | "
  93 | #
  94 |  
  95 | !
  96 | &
  97 | 全力
  98 | 就算
  99 | '
 100 | $
 101 | %
 102 | ...
 103 | *
 104 | +
 105 | (
 106 | 她们
 107 | )
 108 | .
 109 | /
 110 | 各位
 111 | ,
 112 | -
 113 | ︿
 114 | 3
 115 | 2
 116 | 1
 117 | 不止
 118 | 基本
 119 | 0
 120 | 不拘
 121 | 7
 122 | 这里
 123 | 6
 124 | 5
 125 | 颇
 126 | 4
 127 | ;
 128 | :
 129 | 如此
 130 | 9
 131 | 8
 132 | 极度
 133 | ?
 134 | >
 135 | 首先
 136 | =
 137 | <
 138 | 也罢
 139 | @
 140 | A
 141 | 见
 142 | 当庭
 143 | 隔夜
 144 | 更
 145 | 不少
 146 | 不胜
 147 | \
 148 | _
 149 | 替
 150 | ^
 151 | 到目前为止
 152 | 大大
 153 | 除开
 154 | 腾
 155 | 暗中
 156 | 而外
 157 | 开始
 158 | `
 159 | 三番两次
 160 | 宁可
 161 | 这么
 162 | 权时
 163 | 结果
 164 | 大多
 165 | 除此以外
 166 | 单单
 167 | 如下
 168 | 几度
 169 | 何处
 170 | 喂
 171 | 如上
 172 | 矣
 173 | 喀
 174 | 喏
 175 | ~
 176 | 吧哒
 177 | |
 178 | 放量
 179 | 即便
 180 | 当年
 181 | 不对
 182 | 那
 183 | 顷刻
 184 | 本人
 185 | 是
 186 | 岂非
 187 | 己
 188 | 看
 189 | 趁热
 190 | 哪边
 191 | 立马
 192 | 乘势
 193 | 啥
 194 | 何况
 195 | 这个
 196 | 啦
 197 | 人民
 198 | 率尔
 199 | 那种
 200 | 仍然
 201 | 不能
 202 | 根据
 203 | 并肩
 204 | 相对而言
 205 | 也好
 206 | 啐
 207 | 什么样
 208 | 累年
 209 | 啊
 210 | 扑通
 211 | 即使
 212 | 开外
 213 | 大概
 214 | 依照
 215 | ·
 216 | 乃至
 217 | 与否
 218 | 总而言之
 219 | 高低
 220 | 切切
 221 | 多次
 222 | 比如说
 223 | 不亦乐乎
 224 | 如期
 225 | 简言之
 226 | 何妨
 227 | 不管怎样
 228 | 顺
 229 | 顷
 230 | 将才
 231 | 呆呆地
 232 | 略为
 233 | 更为
 234 | 大约
 235 | 其次
 236 | 倍加
 237 | 满
 238 | 不定
 239 | 除了
 240 | 都
 241 | 之后
 242 | 着
 243 | 难道
 244 | 不可
 245 | 至
 246 | 风雨无阻
 247 | 陡然
 248 | 为了
 249 | 及至
 250 | 对于
 251 | 虽说
 252 | 唉
 253 | 彻夜
 254 | 嘎嘎
 255 | 臭
 256 | 不同
 257 | 大体上
 258 | 自
 259 | ！
 260 | ＃
 261 | 来得及
 262 | 哦
 263 | 而
 264 | ％
 265 | ＄
 266 | 的
 267 | ＆
 268 | 哪
 269 | ）
 270 | 老
 271 | （
 272 | ＋
 273 | 哩
 274 | ＊
 275 | ，
 276 | 者
 277 | 人人
 278 | 比方
 279 | ０
 280 | １
 281 | 简而言之
 282 | 这么些
 283 | ２
 284 | ３
 285 | ４
 286 | ５
 287 | ６
 288 | 不至于
 289 | ７
 290 | 这部
 291 | 谁知
 292 | ８
 293 | 几时
 294 | ９
 295 | 屡
 296 | ：
 297 | ；
 298 | ＜
 299 | 应当
 300 | ＞
 301 | 哼
 302 | ？
 303 | 人家
 304 | 如今
 305 | ＠
 306 | 哇
 307 | 哈
 308 | 哉
 309 | 尽量
 310 | 总的说来
 311 | 继之
 312 | 单纯
 313 | 方才
 314 | 哎
 315 | 这么点儿
 316 | 极
 317 | 乘胜
 318 | 八成
 319 | 光是
 320 | 倘或
 321 | 哗
 322 | 被
 323 | 忽然
 324 | ［
 325 | 从头
 326 | 出去
 327 | 哟
 328 | ］
 329 | 咦
 330 | 切不可
 331 | 尽
 332 | 哪年
 333 | 竟然
 334 | 是否
 335 | 而况
 336 | 加以
 337 | 从此以后
 338 | 省得
 339 | 就
 340 | 咳
 341 | ﻿我们
 342 | 不力
 343 | 各个
 344 | 咱
 345 | 充其极
 346 | ｜
 347 | ｝
 348 | 次第
 349 | ～
 350 | 岂止
 351 | ｛
 352 | 某
 353 | 尽管如此
 354 | 偶而
 355 | 看上去
 356 | 截然
 357 | 甚而
 358 | 和
 359 | 如常
 360 | 任何
 361 | 极端
 362 | 接着
 363 | 嘎登
 364 | 咋
 365 | 皆可
 366 | 具体地说
 367 | 凝神
 368 | 这就是说
 369 | 将
 370 | 千万
 371 | 好在
 372 | 从早到晚
 373 | 各自
 374 | 咚
 375 | 取道
 376 | 纯粹
 377 | 这种
 378 | 只限
 379 | 上去
 380 | 恐怕
 381 | 呢
 382 | 莫非
 383 | 虽然
 384 | 碰巧
 385 | 呸
 386 | 紧接着
 387 | 即若
 388 | 本
 389 | 等等
 390 | 按照
 391 | 呵
 392 | 不单
 393 | 具体说来
 394 | 一旦
 395 | 望
 396 | 朝
 397 | 纵
 398 | 不要
 399 | 呀
 400 | 怎样
 401 | 呃
 402 | 轰然
 403 | 有
 404 | 每当
 405 | 接连不断
 406 | 呜
 407 | 呐
 408 | 不比
 409 | 呕
 410 | 纯
 411 | 呗
 412 | 各种
 413 | 理应
 414 | 连袂
 415 | 吧
 416 | 绝
 417 | 什么
 418 | 那里
 419 | 后来
 420 | 给
 421 | 日渐
 422 | 暗自
 423 | 以免
 424 | 经
 425 | 不然
 426 | 来
 427 | 饱
 428 | 别人
 429 | 吱
 430 | 看来
 431 | 沙沙
 432 | 同
 433 | 趁势
 434 | 切莫
 435 | 从重
 436 | 尽心尽力
 437 | 切勿
 438 | 果真
 439 | 各
 440 | ￥
 441 | 要不是
 442 | 白
 443 | 并排
 444 | 自己
 445 | 保管
 446 | 岂
 447 | 差一点
 448 | 默然
 449 | 此中
 450 | 能
 451 | 吗
 452 | 向
 453 | 吓
 454 | 藉以
 455 | 不惟
 456 | 的确
 457 | 此后
 458 | 让
 459 | 待到
 460 | 末##末
 461 | 哪些
 462 | 不然的话
 463 | 其他
 464 | 毫无
 465 | 连声
 466 | 趁早
 467 | 归
 468 | 几经
 469 | 当
 470 | 论
 471 | 九
 472 | 也
 473 | 挨门挨户
 474 | 挨次
 475 | 乘
 476 | 保险
 477 | 从小
 478 | 莫若
 479 | 乒
 480 | 彼
 481 | 乎
 482 | 刚好
 483 | 么
 484 | 如若
 485 | 之
 486 | 默默地
 487 | 是的
 488 | 嗡嗡
 489 | 请勿
 490 | 乃
 491 | 为
 492 | 得起
 493 | 借此
 494 | 该
 495 | 汝
 496 | 从此
 497 | 然而
 498 | 亲眼
 499 | 略微
 500 | 刚才
 501 | 一定
 502 | 反倒是
 503 | 按时
 504 | 临
 505 | 个
 506 | 倘若
 507 | 差不多
 508 | 从无到有
 509 | —
 510 | 起来
 511 | ’
 512 | ‘
 513 | 反之则
 514 | 弗
 515 | ”
 516 | “
 517 | 何止
 518 | 惯常
 519 | 姑且
 520 | 与其
 521 | …
 522 | 哪个
 523 | 反而
 524 | 常言道
 525 | 大抵
 526 | 不再
 527 | 且
 528 | 到了儿
 529 | 三
 530 | 上
 531 | 再者
 532 | 不
 533 | 并且
 534 | 与
 535 | 一
 536 | 趁着
 537 | 七
 538 | 两者
 539 | 等到
 540 | 不经意
 541 | 必
 542 | 如何
 543 | 来着
 544 | 不由得
 545 | 怎么样
 546 | 尽管
 547 | 知道
 548 | 任
 549 | 旁人
 550 | 不管
 551 | 由
 552 | 个人
 553 | 哪里
 554 | 似的
 555 | 以
 556 | 甭
 557 | 甫
 558 | 倒不如说
 559 | 用
 560 | 均
 561 | 其余
 562 | 长此下去
 563 | 们
 564 | 莫
 565 | 匆匆
 566 | 多少
 567 | 当着
 568 | 就是说
 569 | 他
 570 | 既然
 571 | 虽则
 572 | 纵使
 573 | 呼哧
 574 | 沿
 575 | 快
 576 | 仅
 577 | 联袂
 578 | 没
 579 | 或许
 580 | 仍
 581 | 来看
 582 | 俺们
 583 | 从
 584 | 倘然
 585 | 只是
 586 | 往
 587 | 大凡
 588 | 而言
 589 | 当真
 590 | 待
 591 | 因此
 592 | 很
 593 | 不如
 594 | 据此
 595 | 更进一步
 596 | 那么样
 597 | 纵然
 598 | 得
 599 | 不仅...而且
 600 | 极为
 601 | 尽然
 602 | 略
 603 | 长期以来
 604 | 互
 605 | 五
 606 | 不妨
 607 | 不止一次
 608 | 地
 609 | ——
 610 | 较比
 611 | 必须
 612 | 或是
 613 | 向着
 614 | 从古到今
 615 | 在
 616 | 尽如人意
 617 | 了
 618 | 毕竟
 619 | 二
 620 | 川流不息
 621 | 确实
 622 | 于
 623 | 可以
 624 | 你
 625 | 并没有
 626 | 当场
 627 | 要不
 628 | 那儿
 629 | 纵令
 630 | 恰巧
 631 | 无宁
 632 | 四
 633 | 来讲
 634 | 局外
 635 | 近年来
 636 | 因
 637 | 并
 638 | 但
 639 | 起首
 640 | ﻿,
 641 | 赶快
 642 | 方
 643 | 需要
 644 | 即令
 645 | 大略
 646 | 将要
 647 | 活
 648 | 不特
 649 | 然则
 650 | 极了
 651 | 何
 652 | 但是
 653 | 固
 654 | 不独
 655 | 何苦
 656 | 一则
 657 | 猛然
 658 | 屡屡
 659 | 传
 660 | 到底
 661 | 在下
 662 | 设使
 663 | 经过
 664 | 至于
 665 | 老老实实
 666 | 猛然间
 667 | 截至
 668 | 譬如
 669 | 很多
 670 | 一切
 671 | 别的
 672 | 要么
 673 | 趁机
 674 | 。
 675 | 、
 676 | 越是
 677 | 常
 678 | 　
 679 | 按期
 680 | 何尝
 681 | 》
 682 | 《
 683 | 〉
 684 | 〈
 685 | 动不动
 686 | 不外
 687 | 因为
 688 | 使得
 689 | 会
 690 | 既
 691 | 如果
 692 | 按说
 693 | 不大
 694 | 带
 695 | 自从
 696 | 以便
 697 | 宁肯
 698 | 当下
 699 | 不光
 700 | 它们
 701 | 之类
 702 | 老大
 703 | 尽可能
 704 | 尔后
 705 | 成年累月
 706 | 如上所述
 707 | 每个
 708 | 彼此
 709 | 从宽
 710 | 俺
 711 | 就此
 712 | 粗
 713 | 达旦
 714 | 当口儿
 715 | 归根结底
 716 | 看起来
 717 | 或多或少
 718 | 当中
 719 | 据我所知
 720 | 据实
 721 | 不免
 722 | 遵照
 723 | 固然
 724 | 缕缕
 725 | 换言之
 726 | 策略地
 727 | 居然
 728 | 连日来
 729 | 若
 730 | 起见
 731 | 比照
 732 | 嘎
 733 | 不成
 734 | 不仅仅是
 735 | 长话短说
 736 | 因而
 737 | 设若
 738 | 不论
 739 | 嘘
 740 | 嘛
 741 | 沿着
 742 | 恍然
 743 | 慢说
 744 | 亲身
 745 | 哼唷
 746 | 故
 747 | 便
 748 | 以至
 749 | 以致
 750 | 本着
 751 | 论说
 752 | 除外
 753 | 之所以
 754 | 简直
 755 | 前后
 756 | 大家
 757 | 嘻
 758 | 果然
 759 | 共总
 760 | 嘿
 761 | 敢
 762 | 时候
 763 | 不怎么
 764 | 如次
 765 | 依
 766 | 鄙人
 767 | 亲手
 768 | 大
 769 | 顿时
 770 | 顺着
 771 | 叮当
 772 | 敞开儿
 773 | 等
 774 | 大面儿上
 775 | 年复一年
 776 | 冲
 777 | 打开天窗说亮话
 778 | 跟
 779 | 上来
 780 | 拿
 781 | 假若
 782 | 不曾
 783 | 着呢
 784 | 快要
 785 | 此刻
 786 | 而且
 787 | 背靠背
 788 | 假使
 789 | 陈年
 790 | 多多益善
 791 | 另方面
 792 | 冒
 793 | 他人
 794 | 到处
 795 | 大体
 796 | 下来
 797 | 云云
 798 | 全然
 799 | 何须
 800 | 为着
 801 | 每逢
 802 | 内
 803 | 多
 804 | 很少
 805 | 尚且
 806 | 只要
 807 | 不仅仅
 808 | 出
 809 |   
 810 | 顷刻间
 811 | 常常
 812 | 趁
 813 | 日臻
 814 | 恰似
 815 | 得天独厚
 816 | 另外
 817 | 敢情
 818 | 率然
 819 | 并无
 820 | 届时
 821 | 凭
 822 | 每每
 823 | 几
 824 | 她
 825 | 成为
 826 | 他们
 827 | 尔等
 828 | 尽快
 829 | 不消
 830 | 如其
 831 | 把
 832 | 反之亦然
 833 | 当即
 834 | 奇
 835 | 据悉
 836 | 奈
 837 | 前者
 838 | 第
 839 | 必定
 840 |  [
 841 | 处处
 842 |  ]
 843 | 断然
 844 | 绝非
 845 | 总的来看
 846 | 岂但
 847 | 分期
 848 | 古来
 849 | 我们
 850 | 啪达
 851 | 顷刻之间
 852 | 每次
 853 | 别说
 854 | 传闻
 855 | 从优
 856 | 总的来说
 857 | 非徒
 858 | 常言说得好
 859 | 非得
 860 | 由于
 861 | 难说
 862 | 可是
 863 | 从今以后
 864 | 比如
 865 | 所
 866 | 继而
 867 | 不可抗拒
 868 | 才
 869 | 如
 870 | 精光
 871 | 凭借
 872 | 略加
 873 | 起
 874 | 平素
 875 | 绝对
 876 | 赶
 877 | 于是
 878 | 打
 879 | 一样
 880 | 长线
 881 | 每时每刻
 882 | 不择手段
 883 | 理该
 884 | 共
 885 | 拦腰
 886 | 喔唷
 887 | 其
 888 | 仍旧
 889 | 屡次
 890 | 以及
 891 | 当然
 892 | 到头来
 893 | 抑或
 894 | 宁愿
 895 | 一方面
 896 | 举凡
 897 | 只有
 898 | 八
 899 | 咱们
 900 | 六
 901 | 从新
 902 | 兮
 903 | 这样
 904 | 不得已
 905 | 管
 906 | 十分
 907 | 自个儿
 908 | 呼啦
 909 | 我
 910 | 必将
 911 | 串行
 912 | 而论
 913 | 或
 914 | 牢牢
 915 | 成心
 916 | 光
 917 | 哈哈
 918 | 与此同时
 919 | 其一
 920 | 于是乎
 921 | 此
 922 | 看样子
 923 | 换句话说
 924 | 全身心
 925 | 除非
 926 | 有人
 927 | 以至于
 928 | 按理
 929 | 也许
 930 | 打从
 931 | 照着
 932 | 况且
 933 | 独
 934 | 除却
 935 | 不了
 936 | 不得
 937 | 反手
 938 | 成年
 939 | 哎呀
 940 | 关于
 941 | 恰恰相反
 942 | 这儿
 943 | 累次
 944 | 其中
 945 | 动辄
 946 | 立刻
 947 | 倒是
 948 | 毫无例外
 949 | 从古至今
 950 | 可见
 951 | 诚然
 952 | 莫不
 953 | 怎么办
 954 | 亲自
 955 | 经常
 956 | 决不
 957 | 自各儿
 958 | 这么样
 959 | 不必
 960 | 不得了
 961 | 除去
 962 | 由此可见
 963 | 像
 964 | 有些
 965 | 挨个
 966 | 不仅
 967 | 进来
 968 | 大事
 969 | 全年
 970 | 绝顶
 971 | 社会主义
 972 | 总之
 973 | 当头
 974 | 若是
 975 | 竟
 976 | 不外乎
 977 | 要不然
 978 | 如此等等
 979 | 分期分批
 980 | 那么
 981 | 毋宁
 982 | 立
 983 | 其二
 984 | 不会
 985 | ..
 986 | 背地里
 987 | 据
 988 | 此间
 989 | 哪儿
 990 | 不怕
 991 | 不问
 992 | 每
 993 | 为什么
 994 | 没有
 995 | 公然
 996 | 那会儿
 997 | 迫于
 998 | 来不及
 999 | 不起
1000 | 千万千万
1001 | 可能
1002 | 正如
1003 | 比
1004 | 还有
1005 | 借
1006 | 倘
1007 | 究竟
1008 | 及其
1009 | 不限
1010 | 偏偏
1011 | 据称
1012 | 故此
1013 | 谁
1014 | 伙同
1015 | 敢于
1016 | 弹指之间
1017 | 那些
1018 | 窃
1019 | 朝着
1020 | 叮咚
1021 | 临到
1022 | 即将
1023 | 哎哟
1024 | 而已
1025 | 尽心竭力
1026 | 到头
1027 | 亲口
1028 | 已经
1029 | 不但
1030 | 出来
1031 | 随着
1032 | 不得不
1033 | 非常
1034 | 另一个
1035 | 非但
1036 | 如前所述
1037 | 殆
1038 | --
1039 | 诸位
1040 | 那时
1041 | 即是说
1042 | 按
1043 | 谨
1044 | 何时
1045 | 此外
1046 | 然后
1047 | 勃然
1048 | 从来
1049 | 近几年来
1050 | 近来
1051 | 莫如
1052 | 奋勇
1053 | 比起
1054 | 仅仅
1055 | 故而
1056 | 穷年累月
1057 | 历
1058 | 乌乎
1059 | 怪不得
1060 | 去
1061 | 借以
1062 | 主要
1063 | 间或
1064 | 方能
1065 | 白白
1066 | 除
1067 | 反过来
1068 | 全都
1069 | 并没
1070 | 过
1071 | 除此之外
1072 | 马上
1073 | 迄
1074 | 恰恰
1075 | 传说
1076 | 还
1077 | 这
1078 | 连
1079 | 近
1080 | 从速
1081 | 上下
1082 | 哪样
1083 | 这边
1084 | 从未
1085 | 不能不
1086 | 从不
1087 | 及
1088 | 那个
1089 | 边
1090 | 又
1091 | 迟早
1092 | 不知不觉
1093 | 挨家挨户
1094 | 多多少少
1095 | 几番
1096 | 有关
1097 | 您
1098 | 连同
1099 | 较
1100 | 互相
1101 | 怎么
1102 | 但愿
1103 | 可
1104 | 你们
1105 | 凑巧
1106 | 连日
1107 | 叫
1108 | 路经
1109 | 阿
1110 | 起先
1111 | 另
1112 | 二话没说
1113 | 之一
1114 | 这时
1115 | 即或
1116 | 连连
1117 | 其后
1118 | 各式
1119 | 当儿
1120 | 独自
1121 | 它
1122 | 宁
1123 | 哪天
1124 | 就是
1125 | 乘机
1126 | 常言说
1127 | 不下
1128 | 定
1129 | 照
1130 | 昂然
1131 | 毫无保留地
1132 | 趁便
1133 | 屡次三番
1134 | 甚至
1135 | 那末
1136 | 充其量
1137 | 该当
1138 | 另一方面
1139 | 既...又
1140 | 瑟瑟
1141 | 的话
1142 | 呜呼
1143 | 或者
1144 | 立时
1145 | 反过来说
1146 | 有的
1147 | 挨着
1148 | 再说
1149 | 够瞧的
1150 | 过于
1151 | 零
1152 | 就地
1153 | 然
1154 | 极其
1155 | 何乐而不为
1156 | 进去
1157 | 单
1158 | 随
1159 | 起头
1160 | 无论
1161 | 怎
1162 | 据说
1163 | 综上所述
1164 | 抽冷子
1165 | 才能
1166 | 怕
1167 | 千
1168 | 离
1169 | 梆
1170 | 极大
1171 | 恰逢
1172 | 半
1173 | 大举
1174 | 漫说
1175 | 接下来
1176 | 忽地
1177 | 而是
1178 | 即
1179 | 难得
1180 | 不但...而且
1181 | 格外
1182 | 怪
1183 | 倘使
1184 | 还是
1185 | 从而
1186 | 对
1187 | 本身
1188 | 乘隙
1189 | 既是
1190 | 理当
1191 | 反倒
1192 | 焉
1193 | 可好
1194 | 不满
1195 | 交口
1196 | 基本上
1197 | 认为
1198 | 这会儿
1199 | 充分
1200 | 并非
1201 | 不迭
1202 | 老是
1203 | 倍感
1204 | 鉴于
1205 | 要是
1206 | 反之
1207 | 哪怕
1208 | 除此而外
1209 | 虽
1210 | 一下
1211 | 自身
1212 | 任凭
1213 | 几乎
1214 | 顶多
1215 | 靠
1216 | 而又
1217 | 不时
1218 | 一个
1219 | 否则
1220 | 自家
1221 | 三天两头
1222 | 砰
1223 | 啊呀
1224 | 难怪
1225 | 所以
1226 | 发生
1227 | 哗啦
1228 | 多年来
1229 | 罢了
1230 | 大致
1231 | 从轻
1232 | 那边
1233 | 那么些
1234 | 不巧
1235 | 完全
1236 | 起初
1237 | 某个
1238 | 加之
1239 | 大不了
1240 | 归根到底
1241 | 偶尔
1242 | 应该
1243 | 二话不说
1244 | 日见
1245 | 不是
1246 | 大都
1247 | 愤然
1248 | 而后
1249 | 多年前
1250 | 例如
1251 | 蛮
1252 | 切
1253 | 一些
1254 | 多多
1255 | 日复一日
1256 | 较之
1257 | 即刻
1258 | 那样
1259 | 齐
1260 | 其它
1261 | 则
1262 | 不料
1263 | 刚
1264 | 初
1265 | 决非
1266 | 乘虚
1267 | 恰如
1268 | 能够
1269 | 从严
1270 | 故意
1271 | 别
1272 | 啊哟
1273 | 从中
1274 | 不已
1275 | 加上
1276 | 具体来说
1277 | 较为
1278 | 分头
1279 | 直到
1280 | 到
1281 | >>
1282 | 隔日
1283 | 多亏
1284 | 假如
1285 | 甚么
1286 | 作为
1287 | 暗地里
1288 | 挨门逐户
1289 | 恰好
1290 | 其实
1291 | 何必
1292 | 万一
1293 | 不过
1294 | 某些
1295 | 啊哈
1296 | 基于
1297 | 不日
1298 | 尽早
1299 | 刚巧
1300 | 概
1301 | 一来
1302 | 同时
1303 | 三番五次
1304 | 为何
1305 | 更加
1306 | 绝不
1307 | 除此
1308 | 不常
1309 | 进而
1310 | 另行
1311 | 急匆匆
1312 | 通过
1313 | 话说
1314 | 若非
1315 | 极力
1316 | 存心
1317 | a
1318 | able
1319 | about
1320 | above
1321 | according
1322 | accordingly
1323 | across
1324 | actually
1325 | after
1326 | afterwards
1327 | again
1328 | against
1329 | ain't
1330 | all
1331 | allow
1332 | allows
1333 | almost
1334 | alone
1335 | along
1336 | already
1337 | also
1338 | although
1339 | always
1340 | am
1341 | among
1342 | amongst
1343 | an
1344 | and
1345 | another
1346 | any
1347 | anybody
1348 | anyhow
1349 | anyone
1350 | anything
1351 | anyway
1352 | anyways
1353 | anywhere
1354 | apart
1355 | appear
1356 | appreciate
1357 | appropriate
1358 | are
1359 | aren't
1360 | around
1361 | as
1362 | a's
1363 | aside
1364 | ask
1365 | asking
1366 | associated
1367 | at
1368 | available
1369 | away
1370 | awfully
1371 | be
1372 | became
1373 | because
1374 | become
1375 | becomes
1376 | becoming
1377 | been
1378 | before
1379 | beforehand
1380 | behind
1381 | being
1382 | believe
1383 | below
1384 | beside
1385 | besides
1386 | best
1387 | better
1388 | between
1389 | beyond
1390 | both
1391 | brief
1392 | but
1393 | by
1394 | came
1395 | can
1396 | cannot
1397 | cant
1398 | can't
1399 | cause
1400 | causes
1401 | certain
1402 | certainly
1403 | changes
1404 | clearly
1405 | c'mon
1406 | co
1407 | com
1408 | come
1409 | comes
1410 | concerning
1411 | consequently
1412 | consider
1413 | considering
1414 | contain
1415 | containing
1416 | contains
1417 | corresponding
1418 | could
1419 | couldn't
1420 | course
1421 | c's
1422 | currently
1423 | definitely
1424 | described
1425 | despite
1426 | did
1427 | didn't
1428 | different
1429 | do
1430 | does
1431 | doesn't
1432 | doing
1433 | done
1434 | don't
1435 | down
1436 | downwards
1437 | during
1438 | each
1439 | edu
1440 | eg
1441 | eight
1442 | either
1443 | else
1444 | elsewhere
1445 | enough
1446 | entirely
1447 | especially
1448 | et
1449 | etc
1450 | even
1451 | ever
1452 | every
1453 | everybody
1454 | everyone
1455 | everything
1456 | everywhere
1457 | ex
1458 | exactly
1459 | example
1460 | except
1461 | far
1462 | few
1463 | fifth
1464 | first
1465 | five
1466 | followed
1467 | following
1468 | follows
1469 | for
1470 | former
1471 | formerly
1472 | forth
1473 | four
1474 | from
1475 | further
1476 | furthermore
1477 | get
1478 | gets
1479 | getting
1480 | given
1481 | gives
1482 | go
1483 | goes
1484 | going
1485 | gone
1486 | got
1487 | gotten
1488 | greetings
1489 | had
1490 | hadn't
1491 | happens
1492 | hardly
1493 | has
1494 | hasn't
1495 | have
1496 | haven't
1497 | having
1498 | he
1499 | hello
1500 | help
1501 | hence
1502 | her
1503 | here
1504 | hereafter
1505 | hereby
1506 | herein
1507 | here's
1508 | hereupon
1509 | hers
1510 | herself
1511 | he's
1512 | hi
1513 | him
1514 | himself
1515 | his
1516 | hither
1517 | hopefully
1518 | how
1519 | howbeit
1520 | however
1521 | i'd
1522 | ie
1523 | if
1524 | ignored
1525 | i'll
1526 | i'm
1527 | immediate
1528 | in
1529 | inasmuch
1530 | inc
1531 | indeed
1532 | indicate
1533 | indicated
1534 | indicates
1535 | inner
1536 | insofar
1537 | instead
1538 | into
1539 | inward
1540 | is
1541 | isn't
1542 | it
1543 | it'd
1544 | it'll
1545 | its
1546 | it's
1547 | itself
1548 | i've
1549 | just
1550 | keep
1551 | keeps
1552 | kept
1553 | know
1554 | known
1555 | knows
1556 | last
1557 | lately
1558 | later
1559 | latter
1560 | latterly
1561 | least
1562 | less
1563 | lest
1564 | let
1565 | let's
1566 | like
1567 | liked
1568 | likely
1569 | little
1570 | look
1571 | looking
1572 | looks
1573 | ltd
1574 | mainly
1575 | many
1576 | may
1577 | maybe
1578 | me
1579 | mean
1580 | meanwhile
1581 | merely
1582 | might
1583 | more
1584 | moreover
1585 | most
1586 | mostly
1587 | much
1588 | must
1589 | my
1590 | myself
1591 | name
1592 | namely
1593 | nd
1594 | near
1595 | nearly
1596 | necessary
1597 | need
1598 | needs
1599 | neither
1600 | never
1601 | nevertheless
1602 | new
1603 | next
1604 | nine
1605 | no
1606 | nobody
1607 | non
1608 | none
1609 | noone
1610 | nor
1611 | normally
1612 | not
1613 | nothing
1614 | novel
1615 | now
1616 | nowhere
1617 | obviously
1618 | of
1619 | off
1620 | often
1621 | oh
1622 | ok
1623 | okay
1624 | old
1625 | on
1626 | once
1627 | one
1628 | ones
1629 | only
1630 | onto
1631 | or
1632 | other
1633 | others
1634 | otherwise
1635 | ought
1636 | our
1637 | ours
1638 | ourselves
1639 | out
1640 | outside
1641 | over
1642 | overall
1643 | own
1644 | particular
1645 | particularly
1646 | per
1647 | perhaps
1648 | placed
1649 | please
1650 | plus
1651 | possible
1652 | presumably
1653 | probably
1654 | provides
1655 | que
1656 | quite
1657 | qv
1658 | rather
1659 | rd
1660 | re
1661 | really
1662 | reasonably
1663 | regarding
1664 | regardless
1665 | regards
1666 | relatively
1667 | respectively
1668 | right
1669 | said
1670 | same
1671 | saw
1672 | say
1673 | saying
1674 | says
1675 | second
1676 | secondly
1677 | see
1678 | seeing
1679 | seem
1680 | seemed
1681 | seeming
1682 | seems
1683 | seen
1684 | self
1685 | selves
1686 | sensible
1687 | sent
1688 | serious
1689 | seriously
1690 | seven
1691 | several
1692 | shall
1693 | she
1694 | should
1695 | shouldn't
1696 | since
1697 | six
1698 | so
1699 | some
1700 | somebody
1701 | somehow
1702 | someone
1703 | something
1704 | sometime
1705 | sometimes
1706 | somewhat
1707 | somewhere
1708 | soon
1709 | sorry
1710 | specified
1711 | specify
1712 | specifying
1713 | still
1714 | sub
1715 | such
1716 | sup
1717 | sure
1718 | take
1719 | taken
1720 | tell
1721 | tends
1722 | th
1723 | than
1724 | thank
1725 | thanks
1726 | thanx
1727 | that
1728 | thats
1729 | that's
1730 | the
1731 | their
1732 | theirs
1733 | them
1734 | themselves
1735 | then
1736 | thence
1737 | there
1738 | thereafter
1739 | thereby
1740 | therefore
1741 | therein
1742 | theres
1743 | there's
1744 | thereupon
1745 | these
1746 | they
1747 | they'd
1748 | they'll
1749 | they're
1750 | they've
1751 | think
1752 | third
1753 | this
1754 | thorough
1755 | thoroughly
1756 | those
1757 | though
1758 | three
1759 | through
1760 | throughout
1761 | thru
1762 | thus
1763 | to
1764 | together
1765 | too
1766 | took
1767 | toward
1768 | towards
1769 | tried
1770 | tries
1771 | truly
1772 | try
1773 | trying
1774 | t's
1775 | twice
1776 | two
1777 | un
1778 | under
1779 | unfortunately
1780 | unless
1781 | unlikely
1782 | until
1783 | unto
1784 | up
1785 | upon
1786 | us
1787 | use
1788 | used
1789 | useful
1790 | uses
1791 | using
1792 | usually
1793 | value
1794 | various
1795 | very
1796 | via
1797 | viz
1798 | vs
1799 | want
1800 | wants
1801 | was
1802 | wasn't
1803 | way
1804 | we
1805 | we'd
1806 | welcome
1807 | well
1808 | we'll
1809 | went
1810 | were
1811 | we're
1812 | weren't
1813 | we've
1814 | what
1815 | whatever
1816 | what's
1817 | when
1818 | whence
1819 | whenever
1820 | where
1821 | whereafter
1822 | whereas
1823 | whereby
1824 | wherein
1825 | where's
1826 | whereupon
1827 | wherever
1828 | whether
1829 | which
1830 | while
1831 | whither
1832 | who
1833 | whoever
1834 | whole
1835 | whom
1836 | who's
1837 | whose
1838 | why
1839 | will
1840 | willing
1841 | wish
1842 | with
1843 | within
1844 | without
1845 | wonder
1846 | won't
1847 | would
1848 | wouldn't
1849 | yes
1850 | yet
1851 | you
1852 | you'd
1853 | you'll
1854 | your
1855 | you're
1856 | yours
1857 | yourself
1858 | yourselves
1859 | you've
1860 | zero
1861 | zt
1862 | ZT
1863 | zz
1864 | ZZ
1865 | 一
1866 | 一下
1867 | 一些
1868 | 一切
1869 | 一则
1870 | 一天
1871 | 一定
1872 | 一方面
1873 | 一旦
1874 | 一时
1875 | 一来
1876 | 一样
1877 | 一次
1878 | 一片
1879 | 一直
1880 | 一致
1881 | 一般
1882 | 一起
1883 | 一边
1884 | 一面
1885 | 万一
1886 | 上下
1887 | 上升
1888 | 上去
1889 | 上来
1890 | 上述
1891 | 上面
1892 | 下列
1893 | 下去
1894 | 下来
1895 | 下面
1896 | 不一
1897 | 不久
1898 | 不仅
1899 | 不会
1900 | 不但
1901 | 不光
1902 | 不单
1903 | 不变
1904 | 不只
1905 | 不可
1906 | 不同
1907 | 不够
1908 | 不如
1909 | 不得
1910 | 不怕
1911 | 不惟
1912 | 不成
1913 | 不拘
1914 | 不敢
1915 | 不断
1916 | 不是
1917 | 不比
1918 | 不然
1919 | 不特
1920 | 不独
1921 | 不管
1922 | 不能
1923 | 不要
1924 | 不论
1925 | 不足
1926 | 不过
1927 | 不问
1928 | 与
1929 | 与其
1930 | 与否
1931 | 与此同时
1932 | 专门
1933 | 且
1934 | 两者
1935 | 严格
1936 | 严重
1937 | 个
1938 | 个人
1939 | 个别
1940 | 中小
1941 | 中间
1942 | 丰富
1943 | 临
1944 | 为
1945 | 为主
1946 | 为了
1947 | 为什么
1948 | 为什麽
1949 | 为何
1950 | 为着
1951 | 主张
1952 | 主要
1953 | 举行
1954 | 乃
1955 | 乃至
1956 | 么
1957 | 之
1958 | 之一
1959 | 之前
1960 | 之后
1961 | 之後
1962 | 之所以
1963 | 之类
1964 | 乌乎
1965 | 乎
1966 | 乘
1967 | 也
1968 | 也好
1969 | 也是
1970 | 也罢
1971 | 了
1972 | 了解
1973 | 争取
1974 | 于
1975 | 于是
1976 | 于是乎
1977 | 云云
1978 | 互相
1979 | 产生
1980 | 人们
1981 | 人家
1982 | 什么
1983 | 什么样
1984 | 什麽
1985 | 今后
1986 | 今天
1987 | 今年
1988 | 今後
1989 | 仍然
1990 | 从
1991 | 从事
1992 | 从而
1993 | 他
1994 | 他人
1995 | 他们
1996 | 他的
1997 | 代替
1998 | 以
1999 | 以上
2000 | 以下
2001 | 以为
2002 | 以便
2003 | 以免
2004 | 以前
2005 | 以及
2006 | 以后
2007 | 以外
2008 | 以後
2009 | 以来
2010 | 以至
2011 | 以至于
2012 | 以致
2013 | 们
2014 | 任
2015 | 任何
2016 | 任凭
2017 | 任务
2018 | 企图
2019 | 伟大
2020 | 似乎
2021 | 似的
2022 | 但
2023 | 但是
2024 | 何
2025 | 何况
2026 | 何处
2027 | 何时
2028 | 作为
2029 | 你
2030 | 你们
2031 | 你的
2032 | 使得
2033 | 使用
2034 | 例如
2035 | 依
2036 | 依照
2037 | 依靠
2038 | 促进
2039 | 保持
2040 | 俺
2041 | 俺们
2042 | 倘
2043 | 倘使
2044 | 倘或
2045 | 倘然
2046 | 倘若
2047 | 假使
2048 | 假如
2049 | 假若
2050 | 做到
2051 | 像
2052 | 允许
2053 | 充分
2054 | 先后
2055 | 先後
2056 | 先生
2057 | 全部
2058 | 全面
2059 | 兮
2060 | 共同
2061 | 关于
2062 | 其
2063 | 其一
2064 | 其中
2065 | 其二
2066 | 其他
2067 | 其余
2068 | 其它
2069 | 其实
2070 | 其次
2071 | 具体
2072 | 具体地说
2073 | 具体说来
2074 | 具有
2075 | 再者
2076 | 再说
2077 | 冒
2078 | 冲
2079 | 决定
2080 | 况且
2081 | 准备
2082 | 几
2083 | 几乎
2084 | 几时
2085 | 凭
2086 | 凭借
2087 | 出去
2088 | 出来
2089 | 出现
2090 | 分别
2091 | 则
2092 | 别
2093 | 别的
2094 | 别说
2095 | 到
2096 | 前后
2097 | 前者
2098 | 前进
2099 | 前面
2100 | 加之
2101 | 加以
2102 | 加入
2103 | 加强
2104 | 十分
2105 | 即
2106 | 即令
2107 | 即使
2108 | 即便
2109 | 即或
2110 | 即若
2111 | 却不
2112 | 原来
2113 | 又
2114 | 及
2115 | 及其
2116 | 及时
2117 | 及至
2118 | 双方
2119 | 反之
2120 | 反应
2121 | 反映
2122 | 反过来
2123 | 反过来说
2124 | 取得
2125 | 受到
2126 | 变成
2127 | 另
2128 | 另一方面
2129 | 另外
2130 | 只是
2131 | 只有
2132 | 只要
2133 | 只限
2134 | 叫
2135 | 叫做
2136 | 召开
2137 | 叮咚
2138 | 可
2139 | 可以
2140 | 可是
2141 | 可能
2142 | 可见
2143 | 各
2144 | 各个
2145 | 各人
2146 | 各位
2147 | 各地
2148 | 各种
2149 | 各级
2150 | 各自
2151 | 合理
2152 | 同
2153 | 同一
2154 | 同时
2155 | 同样
2156 | 后来
2157 | 后面
2158 | 向
2159 | 向着
2160 | 吓
2161 | 吗
2162 | 否则
2163 | 吧
2164 | 吧哒
2165 | 吱
2166 | 呀
2167 | 呃
2168 | 呕
2169 | 呗
2170 | 呜
2171 | 呜呼
2172 | 呢
2173 | 周围
2174 | 呵
2175 | 呸
2176 | 呼哧
2177 | 咋
2178 | 和
2179 | 咚
2180 | 咦
2181 | 咱
2182 | 咱们
2183 | 咳
2184 | 哇
2185 | 哈
2186 | 哈哈
2187 | 哉
2188 | 哎
2189 | 哎呀
2190 | 哎哟
2191 | 哗
2192 | 哟
2193 | 哦
2194 | 哩
2195 | 哪
2196 | 哪个
2197 | 哪些
2198 | 哪儿
2199 | 哪天
2200 | 哪年
2201 | 哪怕
2202 | 哪样
2203 | 哪边
2204 | 哪里
2205 | 哼
2206 | 哼唷
2207 | 唉
2208 | 啊
2209 | 啐
2210 | 啥
2211 | 啦
2212 | 啪达
2213 | 喂
2214 | 喏
2215 | 喔唷
2216 | 嗡嗡
2217 | 嗬
2218 | 嗯
2219 | 嗳
2220 | 嘎
2221 | 嘎登
2222 | 嘘
2223 | 嘛
2224 | 嘻
2225 | 嘿
2226 | 因
2227 | 因为
2228 | 因此
2229 | 因而
2230 | 固然
2231 | 在
2232 | 在下
2233 | 地
2234 | 坚决
2235 | 坚持
2236 | 基本
2237 | 处理
2238 | 复杂
2239 | 多
2240 | 多少
2241 | 多数
2242 | 多次
2243 | 大力
2244 | 大多数
2245 | 大大
2246 | 大家
2247 | 大批
2248 | 大约
2249 | 大量
2250 | 失去
2251 | 她
2252 | 她们
2253 | 她的
2254 | 好的
2255 | 好象
2256 | 如
2257 | 如上所述
2258 | 如下
2259 | 如何
2260 | 如其
2261 | 如果
2262 | 如此
2263 | 如若
2264 | 存在
2265 | 宁
2266 | 宁可
2267 | 宁愿
2268 | 宁肯
2269 | 它
2270 | 它们
2271 | 它们的
2272 | 它的
2273 | 安全
2274 | 完全
2275 | 完成
2276 | 实现
2277 | 实际
2278 | 宣布
2279 | 容易
2280 | 密切
2281 | 对
2282 | 对于
2283 | 对应
2284 | 将
2285 | 少数
2286 | 尔后
2287 | 尚且
2288 | 尤其
2289 | 就
2290 | 就是
2291 | 就是说
2292 | 尽
2293 | 尽管
2294 | 属于
2295 | 岂但
2296 | 左右
2297 | 巨大
2298 | 巩固
2299 | 己
2300 | 已经
2301 | 帮助
2302 | 常常
2303 | 并
2304 | 并不
2305 | 并不是
2306 | 并且
2307 | 并没有
2308 | 广大
2309 | 广泛
2310 | 应当
2311 | 应用
2312 | 应该
2313 | 开外
2314 | 开始
2315 | 开展
2316 | 引起
2317 | 强烈
2318 | 强调
2319 | 归
2320 | 当
2321 | 当前
2322 | 当时
2323 | 当然
2324 | 当着
2325 | 形成
2326 | 彻底
2327 | 彼
2328 | 彼此
2329 | 往
2330 | 往往
2331 | 待
2332 | 後来
2333 | 後面
2334 | 得
2335 | 得出
2336 | 得到
2337 | 心里
2338 | 必然
2339 | 必要
2340 | 必须
2341 | 怎
2342 | 怎么
2343 | 怎么办
2344 | 怎么样
2345 | 怎样
2346 | 怎麽
2347 | 总之
2348 | 总是
2349 | 总的来看
2350 | 总的来说
2351 | 总的说来
2352 | 总结
2353 | 总而言之
2354 | 恰恰相反
2355 | 您
2356 | 意思
2357 | 愿意
2358 | 慢说
2359 | 成为
2360 | 我
2361 | 我们
2362 | 我的
2363 | 或
2364 | 或是
2365 | 或者
2366 | 战斗
2367 | 所
2368 | 所以
2369 | 所有
2370 | 所谓
2371 | 打
2372 | 扩大
2373 | 把
2374 | 抑或
2375 | 拿
2376 | 按
2377 | 按照
2378 | 换句话说
2379 | 换言之
2380 | 据
2381 | 掌握
2382 | 接着
2383 | 接著
2384 | 故
2385 | 故此
2386 | 整个
2387 | 方便
2388 | 方面
2389 | 旁人
2390 | 无宁
2391 | 无法
2392 | 无论
2393 | 既
2394 | 既是
2395 | 既然
2396 | 时候
2397 | 明显
2398 | 明确
2399 | 是
2400 | 是否
2401 | 是的
2402 | 显然
2403 | 显著
2404 | 普通
2405 | 普遍
2406 | 更加
2407 | 曾经
2408 | 替
2409 | 最后
2410 | 最大
2411 | 最好
2412 | 最後
2413 | 最近
2414 | 最高
2415 | 有
2416 | 有些
2417 | 有关
2418 | 有利
2419 | 有力
2420 | 有所
2421 | 有效
2422 | 有时
2423 | 有点
2424 | 有的
2425 | 有着
2426 | 有著
2427 | 望
2428 | 朝
2429 | 朝着
2430 | 本
2431 | 本着
2432 | 来
2433 | 来着
2434 | 极了
2435 | 构成
2436 | 果然
2437 | 果真
2438 | 某
2439 | 某个
2440 | 某些
2441 | 根据
2442 | 根本
2443 | 欢迎
2444 | 正在
2445 | 正如
2446 | 正常
2447 | 此
2448 | 此外
2449 | 此时
2450 | 此间
2451 | 毋宁
2452 | 每
2453 | 每个
2454 | 每天
2455 | 每年
2456 | 每当
2457 | 比
2458 | 比如
2459 | 比方
2460 | 比较
2461 | 毫不
2462 | 没有
2463 | 沿
2464 | 沿着
2465 | 注意
2466 | 深入
2467 | 清楚
2468 | 满足
2469 | 漫说
2470 | 焉
2471 | 然则
2472 | 然后
2473 | 然後
2474 | 然而
2475 | 照
2476 | 照着
2477 | 特别是
2478 | 特殊
2479 | 特点
2480 | 现代
2481 | 现在
2482 | 甚么
2483 | 甚而
2484 | 甚至
2485 | 用
2486 | 由
2487 | 由于
2488 | 由此可见
2489 | 的
2490 | 的话
2491 | 目前
2492 | 直到
2493 | 直接
2494 | 相似
2495 | 相信
2496 | 相反
2497 | 相同
2498 | 相对
2499 | 相对而言
2500 | 相应
2501 | 相当
2502 | 相等
2503 | 省得
2504 | 看出
2505 | 看到
2506 | 看来
2507 | 看看
2508 | 看见
2509 | 真是
2510 | 真正
2511 | 着
2512 | 着呢
2513 | 矣
2514 | 知道
2515 | 确定
2516 | 离
2517 | 积极
2518 | 移动
2519 | 突出
2520 | 突然
2521 | 立即
2522 | 第
2523 | 等
2524 | 等等
2525 | 管
2526 | 紧接着
2527 | 纵
2528 | 纵令
2529 | 纵使
2530 | 纵然
2531 | 练习
2532 | 组成
2533 | 经
2534 | 经常
2535 | 经过
2536 | 结合
2537 | 结果
2538 | 给
2539 | 绝对
2540 | 继续
2541 | 继而
2542 | 维持
2543 | 综上所述
2544 | 罢了
2545 | 考虑
2546 | 者
2547 | 而
2548 | 而且
2549 | 而况
2550 | 而外
2551 | 而已
2552 | 而是
2553 | 而言
2554 | 联系
2555 | 能
2556 | 能否
2557 | 能够
2558 | 腾
2559 | 自
2560 | 自个儿
2561 | 自从
2562 | 自各儿
2563 | 自家
2564 | 自己
2565 | 自身
2566 | 至
2567 | 至于
2568 | 良好
2569 | 若
2570 | 若是
2571 | 若非
2572 | 范围
2573 | 莫若
2574 | 获得
2575 | 虽
2576 | 虽则
2577 | 虽然
2578 | 虽说
2579 | 行为
2580 | 行动
2581 | 表明
2582 | 表示
2583 | 被
2584 | 要
2585 | 要不
2586 | 要不是
2587 | 要不然
2588 | 要么
2589 | 要是
2590 | 要求
2591 | 规定
2592 | 觉得
2593 | 认为
2594 | 认真
2595 | 认识
2596 | 让
2597 | 许多
2598 | 论
2599 | 设使
2600 | 设若
2601 | 该
2602 | 说明
2603 | 诸位
2604 | 谁
2605 | 谁知
2606 | 赶
2607 | 起
2608 | 起来
2609 | 起见
2610 | 趁
2611 | 趁着
2612 | 越是
2613 | 跟
2614 | 转动
2615 | 转变
2616 | 转贴
2617 | 较
2618 | 较之
2619 | 边
2620 | 达到
2621 | 迅速
2622 | 过
2623 | 过去
2624 | 过来
2625 | 运用
2626 | 还是
2627 | 还有
2628 | 这
2629 | 这个
2630 | 这么
2631 | 这么些
2632 | 这么样
2633 | 这么点儿
2634 | 这些
2635 | 这会儿
2636 | 这儿
2637 | 这就是说
2638 | 这时
2639 | 这样
2640 | 这点
2641 | 这种
2642 | 这边
2643 | 这里
2644 | 这麽
2645 | 进入
2646 | 进步
2647 | 进而
2648 | 进行
2649 | 连
2650 | 连同
2651 | 适应
2652 | 适当
2653 | 适用
2654 | 逐步
2655 | 逐渐
2656 | 通常
2657 | 通过
2658 | 造成
2659 | 遇到
2660 | 遭到
2661 | 避免
2662 | 那
2663 | 那个
2664 | 那么
2665 | 那么些
2666 | 那么样
2667 | 那些
2668 | 那会儿
2669 | 那儿
2670 | 那时
2671 | 那样
2672 | 那边
2673 | 那里
2674 | 那麽
2675 | 部分
2676 | 鄙人
2677 | 采取
2678 | 里面
2679 | 重大
2680 | 重新
2681 | 重要
2682 | 鉴于
2683 | 问题
2684 | 防止
2685 | 阿
2686 | 附近
2687 | 限制
2688 | 除
2689 | 除了
2690 | 除此之外
2691 | 除非
2692 | 随
2693 | 随着
2694 | 随著
2695 | 集中
2696 | 需要
2697 | 非但
2698 | 非常
2699 | 非徒
2700 | 靠
2701 | 顺
2702 | 顺着
2703 | 首先
2704 | 高兴
2705 | 是不是
2706 | 说说
2707 | 
2708 | """.split())
2709 | 


--------------------------------------------------------------------------------
/spacy/lang/zh/syntax_iterators.py:
--------------------------------------------------------------------------------
 1 | # coding: utf8
 2 | from __future__ import unicode_literals
 3 | 
 4 | from ...symbols import NOUN, PROPN, PRON
 5 | 
 6 | 
 7 | def noun_chunks(obj):
 8 |     """
 9 |     Detect base noun phrases from a dependency parse. Works on both Doc and Span.
10 |     """
11 |     labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos',
12 |               'attr', 'ROOT']
13 |     doc = obj.doc # Ensure works on both Doc and Span.
14 |     np_deps = [doc.vocab.strings.add(label) for label in labels]
15 |     conj = doc.vocab.strings.add('conj')
16 |     np_label = doc.vocab.strings.add('NP')
17 |     seen = set()
18 |     for i, word in enumerate(obj):
19 |         if word.pos not in (NOUN, PROPN, PRON):
20 |             continue
21 |         # Prevent nested chunks from being produced
22 |         if word.i in seen:
23 |             continue
24 |         if word.dep in np_deps:
25 |             if any(w.i in seen for w in word.subtree):
26 |                 continue
27 |             seen.update(j for j in range(word.left_edge.i, word.i+1))
28 |             yield word.left_edge.i, word.i+1, np_label
29 |         elif word.dep == conj:
30 |             head = word.head
31 |             while head.dep == conj and head.head.i < head.i:
32 |                 head = head.head
33 |             # If the head is an NP, and we're coordinated to it, we're an NP
34 |             if head.dep in np_deps:
35 |                 if any(w.i in seen for w in word.subtree):
36 |                     continue
37 |                 seen.update(j for j in range(word.left_edge.i, word.i+1))
38 |                 yield word.left_edge.i, word.i+1, np_label
39 | 
40 | 
41 | SYNTAX_ITERATORS = {
42 |     'noun_chunks': noun_chunks
43 | }
44 | 


--------------------------------------------------------------------------------
/spacy/lang/zh/tag_map.py:
--------------------------------------------------------------------------------
 1 | # coding: utf8
 2 | from __future__ import unicode_literals
 3 | 
 4 | from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
 5 | from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
 6 | 
 7 | 
 8 | TAG_MAP = {
 9 |     ".":        {POS: PUNCT, "PunctType": "peri"},
10 |     ",":        {POS: PUNCT, "PunctType": "comm"},
11 |     "-LRB-":    {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
12 |     "-RRB-":    {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
13 |     "``":       {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
14 |     "\"\"":     {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
15 |     "''":       {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
16 |     ":":        {POS: PUNCT},
17 |     "$":        {POS: SYM, "Other": {"SymType": "currency"}},
18 |     "#":        {POS: SYM, "Other": {"SymType": "numbersign"}},
19 |     "AFX":      {POS: ADJ,  "Hyph": "yes"},
20 |     "CC":       {POS: CCONJ, "ConjType": "coor"},
21 |     "CD":       {POS: NUM, "NumType": "card"},
22 |     "DT":       {POS: DET},
23 |     "EX":       {POS: ADV, "AdvType": "ex"},
24 |     "FW":       {POS: X, "Foreign": "yes"},
25 |     "HYPH":     {POS: PUNCT, "PunctType": "dash"},
26 |     "IN":       {POS: ADP},
27 |     "JJ":       {POS: ADJ, "Degree": "pos"},
28 |     "JJR":      {POS: ADJ, "Degree": "comp"},
29 |     "JJS":      {POS: ADJ, "Degree": "sup"},
30 |     "LS":       {POS: PUNCT, "NumType": "ord"},
31 |     "MD":       {POS: VERB, "VerbType": "mod"},
32 |     "NIL":      {POS: ""},
33 |     "NN":       {POS: NOUN, "Number": "sing"},
34 |     "NNP":      {POS: PROPN, "NounType": "prop", "Number": "sing"},
35 |     "NNPS":     {POS: PROPN, "NounType": "prop", "Number": "plur"},
36 |     "NNS":      {POS: NOUN, "Number": "plur"},
37 |     "PDT":      {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
38 |     "POS":      {POS: PART, "Poss": "yes"},
39 |     "PRP":      {POS: PRON, "PronType": "prs"},
40 |     "PRP$":     {POS: ADJ, "PronType": "prs", "Poss": "yes"},
41 |     "RB":       {POS: ADV, "Degree": "pos"},
42 |     "RBR":      {POS: ADV, "Degree": "comp"},
43 |     "RBS":      {POS: ADV, "Degree": "sup"},
44 |     "RP":       {POS: PART},
45 |     "SP":       {POS: SPACE},
46 |     "SYM":      {POS: SYM},
47 |     "TO":       {POS: PART, "PartType": "inf", "VerbForm": "inf"},
48 |     "UH":       {POS: INTJ},
49 |     "VB":       {POS: VERB, "VerbForm": "inf"},
50 |     "VBD":      {POS: VERB, "VerbForm": "fin", "Tense": "past"},
51 |     "VBG":      {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
52 |     "VBN":      {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
53 |     "VBP":      {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
54 |     "VBZ":      {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
55 |     "WDT":      {POS: ADJ, "PronType": "int|rel"},
56 |     "WP":       {POS: NOUN, "PronType": "int|rel"},
57 |     "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
58 |     "WRB":      {POS: ADV, "PronType": "int|rel"},
59 |     "ADD":      {POS: X},
60 |     "NFP":      {POS: PUNCT},
61 |     "GW":       {POS: X},
62 |     "XX":       {POS: X},
63 |     "BES":      {POS: VERB},
64 |     "HVS":      {POS: VERB},
65 |     "_SP":       {POS: SPACE},
66 | }
67 | 


--------------------------------------------------------------------------------
/train_intent_parser_cn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | """Using the parser to recognise your own semantics
  4 | 
  5 | spaCy's parser component can be used to trained to predict any type of tree
  6 | structure over your input text. You can also predict trees over whole documents
  7 | or chat logs, with connections between the sentence-roots used to annotate
  8 | discourse structure. In this example, we'll build a message parser for a common
  9 | "chat intent": finding local businesses. Our message semantics will have the
 10 | following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 11 | 
 12 | "show me the best hotel in berlin"
 13 | ('show', 'ROOT', 'show')
 14 | ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
 15 | ('hotel', 'PLACE', 'show') --> show PLACE hotel
 16 | ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
 17 | 
 18 | Compatible with: spaCy v2.0.0+
 19 | """
 20 | from __future__ import unicode_literals, print_function
 21 | 
 22 | import plac
 23 | import random
 24 | import spacy
 25 | from pathlib import Path
 26 | 
 27 | 
 28 | # training data: texts, heads and dependency labels
 29 | # for no relation, we simply chose an arbitrary dependency label, e.g. '-'
 30 | TRAIN_DATA = [
 31 |     ("找无线质量好的咖啡厅", {
 32 |         'heads': [0, 5, 1, 2, 5, 0, 333, 333, 333, 333],  # index of token head
 33 |         'deps': ['ROOT', 'ATTRIBUTE', 'ATTRIBUTE', 'QUALITY', '-', 'PLACE', '-', '-', '-', '-']
 34 |     }),
 35 |     ("找一个靠近海边的酒店", {
 36 |         'heads': [0, 5, 3, 5, 5, 0, 333, 333, 333, 333],
 37 |         'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', '-', 'PLACE', '-', '-', '-', '-']
 38 |     }),
 39 |     ("给我找一个最近的关门晚的健身房", {
 40 |         'heads': [2, 2, 2, 9, 9, 9, 7, 9, 9, 2, 333, 333, 333, 333, 333],
 41 |         'deps': ['-', '-', 'ROOT', '-', 'QUALITY', '-', 'TIME', 'ATTRIBUTE', '-', 'PLACE', '-', '-', '-', '-', '-']
 42 |     }),
 43 |     ("告诉我最便宜的卖花的商店", {
 44 |         'heads': [0, 0, 3, 7, 7, 7, 7, 0, 333, 333, 333, 333],  # attach "flowers" to store!
 45 |         'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', '-', 'PRODUCT', '-', 'PLACE', '-', '-', '-', '-']
 46 |     }),
 47 |     ("找一个在伦敦的好餐厅", {
 48 |         'heads': [0, 6, 3, 6, 6, 6, 0, 333, 333, 333],
 49 |         'deps': ['ROOT', '-', '-', 'LOCATION', '-', 'QUALITY', 'PLACE', '-', '-', '-']
 50 |     }),
 51 |     ("告诉我在柏林最酷的旅社", {
 52 |         'heads': [0, 0, 3, 6, 6, 6, 0, 333, 333, 333, 333],
 53 |         'deps': ['ROOT', '-', '-', 'LOCATION', 'QUALITY', '-', 'PLACE', '-', '-', '-', '-']
 54 |     }),
 55 |     ("找一个上班近的好的意大利餐厅", {
 56 |         'heads': [0, 8, 3, 8, 8, 8, 8, 8, 0, 333, 333, 333, 333, 333],
 57 |         'deps': ['ROOT', '-', 'LOCATION', 'ATTRIBUTE', '-', 'QUALITY', '-', 'ATTRIBUTE', 'PLACE', '-', '-', '-', '-', '-']
 58 |     })
 59 | ]
 60 | 
 61 | 
 62 | @plac.annotations(
 63 |     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
 64 |     output_dir=("Optional output directory", "option", "o", Path),
 65 |     n_iter=("Number of training iterations", "option", "n", int))
 66 | def main(model=None, output_dir=None, n_iter=5):
 67 |     """Load the model, set up the pipeline and train the parser."""
 68 |     if model is not None:
 69 |         nlp = spacy.load(model)  # load existing spaCy model
 70 |         print("Loaded model '%s'" % model)
 71 |     else:
 72 |         nlp = spacy.blank('en')  # create blank Language class
 73 |         print("Created blank 'en' model")
 74 | 
 75 |     # We'll use the built-in dependency parser class, but we want to create a
 76 |     # fresh instance – just in case.
 77 |     if 'parser' in nlp.pipe_names:
 78 |         nlp.remove_pipe('parser')
 79 |     parser = nlp.create_pipe('parser')
 80 |     nlp.add_pipe(parser, first=True)
 81 | 
 82 |     for text, annotations in TRAIN_DATA:
 83 |         for dep in annotations.get('deps', []):
 84 |             parser.add_label(dep)
 85 | 
 86 |     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
 87 |     with nlp.disable_pipes(*other_pipes):  # only train parser
 88 |         optimizer = nlp.begin_training()
 89 |         for itn in range(n_iter):
 90 |             random.shuffle(TRAIN_DATA)
 91 |             losses = {}
 92 |             for text, annotations in TRAIN_DATA:
 93 |                 nlp.update([text], [annotations], sgd=optimizer, losses=losses)
 94 |             print(losses)
 95 | 
 96 |     # test the trained model
 97 |     test_model(nlp)
 98 | 
 99 |     # save model to output directory
100 |     if output_dir is not None:
101 |         output_dir = Path(output_dir)
102 |         if not output_dir.exists():
103 |             output_dir.mkdir()
104 |         nlp.to_disk(output_dir)
105 |         print("Saved model to", output_dir)
106 | 
107 |         # test the saved model
108 |         print("Loading from", output_dir)
109 |         nlp2 = spacy.load(output_dir)
110 |         test_model(nlp2)
111 | 
112 | 
113 | def test_model(nlp):
114 |     texts = ["找一个上班近的好的意大利餐厅"]
115 |     docs = nlp.pipe(texts)
116 |     for doc in docs:
117 |         print(doc.text)
118 |         print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
119 | 
120 | 
121 | if __name__ == '__main__':
122 |     plac.call(main)
123 | 
124 |     # Expected output:
125 |     # find a hotel with good wifi
126 |     # [
127 |     #   ('find', 'ROOT', 'find'),
128 |     #   ('hotel', 'PLACE', 'find'),
129 |     #   ('good', 'QUALITY', 'wifi'),
130 |     #   ('wifi', 'ATTRIBUTE', 'hotel')
131 |     # ]
132 |     # find me the cheapest gym near work
133 |     # [
134 |     #   ('find', 'ROOT', 'find'),
135 |     #   ('cheapest', 'QUALITY', 'gym'),
136 |     #   ('gym', 'PLACE', 'find')
137 |     #   ('work', 'LOCATION', 'near')
138 |     # ]
139 |     # show me the best hotel in berlin
140 |     # [
141 |     #   ('show', 'ROOT', 'show'),
142 |     #   ('best', 'QUALITY', 'hotel'),
143 |     #   ('hotel', 'PLACE', 'show'),
144 |     #   ('berlin', 'LOCATION', 'hotel')
145 |     # ]
146 | 


--------------------------------------------------------------------------------
/train_ner_cn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf8
  3 | """Example of training spaCy's named entity recognizer, starting off with an
  4 | existing model or a blank model.
  5 | 
  6 | For more details, see the documentation:
  7 | * Training: https://spacy.io/usage/training
  8 | * NER: https://spacy.io/usage/linguistic-features#named-entities
  9 | 
 10 | Compatible with: spaCy v2.0.0+
 11 | """
 12 | from __future__ import unicode_literals, print_function
 13 | 
 14 | import plac
 15 | import random
 16 | from pathlib import Path
 17 | import spacy
 18 | 
 19 | 
 20 | # training data
 21 | TRAIN_DATA = [
 22 |     ('到底谁是张三?', {
 23 |         'entities': [(4, 6, 'PERSON')]
 24 |     }),
 25 |     ('我非常喜欢伦敦和柏林.', {
 26 |         'entities': [(5, 7, 'LOC'), (8, 10, 'LOC')]
 27 |     })
 28 | ]
 29 | 
 30 | 
 31 | @plac.annotations(
 32 |     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
 33 |     output_dir=("Optional output directory", "option", "o", Path),
 34 |     n_iter=("Number of training iterations", "option", "n", int))
 35 | def main(model=None, output_dir=None, n_iter=100):
 36 |     """Load the model, set up the pipeline and train the entity recognizer."""
 37 |     if model is not None:
 38 |         nlp = spacy.load(model)  # load existing spaCy model
 39 |         print("Loaded model '%s'" % model)
 40 |     else:
 41 |         nlp = spacy.blank('en')  # create blank Language class
 42 |         print("Created blank 'en' model")
 43 | 
 44 |     # create the built-in pipeline components and add them to the pipeline
 45 |     # nlp.create_pipe works for built-ins that are registered with spaCy
 46 |     if 'ner' not in nlp.pipe_names:
 47 |         ner = nlp.create_pipe('ner')
 48 |         nlp.add_pipe(ner, last=True)
 49 |     # otherwise, get it so we can add labels
 50 |     else:
 51 |         ner = nlp.get_pipe('ner')
 52 | 
 53 |     # add labels
 54 |     for _, annotations in TRAIN_DATA:
 55 |         for ent in annotations.get('entities'):
 56 |             ner.add_label(ent[2])
 57 | 
 58 |     # get names of other pipes to disable them during training
 59 |     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
 60 |     with nlp.disable_pipes(*other_pipes):  # only train NER
 61 |         optimizer = nlp.begin_training()
 62 |         for itn in range(n_iter):
 63 |             random.shuffle(TRAIN_DATA)
 64 |             losses = {}
 65 |             for text, annotations in TRAIN_DATA:
 66 |                 nlp.update(
 67 |                     [text],  # batch of texts
 68 |                     [annotations],  # batch of annotations
 69 |                     drop=0.5,  # dropout - make it harder to memorise data
 70 |                     sgd=optimizer,  # callable to update weights
 71 |                     losses=losses)
 72 |             print(losses)
 73 | 
 74 |     # test the trained model
 75 |     for text, _ in TRAIN_DATA:
 76 |         doc = nlp(text)
 77 |         print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
 78 |         print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 79 | 
 80 |     # save model to output directory
 81 |     if output_dir is not None:
 82 |         output_dir = Path(output_dir)
 83 |         if not output_dir.exists():
 84 |             output_dir.mkdir()
 85 |         nlp.to_disk(output_dir)
 86 |         print("Saved model to", output_dir)
 87 | 
 88 |         # test the saved model
 89 |         print("Loading from", output_dir)
 90 |         nlp2 = spacy.load(output_dir)
 91 |         for text, _ in TRAIN_DATA:
 92 |             doc = nlp2(text)
 93 |             print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
 94 |             print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
 95 | 
 96 | 
 97 | if __name__ == '__main__':
 98 |     plac.call(main)
 99 | 
100 |     # Expected output:
101 |     # Entities [('Shaka Khan', 'PERSON')]
102 |     # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
103 |     # ('Khan', 'PERSON', 1), ('?', '', 2)]
104 |     # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
105 |     # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
106 |     # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]
107 | 


--------------------------------------------------------------------------------
/train_new_entity_type_cn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf8
  3 | """Example of training an additional entity type
  4 | 
  5 | This script shows how to add a new entity type to an existing pre-trained NER
  6 | model. To keep the example short and simple, only four sentences are provided
  7 | as examples. In practice, you'll need many more — a few hundred would be a
  8 | good start. You will also likely need to mix in examples of other entity
  9 | types, which might be obtained by running the entity recognizer over unlabelled
 10 | sentences, and adding their annotations to the training set.
 11 | 
 12 | The actual training is performed by looping over the examples, and calling
 13 | `nlp.entity.update()`. The `update()` method steps through the words of the
 14 | input. At each word, it makes a prediction. It then consults the annotations
 15 | provided on the GoldParse instance, to see whether it was right. If it was
 16 | wrong, it adjusts its weights so that the correct action will score higher
 17 | next time.
 18 | 
 19 | After training your model, you can save it to a directory. We recommend
 20 | wrapping models as Python packages, for ease of deployment.
 21 | 
 22 | For more details, see the documentation:
 23 | * Training: https://spacy.io/usage/training
 24 | * NER: https://spacy.io/usage/linguistic-features#named-entities
 25 | 
 26 | Compatible with: spaCy v2.0.0+
 27 | """
 28 | from __future__ import unicode_literals, print_function
 29 | 
 30 | import plac
 31 | import random
 32 | from pathlib import Path
 33 | import spacy
 34 | 
 35 | 
 36 | # new entity label
 37 | LABEL = 'ANIMAL'
 38 | 
 39 | # training data
 40 | # Note: If you're using an existing model, make sure to mix in examples of
 41 | # other entity types that spaCy correctly recognized before. Otherwise, your
 42 | # model might learn the new type, but "forget" what it previously knew.
 43 | # https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
 44 | TRAIN_DATA = [
 45 |     ("马是又高大又善良的动物，是人类的好伙伴。", {
 46 |         'entities': [(0, 1, 'ANIMAL')]
 47 |     }),
 48 | 
 49 |     ("它们咬人不?", {
 50 |         'entities': []
 51 |     }),
 52 | 
 53 |     ("很多人都很喜爱马。", {
 54 |         'entities': [(7, 8, 'ANIMAL')]
 55 |     }),
 56 | 
 57 |     ("人善人欺，马善人骑。", {
 58 |         'entities': [(5, 6, 'ANIMAL')]
 59 |     }),
 60 | 
 61 |     ("蒙古有一种马的品种，个子很矮。", {
 62 |         'entities': [(5, 6, 'ANIMAL')]
 63 |     }),
 64 | 
 65 |     ("马?", {
 66 |         'entities': [(0, 1, 'ANIMAL')]
 67 |     })
 68 | ]
 69 | 
 70 | 
 71 | @plac.annotations(
 72 |     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
 73 |     new_model_name=("New model name for model meta.", "option", "nm", str),
 74 |     output_dir=("Optional output directory", "option", "o", Path),
 75 |     n_iter=("Number of training iterations", "option", "n", int))
 76 | def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
 77 |     """Set up the pipeline and entity recognizer, and train the new entity."""
 78 |     if model is not None:
 79 |         nlp = spacy.load(model)  # load existing spaCy model
 80 |         print("Loaded model '%s'" % model)
 81 |     else:
 82 |         nlp = spacy.blank('en')  # create blank Language class
 83 |         print("Created blank 'en' model")
 84 |     # Add entity recognizer to model if it's not in the pipeline
 85 |     # nlp.create_pipe works for built-ins that are registered with spaCy
 86 |     if 'ner' not in nlp.pipe_names:
 87 |         ner = nlp.create_pipe('ner')
 88 |         nlp.add_pipe(ner)
 89 |     # otherwise, get it, so we can add labels to it
 90 |     else:
 91 |         ner = nlp.get_pipe('ner')
 92 | 
 93 |     ner.add_label(LABEL)   # add new entity label to entity recognizer
 94 |     if model is None:
 95 |         optimizer = nlp.begin_training()
 96 |     else:
 97 |         # Note that 'begin_training' initializes the models, so it'll zero out
 98 |         # existing entity types.
 99 |         optimizer = nlp.entity.create_optimizer()
100 | 
101 | 
102 | 
103 |     # get names of other pipes to disable them during training
104 |     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
105 |     with nlp.disable_pipes(*other_pipes):  # only train NER
106 |         for itn in range(n_iter):
107 |             random.shuffle(TRAIN_DATA)
108 |             losses = {}
109 |             for text, annotations in TRAIN_DATA:
110 |                 nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
111 |                            losses=losses)
112 |             print(losses)
113 | 
114 |     # test the trained model
115 |     test_text = '您喜欢马吗？'
116 |     doc = nlp(test_text)
117 |     print("Entities in '%s'" % test_text)
118 |     for ent in doc.ents:
119 |         print(ent.label_, ent.text)
120 | 
121 |     # save model to output directory
122 |     if output_dir is not None:
123 |         output_dir = Path(output_dir)
124 |         if not output_dir.exists():
125 |             output_dir.mkdir()
126 |         nlp.meta['name'] = new_model_name  # rename model
127 |         nlp.to_disk(output_dir)
128 |         print("Saved model to", output_dir)
129 | 
130 |         # test the saved model
131 |         print("Loading from", output_dir)
132 |         nlp2 = spacy.load(output_dir)
133 |         doc2 = nlp2(test_text)
134 |         for ent in doc2.ents:
135 |             print(ent.label_, ent.text)
136 | 
137 | 
138 | if __name__ == '__main__':
139 |     plac.call(main)
140 | 


--------------------------------------------------------------------------------
/train_parser_cn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf8
  3 | """Example of training spaCy dependency parser, starting off with an existing
  4 | model or a blank model. For more details, see the documentation:
  5 | * Training: https://spacy.io/usage/training
  6 | * Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse
  7 | 
  8 | Compatible with: spaCy v2.0.0+
  9 | """
 10 | from __future__ import unicode_literals, print_function
 11 | 
 12 | import plac
 13 | import random
 14 | from pathlib import Path
 15 | import spacy
 16 | 
 17 | 
 18 | # training data
 19 | TRAIN_DATA = [
 20 |     ("他们进行抵押贷款交易。", {
 21 |         'heads': [1, 1, 3, 4, 1, 1, 333, 333, 333, 333, 333],
 22 |         'deps': ['nsubj', 'ROOT', 'compound', 'nmod', 'dobj', 'punct', 'depdep', 'dep', 'dep', 'dep', 'dep']
 23 |     }),
 24 |     ("我喜欢伦敦和柏林。", {
 25 |         'heads': [1, 1, 1, 2, 2, 1, 333, 333, 333],
 26 |         'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct', 'dep', 'dep', 'dep']
 27 |     }),
 28 |     ("你在找些什么？", {
 29 |         'heads': [2, 2, 2, 2, 2, 333, 333],
 30 |         'deps': ['nsubj', 'advmod', 'ROOT', 'obj', 'punct', 'dep', 'dep']
 31 |     }),
 32 |     ("我喜欢北京的秋天。", {
 33 |         'heads': [1, 1, 3, 4, 1, 1, 333, 333, 333],
 34 |         'deps': ['nsubj', 'ROOT', 'nmod', 'case', 'dobj', 'punct', 'dep', 'dep', 'dep']
 35 |     })
 36 | ]
 37 | 
 38 | @plac.annotations(
 39 |     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
 40 |     output_dir=("Optional output directory", "option", "o", Path),
 41 |     n_iter=("Number of training iterations", "option", "n", int))
 42 | def main(model=None, output_dir=None, n_iter=10):
 43 |     """Load the model, set up the pipeline and train the parser."""
 44 |     if model is not None:
 45 |         nlp = spacy.load(model)  # load existing spaCy model
 46 |         print("Loaded model '%s'" % model)
 47 |     else:
 48 |         nlp = spacy.blank('en')  # create blank Language class
 49 |         print("Created blank 'en' model")
 50 | 
 51 |     # add the parser to the pipeline if it doesn't exist
 52 |     # nlp.create_pipe works for built-ins that are registered with spaCy
 53 |     if 'parser' not in nlp.pipe_names:
 54 |         parser = nlp.create_pipe('parser')
 55 |         nlp.add_pipe(parser, first=True)
 56 |     # otherwise, get it, so we can add labels to it
 57 |     else:
 58 |         parser = nlp.get_pipe('parser')
 59 | 
 60 |     # add labels to the parser
 61 |     for _, annotations in TRAIN_DATA:
 62 |         for dep in annotations.get('deps', []):
 63 |             parser.add_label(dep)
 64 | 
 65 |     # get names of other pipes to disable them during training
 66 |     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
 67 |     with nlp.disable_pipes(*other_pipes):  # only train parser
 68 |         optimizer = nlp.begin_training()
 69 |         for itn in range(n_iter):
 70 |             random.shuffle(TRAIN_DATA)
 71 |             losses = {}
 72 |             for text, annotations in TRAIN_DATA:
 73 |                 nlp.update([text], [annotations], sgd=optimizer, losses=losses)
 74 |             print(losses)
 75 | 
 76 |     # test the trained model
 77 |     test_text = "我喜欢北京的秋天。"
 78 |     doc = nlp(test_text)
 79 |     print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
 80 | 
 81 |     # save model to output directory
 82 |     if output_dir is not None:
 83 |         output_dir = Path(output_dir)
 84 |         if not output_dir.exists():
 85 |             output_dir.mkdir()
 86 |         nlp.to_disk(output_dir)
 87 |         print("Saved model to", output_dir)
 88 | 
 89 |         # test the saved model
 90 |         print("Loading from", output_dir)
 91 |         nlp2 = spacy.load(output_dir)
 92 |         doc = nlp2(test_text)
 93 |         print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
 94 | 
 95 | 
 96 | if __name__ == '__main__':
 97 |     plac.call(main)
 98 | 
 99 |     # expected result:
100 |     # [
101 |     #   ('I', 'nsubj', 'like'),
102 |     #   ('like', 'ROOT', 'like'),
103 |     #   ('securities', 'dobj', 'like'),
104 |     #   ('.', 'punct', 'like')
105 |     # ]
106 | 


--------------------------------------------------------------------------------
/train_tagger_cn.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf8
  3 | """
  4 | A simple example for training a part-of-speech tagger with a custom tag map.
  5 | To allow us to update the tag map with our custom one, this example starts off
  6 | with a blank Language class and modifies its defaults. For more details, see
  7 | the documentation:
  8 | * Training: https://spacy.io/usage/training
  9 | * POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging
 10 | 
 11 | Compatible with: spaCy v2.0.0+
 12 | """
 13 | from __future__ import unicode_literals, print_function
 14 | 
 15 | import plac
 16 | import random
 17 | from pathlib import Path
 18 | import spacy
 19 | 
 20 | 
 21 | # You need to define a mapping from your data's part-of-speech tag names to the
 22 | # Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
 23 | # See here for the Universal Tag Set:
 24 | # http://universaldependencies.github.io/docs/u/pos/index.html
 25 | # You may also specify morphological features for your tags, from the universal
 26 | # scheme.
 27 | TAG_MAP = {
 28 |     'N': {'pos': 'NOUN'},
 29 |     'V': {'pos': 'VERB'},
 30 |     'J': {'pos': 'ADJ'}
 31 | }
 32 | 
 33 | # Usually you'll read this in, of course. Data formats vary. Ensure your
 34 | # strings are unicode and that the number of tags assigned matches spaCy's
 35 | # tokenization. If not, you can always add a 'words' key to the annotations
 36 | # that specifies the gold-standard tokenization, e.g.:
 37 | # ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']})
 38 | TRAIN_DATA = [
 39 |     ("我喜欢红苹果", {'tags': ['N', 'V', 'J', 'N']}),
 40 |     ("吃蓝色汉堡", {'tags': ['V', 'J', 'N']})
 41 | ]
 42 | 
 43 | 
 44 | @plac.annotations(
 45 |     lang=("ISO Code of language to use", "option", "l", str),
 46 |     output_dir=("Optional output directory", "option", "o", Path),
 47 |     n_iter=("Number of training iterations", "option", "n", int))
 48 | def main(lang='en', output_dir=None, n_iter=25):
 49 |     """Create a new model, set up the pipeline and train the tagger. In order to
 50 |     train the tagger with a custom tag map, we're creating a new Language
 51 |     instance with a custom vocab.
 52 |     """
 53 |     nlp = spacy.blank(lang)
 54 |     # add the tagger to the pipeline
 55 |     # nlp.create_pipe works for built-ins that are registered with spaCy
 56 |     tagger = nlp.create_pipe('tagger')
 57 |     # Add the tags. This needs to be done before you start training.
 58 |     for tag, values in TAG_MAP.items():
 59 |         tagger.add_label(tag, values)
 60 |     nlp.add_pipe(tagger)
 61 | 
 62 |     optimizer = nlp.begin_training()
 63 |     for i in range(n_iter):
 64 |         random.shuffle(TRAIN_DATA)
 65 |         losses = {}
 66 |         for text, annotations in TRAIN_DATA:
 67 |             nlp.update([text], [annotations], sgd=optimizer, losses=losses)
 68 |         print(losses)
 69 | 
 70 |     # test the trained model
 71 |     test_text = "我喜欢黑色衬衫"
 72 |     doc = nlp(test_text)
 73 |     print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
 74 | 
 75 |     # save model to output directory
 76 |     if output_dir is not None:
 77 |         output_dir = Path(output_dir)
 78 |         if not output_dir.exists():
 79 |             output_dir.mkdir()
 80 |         nlp.to_disk(output_dir)
 81 |         print("Saved model to", output_dir)
 82 | 
 83 |         # test the save model
 84 |         print("Loading from", output_dir)
 85 |         nlp2 = spacy.load(output_dir)
 86 |         doc = nlp2(test_text)
 87 |         print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
 88 | 
 89 | 
 90 | if __name__ == '__main__':
 91 |     plac.call(main)
 92 | 
 93 |     # Expected output:
 94 |     # [
 95 |     #   ('I', 'N', 'NOUN'),
 96 |     #   ('like', 'V', 'VERB'),
 97 |     #   ('blue', 'J', 'ADJ'),
 98 |     #   ('eggs', 'N', 'NOUN')
 99 |     # ]
100 | 


--------------------------------------------------------------------------------
/vectors_fast_text.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding: utf8
 3 | """Load vectors for a language trained using fastText
 4 | https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
 5 | Compatible with: spaCy v2.0.0+
 6 | """
 7 | from __future__ import unicode_literals
 8 | import plac
 9 | import numpy
10 | 
11 | import spacy
12 | from spacy.language import Language
13 | 
14 | 
15 | @plac.annotations(
16 |     vectors_loc=("Path to .vec file", "positional", None, str),
17 |     lang=("Optional language ID. If not set, blank Language() will be used.",
18 |           "positional", None, str))
19 | def main(vectors_loc, lang=None):
20 |     if lang is None:
21 |         nlp = Language()
22 |     else:
23 |         # create empty language class – this is required if you're planning to
24 |         # save the model to disk and load it back later (models always need a
25 |         # "lang" setting). Use 'xx' for blank multi-language class.
26 |         nlp = spacy.blank(lang)
27 |     with open(vectors_loc, 'rb') as file_:
28 |         header = file_.readline()
29 |         nr_row, nr_dim = header.split()
30 |         print(nr_row, nr_dim)
31 | 
32 |         nlp.vocab.reset_vectors(width=int(nr_dim))
33 | 
34 |         for line in file_:
35 |             line = line.rstrip().decode('utf8')
36 |             pieces = line.rsplit(' ', int(nr_dim))
37 |             word = pieces[0]
38 |             vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
39 |             nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
40 | 
41 |             print(word)
42 |     # test the vectors and similarity
43 |     # text = '您好'
44 |     # doc = nlp(text)
45 |     # print(text, doc[0].similarity(doc[1]))
46 |     nlp.to_disk("./zh_model")
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     plac.call(main)


--------------------------------------------------------------------------------
/zh_model/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "lang":"zh",
 3 |   "name":"model",
 4 |   "version":"0.0.0",
 5 |   "spacy_version":">=2.0.10",
 6 |   "description":"",
 7 |   "author":"",
 8 |   "email":"",
 9 |   "url":"",
10 |   "license":"",
11 |   "vectors":{
12 |     "width":0,
13 |     "vectors":0,
14 |     "keys":0,
15 |     "name":"spacy_pretrained_vectors"
16 |   },
17 |   "pipeline":[
18 |     "parser",
19 |     "tagger"
20 |   ]
21 | }


--------------------------------------------------------------------------------
/zh_model/ner/cfg:
--------------------------------------------------------------------------------
 1 | {
 2 |   "beam_width":1,
 3 |   "beam_density":0.0,
 4 |   "cnn_maxout_pieces":3,
 5 |   "extra_labels":[
 6 |     "PERSON",
 7 |     "PERSON",
 8 |     "PERSON",
 9 |     "PERSON",
10 |     "LOC",
11 |     "LOC",
12 |     "LOC",
13 |     "LOC",
14 |     "ANIMAL",
15 |     "ANIMAL",
16 |     "ANIMAL",
17 |     "ANIMAL"
18 |   ],
19 |   "nr_class":9,
20 |   "hidden_depth":1,
21 |   "token_vector_width":128,
22 |   "hidden_width":200,
23 |   "maxout_pieces":2,
24 |   "pretrained_vectors":"spacy_pretrained_vectors",
25 |   "hist_size":0,
26 |   "hist_width":0
27 | }


--------------------------------------------------------------------------------
/zh_model/ner/lower_model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/lower_model


--------------------------------------------------------------------------------
/zh_model/ner/moves:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/moves


--------------------------------------------------------------------------------
/zh_model/ner/tok2vec_model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/tok2vec_model


--------------------------------------------------------------------------------
/zh_model/ner/upper_model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/upper_model


--------------------------------------------------------------------------------
/zh_model/parser/cfg:
--------------------------------------------------------------------------------
 1 | {
 2 |   "beam_width":1,
 3 |   "beam_density":0.0,
 4 |   "cnn_maxout_pieces":3,
 5 |   "extra_labels":[
 6 |     "ROOT",
 7 |     "ROOT",
 8 |     "ROOT",
 9 |     "ROOT",
10 |     "ROOT",
11 |     "ATTRIBUTE",
12 |     "ATTRIBUTE",
13 |     "ATTRIBUTE",
14 |     "ATTRIBUTE",
15 |     "ATTRIBUTE",
16 |     "QUALITY",
17 |     "QUALITY",
18 |     "QUALITY",
19 |     "QUALITY",
20 |     "QUALITY",
21 |     "-",
22 |     "-",
23 |     "-",
24 |     "-",
25 |     "-",
26 |     "PLACE",
27 |     "PLACE",
28 |     "PLACE",
29 |     "PLACE",
30 |     "PLACE",
31 |     "TIME",
32 |     "TIME",
33 |     "TIME",
34 |     "TIME",
35 |     "TIME",
36 |     "PRODUCT",
37 |     "PRODUCT",
38 |     "PRODUCT",
39 |     "PRODUCT",
40 |     "PRODUCT",
41 |     "LOCATION",
42 |     "LOCATION",
43 |     "LOCATION",
44 |     "LOCATION",
45 |     "LOCATION"
46 |   ],
47 |   "nr_class":42,
48 |   "hidden_depth":1,
49 |   "token_vector_width":128,
50 |   "hidden_width":200,
51 |   "maxout_pieces":2,
52 |   "pretrained_vectors":null,
53 |   "hist_size":0,
54 |   "hist_width":0
55 | }


--------------------------------------------------------------------------------
/zh_model/parser/lower_model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/lower_model


--------------------------------------------------------------------------------
/zh_model/parser/moves:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/moves


--------------------------------------------------------------------------------
/zh_model/parser/tok2vec_model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/tok2vec_model


--------------------------------------------------------------------------------
/zh_model/parser/upper_model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/upper_model


--------------------------------------------------------------------------------
/zh_model/tagger/cfg:
--------------------------------------------------------------------------------
1 | {
2 |   "cnn_maxout_pieces":2,
3 |   "pretrained_vectors":null
4 | }


--------------------------------------------------------------------------------
/zh_model/tagger/model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/tagger/model


--------------------------------------------------------------------------------
/zh_model/tagger/tag_map:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/tagger/tag_map


--------------------------------------------------------------------------------
/zh_model/tokenizer:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/tokenizer


--------------------------------------------------------------------------------
/zh_model/vocab/key2row:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/vocab/key2row


--------------------------------------------------------------------------------
/zh_model/vocab/lexemes.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/vocab/lexemes.bin


--------------------------------------------------------------------------------
/zh_model/vocab/strings.json:
--------------------------------------------------------------------------------
  1 | [
  2 |   "\"\"",
  3 |   "#",
  4 |   "$",
  5 |   "''",
  6 |   ",",
  7 |   "-LRB-",
  8 |   "-RRB-",
  9 |   ".",
 10 |   ":",
 11 |   "ADD",
 12 |   "AFX",
 13 |   "BES",
 14 |   "CC",
 15 |   "CD",
 16 |   "DT",
 17 |   "EX",
 18 |   "FW",
 19 |   "GW",
 20 |   "HVS",
 21 |   "HYPH",
 22 |   "IN",
 23 |   "JJ",
 24 |   "JJR",
 25 |   "JJS",
 26 |   "LS",
 27 |   "MD",
 28 |   "NFP",
 29 |   "NIL",
 30 |   "NN",
 31 |   "NNP",
 32 |   "NNPS",
 33 |   "NNS",
 34 |   "PDT",
 35 |   "PRP",
 36 |   "PRP$",
 37 |   "RB",
 38 |   "RBR",
 39 |   "RBS",
 40 |   "RP",
 41 |   "SP",
 42 |   "TO",
 43 |   "UH",
 44 |   "VB",
 45 |   "VBD",
 46 |   "VBG",
 47 |   "VBN",
 48 |   "VBP",
 49 |   "VBZ",
 50 |   "WDT",
 51 |   "WP",
 52 |   "WP$",
 53 |   "WRB",
 54 |   "XX",
 55 |   "_SP",
 56 |   "``",
 57 |   "-PRON-",
 58 |   "be",
 59 |   "\t",
 60 |   "zh",
 61 |   "\n",
 62 |   " ",
 63 |   "\")",
 64 |   "\"",
 65 |   "'",
 66 |   "(*_*)",
 67 |   "(",
 68 |   "_*)",
 69 |   "(-8",
 70 |   "(-d",
 71 |   "(-:",
 72 |   "(-;",
 73 |   "(-_-)",
 74 |   "_-)",
 75 |   "(._.)",
 76 |   "_.)",
 77 |   "(:",
 78 |   "(;",
 79 |   "(=",
 80 |   "(>_<)",
 81 |   "_<)",
 82 |   "(^_^)",
 83 |   "_^)",
 84 |   "(o:",
 85 |   "(x:",
 86 |   "(\u00ac_\u00ac)",
 87 |   "_\u00ac)",
 88 |   "(\u0ca0_\u0ca0)",
 89 |   "_\u0ca0)",
 90 |   "(x_x)",
 91 |   "(\u256f\u00b0\u25a1\u00b0\uff09\u256f\ufe35\u253b\u2501\u253b",
 92 |   "\u253b\u2501\u253b",
 93 |   ")-:",
 94 |   ")",
 95 |   "):",
 96 |   "-_-",
 97 |   "-",
 98 |   "-__-",
 99 |   "__-",
100 |   "._.",
101 |   "0.0",
102 |   "0",
103 |   "d.d",
104 |   "0.o",
105 |   "d.x",
106 |   "0_0",
107 |   "d_d",
108 |   "0_o",
109 |   "d_x",
110 |   "8)",
111 |   "8",
112 |   "d)",
113 |   "8-)",
114 |   "d-)",
115 |   "8-D",
116 |   "8-d",
117 |   "d-X",
118 |   "8D",
119 |   "8d",
120 |   "dX",
121 |   ":'(",
122 |   ":')",
123 |   ":'-(",
124 |   "'-(",
125 |   ":'-)",
126 |   "'-)",
127 |   ":(",
128 |   ":((",
129 |   ":(((",
130 |   "(((",
131 |   ":()",
132 |   ":)",
133 |   ":))",
134 |   ":)))",
135 |   ")))",
136 |   ":*",
137 |   ":-(",
138 |   ":-((",
139 |   "-((",
140 |   ":-(((",
141 |   ":-)",
142 |   ":-))",
143 |   "-))",
144 |   ":-)))",
145 |   ":-*",
146 |   ":-/",
147 |   ":-0",
148 |   ":-d",
149 |   ":-3",
150 |   ":->",
151 |   ":-D",
152 |   ":-X",
153 |   ":-O",
154 |   ":-o",
155 |   ":-P",
156 |   ":-p",
157 |   ":-x",
158 |   ":-]",
159 |   ":-|",
160 |   ":-}",
161 |   ":/",
162 |   ":0",
163 |   ":d",
164 |   ":1",
165 |   ":3",
166 |   ":>",
167 |   ":D",
168 |   ":X",
169 |   ":O",
170 |   ":o",
171 |   ":P",
172 |   ":p",
173 |   ":x",
174 |   ":]",
175 |   ":o)",
176 |   ":x)",
177 |   ":|",
178 |   ":}",
179 |   ":\u2019(",
180 |   ":\u2019)",
181 |   ":\u2019-(",
182 |   "\u2019-(",
183 |   ":\u2019-)",
184 |   "\u2019-)",
185 |   ";)",
186 |   ";",
187 |   ";-)",
188 |   ";-D",
189 |   ";-d",
190 |   ";-X",
191 |   ";D",
192 |   ";d",
193 |   ";X",
194 |   ";_;",
195 |   "<.<",
196 |   "<",
197 |   "</3",
198 |   "</d",
199 |   "<3",
200 |   "<d",
201 |   "<33",
202 |   "<dd",
203 |   "<333",
204 |   "333",
205 |   "<ddd",
206 |   "<space>",
207 |   "ce>",
208 |   "<xxxx>",
209 |   "=(",
210 |   "=",
211 |   "=)",
212 |   "=/",
213 |   "=3",
214 |   "=d",
215 |   "=D",
216 |   "=X",
217 |   "=|",
218 |   ">.<",
219 |   ">",
220 |   ">.>",
221 |   ">:(",
222 |   ">:o",
223 |   ">:x",
224 |   "><(((*>",
225 |   "(*>",
226 |   "@_@",
227 |   "@",
228 |   "C++",
229 |   "c++",
230 |   "C",
231 |   "X++",
232 |   "O.O",
233 |   "o.o",
234 |   "O",
235 |   "X.X",
236 |   "O.o",
237 |   "X.x",
238 |   "O_O",
239 |   "o_o",
240 |   "X_X",
241 |   "O_o",
242 |   "X_x",
243 |   "V.V",
244 |   "v.v",
245 |   "V",
246 |   "V_V",
247 |   "v_v",
248 |   "XD",
249 |   "xd",
250 |   "XDD",
251 |   "xdd",
252 |   "XXX",
253 |   "[-:",
254 |   "[",
255 |   "[:",
256 |   "\\\")",
257 |   "\\",
258 |   "\\n",
259 |   "\\x",
260 |   "\\t",
261 |   "^_^",
262 |   "^",
263 |   "^__^",
264 |   "__^",
265 |   "^___^",
266 |   "a.",
267 |   "a",
268 |   "x.",
269 |   "b.",
270 |   "b",
271 |   "c.",
272 |   "c",
273 |   "d.",
274 |   "d",
275 |   "e.",
276 |   "e",
277 |   "f.",
278 |   "f",
279 |   "g.",
280 |   "g",
281 |   "h.",
282 |   "h",
283 |   "i.",
284 |   "i",
285 |   "j.",
286 |   "j",
287 |   "k.",
288 |   "k",
289 |   "l.",
290 |   "l",
291 |   "m.",
292 |   "m",
293 |   "n.",
294 |   "n",
295 |   "o.",
296 |   "o",
297 |   "o.0",
298 |   "x.d",
299 |   "o.O",
300 |   "x.X",
301 |   "x.x",
302 |   "o_0",
303 |   "x_d",
304 |   "o_O",
305 |   "x_X",
306 |   "x_x",
307 |   "p.",
308 |   "p",
309 |   "q.",
310 |   "q",
311 |   "r.",
312 |   "r",
313 |   "s.",
314 |   "s",
315 |   "t.",
316 |   "t",
317 |   "u.",
318 |   "u",
319 |   "v.",
320 |   "v",
321 |   "w.",
322 |   "w",
323 |   "x",
324 |   "xD",
325 |   "xX",
326 |   "xDD",
327 |   "xXX",
328 |   "y.",
329 |   "y",
330 |   "z.",
331 |   "z",
332 |   "\u00a0",
333 |   "  ",
334 |   "\u00af\\(\u30c4)/\u00af",
335 |   "\u00af",
336 |   ")/\u00af",
337 |   "\u00af\\(x)/\u00af",
338 |   "\u00e4.",
339 |   "\u00e4",
340 |   "\u00f6.",
341 |   "\u00f6",
342 |   "\u00fc.",
343 |   "\u00fc",
344 |   "\u0ca0_\u0ca0",
345 |   "\u0ca0",
346 |   "\u0ca0\ufe35\u0ca0",
347 |   "x\ufe35x",
348 |   "\u2014",
349 |   "--",
350 |   "\u2019",
351 |   "\u2019\u2019",
352 |   "N",
353 |   "J",
354 |   "\u5403",
355 |   "\u84dd\u8272",
356 |   "\u84dd",
357 |   "xx",
358 |   "\u6c49\u5821",
359 |   "\u6c49",
360 |   "\u6211",
361 |   "\u559c\u6b22",
362 |   "\u559c",
363 |   "\u7ea2",
364 |   "\u82f9\u679c",
365 |   "\u82f9",
366 |   "\u9ed1\u8272",
367 |   "\u9ed1",
368 |   "\u886c\u886b",
369 |   "\u886c",
370 |   "ROOT",
371 |   "ATTRIBUTE",
372 |   "QUALITY",
373 |   "PLACE",
374 |   "\u627e",
375 |   "\u4e00\u4e2a",
376 |   "\u4e00",
377 |   "\u65e0\u7ebf",
378 |   "\u65e0",
379 |   "\u8d28\u91cf",
380 |   "\u8d28",
381 |   "\u597d",
382 |   "\u7684",
383 |   "\u5496\u5561\u5385",
384 |   "\u5496",
385 |   "xxx",
386 |   "find",
387 |   "ind",
388 |   "xxxx",
389 |   "hotel",
390 |   "tel",
391 |   "with",
392 |   "ith",
393 |   "good",
394 |   "ood",
395 |   "wifi",
396 |   "ifi",
397 |   "me",
398 |   "the",
399 |   "cheapest",
400 |   "est",
401 |   "gym",
402 |   "near",
403 |   "ear",
404 |   "work",
405 |   "ork",
406 |   "show",
407 |   "how",
408 |   "best",
409 |   "in",
410 |   "berlin",
411 |   "lin",
412 |   "\u5965\u7f8e\u62c9\u5511",
413 |   "\u5965",
414 |   "\u7f8e\u62c9\u5511",
415 |   "\u5bf9",
416 |   "\u53cd\u6d41\u6027",
417 |   "\u53cd",
418 |   "\u98df\u9053\u708e",
419 |   "\u98df",
420 |   "\u6cbb\u7597",
421 |   "\u6cbb",
422 |   "\u5177\u6709",
423 |   "\u5177",
424 |   "\u975e\u5e38\u660e\u663e",
425 |   "\u975e",
426 |   "\u5e38\u660e\u663e",
427 |   "\u6548\u679c",
428 |   "\u6548",
429 |   "\u6700\u597d",
430 |   "\u6700",
431 |   "\u4f9b\u5e94\u5546",
432 |   "\u4f9b",
433 |   "TARGET",
434 |   "2012",
435 |   "2",
436 |   "012",
437 |   "dddd",
438 |   "\u5e74",
439 |   "\u76ee\u6807",
440 |   "\u4eca\u5e74",
441 |   "\u4eca",
442 |   "\u51a0\u519b",
443 |   "\u51a0",
444 |   "\u6839",
445 |   "\u5c5e\u6027",
446 |   "\u6839-",
447 |   "-\u5c5e\u6027-",
448 |   "-\u8d28\u91cf-",
449 |   "-\u76ee\u6807-",
450 |   "\u9760\u8fd1",
451 |   "\u9760",
452 |   "\u6d77\u8fb9",
453 |   "\u6d77",
454 |   "\u9152\u5e97",
455 |   "\u9152",
456 |   "\u7ed9",
457 |   "\u6700\u8fd1",
458 |   "\u5173\u95e8",
459 |   "\u5173",
460 |   "\u665a",
461 |   "\u5065\u8eab\u623f",
462 |   "\u5065",
463 |   "\u544a\u8bc9",
464 |   "\u544a",
465 |   "\u4fbf\u5b9c",
466 |   "\u4fbf",
467 |   "\u5356\u82b1",
468 |   "\u5356",
469 |   "\u5546\u5e97",
470 |   "\u5546",
471 |   "LOCATION",
472 |   "\u5728",
473 |   "\u4f26\u6566",
474 |   "\u4f26",
475 |   "\u9910\u5385",
476 |   "\u9910",
477 |   "\u67cf\u6797",
478 |   "\u67cf",
479 |   "\u6700\u9177",
480 |   "\u65c5\u793e",
481 |   "\u65c5",
482 |   "\u4e0a\u73ed",
483 |   "\u4e0a",
484 |   "\u8fd1",
485 |   "\u610f\u5927\u5229",
486 |   "\u610f"
487 | ]


--------------------------------------------------------------------------------
/zh_model/vocab/vectors:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/vocab/vectors


--------------------------------------------------------------------------------