├── README.MD ├── spacy └── lang │ └── zh │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── lex_attrs.cpython-36.pyc │ ├── morph_rules.cpython-36.pyc │ ├── stop_words.cpython-36.pyc │ ├── syntax_iterators.cpython-36.pyc │ └── tag_map.cpython-36.pyc │ ├── examples.py │ ├── lex_attrs.py │ ├── morph_rules.py │ ├── stop_words.py │ ├── syntax_iterators.py │ └── tag_map.py ├── train_intent_parser_cn.py ├── train_ner_cn.py ├── train_new_entity_type_cn.py ├── train_parser_cn.py ├── train_tagger_cn.py ├── vectors_fast_text.py └── zh_model ├── meta.json ├── ner ├── cfg ├── lower_model ├── moves ├── tok2vec_model └── upper_model ├── parser ├── cfg ├── lower_model ├── moves ├── tok2vec_model └── upper_model ├── tagger ├── cfg ├── model └── tag_map ├── tokenizer └── vocab ├── key2row ├── lexemes.bin ├── strings.json └── vectors /README.MD: -------------------------------------------------------------------------------- 1 | Tests of spaCy's Chinese language support and analysis models. 2 | The Chinese vocab uses 300-dim vectors generated with fasttext from 800,000+ Sina News items (news.vec). 3 | All of the content here accompanies the article series at https://www.jianshu.com/p/9bfbdb5dc487 . 4 | -------------------------------------------------------------------------------- /spacy/lang/zh/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | from .tag_map import TAG_MAP 5 | from .stop_words import STOP_WORDS 6 | from .lex_attrs import LEX_ATTRS 7 | from .morph_rules import MORPH_RULES 8 | from .syntax_iterators import SYNTAX_ITERATORS 9 | 10 | from ..tokenizer_exceptions import BASE_EXCEPTIONS 11 | from ..norm_exceptions import BASE_NORMS 12 | from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES 13 | from ..char_classes import UNITS, CURRENCY, QUOTES, PUNCT, HYPHENS, ICONS, LIST_UNITS, LIST_CURRENCY, LIST_QUOTES, LIST_PUNCT, LIST_HYPHENS, LIST_ELLIPSES, LIST_ICONS 14 | 15 | from ...attrs import LANG, NORM 16 | from ...language import Language 17 | from ...tokens import Doc 18 | from ...util import update_exc, add_lookups 19 | 20 | 21 | class ChineseDefaults(Language.Defaults): 22 | lex_attr_getters = dict(Language.Defaults.lex_attr_getters) 23 | lex_attr_getters.update(LEX_ATTRS) 24 | lex_attr_getters[LANG] = lambda text: 'zh' # for pickling 25 | lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], 26 | BASE_NORMS) 27 | tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) 28 | 29 | use_jieba = True 30 | tag_map = TAG_MAP 31 | stop_words = STOP_WORDS 32 | morph_rules = MORPH_RULES 33 | syntax_iterators = SYNTAX_ITERATORS 34 | 35 | 36 | class Chinese(Language): 37 | lang = 'zh' 38 | Defaults = ChineseDefaults # override defaults 39 | 40 | def make_doc(self, text): 41 | if self.Defaults.use_jieba: 42 | try: 43 | import jieba 44 | except ImportError: 45 | msg = ("Jieba not installed.
Either set Chinese.use_jieba = False, " 46 | "or install it https://github.com/fxsjy/jieba") 47 | raise ImportError(msg) 48 | words = list(jieba.cut(text, cut_all=False)) 49 | words = [x for x in words if x] 50 | return Doc(self.vocab, words=words, spaces=[False]*len(words)) 51 | else: 52 | words = [] 53 | spaces = [] 54 | doc = self.tokenizer(text) 55 | for token in self.tokenizer(text): 56 | words.extend(list(token.text)) 57 | spaces.extend([False]*len(token.text)) 58 | spaces[-1] = bool(token.whitespace_) 59 | return Doc(self.vocab, words=words, spaces=spaces) 60 | 61 | 62 | __all__ = ['Chinese'] 63 | -------------------------------------------------------------------------------- /spacy/lang/zh/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /spacy/lang/zh/__pycache__/lex_attrs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/lex_attrs.cpython-36.pyc -------------------------------------------------------------------------------- /spacy/lang/zh/__pycache__/morph_rules.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/morph_rules.cpython-36.pyc -------------------------------------------------------------------------------- /spacy/lang/zh/__pycache__/stop_words.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/stop_words.cpython-36.pyc -------------------------------------------------------------------------------- /spacy/lang/zh/__pycache__/syntax_iterators.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/syntax_iterators.cpython-36.pyc -------------------------------------------------------------------------------- /spacy/lang/zh/__pycache__/tag_map.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/spacy/lang/zh/__pycache__/tag_map.cpython-36.pyc -------------------------------------------------------------------------------- /spacy/lang/zh/examples.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | 5 | """ 6 | Example sentences to test spaCy and its language models. 
7 | 8 | >>> from spacy.lang.zh.examples import sentences 9 | >>> docs = nlp.pipe(sentences) 10 | """ 11 | 12 | 13 | sentences = [ 14 | "苹果公司正考虑用一亿元买下英国的新创公司", 15 | "自动驾驶汽车将保险责任归属转移至制造商", 16 | "旧金山考虑禁止送货机器人在人行道上行驶", 17 | "伦敦是英国的大城市" 18 | ] 19 | -------------------------------------------------------------------------------- /spacy/lang/zh/lex_attrs.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | from ...attrs import LIKE_NUM 5 | 6 | 7 | _num_words = ['零', '一', '二', '三', '四', '五', '六', '七', 8 | '八', '九', '十', '十一', '十二', '十三', '十四', 9 | '十五', '十六', '十七', '十八', '十九', '二十', 10 | '三十', '四十', '五十', '六十', '七十', '八十', '九十', 11 | '百', '千', '百万', '十亿', '万亿', '百兆', 12 | 'gajillion', 'bazillion'] 13 | 14 | 15 | def like_num(text): 16 | text = text.replace(',', '').replace('.', '') 17 | if text.isdigit(): 18 | return True 19 | if text.count('/') == 1: 20 | num, denom = text.split('/') 21 | if num.isdigit() and denom.isdigit(): 22 | return True 23 | if text.lower() in _num_words: 24 | return True 25 | return False 26 | 27 | 28 | LEX_ATTRS = { 29 | LIKE_NUM: like_num 30 | } 31 | -------------------------------------------------------------------------------- /spacy/lang/zh/morph_rules.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | from ...symbols import LEMMA, PRON_LEMMA 5 | 6 | 7 | MORPH_RULES = { 8 | "PRP": { 9 | "我": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing"}, 10 | "你": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two"}, 11 | "他": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc"}, 12 | "她": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem"}, 13 | "它": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, 14 | "我们": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur"}, 15 | "他们": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Gender": "Masc"}, 16 | "她们": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Gender": "Fem"}, 17 | 18 | "我的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Reflex": "Yes"}, 19 | "他的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Poss": "Yes", "Reflex": "Yes"}, 20 | "她的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"}, 21 | "它的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut", "Poss": "Yes", "Reflex": "Yes"}, 22 | "我们的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, 23 | "你们的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, 24 | "他们的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Poss": "Yes", "Reflex": "Yes"}, 25 | "她们的": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Gender": "Fem", "Poss": "Yes", "Reflex": "Yes"}, 26 | 27 | "我自己": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"}, 28 | "你自己": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Case": "Acc", "Reflex": "Yes"}, 
29 | "他自己": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Masc", "Reflex": "Yes"}, 30 | "她自己": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem", "Reflex": "Yes"}, 31 | "它自己": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Neut", "Reflex": "Yes"}, 32 | "他们自己": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Reflex": "Yes"}, 33 | "她们自己": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Case": "Acc", "Gender": "Fem","Reflex": "Yes"}, 34 | }, 35 | 36 | "PRP$": { 37 | "我的": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Sing", "PronType": "Prs", "Poss": "Yes"}, 38 | "你的": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"}, 39 | "他的": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Masc", "PronType": "Prs", "Poss": "Yes"}, 40 | "她的": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Fem", "PronType": "Prs", "Poss": "Yes"}, 41 | "它的": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Sing", "Gender": "Neut", "PronType": "Prs", "Poss": "Yes"}, 42 | "我们的": {LEMMA: PRON_LEMMA, "Person": "One", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}, 43 | "他们的": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Poss": "Yes"}, 44 | "她们的": {LEMMA: PRON_LEMMA, "Person": "Three", "Number": "Plur", "PronType": "Prs", "Gender": "Fem", "Poss": "Yes"} 45 | }, 46 | 47 | "VBZ": { 48 | "是": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, 49 | "为": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"}, 50 | }, 51 | } 52 | 53 | 54 | for tag, rules in MORPH_RULES.items(): 55 | for key, attrs in dict(rules).items(): 56 | rules[key.title()] = attrs 57 | -------------------------------------------------------------------------------- /spacy/lang/zh/stop_words.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | 5 | # Stop words 6 | 7 | STOP_WORDS = set(""" 8 | a about above across after afterwards again against all almost alone along 9 | already also although always am among amongst amount an and another any anyhow 10 | anyone anything anyway anywhere are around as at 11 | 12 | back be became because become becomes becoming been before beforehand behind 13 | being below beside besides between beyond both bottom but by 14 | 15 | call can cannot ca could 16 | 17 | did do does doing done down due during 18 | 19 | each eight either eleven else elsewhere empty enough even ever every 20 | everyone everything everywhere except 21 | 22 | few fifteen fifty first five for former formerly forty four from front full 23 | further 24 | 25 | get give go 26 | 27 | had has have he hence her here hereafter hereby herein hereupon hers herself 28 | him himself his how however hundred 29 | 30 | i if in indeed into is it its itself 31 | 32 | keep 33 | 34 | last latter latterly least less 35 | 36 | just 37 | 38 | made make many may me meanwhile might mine more moreover most mostly move much 39 | must my myself 40 | 41 | name namely neither never nevertheless next nine no nobody none noone nor not 42 | nothing now nowhere n't 43 | 44 | of off often on once one only onto or other others otherwise our ours ourselves 45 | out over own 46 | 47 | part per perhaps please put 48 | 49 | quite 50 | 51 | 
rather re really regarding 52 | 53 | same say see seem seemed seeming seems serious several she should show side 54 | since six sixty so some somehow someone something sometime sometimes somewhere 55 | still such 56 | 57 | take ten than that the their them themselves then thence there thereafter 58 | thereby therefore therein thereupon these they third this those though three 59 | through throughout thru thus to together too top toward towards twelve twenty 60 | two 61 | 62 | under until up unless upon us used using 63 | 64 | various very very via was we well were what whatever when whence whenever where 65 | whereafter whereas whereby wherein whereupon wherever whether which while 66 | whither who whoever whole whom whose why will with within without would 67 | 68 | yet you your yours yourself yourselves 69 | 70 | 'd 'll 'm 're 's 've 71 | 72 | 现在 73 | 嘿嘿 74 | 还要 75 | 要 76 | 嗳 77 | 赶早不赶晚 78 | 这些 79 | 日益 80 | 倒不如 81 | 逢 82 | 立地 83 | 不只 84 | 一般 85 | 豁然 86 | 将近 87 | 毫不 88 | 嗬 89 | 大张旗鼓 90 | 嗯 91 | 不可开交 92 | " 93 | # 94 | 95 | ! 96 | & 97 | 全力 98 | 就算 99 | ' 100 | $ 101 | % 102 | ... 103 | * 104 | + 105 | ( 106 | 她们 107 | ) 108 | . 109 | / 110 | 各位 111 | , 112 | - 113 | ︿ 114 | 3 115 | 2 116 | 1 117 | 不止 118 | 基本 119 | 0 120 | 不拘 121 | 7 122 | 这里 123 | 6 124 | 5 125 | 颇 126 | 4 127 | ; 128 | : 129 | 如此 130 | 9 131 | 8 132 | 极度 133 | ? 134 | > 135 | 首先 136 | = 137 | < 138 | 也罢 139 | @ 140 | A 141 | 见 142 | 当庭 143 | 隔夜 144 | 更 145 | 不少 146 | 不胜 147 | \ 148 | _ 149 | 替 150 | ^ 151 | 到目前为止 152 | 大大 153 | 除开 154 | 腾 155 | 暗中 156 | 而外 157 | 开始 158 | ` 159 | 三番两次 160 | 宁可 161 | 这么 162 | 权时 163 | 结果 164 | 大多 165 | 除此以外 166 | 单单 167 | 如下 168 | 几度 169 | 何处 170 | 喂 171 | 如上 172 | 矣 173 | 喀 174 | 喏 175 | ~ 176 | 吧哒 177 | | 178 | 放量 179 | 即便 180 | 当年 181 | 不对 182 | 那 183 | 顷刻 184 | 本人 185 | 是 186 | 岂非 187 | 己 188 | 看 189 | 趁热 190 | 哪边 191 | 立马 192 | 乘势 193 | 啥 194 | 何况 195 | 这个 196 | 啦 197 | 人民 198 | 率尔 199 | 那种 200 | 仍然 201 | 不能 202 | 根据 203 | 并肩 204 | 相对而言 205 | 也好 206 | 啐 207 | 什么样 208 | 累年 209 | 啊 210 | 扑通 211 | 即使 212 | 开外 213 | 大概 214 | 依照 215 | · 216 | 乃至 217 | 与否 218 | 总而言之 219 | 高低 220 | 切切 221 | 多次 222 | 比如说 223 | 不亦乐乎 224 | 如期 225 | 简言之 226 | 何妨 227 | 不管怎样 228 | 顺 229 | 顷 230 | 将才 231 | 呆呆地 232 | 略为 233 | 更为 234 | 大约 235 | 其次 236 | 倍加 237 | 满 238 | 不定 239 | 除了 240 | 都 241 | 之后 242 | 着 243 | 难道 244 | 不可 245 | 至 246 | 风雨无阻 247 | 陡然 248 | 为了 249 | 及至 250 | 对于 251 | 虽说 252 | 唉 253 | 彻夜 254 | 嘎嘎 255 | 臭 256 | 不同 257 | 大体上 258 | 自 259 | ! 260 | # 261 | 来得及 262 | 哦 263 | 而 264 | % 265 | $ 266 | 的 267 | & 268 | 哪 269 | ) 270 | 老 271 | ( 272 | + 273 | 哩 274 | * 275 | , 276 | 者 277 | 人人 278 | 比方 279 | 0 280 | 1 281 | 简而言之 282 | 这么些 283 | 2 284 | 3 285 | 4 286 | 5 287 | 6 288 | 不至于 289 | 7 290 | 这部 291 | 谁知 292 | 8 293 | 几时 294 | 9 295 | 屡 296 | : 297 | ; 298 | < 299 | 应当 300 | > 301 | 哼 302 | ? 
303 | 人家 304 | 如今 305 | @ 306 | 哇 307 | 哈 308 | 哉 309 | 尽量 310 | 总的说来 311 | 继之 312 | 单纯 313 | 方才 314 | 哎 315 | 这么点儿 316 | 极 317 | 乘胜 318 | 八成 319 | 光是 320 | 倘或 321 | 哗 322 | 被 323 | 忽然 324 | [ 325 | 从头 326 | 出去 327 | 哟 328 | ] 329 | 咦 330 | 切不可 331 | 尽 332 | 哪年 333 | 竟然 334 | 是否 335 | 而况 336 | 加以 337 | 从此以后 338 | 省得 339 | 就 340 | 咳 341 | 我们 342 | 不力 343 | 各个 344 | 咱 345 | 充其极 346 | | 347 | } 348 | 次第 349 | ~ 350 | 岂止 351 | { 352 | 某 353 | 尽管如此 354 | 偶而 355 | 看上去 356 | 截然 357 | 甚而 358 | 和 359 | 如常 360 | 任何 361 | 极端 362 | 接着 363 | 嘎登 364 | 咋 365 | 皆可 366 | 具体地说 367 | 凝神 368 | 这就是说 369 | 将 370 | 千万 371 | 好在 372 | 从早到晚 373 | 各自 374 | 咚 375 | 取道 376 | 纯粹 377 | 这种 378 | 只限 379 | 上去 380 | 恐怕 381 | 呢 382 | 莫非 383 | 虽然 384 | 碰巧 385 | 呸 386 | 紧接着 387 | 即若 388 | 本 389 | 等等 390 | 按照 391 | 呵 392 | 不单 393 | 具体说来 394 | 一旦 395 | 望 396 | 朝 397 | 纵 398 | 不要 399 | 呀 400 | 怎样 401 | 呃 402 | 轰然 403 | 有 404 | 每当 405 | 接连不断 406 | 呜 407 | 呐 408 | 不比 409 | 呕 410 | 纯 411 | 呗 412 | 各种 413 | 理应 414 | 连袂 415 | 吧 416 | 绝 417 | 什么 418 | 那里 419 | 后来 420 | 给 421 | 日渐 422 | 暗自 423 | 以免 424 | 经 425 | 不然 426 | 来 427 | 饱 428 | 别人 429 | 吱 430 | 看来 431 | 沙沙 432 | 同 433 | 趁势 434 | 切莫 435 | 从重 436 | 尽心尽力 437 | 切勿 438 | 果真 439 | 各 440 | ¥ 441 | 要不是 442 | 白 443 | 并排 444 | 自己 445 | 保管 446 | 岂 447 | 差一点 448 | 默然 449 | 此中 450 | 能 451 | 吗 452 | 向 453 | 吓 454 | 藉以 455 | 不惟 456 | 的确 457 | 此后 458 | 让 459 | 待到 460 | 末##末 461 | 哪些 462 | 不然的话 463 | 其他 464 | 毫无 465 | 连声 466 | 趁早 467 | 归 468 | 几经 469 | 当 470 | 论 471 | 九 472 | 也 473 | 挨门挨户 474 | 挨次 475 | 乘 476 | 保险 477 | 从小 478 | 莫若 479 | 乒 480 | 彼 481 | 乎 482 | 刚好 483 | 么 484 | 如若 485 | 之 486 | 默默地 487 | 是的 488 | 嗡嗡 489 | 请勿 490 | 乃 491 | 为 492 | 得起 493 | 借此 494 | 该 495 | 汝 496 | 从此 497 | 然而 498 | 亲眼 499 | 略微 500 | 刚才 501 | 一定 502 | 反倒是 503 | 按时 504 | 临 505 | 个 506 | 倘若 507 | 差不多 508 | 从无到有 509 | — 510 | 起来 511 | ’ 512 | ‘ 513 | 反之则 514 | 弗 515 | ” 516 | “ 517 | 何止 518 | 惯常 519 | 姑且 520 | 与其 521 | … 522 | 哪个 523 | 反而 524 | 常言道 525 | 大抵 526 | 不再 527 | 且 528 | 到了儿 529 | 三 530 | 上 531 | 再者 532 | 不 533 | 并且 534 | 与 535 | 一 536 | 趁着 537 | 七 538 | 两者 539 | 等到 540 | 不经意 541 | 必 542 | 如何 543 | 来着 544 | 不由得 545 | 怎么样 546 | 尽管 547 | 知道 548 | 任 549 | 旁人 550 | 不管 551 | 由 552 | 个人 553 | 哪里 554 | 似的 555 | 以 556 | 甭 557 | 甫 558 | 倒不如说 559 | 用 560 | 均 561 | 其余 562 | 长此下去 563 | 们 564 | 莫 565 | 匆匆 566 | 多少 567 | 当着 568 | 就是说 569 | 他 570 | 既然 571 | 虽则 572 | 纵使 573 | 呼哧 574 | 沿 575 | 快 576 | 仅 577 | 联袂 578 | 没 579 | 或许 580 | 仍 581 | 来看 582 | 俺们 583 | 从 584 | 倘然 585 | 只是 586 | 往 587 | 大凡 588 | 而言 589 | 当真 590 | 待 591 | 因此 592 | 很 593 | 不如 594 | 据此 595 | 更进一步 596 | 那么样 597 | 纵然 598 | 得 599 | 不仅...而且 600 | 极为 601 | 尽然 602 | 略 603 | 长期以来 604 | 互 605 | 五 606 | 不妨 607 | 不止一次 608 | 地 609 | —— 610 | 较比 611 | 必须 612 | 或是 613 | 向着 614 | 从古到今 615 | 在 616 | 尽如人意 617 | 了 618 | 毕竟 619 | 二 620 | 川流不息 621 | 确实 622 | 于 623 | 可以 624 | 你 625 | 并没有 626 | 当场 627 | 要不 628 | 那儿 629 | 纵令 630 | 恰巧 631 | 无宁 632 | 四 633 | 来讲 634 | 局外 635 | 近年来 636 | 因 637 | 并 638 | 但 639 | 起首 640 | , 641 | 赶快 642 | 方 643 | 需要 644 | 即令 645 | 大略 646 | 将要 647 | 活 648 | 不特 649 | 然则 650 | 极了 651 | 何 652 | 但是 653 | 固 654 | 不独 655 | 何苦 656 | 一则 657 | 猛然 658 | 屡屡 659 | 传 660 | 到底 661 | 在下 662 | 设使 663 | 经过 664 | 至于 665 | 老老实实 666 | 猛然间 667 | 截至 668 | 譬如 669 | 很多 670 | 一切 671 | 别的 672 | 要么 673 | 趁机 674 | 。 675 | 、 676 | 越是 677 | 常 678 |   679 | 按期 680 | 何尝 681 | 》 682 | 《 683 | 〉 684 | 〈 685 | 动不动 686 | 不外 687 | 因为 688 | 使得 689 | 会 690 | 既 691 | 如果 692 | 按说 693 | 不大 694 | 带 695 | 自从 696 | 以便 697 | 宁肯 698 | 当下 699 | 不光 700 | 它们 701 | 之类 702 | 老大 703 | 尽可能 704 | 尔后 
705 | 成年累月 706 | 如上所述 707 | 每个 708 | 彼此 709 | 从宽 710 | 俺 711 | 就此 712 | 粗 713 | 达旦 714 | 当口儿 715 | 归根结底 716 | 看起来 717 | 或多或少 718 | 当中 719 | 据我所知 720 | 据实 721 | 不免 722 | 遵照 723 | 固然 724 | 缕缕 725 | 换言之 726 | 策略地 727 | 居然 728 | 连日来 729 | 若 730 | 起见 731 | 比照 732 | 嘎 733 | 不成 734 | 不仅仅是 735 | 长话短说 736 | 因而 737 | 设若 738 | 不论 739 | 嘘 740 | 嘛 741 | 沿着 742 | 恍然 743 | 慢说 744 | 亲身 745 | 哼唷 746 | 故 747 | 便 748 | 以至 749 | 以致 750 | 本着 751 | 论说 752 | 除外 753 | 之所以 754 | 简直 755 | 前后 756 | 大家 757 | 嘻 758 | 果然 759 | 共总 760 | 嘿 761 | 敢 762 | 时候 763 | 不怎么 764 | 如次 765 | 依 766 | 鄙人 767 | 亲手 768 | 大 769 | 顿时 770 | 顺着 771 | 叮当 772 | 敞开儿 773 | 等 774 | 大面儿上 775 | 年复一年 776 | 冲 777 | 打开天窗说亮话 778 | 跟 779 | 上来 780 | 拿 781 | 假若 782 | 不曾 783 | 着呢 784 | 快要 785 | 此刻 786 | 而且 787 | 背靠背 788 | 假使 789 | 陈年 790 | 多多益善 791 | 另方面 792 | 冒 793 | 他人 794 | 到处 795 | 大体 796 | 下来 797 | 云云 798 | 全然 799 | 何须 800 | 为着 801 | 每逢 802 | 内 803 | 多 804 | 很少 805 | 尚且 806 | 只要 807 | 不仅仅 808 | 出 809 | 810 | 顷刻间 811 | 常常 812 | 趁 813 | 日臻 814 | 恰似 815 | 得天独厚 816 | 另外 817 | 敢情 818 | 率然 819 | 并无 820 | 届时 821 | 凭 822 | 每每 823 | 几 824 | 她 825 | 成为 826 | 他们 827 | 尔等 828 | 尽快 829 | 不消 830 | 如其 831 | 把 832 | 反之亦然 833 | 当即 834 | 奇 835 | 据悉 836 | 奈 837 | 前者 838 | 第 839 | 必定 840 | [ 841 | 处处 842 | ] 843 | 断然 844 | 绝非 845 | 总的来看 846 | 岂但 847 | 分期 848 | 古来 849 | 我们 850 | 啪达 851 | 顷刻之间 852 | 每次 853 | 别说 854 | 传闻 855 | 从优 856 | 总的来说 857 | 非徒 858 | 常言说得好 859 | 非得 860 | 由于 861 | 难说 862 | 可是 863 | 从今以后 864 | 比如 865 | 所 866 | 继而 867 | 不可抗拒 868 | 才 869 | 如 870 | 精光 871 | 凭借 872 | 略加 873 | 起 874 | 平素 875 | 绝对 876 | 赶 877 | 于是 878 | 打 879 | 一样 880 | 长线 881 | 每时每刻 882 | 不择手段 883 | 理该 884 | 共 885 | 拦腰 886 | 喔唷 887 | 其 888 | 仍旧 889 | 屡次 890 | 以及 891 | 当然 892 | 到头来 893 | 抑或 894 | 宁愿 895 | 一方面 896 | 举凡 897 | 只有 898 | 八 899 | 咱们 900 | 六 901 | 从新 902 | 兮 903 | 这样 904 | 不得已 905 | 管 906 | 十分 907 | 自个儿 908 | 呼啦 909 | 我 910 | 必将 911 | 串行 912 | 而论 913 | 或 914 | 牢牢 915 | 成心 916 | 光 917 | 哈哈 918 | 与此同时 919 | 其一 920 | 于是乎 921 | 此 922 | 看样子 923 | 换句话说 924 | 全身心 925 | 除非 926 | 有人 927 | 以至于 928 | 按理 929 | 也许 930 | 打从 931 | 照着 932 | 况且 933 | 独 934 | 除却 935 | 不了 936 | 不得 937 | 反手 938 | 成年 939 | 哎呀 940 | 关于 941 | 恰恰相反 942 | 这儿 943 | 累次 944 | 其中 945 | 动辄 946 | 立刻 947 | 倒是 948 | 毫无例外 949 | 从古至今 950 | 可见 951 | 诚然 952 | 莫不 953 | 怎么办 954 | 亲自 955 | 经常 956 | 决不 957 | 自各儿 958 | 这么样 959 | 不必 960 | 不得了 961 | 除去 962 | 由此可见 963 | 像 964 | 有些 965 | 挨个 966 | 不仅 967 | 进来 968 | 大事 969 | 全年 970 | 绝顶 971 | 社会主义 972 | 总之 973 | 当头 974 | 若是 975 | 竟 976 | 不外乎 977 | 要不然 978 | 如此等等 979 | 分期分批 980 | 那么 981 | 毋宁 982 | 立 983 | 其二 984 | 不会 985 | .. 
986 | 背地里 987 | 据 988 | 此间 989 | 哪儿 990 | 不怕 991 | 不问 992 | 每 993 | 为什么 994 | 没有 995 | 公然 996 | 那会儿 997 | 迫于 998 | 来不及 999 | 不起 1000 | 千万千万 1001 | 可能 1002 | 正如 1003 | 比 1004 | 还有 1005 | 借 1006 | 倘 1007 | 究竟 1008 | 及其 1009 | 不限 1010 | 偏偏 1011 | 据称 1012 | 故此 1013 | 谁 1014 | 伙同 1015 | 敢于 1016 | 弹指之间 1017 | 那些 1018 | 窃 1019 | 朝着 1020 | 叮咚 1021 | 临到 1022 | 即将 1023 | 哎哟 1024 | 而已 1025 | 尽心竭力 1026 | 到头 1027 | 亲口 1028 | 已经 1029 | 不但 1030 | 出来 1031 | 随着 1032 | 不得不 1033 | 非常 1034 | 另一个 1035 | 非但 1036 | 如前所述 1037 | 殆 1038 | -- 1039 | 诸位 1040 | 那时 1041 | 即是说 1042 | 按 1043 | 谨 1044 | 何时 1045 | 此外 1046 | 然后 1047 | 勃然 1048 | 从来 1049 | 近几年来 1050 | 近来 1051 | 莫如 1052 | 奋勇 1053 | 比起 1054 | 仅仅 1055 | 故而 1056 | 穷年累月 1057 | 历 1058 | 乌乎 1059 | 怪不得 1060 | 去 1061 | 借以 1062 | 主要 1063 | 间或 1064 | 方能 1065 | 白白 1066 | 除 1067 | 反过来 1068 | 全都 1069 | 并没 1070 | 过 1071 | 除此之外 1072 | 马上 1073 | 迄 1074 | 恰恰 1075 | 传说 1076 | 还 1077 | 这 1078 | 连 1079 | 近 1080 | 从速 1081 | 上下 1082 | 哪样 1083 | 这边 1084 | 从未 1085 | 不能不 1086 | 从不 1087 | 及 1088 | 那个 1089 | 边 1090 | 又 1091 | 迟早 1092 | 不知不觉 1093 | 挨家挨户 1094 | 多多少少 1095 | 几番 1096 | 有关 1097 | 您 1098 | 连同 1099 | 较 1100 | 互相 1101 | 怎么 1102 | 但愿 1103 | 可 1104 | 你们 1105 | 凑巧 1106 | 连日 1107 | 叫 1108 | 路经 1109 | 阿 1110 | 起先 1111 | 另 1112 | 二话没说 1113 | 之一 1114 | 这时 1115 | 即或 1116 | 连连 1117 | 其后 1118 | 各式 1119 | 当儿 1120 | 独自 1121 | 它 1122 | 宁 1123 | 哪天 1124 | 就是 1125 | 乘机 1126 | 常言说 1127 | 不下 1128 | 定 1129 | 照 1130 | 昂然 1131 | 毫无保留地 1132 | 趁便 1133 | 屡次三番 1134 | 甚至 1135 | 那末 1136 | 充其量 1137 | 该当 1138 | 另一方面 1139 | 既...又 1140 | 瑟瑟 1141 | 的话 1142 | 呜呼 1143 | 或者 1144 | 立时 1145 | 反过来说 1146 | 有的 1147 | 挨着 1148 | 再说 1149 | 够瞧的 1150 | 过于 1151 | 零 1152 | 就地 1153 | 然 1154 | 极其 1155 | 何乐而不为 1156 | 进去 1157 | 单 1158 | 随 1159 | 起头 1160 | 无论 1161 | 怎 1162 | 据说 1163 | 综上所述 1164 | 抽冷子 1165 | 才能 1166 | 怕 1167 | 千 1168 | 离 1169 | 梆 1170 | 极大 1171 | 恰逢 1172 | 半 1173 | 大举 1174 | 漫说 1175 | 接下来 1176 | 忽地 1177 | 而是 1178 | 即 1179 | 难得 1180 | 不但...而且 1181 | 格外 1182 | 怪 1183 | 倘使 1184 | 还是 1185 | 从而 1186 | 对 1187 | 本身 1188 | 乘隙 1189 | 既是 1190 | 理当 1191 | 反倒 1192 | 焉 1193 | 可好 1194 | 不满 1195 | 交口 1196 | 基本上 1197 | 认为 1198 | 这会儿 1199 | 充分 1200 | 并非 1201 | 不迭 1202 | 老是 1203 | 倍感 1204 | 鉴于 1205 | 要是 1206 | 反之 1207 | 哪怕 1208 | 除此而外 1209 | 虽 1210 | 一下 1211 | 自身 1212 | 任凭 1213 | 几乎 1214 | 顶多 1215 | 靠 1216 | 而又 1217 | 不时 1218 | 一个 1219 | 否则 1220 | 自家 1221 | 三天两头 1222 | 砰 1223 | 啊呀 1224 | 难怪 1225 | 所以 1226 | 发生 1227 | 哗啦 1228 | 多年来 1229 | 罢了 1230 | 大致 1231 | 从轻 1232 | 那边 1233 | 那么些 1234 | 不巧 1235 | 完全 1236 | 起初 1237 | 某个 1238 | 加之 1239 | 大不了 1240 | 归根到底 1241 | 偶尔 1242 | 应该 1243 | 二话不说 1244 | 日见 1245 | 不是 1246 | 大都 1247 | 愤然 1248 | 而后 1249 | 多年前 1250 | 例如 1251 | 蛮 1252 | 切 1253 | 一些 1254 | 多多 1255 | 日复一日 1256 | 较之 1257 | 即刻 1258 | 那样 1259 | 齐 1260 | 其它 1261 | 则 1262 | 不料 1263 | 刚 1264 | 初 1265 | 决非 1266 | 乘虚 1267 | 恰如 1268 | 能够 1269 | 从严 1270 | 故意 1271 | 别 1272 | 啊哟 1273 | 从中 1274 | 不已 1275 | 加上 1276 | 具体来说 1277 | 较为 1278 | 分头 1279 | 直到 1280 | 到 1281 | >> 1282 | 隔日 1283 | 多亏 1284 | 假如 1285 | 甚么 1286 | 作为 1287 | 暗地里 1288 | 挨门逐户 1289 | 恰好 1290 | 其实 1291 | 何必 1292 | 万一 1293 | 不过 1294 | 某些 1295 | 啊哈 1296 | 基于 1297 | 不日 1298 | 尽早 1299 | 刚巧 1300 | 概 1301 | 一来 1302 | 同时 1303 | 三番五次 1304 | 为何 1305 | 更加 1306 | 绝不 1307 | 除此 1308 | 不常 1309 | 进而 1310 | 另行 1311 | 急匆匆 1312 | 通过 1313 | 话说 1314 | 若非 1315 | 极力 1316 | 存心 1317 | a 1318 | able 1319 | about 1320 | above 1321 | according 1322 | accordingly 1323 | across 1324 | actually 1325 | after 1326 | afterwards 1327 | again 1328 | against 1329 | ain't 1330 | all 1331 | allow 1332 | allows 1333 | 
almost 1334 | alone 1335 | along 1336 | already 1337 | also 1338 | although 1339 | always 1340 | am 1341 | among 1342 | amongst 1343 | an 1344 | and 1345 | another 1346 | any 1347 | anybody 1348 | anyhow 1349 | anyone 1350 | anything 1351 | anyway 1352 | anyways 1353 | anywhere 1354 | apart 1355 | appear 1356 | appreciate 1357 | appropriate 1358 | are 1359 | aren't 1360 | around 1361 | as 1362 | a's 1363 | aside 1364 | ask 1365 | asking 1366 | associated 1367 | at 1368 | available 1369 | away 1370 | awfully 1371 | be 1372 | became 1373 | because 1374 | become 1375 | becomes 1376 | becoming 1377 | been 1378 | before 1379 | beforehand 1380 | behind 1381 | being 1382 | believe 1383 | below 1384 | beside 1385 | besides 1386 | best 1387 | better 1388 | between 1389 | beyond 1390 | both 1391 | brief 1392 | but 1393 | by 1394 | came 1395 | can 1396 | cannot 1397 | cant 1398 | can't 1399 | cause 1400 | causes 1401 | certain 1402 | certainly 1403 | changes 1404 | clearly 1405 | c'mon 1406 | co 1407 | com 1408 | come 1409 | comes 1410 | concerning 1411 | consequently 1412 | consider 1413 | considering 1414 | contain 1415 | containing 1416 | contains 1417 | corresponding 1418 | could 1419 | couldn't 1420 | course 1421 | c's 1422 | currently 1423 | definitely 1424 | described 1425 | despite 1426 | did 1427 | didn't 1428 | different 1429 | do 1430 | does 1431 | doesn't 1432 | doing 1433 | done 1434 | don't 1435 | down 1436 | downwards 1437 | during 1438 | each 1439 | edu 1440 | eg 1441 | eight 1442 | either 1443 | else 1444 | elsewhere 1445 | enough 1446 | entirely 1447 | especially 1448 | et 1449 | etc 1450 | even 1451 | ever 1452 | every 1453 | everybody 1454 | everyone 1455 | everything 1456 | everywhere 1457 | ex 1458 | exactly 1459 | example 1460 | except 1461 | far 1462 | few 1463 | fifth 1464 | first 1465 | five 1466 | followed 1467 | following 1468 | follows 1469 | for 1470 | former 1471 | formerly 1472 | forth 1473 | four 1474 | from 1475 | further 1476 | furthermore 1477 | get 1478 | gets 1479 | getting 1480 | given 1481 | gives 1482 | go 1483 | goes 1484 | going 1485 | gone 1486 | got 1487 | gotten 1488 | greetings 1489 | had 1490 | hadn't 1491 | happens 1492 | hardly 1493 | has 1494 | hasn't 1495 | have 1496 | haven't 1497 | having 1498 | he 1499 | hello 1500 | help 1501 | hence 1502 | her 1503 | here 1504 | hereafter 1505 | hereby 1506 | herein 1507 | here's 1508 | hereupon 1509 | hers 1510 | herself 1511 | he's 1512 | hi 1513 | him 1514 | himself 1515 | his 1516 | hither 1517 | hopefully 1518 | how 1519 | howbeit 1520 | however 1521 | i'd 1522 | ie 1523 | if 1524 | ignored 1525 | i'll 1526 | i'm 1527 | immediate 1528 | in 1529 | inasmuch 1530 | inc 1531 | indeed 1532 | indicate 1533 | indicated 1534 | indicates 1535 | inner 1536 | insofar 1537 | instead 1538 | into 1539 | inward 1540 | is 1541 | isn't 1542 | it 1543 | it'd 1544 | it'll 1545 | its 1546 | it's 1547 | itself 1548 | i've 1549 | just 1550 | keep 1551 | keeps 1552 | kept 1553 | know 1554 | known 1555 | knows 1556 | last 1557 | lately 1558 | later 1559 | latter 1560 | latterly 1561 | least 1562 | less 1563 | lest 1564 | let 1565 | let's 1566 | like 1567 | liked 1568 | likely 1569 | little 1570 | look 1571 | looking 1572 | looks 1573 | ltd 1574 | mainly 1575 | many 1576 | may 1577 | maybe 1578 | me 1579 | mean 1580 | meanwhile 1581 | merely 1582 | might 1583 | more 1584 | moreover 1585 | most 1586 | mostly 1587 | much 1588 | must 1589 | my 1590 | myself 1591 | name 1592 | namely 1593 | nd 1594 | near 1595 | nearly 1596 | 
necessary 1597 | need 1598 | needs 1599 | neither 1600 | never 1601 | nevertheless 1602 | new 1603 | next 1604 | nine 1605 | no 1606 | nobody 1607 | non 1608 | none 1609 | noone 1610 | nor 1611 | normally 1612 | not 1613 | nothing 1614 | novel 1615 | now 1616 | nowhere 1617 | obviously 1618 | of 1619 | off 1620 | often 1621 | oh 1622 | ok 1623 | okay 1624 | old 1625 | on 1626 | once 1627 | one 1628 | ones 1629 | only 1630 | onto 1631 | or 1632 | other 1633 | others 1634 | otherwise 1635 | ought 1636 | our 1637 | ours 1638 | ourselves 1639 | out 1640 | outside 1641 | over 1642 | overall 1643 | own 1644 | particular 1645 | particularly 1646 | per 1647 | perhaps 1648 | placed 1649 | please 1650 | plus 1651 | possible 1652 | presumably 1653 | probably 1654 | provides 1655 | que 1656 | quite 1657 | qv 1658 | rather 1659 | rd 1660 | re 1661 | really 1662 | reasonably 1663 | regarding 1664 | regardless 1665 | regards 1666 | relatively 1667 | respectively 1668 | right 1669 | said 1670 | same 1671 | saw 1672 | say 1673 | saying 1674 | says 1675 | second 1676 | secondly 1677 | see 1678 | seeing 1679 | seem 1680 | seemed 1681 | seeming 1682 | seems 1683 | seen 1684 | self 1685 | selves 1686 | sensible 1687 | sent 1688 | serious 1689 | seriously 1690 | seven 1691 | several 1692 | shall 1693 | she 1694 | should 1695 | shouldn't 1696 | since 1697 | six 1698 | so 1699 | some 1700 | somebody 1701 | somehow 1702 | someone 1703 | something 1704 | sometime 1705 | sometimes 1706 | somewhat 1707 | somewhere 1708 | soon 1709 | sorry 1710 | specified 1711 | specify 1712 | specifying 1713 | still 1714 | sub 1715 | such 1716 | sup 1717 | sure 1718 | take 1719 | taken 1720 | tell 1721 | tends 1722 | th 1723 | than 1724 | thank 1725 | thanks 1726 | thanx 1727 | that 1728 | thats 1729 | that's 1730 | the 1731 | their 1732 | theirs 1733 | them 1734 | themselves 1735 | then 1736 | thence 1737 | there 1738 | thereafter 1739 | thereby 1740 | therefore 1741 | therein 1742 | theres 1743 | there's 1744 | thereupon 1745 | these 1746 | they 1747 | they'd 1748 | they'll 1749 | they're 1750 | they've 1751 | think 1752 | third 1753 | this 1754 | thorough 1755 | thoroughly 1756 | those 1757 | though 1758 | three 1759 | through 1760 | throughout 1761 | thru 1762 | thus 1763 | to 1764 | together 1765 | too 1766 | took 1767 | toward 1768 | towards 1769 | tried 1770 | tries 1771 | truly 1772 | try 1773 | trying 1774 | t's 1775 | twice 1776 | two 1777 | un 1778 | under 1779 | unfortunately 1780 | unless 1781 | unlikely 1782 | until 1783 | unto 1784 | up 1785 | upon 1786 | us 1787 | use 1788 | used 1789 | useful 1790 | uses 1791 | using 1792 | usually 1793 | value 1794 | various 1795 | very 1796 | via 1797 | viz 1798 | vs 1799 | want 1800 | wants 1801 | was 1802 | wasn't 1803 | way 1804 | we 1805 | we'd 1806 | welcome 1807 | well 1808 | we'll 1809 | went 1810 | were 1811 | we're 1812 | weren't 1813 | we've 1814 | what 1815 | whatever 1816 | what's 1817 | when 1818 | whence 1819 | whenever 1820 | where 1821 | whereafter 1822 | whereas 1823 | whereby 1824 | wherein 1825 | where's 1826 | whereupon 1827 | wherever 1828 | whether 1829 | which 1830 | while 1831 | whither 1832 | who 1833 | whoever 1834 | whole 1835 | whom 1836 | who's 1837 | whose 1838 | why 1839 | will 1840 | willing 1841 | wish 1842 | with 1843 | within 1844 | without 1845 | wonder 1846 | won't 1847 | would 1848 | wouldn't 1849 | yes 1850 | yet 1851 | you 1852 | you'd 1853 | you'll 1854 | your 1855 | you're 1856 | yours 1857 | yourself 1858 | yourselves 1859 | you've 1860 | 
zero 1861 | zt 1862 | ZT 1863 | zz 1864 | ZZ 1865 | 一 1866 | 一下 1867 | 一些 1868 | 一切 1869 | 一则 1870 | 一天 1871 | 一定 1872 | 一方面 1873 | 一旦 1874 | 一时 1875 | 一来 1876 | 一样 1877 | 一次 1878 | 一片 1879 | 一直 1880 | 一致 1881 | 一般 1882 | 一起 1883 | 一边 1884 | 一面 1885 | 万一 1886 | 上下 1887 | 上升 1888 | 上去 1889 | 上来 1890 | 上述 1891 | 上面 1892 | 下列 1893 | 下去 1894 | 下来 1895 | 下面 1896 | 不一 1897 | 不久 1898 | 不仅 1899 | 不会 1900 | 不但 1901 | 不光 1902 | 不单 1903 | 不变 1904 | 不只 1905 | 不可 1906 | 不同 1907 | 不够 1908 | 不如 1909 | 不得 1910 | 不怕 1911 | 不惟 1912 | 不成 1913 | 不拘 1914 | 不敢 1915 | 不断 1916 | 不是 1917 | 不比 1918 | 不然 1919 | 不特 1920 | 不独 1921 | 不管 1922 | 不能 1923 | 不要 1924 | 不论 1925 | 不足 1926 | 不过 1927 | 不问 1928 | 与 1929 | 与其 1930 | 与否 1931 | 与此同时 1932 | 专门 1933 | 且 1934 | 两者 1935 | 严格 1936 | 严重 1937 | 个 1938 | 个人 1939 | 个别 1940 | 中小 1941 | 中间 1942 | 丰富 1943 | 临 1944 | 为 1945 | 为主 1946 | 为了 1947 | 为什么 1948 | 为什麽 1949 | 为何 1950 | 为着 1951 | 主张 1952 | 主要 1953 | 举行 1954 | 乃 1955 | 乃至 1956 | 么 1957 | 之 1958 | 之一 1959 | 之前 1960 | 之后 1961 | 之後 1962 | 之所以 1963 | 之类 1964 | 乌乎 1965 | 乎 1966 | 乘 1967 | 也 1968 | 也好 1969 | 也是 1970 | 也罢 1971 | 了 1972 | 了解 1973 | 争取 1974 | 于 1975 | 于是 1976 | 于是乎 1977 | 云云 1978 | 互相 1979 | 产生 1980 | 人们 1981 | 人家 1982 | 什么 1983 | 什么样 1984 | 什麽 1985 | 今后 1986 | 今天 1987 | 今年 1988 | 今後 1989 | 仍然 1990 | 从 1991 | 从事 1992 | 从而 1993 | 他 1994 | 他人 1995 | 他们 1996 | 他的 1997 | 代替 1998 | 以 1999 | 以上 2000 | 以下 2001 | 以为 2002 | 以便 2003 | 以免 2004 | 以前 2005 | 以及 2006 | 以后 2007 | 以外 2008 | 以後 2009 | 以来 2010 | 以至 2011 | 以至于 2012 | 以致 2013 | 们 2014 | 任 2015 | 任何 2016 | 任凭 2017 | 任务 2018 | 企图 2019 | 伟大 2020 | 似乎 2021 | 似的 2022 | 但 2023 | 但是 2024 | 何 2025 | 何况 2026 | 何处 2027 | 何时 2028 | 作为 2029 | 你 2030 | 你们 2031 | 你的 2032 | 使得 2033 | 使用 2034 | 例如 2035 | 依 2036 | 依照 2037 | 依靠 2038 | 促进 2039 | 保持 2040 | 俺 2041 | 俺们 2042 | 倘 2043 | 倘使 2044 | 倘或 2045 | 倘然 2046 | 倘若 2047 | 假使 2048 | 假如 2049 | 假若 2050 | 做到 2051 | 像 2052 | 允许 2053 | 充分 2054 | 先后 2055 | 先後 2056 | 先生 2057 | 全部 2058 | 全面 2059 | 兮 2060 | 共同 2061 | 关于 2062 | 其 2063 | 其一 2064 | 其中 2065 | 其二 2066 | 其他 2067 | 其余 2068 | 其它 2069 | 其实 2070 | 其次 2071 | 具体 2072 | 具体地说 2073 | 具体说来 2074 | 具有 2075 | 再者 2076 | 再说 2077 | 冒 2078 | 冲 2079 | 决定 2080 | 况且 2081 | 准备 2082 | 几 2083 | 几乎 2084 | 几时 2085 | 凭 2086 | 凭借 2087 | 出去 2088 | 出来 2089 | 出现 2090 | 分别 2091 | 则 2092 | 别 2093 | 别的 2094 | 别说 2095 | 到 2096 | 前后 2097 | 前者 2098 | 前进 2099 | 前面 2100 | 加之 2101 | 加以 2102 | 加入 2103 | 加强 2104 | 十分 2105 | 即 2106 | 即令 2107 | 即使 2108 | 即便 2109 | 即或 2110 | 即若 2111 | 却不 2112 | 原来 2113 | 又 2114 | 及 2115 | 及其 2116 | 及时 2117 | 及至 2118 | 双方 2119 | 反之 2120 | 反应 2121 | 反映 2122 | 反过来 2123 | 反过来说 2124 | 取得 2125 | 受到 2126 | 变成 2127 | 另 2128 | 另一方面 2129 | 另外 2130 | 只是 2131 | 只有 2132 | 只要 2133 | 只限 2134 | 叫 2135 | 叫做 2136 | 召开 2137 | 叮咚 2138 | 可 2139 | 可以 2140 | 可是 2141 | 可能 2142 | 可见 2143 | 各 2144 | 各个 2145 | 各人 2146 | 各位 2147 | 各地 2148 | 各种 2149 | 各级 2150 | 各自 2151 | 合理 2152 | 同 2153 | 同一 2154 | 同时 2155 | 同样 2156 | 后来 2157 | 后面 2158 | 向 2159 | 向着 2160 | 吓 2161 | 吗 2162 | 否则 2163 | 吧 2164 | 吧哒 2165 | 吱 2166 | 呀 2167 | 呃 2168 | 呕 2169 | 呗 2170 | 呜 2171 | 呜呼 2172 | 呢 2173 | 周围 2174 | 呵 2175 | 呸 2176 | 呼哧 2177 | 咋 2178 | 和 2179 | 咚 2180 | 咦 2181 | 咱 2182 | 咱们 2183 | 咳 2184 | 哇 2185 | 哈 2186 | 哈哈 2187 | 哉 2188 | 哎 2189 | 哎呀 2190 | 哎哟 2191 | 哗 2192 | 哟 2193 | 哦 2194 | 哩 2195 | 哪 2196 | 哪个 2197 | 哪些 2198 | 哪儿 2199 | 哪天 2200 | 哪年 2201 | 哪怕 2202 | 哪样 2203 | 哪边 2204 | 哪里 2205 | 哼 2206 | 哼唷 2207 | 唉 2208 | 啊 2209 | 啐 2210 | 啥 2211 | 啦 2212 | 啪达 2213 | 喂 2214 | 喏 2215 | 喔唷 2216 | 嗡嗡 2217 | 嗬 2218 | 嗯 2219 | 嗳 2220 | 嘎 2221 | 嘎登 2222 
| 嘘 2223 | 嘛 2224 | 嘻 2225 | 嘿 2226 | 因 2227 | 因为 2228 | 因此 2229 | 因而 2230 | 固然 2231 | 在 2232 | 在下 2233 | 地 2234 | 坚决 2235 | 坚持 2236 | 基本 2237 | 处理 2238 | 复杂 2239 | 多 2240 | 多少 2241 | 多数 2242 | 多次 2243 | 大力 2244 | 大多数 2245 | 大大 2246 | 大家 2247 | 大批 2248 | 大约 2249 | 大量 2250 | 失去 2251 | 她 2252 | 她们 2253 | 她的 2254 | 好的 2255 | 好象 2256 | 如 2257 | 如上所述 2258 | 如下 2259 | 如何 2260 | 如其 2261 | 如果 2262 | 如此 2263 | 如若 2264 | 存在 2265 | 宁 2266 | 宁可 2267 | 宁愿 2268 | 宁肯 2269 | 它 2270 | 它们 2271 | 它们的 2272 | 它的 2273 | 安全 2274 | 完全 2275 | 完成 2276 | 实现 2277 | 实际 2278 | 宣布 2279 | 容易 2280 | 密切 2281 | 对 2282 | 对于 2283 | 对应 2284 | 将 2285 | 少数 2286 | 尔后 2287 | 尚且 2288 | 尤其 2289 | 就 2290 | 就是 2291 | 就是说 2292 | 尽 2293 | 尽管 2294 | 属于 2295 | 岂但 2296 | 左右 2297 | 巨大 2298 | 巩固 2299 | 己 2300 | 已经 2301 | 帮助 2302 | 常常 2303 | 并 2304 | 并不 2305 | 并不是 2306 | 并且 2307 | 并没有 2308 | 广大 2309 | 广泛 2310 | 应当 2311 | 应用 2312 | 应该 2313 | 开外 2314 | 开始 2315 | 开展 2316 | 引起 2317 | 强烈 2318 | 强调 2319 | 归 2320 | 当 2321 | 当前 2322 | 当时 2323 | 当然 2324 | 当着 2325 | 形成 2326 | 彻底 2327 | 彼 2328 | 彼此 2329 | 往 2330 | 往往 2331 | 待 2332 | 後来 2333 | 後面 2334 | 得 2335 | 得出 2336 | 得到 2337 | 心里 2338 | 必然 2339 | 必要 2340 | 必须 2341 | 怎 2342 | 怎么 2343 | 怎么办 2344 | 怎么样 2345 | 怎样 2346 | 怎麽 2347 | 总之 2348 | 总是 2349 | 总的来看 2350 | 总的来说 2351 | 总的说来 2352 | 总结 2353 | 总而言之 2354 | 恰恰相反 2355 | 您 2356 | 意思 2357 | 愿意 2358 | 慢说 2359 | 成为 2360 | 我 2361 | 我们 2362 | 我的 2363 | 或 2364 | 或是 2365 | 或者 2366 | 战斗 2367 | 所 2368 | 所以 2369 | 所有 2370 | 所谓 2371 | 打 2372 | 扩大 2373 | 把 2374 | 抑或 2375 | 拿 2376 | 按 2377 | 按照 2378 | 换句话说 2379 | 换言之 2380 | 据 2381 | 掌握 2382 | 接着 2383 | 接著 2384 | 故 2385 | 故此 2386 | 整个 2387 | 方便 2388 | 方面 2389 | 旁人 2390 | 无宁 2391 | 无法 2392 | 无论 2393 | 既 2394 | 既是 2395 | 既然 2396 | 时候 2397 | 明显 2398 | 明确 2399 | 是 2400 | 是否 2401 | 是的 2402 | 显然 2403 | 显著 2404 | 普通 2405 | 普遍 2406 | 更加 2407 | 曾经 2408 | 替 2409 | 最后 2410 | 最大 2411 | 最好 2412 | 最後 2413 | 最近 2414 | 最高 2415 | 有 2416 | 有些 2417 | 有关 2418 | 有利 2419 | 有力 2420 | 有所 2421 | 有效 2422 | 有时 2423 | 有点 2424 | 有的 2425 | 有着 2426 | 有著 2427 | 望 2428 | 朝 2429 | 朝着 2430 | 本 2431 | 本着 2432 | 来 2433 | 来着 2434 | 极了 2435 | 构成 2436 | 果然 2437 | 果真 2438 | 某 2439 | 某个 2440 | 某些 2441 | 根据 2442 | 根本 2443 | 欢迎 2444 | 正在 2445 | 正如 2446 | 正常 2447 | 此 2448 | 此外 2449 | 此时 2450 | 此间 2451 | 毋宁 2452 | 每 2453 | 每个 2454 | 每天 2455 | 每年 2456 | 每当 2457 | 比 2458 | 比如 2459 | 比方 2460 | 比较 2461 | 毫不 2462 | 没有 2463 | 沿 2464 | 沿着 2465 | 注意 2466 | 深入 2467 | 清楚 2468 | 满足 2469 | 漫说 2470 | 焉 2471 | 然则 2472 | 然后 2473 | 然後 2474 | 然而 2475 | 照 2476 | 照着 2477 | 特别是 2478 | 特殊 2479 | 特点 2480 | 现代 2481 | 现在 2482 | 甚么 2483 | 甚而 2484 | 甚至 2485 | 用 2486 | 由 2487 | 由于 2488 | 由此可见 2489 | 的 2490 | 的话 2491 | 目前 2492 | 直到 2493 | 直接 2494 | 相似 2495 | 相信 2496 | 相反 2497 | 相同 2498 | 相对 2499 | 相对而言 2500 | 相应 2501 | 相当 2502 | 相等 2503 | 省得 2504 | 看出 2505 | 看到 2506 | 看来 2507 | 看看 2508 | 看见 2509 | 真是 2510 | 真正 2511 | 着 2512 | 着呢 2513 | 矣 2514 | 知道 2515 | 确定 2516 | 离 2517 | 积极 2518 | 移动 2519 | 突出 2520 | 突然 2521 | 立即 2522 | 第 2523 | 等 2524 | 等等 2525 | 管 2526 | 紧接着 2527 | 纵 2528 | 纵令 2529 | 纵使 2530 | 纵然 2531 | 练习 2532 | 组成 2533 | 经 2534 | 经常 2535 | 经过 2536 | 结合 2537 | 结果 2538 | 给 2539 | 绝对 2540 | 继续 2541 | 继而 2542 | 维持 2543 | 综上所述 2544 | 罢了 2545 | 考虑 2546 | 者 2547 | 而 2548 | 而且 2549 | 而况 2550 | 而外 2551 | 而已 2552 | 而是 2553 | 而言 2554 | 联系 2555 | 能 2556 | 能否 2557 | 能够 2558 | 腾 2559 | 自 2560 | 自个儿 2561 | 自从 2562 | 自各儿 2563 | 自家 2564 | 自己 2565 | 自身 2566 | 至 2567 | 至于 2568 | 良好 2569 | 若 2570 | 若是 2571 | 若非 2572 | 范围 2573 | 莫若 2574 | 获得 2575 | 虽 2576 | 虽则 2577 | 虽然 2578 | 虽说 2579 | 行为 2580 | 行动 2581 | 
表明 2582 | 表示 2583 | 被 2584 | 要 2585 | 要不 2586 | 要不是 2587 | 要不然 2588 | 要么 2589 | 要是 2590 | 要求 2591 | 规定 2592 | 觉得 2593 | 认为 2594 | 认真 2595 | 认识 2596 | 让 2597 | 许多 2598 | 论 2599 | 设使 2600 | 设若 2601 | 该 2602 | 说明 2603 | 诸位 2604 | 谁 2605 | 谁知 2606 | 赶 2607 | 起 2608 | 起来 2609 | 起见 2610 | 趁 2611 | 趁着 2612 | 越是 2613 | 跟 2614 | 转动 2615 | 转变 2616 | 转贴 2617 | 较 2618 | 较之 2619 | 边 2620 | 达到 2621 | 迅速 2622 | 过 2623 | 过去 2624 | 过来 2625 | 运用 2626 | 还是 2627 | 还有 2628 | 这 2629 | 这个 2630 | 这么 2631 | 这么些 2632 | 这么样 2633 | 这么点儿 2634 | 这些 2635 | 这会儿 2636 | 这儿 2637 | 这就是说 2638 | 这时 2639 | 这样 2640 | 这点 2641 | 这种 2642 | 这边 2643 | 这里 2644 | 这麽 2645 | 进入 2646 | 进步 2647 | 进而 2648 | 进行 2649 | 连 2650 | 连同 2651 | 适应 2652 | 适当 2653 | 适用 2654 | 逐步 2655 | 逐渐 2656 | 通常 2657 | 通过 2658 | 造成 2659 | 遇到 2660 | 遭到 2661 | 避免 2662 | 那 2663 | 那个 2664 | 那么 2665 | 那么些 2666 | 那么样 2667 | 那些 2668 | 那会儿 2669 | 那儿 2670 | 那时 2671 | 那样 2672 | 那边 2673 | 那里 2674 | 那麽 2675 | 部分 2676 | 鄙人 2677 | 采取 2678 | 里面 2679 | 重大 2680 | 重新 2681 | 重要 2682 | 鉴于 2683 | 问题 2684 | 防止 2685 | 阿 2686 | 附近 2687 | 限制 2688 | 除 2689 | 除了 2690 | 除此之外 2691 | 除非 2692 | 随 2693 | 随着 2694 | 随著 2695 | 集中 2696 | 需要 2697 | 非但 2698 | 非常 2699 | 非徒 2700 | 靠 2701 | 顺 2702 | 顺着 2703 | 首先 2704 | 高兴 2705 | 是不是 2706 | 说说 2707 | 2708 | """.split()) 2709 | -------------------------------------------------------------------------------- /spacy/lang/zh/syntax_iterators.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | from ...symbols import NOUN, PROPN, PRON 5 | 6 | 7 | def noun_chunks(obj): 8 | """ 9 | Detect base noun phrases from a dependency parse. Works on both Doc and Span. 10 | """ 11 | labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'dative', 'appos', 12 | 'attr', 'ROOT'] 13 | doc = obj.doc # Ensure works on both Doc and Span. 
14 | np_deps = [doc.vocab.strings.add(label) for label in labels] 15 | conj = doc.vocab.strings.add('conj') 16 | np_label = doc.vocab.strings.add('NP') 17 | seen = set() 18 | for i, word in enumerate(obj): 19 | if word.pos not in (NOUN, PROPN, PRON): 20 | continue 21 | # Prevent nested chunks from being produced 22 | if word.i in seen: 23 | continue 24 | if word.dep in np_deps: 25 | if any(w.i in seen for w in word.subtree): 26 | continue 27 | seen.update(j for j in range(word.left_edge.i, word.i+1)) 28 | yield word.left_edge.i, word.i+1, np_label 29 | elif word.dep == conj: 30 | head = word.head 31 | while head.dep == conj and head.head.i < head.i: 32 | head = head.head 33 | # If the head is an NP, and we're coordinated to it, we're an NP 34 | if head.dep in np_deps: 35 | if any(w.i in seen for w in word.subtree): 36 | continue 37 | seen.update(j for j in range(word.left_edge.i, word.i+1)) 38 | yield word.left_edge.i, word.i+1, np_label 39 | 40 | 41 | SYNTAX_ITERATORS = { 42 | 'noun_chunks': noun_chunks 43 | } 44 | -------------------------------------------------------------------------------- /spacy/lang/zh/tag_map.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from __future__ import unicode_literals 3 | 4 | from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB 5 | from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON 6 | 7 | 8 | TAG_MAP = { 9 | ".": {POS: PUNCT, "PunctType": "peri"}, 10 | ",": {POS: PUNCT, "PunctType": "comm"}, 11 | "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, 12 | "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, 13 | "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"}, 14 | "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, 15 | "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, 16 | ":": {POS: PUNCT}, 17 | "$": {POS: SYM, "Other": {"SymType": "currency"}}, 18 | "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, 19 | "AFX": {POS: ADJ, "Hyph": "yes"}, 20 | "CC": {POS: CCONJ, "ConjType": "coor"}, 21 | "CD": {POS: NUM, "NumType": "card"}, 22 | "DT": {POS: DET}, 23 | "EX": {POS: ADV, "AdvType": "ex"}, 24 | "FW": {POS: X, "Foreign": "yes"}, 25 | "HYPH": {POS: PUNCT, "PunctType": "dash"}, 26 | "IN": {POS: ADP}, 27 | "JJ": {POS: ADJ, "Degree": "pos"}, 28 | "JJR": {POS: ADJ, "Degree": "comp"}, 29 | "JJS": {POS: ADJ, "Degree": "sup"}, 30 | "LS": {POS: PUNCT, "NumType": "ord"}, 31 | "MD": {POS: VERB, "VerbType": "mod"}, 32 | "NIL": {POS: ""}, 33 | "NN": {POS: NOUN, "Number": "sing"}, 34 | "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, 35 | "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, 36 | "NNS": {POS: NOUN, "Number": "plur"}, 37 | "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, 38 | "POS": {POS: PART, "Poss": "yes"}, 39 | "PRP": {POS: PRON, "PronType": "prs"}, 40 | "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, 41 | "RB": {POS: ADV, "Degree": "pos"}, 42 | "RBR": {POS: ADV, "Degree": "comp"}, 43 | "RBS": {POS: ADV, "Degree": "sup"}, 44 | "RP": {POS: PART}, 45 | "SP": {POS: SPACE}, 46 | "SYM": {POS: SYM}, 47 | "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, 48 | "UH": {POS: INTJ}, 49 | "VB": {POS: VERB, "VerbForm": "inf"}, 50 | "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, 51 | "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, 52 | "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, 53 | "VBP": {POS: VERB, 
"VerbForm": "fin", "Tense": "pres"}, 54 | "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3}, 55 | "WDT": {POS: ADJ, "PronType": "int|rel"}, 56 | "WP": {POS: NOUN, "PronType": "int|rel"}, 57 | "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, 58 | "WRB": {POS: ADV, "PronType": "int|rel"}, 59 | "ADD": {POS: X}, 60 | "NFP": {POS: PUNCT}, 61 | "GW": {POS: X}, 62 | "XX": {POS: X}, 63 | "BES": {POS: VERB}, 64 | "HVS": {POS: VERB}, 65 | "_SP": {POS: SPACE}, 66 | } 67 | -------------------------------------------------------------------------------- /train_intent_parser_cn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | """Using the parser to recognise your own semantics 4 | 5 | spaCy's parser component can be used to trained to predict any type of tree 6 | structure over your input text. You can also predict trees over whole documents 7 | or chat logs, with connections between the sentence-roots used to annotate 8 | discourse structure. In this example, we'll build a message parser for a common 9 | "chat intent": finding local businesses. Our message semantics will have the 10 | following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION. 11 | 12 | "show me the best hotel in berlin" 13 | ('show', 'ROOT', 'show') 14 | ('best', 'QUALITY', 'hotel') --> hotel with QUALITY best 15 | ('hotel', 'PLACE', 'show') --> show PLACE hotel 16 | ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin 17 | 18 | Compatible with: spaCy v2.0.0+ 19 | """ 20 | from __future__ import unicode_literals, print_function 21 | 22 | import plac 23 | import random 24 | import spacy 25 | from pathlib import Path 26 | 27 | 28 | # training data: texts, heads and dependency labels 29 | # for no relation, we simply chose an arbitrary dependency label, e.g. '-' 30 | TRAIN_DATA = [ 31 | ("找无线质量好的咖啡厅", { 32 | 'heads': [0, 5, 1, 2, 5, 0, 333, 333, 333, 333], # index of token head 33 | 'deps': ['ROOT', 'ATTRIBUTE', 'ATTRIBUTE', 'QUALITY', '-', 'PLACE', '-', '-', '-', '-'] 34 | }), 35 | ("找一个靠近海边的酒店", { 36 | 'heads': [0, 5, 3, 5, 5, 0, 333, 333, 333, 333], 37 | 'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', '-', 'PLACE', '-', '-', '-', '-'] 38 | }), 39 | ("给我找一个最近的关门晚的健身房", { 40 | 'heads': [2, 2, 2, 9, 9, 9, 7, 9, 9, 2, 333, 333, 333, 333, 333], 41 | 'deps': ['-', '-', 'ROOT', '-', 'QUALITY', '-', 'TIME', 'ATTRIBUTE', '-', 'PLACE', '-', '-', '-', '-', '-'] 42 | }), 43 | ("告诉我最便宜的卖花的商店", { 44 | 'heads': [0, 0, 3, 7, 7, 7, 7, 0, 333, 333, 333, 333], # attach "flowers" to store! 45 | 'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', '-', 'PRODUCT', '-', 'PLACE', '-', '-', '-', '-'] 46 | }), 47 | ("找一个在伦敦的好餐厅", { 48 | 'heads': [0, 6, 3, 6, 6, 6, 0, 333, 333, 333], 49 | 'deps': ['ROOT', '-', '-', 'LOCATION', '-', 'QUALITY', 'PLACE', '-', '-', '-'] 50 | }), 51 | ("告诉我在柏林最酷的旅社", { 52 | 'heads': [0, 0, 3, 6, 6, 6, 0, 333, 333, 333, 333], 53 | 'deps': ['ROOT', '-', '-', 'LOCATION', 'QUALITY', '-', 'PLACE', '-', '-', '-', '-'] 54 | }), 55 | ("找一个上班近的好的意大利餐厅", { 56 | 'heads': [0, 8, 3, 8, 8, 8, 8, 8, 0, 333, 333, 333, 333, 333], 57 | 'deps': ['ROOT', '-', 'LOCATION', 'ATTRIBUTE', '-', 'QUALITY', '-', 'ATTRIBUTE', 'PLACE', '-', '-', '-', '-', '-'] 58 | }) 59 | ] 60 | 61 | 62 | @plac.annotations( 63 | model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), 64 | output_dir=("Optional output directory", "option", "o", Path), 65 | n_iter=("Number of training iterations", "option", "n", int)) 66 | def main(model=None, output_dir=None, n_iter=5): 67 | """Load the model, set up the pipeline and train the parser.""" 68 | if model is not None: 69 | nlp = spacy.load(model) # load existing spaCy model 70 | print("Loaded model '%s'" % model) 71 | else: 72 | nlp = spacy.blank('en') # create blank Language class 73 | print("Created blank 'en' model") 74 | 75 | # We'll use the built-in dependency parser class, but we want to create a 76 | # fresh instance – just in case. 77 | if 'parser' in nlp.pipe_names: 78 | nlp.remove_pipe('parser') 79 | parser = nlp.create_pipe('parser') 80 | nlp.add_pipe(parser, first=True) 81 | 82 | for text, annotations in TRAIN_DATA: 83 | for dep in annotations.get('deps', []): 84 | parser.add_label(dep) 85 | 86 | other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] 87 | with nlp.disable_pipes(*other_pipes): # only train parser 88 | optimizer = nlp.begin_training() 89 | for itn in range(n_iter): 90 | random.shuffle(TRAIN_DATA) 91 | losses = {} 92 | for text, annotations in TRAIN_DATA: 93 | nlp.update([text], [annotations], sgd=optimizer, losses=losses) 94 | print(losses) 95 | 96 | # test the trained model 97 | test_model(nlp) 98 | 99 | # save model to output directory 100 | if output_dir is not None: 101 | output_dir = Path(output_dir) 102 | if not output_dir.exists(): 103 | output_dir.mkdir() 104 | nlp.to_disk(output_dir) 105 | print("Saved model to", output_dir) 106 | 107 | # test the saved model 108 | print("Loading from", output_dir) 109 | nlp2 = spacy.load(output_dir) 110 | test_model(nlp2) 111 | 112 | 113 | def test_model(nlp): 114 | texts = ["找一个上班近的好的意大利餐厅"] 115 | docs = nlp.pipe(texts) 116 | for doc in docs: 117 | print(doc.text) 118 | print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-']) 119 | 120 | 121 | if __name__ == '__main__': 122 | plac.call(main) 123 | 124 | # Expected output: 125 | # find a hotel with good wifi 126 | # [ 127 | # ('find', 'ROOT', 'find'), 128 | # ('hotel', 'PLACE', 'find'), 129 | # ('good', 'QUALITY', 'wifi'), 130 | # ('wifi', 'ATTRIBUTE', 'hotel') 131 | # ] 132 | # find me the cheapest gym near work 133 | # [ 134 | # ('find', 'ROOT', 'find'), 135 | # ('cheapest', 'QUALITY', 'gym'), 136 | # ('gym', 'PLACE', 'find') 137 | # ('work', 'LOCATION', 'near') 138 | # ] 139 | # show me the best hotel in berlin 140 | # [ 141 | # ('show', 'ROOT', 'show'), 142 | # ('best', 'QUALITY', 'hotel'), 143 | # ('hotel', 'PLACE', 'show'), 144 | # ('berlin', 'LOCATION', 'hotel') 145 | # ] 146 | -------------------------------------------------------------------------------- /train_ner_cn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf8 3 | """Example of training spaCy's named entity recognizer, starting off with an 4 | existing model or a blank model. 
5 | 6 | For more details, see the documentation: 7 | * Training: https://spacy.io/usage/training 8 | * NER: https://spacy.io/usage/linguistic-features#named-entities 9 | 10 | Compatible with: spaCy v2.0.0+ 11 | """ 12 | from __future__ import unicode_literals, print_function 13 | 14 | import plac 15 | import random 16 | from pathlib import Path 17 | import spacy 18 | 19 | 20 | # training data 21 | TRAIN_DATA = [ 22 | ('到底谁是张三?', { 23 | 'entities': [(4, 6, 'PERSON')] 24 | }), 25 | ('我非常喜欢伦敦和柏林.', { 26 | 'entities': [(5, 7, 'LOC'), (8, 10, 'LOC')] 27 | }) 28 | ] 29 | 30 | 31 | @plac.annotations( 32 | model=("Model name. Defaults to blank 'en' model.", "option", "m", str), 33 | output_dir=("Optional output directory", "option", "o", Path), 34 | n_iter=("Number of training iterations", "option", "n", int)) 35 | def main(model=None, output_dir=None, n_iter=100): 36 | """Load the model, set up the pipeline and train the entity recognizer.""" 37 | if model is not None: 38 | nlp = spacy.load(model) # load existing spaCy model 39 | print("Loaded model '%s'" % model) 40 | else: 41 | nlp = spacy.blank('en') # create blank Language class 42 | print("Created blank 'en' model") 43 | 44 | # create the built-in pipeline components and add them to the pipeline 45 | # nlp.create_pipe works for built-ins that are registered with spaCy 46 | if 'ner' not in nlp.pipe_names: 47 | ner = nlp.create_pipe('ner') 48 | nlp.add_pipe(ner, last=True) 49 | # otherwise, get it so we can add labels 50 | else: 51 | ner = nlp.get_pipe('ner') 52 | 53 | # add labels 54 | for _, annotations in TRAIN_DATA: 55 | for ent in annotations.get('entities'): 56 | ner.add_label(ent[2]) 57 | 58 | # get names of other pipes to disable them during training 59 | other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] 60 | with nlp.disable_pipes(*other_pipes): # only train NER 61 | optimizer = nlp.begin_training() 62 | for itn in range(n_iter): 63 | random.shuffle(TRAIN_DATA) 64 | losses = {} 65 | for text, annotations in TRAIN_DATA: 66 | nlp.update( 67 | [text], # batch of texts 68 | [annotations], # batch of annotations 69 | drop=0.5, # dropout - make it harder to memorise data 70 | sgd=optimizer, # callable to update weights 71 | losses=losses) 72 | print(losses) 73 | 74 | # test the trained model 75 | for text, _ in TRAIN_DATA: 76 | doc = nlp(text) 77 | print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) 78 | print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) 79 | 80 | # save model to output directory 81 | if output_dir is not None: 82 | output_dir = Path(output_dir) 83 | if not output_dir.exists(): 84 | output_dir.mkdir() 85 | nlp.to_disk(output_dir) 86 | print("Saved model to", output_dir) 87 | 88 | # test the saved model 89 | print("Loading from", output_dir) 90 | nlp2 = spacy.load(output_dir) 91 | for text, _ in TRAIN_DATA: 92 | doc = nlp2(text) 93 | print('Entities', [(ent.text, ent.label_) for ent in doc.ents]) 94 | print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc]) 95 | 96 | 97 | if __name__ == '__main__': 98 | plac.call(main) 99 | 100 | # Expected output: 101 | # Entities [('Shaka Khan', 'PERSON')] 102 | # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3), 103 | # ('Khan', 'PERSON', 1), ('?', '', 2)] 104 | # Entities [('London', 'LOC'), ('Berlin', 'LOC')] 105 | # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3), 106 | # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)] 107 | -------------------------------------------------------------------------------- 
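A note on the NER training data in train_ner_cn.py above: the 'entities' annotations are character offsets into the raw text, and they only contribute to training if they line up exactly with the token boundaries produced by the jieba-based Chinese tokenizer. The snippet below is a minimal alignment check, not part of the repo's scripts; it assumes spaCy v2.x and jieba are installed and that this repo's spacy/lang/zh package is importable.

```python
# coding: utf8
# Minimal alignment check (sketch, not part of the original scripts). Assumes
# spaCy v2.x, jieba, and this repo's spacy/lang/zh package are importable; the
# sentences and offsets are the ones from TRAIN_DATA in train_ner_cn.py.
from __future__ import unicode_literals, print_function

import spacy
from spacy.gold import biluo_tags_from_offsets

nlp = spacy.blank('zh')  # picks up the jieba-based Chinese.make_doc() defined above

TRAIN_DATA = [
    ('到底谁是张三?', {'entities': [(4, 6, 'PERSON')]}),
    ('我非常喜欢伦敦和柏林.', {'entities': [(5, 7, 'LOC'), (8, 10, 'LOC')]}),
]

for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    tags = biluo_tags_from_offsets(doc, annotations['entities'])
    # A '-' tag means the character span does not line up with jieba's
    # segmentation, so that entity would be ignored (or add noise) in nlp.update().
    print([t.text for t in doc], tags)
```

If a span comes back as '-' tags, it is usually easier to adjust the character offsets (or pick a different example sentence) than to fight the segmenter.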
/train_new_entity_type_cn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf8 3 | """Example of training an additional entity type 4 | 5 | This script shows how to add a new entity type to an existing pre-trained NER 6 | model. To keep the example short and simple, only four sentences are provided 7 | as examples. In practice, you'll need many more — a few hundred would be a 8 | good start. You will also likely need to mix in examples of other entity 9 | types, which might be obtained by running the entity recognizer over unlabelled 10 | sentences, and adding their annotations to the training set. 11 | 12 | The actual training is performed by looping over the examples, and calling 13 | `nlp.entity.update()`. The `update()` method steps through the words of the 14 | input. At each word, it makes a prediction. It then consults the annotations 15 | provided on the GoldParse instance, to see whether it was right. If it was 16 | wrong, it adjusts its weights so that the correct action will score higher 17 | next time. 18 | 19 | After training your model, you can save it to a directory. We recommend 20 | wrapping models as Python packages, for ease of deployment. 21 | 22 | For more details, see the documentation: 23 | * Training: https://spacy.io/usage/training 24 | * NER: https://spacy.io/usage/linguistic-features#named-entities 25 | 26 | Compatible with: spaCy v2.0.0+ 27 | """ 28 | from __future__ import unicode_literals, print_function 29 | 30 | import plac 31 | import random 32 | from pathlib import Path 33 | import spacy 34 | 35 | 36 | # new entity label 37 | LABEL = 'ANIMAL' 38 | 39 | # training data 40 | # Note: If you're using an existing model, make sure to mix in examples of 41 | # other entity types that spaCy correctly recognized before. Otherwise, your 42 | # model might learn the new type, but "forget" what it previously knew. 43 | # https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting 44 | TRAIN_DATA = [ 45 | ("马是又高大又善良的动物,是人类的好伙伴。", { 46 | 'entities': [(0, 1, 'ANIMAL')] 47 | }), 48 | 49 | ("它们咬人不?", { 50 | 'entities': [] 51 | }), 52 | 53 | ("很多人都很喜爱马。", { 54 | 'entities': [(7, 8, 'ANIMAL')] 55 | }), 56 | 57 | ("人善人欺,马善人骑。", { 58 | 'entities': [(5, 6, 'ANIMAL')] 59 | }), 60 | 61 | ("蒙古有一种马的品种,个子很矮。", { 62 | 'entities': [(5, 6, 'ANIMAL')] 63 | }), 64 | 65 | ("马?", { 66 | 'entities': [(0, 1, 'ANIMAL')] 67 | }) 68 | ] 69 | 70 | 71 | @plac.annotations( 72 | model=("Model name. 
Defaults to blank 'en' model.", "option", "m", str), 73 | new_model_name=("New model name for model meta.", "option", "nm", str), 74 | output_dir=("Optional output directory", "option", "o", Path), 75 | n_iter=("Number of training iterations", "option", "n", int)) 76 | def main(model=None, new_model_name='animal', output_dir=None, n_iter=20): 77 | """Set up the pipeline and entity recognizer, and train the new entity.""" 78 | if model is not None: 79 | nlp = spacy.load(model) # load existing spaCy model 80 | print("Loaded model '%s'" % model) 81 | else: 82 | nlp = spacy.blank('en') # create blank Language class 83 | print("Created blank 'en' model") 84 | # Add entity recognizer to model if it's not in the pipeline 85 | # nlp.create_pipe works for built-ins that are registered with spaCy 86 | if 'ner' not in nlp.pipe_names: 87 | ner = nlp.create_pipe('ner') 88 | nlp.add_pipe(ner) 89 | # otherwise, get it, so we can add labels to it 90 | else: 91 | ner = nlp.get_pipe('ner') 92 | 93 | ner.add_label(LABEL) # add new entity label to entity recognizer 94 | if model is None: 95 | optimizer = nlp.begin_training() 96 | else: 97 | # Note that 'begin_training' initializes the models, so it'll zero out 98 | # existing entity types. 99 | optimizer = nlp.entity.create_optimizer() 100 | 101 | 102 | 103 | # get names of other pipes to disable them during training 104 | other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner'] 105 | with nlp.disable_pipes(*other_pipes): # only train NER 106 | for itn in range(n_iter): 107 | random.shuffle(TRAIN_DATA) 108 | losses = {} 109 | for text, annotations in TRAIN_DATA: 110 | nlp.update([text], [annotations], sgd=optimizer, drop=0.35, 111 | losses=losses) 112 | print(losses) 113 | 114 | # test the trained model 115 | test_text = '您喜欢马吗?' 116 | doc = nlp(test_text) 117 | print("Entities in '%s'" % test_text) 118 | for ent in doc.ents: 119 | print(ent.label_, ent.text) 120 | 121 | # save model to output directory 122 | if output_dir is not None: 123 | output_dir = Path(output_dir) 124 | if not output_dir.exists(): 125 | output_dir.mkdir() 126 | nlp.meta['name'] = new_model_name # rename model 127 | nlp.to_disk(output_dir) 128 | print("Saved model to", output_dir) 129 | 130 | # test the saved model 131 | print("Loading from", output_dir) 132 | nlp2 = spacy.load(output_dir) 133 | doc2 = nlp2(test_text) 134 | for ent in doc2.ents: 135 | print(ent.label_, ent.text) 136 | 137 | 138 | if __name__ == '__main__': 139 | plac.call(main) 140 | -------------------------------------------------------------------------------- /train_parser_cn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf8 3 | """Example of training spaCy dependency parser, starting off with an existing 4 | model or a blank model. 
For more details, see the documentation: 5 | * Training: https://spacy.io/usage/training 6 | * Dependency Parse: https://spacy.io/usage/linguistic-features#dependency-parse 7 | 8 | Compatible with: spaCy v2.0.0+ 9 | """ 10 | from __future__ import unicode_literals, print_function 11 | 12 | import plac 13 | import random 14 | from pathlib import Path 15 | import spacy 16 | 17 | 18 | # training data 19 | TRAIN_DATA = [ 20 | ("他们进行抵押贷款交易。", { 21 | 'heads': [1, 1, 3, 4, 1, 1, 333, 333, 333, 333, 333], 22 | 'deps': ['nsubj', 'ROOT', 'compound', 'nmod', 'dobj', 'punct', 'depdep', 'dep', 'dep', 'dep', 'dep'] 23 | }), 24 | ("我喜欢伦敦和柏林。", { 25 | 'heads': [1, 1, 1, 2, 2, 1, 333, 333, 333], 26 | 'deps': ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct', 'dep', 'dep', 'dep'] 27 | }), 28 | ("你在找些什么?", { 29 | 'heads': [2, 2, 2, 2, 2, 333, 333], 30 | 'deps': ['nsubj', 'advmod', 'ROOT', 'obj', 'punct', 'dep', 'dep'] 31 | }), 32 | ("我喜欢北京的秋天。", { 33 | 'heads': [1, 1, 3, 4, 1, 1, 333, 333, 333], 34 | 'deps': ['nsubj', 'ROOT', 'nmod', 'case', 'dobj', 'punct', 'dep', 'dep', 'dep'] 35 | }) 36 | ] 37 | 38 | @plac.annotations( 39 | model=("Model name. Defaults to blank 'en' model.", "option", "m", str), 40 | output_dir=("Optional output directory", "option", "o", Path), 41 | n_iter=("Number of training iterations", "option", "n", int)) 42 | def main(model=None, output_dir=None, n_iter=10): 43 | """Load the model, set up the pipeline and train the parser.""" 44 | if model is not None: 45 | nlp = spacy.load(model) # load existing spaCy model 46 | print("Loaded model '%s'" % model) 47 | else: 48 | nlp = spacy.blank('en') # create blank Language class 49 | print("Created blank 'en' model") 50 | 51 | # add the parser to the pipeline if it doesn't exist 52 | # nlp.create_pipe works for built-ins that are registered with spaCy 53 | if 'parser' not in nlp.pipe_names: 54 | parser = nlp.create_pipe('parser') 55 | nlp.add_pipe(parser, first=True) 56 | # otherwise, get it, so we can add labels to it 57 | else: 58 | parser = nlp.get_pipe('parser') 59 | 60 | # add labels to the parser 61 | for _, annotations in TRAIN_DATA: 62 | for dep in annotations.get('deps', []): 63 | parser.add_label(dep) 64 | 65 | # get names of other pipes to disable them during training 66 | other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser'] 67 | with nlp.disable_pipes(*other_pipes): # only train parser 68 | optimizer = nlp.begin_training() 69 | for itn in range(n_iter): 70 | random.shuffle(TRAIN_DATA) 71 | losses = {} 72 | for text, annotations in TRAIN_DATA: 73 | nlp.update([text], [annotations], sgd=optimizer, losses=losses) 74 | print(losses) 75 | 76 | # test the trained model 77 | test_text = "我喜欢北京的秋天。" 78 | doc = nlp(test_text) 79 | print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) 80 | 81 | # save model to output directory 82 | if output_dir is not None: 83 | output_dir = Path(output_dir) 84 | if not output_dir.exists(): 85 | output_dir.mkdir() 86 | nlp.to_disk(output_dir) 87 | print("Saved model to", output_dir) 88 | 89 | # test the saved model 90 | print("Loading from", output_dir) 91 | nlp2 = spacy.load(output_dir) 92 | doc = nlp2(test_text) 93 | print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc]) 94 | 95 | 96 | if __name__ == '__main__': 97 | plac.call(main) 98 | 99 | # expected result: 100 | # [ 101 | # ('I', 'nsubj', 'like'), 102 | # ('like', 'ROOT', 'like'), 103 | # ('securities', 'dobj', 'like'), 104 | # ('.', 'punct', 'like') 105 | # ] 106 | 
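Note that train_parser_cn.py pads each example's 'heads' and 'deps' with extra placeholder entries (the 333 head values and trailing 'dep' labels). Whatever padding scheme is used, the two lists should stay the same length for every sentence. A quick pre-flight check is sketched below, reusing one example from the script above; it is illustrative only and not part of the script itself:

# hypothetical pre-flight check for the parser training data (not part of the repo)
TRAIN_DATA = [
    ("我喜欢北京的秋天。", {
        'heads': [1, 1, 3, 4, 1, 1, 333, 333, 333],
        'deps': ['nsubj', 'ROOT', 'nmod', 'case', 'dobj', 'punct', 'dep', 'dep', 'dep'],
    }),
]

for text, ann in TRAIN_DATA:
    heads, deps = ann['heads'], ann['deps']
    # every head index should have a matching dependency label
    assert len(heads) == len(deps), 'length mismatch in: %s' % text
    print(text, len(heads), 'annotations')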
-------------------------------------------------------------------------------- /train_tagger_cn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf8 3 | """ 4 | A simple example for training a part-of-speech tagger with a custom tag map. 5 | To allow us to update the tag map with our custom one, this example starts off 6 | with a blank Language class and modifies its defaults. For more details, see 7 | the documentation: 8 | * Training: https://spacy.io/usage/training 9 | * POS Tagging: https://spacy.io/usage/linguistic-features#pos-tagging 10 | 11 | Compatible with: spaCy v2.0.0+ 12 | """ 13 | from __future__ import unicode_literals, print_function 14 | 15 | import plac 16 | import random 17 | from pathlib import Path 18 | import spacy 19 | 20 | 21 | # You need to define a mapping from your data's part-of-speech tag names to the 22 | # Universal Part-of-Speech tag set, as spaCy includes an enum of these tags. 23 | # See here for the Universal Tag Set: 24 | # http://universaldependencies.github.io/docs/u/pos/index.html 25 | # You may also specify morphological features for your tags, from the universal 26 | # scheme. 27 | TAG_MAP = { 28 | 'N': {'pos': 'NOUN'}, 29 | 'V': {'pos': 'VERB'}, 30 | 'J': {'pos': 'ADJ'} 31 | } 32 | 33 | # Usually you'll read this in, of course. Data formats vary. Ensure your 34 | # strings are unicode and that the number of tags assigned matches spaCy's 35 | # tokenization. If not, you can always add a 'words' key to the annotations 36 | # that specifies the gold-standard tokenization, e.g.: 37 | # ("Eatblueham", {'words': ['Eat', 'blue', 'ham'] 'tags': ['V', 'J', 'N']}) 38 | TRAIN_DATA = [ 39 | ("我喜欢红苹果", {'tags': ['N', 'V', 'J', 'N']}), 40 | ("吃蓝色汉堡", {'tags': ['V', 'J', 'N']}) 41 | ] 42 | 43 | 44 | @plac.annotations( 45 | lang=("ISO Code of language to use", "option", "l", str), 46 | output_dir=("Optional output directory", "option", "o", Path), 47 | n_iter=("Number of training iterations", "option", "n", int)) 48 | def main(lang='en', output_dir=None, n_iter=25): 49 | """Create a new model, set up the pipeline and train the tagger. In order to 50 | train the tagger with a custom tag map, we're creating a new Language 51 | instance with a custom vocab. 52 | """ 53 | nlp = spacy.blank(lang) 54 | # add the tagger to the pipeline 55 | # nlp.create_pipe works for built-ins that are registered with spaCy 56 | tagger = nlp.create_pipe('tagger') 57 | # Add the tags. This needs to be done before you start training. 
58 | for tag, values in TAG_MAP.items(): 59 | tagger.add_label(tag, values) 60 | nlp.add_pipe(tagger) 61 | 62 | optimizer = nlp.begin_training() 63 | for i in range(n_iter): 64 | random.shuffle(TRAIN_DATA) 65 | losses = {} 66 | for text, annotations in TRAIN_DATA: 67 | nlp.update([text], [annotations], sgd=optimizer, losses=losses) 68 | print(losses) 69 | 70 | # test the trained model 71 | test_text = "我喜欢黑色衬衫" 72 | doc = nlp(test_text) 73 | print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) 74 | 75 | # save model to output directory 76 | if output_dir is not None: 77 | output_dir = Path(output_dir) 78 | if not output_dir.exists(): 79 | output_dir.mkdir() 80 | nlp.to_disk(output_dir) 81 | print("Saved model to", output_dir) 82 | 83 | # test the save model 84 | print("Loading from", output_dir) 85 | nlp2 = spacy.load(output_dir) 86 | doc = nlp2(test_text) 87 | print('Tags', [(t.text, t.tag_, t.pos_) for t in doc]) 88 | 89 | 90 | if __name__ == '__main__': 91 | plac.call(main) 92 | 93 | # Expected output: 94 | # [ 95 | # ('I', 'N', 'NOUN'), 96 | # ('like', 'V', 'VERB'), 97 | # ('blue', 'J', 'ADJ'), 98 | # ('eggs', 'N', 'NOUN') 99 | # ] 100 | -------------------------------------------------------------------------------- /vectors_fast_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf8 3 | """Load vectors for a language trained using fastText 4 | https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md 5 | Compatible with: spaCy v2.0.0+ 6 | """ 7 | from __future__ import unicode_literals 8 | import plac 9 | import numpy 10 | 11 | import spacy 12 | from spacy.language import Language 13 | 14 | 15 | @plac.annotations( 16 | vectors_loc=("Path to .vec file", "positional", None, str), 17 | lang=("Optional language ID. If not set, blank Language() will be used.", 18 | "positional", None, str)) 19 | def main(vectors_loc, lang=None): 20 | if lang is None: 21 | nlp = Language() 22 | else: 23 | # create empty language class – this is required if you're planning to 24 | # save the model to disk and load it back later (models always need a 25 | # "lang" setting). Use 'xx' for blank multi-language class. 
26 | nlp = spacy.blank(lang) 27 | with open(vectors_loc, 'rb') as file_: 28 | header = file_.readline() 29 | nr_row, nr_dim = header.split() 30 | print(nr_row, nr_dim) 31 | 32 | nlp.vocab.reset_vectors(width=int(nr_dim)) 33 | 34 | for line in file_: 35 | line = line.rstrip().decode('utf8') 36 | pieces = line.rsplit(' ', int(nr_dim)) 37 | word = pieces[0] 38 | vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f') 39 | nlp.vocab.set_vector(word, vector) # add the vectors to the vocab 40 | 41 | print(word) 42 | # test the vectors and similarity 43 | # text = '您好' 44 | # doc = nlp(text) 45 | # print(text, doc[0].similarity(doc[1])) 46 | nlp.to_disk("./zh_model") 47 | 48 | 49 | if __name__ == '__main__': 50 | plac.call(main) -------------------------------------------------------------------------------- /zh_model/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "lang":"zh", 3 | "name":"model", 4 | "version":"0.0.0", 5 | "spacy_version":">=2.0.10", 6 | "description":"", 7 | "author":"", 8 | "email":"", 9 | "url":"", 10 | "license":"", 11 | "vectors":{ 12 | "width":0, 13 | "vectors":0, 14 | "keys":0, 15 | "name":"spacy_pretrained_vectors" 16 | }, 17 | "pipeline":[ 18 | "parser", 19 | "tagger" 20 | ] 21 | } -------------------------------------------------------------------------------- /zh_model/ner/cfg: -------------------------------------------------------------------------------- 1 | { 2 | "beam_width":1, 3 | "beam_density":0.0, 4 | "cnn_maxout_pieces":3, 5 | "extra_labels":[ 6 | "PERSON", 7 | "PERSON", 8 | "PERSON", 9 | "PERSON", 10 | "LOC", 11 | "LOC", 12 | "LOC", 13 | "LOC", 14 | "ANIMAL", 15 | "ANIMAL", 16 | "ANIMAL", 17 | "ANIMAL" 18 | ], 19 | "nr_class":9, 20 | "hidden_depth":1, 21 | "token_vector_width":128, 22 | "hidden_width":200, 23 | "maxout_pieces":2, 24 | "pretrained_vectors":"spacy_pretrained_vectors", 25 | "hist_size":0, 26 | "hist_width":0 27 | } -------------------------------------------------------------------------------- /zh_model/ner/lower_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/lower_model -------------------------------------------------------------------------------- /zh_model/ner/moves: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/moves -------------------------------------------------------------------------------- /zh_model/ner/tok2vec_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/tok2vec_model -------------------------------------------------------------------------------- /zh_model/ner/upper_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/ner/upper_model -------------------------------------------------------------------------------- /zh_model/parser/cfg: -------------------------------------------------------------------------------- 1 | { 2 | "beam_width":1, 3 | "beam_density":0.0, 4 | "cnn_maxout_pieces":3, 5 | "extra_labels":[ 6 | 
"ROOT", 7 | "ROOT", 8 | "ROOT", 9 | "ROOT", 10 | "ROOT", 11 | "ATTRIBUTE", 12 | "ATTRIBUTE", 13 | "ATTRIBUTE", 14 | "ATTRIBUTE", 15 | "ATTRIBUTE", 16 | "QUALITY", 17 | "QUALITY", 18 | "QUALITY", 19 | "QUALITY", 20 | "QUALITY", 21 | "-", 22 | "-", 23 | "-", 24 | "-", 25 | "-", 26 | "PLACE", 27 | "PLACE", 28 | "PLACE", 29 | "PLACE", 30 | "PLACE", 31 | "TIME", 32 | "TIME", 33 | "TIME", 34 | "TIME", 35 | "TIME", 36 | "PRODUCT", 37 | "PRODUCT", 38 | "PRODUCT", 39 | "PRODUCT", 40 | "PRODUCT", 41 | "LOCATION", 42 | "LOCATION", 43 | "LOCATION", 44 | "LOCATION", 45 | "LOCATION" 46 | ], 47 | "nr_class":42, 48 | "hidden_depth":1, 49 | "token_vector_width":128, 50 | "hidden_width":200, 51 | "maxout_pieces":2, 52 | "pretrained_vectors":null, 53 | "hist_size":0, 54 | "hist_width":0 55 | } -------------------------------------------------------------------------------- /zh_model/parser/lower_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/lower_model -------------------------------------------------------------------------------- /zh_model/parser/moves: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/moves -------------------------------------------------------------------------------- /zh_model/parser/tok2vec_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/tok2vec_model -------------------------------------------------------------------------------- /zh_model/parser/upper_model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/parser/upper_model -------------------------------------------------------------------------------- /zh_model/tagger/cfg: -------------------------------------------------------------------------------- 1 | { 2 | "cnn_maxout_pieces":2, 3 | "pretrained_vectors":null 4 | } -------------------------------------------------------------------------------- /zh_model/tagger/model: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/tagger/model -------------------------------------------------------------------------------- /zh_model/tagger/tag_map: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/tagger/tag_map -------------------------------------------------------------------------------- /zh_model/tokenizer: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/tokenizer -------------------------------------------------------------------------------- /zh_model/vocab/key2row: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/vocab/key2row -------------------------------------------------------------------------------- /zh_model/vocab/lexemes.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/vocab/lexemes.bin -------------------------------------------------------------------------------- /zh_model/vocab/strings.json: -------------------------------------------------------------------------------- 1 | [ 2 | "\"\"", 3 | "#", 4 | "$", 5 | "''", 6 | ",", 7 | "-LRB-", 8 | "-RRB-", 9 | ".", 10 | ":", 11 | "ADD", 12 | "AFX", 13 | "BES", 14 | "CC", 15 | "CD", 16 | "DT", 17 | "EX", 18 | "FW", 19 | "GW", 20 | "HVS", 21 | "HYPH", 22 | "IN", 23 | "JJ", 24 | "JJR", 25 | "JJS", 26 | "LS", 27 | "MD", 28 | "NFP", 29 | "NIL", 30 | "NN", 31 | "NNP", 32 | "NNPS", 33 | "NNS", 34 | "PDT", 35 | "PRP", 36 | "PRP$", 37 | "RB", 38 | "RBR", 39 | "RBS", 40 | "RP", 41 | "SP", 42 | "TO", 43 | "UH", 44 | "VB", 45 | "VBD", 46 | "VBG", 47 | "VBN", 48 | "VBP", 49 | "VBZ", 50 | "WDT", 51 | "WP", 52 | "WP$", 53 | "WRB", 54 | "XX", 55 | "_SP", 56 | "``", 57 | "-PRON-", 58 | "be", 59 | "\t", 60 | "zh", 61 | "\n", 62 | " ", 63 | "\")", 64 | "\"", 65 | "'", 66 | "(*_*)", 67 | "(", 68 | "_*)", 69 | "(-8", 70 | "(-d", 71 | "(-:", 72 | "(-;", 73 | "(-_-)", 74 | "_-)", 75 | "(._.)", 76 | "_.)", 77 | "(:", 78 | "(;", 79 | "(=", 80 | "(>_<)", 81 | "_<)", 82 | "(^_^)", 83 | "_^)", 84 | "(o:", 85 | "(x:", 86 | "(\u00ac_\u00ac)", 87 | "_\u00ac)", 88 | "(\u0ca0_\u0ca0)", 89 | "_\u0ca0)", 90 | "(x_x)", 91 | "(\u256f\u00b0\u25a1\u00b0\uff09\u256f\ufe35\u253b\u2501\u253b", 92 | "\u253b\u2501\u253b", 93 | ")-:", 94 | ")", 95 | "):", 96 | "-_-", 97 | "-", 98 | "-__-", 99 | "__-", 100 | "._.", 101 | "0.0", 102 | "0", 103 | "d.d", 104 | "0.o", 105 | "d.x", 106 | "0_0", 107 | "d_d", 108 | "0_o", 109 | "d_x", 110 | "8)", 111 | "8", 112 | "d)", 113 | "8-)", 114 | "d-)", 115 | "8-D", 116 | "8-d", 117 | "d-X", 118 | "8D", 119 | "8d", 120 | "dX", 121 | ":'(", 122 | ":')", 123 | ":'-(", 124 | "'-(", 125 | ":'-)", 126 | "'-)", 127 | ":(", 128 | ":((", 129 | ":(((", 130 | "(((", 131 | ":()", 132 | ":)", 133 | ":))", 134 | ":)))", 135 | ")))", 136 | ":*", 137 | ":-(", 138 | ":-((", 139 | "-((", 140 | ":-(((", 141 | ":-)", 142 | ":-))", 143 | "-))", 144 | ":-)))", 145 | ":-*", 146 | ":-/", 147 | ":-0", 148 | ":-d", 149 | ":-3", 150 | ":->", 151 | ":-D", 152 | ":-X", 153 | ":-O", 154 | ":-o", 155 | ":-P", 156 | ":-p", 157 | ":-x", 158 | ":-]", 159 | ":-|", 160 | ":-}", 161 | ":/", 162 | ":0", 163 | ":d", 164 | ":1", 165 | ":3", 166 | ":>", 167 | ":D", 168 | ":X", 169 | ":O", 170 | ":o", 171 | ":P", 172 | ":p", 173 | ":x", 174 | ":]", 175 | ":o)", 176 | ":x)", 177 | ":|", 178 | ":}", 179 | ":\u2019(", 180 | ":\u2019)", 181 | ":\u2019-(", 182 | "\u2019-(", 183 | ":\u2019-)", 184 | "\u2019-)", 185 | ";)", 186 | ";", 187 | ";-)", 188 | ";-D", 189 | ";-d", 190 | ";-X", 191 | ";D", 192 | ";d", 193 | ";X", 194 | ";_;", 195 | "<.<", 196 | "<", 197 | "", 207 | "ce>", 208 | "", 209 | "=(", 210 | "=", 211 | "=)", 212 | "=/", 213 | "=3", 214 | "=d", 215 | "=D", 216 | "=X", 217 | "=|", 218 | ">.<", 219 | ">", 220 | ">.>", 221 | ">:(", 222 | ">:o", 223 | ">:x", 224 | "><(((*>", 225 | "(*>", 226 | "@_@", 227 | "@", 228 | "C++", 229 | "c++", 230 | "C", 231 | "X++", 232 | "O.O", 233 | "o.o", 234 | "O", 235 | "X.X", 
236 | "O.o", 237 | "X.x", 238 | "O_O", 239 | "o_o", 240 | "X_X", 241 | "O_o", 242 | "X_x", 243 | "V.V", 244 | "v.v", 245 | "V", 246 | "V_V", 247 | "v_v", 248 | "XD", 249 | "xd", 250 | "XDD", 251 | "xdd", 252 | "XXX", 253 | "[-:", 254 | "[", 255 | "[:", 256 | "\\\")", 257 | "\\", 258 | "\\n", 259 | "\\x", 260 | "\\t", 261 | "^_^", 262 | "^", 263 | "^__^", 264 | "__^", 265 | "^___^", 266 | "a.", 267 | "a", 268 | "x.", 269 | "b.", 270 | "b", 271 | "c.", 272 | "c", 273 | "d.", 274 | "d", 275 | "e.", 276 | "e", 277 | "f.", 278 | "f", 279 | "g.", 280 | "g", 281 | "h.", 282 | "h", 283 | "i.", 284 | "i", 285 | "j.", 286 | "j", 287 | "k.", 288 | "k", 289 | "l.", 290 | "l", 291 | "m.", 292 | "m", 293 | "n.", 294 | "n", 295 | "o.", 296 | "o", 297 | "o.0", 298 | "x.d", 299 | "o.O", 300 | "x.X", 301 | "x.x", 302 | "o_0", 303 | "x_d", 304 | "o_O", 305 | "x_X", 306 | "x_x", 307 | "p.", 308 | "p", 309 | "q.", 310 | "q", 311 | "r.", 312 | "r", 313 | "s.", 314 | "s", 315 | "t.", 316 | "t", 317 | "u.", 318 | "u", 319 | "v.", 320 | "v", 321 | "w.", 322 | "w", 323 | "x", 324 | "xD", 325 | "xX", 326 | "xDD", 327 | "xXX", 328 | "y.", 329 | "y", 330 | "z.", 331 | "z", 332 | "\u00a0", 333 | " ", 334 | "\u00af\\(\u30c4)/\u00af", 335 | "\u00af", 336 | ")/\u00af", 337 | "\u00af\\(x)/\u00af", 338 | "\u00e4.", 339 | "\u00e4", 340 | "\u00f6.", 341 | "\u00f6", 342 | "\u00fc.", 343 | "\u00fc", 344 | "\u0ca0_\u0ca0", 345 | "\u0ca0", 346 | "\u0ca0\ufe35\u0ca0", 347 | "x\ufe35x", 348 | "\u2014", 349 | "--", 350 | "\u2019", 351 | "\u2019\u2019", 352 | "N", 353 | "J", 354 | "\u5403", 355 | "\u84dd\u8272", 356 | "\u84dd", 357 | "xx", 358 | "\u6c49\u5821", 359 | "\u6c49", 360 | "\u6211", 361 | "\u559c\u6b22", 362 | "\u559c", 363 | "\u7ea2", 364 | "\u82f9\u679c", 365 | "\u82f9", 366 | "\u9ed1\u8272", 367 | "\u9ed1", 368 | "\u886c\u886b", 369 | "\u886c", 370 | "ROOT", 371 | "ATTRIBUTE", 372 | "QUALITY", 373 | "PLACE", 374 | "\u627e", 375 | "\u4e00\u4e2a", 376 | "\u4e00", 377 | "\u65e0\u7ebf", 378 | "\u65e0", 379 | "\u8d28\u91cf", 380 | "\u8d28", 381 | "\u597d", 382 | "\u7684", 383 | "\u5496\u5561\u5385", 384 | "\u5496", 385 | "xxx", 386 | "find", 387 | "ind", 388 | "xxxx", 389 | "hotel", 390 | "tel", 391 | "with", 392 | "ith", 393 | "good", 394 | "ood", 395 | "wifi", 396 | "ifi", 397 | "me", 398 | "the", 399 | "cheapest", 400 | "est", 401 | "gym", 402 | "near", 403 | "ear", 404 | "work", 405 | "ork", 406 | "show", 407 | "how", 408 | "best", 409 | "in", 410 | "berlin", 411 | "lin", 412 | "\u5965\u7f8e\u62c9\u5511", 413 | "\u5965", 414 | "\u7f8e\u62c9\u5511", 415 | "\u5bf9", 416 | "\u53cd\u6d41\u6027", 417 | "\u53cd", 418 | "\u98df\u9053\u708e", 419 | "\u98df", 420 | "\u6cbb\u7597", 421 | "\u6cbb", 422 | "\u5177\u6709", 423 | "\u5177", 424 | "\u975e\u5e38\u660e\u663e", 425 | "\u975e", 426 | "\u5e38\u660e\u663e", 427 | "\u6548\u679c", 428 | "\u6548", 429 | "\u6700\u597d", 430 | "\u6700", 431 | "\u4f9b\u5e94\u5546", 432 | "\u4f9b", 433 | "TARGET", 434 | "2012", 435 | "2", 436 | "012", 437 | "dddd", 438 | "\u5e74", 439 | "\u76ee\u6807", 440 | "\u4eca\u5e74", 441 | "\u4eca", 442 | "\u51a0\u519b", 443 | "\u51a0", 444 | "\u6839", 445 | "\u5c5e\u6027", 446 | "\u6839-", 447 | "-\u5c5e\u6027-", 448 | "-\u8d28\u91cf-", 449 | "-\u76ee\u6807-", 450 | "\u9760\u8fd1", 451 | "\u9760", 452 | "\u6d77\u8fb9", 453 | "\u6d77", 454 | "\u9152\u5e97", 455 | "\u9152", 456 | "\u7ed9", 457 | "\u6700\u8fd1", 458 | "\u5173\u95e8", 459 | "\u5173", 460 | "\u665a", 461 | "\u5065\u8eab\u623f", 462 | "\u5065", 463 | "\u544a\u8bc9", 464 | "\u544a", 465 | 
"\u4fbf\u5b9c", 466 | "\u4fbf", 467 | "\u5356\u82b1", 468 | "\u5356", 469 | "\u5546\u5e97", 470 | "\u5546", 471 | "LOCATION", 472 | "\u5728", 473 | "\u4f26\u6566", 474 | "\u4f26", 475 | "\u9910\u5385", 476 | "\u9910", 477 | "\u67cf\u6797", 478 | "\u67cf", 479 | "\u6700\u9177", 480 | "\u65c5\u793e", 481 | "\u65c5", 482 | "\u4e0a\u73ed", 483 | "\u4e0a", 484 | "\u8fd1", 485 | "\u610f\u5927\u5229", 486 | "\u610f" 487 | ] -------------------------------------------------------------------------------- /zh_model/vocab/vectors: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeusgao/spaCy-new-language-test-Chinese/f202c409a56247cebd29165450919e83b412f309/zh_model/vocab/vectors --------------------------------------------------------------------------------