├── .ipynb_checkpoints └── AutoChecker4Chinese-checkpoint.ipynb ├── AutoChecker4Chinese.ipynb ├── AutoChecker4Chinese.pdf ├── Autochecker4Chinese.py ├── cn_dict.txt ├── readme.md ├── readme.pdf ├── result.png ├── token_freq_pos%40350k_jieba.txt ├── token_pinyin%4040k_sogou.txt └── words.dic /.ipynb_checkpoints/AutoChecker4Chinese-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "## Solutions of autochecker for chinese" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "deletable": true, 17 | "editable": true 18 | }, 19 | "source": [ 20 | "### 1. Construct a detecter" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true, 28 | "deletable": true, 29 | "editable": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "# Step1 : construct a dict to detect the misspelled chinese phrase\n", 34 | "# key is the chinese word, value is its corresponding frequency appeared in corpus\n", 35 | "# you can finish this step by collecting corpus from the internet\n", 36 | "# or you can choose a more easy way, load some dicts already created by others" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": true, 44 | "deletable": true, 45 | "editable": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "def construct_dict( file_path ):\n", 50 | " \n", 51 | " word_freq = {}\n", 52 | " with open(file_path, \"r\") as f:\n", 53 | " for line in f:\n", 54 | " info = line.split()\n", 55 | " word = info[0]\n", 56 | " frequency = info[1]\n", 57 | " word_freq[word] = frequency\n", 58 | " \n", 59 | " return word_freq" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": { 66 | "collapsed": true, 67 | "deletable": 
true, 68 | "editable": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "FILE_PATH = \"./token_freq_pos%40350k_jieba.txt\"\n", 73 | "\n", 74 | "phrase_freq = construct_dict( FILE_PATH )" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "collapsed": false, 82 | "deletable": true, 83 | "editable": true 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "\n", 91 | "349045\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "print( type(phrase_freq) )\n", 97 | "print( len(phrase_freq) )" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "deletable": true, 104 | "editable": true 105 | }, 106 | "source": [ 107 | "### 2. Construct an autocorrecter" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": { 114 | "collapsed": true, 115 | "deletable": true, 116 | "editable": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "import pinyin" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 11, 126 | "metadata": { 127 | "collapsed": false, 128 | "deletable": true, 129 | "editable": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# list for chinese words\n", 134 | "# read from the words.dic\n", 135 | "def load_cn_words_dict( file_path ):\n", 136 | " cn_words_dict = \"\"\n", 137 | " with open(file_path, \"r\") as f:\n", 138 | " for word in f:\n", 139 | " cn_words_dict += word.strip().decode(\"utf-8\")\n", 140 | " return cn_words_dict" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 12, 146 | "metadata": { 147 | "collapsed": true, 148 | "deletable": true, 149 | "editable": true 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "# function calculate the edite distance from the chinese phrase \n", 154 | "def edits1(phrase, cn_words_dict):\n", 155 | " \"All edits that are one edit away from `phrase`.\"\n", 156 | " phrase = 
phrase.decode(\"utf-8\")\n", 157 | " splits = [(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)]\n", 158 | " deletes = [L + R[1:] for L, R in splits if R]\n", 159 | " transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]\n", 160 | " replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict]\n", 161 | " inserts = [L + c + R for L, R in splits for c in cn_words_dict]\n", 162 | " return set(deletes + transposes + replaces + inserts)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 13, 168 | "metadata": { 169 | "collapsed": true, 170 | "deletable": true, 171 | "editable": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "# return the phrease exist in phrase_freq\n", 176 | "def known(phrases): return set(phrase for phrase in phrases if phrase.encode(\"utf-8\") in phrase_freq)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 14, 182 | "metadata": { 183 | "collapsed": true, 184 | "deletable": true, 185 | "editable": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# get the candidates phrase of the error phrase\n", 190 | "# we sort the candidates phrase's importance according to their pinyin\n", 191 | "# if the candidate phrase's pinyin exactly matches with the error phrase, we put them into first order\n", 192 | "# if the candidate phrase's first word pinyin matches with the error phrase first word, we put them into second order\n", 193 | "# else we put candidate phrase into the third order\n", 194 | "def get_candidates( error_phrase ):\n", 195 | " \n", 196 | " candidates_1st_order = []\n", 197 | " candidates_2nd_order = []\n", 198 | " candidates_3nd_order = []\n", 199 | " \n", 200 | " error_pinyin = pinyin.get(error_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n", 201 | " cn_words_dict = load_cn_words_dict( \"./cn_dict.txt\" )\n", 202 | " candidate_phrases = list( known(edits1(error_phrase, cn_words_dict)) )\n", 203 | " \n", 204 | " for 
candidate_phrase in candidate_phrases:\n", 205 | " candidate_pinyin = pinyin.get(candidate_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n", 206 | " if candidate_pinyin == error_pinyin:\n", 207 | " candidates_1st_order.append(candidate_phrase)\n", 208 | " elif candidate_pinyin.split(\"/\")[0] == error_pinyin.split(\"/\")[0]:\n", 209 | " candidates_2nd_order.append(candidate_phrase)\n", 210 | " else:\n", 211 | " candidates_3nd_order.append(candidate_phrase)\n", 212 | " \n", 213 | " return candidates_1st_order, candidates_2nd_order, candidates_3nd_order" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 15, 219 | "metadata": { 220 | "collapsed": false, 221 | "deletable": true, 222 | "editable": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "def auto_correct( error_phrase ):\n", 227 | " \n", 228 | " c1_order, c2_order, c3_order = get_candidates(error_phrase)\n", 229 | " # print c1_order, c2_order, c3_order\n", 230 | " if c1_order:\n", 231 | " return max(c1_order, key=phrase_freq.get )\n", 232 | " elif c2_order:\n", 233 | " return max(c2_order, key=phrase_freq.get )\n", 234 | " else:\n", 235 | " return max(c3_order, key=phrase_freq.get )" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 16, 241 | "metadata": { 242 | "collapsed": false, 243 | "deletable": true, 244 | "editable": true 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "呕涂 呕吐\n", 252 | "东方之朱 东方之珠\n", 253 | "沙拢 沙龙\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# test for the auto_correct \n", 259 | "error_phrase_1 = \"呕涂\" # should be \"呕吐\"\n", 260 | "error_phrase_2 = \"东方之朱\" # should be \"东方之珠\"\n", 261 | "error_phrase_3 = \"沙拢\" # should be \"沙龙\"\n", 262 | "\n", 263 | "print error_phrase_1, auto_correct( error_phrase_1 )\n", 264 | "print error_phrase_2, auto_correct( error_phrase_2 )\n", 265 | "print error_phrase_3, auto_correct( error_phrase_3 )" 266 | 
] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": { 271 | "deletable": true, 272 | "editable": true 273 | }, 274 | "source": [ 275 | "### 3. Correct the misspelled phrase in a sentance " 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 17, 281 | "metadata": { 282 | "collapsed": true, 283 | "deletable": true, 284 | "editable": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# step 3 : Tokenization\n", 289 | "# For any given sentence, use jieba do the segmentation\n", 290 | "# Get segment list after segmentation is done\n", 291 | "# check if the remain phrase exists in word_freq dict\n", 292 | "# if not, then it is a misspelled phrase\n", 293 | "# use auto_correct fun to correct the phrase" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 18, 299 | "metadata": { 300 | "collapsed": false, 301 | "deletable": true, 302 | "editable": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "import jieba\n", 307 | "import string\n", 308 | "import re" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 19, 314 | "metadata": { 315 | "collapsed": true, 316 | "deletable": true, 317 | "editable": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "PUNCTUATION_LIST = string.punctuation\n", 322 | "PUNCTUATION_LIST += \"。,?:;{}[]‘“”《》/!%……()\"" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 21, 328 | "metadata": { 329 | "collapsed": true, 330 | "deletable": true, 331 | "editable": true 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "def auto_correct_sentence( error_sentence, verbose=True):\n", 336 | " \n", 337 | " jieba_cut = jieba.cut(err_test.decode(\"utf-8\"), cut_all=False)\n", 338 | " seg_list = \"\\t\".join(jieba_cut).split(\"\\t\")\n", 339 | " \n", 340 | " correct_sentence = \"\"\n", 341 | " \n", 342 | " for phrase in seg_list:\n", 343 | " \n", 344 | " correct_phrase = phrase\n", 345 | " # check if item is a 
punctuation\n", 346 | " if phrase not in PUNCTUATION_LIST.decode(\"utf-8\"):\n", 347 | " # check if the phrase in our dict, if not then it is a misspelled phrase\n", 348 | " if phrase.encode(\"utf-8\") not in phrase_freq.keys():\n", 349 | " correct_phrase = auto_correct(phrase.encode(\"utf-8\"))\n", 350 | " if verbose :\n", 351 | " print phrase, correct_phrase\n", 352 | " \n", 353 | " correct_sentence += correct_phrase\n", 354 | " \n", 355 | " if verbose:\n", 356 | " print correct_sentence\n", 357 | " return correct_sentence" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 23, 363 | "metadata": { 364 | "collapsed": false, 365 | "deletable": true, 366 | "editable": true 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "机七 机器\n", 374 | "领遇 领域\n", 375 | "分知 分枝\n", 376 | "机器学习是人工智能领域最能体现智能的一个分枝!\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "err_sent = '机七学习是人工智能领遇最能体现智能的一个分知!'\n", 382 | "correct_sent = auto_correct_sentence( err_sent )" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 24, 388 | "metadata": { 389 | "collapsed": false, 390 | "deletable": true, 391 | "editable": true 392 | }, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "机器学习是人工智能领域最能体现智能的一个分枝!\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "print correct_sent" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "collapsed": true, 411 | "deletable": true, 412 | "editable": true 413 | }, 414 | "outputs": [], 415 | "source": [] 416 | } 417 | ], 418 | "metadata": { 419 | "kernelspec": { 420 | "display_name": "nlp_interview", 421 | "language": "python", 422 | "name": "nlp_interview" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 2 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | 
"name": "python", 432 | "nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython2", 434 | "version": "2.7.13" 435 | } 436 | }, 437 | "nbformat": 4, 438 | "nbformat_minor": 2 439 | } 440 | -------------------------------------------------------------------------------- /AutoChecker4Chinese.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "## Solutions of autochecker for chinese" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "deletable": true, 17 | "editable": true 18 | }, 19 | "source": [ 20 | "### 1. Construct a detecter" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true, 28 | "deletable": true, 29 | "editable": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "# Step1 : construct a dict to detect the misspelled chinese phrase\n", 34 | "# key is the chinese word, value is its corresponding frequency appeared in corpus\n", 35 | "# you can finish this step by collecting corpus from the internet\n", 36 | "# or you can choose a more easy way, load some dicts already created by others" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "collapsed": true, 44 | "deletable": true, 45 | "editable": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "def construct_dict( file_path ):\n", 50 | " \n", 51 | " word_freq = {}\n", 52 | " with open(file_path, \"r\") as f:\n", 53 | " for line in f:\n", 54 | " info = line.split()\n", 55 | " word = info[0]\n", 56 | " frequency = info[1]\n", 57 | " word_freq[word] = frequency\n", 58 | " \n", 59 | " return word_freq" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "metadata": { 66 | "collapsed": true, 67 | "deletable": true, 68 | "editable": true 69 | }, 70 | 
"outputs": [], 71 | "source": [ 72 | "FILE_PATH = \"./token_freq_pos%40350k_jieba.txt\"\n", 73 | "\n", 74 | "phrase_freq = construct_dict( FILE_PATH )" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "metadata": { 81 | "collapsed": false, 82 | "deletable": true, 83 | "editable": true 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "\n", 91 | "349045\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "print( type(phrase_freq) )\n", 97 | "print( len(phrase_freq) )" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "deletable": true, 104 | "editable": true 105 | }, 106 | "source": [ 107 | "### 2. Construct an autocorrecter" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 5, 113 | "metadata": { 114 | "collapsed": true, 115 | "deletable": true, 116 | "editable": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "import pinyin" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 11, 126 | "metadata": { 127 | "collapsed": false, 128 | "deletable": true, 129 | "editable": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# list for chinese words\n", 134 | "# read from the words.dic\n", 135 | "def load_cn_words_dict( file_path ):\n", 136 | " cn_words_dict = \"\"\n", 137 | " with open(file_path, \"r\") as f:\n", 138 | " for word in f:\n", 139 | " cn_words_dict += word.strip().decode(\"utf-8\")\n", 140 | " return cn_words_dict" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 12, 146 | "metadata": { 147 | "collapsed": true, 148 | "deletable": true, 149 | "editable": true 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "# function calculate the edite distance from the chinese phrase \n", 154 | "def edits1(phrase, cn_words_dict):\n", 155 | " \"All edits that are one edit away from `phrase`.\"\n", 156 | " phrase = phrase.decode(\"utf-8\")\n", 157 | " splits = 
[(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)]\n", 158 | " deletes = [L + R[1:] for L, R in splits if R]\n", 159 | " transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]\n", 160 | " replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict]\n", 161 | " inserts = [L + c + R for L, R in splits for c in cn_words_dict]\n", 162 | " return set(deletes + transposes + replaces + inserts)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 13, 168 | "metadata": { 169 | "collapsed": true, 170 | "deletable": true, 171 | "editable": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "# return the phrease exist in phrase_freq\n", 176 | "def known(phrases): return set(phrase for phrase in phrases if phrase.encode(\"utf-8\") in phrase_freq)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 14, 182 | "metadata": { 183 | "collapsed": true, 184 | "deletable": true, 185 | "editable": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "# get the candidates phrase of the error phrase\n", 190 | "# we sort the candidates phrase's importance according to their pinyin\n", 191 | "# if the candidate phrase's pinyin exactly matches with the error phrase, we put them into first order\n", 192 | "# if the candidate phrase's first word pinyin matches with the error phrase first word, we put them into second order\n", 193 | "# else we put candidate phrase into the third order\n", 194 | "def get_candidates( error_phrase ):\n", 195 | " \n", 196 | " candidates_1st_order = []\n", 197 | " candidates_2nd_order = []\n", 198 | " candidates_3nd_order = []\n", 199 | " \n", 200 | " error_pinyin = pinyin.get(error_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n", 201 | " cn_words_dict = load_cn_words_dict( \"./cn_dict.txt\" )\n", 202 | " candidate_phrases = list( known(edits1(error_phrase, cn_words_dict)) )\n", 203 | " \n", 204 | " for candidate_phrase in candidate_phrases:\n", 205 
| " candidate_pinyin = pinyin.get(candidate_phrase, format=\"strip\", delimiter=\"/\").encode(\"utf-8\")\n", 206 | " if candidate_pinyin == error_pinyin:\n", 207 | " candidates_1st_order.append(candidate_phrase)\n", 208 | " elif candidate_pinyin.split(\"/\")[0] == error_pinyin.split(\"/\")[0]:\n", 209 | " candidates_2nd_order.append(candidate_phrase)\n", 210 | " else:\n", 211 | " candidates_3nd_order.append(candidate_phrase)\n", 212 | " \n", 213 | " return candidates_1st_order, candidates_2nd_order, candidates_3nd_order" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 15, 219 | "metadata": { 220 | "collapsed": false, 221 | "deletable": true, 222 | "editable": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "def auto_correct( error_phrase ):\n", 227 | " \n", 228 | " c1_order, c2_order, c3_order = get_candidates(error_phrase)\n", 229 | " # print c1_order, c2_order, c3_order\n", 230 | " if c1_order:\n", 231 | " return max(c1_order, key=phrase_freq.get )\n", 232 | " elif c2_order:\n", 233 | " return max(c2_order, key=phrase_freq.get )\n", 234 | " else:\n", 235 | " return max(c3_order, key=phrase_freq.get )" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 16, 241 | "metadata": { 242 | "collapsed": false, 243 | "deletable": true, 244 | "editable": true 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "呕涂 呕吐\n", 252 | "东方之朱 东方之珠\n", 253 | "沙拢 沙龙\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# test for the auto_correct \n", 259 | "error_phrase_1 = \"呕涂\" # should be \"呕吐\"\n", 260 | "error_phrase_2 = \"东方之朱\" # should be \"东方之珠\"\n", 261 | "error_phrase_3 = \"沙拢\" # should be \"沙龙\"\n", 262 | "\n", 263 | "print error_phrase_1, auto_correct( error_phrase_1 )\n", 264 | "print error_phrase_2, auto_correct( error_phrase_2 )\n", 265 | "print error_phrase_3, auto_correct( error_phrase_3 )" 266 | ] 267 | }, 268 | { 269 | "cell_type": 
"markdown", 270 | "metadata": { 271 | "deletable": true, 272 | "editable": true 273 | }, 274 | "source": [ 275 | "### 3. Correct the misspelled phrase in a sentence " 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 17, 281 | "metadata": { 282 | "collapsed": true, 283 | "deletable": true, 284 | "editable": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# step 3 : Tokenization\n", 289 | "# For any given sentence, use jieba do the segmentation\n", 290 | "# Get segment list after segmentation is done\n", 291 | "# check if the remain phrase exists in word_freq dict\n", 292 | "# if not, then it is a misspelled phrase\n", 293 | "# use auto_correct fun to correct the phrase" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 18, 299 | "metadata": { 300 | "collapsed": false, 301 | "deletable": true, 302 | "editable": true 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "import jieba\n", 307 | "import string\n", 308 | "import re" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 19, 314 | "metadata": { 315 | "collapsed": true, 316 | "deletable": true, 317 | "editable": true 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "PUNCTUATION_LIST = string.punctuation\n", 322 | "PUNCTUATION_LIST += \"。,?:;{}[]‘“”《》/!%……()\"" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 21, 328 | "metadata": { 329 | "collapsed": true, 330 | "deletable": true, 331 | "editable": true 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "def auto_correct_sentence( error_sentence, verbose=True):\n", 336 | " \n", 337 | " jieba_cut = jieba.cut(err_test.decode(\"utf-8\"), cut_all=False)\n", 338 | " seg_list = \"\\t\".join(jieba_cut).split(\"\\t\")\n", 339 | " \n", 340 | " correct_sentence = \"\"\n", 341 | " \n", 342 | " for phrase in seg_list:\n", 343 | " \n", 344 | " correct_phrase = phrase\n", 345 | " # check if item is a punctuation\n", 346 | " if phrase not in
PUNCTUATION_LIST.decode(\"utf-8\"):\n", 347 | " # check if the phrase in our dict, if not then it is a misspelled phrase\n", 348 | " if phrase.encode(\"utf-8\") not in phrase_freq.keys():\n", 349 | " correct_phrase = auto_correct(phrase.encode(\"utf-8\"))\n", 350 | " if verbose :\n", 351 | " print phrase, correct_phrase\n", 352 | " \n", 353 | " correct_sentence += correct_phrase\n", 354 | " \n", 355 | " if verbose:\n", 356 | " print correct_sentence\n", 357 | " return correct_sentence" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 23, 363 | "metadata": { 364 | "collapsed": false, 365 | "deletable": true, 366 | "editable": true 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "机七 机器\n", 374 | "领遇 领域\n", 375 | "分知 分枝\n", 376 | "机器学习是人工智能领域最能体现智能的一个分枝!\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "err_sent = '机七学习是人工智能领遇最能体现智能的一个分知!'\n", 382 | "correct_sent = auto_correct_sentence( err_sent )" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 24, 388 | "metadata": { 389 | "collapsed": false, 390 | "deletable": true, 391 | "editable": true 392 | }, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "机器学习是人工智能领域最能体现智能的一个分枝!\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "print correct_sent" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "collapsed": true, 411 | "deletable": true, 412 | "editable": true 413 | }, 414 | "outputs": [], 415 | "source": [] 416 | } 417 | ], 418 | "metadata": { 419 | "kernelspec": { 420 | "display_name": "nlp_interview", 421 | "language": "python", 422 | "name": "nlp_interview" 423 | }, 424 | "language_info": { 425 | "codemirror_mode": { 426 | "name": "ipython", 427 | "version": 2 428 | }, 429 | "file_extension": ".py", 430 | "mimetype": "text/x-python", 431 | "name": "python", 432 | 
"nbconvert_exporter": "python", 433 | "pygments_lexer": "ipython2", 434 | "version": "2.7.13" 435 | } 436 | }, 437 | "nbformat": 4, 438 | "nbformat_minor": 2 439 | } 440 | -------------------------------------------------------------------------------- /AutoChecker4Chinese.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Autochecker4Chinese/ca27a2aed69b79cdc639fc088b0a2f942c2a81f5/AutoChecker4Chinese.pdf -------------------------------------------------------------------------------- /Autochecker4Chinese.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding:utf-8 -*- 3 | __author__ = "zpgao" 4 | 5 | import sys 6 | import pinyin 7 | import jieba 8 | import string 9 | import re 10 | 11 | FILE_PATH = "./token_freq_pos%40350k_jieba.txt" 12 | PUNCTUATION_LIST = string.punctuation 13 | PUNCTUATION_LIST += "。,?:;{}[]‘“”《》/!%……()" 14 | 15 | 16 | def construct_dict( file_path ): 17 | 18 | word_freq = {} 19 | with open(file_path, "r") as f: 20 | for line in f: 21 | info = line.split() 22 | word = info[0] 23 | frequency = info[1] 24 | word_freq[word] = frequency 25 | 26 | return word_freq 27 | 28 | 29 | def load_cn_words_dict( file_path ): 30 | cn_words_dict = "" 31 | with open(file_path, "r") as f: 32 | for word in f: 33 | cn_words_dict += word.strip().decode("utf-8") 34 | return cn_words_dict 35 | 36 | 37 | def edits1(phrase, cn_words_dict): 38 | "All edits that are one edit away from `phrase`." 
39 | phrase = phrase.decode("utf-8") 40 | splits = [(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)] 41 | deletes = [L + R[1:] for L, R in splits if R] 42 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] 43 | replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict] 44 | inserts = [L + c + R for L, R in splits for c in cn_words_dict] 45 | return set(deletes + transposes + replaces + inserts) 46 | 47 | def known(phrases): return set(phrase for phrase in phrases if phrase.encode("utf-8") in phrase_freq) 48 | 49 | 50 | def get_candidates( error_phrase ): 51 | 52 | candidates_1st_order = [] 53 | candidates_2nd_order = [] 54 | candidates_3nd_order = [] 55 | 56 | error_pinyin = pinyin.get(error_phrase, format="strip", delimiter="/").encode("utf-8") 57 | cn_words_dict = load_cn_words_dict( "./cn_dict.txt" ) 58 | candidate_phrases = list( known(edits1(error_phrase, cn_words_dict)) ) 59 | 60 | for candidate_phrase in candidate_phrases: 61 | candidate_pinyin = pinyin.get(candidate_phrase, format="strip", delimiter="/").encode("utf-8") 62 | if candidate_pinyin == error_pinyin: 63 | candidates_1st_order.append(candidate_phrase) 64 | elif candidate_pinyin.split("/")[0] == error_pinyin.split("/")[0]: 65 | candidates_2nd_order.append(candidate_phrase) 66 | else: 67 | candidates_3nd_order.append(candidate_phrase) 68 | 69 | return candidates_1st_order, candidates_2nd_order, candidates_3nd_order 70 | 71 | 72 | def auto_correct( error_phrase ): 73 | 74 | c1_order, c2_order, c3_order = get_candidates(error_phrase) 75 | # print c1_order, c2_order, c3_order 76 | if c1_order: 77 | return max(c1_order, key=phrase_freq.get ) 78 | elif c2_order: 79 | return max(c2_order, key=phrase_freq.get ) 80 | else: 81 | return max(c3_order, key=phrase_freq.get ) 82 | 83 | def auto_correct_sentence( error_sentence, verbose=True): 84 | 85 | jieba_cut = jieba.cut( error_sentence.decode("utf-8"), cut_all=False) 86 | seg_list = "\t".join(jieba_cut).split("\t") 
87 | 88 | correct_sentence = "" 89 | 90 | for phrase in seg_list: 91 | 92 | correct_phrase = phrase 93 | # check if item is a punctuation 94 | if phrase not in PUNCTUATION_LIST.decode("utf-8"): 95 | # check if the phrase in our dict, if not then it is a misspelled phrase 96 | if phrase.encode("utf-8") not in phrase_freq.keys(): 97 | correct_phrase = auto_correct(phrase.encode("utf-8")) 98 | if verbose : 99 | print phrase, correct_phrase 100 | 101 | correct_sentence += correct_phrase 102 | 103 | return correct_sentence 104 | 105 | 106 | 107 | phrase_freq = construct_dict( FILE_PATH ) 108 | 109 | def main(): 110 | 111 | err_sent_1 = '机七学习是人工智能领遇最能体现智能的一个分知!' 112 | print "Test case 1:" 113 | correct_sent = auto_correct_sentence( err_sent_1 ) 114 | print "original sentence:" + err_sent_1 + "\n==>\n" + "corrected sentence:" + correct_sent 115 | 116 | err_sent_2 = '杭洲是中国的八大古都之一,因风景锈丽,享有"人间天棠"的美誉!' 117 | print "Test case 2:" 118 | correct_sent = auto_correct_sentence( err_sent_2 ) 119 | print "original sentence:" + err_sent_2 + "\n==>\n" + "corrected sentence:" + correct_sent 120 | 121 | if __name__=="__main__": 122 | reload(sys) 123 | sys.setdefaultencoding('utf-8') 124 | main() 125 | 126 | -------------------------------------------------------------------------------- /cn_dict.txt: -------------------------------------------------------------------------------- 1 | 一 2 | 乙 3 | 二 4 | 十 5 | 丁 6 | 厂 7 | 七 8 | 卜 9 | 人 10 | 入 11 | 八 12 | 九 13 | 几 14 | 儿 15 | 了 16 | 力 17 | 乃 18 | 刀 19 | 又 20 | 三 21 | 于 22 | 干 23 | 亏 24 | 士 25 | 工 26 | 土 27 | 才 28 | 寸 29 | 下 30 | 大 31 | 丈 32 | 与 33 | 万 34 | 上 35 | 小 36 | 口 37 | 巾 38 | 山 39 | 千 40 | 乞 41 | 川 42 | 亿 43 | 个 44 | 勺 45 | 久 46 | 凡 47 | 及 48 | 夕 49 | 丸 50 | 么 51 | 广 52 | 亡 53 | 门 54 | 义 55 | 之 56 | 尸 57 | 弓 58 | 己 59 | 已 60 | 子 61 | 卫 62 | 也 63 | 女 64 | 飞 65 | 刃 66 | 习 67 | 叉 68 | 马 69 | 乡 70 | 丰 71 | 王 72 | 井 73 | 开 74 | 夫 75 | 天 76 | 无 77 | 元 78 | 专 79 | 云 80 | 扎 81 | 艺 82 | 木 83 | 五 84 | 支 85 | 厅 86 | 不 87 | 太 88 | 犬 89 | 区 90 | 
历 91 | 尤 92 | 友 93 | 匹 94 | 车 95 | 巨 96 | 牙 97 | 屯 98 | 比 99 | 互 100 | 切 101 | 瓦 102 | 止 103 | 少 104 | 日 105 | 中 106 | 冈 107 | 贝 108 | 内 109 | 水 110 | 见 111 | 午 112 | 牛 113 | 手 114 | 毛 115 | 气 116 | 升 117 | 长 118 | 仁 119 | 什 120 | 片 121 | 仆 122 | 化 123 | 仇 124 | 币 125 | 仍 126 | 仅 127 | 斤 128 | 爪 129 | 反 130 | 介 131 | 父 132 | 从 133 | 今 134 | 凶 135 | 分 136 | 乏 137 | 公 138 | 仓 139 | 月 140 | 氏 141 | 勿 142 | 欠 143 | 风 144 | 丹 145 | 匀 146 | 乌 147 | 凤 148 | 勾 149 | 文 150 | 六 151 | 方 152 | 火 153 | 为 154 | 斗 155 | 忆 156 | 订 157 | 计 158 | 户 159 | 认 160 | 心 161 | 尺 162 | 引 163 | 丑 164 | 巴 165 | 孔 166 | 队 167 | 办 168 | 以 169 | 允 170 | 予 171 | 劝 172 | 双 173 | 书 174 | 幻 175 | 玉 176 | 刊 177 | 示 178 | 末 179 | 未 180 | 击 181 | 打 182 | 巧 183 | 正 184 | 扑 185 | 扒 186 | 功 187 | 扔 188 | 去 189 | 甘 190 | 世 191 | 古 192 | 节 193 | 本 194 | 术 195 | 可 196 | 丙 197 | 左 198 | 厉 199 | 右 200 | 石 201 | 布 202 | 龙 203 | 平 204 | 灭 205 | 轧 206 | 东 207 | 卡 208 | 北 209 | 占 210 | 业 211 | 旧 212 | 帅 213 | 归 214 | 且 215 | 旦 216 | 目 217 | 叶 218 | 甲 219 | 申 220 | 叮 221 | 电 222 | 号 223 | 田 224 | 由 225 | 史 226 | 只 227 | 央 228 | 兄 229 | 叼 230 | 叫 231 | 另 232 | 叨 233 | 叹 234 | 四 235 | 生 236 | 失 237 | 禾 238 | 丘 239 | 付 240 | 仗 241 | 代 242 | 仙 243 | 们 244 | 仪 245 | 白 246 | 仔 247 | 他 248 | 斥 249 | 瓜 250 | 乎 251 | 丛 252 | 令 253 | 用 254 | 甩 255 | 印 256 | 乐 257 | 句 258 | 匆 259 | 册 260 | 犯 261 | 外 262 | 处 263 | 冬 264 | 鸟 265 | 务 266 | 包 267 | 饥 268 | 主 269 | 市 270 | 立 271 | 闪 272 | 兰 273 | 半 274 | 汁 275 | 汇 276 | 头 277 | 汉 278 | 宁 279 | 穴 280 | 它 281 | 讨 282 | 写 283 | 让 284 | 礼 285 | 训 286 | 必 287 | 议 288 | 讯 289 | 记 290 | 永 291 | 司 292 | 尼 293 | 民 294 | 出 295 | 辽 296 | 奶 297 | 奴 298 | 加 299 | 召 300 | 皮 301 | 边 302 | 发 303 | 孕 304 | 圣 305 | 对 306 | 台 307 | 矛 308 | 纠 309 | 母 310 | 幼 311 | 丝 312 | 式 313 | 刑 314 | 动 315 | 扛 316 | 寺 317 | 吉 318 | 扣 319 | 考 320 | 托 321 | 老 322 | 执 323 | 巩 324 | 圾 325 | 扩 326 | 扫 327 | 地 328 | 扬 329 | 场 330 | 耳 331 | 共 332 | 芒 333 | 亚 334 | 芝 335 | 朽 336 | 朴 337 | 机 338 | 权 339 | 过 340 | 臣 341 | 
再 342 | 协 343 | 西 344 | 压 345 | 厌 346 | 在 347 | 有 348 | 百 349 | 存 350 | 而 351 | 页 352 | 匠 353 | 夸 354 | 夺 355 | 灰 356 | 达 357 | 列 358 | 死 359 | 成 360 | 夹 361 | 轨 362 | 邪 363 | 划 364 | 迈 365 | 毕 366 | 至 367 | 此 368 | 贞 369 | 师 370 | 尘 371 | 尖 372 | 劣 373 | 光 374 | 当 375 | 早 376 | 吐 377 | 吓 378 | 虫 379 | 曲 380 | 团 381 | 同 382 | 吊 383 | 吃 384 | 因 385 | 吸 386 | 吗 387 | 屿 388 | 帆 389 | 岁 390 | 回 391 | 岂 392 | 刚 393 | 则 394 | 肉 395 | 网 396 | 年 397 | 朱 398 | 先 399 | 丢 400 | 舌 401 | 竹 402 | 迁 403 | 乔 404 | 伟 405 | 传 406 | 乒 407 | 乓 408 | 休 409 | 伍 410 | 伏 411 | 优 412 | 伐 413 | 延 414 | 件 415 | 任 416 | 伤 417 | 价 418 | 份 419 | 华 420 | 仰 421 | 仿 422 | 伙 423 | 伪 424 | 自 425 | 血 426 | 向 427 | 似 428 | 后 429 | 行 430 | 舟 431 | 全 432 | 会 433 | 杀 434 | 合 435 | 兆 436 | 企 437 | 众 438 | 爷 439 | 伞 440 | 创 441 | 肌 442 | 朵 443 | 杂 444 | 危 445 | 旬 446 | 旨 447 | 负 448 | 各 449 | 名 450 | 多 451 | 争 452 | 色 453 | 壮 454 | 冲 455 | 冰 456 | 庄 457 | 庆 458 | 亦 459 | 刘 460 | 齐 461 | 交 462 | 次 463 | 衣 464 | 产 465 | 决 466 | 充 467 | 妄 468 | 闭 469 | 问 470 | 闯 471 | 羊 472 | 并 473 | 关 474 | 米 475 | 灯 476 | 州 477 | 汗 478 | 污 479 | 江 480 | 池 481 | 汤 482 | 忙 483 | 兴 484 | 宇 485 | 守 486 | 宅 487 | 字 488 | 安 489 | 讲 490 | 军 491 | 许 492 | 论 493 | 农 494 | 讽 495 | 设 496 | 访 497 | 寻 498 | 那 499 | 迅 500 | 尽 501 | 导 502 | 异 503 | 孙 504 | 阵 505 | 阳 506 | 收 507 | 阶 508 | 阴 509 | 防 510 | 奸 511 | 如 512 | 妇 513 | 好 514 | 她 515 | 妈 516 | 戏 517 | 羽 518 | 观 519 | 欢 520 | 买 521 | 红 522 | 纤 523 | 级 524 | 约 525 | 纪 526 | 驰 527 | 巡 528 | 寿 529 | 弄 530 | 麦 531 | 形 532 | 进 533 | 戒 534 | 吞 535 | 远 536 | 违 537 | 运 538 | 扶 539 | 抚 540 | 坛 541 | 技 542 | 坏 543 | 扰 544 | 拒 545 | 找 546 | 批 547 | 扯 548 | 址 549 | 走 550 | 抄 551 | 坝 552 | 贡 553 | 攻 554 | 赤 555 | 折 556 | 抓 557 | 扮 558 | 抢 559 | 孝 560 | 均 561 | 抛 562 | 投 563 | 坟 564 | 抗 565 | 坑 566 | 坊 567 | 抖 568 | 护 569 | 壳 570 | 志 571 | 扭 572 | 块 573 | 声 574 | 把 575 | 报 576 | 却 577 | 劫 578 | 芽 579 | 花 580 | 芹 581 | 芬 582 | 苍 583 | 芳 584 | 严 585 | 芦 586 | 劳 587 | 克 588 | 苏 589 | 杆 590 | 杠 591 | 
杜 592 | 材 593 | 村 594 | 杏 595 | 极 596 | 李 597 | 杨 598 | 求 599 | 更 600 | 束 601 | 豆 602 | 两 603 | 丽 604 | 医 605 | 辰 606 | 励 607 | 否 608 | 还 609 | 歼 610 | 来 611 | 连 612 | 步 613 | 坚 614 | 旱 615 | 盯 616 | 呈 617 | 时 618 | 吴 619 | 助 620 | 县 621 | 里 622 | 呆 623 | 园 624 | 旷 625 | 围 626 | 呀 627 | 吨 628 | 足 629 | 邮 630 | 男 631 | 困 632 | 吵 633 | 串 634 | 员 635 | 听 636 | 吩 637 | 吹 638 | 呜 639 | 吧 640 | 吼 641 | 别 642 | 岗 643 | 帐 644 | 财 645 | 针 646 | 钉 647 | 告 648 | 我 649 | 乱 650 | 利 651 | 秃 652 | 秀 653 | 私 654 | 每 655 | 兵 656 | 估 657 | 体 658 | 何 659 | 但 660 | 伸 661 | 作 662 | 伯 663 | 伶 664 | 佣 665 | 低 666 | 你 667 | 住 668 | 位 669 | 伴 670 | 身 671 | 皂 672 | 佛 673 | 近 674 | 彻 675 | 役 676 | 返 677 | 余 678 | 希 679 | 坐 680 | 谷 681 | 妥 682 | 含 683 | 邻 684 | 岔 685 | 肝 686 | 肚 687 | 肠 688 | 龟 689 | 免 690 | 狂 691 | 犹 692 | 角 693 | 删 694 | 条 695 | 卵 696 | 岛 697 | 迎 698 | 饭 699 | 饮 700 | 系 701 | 言 702 | 冻 703 | 状 704 | 亩 705 | 况 706 | 床 707 | 库 708 | 疗 709 | 应 710 | 冷 711 | 这 712 | 序 713 | 辛 714 | 弃 715 | 冶 716 | 忘 717 | 闲 718 | 间 719 | 闷 720 | 判 721 | 灶 722 | 灿 723 | 弟 724 | 汪 725 | 沙 726 | 汽 727 | 沃 728 | 泛 729 | 沟 730 | 没 731 | 沈 732 | 沉 733 | 怀 734 | 忧 735 | 快 736 | 完 737 | 宋 738 | 宏 739 | 牢 740 | 究 741 | 穷 742 | 灾 743 | 良 744 | 证 745 | 启 746 | 评 747 | 补 748 | 初 749 | 社 750 | 识 751 | 诉 752 | 诊 753 | 词 754 | 译 755 | 君 756 | 灵 757 | 即 758 | 层 759 | 尿 760 | 尾 761 | 迟 762 | 局 763 | 改 764 | 张 765 | 忌 766 | 际 767 | 陆 768 | 阿 769 | 陈 770 | 阻 771 | 附 772 | 妙 773 | 妖 774 | 妨 775 | 努 776 | 忍 777 | 劲 778 | 鸡 779 | 驱 780 | 纯 781 | 纱 782 | 纳 783 | 纲 784 | 驳 785 | 纵 786 | 纷 787 | 纸 788 | 纹 789 | 纺 790 | 驴 791 | 纽 792 | 奉 793 | 玩 794 | 环 795 | 武 796 | 青 797 | 责 798 | 现 799 | 表 800 | 规 801 | 抹 802 | 拢 803 | 拔 804 | 拣 805 | 担 806 | 坦 807 | 押 808 | 抽 809 | 拐 810 | 拖 811 | 拍 812 | 者 813 | 顶 814 | 拆 815 | 拥 816 | 抵 817 | 拘 818 | 势 819 | 抱 820 | 垃 821 | 拉 822 | 拦 823 | 拌 824 | 幸 825 | 招 826 | 坡 827 | 披 828 | 拨 829 | 择 830 | 抬 831 | 其 832 | 取 833 | 苦 834 | 若 835 | 茂 836 | 苹 837 | 苗 838 | 英 839 | 范 840 | 直 841 | 
茄 842 | 茎 843 | 茅 844 | 林 845 | 枝 846 | 杯 847 | 柜 848 | 析 849 | 板 850 | 松 851 | 枪 852 | 构 853 | 杰 854 | 述 855 | 枕 856 | 丧 857 | 或 858 | 画 859 | 卧 860 | 事 861 | 刺 862 | 枣 863 | 雨 864 | 卖 865 | 矿 866 | 码 867 | 厕 868 | 奔 869 | 奇 870 | 奋 871 | 态 872 | 欧 873 | 垄 874 | 妻 875 | 轰 876 | 顷 877 | 转 878 | 斩 879 | 轮 880 | 软 881 | 到 882 | 非 883 | 叔 884 | 肯 885 | 齿 886 | 些 887 | 虎 888 | 虏 889 | 肾 890 | 贤 891 | 尚 892 | 旺 893 | 具 894 | 果 895 | 味 896 | 昆 897 | 国 898 | 昌 899 | 畅 900 | 明 901 | 易 902 | 昂 903 | 典 904 | 固 905 | 忠 906 | 咐 907 | 呼 908 | 鸣 909 | 咏 910 | 呢 911 | 岸 912 | 岩 913 | 帖 914 | 罗 915 | 帜 916 | 岭 917 | 凯 918 | 败 919 | 贩 920 | 购 921 | 图 922 | 钓 923 | 制 924 | 知 925 | 垂 926 | 牧 927 | 物 928 | 乖 929 | 刮 930 | 秆 931 | 和 932 | 季 933 | 委 934 | 佳 935 | 侍 936 | 供 937 | 使 938 | 例 939 | 版 940 | 侄 941 | 侦 942 | 侧 943 | 凭 944 | 侨 945 | 佩 946 | 货 947 | 依 948 | 的 949 | 迫 950 | 质 951 | 欣 952 | 征 953 | 往 954 | 爬 955 | 彼 956 | 径 957 | 所 958 | 舍 959 | 金 960 | 命 961 | 斧 962 | 爸 963 | 采 964 | 受 965 | 乳 966 | 贪 967 | 念 968 | 贫 969 | 肤 970 | 肺 971 | 肢 972 | 肿 973 | 胀 974 | 朋 975 | 股 976 | 肥 977 | 服 978 | 胁 979 | 周 980 | 昏 981 | 鱼 982 | 兔 983 | 狐 984 | 忽 985 | 狗 986 | 备 987 | 饰 988 | 饱 989 | 饲 990 | 变 991 | 京 992 | 享 993 | 店 994 | 夜 995 | 庙 996 | 府 997 | 底 998 | 剂 999 | 郊 1000 | 废 1001 | 净 1002 | 盲 1003 | 放 1004 | 刻 1005 | 育 1006 | 闸 1007 | 闹 1008 | 郑 1009 | 券 1010 | 卷 1011 | 单 1012 | 炒 1013 | 炊 1014 | 炕 1015 | 炎 1016 | 炉 1017 | 沫 1018 | 浅 1019 | 法 1020 | 泄 1021 | 河 1022 | 沾 1023 | 泪 1024 | 油 1025 | 泊 1026 | 沿 1027 | 泡 1028 | 注 1029 | 泻 1030 | 泳 1031 | 泥 1032 | 沸 1033 | 波 1034 | 泼 1035 | 泽 1036 | 治 1037 | 怖 1038 | 性 1039 | 怕 1040 | 怜 1041 | 怪 1042 | 学 1043 | 宝 1044 | 宗 1045 | 定 1046 | 宜 1047 | 审 1048 | 宙 1049 | 官 1050 | 空 1051 | 帘 1052 | 实 1053 | 试 1054 | 郎 1055 | 诗 1056 | 肩 1057 | 房 1058 | 诚 1059 | 衬 1060 | 衫 1061 | 视 1062 | 话 1063 | 诞 1064 | 询 1065 | 该 1066 | 详 1067 | 建 1068 | 肃 1069 | 录 1070 | 隶 1071 | 居 1072 | 届 1073 | 刷 1074 | 屈 1075 | 弦 1076 | 承 1077 | 孟 1078 | 孤 1079 | 陕 1080 | 降 1081 
| 限 1082 | 妹 1083 | 姑 1084 | 姐 1085 | 姓 1086 | 始 1087 | 驾 1088 | 参 1089 | 艰 1090 | 线 1091 | 练 1092 | 组 1093 | 细 1094 | 驶 1095 | 织 1096 | 终 1097 | 驻 1098 | 驼 1099 | 绍 1100 | 经 1101 | 贯 1102 | 奏 1103 | 春 1104 | 帮 1105 | 珍 1106 | 玻 1107 | 毒 1108 | 型 1109 | 挂 1110 | 封 1111 | 持 1112 | 项 1113 | 垮 1114 | 挎 1115 | 城 1116 | 挠 1117 | 政 1118 | 赴 1119 | 赵 1120 | 挡 1121 | 挺 1122 | 括 1123 | 拴 1124 | 拾 1125 | 挑 1126 | 指 1127 | 垫 1128 | 挣 1129 | 挤 1130 | 拼 1131 | 挖 1132 | 按 1133 | 挥 1134 | 挪 1135 | 某 1136 | 甚 1137 | 革 1138 | 荐 1139 | 巷 1140 | 带 1141 | 草 1142 | 茧 1143 | 茶 1144 | 荒 1145 | 茫 1146 | 荡 1147 | 荣 1148 | 故 1149 | 胡 1150 | 南 1151 | 药 1152 | 标 1153 | 枯 1154 | 柄 1155 | 栋 1156 | 相 1157 | 查 1158 | 柏 1159 | 柳 1160 | 柱 1161 | 柿 1162 | 栏 1163 | 树 1164 | 要 1165 | 咸 1166 | 威 1167 | 歪 1168 | 研 1169 | 砖 1170 | 厘 1171 | 厚 1172 | 砌 1173 | 砍 1174 | 面 1175 | 耐 1176 | 耍 1177 | 牵 1178 | 残 1179 | 殃 1180 | 轻 1181 | 鸦 1182 | 皆 1183 | 背 1184 | 战 1185 | 点 1186 | 临 1187 | 览 1188 | 竖 1189 | 省 1190 | 削 1191 | 尝 1192 | 是 1193 | 盼 1194 | 眨 1195 | 哄 1196 | 显 1197 | 哑 1198 | 冒 1199 | 映 1200 | 星 1201 | 昨 1202 | 畏 1203 | 趴 1204 | 胃 1205 | 贵 1206 | 界 1207 | 虹 1208 | 虾 1209 | 蚁 1210 | 思 1211 | 蚂 1212 | 虽 1213 | 品 1214 | 咽 1215 | 骂 1216 | 哗 1217 | 咱 1218 | 响 1219 | 哈 1220 | 咬 1221 | 咳 1222 | 哪 1223 | 炭 1224 | 峡 1225 | 罚 1226 | 贱 1227 | 贴 1228 | 骨 1229 | 钞 1230 | 钟 1231 | 钢 1232 | 钥 1233 | 钩 1234 | 卸 1235 | 缸 1236 | 拜 1237 | 看 1238 | 矩 1239 | 怎 1240 | 牲 1241 | 选 1242 | 适 1243 | 秒 1244 | 香 1245 | 种 1246 | 秋 1247 | 科 1248 | 重 1249 | 复 1250 | 竿 1251 | 段 1252 | 便 1253 | 俩 1254 | 贷 1255 | 顺 1256 | 修 1257 | 保 1258 | 促 1259 | 侮 1260 | 俭 1261 | 俗 1262 | 俘 1263 | 信 1264 | 皇 1265 | 泉 1266 | 鬼 1267 | 侵 1268 | 追 1269 | 俊 1270 | 盾 1271 | 待 1272 | 律 1273 | 很 1274 | 须 1275 | 叙 1276 | 剑 1277 | 逃 1278 | 食 1279 | 盆 1280 | 胆 1281 | 胜 1282 | 胞 1283 | 胖 1284 | 脉 1285 | 勉 1286 | 狭 1287 | 狮 1288 | 独 1289 | 狡 1290 | 狱 1291 | 狠 1292 | 贸 1293 | 怨 1294 | 急 1295 | 饶 1296 | 蚀 1297 | 饺 1298 | 饼 1299 | 弯 1300 | 将 1301 | 奖 1302 | 哀 1303 | 
亭 1304 | 亮 1305 | 度 1306 | 迹 1307 | 庭 1308 | 疮 1309 | 疯 1310 | 疫 1311 | 疤 1312 | 姿 1313 | 亲 1314 | 音 1315 | 帝 1316 | 施 1317 | 闻 1318 | 阀 1319 | 阁 1320 | 差 1321 | 养 1322 | 美 1323 | 姜 1324 | 叛 1325 | 送 1326 | 类 1327 | 迷 1328 | 前 1329 | 首 1330 | 逆 1331 | 总 1332 | 炼 1333 | 炸 1334 | 炮 1335 | 烂 1336 | 剃 1337 | 洁 1338 | 洪 1339 | 洒 1340 | 浇 1341 | 浊 1342 | 洞 1343 | 测 1344 | 洗 1345 | 活 1346 | 派 1347 | 洽 1348 | 染 1349 | 济 1350 | 洋 1351 | 洲 1352 | 浑 1353 | 浓 1354 | 津 1355 | 恒 1356 | 恢 1357 | 恰 1358 | 恼 1359 | 恨 1360 | 举 1361 | 觉 1362 | 宣 1363 | 室 1364 | 宫 1365 | 宪 1366 | 突 1367 | 穿 1368 | 窃 1369 | 客 1370 | 冠 1371 | 语 1372 | 扁 1373 | 袄 1374 | 祖 1375 | 神 1376 | 祝 1377 | 误 1378 | 诱 1379 | 说 1380 | 诵 1381 | 垦 1382 | 退 1383 | 既 1384 | 屋 1385 | 昼 1386 | 费 1387 | 陡 1388 | 眉 1389 | 孩 1390 | 除 1391 | 险 1392 | 院 1393 | 娃 1394 | 姥 1395 | 姨 1396 | 姻 1397 | 娇 1398 | 怒 1399 | 架 1400 | 贺 1401 | 盈 1402 | 勇 1403 | 怠 1404 | 柔 1405 | 垒 1406 | 绑 1407 | 绒 1408 | 结 1409 | 绕 1410 | 骄 1411 | 绘 1412 | 给 1413 | 络 1414 | 骆 1415 | 绝 1416 | 绞 1417 | 统 1418 | 耕 1419 | 耗 1420 | 艳 1421 | 泰 1422 | 珠 1423 | 班 1424 | 素 1425 | 蚕 1426 | 顽 1427 | 盏 1428 | 匪 1429 | 捞 1430 | 栽 1431 | 捕 1432 | 振 1433 | 载 1434 | 赶 1435 | 起 1436 | 盐 1437 | 捎 1438 | 捏 1439 | 埋 1440 | 捉 1441 | 捆 1442 | 捐 1443 | 损 1444 | 都 1445 | 哲 1446 | 逝 1447 | 捡 1448 | 换 1449 | 挽 1450 | 热 1451 | 恐 1452 | 壶 1453 | 挨 1454 | 耻 1455 | 耽 1456 | 恭 1457 | 莲 1458 | 莫 1459 | 荷 1460 | 获 1461 | 晋 1462 | 恶 1463 | 真 1464 | 框 1465 | 桂 1466 | 档 1467 | 桐 1468 | 株 1469 | 桥 1470 | 桃 1471 | 格 1472 | 校 1473 | 核 1474 | 样 1475 | 根 1476 | 索 1477 | 哥 1478 | 速 1479 | 逗 1480 | 栗 1481 | 配 1482 | 翅 1483 | 辱 1484 | 唇 1485 | 夏 1486 | 础 1487 | 破 1488 | 原 1489 | 套 1490 | 逐 1491 | 烈 1492 | 殊 1493 | 顾 1494 | 轿 1495 | 较 1496 | 顿 1497 | 毙 1498 | 致 1499 | 柴 1500 | 桌 1501 | 虑 1502 | 监 1503 | 紧 1504 | 党 1505 | 晒 1506 | 眠 1507 | 晓 1508 | 鸭 1509 | 晃 1510 | 晌 1511 | 晕 1512 | 蚊 1513 | 哨 1514 | 哭 1515 | 恩 1516 | 唤 1517 | 啊 1518 | 唉 1519 | 罢 1520 | 峰 1521 | 圆 1522 | 贼 1523 | 贿 1524 | 钱 1525 | 钳 
1526 | 钻 1527 | 铁 1528 | 铃 1529 | 铅 1530 | 缺 1531 | 氧 1532 | 特 1533 | 牺 1534 | 造 1535 | 乘 1536 | 敌 1537 | 秤 1538 | 租 1539 | 积 1540 | 秧 1541 | 秩 1542 | 称 1543 | 秘 1544 | 透 1545 | 笔 1546 | 笑 1547 | 笋 1548 | 债 1549 | 借 1550 | 值 1551 | 倚 1552 | 倾 1553 | 倒 1554 | 倘 1555 | 俱 1556 | 倡 1557 | 候 1558 | 俯 1559 | 倍 1560 | 倦 1561 | 健 1562 | 臭 1563 | 射 1564 | 躬 1565 | 息 1566 | 徒 1567 | 徐 1568 | 舰 1569 | 舱 1570 | 般 1571 | 航 1572 | 途 1573 | 拿 1574 | 爹 1575 | 爱 1576 | 颂 1577 | 翁 1578 | 脆 1579 | 脂 1580 | 胸 1581 | 胳 1582 | 脏 1583 | 胶 1584 | 脑 1585 | 狸 1586 | 狼 1587 | 逢 1588 | 留 1589 | 皱 1590 | 饿 1591 | 恋 1592 | 桨 1593 | 浆 1594 | 衰 1595 | 高 1596 | 席 1597 | 准 1598 | 座 1599 | 脊 1600 | 症 1601 | 病 1602 | 疾 1603 | 疼 1604 | 疲 1605 | 效 1606 | 离 1607 | 唐 1608 | 资 1609 | 凉 1610 | 站 1611 | 剖 1612 | 竞 1613 | 部 1614 | 旁 1615 | 旅 1616 | 畜 1617 | 阅 1618 | 羞 1619 | 瓶 1620 | 拳 1621 | 粉 1622 | 料 1623 | 益 1624 | 兼 1625 | 烤 1626 | 烘 1627 | 烦 1628 | 烧 1629 | 烛 1630 | 烟 1631 | 递 1632 | 涛 1633 | 浙 1634 | 涝 1635 | 酒 1636 | 涉 1637 | 消 1638 | 浩 1639 | 海 1640 | 涂 1641 | 浴 1642 | 浮 1643 | 流 1644 | 润 1645 | 浪 1646 | 浸 1647 | 涨 1648 | 烫 1649 | 涌 1650 | 悟 1651 | 悄 1652 | 悔 1653 | 悦 1654 | 害 1655 | 宽 1656 | 家 1657 | 宵 1658 | 宴 1659 | 宾 1660 | 窄 1661 | 容 1662 | 宰 1663 | 案 1664 | 请 1665 | 朗 1666 | 诸 1667 | 读 1668 | 扇 1669 | 袜 1670 | 袖 1671 | 袍 1672 | 被 1673 | 祥 1674 | 课 1675 | 谁 1676 | 调 1677 | 冤 1678 | 谅 1679 | 谈 1680 | 谊 1681 | 剥 1682 | 恳 1683 | 展 1684 | 剧 1685 | 屑 1686 | 弱 1687 | 陵 1688 | 陶 1689 | 陷 1690 | 陪 1691 | 娱 1692 | 娘 1693 | 通 1694 | 能 1695 | 难 1696 | 预 1697 | 桑 1698 | 绢 1699 | 绣 1700 | 验 1701 | 继 1702 | 球 1703 | 理 1704 | 捧 1705 | 堵 1706 | 描 1707 | 域 1708 | 掩 1709 | 捷 1710 | 排 1711 | 掉 1712 | 堆 1713 | 推 1714 | 掀 1715 | 授 1716 | 教 1717 | 掏 1718 | 掠 1719 | 培 1720 | 接 1721 | 控 1722 | 探 1723 | 据 1724 | 掘 1725 | 职 1726 | 基 1727 | 著 1728 | 勒 1729 | 黄 1730 | 萌 1731 | 萝 1732 | 菌 1733 | 菜 1734 | 萄 1735 | 菊 1736 | 萍 1737 | 菠 1738 | 营 1739 | 械 1740 | 梦 1741 | 梢 1742 | 梅 1743 | 检 1744 | 梳 1745 | 梯 1746 | 桶 1747 | 救 
1748 | 副 1749 | 票 1750 | 戚 1751 | 爽 1752 | 聋 1753 | 袭 1754 | 盛 1755 | 雪 1756 | 辅 1757 | 辆 1758 | 虚 1759 | 雀 1760 | 堂 1761 | 常 1762 | 匙 1763 | 晨 1764 | 睁 1765 | 眯 1766 | 眼 1767 | 悬 1768 | 野 1769 | 啦 1770 | 晚 1771 | 啄 1772 | 距 1773 | 跃 1774 | 略 1775 | 蛇 1776 | 累 1777 | 唱 1778 | 患 1779 | 唯 1780 | 崖 1781 | 崭 1782 | 崇 1783 | 圈 1784 | 铜 1785 | 铲 1786 | 银 1787 | 甜 1788 | 梨 1789 | 犁 1790 | 移 1791 | 笨 1792 | 笼 1793 | 笛 1794 | 符 1795 | 第 1796 | 敏 1797 | 做 1798 | 袋 1799 | 悠 1800 | 偿 1801 | 偶 1802 | 偷 1803 | 您 1804 | 售 1805 | 停 1806 | 偏 1807 | 假 1808 | 得 1809 | 衔 1810 | 盘 1811 | 船 1812 | 斜 1813 | 盒 1814 | 鸽 1815 | 悉 1816 | 欲 1817 | 彩 1818 | 领 1819 | 脚 1820 | 脖 1821 | 脸 1822 | 脱 1823 | 象 1824 | 够 1825 | 猜 1826 | 猪 1827 | 猎 1828 | 猫 1829 | 猛 1830 | 馅 1831 | 馆 1832 | 凑 1833 | 减 1834 | 毫 1835 | 麻 1836 | 痒 1837 | 痕 1838 | 廊 1839 | 康 1840 | 庸 1841 | 鹿 1842 | 盗 1843 | 章 1844 | 竟 1845 | 商 1846 | 族 1847 | 旋 1848 | 望 1849 | 率 1850 | 着 1851 | 盖 1852 | 粘 1853 | 粗 1854 | 粒 1855 | 断 1856 | 剪 1857 | 兽 1858 | 清 1859 | 添 1860 | 淋 1861 | 淹 1862 | 渠 1863 | 渐 1864 | 混 1865 | 渔 1866 | 淘 1867 | 液 1868 | 淡 1869 | 深 1870 | 婆 1871 | 梁 1872 | 渗 1873 | 情 1874 | 惜 1875 | 惭 1876 | 悼 1877 | 惧 1878 | 惕 1879 | 惊 1880 | 惨 1881 | 惯 1882 | 寇 1883 | 寄 1884 | 宿 1885 | 窑 1886 | 密 1887 | 谋 1888 | 谎 1889 | 祸 1890 | 谜 1891 | 逮 1892 | 敢 1893 | 屠 1894 | 弹 1895 | 随 1896 | 蛋 1897 | 隆 1898 | 隐 1899 | 婚 1900 | 婶 1901 | 颈 1902 | 绩 1903 | 绪 1904 | 续 1905 | 骑 1906 | 绳 1907 | 维 1908 | 绵 1909 | 绸 1910 | 绿 1911 | 琴 1912 | 斑 1913 | 替 1914 | 款 1915 | 堪 1916 | 搭 1917 | 塔 1918 | 越 1919 | 趁 1920 | 趋 1921 | 超 1922 | 提 1923 | 堤 1924 | 博 1925 | 揭 1926 | 喜 1927 | 插 1928 | 揪 1929 | 搜 1930 | 煮 1931 | 援 1932 | 裁 1933 | 搁 1934 | 搂 1935 | 搅 1936 | 握 1937 | 揉 1938 | 斯 1939 | 期 1940 | 欺 1941 | 联 1942 | 散 1943 | 惹 1944 | 葬 1945 | 葛 1946 | 董 1947 | 葡 1948 | 敬 1949 | 葱 1950 | 落 1951 | 朝 1952 | 辜 1953 | 葵 1954 | 棒 1955 | 棋 1956 | 植 1957 | 森 1958 | 椅 1959 | 椒 1960 | 棵 1961 | 棍 1962 | 棉 1963 | 棚 1964 | 棕 1965 | 惠 1966 | 惑 1967 | 逼 1968 | 厨 1969 | 厦 
1970 | 硬 1971 | 确 1972 | 雁 1973 | 殖 1974 | 裂 1975 | 雄 1976 | 暂 1977 | 雅 1978 | 辈 1979 | 悲 1980 | 紫 1981 | 辉 1982 | 敞 1983 | 赏 1984 | 掌 1985 | 晴 1986 | 暑 1987 | 最 1988 | 量 1989 | 喷 1990 | 晶 1991 | 喇 1992 | 遇 1993 | 喊 1994 | 景 1995 | 践 1996 | 跌 1997 | 跑 1998 | 遗 1999 | 蛙 2000 | 蛛 2001 | 蜓 2002 | 喝 2003 | 喂 2004 | 喘 2005 | 喉 2006 | 幅 2007 | 帽 2008 | 赌 2009 | 赔 2010 | 黑 2011 | 铸 2012 | 铺 2013 | 链 2014 | 销 2015 | 锁 2016 | 锄 2017 | 锅 2018 | 锈 2019 | 锋 2020 | 锐 2021 | 短 2022 | 智 2023 | 毯 2024 | 鹅 2025 | 剩 2026 | 稍 2027 | 程 2028 | 稀 2029 | 税 2030 | 筐 2031 | 等 2032 | 筑 2033 | 策 2034 | 筛 2035 | 筒 2036 | 答 2037 | 筋 2038 | 筝 2039 | 傲 2040 | 傅 2041 | 牌 2042 | 堡 2043 | 集 2044 | 焦 2045 | 傍 2046 | 储 2047 | 奥 2048 | 街 2049 | 惩 2050 | 御 2051 | 循 2052 | 艇 2053 | 舒 2054 | 番 2055 | 释 2056 | 禽 2057 | 腊 2058 | 脾 2059 | 腔 2060 | 鲁 2061 | 猾 2062 | 猴 2063 | 然 2064 | 馋 2065 | 装 2066 | 蛮 2067 | 就 2068 | 痛 2069 | 童 2070 | 阔 2071 | 善 2072 | 羡 2073 | 普 2074 | 粪 2075 | 尊 2076 | 道 2077 | 曾 2078 | 焰 2079 | 港 2080 | 湖 2081 | 渣 2082 | 湿 2083 | 温 2084 | 渴 2085 | 滑 2086 | 湾 2087 | 渡 2088 | 游 2089 | 滋 2090 | 溉 2091 | 愤 2092 | 慌 2093 | 惰 2094 | 愧 2095 | 愉 2096 | 慨 2097 | 割 2098 | 寒 2099 | 富 2100 | 窜 2101 | 窝 2102 | 窗 2103 | 遍 2104 | 裕 2105 | 裤 2106 | 裙 2107 | 谢 2108 | 谣 2109 | 谦 2110 | 属 2111 | 屡 2112 | 强 2113 | 粥 2114 | 疏 2115 | 隔 2116 | 隙 2117 | 絮 2118 | 嫂 2119 | 登 2120 | 缎 2121 | 缓 2122 | 编 2123 | 骗 2124 | 缘 2125 | 瑞 2126 | 魂 2127 | 肆 2128 | 摄 2129 | 摸 2130 | 填 2131 | 搏 2132 | 塌 2133 | 鼓 2134 | 摆 2135 | 携 2136 | 搬 2137 | 摇 2138 | 搞 2139 | 塘 2140 | 摊 2141 | 蒜 2142 | 勤 2143 | 鹊 2144 | 蓝 2145 | 墓 2146 | 幕 2147 | 蓬 2148 | 蓄 2149 | 蒙 2150 | 蒸 2151 | 献 2152 | 禁 2153 | 楚 2154 | 想 2155 | 槐 2156 | 榆 2157 | 楼 2158 | 概 2159 | 赖 2160 | 酬 2161 | 感 2162 | 碍 2163 | 碑 2164 | 碎 2165 | 碰 2166 | 碗 2167 | 碌 2168 | 雷 2169 | 零 2170 | 雾 2171 | 雹 2172 | 输 2173 | 督 2174 | 龄 2175 | 鉴 2176 | 睛 2177 | 睡 2178 | 睬 2179 | 鄙 2180 | 愚 2181 | 暖 2182 | 盟 2183 | 歇 2184 | 暗 2185 | 照 2186 | 跨 2187 | 跳 2188 | 跪 2189 | 路 2190 | 跟 2191 | 遣 
2192 | 蛾 2193 | 蜂 2194 | 嗓 2195 | 置 2196 | 罪 2197 | 罩 2198 | 错 2199 | 锡 2200 | 锣 2201 | 锤 2202 | 锦 2203 | 键 2204 | 锯 2205 | 矮 2206 | 辞 2207 | 稠 2208 | 愁 2209 | 筹 2210 | 签 2211 | 简 2212 | 毁 2213 | 舅 2214 | 鼠 2215 | 催 2216 | 傻 2217 | 像 2218 | 躲 2219 | 微 2220 | 愈 2221 | 遥 2222 | 腰 2223 | 腥 2224 | 腹 2225 | 腾 2226 | 腿 2227 | 触 2228 | 解 2229 | 酱 2230 | 痰 2231 | 廉 2232 | 新 2233 | 韵 2234 | 意 2235 | 粮 2236 | 数 2237 | 煎 2238 | 塑 2239 | 慈 2240 | 煤 2241 | 煌 2242 | 满 2243 | 漠 2244 | 源 2245 | 滤 2246 | 滥 2247 | 滔 2248 | 溪 2249 | 溜 2250 | 滚 2251 | 滨 2252 | 粱 2253 | 滩 2254 | 慎 2255 | 誉 2256 | 塞 2257 | 谨 2258 | 福 2259 | 群 2260 | 殿 2261 | 辟 2262 | 障 2263 | 嫌 2264 | 嫁 2265 | 叠 2266 | 缝 2267 | 缠 2268 | 静 2269 | 碧 2270 | 璃 2271 | 墙 2272 | 撇 2273 | 嘉 2274 | 摧 2275 | 截 2276 | 誓 2277 | 境 2278 | 摘 2279 | 摔 2280 | 聚 2281 | 蔽 2282 | 慕 2283 | 暮 2284 | 蔑 2285 | 模 2286 | 榴 2287 | 榜 2288 | 榨 2289 | 歌 2290 | 遭 2291 | 酷 2292 | 酿 2293 | 酸 2294 | 磁 2295 | 愿 2296 | 需 2297 | 弊 2298 | 裳 2299 | 颗 2300 | 嗽 2301 | 蜻 2302 | 蜡 2303 | 蝇 2304 | 蜘 2305 | 赚 2306 | 锹 2307 | 锻 2308 | 舞 2309 | 稳 2310 | 算 2311 | 箩 2312 | 管 2313 | 僚 2314 | 鼻 2315 | 魄 2316 | 貌 2317 | 膜 2318 | 膊 2319 | 膀 2320 | 鲜 2321 | 疑 2322 | 馒 2323 | 裹 2324 | 敲 2325 | 豪 2326 | 膏 2327 | 遮 2328 | 腐 2329 | 瘦 2330 | 辣 2331 | 竭 2332 | 端 2333 | 旗 2334 | 精 2335 | 歉 2336 | 熄 2337 | 熔 2338 | 漆 2339 | 漂 2340 | 漫 2341 | 滴 2342 | 演 2343 | 漏 2344 | 慢 2345 | 寨 2346 | 赛 2347 | 察 2348 | 蜜 2349 | 谱 2350 | 嫩 2351 | 翠 2352 | 熊 2353 | 凳 2354 | 骡 2355 | 缩 2356 | 慧 2357 | 撕 2358 | 撒 2359 | 趣 2360 | 趟 2361 | 撑 2362 | 播 2363 | 撞 2364 | 撤 2365 | 增 2366 | 聪 2367 | 鞋 2368 | 蕉 2369 | 蔬 2370 | 横 2371 | 槽 2372 | 樱 2373 | 橡 2374 | 飘 2375 | 醋 2376 | 醉 2377 | 震 2378 | 霉 2379 | 瞒 2380 | 题 2381 | 暴 2382 | 瞎 2383 | 影 2384 | 踢 2385 | 踏 2386 | 踩 2387 | 踪 2388 | 蝶 2389 | 蝴 2390 | 嘱 2391 | 墨 2392 | 镇 2393 | 靠 2394 | 稻 2395 | 黎 2396 | 稿 2397 | 稼 2398 | 箱 2399 | 箭 2400 | 篇 2401 | 僵 2402 | 躺 2403 | 僻 2404 | 德 2405 | 艘 2406 | 膝 2407 | 膛 2408 | 熟 2409 | 摩 2410 | 颜 2411 | 毅 2412 | 糊 2413 | 遵 
2414 | 潜 2415 | 潮 2416 | 懂 2417 | 额 2418 | 慰 2419 | 劈 2420 | 操 2421 | 燕 2422 | 薯 2423 | 薪 2424 | 薄 2425 | 颠 2426 | 橘 2427 | 整 2428 | 融 2429 | 醒 2430 | 餐 2431 | 嘴 2432 | 蹄 2433 | 器 2434 | 赠 2435 | 默 2436 | 镜 2437 | 赞 2438 | 篮 2439 | 邀 2440 | 衡 2441 | 膨 2442 | 雕 2443 | 磨 2444 | 凝 2445 | 辨 2446 | 辩 2447 | 糖 2448 | 糕 2449 | 燃 2450 | 澡 2451 | 激 2452 | 懒 2453 | 壁 2454 | 避 2455 | 缴 2456 | 戴 2457 | 擦 2458 | 鞠 2459 | 藏 2460 | 霜 2461 | 霞 2462 | 瞧 2463 | 蹈 2464 | 螺 2465 | 穗 2466 | 繁 2467 | 辫 2468 | 赢 2469 | 糟 2470 | 糠 2471 | 燥 2472 | 臂 2473 | 翼 2474 | 骤 2475 | 鞭 2476 | 覆 2477 | 蹦 2478 | 镰 2479 | 翻 2480 | 鹰 2481 | 警 2482 | 攀 2483 | 蹲 2484 | 颤 2485 | 瓣 2486 | 爆 2487 | 疆 2488 | 壤 2489 | 耀 2490 | 躁 2491 | 嚼 2492 | 嚷 2493 | 籍 2494 | 魔 2495 | 灌 2496 | 蠢 2497 | 霸 2498 | 露 2499 | 囊 2500 | 罐 2501 | 匕 2502 | 刁 2503 | 丐 2504 | 歹 2505 | 戈 2506 | 夭 2507 | 仑 2508 | 讥 2509 | 冗 2510 | 邓 2511 | 艾 2512 | 夯 2513 | 凸 2514 | 卢 2515 | 叭 2516 | 叽 2517 | 皿 2518 | 凹 2519 | 囚 2520 | 矢 2521 | 乍 2522 | 尔 2523 | 冯 2524 | 玄 2525 | 邦 2526 | 迂 2527 | 邢 2528 | 芋 2529 | 芍 2530 | 吏 2531 | 夷 2532 | 吁 2533 | 吕 2534 | 吆 2535 | 屹 2536 | 廷 2537 | 迄 2538 | 臼 2539 | 仲 2540 | 伦 2541 | 伊 2542 | 肋 2543 | 旭 2544 | 匈 2545 | 凫 2546 | 妆 2547 | 亥 2548 | 汛 2549 | 讳 2550 | 讶 2551 | 讹 2552 | 讼 2553 | 诀 2554 | 弛 2555 | 阱 2556 | 驮 2557 | 驯 2558 | 纫 2559 | 玖 2560 | 玛 2561 | 韧 2562 | 抠 2563 | 扼 2564 | 汞 2565 | 扳 2566 | 抡 2567 | 坎 2568 | 坞 2569 | 抑 2570 | 拟 2571 | 抒 2572 | 芙 2573 | 芜 2574 | 苇 2575 | 芥 2576 | 芯 2577 | 芭 2578 | 杖 2579 | 杉 2580 | 巫 2581 | 杈 2582 | 甫 2583 | 匣 2584 | 轩 2585 | 卤 2586 | 肖 2587 | 吱 2588 | 吠 2589 | 呕 2590 | 呐 2591 | 吟 2592 | 呛 2593 | 吻 2594 | 吭 2595 | 邑 2596 | 囤 2597 | 吮 2598 | 岖 2599 | 牡 2600 | 佑 2601 | 佃 2602 | 伺 2603 | 囱 2604 | 肛 2605 | 肘 2606 | 甸 2607 | 狈 2608 | 鸠 2609 | 彤 2610 | 灸 2611 | 刨 2612 | 庇 2613 | 吝 2614 | 庐 2615 | 闰 2616 | 兑 2617 | 灼 2618 | 沐 2619 | 沛 2620 | 汰 2621 | 沥 2622 | 沦 2623 | 汹 2624 | 沧 2625 | 沪 2626 | 忱 2627 | 诅 2628 | 诈 2629 | 罕 2630 | 屁 2631 | 坠 2632 | 妓 2633 | 姊 2634 | 妒 2635 | 纬 
2636 | 玫 2637 | 卦 2638 | 坷 2639 | 坯 2640 | 拓 2641 | 坪 2642 | 坤 2643 | 拄 2644 | 拧 2645 | 拂 2646 | 拙 2647 | 拇 2648 | 拗 2649 | 茉 2650 | 昔 2651 | 苛 2652 | 苫 2653 | 苟 2654 | 苞 2655 | 茁 2656 | 苔 2657 | 枉 2658 | 枢 2659 | 枚 2660 | 枫 2661 | 杭 2662 | 郁 2663 | 矾 2664 | 奈 2665 | 奄 2666 | 殴 2667 | 歧 2668 | 卓 2669 | 昙 2670 | 哎 2671 | 咕 2672 | 呵 2673 | 咙 2674 | 呻 2675 | 咒 2676 | 咆 2677 | 咖 2678 | 帕 2679 | 账 2680 | 贬 2681 | 贮 2682 | 氛 2683 | 秉 2684 | 岳 2685 | 侠 2686 | 侥 2687 | 侣 2688 | 侈 2689 | 卑 2690 | 刽 2691 | 刹 2692 | 肴 2693 | 觅 2694 | 忿 2695 | 瓮 2696 | 肮 2697 | 肪 2698 | 狞 2699 | 庞 2700 | 疟 2701 | 疙 2702 | 疚 2703 | 卒 2704 | 氓 2705 | 炬 2706 | 沽 2707 | 沮 2708 | 泣 2709 | 泞 2710 | 泌 2711 | 沼 2712 | 怔 2713 | 怯 2714 | 宠 2715 | 宛 2716 | 衩 2717 | 祈 2718 | 诡 2719 | 帚 2720 | 屉 2721 | 弧 2722 | 弥 2723 | 陋 2724 | 陌 2725 | 函 2726 | 姆 2727 | 虱 2728 | 叁 2729 | 绅 2730 | 驹 2731 | 绊 2732 | 绎 2733 | 契 2734 | 贰 2735 | 玷 2736 | 玲 2737 | 珊 2738 | 拭 2739 | 拷 2740 | 拱 2741 | 挟 2742 | 垢 2743 | 垛 2744 | 拯 2745 | 荆 2746 | 茸 2747 | 茬 2748 | 荚 2749 | 茵 2750 | 茴 2751 | 荞 2752 | 荠 2753 | 荤 2754 | 荧 2755 | 荔 2756 | 栈 2757 | 柑 2758 | 栅 2759 | 柠 2760 | 枷 2761 | 勃 2762 | 柬 2763 | 砂 2764 | 泵 2765 | 砚 2766 | 鸥 2767 | 轴 2768 | 韭 2769 | 虐 2770 | 昧 2771 | 盹 2772 | 咧 2773 | 昵 2774 | 昭 2775 | 盅 2776 | 勋 2777 | 哆 2778 | 咪 2779 | 哟 2780 | 幽 2781 | 钙 2782 | 钝 2783 | 钠 2784 | 钦 2785 | 钧 2786 | 钮 2787 | 毡 2788 | 氢 2789 | 秕 2790 | 俏 2791 | 俄 2792 | 俐 2793 | 侯 2794 | 徊 2795 | 衍 2796 | 胚 2797 | 胧 2798 | 胎 2799 | 狰 2800 | 饵 2801 | 峦 2802 | 奕 2803 | 咨 2804 | 飒 2805 | 闺 2806 | 闽 2807 | 籽 2808 | 娄 2809 | 烁 2810 | 炫 2811 | 洼 2812 | 柒 2813 | 涎 2814 | 洛 2815 | 恃 2816 | 恍 2817 | 恬 2818 | 恤 2819 | 宦 2820 | 诫 2821 | 诬 2822 | 祠 2823 | 诲 2824 | 屏 2825 | 屎 2826 | 逊 2827 | 陨 2828 | 姚 2829 | 娜 2830 | 蚤 2831 | 骇 2832 | 耘 2833 | 耙 2834 | 秦 2835 | 匿 2836 | 埂 2837 | 捂 2838 | 捍 2839 | 袁 2840 | 捌 2841 | 挫 2842 | 挚 2843 | 捣 2844 | 捅 2845 | 埃 2846 | 耿 2847 | 聂 2848 | 荸 2849 | 莽 2850 | 莱 2851 | 莉 2852 | 莹 2853 | 莺 2854 | 梆 2855 | 栖 2856 | 桦 2857 | 栓 
2858 | 桅 2859 | 桩 2860 | 贾 2861 | 酌 2862 | 砸 2863 | 砰 2864 | 砾 2865 | 殉 2866 | 逞 2867 | 哮 2868 | 唠 2869 | 哺 2870 | 剔 2871 | 蚌 2872 | 蚜 2873 | 畔 2874 | 蚣 2875 | 蚪 2876 | 蚓 2877 | 哩 2878 | 圃 2879 | 鸯 2880 | 唁 2881 | 哼 2882 | 唆 2883 | 峭 2884 | 唧 2885 | 峻 2886 | 赂 2887 | 赃 2888 | 钾 2889 | 铆 2890 | 氨 2891 | 秫 2892 | 笆 2893 | 俺 2894 | 赁 2895 | 倔 2896 | 殷 2897 | 耸 2898 | 舀 2899 | 豺 2900 | 豹 2901 | 颁 2902 | 胯 2903 | 胰 2904 | 脐 2905 | 脓 2906 | 逛 2907 | 卿 2908 | 鸵 2909 | 鸳 2910 | 馁 2911 | 凌 2912 | 凄 2913 | 衷 2914 | 郭 2915 | 斋 2916 | 疹 2917 | 紊 2918 | 瓷 2919 | 羔 2920 | 烙 2921 | 浦 2922 | 涡 2923 | 涣 2924 | 涤 2925 | 涧 2926 | 涕 2927 | 涩 2928 | 悍 2929 | 悯 2930 | 窍 2931 | 诺 2932 | 诽 2933 | 袒 2934 | 谆 2935 | 祟 2936 | 恕 2937 | 娩 2938 | 骏 2939 | 琐 2940 | 麸 2941 | 琉 2942 | 琅 2943 | 措 2944 | 捺 2945 | 捶 2946 | 赦 2947 | 埠 2948 | 捻 2949 | 掐 2950 | 掂 2951 | 掖 2952 | 掷 2953 | 掸 2954 | 掺 2955 | 勘 2956 | 聊 2957 | 娶 2958 | 菱 2959 | 菲 2960 | 萎 2961 | 菩 2962 | 萤 2963 | 乾 2964 | 萧 2965 | 萨 2966 | 菇 2967 | 彬 2968 | 梗 2969 | 梧 2970 | 梭 2971 | 曹 2972 | 酝 2973 | 酗 2974 | 厢 2975 | 硅 2976 | 硕 2977 | 奢 2978 | 盔 2979 | 匾 2980 | 颅 2981 | 彪 2982 | 眶 2983 | 晤 2984 | 曼 2985 | 晦 2986 | 冕 2987 | 啡 2988 | 畦 2989 | 趾 2990 | 啃 2991 | 蛆 2992 | 蚯 2993 | 蛉 2994 | 蛀 2995 | 唬 2996 | 啰 2997 | 唾 2998 | 啤 2999 | 啥 3000 | 啸 3001 | 崎 3002 | 逻 3003 | 崔 3004 | 崩 3005 | 婴 3006 | 赊 3007 | 铐 3008 | 铛 3009 | 铝 3010 | 铡 3011 | 铣 3012 | 铭 3013 | 矫 3014 | 秸 3015 | 秽 3016 | 笙 3017 | 笤 3018 | 偎 3019 | 傀 3020 | 躯 3021 | 兜 3022 | 衅 3023 | 徘 3024 | 徙 3025 | 舶 3026 | 舷 3027 | 舵 3028 | 敛 3029 | 翎 3030 | 脯 3031 | 逸 3032 | 凰 3033 | 猖 3034 | 祭 3035 | 烹 3036 | 庶 3037 | 庵 3038 | 痊 3039 | 阎 3040 | 阐 3041 | 眷 3042 | 焊 3043 | 焕 3044 | 鸿 3045 | 涯 3046 | 淑 3047 | 淌 3048 | 淮 3049 | 淆 3050 | 渊 3051 | 淫 3052 | 淳 3053 | 淤 3054 | 淀 3055 | 涮 3056 | 涵 3057 | 惦 3058 | 悴 3059 | 惋 3060 | 寂 3061 | 窒 3062 | 谍 3063 | 谐 3064 | 裆 3065 | 袱 3066 | 祷 3067 | 谒 3068 | 谓 3069 | 谚 3070 | 尉 3071 | 堕 3072 | 隅 3073 | 婉 3074 | 颇 3075 | 绰 3076 | 绷 3077 | 综 3078 | 绽 3079 | 缀 
3080 | 巢 3081 | 琳 3082 | 琢 3083 | 琼 3084 | 揍 3085 | 堰 3086 | 揩 3087 | 揽 3088 | 揖 3089 | 彭 3090 | 揣 3091 | 搀 3092 | 搓 3093 | 壹 3094 | 搔 3095 | 葫 3096 | 募 3097 | 蒋 3098 | 蒂 3099 | 韩 3100 | 棱 3101 | 椰 3102 | 焚 3103 | 椎 3104 | 棺 3105 | 榔 3106 | 椭 3107 | 粟 3108 | 棘 3109 | 酣 3110 | 酥 3111 | 硝 3112 | 硫 3113 | 颊 3114 | 雳 3115 | 翘 3116 | 凿 3117 | 棠 3118 | 晰 3119 | 鼎 3120 | 喳 3121 | 遏 3122 | 晾 3123 | 畴 3124 | 跋 3125 | 跛 3126 | 蛔 3127 | 蜒 3128 | 蛤 3129 | 鹃 3130 | 喻 3131 | 啼 3132 | 喧 3133 | 嵌 3134 | 赋 3135 | 赎 3136 | 赐 3137 | 锉 3138 | 锌 3139 | 甥 3140 | 掰 3141 | 氮 3142 | 氯 3143 | 黍 3144 | 筏 3145 | 牍 3146 | 粤 3147 | 逾 3148 | 腌 3149 | 腋 3150 | 腕 3151 | 猩 3152 | 猬 3153 | 惫 3154 | 敦 3155 | 痘 3156 | 痢 3157 | 痪 3158 | 竣 3159 | 翔 3160 | 奠 3161 | 遂 3162 | 焙 3163 | 滞 3164 | 湘 3165 | 渤 3166 | 渺 3167 | 溃 3168 | 溅 3169 | 湃 3170 | 愕 3171 | 惶 3172 | 寓 3173 | 窖 3174 | 窘 3175 | 雇 3176 | 谤 3177 | 犀 3178 | 隘 3179 | 媒 3180 | 媚 3181 | 婿 3182 | 缅 3183 | 缆 3184 | 缔 3185 | 缕 3186 | 骚 3187 | 瑟 3188 | 鹉 3189 | 瑰 3190 | 搪 3191 | 聘 3192 | 斟 3193 | 靴 3194 | 靶 3195 | 蓖 3196 | 蒿 3197 | 蒲 3198 | 蓉 3199 | 楔 3200 | 椿 3201 | 楷 3202 | 榄 3203 | 楞 3204 | 楣 3205 | 酪 3206 | 碘 3207 | 硼 3208 | 碉 3209 | 辐 3210 | 辑 3211 | 频 3212 | 睹 3213 | 睦 3214 | 瞄 3215 | 嗜 3216 | 嗦 3217 | 暇 3218 | 畸 3219 | 跷 3220 | 跺 3221 | 蜈 3222 | 蜗 3223 | 蜕 3224 | 蛹 3225 | 嗅 3226 | 嗡 3227 | 嗤 3228 | 署 3229 | 蜀 3230 | 幌 3231 | 锚 3232 | 锥 3233 | 锨 3234 | 锭 3235 | 锰 3236 | 稚 3237 | 颓 3238 | 筷 3239 | 魁 3240 | 衙 3241 | 腻 3242 | 腮 3243 | 腺 3244 | 鹏 3245 | 肄 3246 | 猿 3247 | 颖 3248 | 煞 3249 | 雏 3250 | 馍 3251 | 馏 3252 | 禀 3253 | 痹 3254 | 廓 3255 | 痴 3256 | 靖 3257 | 誊 3258 | 漓 3259 | 溢 3260 | 溯 3261 | 溶 3262 | 滓 3263 | 溺 3264 | 寞 3265 | 窥 3266 | 窟 3267 | 寝 3268 | 褂 3269 | 裸 3270 | 谬 3271 | 媳 3272 | 嫉 3273 | 缚 3274 | 缤 3275 | 剿 3276 | 赘 3277 | 熬 3278 | 赫 3279 | 蔫 3280 | 摹 3281 | 蔓 3282 | 蔗 3283 | 蔼 3284 | 熙 3285 | 蔚 3286 | 兢 3287 | 榛 3288 | 榕 3289 | 酵 3290 | 碟 3291 | 碴 3292 | 碱 3293 | 碳 3294 | 辕 3295 | 辖 3296 | 雌 3297 | 墅 3298 | 嘁 3299 | 踊 3300 | 蝉 3301 | 嘀 
3302 | 幔 3303 | 镀 3304 | 舔 3305 | 熏 3306 | 箍 3307 | 箕 3308 | 箫 3309 | 舆 3310 | 僧 3311 | 孵 3312 | 瘩 3313 | 瘟 3314 | 彰 3315 | 粹 3316 | 漱 3317 | 漩 3318 | 漾 3319 | 慷 3320 | 寡 3321 | 寥 3322 | 谭 3323 | 褐 3324 | 褪 3325 | 隧 3326 | 嫡 3327 | 缨 3328 | 撵 3329 | 撩 3330 | 撮 3331 | 撬 3332 | 擒 3333 | 墩 3334 | 撰 3335 | 鞍 3336 | 蕊 3337 | 蕴 3338 | 樊 3339 | 樟 3340 | 橄 3341 | 敷 3342 | 豌 3343 | 醇 3344 | 磕 3345 | 磅 3346 | 碾 3347 | 憋 3348 | 嘶 3349 | 嘲 3350 | 嘹 3351 | 蝠 3352 | 蝎 3353 | 蝌 3354 | 蝗 3355 | 蝙 3356 | 嘿 3357 | 幢 3358 | 镊 3359 | 镐 3360 | 稽 3361 | 篓 3362 | 膘 3363 | 鲤 3364 | 鲫 3365 | 褒 3366 | 瘪 3367 | 瘤 3368 | 瘫 3369 | 凛 3370 | 澎 3371 | 潭 3372 | 潦 3373 | 澳 3374 | 潘 3375 | 澈 3376 | 澜 3377 | 澄 3378 | 憔 3379 | 懊 3380 | 憎 3381 | 翩 3382 | 褥 3383 | 谴 3384 | 鹤 3385 | 憨 3386 | 履 3387 | 嬉 3388 | 豫 3389 | 缭 3390 | 撼 3391 | 擂 3392 | 擅 3393 | 蕾 3394 | 薛 3395 | 薇 3396 | 擎 3397 | 翰 3398 | 噩 3399 | 橱 3400 | 橙 3401 | 瓢 3402 | 蟥 3403 | 霍 3404 | 霎 3405 | 辙 3406 | 冀 3407 | 踱 3408 | 蹂 3409 | 蟆 3410 | 螃 3411 | 螟 3412 | 噪 3413 | 鹦 3414 | 黔 3415 | 穆 3416 | 篡 3417 | 篷 3418 | 篙 3419 | 篱 3420 | 儒 3421 | 膳 3422 | 鲸 3423 | 瘾 3424 | 瘸 3425 | 糙 3426 | 燎 3427 | 濒 3428 | 憾 3429 | 懈 3430 | 窿 3431 | 缰 3432 | 壕 3433 | 藐 3434 | 檬 3435 | 檐 3436 | 檩 3437 | 檀 3438 | 礁 3439 | 磷 3440 | 瞭 3441 | 瞬 3442 | 瞳 3443 | 瞪 3444 | 曙 3445 | 蹋 3446 | 蟋 3447 | 蟀 3448 | 嚎 3449 | 赡 3450 | 镣 3451 | 魏 3452 | 簇 3453 | 儡 3454 | 徽 3455 | 爵 3456 | 朦 3457 | 臊 3458 | 鳄 3459 | 糜 3460 | 癌 3461 | 懦 3462 | 豁 3463 | 臀 3464 | 藕 3465 | 藤 3466 | 瞻 3467 | 嚣 3468 | 鳍 3469 | 癞 3470 | 瀑 3471 | 襟 3472 | 璧 3473 | 戳 3474 | 攒 3475 | 孽 3476 | 蘑 3477 | 藻 3478 | 鳖 3479 | 蹭 3480 | 蹬 3481 | 簸 3482 | 簿 3483 | 蟹 3484 | 靡 3485 | 癣 3486 | 羹 3487 | 鬓 3488 | 攘 3489 | 蠕 3490 | 巍 3491 | 鳞 3492 | 糯 3493 | 譬 3494 | 霹 3495 | 躏 3496 | 髓 3497 | 蘸 3498 | 镶 3499 | 瓤 3500 | 矗 3501 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 2 | ## Solutions of 
autochecker for chinese 3 | 4 | ### How to use : 5 | - run in the terminal : python Autochecker4Chinese.py 6 | - You will get the following result : ![](./result.png) 7 | 8 | 9 | ### 1. Make a detector 10 | 11 | - Construct a dict to detect misspelled Chinese phrases: the key is a Chinese phrase and the value is its frequency in the corpus. 12 | - You can finish this step by collecting a corpus from the internet, or you can take the easier route and load a dict already created by others. Here we choose the second way and construct the dict from a file. 13 | - The detector works as follows: any phrase that does not appear in this dict is flagged as a misspelled phrase. 14 | 15 | 16 | 17 | ```python 18 | def construct_dict( file_path ): 19 | 20 | word_freq = {} 21 | with open(file_path, "r") as f: 22 | for line in f: 23 | info = line.split() 24 | word = info[0] 25 | frequency = info[1] 26 | word_freq[word] = frequency 27 | 28 | return word_freq 29 | ``` 30 | 31 | 32 | ```python 33 | FILE_PATH = "./token_freq_pos%40350k_jieba.txt" 34 | phrase_freq = construct_dict( FILE_PATH ) 35 | ``` 36 | 37 | 38 | ```python 39 | print( type(phrase_freq) ) 40 | print( len(phrase_freq) ) 41 | ``` 42 | 43 | 44 | 349045 45 | 46 | 47 | ### 2. Make an autocorrector 48 | - To correct a misspelled phrase, we use the edit distance to build a list of correction candidates for it. 49 | - We sort the candidate list according to the likelihood of each candidate being the correct phrase, based on the following rules: 50 | - If the candidate's pinyin exactly matches the misspelled phrase's pinyin, we put the candidate in the first order, which means it is among the most likely phrases to be selected. 51 | - Else if the pinyin of the candidate's first word matches the pinyin of the misspelled phrase's first word, we put the candidate in the second order. 52 | - Otherwise, we put the candidate in the third order.
53 | 54 | ```python 55 | import pinyin 56 | ``` 57 | 58 | 59 | ```python 60 | # build the pool of Chinese characters used to generate edits 61 | # by concatenating every stripped line of the given file 62 | def load_cn_words_dict( file_path ): 63 | cn_words_dict = "" 64 | with open(file_path, "r") as f: 65 | for word in f: 66 | cn_words_dict += word.strip().decode("utf-8") 67 | return cn_words_dict 68 | ``` 69 | 70 | 71 | ```python 72 | # generate all phrases that are one edit away from the Chinese phrase 73 | def edits1(phrase, cn_words_dict): 74 | "All edits that are one edit away from `phrase`." 75 | phrase = phrase.decode("utf-8") 76 | splits = [(phrase[:i], phrase[i:]) for i in range(len(phrase) + 1)] 77 | deletes = [L + R[1:] for L, R in splits if R] 78 | transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1] 79 | replaces = [L + c + R[1:] for L, R in splits if R for c in cn_words_dict] 80 | inserts = [L + c + R for L, R in splits for c in cn_words_dict] 81 | return set(deletes + transposes + replaces + inserts) 82 | ``` 83 | 84 | 85 | ```python 86 | # return the phrases that exist in phrase_freq 87 | def known(phrases): return set(phrase for phrase in phrases if phrase.encode("utf-8") in phrase_freq) 88 | ``` 89 | 90 | 91 | ```python 92 | # get the candidate phrases for the error phrase 93 | # we rank the candidate phrases according to their pinyin 94 | # if the candidate phrase's pinyin exactly matches the error phrase's pinyin, we put it into the first order 95 | # if the pinyin of the candidate phrase's first word matches that of the error phrase's first word, we put it into the second order 96 | # otherwise we put the candidate phrase into the third order 97 | def get_candidates( error_phrase ): 98 | 99 | candidates_1st_order = [] 100 | candidates_2nd_order = [] 101 | candidates_3nd_order = [] 102 | 103 | error_pinyin = pinyin.get(error_phrase, format="strip", delimiter="/").encode("utf-8") 104 | cn_words_dict = load_cn_words_dict( "./cn_dict.txt" ) 105 | candidate_phrases = list( known(edits1(error_phrase, cn_words_dict)) ) 106 |
107 | for candidate_phrase in candidate_phrases: 108 | candidate_pinyin = pinyin.get(candidate_phrase, format="strip", delimiter="/").encode("utf-8") 109 | if candidate_pinyin == error_pinyin: 110 | candidates_1st_order.append(candidate_phrase) 111 | elif candidate_pinyin.split("/")[0] == error_pinyin.split("/")[0]: 112 | candidates_2nd_order.append(candidate_phrase) 113 | else: 114 | candidates_3nd_order.append(candidate_phrase) 115 | 116 | return candidates_1st_order, candidates_2nd_order, candidates_3nd_order 117 | ``` 118 | 119 | 120 | ```python 121 | def auto_correct( error_phrase ): 122 | 123 | c1_order, c2_order, c3_order = get_candidates(error_phrase) 124 | # print c1_order, c2_order, c3_order 125 | if c1_order: 126 | return max(c1_order, key=phrase_freq.get ) 127 | elif c2_order: 128 | return max(c2_order, key=phrase_freq.get ) 129 | else: 130 | return max(c3_order, key=phrase_freq.get ) 131 | ``` 132 | 133 | 134 | ```python 135 | # test for the auto_correct 136 | error_phrase_1 = "呕涂" # should be "呕吐" 137 | error_phrase_2 = "东方之朱" # should be "东方之珠" 138 | error_phrase_3 = "沙拢" # should be "沙龙" 139 | 140 | print error_phrase_1, auto_correct( error_phrase_1 ) 141 | print error_phrase_2, auto_correct( error_phrase_2 ) 142 | print error_phrase_3, auto_correct( error_phrase_3 ) 143 | ``` 144 | 145 | 呕涂 呕吐 146 | 东方之朱 东方之珠 147 | 沙拢 沙龙 148 | 149 | 150 | ### 3. 
Correct the misspelled phrase in a sentence 151 | 152 | 153 | 154 | - For any given sentence, use jieba to do the segmentation. 155 | - After segmentation, take the resulting segment list and check whether each non-punctuation phrase exists in the phrase_freq dict; if it does not, it is a misspelled phrase. 156 | - Use the auto_correct function to correct the misspelled phrase. 157 | - Output the corrected sentence. 158 | 159 | 160 | 161 | ```python 162 | import jieba 163 | import string 164 | import re 165 | ``` 166 | 167 | 168 | ```python 169 | PUNCTUATION_LIST = string.punctuation 170 | PUNCTUATION_LIST += "。,?:;{}[]‘“”《》/!%……()" 171 | ``` 172 | 173 | 174 | ```python 175 | def auto_correct_sentence( error_sentence, verbose=True): 176 | 177 | jieba_cut = jieba.cut(error_sentence.decode("utf-8"), cut_all=False) 178 | seg_list = "\t".join(jieba_cut).split("\t") 179 | 180 | correct_sentence = "" 181 | 182 | for phrase in seg_list: 183 | 184 | correct_phrase = phrase 185 | # check if item is a punctuation 186 | if phrase not in PUNCTUATION_LIST.decode("utf-8"): 187 | # check if the phrase in our dict, if not then it is a misspelled phrase 188 | if phrase.encode("utf-8") not in phrase_freq.keys(): 189 | correct_phrase = auto_correct(phrase.encode("utf-8")) 190 | if verbose : 191 | print phrase, correct_phrase 192 | 193 | correct_sentence += correct_phrase 194 | 195 | if verbose: 196 | print correct_sentence 197 | return correct_sentence 198 | ``` 199 | 200 | 201 | ```python 202 | err_sent = '机七学习是人工智能领遇最能体现智能的一个分知!' 203 | correct_sent = auto_correct_sentence( err_sent ) 204 | ``` 205 | 206 | 机七 机器 207 | 领遇 领域 208 | 分知 分枝 209 | 机器学习是人工智能领域最能体现智能的一个分枝! 210 | 211 | 212 | 213 | ```python 214 | print correct_sent 215 | ``` 216 | 217 | 机器学习是人工智能领域最能体现智能的一个分枝!
218 | 219 | 220 | 221 | ```python 222 | 223 | ``` 224 | -------------------------------------------------------------------------------- /readme.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Autochecker4Chinese/ca27a2aed69b79cdc639fc088b0a2f942c2a81f5/readme.pdf -------------------------------------------------------------------------------- /result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Autochecker4Chinese/ca27a2aed69b79cdc639fc088b0a2f942c2a81f5/result.png -------------------------------------------------------------------------------- /token_pinyin%4040k_sogou.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/beyondacm/Autochecker4Chinese/ca27a2aed69b79cdc639fc088b0a2f942c2a81f5/token_pinyin%4040k_sogou.txt --------------------------------------------------------------------------------