├── .idea ├── ai-video.iml ├── misc.xml ├── modules.xml └── workspace.xml ├── autocorrelation ├── __init__.py ├── knowledge_video_v1.py ├── readme └── scope_folder │ └── scope_video_20181219_test.txt ├── data ├── course_json │ ├── kuaiji-09.json │ └── kuaiji-11.json ├── stopwords-pre-v20180817.txt └── stopwords.txt ├── image_ocr ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── image_text.cpython-36.pyc │ └── tencent_ocr_api.cpython-36.pyc ├── image_text.py └── tencent_ocr_api.py ├── image_processor ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── image_similarity_fundimental.cpython-36.pyc ├── image_enhance.py ├── image_gray.py ├── image_similarity_fundimental.py └── image_similarity_hash.py ├── readme.md ├── text_analysit ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── text_distribution.cpython-36.pyc ├── subtitle_distribution.py └── text_distribution.py ├── text_vector ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── document_feature.cpython-36.pyc └── document_feature.py ├── tools ├── __init__.py ├── __pycache__ │ └── __init__.cpython-36.pyc ├── excel_xls │ ├── ExcelReader.py │ ├── __init__.py │ └── __pycache__ │ │ ├── ExcelReader.cpython-36.pyc │ │ └── __init__.cpython-36.pyc └── file_util │ ├── FilePath.py │ ├── JsonParser.py │ ├── __init__.py │ └── __pycache__ │ ├── FilePath.cpython-36.pyc │ ├── JsonParser.cpython-36.pyc │ └── __init__.cpython-36.pyc ├── video_convertor ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── video_image_convertor_open.cpython-36.pyc ├── video_image_convertor_1.py ├── video_image_convertor_2.py ├── video_image_convertor_open.py └── video_to_img.py └── word_spliter ├── __init__.py ├── __pycache__ ├── __init__.cpython-36.pyc └── jieba_splitor.cpython-36.pyc └── jieba_splitor.py /.idea/ai-video.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 
-------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 87 | 88 | 89 | 90 | isChinese 91 | 92 | 93 | 94 | 127 | 128 | 129 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 158 | 159 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 197 | 198 | 199 | 200 | 217 | 218 | 235 | 236 | 253 | 254 | 271 | 272 | 289 | 290 | 307 | 308 | 319 | 320 | 338 | 339 | 353 | 354 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 373 | 374 | 375 | 376 | 1545716171771 377 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 414 | 415 | 416 | 417 | 418 | file://$PROJECT_DIR$/image_processor/image_similarity_hash.py 419 | 80 420 | 422 | 423 | file://$PROJECT_DIR$/video_convertor/video_image_convertor_open.py 424 | 
33 425 | 427 | 428 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | -------------------------------------------------------------------------------- /autocorrelation/__init__.py: -------------------------------------------------------------------------------- 
# encoding: UTF-8
"""
Associate video resources with base-course knowledge points.

1. Read the table that relates base courses to video resources.
2. Process every video resource in turn, carrying its base-course
   information along through each stage.
"""
import os

from tools.file_util import FilePath
from tools.excel_xls import ExcelReader
from video_convertor import video_image_convertor_open
from image_ocr import image_text
from text_analysit import text_distribution


class AssociateKV:
    """
    Locate and associate knowledge points within video resources.
    """

    def __init__(self):
        """
        Create the helper objects used by each stage of the flow.
        """
        # Reader for the Excel file relating base courses to video resources.
        self.excel_reader = ExcelReader.ExcelReader()

        # Video files that have already been processed.
        self.processed_video_list = []
        # Stage 1: video -> images.
        self.video2image = video_image_convertor_open.Video2Image()
        # Stage 2: images -> text.
        self.image2text = image_text.Image2Text()
        # Stage 3: text -> knowledge-point similarity distribution.
        self.text2kwg = text_distribution.Text2KnowledgeDistribution()

    def loadNeedProcessedVideo(self, filepath):
        """
        Load the videos that need processing from a scope file.

        Every useful line holds a base-course code followed by a video file
        name, separated by whitespace. Lines with fewer than two fields
        (including blank lines) are skipped; fields after the second one are
        ignored.

        :param filepath: path of the scope file (currently a plain text file).
        :return: list of ``(course_base_code, video_file_name, directory_name)``
                 tuples, where ``directory_name`` is
                 ``'<course_base_code>-<video name without extension>'``.
                 The list is empty when ``FilePath.fileExist`` reports that
                 the file is missing.
        """
        need_processed_video_list = []
        if not FilePath.fileExist(filepath):
            return need_processed_video_list

        # Text-file input; for now a text file is what is used for testing.
        # The ``with`` block guarantees the handle is closed even if parsing
        # raises part-way through.
        with open(filepath, 'r') as f_input:
            for line in f_input:
                # split() with no argument drops the newline and tolerates
                # tabs, repeated spaces and leading/trailing blanks, which
                # split(' ') turned into empty fields.
                line_secs = line.split()
                if len(line_secs) < 2:
                    continue
                course_base_code, video_file_name = line_secs[0], line_secs[1]
                # Drop the extension and keep only the name; it is used to
                # build the name of the per-video working folder.
                video_name = os.path.splitext(video_file_name)[0]
                directory_name = '{}-{}'.format(course_base_code, video_name)
                need_processed_video_list.append(
                    (course_base_code, video_file_name, directory_name))

        # TODO: if the scope is an Excel file, read it via self.excel_reader.

        return need_processed_video_list

    def associateFlow(self, scope_filepath):
        """
        Run the association flow for every video listed in the scope file.

        :param scope_filepath: path of the scope file listing base-course
                               codes and their video files.
        :return: None
        """
        # Base-course / video-resource entries that need to be processed.
        need_processed_video_list = self.loadNeedProcessedVideo(scope_filepath)
        length = len(need_processed_video_list)
        for index, need_processed_video in enumerate(need_processed_video_list, 1):
            # Convert the video into images.
            self.video2image.run(need_processed_video)
            # Recognise the images as text.
            self.image2text.run(need_processed_video)
            # Compute similarity statistics between text and knowledge points.
            self.text2kwg.run(need_processed_video)

            # Progress report: "<done>/<total>".
            print('已经处理了{}/{}'.format(index, length))


if __name__ == '__main__':
    scope_filepath = u'./scope_folder/scope_video_20181219_test.txt'
    akv = AssociateKV()
    akv.associateFlow(scope_filepath)
    print('task execute over.')
管理学的产生与发展 38 28 | 92894 01:01:56 open.bc.47.26 领导 18 29 | 95973 01:03:59 open.bc.47.1 《管理学》 4 -------------------------------------------------------------------------------- /autocorrelation/scope_folder/scope_video_20181219_test.txt: -------------------------------------------------------------------------------- 1 | open.bc.69 kuaiji-09.mp4 2 | open.bc.69 kuaiji-11.mp4 3 | open.bc.47 glx.mp4 -------------------------------------------------------------------------------- /data/stopwords-pre-v20180817.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | aboard 4 | about 5 | above 6 | according 7 | according to 8 | across 9 | afore 10 | after 11 | afterwards 12 | again 13 | against 14 | agin 15 | all 16 | almost 17 | alone 18 | along 19 | alongside 20 | already 21 | also 22 | although 23 | always 24 | am 25 | amid 26 | amidst 27 | among 28 | amongst 29 | amoungst 30 | amount 31 | an 32 | and 33 | anent 34 | another 35 | any 36 | anyhow 37 | anyone 38 | anything 39 | anyway 40 | anywhere 41 | approximately 42 | are 43 | around 44 | as 45 | asked 46 | aslant 47 | astride 48 | at 49 | athwart 50 | back 51 | bar 52 | be 53 | became 54 | because 55 | because of 56 | become 57 | becomes 58 | becoming 59 | been 60 | before 61 | beforehand 62 | behind 63 | being 64 | below 65 | beneath 66 | beside 67 | besides 68 | between 69 | betwixt 70 | beyond 71 | bill 72 | both 73 | bottom 74 | but 75 | by 76 | call 77 | called 78 | can 79 | cannot 80 | cant 81 | circa 82 | co 83 | computer 84 | con 85 | could 86 | couldnt 87 | cry 88 | currently 89 | dare 90 | de 91 | describe 92 | despite 93 | detail 94 | did 95 | do 96 | does 97 | done 98 | down 99 | dr 100 | due 101 | due to 102 | during 103 | e.g., 104 | each 105 | earlier 106 | eg 107 | eight 108 | either 109 | eleven 110 | else 111 | elsewhere 112 | empty 113 | enough 114 | ere 115 | etc 116 | even 117 | eventually 118 | ever 119 | every 120 | everyone 121 | everything 122 | 
everywhere 123 | except 124 | few 125 | fifteen 126 | fify 127 | fill 128 | find 129 | fire 130 | first 131 | five 132 | for 133 | former 134 | formerly 135 | forty 136 | found 137 | four 138 | from 139 | front 140 | full 141 | further 142 | get 143 | give 144 | go 145 | had 146 | has 147 | hasnt 148 | have 149 | he 150 | hence 151 | her 152 | here 153 | hereafter 154 | hereby 155 | herein 156 | hereupon 157 | hers 158 | herself 159 | him 160 | himself 161 | his 162 | how 163 | however 164 | hundred 165 | i 166 | ie 167 | if 168 | in 169 | inc 170 | indeed 171 | inside 172 | instead 173 | interest 174 | into 175 | is 176 | it 177 | its 178 | itself 179 | just 180 | keep 181 | last 182 | latter 183 | latterly 184 | least 185 | less 186 | like 187 | ltd 188 | made 189 | major 190 | many 191 | may 192 | maybe 193 | me 194 | meanwhile 195 | mid 196 | midst 197 | might 198 | mill 199 | mine 200 | minus 201 | more 202 | moreover 203 | most 204 | mostly 205 | move 206 | mr 207 | mrs 208 | ms 209 | much 210 | must 211 | my 212 | myself 213 | name 214 | namely 215 | near 216 | need 217 | neither 218 | net 219 | never 220 | nevertheless 221 | next 222 | nigh 223 | nigher 224 | nighest 225 | nine 226 | no 227 | nobody 228 | none 229 | noone 230 | nor 231 | not 232 | nothing 233 | notwithstanding 234 | now 235 | nowhere 236 | of 237 | off 238 | often 239 | on 240 | on to 241 | once 242 | one 243 | only 244 | onto 245 | or 246 | other 247 | others 248 | otherwise 249 | ought 250 | our 251 | ours 252 | ourselves 253 | out 254 | out of 255 | outside 256 | over 257 | own 258 | part 259 | partly 260 | past 261 | pending 262 | per 263 | perhaps 264 | please 265 | plus 266 | prior 267 | put 268 | qua 269 | rather 270 | re 271 | regarding 272 | round 273 | same 274 | sans 275 | save 276 | see 277 | seem 278 | seemed 279 | seeming 280 | seems 281 | separately 282 | serious 283 | seven 284 | several 285 | shall 286 | she 287 | should 288 | show 289 | side 290 | similarly 291 | since 292 
| sincere 293 | six 294 | sixty 295 | so 296 | some 297 | somehow 298 | someone 299 | something 300 | sometime 301 | sometimes 302 | somewhere 303 | still 304 | such 305 | system 306 | take 307 | ten 308 | than 309 | that 310 | the 311 | their 312 | theirs 313 | them 314 | themselves 315 | then 316 | thence 317 | there 318 | thereafter 319 | thereby 320 | therefore 321 | therein 322 | thereupon 323 | these 324 | they 325 | thick 326 | thin 327 | third 328 | this 329 | those 330 | though 331 | three 332 | through 333 | throughout 334 | thru 335 | thus 336 | till 337 | to 338 | together 339 | too 340 | top 341 | toward 342 | towards 343 | twelve 344 | twenty 345 | two 346 | un 347 | under 348 | underneath 349 | unless 350 | unlike 351 | until 352 | unto 353 | up 354 | upon 355 | us 356 | versus 357 | very 358 | via 359 | vice 360 | volume 361 | was 362 | we 363 | well 364 | were 365 | what 366 | whatever 367 | whats 368 | when 369 | whence 370 | whenever 371 | where 372 | whereafter 373 | whereas 374 | whereby 375 | wherein 376 | whereupon 377 | wherever 378 | whether 379 | which 380 | while 381 | whither 382 | who 383 | whoever 384 | whole 385 | whom 386 | whose 387 | why 388 | will 389 | with 390 | within 391 | without 392 | would 393 | yesterday 394 | yet 395 | you 396 | your 397 | yours 398 | yourself 399 | yourselves 400 | { 401 | | 402 | } 403 | ~ 404 | ¡ 405 | ¦ 406 | « 407 | 408 | ¯ 409 | ´ 410 | ¸ 411 | » 412 | ¿ 413 | ˇ 414 | ˉ 415 | ˊ 416 | ˋ 417 | ˜ 418 | ‐ 419 | —　 420 | ― 421 | ‖ 422 | ‘ 423 | ’ 424 | “ 425 | ” 426 | • 427 | … 428 | ‹ 429 | › 430 | ∕ 431 | 、 432 | 。 433 | 〈 434 | 〉 435 | 《 436 | 》 437 | 「 438 | 」 439 | 『 440 | 』 441 | 【 442 | 】 443 | 〔 444 | 〕 445 | 〖 446 | 〗 447 | 〝 448 | 〞 449 | 一 450 | 一些 451 | 一何 452 | 一切 453 | 一则 454 | 一方面 455 | 一旦 456 | 一来 457 | 一样 458 | 一般 459 | 一转眼 460 | 万一 461 | 上 462 | 上下 463 | 下 464 | 不 465 | 不仅 466 | 不但 467 | 不光 468 | 不单 469 | 不只 470 | 不外乎 471 | 不如 472 | 不妨 473 | 不尽 474 | 不尽然 475 | 不得 476 | 不怕 477 | 不惟 478 | 
不成 479 | 不拘 480 | 不料 481 | 不是 482 | 不比 483 | 不然 484 | 不特 485 | 不独 486 | 不管 487 | 不至于 488 | 不若 489 | 论 490 | 不论 491 | 不过 492 | 不问 493 | 与 494 | 与其 495 | 与其说 496 | 与否 497 | 与此同时 498 | 且 499 | 且不说 500 | 且说 501 | 两者 502 | 两个 503 | 个别 504 | 临 505 | 为 506 | 为了 507 | 为止 508 | 为此 509 | 为着 510 | 乃 511 | 乃至 512 | 乃至于 513 | 么 514 | 之 515 | 之一 516 | 之所以 517 | 之类 518 | 乌乎 519 | 乎 520 | 乘 521 | 也 522 | 也好 523 | 也罢 524 | 了 525 | 二来 526 | 于 527 | 于是 528 | 于是乎 529 | 云云 530 | 云尔 531 | 些 532 | 亦 533 | 人们 534 | 人家 535 | 今 536 | 介于 537 | 仍 538 | 仍旧 539 | 从 540 | 从此 541 | 从而 542 | 他 543 | 他人 544 | 他们 545 | 以 546 | 以上 547 | 以为 548 | 以便 549 | 以免 550 | 以及 551 | 以故 552 | 以期 553 | 以来 554 | 以至 555 | 以至于 556 | 以致 557 | 们 558 | 任何 559 | 任凭 560 | 似的 561 | 但 562 | 但凡 563 | 但是 564 | 何 565 | 何以 566 | 何况 567 | 何处 568 | 何时 569 | 余外 570 | 作为 571 | 你 572 | 你们 573 | 使 574 | 使得 575 | 例如 576 | 依 577 | 依据 578 | 依照 579 | 便于 580 | 俺 581 | 俺们 582 | 倘 583 | 倘使 584 | 倘或 585 | 倘然 586 | 倘若 587 | 假使 588 | 假如 589 | 假若 590 | 傥然 591 | 像 592 | 儿 593 | 先不先 594 | 光是 595 | 全体 596 | 全部 597 | 全额 598 | 超额 599 | 兮 600 | 关于 601 | 其 602 | 其一 603 | 其中 604 | 其二 605 | 其他 606 | 其余 607 | 其它 608 | 其次 609 | 具体地说 610 | 具体说来 611 | 兼之 612 | 内容 613 | 再其次 614 | 再则 615 | 再有 616 | 再者 617 | 再者说 618 | 再说 619 | 冒 620 | 冲出 621 | 况且 622 | 几 623 | 几时 624 | 凡 625 | 凡是 626 | 凭 627 | 凭借 628 | 出于 629 | 出来 630 | 分别 631 | 则 632 | 原则 633 | 则甚 634 | 别 635 | 别人 636 | 别处 637 | 别是 638 | 别的 639 | 别管 640 | 别说 641 | 到 642 | 前后 643 | 前此 644 | 前者 645 | 加之 646 | 加以 647 | 即 648 | 即对 649 | 即令 650 | 即使 651 | 即便 652 | 即如 653 | 即或 654 | 即若 655 | 却 656 | 去 657 | 又 658 | 又及 659 | 及 660 | 及其 661 | 及至 662 | 反之 663 | 反而 664 | 反过来 665 | 反过来说 666 | 受到 667 | 另 668 | 另一方面 669 | 另外 670 | 另悉 671 | 只 672 | 只当 673 | 只怕 674 | 只是 675 | 只有 676 | 只消 677 | 只要 678 | 只限 679 | 叫 680 | 叮咚 681 | 可 682 | 可以 683 | 可是 684 | 可见 685 | 各国 686 | 各个 687 | 各位 688 | 各种 689 | 各自 690 | 同 691 | 同时 692 | 后 693 | 后者 694 | 向 695 | 向使 696 | 向着 697 | 吓 698 | 吗 699 | 否则 700 | 吧 701 | 吧哒 702 | 吱 703 | 呀 704 | 
呃 705 | 呕 706 | 呗 707 | 呜 708 | 呜呼 709 | 呢 710 | 呵 711 | 呵呵 712 | 呸 713 | 呼哧 714 | 咋 715 | 和 716 | 咚 717 | 咦 718 | 咧 719 | 咱 720 | 咱们 721 | 咳 722 | 哇 723 | 哈 724 | 哈哈 725 | 哉 726 | 哎 727 | 哎呀 728 | 哎哟 729 | 哗 730 | 哟 731 | 哦 732 | 哩 733 | 哪 734 | 哪些 735 | 哪怕 736 | 哼 737 | 哼唷 738 | 唉 739 | 唯有 740 | 啊 741 | 啐 742 | 啥 743 | 啦 744 | 啪达 745 | 啷当 746 | 喂 747 | 喏 748 | 喔唷 749 | 喽 750 | 嗡 751 | 嗡嗡 752 | 嗬 753 | 嗯 754 | 嗳 755 | 嘎 756 | 嘎登 757 | 嘘 758 | 嘛 759 | 嘻 760 | 嘿 761 | 嘿嘿 762 | 因 763 | 因为 764 | 因了 765 | 因此 766 | 因着 767 | 因而 768 | 固然 769 | 在 770 | 在下 771 | 在于 772 | 地下 773 | 基于 774 | 处在 775 | 多 776 | 多么 777 | 多少 778 | 大家 779 | 她 780 | 她们 781 | 好 782 | 如 783 | 如上 784 | 如上所述 785 | 如下 786 | 如何 787 | 如其 788 | 如同 789 | 如是 790 | 如果 791 | 如此 792 | 如若 793 | 始而 794 | 孰料 795 | 孰知 796 | 宁 797 | 宁可 798 | 宁愿 799 | 宁肯 800 | 它 801 | 它们 802 | 对 803 | 对于 804 | 对待 805 | 对方 806 | 对比 807 | 将 808 | 小 809 | 尔 810 | 尔后 811 | 尔尔 812 | 尚且 813 | 就 814 | 就是 815 | 就是了 816 | 就是说 817 | 就算 818 | 就要 819 | 尽 820 | 尽管 821 | 尽管如此 822 | 岂但 823 | 己 824 | 已 825 | 已矣 826 | 巴 827 | 巴巴 828 | 并 829 | 并且 830 | 并非 831 | 庶乎 832 | 庶几 833 | 开外 834 | 开始 835 | 归 836 | 归齐 837 | 当 838 | 当地 839 | 当然 840 | 当着 841 | 彼 842 | 彼时 843 | 彼此 844 | 往 845 | 待 846 | 很 847 | 得了 848 | 怎 849 | 怎奈 850 | 总之 851 | 总的来看 852 | 总的来说 853 | 总的说来 854 | 总而言之 855 | 恰恰相反 856 | 您 857 | 惟其 858 | 慢说 859 | 我 860 | 我们 861 | 或 862 | 或则 863 | 或是 864 | 或曰 865 | 或者 866 | 截至 867 | 所以 868 | 所在 869 | 所幸 870 | 有所 871 | 所有 872 | 所示 873 | 才 874 | 才能 875 | 打 876 | 打从 877 | 把 878 | 抑或 879 | 拿 880 | 按 881 | 按照 882 | 换句话说 883 | 换言之 884 | 据 885 | 据此 886 | 接着 887 | 故 888 | 故此 889 | 故而 890 | 旁人 891 | 无非 892 | 无宁 893 | 无论 894 | 既 895 | 既往 896 | 既是 897 | 既然 898 | 时候 899 | 是 900 | 是以 901 | 是的 902 | 曾 903 | 替 904 | 替代 905 | 最 906 | 有 907 | 有些 908 | 有关 909 | 有及 910 | 有时 911 | 有的 912 | 望 913 | 朝向 914 | 朝着 915 | 本人 916 | 本地 917 | 本着 918 | 本身 919 | 本来 920 | 本例中 921 | 来 922 | 来着 923 | 来自 924 | 来说 925 | 说 926 | 极了 927 | 果然 928 | 果真 929 | 某 930 | 某个 931 | 某些 932 | 某某 933 | 根据 
934 | 欤 935 | 正值 936 | 正如 937 | 正巧 938 | 正是 939 | 此 940 | 此地 941 | 此处 942 | 此外 943 | 此时 944 | 此次 945 | 此间 946 | 毋宁 947 | 每 948 | 每当 949 | 比 950 | 比及 951 | 比如 952 | 比方 953 | 没奈何 954 | 沿 955 | 沿着 956 | 漫说 957 | 焉 958 | 然则 959 | 然后 960 | 然而 961 | 该项 962 | 某项 963 | 各项 964 | 照 965 | 照着 966 | 犹且 967 | 犹自 968 | 甚且 969 | 甚么 970 | 甚或 971 | 甚而 972 | 甚至 973 | 甚至于 974 | 用 975 | 用于 976 | 用来 977 | 由 978 | 由于 979 | 由是 980 | 由此 981 | 由此可见 982 | 的 983 | 的确 984 | 的话 985 | 直到 986 | 相对而言 987 | 省得 988 | 看 989 | 眨眼 990 | 着 991 | 着呢 992 | 矣 993 | 矣乎 994 | 矣哉 995 | 离开 996 | 竟而 997 | 第 998 | 等 999 | 等到 1000 | 等等 1001 | 简言之 1002 | 类如 1003 | 紧接着 1004 | 纵 1005 | 纵令 1006 | 纵使 1007 | 纵然 1008 | 经过 1009 | 结果 1010 | 继之 1011 | 继后 1012 | 继而 1013 | 综上所述 1014 | 罢了 1015 | 而 1016 | 而且 1017 | 而况 1018 | 而后 1019 | 而外 1020 | 而已 1021 | 而是 1022 | 而言 1023 | 而对 1024 | 能不能 1025 | 能否 1026 | 腾 1027 | 自个儿 1028 | 自从 1029 | 自各儿 1030 | 自后 1031 | 自家 1032 | 自己 1033 | 自打 1034 | 自身 1035 | 至 1036 | 至于 1037 | 至今 1038 | 至若 1039 | 致 1040 | 般的 1041 | 若 1042 | 若夫 1043 | 若是 1044 | 若果 1045 | 若非 1046 | 莫不然 1047 | 莫如 1048 | 莫若 1049 | 虽 1050 | 虽则 1051 | 虽然 1052 | 虽说 1053 | 被 1054 | 要 1055 | 要不 1056 | 要不是 1057 | 要不然 1058 | 要么 1059 | 要是 1060 | 譬喻 1061 | 譬如 1062 | 让 1063 | 许多 1064 | 设使 1065 | 设或 1066 | 设若 1067 | 诚如 1068 | 诚然 1069 | 该 1070 | 说来 1071 | 诸 1072 | 诸位 1073 | 诸如 1074 | 谁 1075 | 谁人 1076 | 谁料 1077 | 谁知 1078 | 贼死 1079 | 赖以 1080 | 赶 1081 | 起 1082 | 起见 1083 | 趁 1084 | 趁着 1085 | 越是 1086 | 距 1087 | 跟 1088 | 较 1089 | 较之 1090 | 边 1091 | 过 1092 | 还 1093 | 还是 1094 | 还有 1095 | 还要 1096 | 这 1097 | 这一来 1098 | 这个 1099 | 这么 1100 | 这么些 1101 | 这么样 1102 | 这么点儿 1103 | 这些 1104 | 这会儿 1105 | 这儿 1106 | 这就是说 1107 | 这时 1108 | 这样 1109 | 这次 1110 | 这般 1111 | 这边 1112 | 这里 1113 | 进而 1114 | 连 1115 | 连同 1116 | 逐步 1117 | 通过 1118 | 遵循 1119 | 遵照 1120 | 那 1121 | 那个 1122 | 那么 1123 | 那么些 1124 | 那么样 1125 | 那些 1126 | 那会儿 1127 | 那儿 1128 | 那时 1129 | 那样 1130 | 那般 1131 | 那边 1132 | 那里 1133 | 都 1134 | 鄙人 1135 | 鉴于 1136 | 针对 1137 | 阿 1138 | 除 1139 | 除了 1140 | 除外 1141 | 除开 
1142 | 除此之外 1143 | 除非 1144 | 随 1145 | 随后 1146 | 随时 1147 | 随着 1148 | 难道说 1149 | 非但 1150 | 非徒 1151 | 非特 1152 | 非独 1153 | 靠 1154 | 顺 1155 | 顺着 1156 | 首先 1157 | ︰ 1158 | ︳ 1159 | ︴ 1160 | ︵ 1161 | ︶ 1162 | ︷ 1163 | ︸ 1164 | ︹ 1165 | ︺ 1166 | ︻ 1167 | ︼ 1168 | ︽ 1169 | ︾ 1170 | ︿ 1171 | ﹀ 1172 | ﹁ 1173 | ﹂ 1174 | ﹃ 1175 | ﹄ 1176 | ﹉ 1177 | ﹊ 1178 | ﹋ 1179 | ﹌ 1180 | ﹍ 1181 | ﹎ 1182 | ﹏ 1183 | ﹐ 1184 | ﹑ 1185 | ﹔ 1186 | ﹕ 1187 | ﹖ 1188 | ﹝ 1189 | ﹞ 1190 | ﹟ 1191 | ﹠ 1192 | ﹡ 1193 | ﹢ 1194 | ﹤ 1195 | ﹦ 1196 | ﹨ 1197 | ﹩ 1198 | ﹪ 1199 | ﹫ 1200 | ！ 1201 | ＂ 1202 | ＇ 1203 | （ 1204 | ） 1205 | ， 1206 | ： 1207 | ； 1208 | ？ 1209 | ． 1210 |  1211 | 第一次 1212 | 第二次 1213 | 第三次 1214 | 三个 1215 | 四个 1216 | 五个 1217 | 六个 1218 | 七个 1219 | 八个 1220 | 九个 1221 | 十个 1222 | 一节 1223 | 二节 1224 | 三节 1225 | 四节 1226 | 五节 1227 | 六节 1228 | 七节 1229 | 八节 1230 | 九节 1231 | 十节 1232 | 一章 1233 | 二章 1234 | 三章 1235 | 四章 1236 | 五章 1237 | 六章 1238 | 七章 1239 | 八章 1240 | 九章 1241 | 十章 1242 | 一下 1243 | 一个 1244 | 一则通过 1245 | 一天 1246 | 一定 1247 | 一时 1248 | 一次 1249 | 一片 1250 | 一番 1251 | 一直 1252 | 一致 1253 | 一起 1254 | 一边 1255 | 一面 1256 | 一国 1257 | 七 1258 | 三 1259 | 三天两头 1260 | 三番两次 1261 | 三番五次 1262 | 上升 1263 | 上去 1264 | 上来 1265 | 上述 1266 | 上面 1267 | 下列 1268 | 下去 1269 | 下来 1270 | 下面 1271 | 不一 1272 | 不下 1273 | 不久 1274 | 不了 1275 | 不亦乐乎 1276 | 不仅仅 1277 | 不仅仅是 1278 | 不会 1279 | 不免 1280 | 不再 1281 | 不力 1282 | 不变 1283 | 不可 1284 | 不可开交 1285 | 不可抗拒 1286 | 不同 1287 | 不外 1288 | 不够 1289 | 不大 1290 | 不定 1291 | 不对 1292 | 不少 1293 | 不巧 1294 | 不已 1295 | 不常 1296 | 不得不 1297 | 不得了 1298 | 不得已 1299 | 不必 1300 | 不怎么 1301 | 不择手段 1302 | 不敢 1303 | 不断 1304 | 不日 1305 | 不时 1306 | 不曾 1307 | 不止 1308 | 不止一次 1309 | 不消 1310 | 不满 1311 | 不然的话 1312 | 不由得 1313 | 不知不觉 1314 | 不管怎样 1315 | 不经意 1316 | 不胜 1317 | 不能 1318 | 不能不 1319 | 不要 1320 | 不起 1321 | 不足 1322 | 不迭 1323 | 不限 1324 | 专门 1325 | 严格 1326 | 严重 1327 | 中 1328 | 中小 1329 | 中间 1330 | 丰富 1331 | 串行 1332 | 临到 1333 | 为主 1334 | 为什么 1335 | 为什麽 1336 | 为何 1337 | 主张 1338 | 主要 1339 | 最主要 1340 | 申请 1341 | 申报 1342 | 举凡 1343 | 
举行 1344 | 之前 1345 | 之后 1346 | 之後 1347 | 乒 1348 | 乘势 1349 | 乘机 1350 | 乘胜 1351 | 乘虚 1352 | 乘隙 1353 | 九 1354 | 也就是说 1355 | 也是 1356 | 了解 1357 | 争取 1358 | 定义 1359 | 二 1360 | 二话不说 1361 | 二话没说 1362 | 互 1363 | 互相 1364 | 篇 1365 | 五 1366 | 交口 1367 | 产生 1368 | 亲口 1369 | 亲手 1370 | 亲眼 1371 | 亲自 1372 | 亲身 1373 | 人人 1374 | 人民 1375 | 什么 1376 | 什么样 1377 | 什麽 1378 | 仅 1379 | 仅仅 1380 | 今后 1381 | 今天 1382 | 今年 1383 | 今後 1384 | 仍然 1385 | 从不 1386 | 从严 1387 | 从中 1388 | 从事 1389 | 从今以后 1390 | 从优 1391 | 从古到今 1392 | 从古至今 1393 | 从头 1394 | 从宽 1395 | 从小 1396 | 从新 1397 | 从无到有 1398 | 从早到晚 1399 | 从未 1400 | 从来 1401 | 从此以后 1402 | 从轻 1403 | 从速 1404 | 从重 1405 | 他是 1406 | 他的 1407 | 代替 1408 | 以下 1409 | 以前 1410 | 以后 1411 | 以外 1412 | 以後 1413 | 任务 1414 | 企图 1415 | 伙同 1416 | 伟大 1417 | 传 1418 | 传说 1419 | 传闻 1420 | 似乎 1421 | 但愿 1422 | 何乐而不为 1423 | 何妨 1424 | 何尝 1425 | 何必 1426 | 何止 1427 | 何苦 1428 | 何须 1429 | 你是 1430 | 你的 1431 | 使用 1432 | 依靠 1433 | 便 1434 | 促进 1435 | 保持 1436 | 保管 1437 | 倍加 1438 | 倍感 1439 | 倒不如 1440 | 倒不如说 1441 | 倒是 1442 | 借以 1443 | 借此 1444 | 偏偏 1445 | 做到 1446 | 偶尔 1447 | 偶而 1448 | 允许 1449 | 元／吨 1450 | 充其极 1451 | 充其量 1452 | 充分 1453 | 先后 1454 | 先後 1455 | 先生 1456 | 光 1457 | 全力 1458 | 全年 1459 | 全然 1460 | 全身心 1461 | 全都 1462 | 全面 1463 | 八 1464 | 八成 1465 | 公然 1466 | 六 1467 | 共同 1468 | 共总 1469 | 其后 1470 | 其实 1471 | 具体 1472 | 具体来说 1473 | 具有 1474 | 再 1475 | 再次 1476 | 决不 1477 | 决定 1478 | 决非 1479 | 准备 1480 | 凑巧 1481 | 凝神 1482 | 几乎 1483 | 几度 1484 | 几番 1485 | 几经 1486 | 出发 1487 | 出去 1488 | 出现 1489 | 分头 1490 | 分期 1491 | 分期分批 1492 | 切 1493 | 切不可 1494 | 切切 1495 | 切勿 1496 | 切莫 1497 | 刚 1498 | 刚好 1499 | 刚巧 1500 | 刚才 1501 | 到了儿 1502 | 到处 1503 | 到头 1504 | 到头来 1505 | 到底 1506 | 到目前为止 1507 | 前进 1508 | 前面 1509 | 加上 1510 | 加入 1511 | 加强 1512 | 动不动 1513 | 动辄 1514 | 勃然 1515 | 匆匆 1516 | 十分 1517 | 千 1518 | 千万 1519 | 千万千万 1520 | 半 1521 | 单 1522 | 单单 1523 | 单纯 1524 | 即刻 1525 | 即将 1526 | 即是说 1527 | 却不 1528 | 历 1529 | 原来 1530 | 及时 1531 | 双方 1532 | 反之亦然 1533 | 反之则 1534 | 反倒 1535 | 反倒是 1536 | 反应 1537 | 反手 1538 | 反映 1539 | 取得 1540 | 
取道 1541 | 变成 1542 | 古来 1543 | 另一个 1544 | 另方面 1545 | 另行 1546 | 做 1547 | 叫做 1548 | 召开 1549 | 叮当 1550 | 可好 1551 | 可能 1552 | 各 1553 | 各人 1554 | 各地 1555 | 各式 1556 | 各级 1557 | 合理 1558 | 同一 1559 | 同样 1560 | 后来 1561 | 后面 1562 | 呆呆地 1563 | 呐 1564 | 周围 1565 | 呼啦 1566 | 哗啦 1567 | 哪个 1568 | 哪儿 1569 | 哪天 1570 | 哪年 1571 | 哪样 1572 | 哪边 1573 | 哪里 1574 | 啊呀 1575 | 啊哈 1576 | 啊哟 1577 | 喀 1578 | 嘎嘎 1579 | 四 1580 | 均 1581 | 坚决 1582 | 坚持 1583 | 基本 1584 | 基本上 1585 | 处处 1586 | 处理 1587 | 复杂 1588 | 多亏 1589 | 多多 1590 | 多多少少 1591 | 多多益善 1592 | 多年前 1593 | 多年来 1594 | 多数 1595 | 多次 1596 | 够瞧的 1597 | 大不了 1598 | 大举 1599 | 大事 1600 | 大体 1601 | 大体上 1602 | 大凡 1603 | 大力 1604 | 大多 1605 | 大多数 1606 | 大大 1607 | 大张旗鼓 1608 | 大批 1609 | 大抵 1610 | 大概 1611 | 大略 1612 | 大约 1613 | 大致 1614 | 大都 1615 | 大量 1616 | 大面儿上 1617 | 失去 1618 | 奇 1619 | 奈 1620 | 奋勇 1621 | 她是 1622 | 她的 1623 | 好在 1624 | 好的 1625 | 好象 1626 | 如今 1627 | 如前所述 1628 | 如常 1629 | 如期 1630 | 如次 1631 | 如此等等 1632 | 姑且 1633 | 存在 1634 | 存心 1635 | 它们的 1636 | 它是 1637 | 它的 1638 | 安全 1639 | 完全 1640 | 完成 1641 | 实现 1642 | 实际 1643 | 宣布 1644 | 容易 1645 | 密切 1646 | 对应 1647 | 将才 1648 | 将要 1649 | 将近 1650 | 少数 1651 | 尔等 1652 | 尤其 1653 | 就地 1654 | 就此 1655 | 尽可能 1656 | 尽如人意 1657 | 尽心尽力 1658 | 尽心竭力 1659 | 尽快 1660 | 尽早 1661 | 尽然 1662 | 尽量 1663 | 局外 1664 | 居然 1665 | 届时 1666 | 属于 1667 | 屡 1668 | 屡屡 1669 | 屡次 1670 | 屡次三番 1671 | 岂 1672 | 岂止 1673 | 岂非 1674 | 川流不息 1675 | 左右 1676 | 巨大 1677 | 巩固 1678 | 差一点 1679 | 差不多 1680 | 已经 1681 | 带 1682 | 帮助 1683 | 常 1684 | 常常 1685 | 常言说 1686 | 常言说得好 1687 | 常言道 1688 | 平素 1689 | 简介 1690 | 岁 1691 | 年 1692 | 年复一年 1693 | 并不 1694 | 并不是 1695 | 并排 1696 | 并无 1697 | 并没 1698 | 并没有 1699 | 并肩 1700 | 广大 1701 | 广泛 1702 | 应当 1703 | 应用 1704 | 应该 1705 | 应按 1706 | 应 1707 | 开展 1708 | 引起 1709 | 弗 1710 | 指 1711 | 指导 1712 | 之间 1713 | 弹指之间 1714 | 强烈 1715 | 强调 1716 | 归根到底 1717 | 归根结底 1718 | 当下 1719 | 当中 1720 | 当儿 1721 | 当前 1722 | 当即 1723 | 当口儿 1724 | 当场 1725 | 当头 1726 | 当庭 1727 | 当时 1728 | 当真 1729 | 形成 1730 | 彻夜 1731 | 彻底 1732 | 往往 1733 | 待到 1734 | 很多 1735 | 很少 1736 | 後来 
1737 | 後面 1738 | 得出 1739 | 得到 1740 | 得天独厚 1741 | 得起 1742 | 心里 1743 | 必 1744 | 必定 1745 | 必将 1746 | 必然 1747 | 必要 1748 | 必须 1749 | 快 1750 | 快要 1751 | 忽地 1752 | 忽然 1753 | 怎么 1754 | 怎么办 1755 | 怎么样 1756 | 怎样 1757 | 怎麽 1758 | 怕 1759 | 急匆匆 1760 | 怪 1761 | 怪不得 1762 | 总是 1763 | 总结 1764 | 总论 1765 | 总则 1766 | 总纲 1767 | 附则 1768 | 附录 1769 | 恍然 1770 | 恐怕 1771 | 恰似 1772 | 恰好 1773 | 恰如 1774 | 恰巧 1775 | 恰恰 1776 | 恰逢 1777 | 您们 1778 | 您是 1779 | 惯常 1780 | 意思 1781 | 愤然 1782 | 愿意 1783 | 成为 1784 | 成年 1785 | 成年累月 1786 | 成心 1787 | 我是 1788 | 我的 1789 | 或多或少 1790 | 或许 1791 | 战斗 1792 | 截然 1793 | 所谓 1794 | 扑通 1795 | 打开天窗说亮话 1796 | 扩大 1797 | 抽冷子 1798 | 拦腰 1799 | 按时 1800 | 按期 1801 | 按理 1802 | 按说 1803 | 挨个 1804 | 挨家挨户 1805 | 挨次 1806 | 挨着 1807 | 挨门挨户 1808 | 挨门逐户 1809 | 据实 1810 | 据悉 1811 | 据我所知 1812 | 据称 1813 | 据说 1814 | 掌握 1815 | 接下来 1816 | 接著 1817 | 接连不断 1818 | 放量 1819 | 故意 1820 | 敞开儿 1821 | 敢 1822 | 敢于 1823 | 敢情 1824 | 数/ 1825 | 整个 1826 | 断然 1827 | 方便 1828 | 方才 1829 | 方能 1830 | 方面 1831 | 无法 1832 | 日复一日 1833 | 日渐 1834 | 日益 1835 | 日臻 1836 | 日见 1837 | 昂然 1838 | 明显 1839 | 明确 1840 | 是不是 1841 | 是否 1842 | 显然 1843 | 显著 1844 | 普通 1845 | 普遍 1846 | 暗中 1847 | 暗地里 1848 | 暗自 1849 | 更 1850 | 更为 1851 | 更加 1852 | 更进一步 1853 | 曾经 1854 | 最后 1855 | 最大 1856 | 最好 1857 | 最後 1858 | 最近 1859 | 最高 1860 | 有利 1861 | 有力 1862 | 有所 1863 | 有效 1864 | 有点 1865 | 有的是 1866 | 有着 1867 | 有著 1868 | 末##末 1869 | 我国 1870 | 本国 1871 | 外国 1872 | 本项目 1873 | 权时 1874 | 来不及 1875 | 来得及 1876 | 来看 1877 | 来讲 1878 | 极 1879 | 极为 1880 | 极其 1881 | 极力 1882 | 极大 1883 | 极度 1884 | 极端 1885 | 构成 1886 | 根本 1887 | 格外 1888 | 梆 1889 | 概 1890 | 概述 1891 | 次第 1892 | 欢迎 1893 | 正在 1894 | 正常 1895 | 此中 1896 | 此后 1897 | 殆 1898 | 个 1899 | 每个 1900 | 每天 1901 | 每年 1902 | 每时每刻 1903 | 每每 1904 | 每逢 1905 | 比如说 1906 | 比照 1907 | 比起 1908 | 比较 1909 | 毕竟 1910 | 毫不 1911 | 毫无 1912 | 毫无例外 1913 | 毫无保留地 1914 | 汝 1915 | 沙沙 1916 | 没 1917 | 没有 1918 | 注意 1919 | 深入 1920 | 清楚 1921 | 满 1922 | 满足 1923 | 然 1924 | 然後 1925 | 牢牢 1926 | 特别是 1927 | 特殊 1928 | 特征 1929 | 特点 1930 | 独 1931 | 独自 1932 | 猛然 1933 | 
猛然间 1934 | 率尔 1935 | 率然 1936 | 现代 1937 | 现在 1938 | 理应 1939 | 理当 1940 | 理该 1941 | 瑟瑟 1942 | 甫 1943 | 甭 1944 | 略为 1945 | 略加 1946 | 略微 1947 | 白 1948 | 白白 1949 | 皆可 1950 | 目前 1951 | 直接 1952 | 相似 1953 | 相信 1954 | 相反 1955 | 相同 1956 | 相对 1957 | 相应 1958 | 相当 1959 | 相等 1960 | 看上去 1961 | 看出 1962 | 看到 1963 | 看来 1964 | 看样子 1965 | 看看 1966 | 看见 1967 | 看起来 1968 | 真是 1969 | 真正 1970 | 知道 1971 | 砰 1972 | 确定 1973 | 碰巧 1974 | 种 1975 | 积极 1976 | 移动 1977 | 究竟 1978 | 穷年累月 1979 | 突出 1980 | 突然 1981 | 窃 1982 | 个月 1983 | 立 1984 | 立刻 1985 | 立即 1986 | 立地 1987 | 立时 1988 | 立马 1989 | 竟 1990 | 竟然 1991 | 第二 1992 | 策略地 1993 | 简直 1994 | 简而言之 1995 | 粗 1996 | 精光 1997 | 累年 1998 | 累次 1999 | 纯 2000 | 纯粹 2001 | 练习 2002 | 组成 2003 | 经常 2004 | 结合 2005 | 绝 2006 | 绝不 2007 | 绝对 2008 | 绝非 2009 | 绝顶 2010 | 继续 2011 | 维持 2012 | 缕缕 2013 | 老 2014 | 老大 2015 | 老二 2016 | 老是 2017 | 老老实实 2018 | 考虑 2019 | 而又 2020 | 而论 2021 | 联系 2022 | 联袂 2023 | 背地里 2024 | 背靠背 2025 | 能够 2026 | 臭 2027 | 良好 2028 | 范围 2029 | 莫 2030 | 莫不 2031 | 莫非 2032 | 得 2033 | 获得 2034 | 藉以 2035 | 蛮 2036 | 行为 2037 | 行动 2038 | 表明 2039 | 表示 2040 | 要求 2041 | 见 2042 | 规定 2043 | 觉得 2044 | 认为 2045 | 认真 2046 | 认识 2047 | 认可 2048 | 论说 2049 | 话说 2050 | 该当 2051 | 说明 2052 | 说说 2053 | 请勿 2054 | 谨慎 2055 | 豁然 2056 | 赶快 2057 | 赶早不赶晚 2058 | 起先 2059 | 起初 2060 | 起头 2061 | 起来 2062 | 起首 2063 | 趁便 2064 | 趁势 2065 | 趁早 2066 | 趁机 2067 | 趁热 2068 | 路经 2069 | 转动 2070 | 转变 2071 | 转贴 2072 | 轰然 2073 | 较为 2074 | 较比 2075 | 达到 2076 | 达旦 2077 | 迄今 2078 | 迅速 2079 | 过于 2080 | 过去 2081 | 过来 2082 | 过程 2083 | 运用 2084 | 近 2085 | 近几年来 2086 | 近年来 2087 | 近来 2088 | 这点 2089 | 这种 2090 | 这麽 2091 | 租入 2092 | 换入 2093 | 换出 2094 | 转入 2095 | 转出 2096 | 计入 2097 | 存入 2098 | 进入 2099 | 进去 2100 | 进来 2101 | 进步 2102 | 进行 2103 | 连声 2104 | 连日 2105 | 连日来 2106 | 连袂 2107 | 连连 2108 | 迟早 2109 | 迫于 2110 | 适应 2111 | 适当 2112 | 适用 2113 | 逐渐 2114 | 通常 2115 | 造成 2116 | 逢 2117 | 遇到 2118 | 遭到 2119 | 避免 2120 | 那末 2121 | 那麽 2122 | 部分 2123 | 采取 2124 | 里面 2125 | 重大 2126 | 重新 2127 | 重要 2128 | 长期以来 2129 | 长此下去 2130 | 长线 2131 | 长话短说 2132 | 
短期内 2133 | 问题 2134 | 间或 2135 | 防止 2136 | 附近 2137 | 陈年 2138 | 限制 2139 | 陡然 2140 | 除却 2141 | 除去 2142 | 除此 2143 | 除此以外 2144 | 除此而外 2145 | 随著 2146 | 隔夜 2147 | 隔日 2148 | 难得 2149 | 难怪 2150 | 难说 2151 | 难道 2152 | 集中 2153 | 零 2154 | 给 2155 | 需 2156 | 需要 2157 | 非常 2158 | 非得 2159 | 顶多 2160 | 顷刻 2161 | 顷刻之间 2162 | 顷刻间 2163 | 顿时 2164 | 颇 2165 | 风雨无阻 2166 | 饱 2167 | 马上 2168 | 高 2169 | 高低 2170 | 高兴 2171 | 默然 2172 | 默默地 2173 | 齐 2174 | 表中 2175 | 包括 2176 | 大于 2177 | 小于 2178 | 等于 2179 | 发生 2180 | 情况 2181 | 情况下 2182 | 增加 2183 | 减少 2184 | 减去 2185 | 提取 2186 | 又称 2187 | 高于 2188 | 低于 2189 | 未能 2190 | 往来 2191 | 泛指 2192 | 有利于 2193 | 依赖 2194 | 中的 2195 | 分为 2196 | 安排 2197 | 分成 2198 | 作业 2199 | 书目 2200 | 目录 2201 | 小结 2202 | 意义 2203 | 影响 2204 | 另外 2205 | 条件 2206 | 项目 2207 | 内容 2208 | 大小 2209 | 方向 2210 | 讨论 2211 | 结果 2212 | 特点 2213 | 方法 2214 | 材料 2215 | 特征 2216 | 原则 2217 | 原因 2218 | 定义 2219 | 含义 2220 | 涵义 2221 | 概念 2222 | 概况 2223 | 构成 2224 | 类型 2225 | 计算 2226 | 分类 2227 | 其他 2228 | 其它 2229 | 问题 2230 | 绪论 2231 | 结论 2232 | 导论 2233 | 导语 2234 | 总论 2235 | 总结 2236 | 总则 2237 | 总之 2238 | 小结 2239 | 前言 2240 | 引言 2241 | 序言 2242 | 附注 2243 | 附录 2244 | 附则 2245 | 后记 2246 | 注意 2247 | 概要 2248 | 概述 2249 | 简介 2250 | 建议 2251 | 并列 2252 | 描述 2253 | 结语 2254 | 结束语 2255 | 总结与展望 2256 | 展望未来 2257 | 结论与建议 2258 | 总要求 2259 | 课程介绍 2260 | 作品解读 2261 | 举例子 2262 | 案例分析 2263 | 本章要点 2264 | 思考及要点 2265 | 重点知识 2266 | 难点知识 2267 | 知识点整理 2268 | 主要知识点掌握程度 2269 | 课堂笔记 2270 | 关键词汇 2271 | 拓展资源 2272 | -------------------------------------------------------------------------------- /data/stopwords.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | aboard 4 | about 5 | above 6 | according 7 | according to 8 | across 9 | afore 10 | after 11 | afterwards 12 | again 13 | against 14 | agin 15 | all 16 | almost 17 | alone 18 | along 19 | alongside 20 | already 21 | also 22 | although 23 | always 24 | am 25 | amid 26 | amidst 27 | among 28 | amongst 29 | amoungst 30 | amount 31 | an 32 | and 33 | anent 
34 | another 35 | any 36 | anyhow 37 | anyone 38 | anything 39 | anyway 40 | anywhere 41 | approximately 42 | are 43 | around 44 | as 45 | asked 46 | aslant 47 | astride 48 | at 49 | athwart 50 | back 51 | bar 52 | be 53 | became 54 | because 55 | because of 56 | become 57 | becomes 58 | becoming 59 | been 60 | before 61 | beforehand 62 | behind 63 | being 64 | below 65 | beneath 66 | beside 67 | besides 68 | between 69 | betwixt 70 | beyond 71 | bill 72 | both 73 | bottom 74 | but 75 | by 76 | call 77 | called 78 | can 79 | cannot 80 | cant 81 | circa 82 | co 83 | computer 84 | con 85 | could 86 | couldnt 87 | cry 88 | currently 89 | dare 90 | de 91 | describe 92 | despite 93 | detail 94 | did 95 | do 96 | does 97 | done 98 | down 99 | dr 100 | due 101 | due to 102 | during 103 | e.g., 104 | each 105 | earlier 106 | eg 107 | eight 108 | either 109 | eleven 110 | else 111 | elsewhere 112 | empty 113 | enough 114 | ere 115 | etc 116 | even 117 | eventually 118 | ever 119 | every 120 | everyone 121 | everything 122 | everywhere 123 | except 124 | few 125 | fifteen 126 | fify 127 | fill 128 | find 129 | fire 130 | first 131 | five 132 | for 133 | former 134 | formerly 135 | forty 136 | found 137 | four 138 | from 139 | front 140 | full 141 | further 142 | get 143 | give 144 | go 145 | had 146 | has 147 | hasnt 148 | have 149 | he 150 | hence 151 | her 152 | here 153 | hereafter 154 | hereby 155 | herein 156 | hereupon 157 | hers 158 | herself 159 | him 160 | himself 161 | his 162 | how 163 | however 164 | hundred 165 | i 166 | ie 167 | if 168 | in 169 | inc 170 | indeed 171 | inside 172 | instead 173 | interest 174 | into 175 | is 176 | it 177 | its 178 | itself 179 | just 180 | keep 181 | last 182 | latter 183 | latterly 184 | least 185 | less 186 | like 187 | ltd 188 | made 189 | major 190 | many 191 | may 192 | maybe 193 | me 194 | meanwhile 195 | mid 196 | midst 197 | might 198 | mill 199 | mine 200 | minus 201 | more 202 | moreover 203 | most 204 | mostly 205 | 
move 206 | mr 207 | mrs 208 | ms 209 | much 210 | must 211 | my 212 | myself 213 | name 214 | namely 215 | near 216 | need 217 | neither 218 | net 219 | never 220 | nevertheless 221 | next 222 | nigh 223 | nigher 224 | nighest 225 | nine 226 | no 227 | nobody 228 | none 229 | noone 230 | nor 231 | not 232 | nothing 233 | notwithstanding 234 | now 235 | nowhere 236 | of 237 | off 238 | often 239 | on 240 | on to 241 | once 242 | one 243 | only 244 | onto 245 | or 246 | other 247 | others 248 | otherwise 249 | ought 250 | our 251 | ours 252 | ourselves 253 | out 254 | out of 255 | outside 256 | over 257 | own 258 | part 259 | partly 260 | past 261 | pending 262 | per 263 | perhaps 264 | please 265 | plus 266 | prior 267 | put 268 | qua 269 | rather 270 | re 271 | regarding 272 | round 273 | same 274 | sans 275 | save 276 | see 277 | seem 278 | seemed 279 | seeming 280 | seems 281 | separately 282 | serious 283 | seven 284 | several 285 | shall 286 | she 287 | should 288 | show 289 | side 290 | similarly 291 | since 292 | sincere 293 | six 294 | sixty 295 | so 296 | some 297 | somehow 298 | someone 299 | something 300 | sometime 301 | sometimes 302 | somewhere 303 | still 304 | such 305 | system 306 | take 307 | ten 308 | than 309 | that 310 | the 311 | their 312 | theirs 313 | them 314 | themselves 315 | then 316 | thence 317 | there 318 | thereafter 319 | thereby 320 | therefore 321 | therein 322 | thereupon 323 | these 324 | they 325 | thick 326 | thin 327 | third 328 | this 329 | those 330 | though 331 | three 332 | through 333 | throughout 334 | thru 335 | thus 336 | till 337 | to 338 | together 339 | too 340 | top 341 | toward 342 | towards 343 | twelve 344 | twenty 345 | two 346 | un 347 | under 348 | underneath 349 | unless 350 | unlike 351 | until 352 | unto 353 | up 354 | upon 355 | us 356 | versus 357 | very 358 | via 359 | vice 360 | volume 361 | was 362 | we 363 | well 364 | were 365 | what 366 | whatever 367 | whats 368 | when 369 | whence 370 | whenever 
371 | where 372 | whereafter 373 | whereas 374 | whereby 375 | wherein 376 | whereupon 377 | wherever 378 | whether 379 | which 380 | while 381 | whither 382 | who 383 | whoever 384 | whole 385 | whom 386 | whose 387 | why 388 | will 389 | with 390 | within 391 | without 392 | would 393 | yesterday 394 | yet 395 | you 396 | your 397 | yours 398 | yourself 399 | yourselves 400 | { 401 | | 402 | } 403 | ~ 404 | ¡ 405 | ¦ 406 | « 407 | 408 | ¯ 409 | ´ 410 | ¸ 411 | » 412 | ¿ 413 | ˇ 414 | ˉ 415 | ˊ 416 | ˋ 417 | ˜ 418 | ‐ 419 | —　 420 | ― 421 | ‖ 422 | ‘ 423 | ’ 424 | “ 425 | ” 426 | • 427 | … 428 | ‹ 429 | › 430 | ∕ 431 | 、 432 | 。 433 | 〈 434 | 〉 435 | 《 436 | 》 437 | 「 438 | 」 439 | 『 440 | 』 441 | 【 442 | 】 443 | 〔 444 | 〕 445 | 〖 446 | 〗 447 | 〝 448 | 〞 449 | 一 450 | 一些 451 | 一何 452 | 一切 453 | 一则 454 | 一方面 455 | 一旦 456 | 一来 457 | 一样 458 | 一般 459 | 一转眼 460 | 万一 461 | 上 462 | 上下 463 | 下 464 | 不 465 | 不仅 466 | 不但 467 | 不光 468 | 不单 469 | 不只 470 | 不外乎 471 | 不如 472 | 不妨 473 | 不尽 474 | 不尽然 475 | 不得 476 | 不怕 477 | 不惟 478 | 不成 479 | 不拘 480 | 不料 481 | 不是 482 | 不比 483 | 不然 484 | 不特 485 | 不独 486 | 不管 487 | 不至于 488 | 不若 489 | 论 490 | 不论 491 | 不过 492 | 不问 493 | 与 494 | 与其 495 | 与其说 496 | 与否 497 | 与此同时 498 | 且 499 | 且不说 500 | 且说 501 | 两者 502 | 两个 503 | 个别 504 | 临 505 | 为 506 | 为了 507 | 为止 508 | 为此 509 | 为着 510 | 乃 511 | 乃至 512 | 乃至于 513 | 么 514 | 之 515 | 之一 516 | 之所以 517 | 之类 518 | 乌乎 519 | 乎 520 | 乘 521 | 也 522 | 也好 523 | 也罢 524 | 了 525 | 二来 526 | 于 527 | 于是 528 | 于是乎 529 | 云云 530 | 云尔 531 | 些 532 | 亦 533 | 人们 534 | 人家 535 | 今 536 | 介于 537 | 仍 538 | 仍旧 539 | 从 540 | 从此 541 | 从而 542 | 他 543 | 他人 544 | 他们 545 | 以 546 | 以上 547 | 以为 548 | 以便 549 | 以免 550 | 以及 551 | 以故 552 | 以期 553 | 以来 554 | 以至 555 | 以至于 556 | 以致 557 | 们 558 | 任何 559 | 任凭 560 | 似的 561 | 但 562 | 但凡 563 | 但是 564 | 何 565 | 何以 566 | 何况 567 | 何处 568 | 何时 569 | 余外 570 | 作为 571 | 你 572 | 你们 573 | 使 574 | 使得 575 | 例如 576 | 依 577 | 依据 578 | 依照 579 | 便于 580 | 俺 581 | 俺们 582 | 倘 583 | 倘使 584 | 倘或 585 | 倘然 586 | 倘若 587 | 假使 
588 | 假如 589 | 假若 590 | 傥然 591 | 像 592 | 儿 593 | 先不先 594 | 光是 595 | 全体 596 | 全部 597 | 全额 598 | 超额 599 | 兮 600 | 关于 601 | 其 602 | 其一 603 | 其中 604 | 其二 605 | 其他 606 | 其余 607 | 其它 608 | 其次 609 | 具体地说 610 | 具体说来 611 | 兼之 612 | 内容 613 | 再其次 614 | 再则 615 | 再有 616 | 再者 617 | 再者说 618 | 再说 619 | 冒 620 | 冲出 621 | 况且 622 | 几 623 | 几时 624 | 凡 625 | 凡是 626 | 凭 627 | 凭借 628 | 出于 629 | 出来 630 | 分别 631 | 则 632 | 原则 633 | 则甚 634 | 别 635 | 别人 636 | 别处 637 | 别是 638 | 别的 639 | 别管 640 | 别说 641 | 到 642 | 前后 643 | 前此 644 | 前者 645 | 加之 646 | 加以 647 | 即 648 | 即对 649 | 即令 650 | 即使 651 | 即便 652 | 即如 653 | 即或 654 | 即若 655 | 却 656 | 去 657 | 又 658 | 又及 659 | 及 660 | 及其 661 | 及至 662 | 反之 663 | 反而 664 | 反过来 665 | 反过来说 666 | 受到 667 | 另 668 | 另一方面 669 | 另外 670 | 另悉 671 | 只 672 | 只当 673 | 只怕 674 | 只是 675 | 只有 676 | 只消 677 | 只要 678 | 只限 679 | 叫 680 | 叮咚 681 | 可 682 | 可以 683 | 可是 684 | 可见 685 | 各国 686 | 各个 687 | 各位 688 | 各种 689 | 各自 690 | 同 691 | 同时 692 | 后 693 | 后者 694 | 向 695 | 向使 696 | 向着 697 | 吓 698 | 吗 699 | 否则 700 | 吧 701 | 吧哒 702 | 吱 703 | 呀 704 | 呃 705 | 呕 706 | 呗 707 | 呜 708 | 呜呼 709 | 呢 710 | 呵 711 | 呵呵 712 | 呸 713 | 呼哧 714 | 咋 715 | 和 716 | 咚 717 | 咦 718 | 咧 719 | 咱 720 | 咱们 721 | 咳 722 | 哇 723 | 哈 724 | 哈哈 725 | 哉 726 | 哎 727 | 哎呀 728 | 哎哟 729 | 哗 730 | 哟 731 | 哦 732 | 哩 733 | 哪 734 | 哪些 735 | 哪怕 736 | 哼 737 | 哼唷 738 | 唉 739 | 唯有 740 | 啊 741 | 啐 742 | 啥 743 | 啦 744 | 啪达 745 | 啷当 746 | 喂 747 | 喏 748 | 喔唷 749 | 喽 750 | 嗡 751 | 嗡嗡 752 | 嗬 753 | 嗯 754 | 嗳 755 | 嘎 756 | 嘎登 757 | 嘘 758 | 嘛 759 | 嘻 760 | 嘿 761 | 嘿嘿 762 | 因 763 | 因为 764 | 因了 765 | 因此 766 | 因着 767 | 因而 768 | 固然 769 | 在 770 | 在下 771 | 在于 772 | 地下 773 | 基于 774 | 处在 775 | 多 776 | 多么 777 | 多少 778 | 大家 779 | 她 780 | 她们 781 | 好 782 | 如 783 | 如上 784 | 如上所述 785 | 如下 786 | 如何 787 | 如其 788 | 如同 789 | 如是 790 | 如果 791 | 如此 792 | 如若 793 | 始而 794 | 孰料 795 | 孰知 796 | 宁 797 | 宁可 798 | 宁愿 799 | 宁肯 800 | 它 801 | 它们 802 | 对 803 | 对于 804 | 对待 805 | 对方 806 | 对比 807 | 将 808 | 小 809 | 尔 810 | 尔后 811 | 尔尔 812 | 尚且 813 | 就 814 | 就是 815 | 就是了 816 | 就是说 817 
| 就算 818 | 就要 819 | 尽 820 | 尽管 821 | 尽管如此 822 | 岂但 823 | 己 824 | 已 825 | 已矣 826 | 巴 827 | 巴巴 828 | 并 829 | 并且 830 | 并非 831 | 庶乎 832 | 庶几 833 | 开外 834 | 开始 835 | 归 836 | 归齐 837 | 当 838 | 当地 839 | 当然 840 | 当着 841 | 彼 842 | 彼时 843 | 彼此 844 | 往 845 | 待 846 | 很 847 | 得了 848 | 怎 849 | 怎奈 850 | 总之 851 | 总的来看 852 | 总的来说 853 | 总的说来 854 | 总而言之 855 | 恰恰相反 856 | 您 857 | 惟其 858 | 慢说 859 | 我 860 | 我们 861 | 或 862 | 或则 863 | 或是 864 | 或曰 865 | 或者 866 | 截至 867 | 所以 868 | 所在 869 | 所幸 870 | 有所 871 | 所有 872 | 所示 873 | 才 874 | 才能 875 | 打 876 | 打从 877 | 把 878 | 抑或 879 | 拿 880 | 按 881 | 按照 882 | 换句话说 883 | 换言之 884 | 据 885 | 据此 886 | 接着 887 | 故 888 | 故此 889 | 故而 890 | 旁人 891 | 无非 892 | 无宁 893 | 无论 894 | 既 895 | 既往 896 | 既是 897 | 既然 898 | 时候 899 | 是 900 | 是以 901 | 是的 902 | 曾 903 | 替 904 | 替代 905 | 最 906 | 有 907 | 有些 908 | 有关 909 | 有及 910 | 有时 911 | 有的 912 | 望 913 | 朝向 914 | 朝着 915 | 本人 916 | 本地 917 | 本着 918 | 本身 919 | 本来 920 | 本例中 921 | 来 922 | 来着 923 | 来自 924 | 来说 925 | 说 926 | 极了 927 | 果然 928 | 果真 929 | 某 930 | 某个 931 | 某些 932 | 某某 933 | 根据 934 | 欤 935 | 正值 936 | 正如 937 | 正巧 938 | 正是 939 | 此 940 | 此地 941 | 此处 942 | 此外 943 | 此时 944 | 此次 945 | 此间 946 | 毋宁 947 | 每 948 | 每当 949 | 比 950 | 比及 951 | 比如 952 | 比方 953 | 没奈何 954 | 沿 955 | 沿着 956 | 漫说 957 | 焉 958 | 然则 959 | 然后 960 | 然而 961 | 该项 962 | 某项 963 | 各项 964 | 照 965 | 照着 966 | 犹且 967 | 犹自 968 | 甚且 969 | 甚么 970 | 甚或 971 | 甚而 972 | 甚至 973 | 甚至于 974 | 用 975 | 用于 976 | 用来 977 | 由 978 | 由于 979 | 由是 980 | 由此 981 | 由此可见 982 | 的 983 | 的确 984 | 的话 985 | 直到 986 | 相对而言 987 | 省得 988 | 看 989 | 眨眼 990 | 着 991 | 着呢 992 | 矣 993 | 矣乎 994 | 矣哉 995 | 离开 996 | 竟而 997 | 第 998 | 等 999 | 等到 1000 | 等等 1001 | 简言之 1002 | 类如 1003 | 紧接着 1004 | 纵 1005 | 纵令 1006 | 纵使 1007 | 纵然 1008 | 经过 1009 | 结果 1010 | 继之 1011 | 继后 1012 | 继而 1013 | 综上所述 1014 | 罢了 1015 | 而 1016 | 而且 1017 | 而况 1018 | 而后 1019 | 而外 1020 | 而已 1021 | 而是 1022 | 而言 1023 | 而对 1024 | 能不能 1025 | 能否 1026 | 腾 1027 | 自个儿 1028 | 自从 1029 | 自各儿 1030 | 自后 1031 | 自家 1032 | 自己 1033 | 自打 1034 | 自身 1035 | 至 1036 | 至于 1037 | 
至今 1038 | 至若 1039 | 致 1040 | 般的 1041 | 若 1042 | 若夫 1043 | 若是 1044 | 若果 1045 | 若非 1046 | 莫不然 1047 | 莫如 1048 | 莫若 1049 | 虽 1050 | 虽则 1051 | 虽然 1052 | 虽说 1053 | 被 1054 | 要 1055 | 要不 1056 | 要不是 1057 | 要不然 1058 | 要么 1059 | 要是 1060 | 譬喻 1061 | 譬如 1062 | 让 1063 | 许多 1064 | 设使 1065 | 设或 1066 | 设若 1067 | 诚如 1068 | 诚然 1069 | 该 1070 | 说来 1071 | 诸 1072 | 诸位 1073 | 诸如 1074 | 谁 1075 | 谁人 1076 | 谁料 1077 | 谁知 1078 | 贼死 1079 | 赖以 1080 | 赶 1081 | 起 1082 | 起见 1083 | 趁 1084 | 趁着 1085 | 越是 1086 | 距 1087 | 跟 1088 | 较 1089 | 较之 1090 | 边 1091 | 过 1092 | 还 1093 | 还是 1094 | 还有 1095 | 还要 1096 | 这 1097 | 这一来 1098 | 这个 1099 | 这么 1100 | 这么些 1101 | 这么样 1102 | 这么点儿 1103 | 这些 1104 | 这会儿 1105 | 这儿 1106 | 这就是说 1107 | 这时 1108 | 这样 1109 | 这次 1110 | 这般 1111 | 这边 1112 | 这里 1113 | 进而 1114 | 连 1115 | 连同 1116 | 逐步 1117 | 通过 1118 | 遵循 1119 | 遵照 1120 | 那 1121 | 那个 1122 | 那么 1123 | 那么些 1124 | 那么样 1125 | 那些 1126 | 那会儿 1127 | 那儿 1128 | 那时 1129 | 那样 1130 | 那般 1131 | 那边 1132 | 那里 1133 | 都 1134 | 鄙人 1135 | 鉴于 1136 | 针对 1137 | 阿 1138 | 除 1139 | 除了 1140 | 除外 1141 | 除开 1142 | 除此之外 1143 | 除非 1144 | 随 1145 | 随后 1146 | 随时 1147 | 随着 1148 | 难道说 1149 | 非但 1150 | 非徒 1151 | 非特 1152 | 非独 1153 | 靠 1154 | 顺 1155 | 顺着 1156 | 首先 1157 | ︰ 1158 | ︳ 1159 | ︴ 1160 | ︵ 1161 | ︶ 1162 | ︷ 1163 | ︸ 1164 | ︹ 1165 | ︺ 1166 | ︻ 1167 | ︼ 1168 | ︽ 1169 | ︾ 1170 | ︿ 1171 | ﹀ 1172 | ﹁ 1173 | ﹂ 1174 | ﹃ 1175 | ﹄ 1176 | ﹉ 1177 | ﹊ 1178 | ﹋ 1179 | ﹌ 1180 | ﹍ 1181 | ﹎ 1182 | ﹏ 1183 | ﹐ 1184 | ﹑ 1185 | ﹔ 1186 | ﹕ 1187 | ﹖ 1188 | ﹝ 1189 | ﹞ 1190 | ﹟ 1191 | ﹠ 1192 | ﹡ 1193 | ﹢ 1194 | ﹤ 1195 | ﹦ 1196 | ﹨ 1197 | ﹩ 1198 | ﹪ 1199 | ﹫ 1200 | ！ 1201 | ＂ 1202 | ＇ 1203 | （ 1204 | ） 1205 | ， 1206 | ： 1207 | ； 1208 | ？ 1209 | ． 1210 |  1211 | 第一次 1212 | 第二次 1213 | 第三次 1214 | 三个 1215 | 四个 1216 | 五个 1217 | 六个 1218 | 七个 1219 | 八个 1220 | 九个 1221 | 十个 1222 | 一节 1223 | 二节 1224 | 三节 1225 | 四节 1226 | 五节 1227 | 六节 1228 | 七节 1229 | 八节 1230 | 九节 1231 | 十节 1232 | 一章 1233 | 二章 1234 | 三章 1235 | 四章 1236 | 五章 1237 | 六章 1238 | 七章 1239 | 八章 1240 | 九章 1241 | 十章 1242 | 一下 1243 | 
一个 1244 | 一则通过 1245 | 一天 1246 | 一定 1247 | 一时 1248 | 一次 1249 | 一片 1250 | 一番 1251 | 一直 1252 | 一致 1253 | 一起 1254 | 一边 1255 | 一面 1256 | 一国 1257 | 七 1258 | 三 1259 | 三天两头 1260 | 三番两次 1261 | 三番五次 1262 | 上升 1263 | 上去 1264 | 上来 1265 | 上述 1266 | 上面 1267 | 下列 1268 | 下去 1269 | 下来 1270 | 下面 1271 | 不一 1272 | 不下 1273 | 不久 1274 | 不了 1275 | 不亦乐乎 1276 | 不仅仅 1277 | 不仅仅是 1278 | 不会 1279 | 不免 1280 | 不再 1281 | 不力 1282 | 不变 1283 | 不可 1284 | 不可开交 1285 | 不可抗拒 1286 | 不同 1287 | 不外 1288 | 不够 1289 | 不大 1290 | 不定 1291 | 不对 1292 | 不少 1293 | 不巧 1294 | 不已 1295 | 不常 1296 | 不得不 1297 | 不得了 1298 | 不得已 1299 | 不必 1300 | 不怎么 1301 | 不择手段 1302 | 不敢 1303 | 不断 1304 | 不日 1305 | 不时 1306 | 不曾 1307 | 不止 1308 | 不止一次 1309 | 不消 1310 | 不满 1311 | 不然的话 1312 | 不由得 1313 | 不知不觉 1314 | 不管怎样 1315 | 不经意 1316 | 不胜 1317 | 不能 1318 | 不能不 1319 | 不要 1320 | 不起 1321 | 不足 1322 | 不迭 1323 | 不限 1324 | 专门 1325 | 严格 1326 | 严重 1327 | 中 1328 | 中小 1329 | 中间 1330 | 丰富 1331 | 串行 1332 | 临到 1333 | 为主 1334 | 为什么 1335 | 为什麽 1336 | 为何 1337 | 主张 1338 | 主要 1339 | 最主要 1340 | 申请 1341 | 申报 1342 | 举凡 1343 | 举行 1344 | 之前 1345 | 之后 1346 | 之後 1347 | 乒 1348 | 乘势 1349 | 乘机 1350 | 乘胜 1351 | 乘虚 1352 | 乘隙 1353 | 九 1354 | 也就是说 1355 | 也是 1356 | 了解 1357 | 争取 1358 | 定义 1359 | 二 1360 | 二话不说 1361 | 二话没说 1362 | 互 1363 | 互相 1364 | 篇 1365 | 五 1366 | 交口 1367 | 产生 1368 | 亲口 1369 | 亲手 1370 | 亲眼 1371 | 亲自 1372 | 亲身 1373 | 人人 1374 | 人民 1375 | 什么 1376 | 什么样 1377 | 什麽 1378 | 仅 1379 | 仅仅 1380 | 今后 1381 | 今天 1382 | 今年 1383 | 今後 1384 | 仍然 1385 | 从不 1386 | 从严 1387 | 从中 1388 | 从事 1389 | 从今以后 1390 | 从优 1391 | 从古到今 1392 | 从古至今 1393 | 从头 1394 | 从宽 1395 | 从小 1396 | 从新 1397 | 从无到有 1398 | 从早到晚 1399 | 从未 1400 | 从来 1401 | 从此以后 1402 | 从轻 1403 | 从速 1404 | 从重 1405 | 他是 1406 | 他的 1407 | 代替 1408 | 以下 1409 | 以前 1410 | 以后 1411 | 以外 1412 | 以後 1413 | 任务 1414 | 企图 1415 | 伙同 1416 | 伟大 1417 | 传 1418 | 传说 1419 | 传闻 1420 | 似乎 1421 | 但愿 1422 | 何乐而不为 1423 | 何妨 1424 | 何尝 1425 | 何必 1426 | 何止 1427 | 何苦 1428 | 何须 1429 | 你是 1430 | 你的 1431 | 使用 1432 | 依靠 1433 | 便 1434 | 促进 1435 | 保持 1436 | 保管 1437 | 倍加 1438 | 倍感 
1439 | 倒不如 1440 | 倒不如说 1441 | 倒是 1442 | 借以 1443 | 借此 1444 | 偏偏 1445 | 做到 1446 | 偶尔 1447 | 偶而 1448 | 允许 1449 | 元／吨 1450 | 充其极 1451 | 充其量 1452 | 充分 1453 | 先后 1454 | 先後 1455 | 先生 1456 | 光 1457 | 全力 1458 | 全年 1459 | 全然 1460 | 全身心 1461 | 全都 1462 | 全面 1463 | 八 1464 | 八成 1465 | 公然 1466 | 六 1467 | 共同 1468 | 共总 1469 | 其后 1470 | 其实 1471 | 具体 1472 | 具体来说 1473 | 具有 1474 | 再 1475 | 再次 1476 | 决不 1477 | 决定 1478 | 决非 1479 | 准备 1480 | 凑巧 1481 | 凝神 1482 | 几乎 1483 | 几度 1484 | 几番 1485 | 几经 1486 | 出发 1487 | 出去 1488 | 出现 1489 | 分头 1490 | 分期 1491 | 分期分批 1492 | 切 1493 | 切不可 1494 | 切切 1495 | 切勿 1496 | 切莫 1497 | 刚 1498 | 刚好 1499 | 刚巧 1500 | 刚才 1501 | 到了儿 1502 | 到处 1503 | 到头 1504 | 到头来 1505 | 到底 1506 | 到目前为止 1507 | 前进 1508 | 前面 1509 | 加上 1510 | 加入 1511 | 加强 1512 | 动不动 1513 | 动辄 1514 | 勃然 1515 | 匆匆 1516 | 十分 1517 | 千 1518 | 千万 1519 | 千万千万 1520 | 半 1521 | 单 1522 | 单单 1523 | 单纯 1524 | 即刻 1525 | 即将 1526 | 即是说 1527 | 却不 1528 | 历 1529 | 原来 1530 | 及时 1531 | 双方 1532 | 反之亦然 1533 | 反之则 1534 | 反倒 1535 | 反倒是 1536 | 反应 1537 | 反手 1538 | 反映 1539 | 取得 1540 | 取道 1541 | 变成 1542 | 古来 1543 | 另一个 1544 | 另方面 1545 | 另行 1546 | 做 1547 | 叫做 1548 | 召开 1549 | 叮当 1550 | 可好 1551 | 可能 1552 | 各 1553 | 各人 1554 | 各地 1555 | 各式 1556 | 各级 1557 | 合理 1558 | 同一 1559 | 同样 1560 | 后来 1561 | 后面 1562 | 呆呆地 1563 | 呐 1564 | 周围 1565 | 呼啦 1566 | 哗啦 1567 | 哪个 1568 | 哪儿 1569 | 哪天 1570 | 哪年 1571 | 哪样 1572 | 哪边 1573 | 哪里 1574 | 啊呀 1575 | 啊哈 1576 | 啊哟 1577 | 喀 1578 | 嘎嘎 1579 | 四 1580 | 均 1581 | 坚决 1582 | 坚持 1583 | 基本 1584 | 基本上 1585 | 处处 1586 | 处理 1587 | 复杂 1588 | 多亏 1589 | 多多 1590 | 多多少少 1591 | 多多益善 1592 | 多年前 1593 | 多年来 1594 | 多数 1595 | 多次 1596 | 够瞧的 1597 | 大不了 1598 | 大举 1599 | 大事 1600 | 大体 1601 | 大体上 1602 | 大凡 1603 | 大力 1604 | 大多 1605 | 大多数 1606 | 大大 1607 | 大张旗鼓 1608 | 大批 1609 | 大抵 1610 | 大概 1611 | 大略 1612 | 大约 1613 | 大致 1614 | 大都 1615 | 大量 1616 | 大面儿上 1617 | 失去 1618 | 奇 1619 | 奈 1620 | 奋勇 1621 | 她是 1622 | 她的 1623 | 好在 1624 | 好的 1625 | 好象 1626 | 如今 1627 | 如前所述 1628 | 如常 1629 | 如期 1630 | 如次 1631 | 如此等等 1632 | 姑且 1633 | 存在 1634 | 存心 1635 | 它们的 
1636 | 它是 1637 | 它的 1638 | 安全 1639 | 完全 1640 | 完成 1641 | 实现 1642 | 实际 1643 | 宣布 1644 | 容易 1645 | 密切 1646 | 对应 1647 | 将才 1648 | 将要 1649 | 将近 1650 | 少数 1651 | 尔等 1652 | 尤其 1653 | 就地 1654 | 就此 1655 | 尽可能 1656 | 尽如人意 1657 | 尽心尽力 1658 | 尽心竭力 1659 | 尽快 1660 | 尽早 1661 | 尽然 1662 | 尽量 1663 | 局外 1664 | 居然 1665 | 届时 1666 | 属于 1667 | 屡 1668 | 屡屡 1669 | 屡次 1670 | 屡次三番 1671 | 岂 1672 | 岂止 1673 | 岂非 1674 | 川流不息 1675 | 左右 1676 | 巨大 1677 | 巩固 1678 | 差一点 1679 | 差不多 1680 | 已经 1681 | 带 1682 | 帮助 1683 | 常 1684 | 常常 1685 | 常言说 1686 | 常言说得好 1687 | 常言道 1688 | 平素 1689 | 简介 1690 | 岁 1691 | 年 1692 | 年复一年 1693 | 并不 1694 | 并不是 1695 | 并排 1696 | 并无 1697 | 并没 1698 | 并没有 1699 | 并肩 1700 | 广大 1701 | 广泛 1702 | 应当 1703 | 应用 1704 | 应该 1705 | 应按 1706 | 应 1707 | 开展 1708 | 引起 1709 | 弗 1710 | 指 1711 | 指导 1712 | 之间 1713 | 弹指之间 1714 | 强烈 1715 | 强调 1716 | 归根到底 1717 | 归根结底 1718 | 当下 1719 | 当中 1720 | 当儿 1721 | 当前 1722 | 当即 1723 | 当口儿 1724 | 当场 1725 | 当头 1726 | 当庭 1727 | 当时 1728 | 当真 1729 | 形成 1730 | 彻夜 1731 | 彻底 1732 | 往往 1733 | 待到 1734 | 很多 1735 | 很少 1736 | 後来 1737 | 後面 1738 | 得出 1739 | 得到 1740 | 得天独厚 1741 | 得起 1742 | 心里 1743 | 必 1744 | 必定 1745 | 必将 1746 | 必然 1747 | 必要 1748 | 必须 1749 | 快 1750 | 快要 1751 | 忽地 1752 | 忽然 1753 | 怎么 1754 | 怎么办 1755 | 怎么样 1756 | 怎样 1757 | 怎麽 1758 | 怕 1759 | 急匆匆 1760 | 怪 1761 | 怪不得 1762 | 总是 1763 | 总结 1764 | 总论 1765 | 总则 1766 | 总纲 1767 | 附则 1768 | 附录 1769 | 恍然 1770 | 恐怕 1771 | 恰似 1772 | 恰好 1773 | 恰如 1774 | 恰巧 1775 | 恰恰 1776 | 恰逢 1777 | 您们 1778 | 您是 1779 | 惯常 1780 | 意思 1781 | 愤然 1782 | 愿意 1783 | 成为 1784 | 成年 1785 | 成年累月 1786 | 成心 1787 | 我是 1788 | 我的 1789 | 或多或少 1790 | 或许 1791 | 战斗 1792 | 截然 1793 | 所谓 1794 | 扑通 1795 | 打开天窗说亮话 1796 | 扩大 1797 | 抽冷子 1798 | 拦腰 1799 | 按时 1800 | 按期 1801 | 按理 1802 | 按说 1803 | 挨个 1804 | 挨家挨户 1805 | 挨次 1806 | 挨着 1807 | 挨门挨户 1808 | 挨门逐户 1809 | 据实 1810 | 据悉 1811 | 据我所知 1812 | 据称 1813 | 据说 1814 | 掌握 1815 | 接下来 1816 | 接著 1817 | 接连不断 1818 | 放量 1819 | 故意 1820 | 敞开儿 1821 | 敢 1822 | 敢于 1823 | 敢情 1824 | 数/ 1825 | 整个 1826 | 断然 1827 | 方便 1828 | 方才 1829 | 方能 1830 | 方面 1831 | 
无法 1832 | 日复一日 1833 | 日渐 1834 | 日益 1835 | 日臻 1836 | 日见 1837 | 昂然 1838 | 明显 1839 | 明确 1840 | 是不是 1841 | 是否 1842 | 显然 1843 | 显著 1844 | 普通 1845 | 普遍 1846 | 暗中 1847 | 暗地里 1848 | 暗自 1849 | 更 1850 | 更为 1851 | 更加 1852 | 更进一步 1853 | 曾经 1854 | 最后 1855 | 最大 1856 | 最好 1857 | 最後 1858 | 最近 1859 | 最高 1860 | 有利 1861 | 有力 1862 | 有所 1863 | 有效 1864 | 有点 1865 | 有的是 1866 | 有着 1867 | 有著 1868 | 末##末 1869 | 我国 1870 | 本国 1871 | 外国 1872 | 本项目 1873 | 权时 1874 | 来不及 1875 | 来得及 1876 | 来看 1877 | 来讲 1878 | 极 1879 | 极为 1880 | 极其 1881 | 极力 1882 | 极大 1883 | 极度 1884 | 极端 1885 | 构成 1886 | 根本 1887 | 格外 1888 | 梆 1889 | 概 1890 | 概述 1891 | 次第 1892 | 欢迎 1893 | 正在 1894 | 正常 1895 | 此中 1896 | 此后 1897 | 殆 1898 | 个 1899 | 每个 1900 | 每天 1901 | 每年 1902 | 每时每刻 1903 | 每每 1904 | 每逢 1905 | 比如说 1906 | 比照 1907 | 比起 1908 | 比较 1909 | 毕竟 1910 | 毫不 1911 | 毫无 1912 | 毫无例外 1913 | 毫无保留地 1914 | 汝 1915 | 沙沙 1916 | 没 1917 | 没有 1918 | 注意 1919 | 深入 1920 | 清楚 1921 | 满 1922 | 满足 1923 | 然 1924 | 然後 1925 | 牢牢 1926 | 特别是 1927 | 特殊 1928 | 特征 1929 | 特点 1930 | 独 1931 | 独自 1932 | 猛然 1933 | 猛然间 1934 | 率尔 1935 | 率然 1936 | 现代 1937 | 现在 1938 | 理应 1939 | 理当 1940 | 理该 1941 | 瑟瑟 1942 | 甫 1943 | 甭 1944 | 略为 1945 | 略加 1946 | 略微 1947 | 白 1948 | 白白 1949 | 皆可 1950 | 目前 1951 | 直接 1952 | 相似 1953 | 相信 1954 | 相反 1955 | 相同 1956 | 相对 1957 | 相应 1958 | 相当 1959 | 相等 1960 | 看上去 1961 | 看出 1962 | 看到 1963 | 看来 1964 | 看样子 1965 | 看看 1966 | 看见 1967 | 看起来 1968 | 真是 1969 | 真正 1970 | 知道 1971 | 砰 1972 | 确定 1973 | 碰巧 1974 | 种 1975 | 积极 1976 | 移动 1977 | 究竟 1978 | 穷年累月 1979 | 突出 1980 | 突然 1981 | 窃 1982 | 个月 1983 | 立 1984 | 立刻 1985 | 立即 1986 | 立地 1987 | 立时 1988 | 立马 1989 | 竟 1990 | 竟然 1991 | 第二 1992 | 策略地 1993 | 简直 1994 | 简而言之 1995 | 粗 1996 | 精光 1997 | 累年 1998 | 累次 1999 | 纯 2000 | 纯粹 2001 | 练习 2002 | 组成 2003 | 经常 2004 | 结合 2005 | 绝 2006 | 绝不 2007 | 绝对 2008 | 绝非 2009 | 绝顶 2010 | 继续 2011 | 维持 2012 | 缕缕 2013 | 老 2014 | 老大 2015 | 老二 2016 | 老是 2017 | 老老实实 2018 | 考虑 2019 | 而又 2020 | 而论 2021 | 联系 2022 | 联袂 2023 | 背地里 2024 | 背靠背 2025 | 能够 2026 | 臭 2027 | 良好 2028 | 范围 2029 | 莫 2030 | 
莫不 2031 | 莫非 2032 | 得 2033 | 获得 2034 | 藉以 2035 | 蛮 2036 | 行为 2037 | 行动 2038 | 表明 2039 | 表示 2040 | 要求 2041 | 见 2042 | 规定 2043 | 觉得 2044 | 认为 2045 | 认真 2046 | 认识 2047 | 认可 2048 | 论说 2049 | 话说 2050 | 该当 2051 | 说明 2052 | 说说 2053 | 请勿 2054 | 谨慎 2055 | 豁然 2056 | 赶快 2057 | 赶早不赶晚 2058 | 起先 2059 | 起初 2060 | 起头 2061 | 起来 2062 | 起首 2063 | 趁便 2064 | 趁势 2065 | 趁早 2066 | 趁机 2067 | 趁热 2068 | 路经 2069 | 转动 2070 | 转变 2071 | 转贴 2072 | 轰然 2073 | 较为 2074 | 较比 2075 | 达到 2076 | 达旦 2077 | 迄今 2078 | 迅速 2079 | 过于 2080 | 过去 2081 | 过来 2082 | 过程 2083 | 运用 2084 | 近 2085 | 近几年来 2086 | 近年来 2087 | 近来 2088 | 这点 2089 | 这种 2090 | 这麽 2091 | 租入 2092 | 换入 2093 | 换出 2094 | 转入 2095 | 转出 2096 | 计入 2097 | 存入 2098 | 进入 2099 | 进去 2100 | 进来 2101 | 进步 2102 | 进行 2103 | 连声 2104 | 连日 2105 | 连日来 2106 | 连袂 2107 | 连连 2108 | 迟早 2109 | 迫于 2110 | 适应 2111 | 适当 2112 | 适用 2113 | 逐渐 2114 | 通常 2115 | 造成 2116 | 逢 2117 | 遇到 2118 | 遭到 2119 | 避免 2120 | 那末 2121 | 那麽 2122 | 部分 2123 | 采取 2124 | 里面 2125 | 重大 2126 | 重新 2127 | 重要 2128 | 长期以来 2129 | 长此下去 2130 | 长线 2131 | 长话短说 2132 | 短期内 2133 | 问题 2134 | 间或 2135 | 防止 2136 | 附近 2137 | 陈年 2138 | 限制 2139 | 陡然 2140 | 除却 2141 | 除去 2142 | 除此 2143 | 除此以外 2144 | 除此而外 2145 | 随著 2146 | 隔夜 2147 | 隔日 2148 | 难得 2149 | 难怪 2150 | 难说 2151 | 难道 2152 | 集中 2153 | 零 2154 | 给 2155 | 需 2156 | 需要 2157 | 非常 2158 | 非得 2159 | 顶多 2160 | 顷刻 2161 | 顷刻之间 2162 | 顷刻间 2163 | 顿时 2164 | 颇 2165 | 风雨无阻 2166 | 饱 2167 | 马上 2168 | 高 2169 | 高低 2170 | 高兴 2171 | 默然 2172 | 默默地 2173 | 齐 2174 | 表中 2175 | 包括 2176 | 大于 2177 | 小于 2178 | 等于 2179 | 发生 2180 | 情况 2181 | 情况下 2182 | 增加 2183 | 减少 2184 | 减去 2185 | 提取 2186 | 又称 2187 | 高于 2188 | 低于 2189 | 未能 2190 | 往来 2191 | 泛指 2192 | 有利于 2193 | 依赖 2194 | 中的 2195 | 分为 2196 | 安排 2197 | 分成 2198 | 作业 2199 | 书目 2200 | 目录 2201 | 小结 2202 | 意义 2203 | 影响 2204 | 另外 2205 | 条件 2206 | 项目 2207 | 内容 2208 | 大小 2209 | 方向 2210 | 讨论 2211 | 结果 2212 | 特点 2213 | 方法 2214 | 材料 2215 | 特征 2216 | 原则 2217 | 原因 2218 | 定义 2219 | 含义 2220 | 涵义 2221 | 概念 2222 | 概况 2223 | 构成 2224 | 类型 2225 | 计算 2226 | 分类 2227 | 其他 2228 | 其它 
2229 | 问题 2230 | 绪论 2231 | 结论 2232 | 导论 2233 | 导语 2234 | 总论 2235 | 总结 2236 | 总则 2237 | 总之 2238 | 小结 2239 | 前言 2240 | 引言 2241 | 序言 2242 | 附注 2243 | 附录 2244 | 附则 2245 | 后记 2246 | 注意 2247 | 概要 2248 | 概述 2249 | 简介 2250 | 建议 2251 | 并列 2252 | 描述 2253 | 结语 2254 | 结束语 2255 | 总结与展望 2256 | 展望未来 2257 | 结论与建议 2258 | 总要求 2259 | 课程介绍 2260 | 作品解读 2261 | 举例子 2262 | 案例分析 2263 | 本章要点 2264 | 思考及要点 2265 | 重点知识 2266 | 难点知识 2267 | 知识点整理 2268 | 主要知识点掌握程度 2269 | 课堂笔记 2270 | 关键词汇 2271 | 拓展资源 -------------------------------------------------------------------------------- /image_ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/254675123/ai-video/a26108f600dc3c72e38b3dc7c2c2d8053899909f/image_ocr/__init__.py -------------------------------------------------------------------------------- /image_ocr/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/254675123/ai-video/a26108f600dc3c72e38b3dc7c2c2d8053899909f/image_ocr/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /image_ocr/__pycache__/image_text.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/254675123/ai-video/a26108f600dc3c72e38b3dc7c2c2d8053899909f/image_ocr/__pycache__/image_text.cpython-36.pyc -------------------------------------------------------------------------------- /image_ocr/__pycache__/tencent_ocr_api.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/254675123/ai-video/a26108f600dc3c72e38b3dc7c2c2d8053899909f/image_ocr/__pycache__/tencent_ocr_api.cpython-36.pyc -------------------------------------------------------------------------------- /image_ocr/image_text.py: 
class Image2Text:
    """Recognise the text contained in the extracted frame images of a video.

    Images are read from ``<module dir>/../video_convertor/img_folder/<name>``,
    the recognised text is written to ``<module dir>/text_folder/<name>`` and
    the indexes of finished images are appended to
    ``<module dir>/processed/<name>.txt`` so that a later run can resume.
    """
    # Directory that contains this module; every working folder is resolved
    # relative to it.
    curPath = os.path.abspath(os.path.dirname(__file__))

    def _processedIndexPath(self, file_name):
        """Return the path of the progress file that belongs to *file_name*."""
        # Single source of truth for the path: the original built it twice and
        # the reader used './processed/' while the writer used '/processed/',
        # so previously saved progress was never found again.
        return os.path.join(self.curPath, 'processed', '{}.txt'.format(file_name))

    def loadImageList(self, directory_path):
        """Return the entry names found in *directory_path*.

        An empty list is returned when the directory does not exist. The order
        is whatever ``os.listdir`` yields.
        """
        if not os.path.isdir(directory_path):
            return []
        return [os.path.basename(item) for item in os.listdir(directory_path)]

    def saveText(self, text_filepath, text_lines):
        """Write *text_lines*, joined by single spaces, to *text_filepath* (UTF-8)."""
        with open(text_filepath, 'w', encoding='utf-8') as f_out:
            f_out.write(' '.join(text_lines))

    def loadHasProcessedImageIndex(self, file_name):
        """Return the image indexes already processed for *file_name*.

        The progress file holds one index per line. A missing file means that
        nothing has been processed yet and yields an empty list.
        """
        try:
            with open(self._processedIndexPath(file_name), 'r', encoding='utf-8') as f:
                return [line.strip('\n') for line in f]
        except FileNotFoundError:
            return []

    def saveProcessedImageIndex(self, filename, index):
        """Append *index* (a string) as one line to the progress file of *filename*."""
        processed_filepath = self._processedIndexPath(filename)
        # The folder is not guaranteed to exist on a first run.
        os.makedirs(os.path.dirname(processed_filepath), exist_ok=True)
        with open(processed_filepath, 'a', encoding='utf-8') as f:
            f.write(index)
            f.write('\n')

    def run(self, need_processed_video):
        """OCR every not-yet-processed image of one video.

        :param need_processed_video: indexable record whose element ``[2]`` is
            the name of the image folder of the video (the other elements are
            not used here).
        """
        file_directory = need_processed_video[2]

        # A set gives constant-time membership tests inside the loop.
        processed = set(self.loadHasProcessedImageIndex(file_directory))
        image_filepath = self.curPath + '/../video_convertor/img_folder/' + file_directory
        image_file_list = self.loadImageList(image_filepath)
        text_filepath_dir = self.curPath + '/text_folder/{}'.format(file_directory)
        FilePath.mkdir(text_filepath_dir)

        length_file = len(image_file_list)
        for index, image_file in enumerate(image_file_list, start=1):
            # The part of the file name before the first dot identifies the image.
            image_index = image_file.split('.')[0]
            if image_index in processed:
                continue

            text_lines = tencent_ocr_api.invoke_api_file('{}/{}'.format(image_filepath, image_file))
            if len(text_lines) == 0:
                # Nothing recognised (or the request failed): the image is not
                # marked as processed, so a later run tries it again.
                continue

            # One text file per image, named after the image index.
            text_filepath = '{}/{}.txt'.format(text_filepath_dir, image_index)
            self.saveText(text_filepath, text_lines)
            print('已处理{}/{}'.format(index, length_file))
            self.saveProcessedImageIndex(file_directory, image_index)
def invoke_api_file(image_filepath, timeout=30):
    """Send one image file to the OCR endpoint and return the recognised lines.

    :param image_filepath: path of the image to recognise.
    :param timeout: seconds to wait for the HTTP request before giving up.
    :return: list with the ``itemstring`` of every entry of
        ``data.item_list`` in the JSON response; an empty list when the
        request fails, the status code is not 200, or the response does not
        carry that structure.
    """
    with open(image_filepath, 'rb') as fin:
        base64_data = base64.b64encode(fin.read())

    params = get_params(base64_data)
    try:
        # Without a timeout a stalled connection would block the batch forever.
        r = requests.post(url, data=params, timeout=timeout)
    except requests.RequestException as exc:
        # Treated like a failed recognition so that the caller can carry on
        # with the next image instead of aborting.
        print('OCR request failed: {}'.format(exc))
        return []

    print(r.status_code)
    if r.status_code != 200:
        return []

    try:
        payload = r.json()
    except ValueError:
        # The body was not valid JSON.
        return []

    # Error responses may lack 'data' or 'item_list'; do not raise on them.
    data = payload.get('data') if isinstance(payload, dict) else None
    item_list = data.get('item_list') if isinstance(data, dict) else None
    return [s['itemstring'] for s in item_list or []]
# Brightness, colour, contrast and sharpness enhancement of one image.


def _enhance_show_save(enhancer_cls, source, factor, out_path):
    """Enhance *source* by *factor*, display the result and save it to *out_path*.

    Returns the enhancer object together with the enhanced image.
    """
    enhancer = enhancer_cls(source)
    enhanced = enhancer.enhance(factor)
    enhanced.show()
    enhanced.save(out_path)
    return enhancer, enhanced


# Original image
image = Image.open('deal_with/A1.jpg')
image.show()

# Brightness enhancement
brightness = 1.5
enh_bri, image_brightened = _enhance_show_save(
    ImageEnhance.Brightness, image, brightness, "image_brightened.jpg")

# Colour enhancement
color = 1.5
enh_col, image_colored = _enhance_show_save(
    ImageEnhance.Color, image, color, "image_colored.jpg")

# Contrast enhancement
contrast = 1.5
enh_con, image_contrasted = _enhance_show_save(
    ImageEnhance.Contrast, image, contrast, "image_contrasted.jpg")

# Sharpness enhancement
sharpness = 3.0
enh_sha, image_sharped = _enhance_show_save(
    ImageEnhance.Sharpness, image, sharpness, "image_sharped.jpg")
def difference(hist1, hist2):
    """Return the mean per-position similarity of two value sequences.

    Equal values score 1; different values score
    ``1 - |a - b| / max(a, b)``. The result is the average over the
    positions of *hist1*.

    :param hist1: sequence of non-negative numbers (e.g. a histogram).
    :param hist2: sequence with at least as many entries as *hist1*;
        surplus entries are ignored.
    :return: similarity as a float; ``0.0`` for an empty *hist1*.
    :raises ValueError: if *hist2* is shorter than *hist1*.
    """
    if not hist1:
        # Nothing to compare; avoids dividing by zero below.
        return 0.0
    if len(hist2) < len(hist1):
        raise ValueError('hist2 must not be shorter than hist1')

    total = 0
    for a, b in zip(hist1, hist2):
        if a == b:
            total += 1
        else:
            total += 1 - float(abs(a - b)) / max(a, b)
    return total / len(hist1)


def similary_calculate(path1, path2, mode):
    """Return the similarity of the images stored at *path1* and *path2*.

    :param mode: ``1`` compares the histograms of the images scaled to
        256x256 RGB; ``2`` averages the histogram similarity of the 16 tiles
        of a 4x4 grid over those images; ``3`` compares the pixels of the
        images scaled to 8x8 and converted to 1-bit. Any other value
        yields ``0``.
    """
    if mode == 3:
        # When a video frame is already in memory it could be resized
        # directly instead, e.g. img = cv2.resize(frame, (8, 8)).
        with Image.open(path1) as src1, Image.open(path2) as src2:
            bits1 = list(src1.resize((8, 8)).convert('1').getdata())
            bits2 = list(src2.resize((8, 8)).convert('1').getdata())
        return difference(bits1, bits2)

    # Pre-processing: bring both images to the same size and colour mode.
    # resize() returns an independent image, so the files can be closed.
    with Image.open(path1) as src1, Image.open(path2) as src2:
        img1 = src1.resize((256, 256)).convert('RGB')
        img2 = src2.resize((256, 256)).convert('RGB')

    if mode == 1:
        return difference(img1.histogram(), img2.histogram())

    if mode == 2:
        tile = 64
        total = 0
        for i in range(4):
            for j in range(4):
                # The right/lower edge of a crop box is exclusive, so the box
                # has to end at (i + 1) * 64; ending at i * 64 + 63 cut every
                # tile down to 63x63 and skipped one row and one column.
                box = (i * tile, j * tile, (i + 1) * tile, (j + 1) * tile)
                total += difference(img1.crop(box).histogram(),
                                    img2.crop(box).histogram())
        return total / 16

    return 0
def avhash(im):
    """Compute the average-hash fingerprint of an image.

    The image is scaled to 80x80, converted to greyscale, and every pixel is
    compared with the mean grey level.

    :param im: a ``PIL.Image.Image`` instance, or anything ``Image.open``
        accepts (for example a file path).
    :return: an int whose bit ``i`` is 1 when pixel ``i`` (in ``getdata()``
        order) is greater than or equal to the mean, else 0.
    """
    side = 80
    if isinstance(im, Image.Image):
        small = im.resize((side, side), Image.LANCZOS).convert('L')
    else:
        # Close the file we opened ourselves once the thumbnail exists.
        with Image.open(im) as source:
            # Image.LANCZOS is the same filter as the removed ANTIALIAS alias.
            small = source.resize((side, side), Image.LANCZOS).convert('L')

    pixels = list(small.getdata())
    # Derive the divisor from the data instead of hard-coding 80 * 80.
    avg = sum(pixels) / float(len(pixels))
    print('avg:{}'.format(avg))

    # The previous reduce() used a three-argument lambda, which reduce() can
    # never call (it always passes two arguments), so hashing always failed.
    fingerprint = 0
    for position, value in enumerate(pixels):
        if value >= avg:
            fingerprint |= 1 << position
    return fingerprint
def loadKnowledge(self, kwg_filepath):
    """Load knowledge entries from a UTF-8 text file.

    The first line of the file is treated as a header and skipped.  Every
    other line is split on single spaces; the first field is the knowledge
    code and the second the knowledge name.  Lines with fewer than two
    fields are ignored.

    Side effect: ``self.knowledge_code_dict[code]`` is set to the name of
    every loaded entry.

    :param kwg_filepath: path of the knowledge file.
    :return: list of ``(code, name, words)`` tuples where ``words`` is the
        result of ``self.splitor.split1list(name)``.
    """
    # The context manager closes the file even if decoding fails.
    with open(kwg_filepath, 'rb') as f:
        text = f.read().decode('utf-8', 'ignore')

    # Accept both CRLF and LF line endings.  Splitting on '\r\n' only turned
    # an LF-terminated file into one single "header" line, so nothing loaded.
    lines = text.replace('\r\n', '\n').split('\n')

    knowledge_list = []
    for line in lines[1:]:
        fields = line.split(' ')
        if len(fields) < 2:
            continue
        code, word = fields[0], fields[1]
        # Duplicated lines are kept on purpose: the old duplicate check
        # compared a string with a list of tuples and never matched, and the
        # position of each entry is used later as its index.
        knowledge_list.append((code, word, self.splitor.split1list(word)))
        self.knowledge_code_dict[code] = word
    return knowledge_list
self.splitor.split1list(item_content) 87 | sims = self.docvector.test_doc2vec(content_words) 88 | 89 | print('{}的对应的知识点如下：'.format(item_content)) 90 | sim_kwg_list = [] 91 | for sim in sims: 92 | kwg = self.docvector.index_catalog.get(int(sim[0])) 93 | kwg_tuple = (kwg[0],kwg[1],kwg[2],sim[1]) 94 | sim_kwg_list.append(kwg_tuple) 95 | print('{} {}, {}'.format(kwg[0], kwg[1], sim[1])) 96 | 97 | self.time_kwg_dict[item_time] = sim_kwg_list 98 | 99 | def statistics(self): 100 | img_index_kwg_list = [] 101 | for key, value in self.time_kwg_dict.items(): 102 | img_index_kwg_list.append((int(key), value)) 103 | 104 | # 按帧的顺序排序 105 | # 由tuple组成的List排序 106 | # students = [('john', 'A', 15), ('jane', 'B', 12), ('dave', 'B', 10),] 107 | # 用key函数排序：返回由tuple组成的list 108 | # sorted(students, key=lambda student : student[2]) # sort by age 109 | # [('dave', 'B', 10), ('jane', 'B', 12), ('john', 'A', 15)] 110 | # 用cmp函数排序 111 | # sorted(students, cmp=lambda x,y : cmp(x[2], y[2])) # sort by age 112 | # [('dave', 'B', 10), ('jane', 'B', 12), ('john', 'A', 15)] 113 | 114 | # 这里的kwg是(index, (code, name, words)) 115 | sort_list = sorted(img_index_kwg_list, key=lambda x:x[0]) 116 | # 对排序后的帧，分析段 117 | # 分组的原则是相似度大于80%的作为分组的开始，如果连续几个都大于80%，都作为一组 118 | # 如果80%后，开始小于80%，也同样归属于当前组； 119 | # 如果再次出现80%，前面的一定是小于80%的或者是刚开始，才作为分组的开始 120 | # 分组的信息，只需要记录开始的帧序号 121 | section_index_dict = {} 122 | pre_similarity = 0.0 123 | current_section_index = 1 124 | has_exist_kwg_point = False 125 | for frame_tuple in sort_list: 126 | frame_index = frame_tuple[0] 127 | k_code = frame_tuple[1][0][0] 128 | k_code_part = self.getPartCode(k_code) 129 | cur_similarity = frame_tuple[1][0][3] 130 | if cur_similarity >= 0.8 and pre_similarity < 0.8: 131 | has_exist_kwg_point = True 132 | current_section_index = frame_index 133 | self.setFrameIndexSectionDict(section_index_dict, current_section_index, k_code_part) 134 | else: 135 | if has_exist_kwg_point: 136 | self.setFrameIndexSectionDict(section_index_dict, 
def getPartCode(self, code, sec=4):
    """Return the leading part of a dot-separated knowledge code.

    :param code: dot-separated code string, or None.
    :param sec: number of leading segments to keep.
    :return: the first ``sec`` segments joined with '.', or ``code``
        unchanged when it is None.
    """
    if code is not None:
        return '.'.join(code.split('.')[:sec])
    return code
def saveText(self, filepath, index_kwg_list):
    """Write one space-separated line per result tuple to a UTF-8 file.

    :param filepath: output file path; an existing file is overwritten.
    :param index_kwg_list: iterable of sequences with at least five items;
        items 0-4 of each are written on one line, separated by spaces.
    """
    # The context manager closes the file even when a record is malformed
    # or a write fails, which the explicit close() could not guarantee.
    with open(filepath, 'w', encoding='utf-8') as f_out:
        for record in index_kwg_list:
            f_out.write('{} {} {} {} {}'.format(
                record[0], record[1], record[2], record[3], record[4]))
            f_out.write('\n')
def loadKnowledge(self, kwg_filepath):
    """Load knowledge entries from a UTF-8 text file.

    The first line of the file is treated as a header and skipped.  Every
    other line is split on single spaces; the first field is the knowledge
    code and the second the knowledge name.  Lines with fewer than two
    fields are ignored.

    Side effect: ``self.knowledge_code_dict[code]`` is set to the name of
    every loaded entry.

    :param kwg_filepath: path of the knowledge file.
    :return: list of ``(code, name, words)`` tuples where ``words`` is the
        result of ``self.splitor.split1list(name)``.
    """
    # The context manager closes the file even if decoding fails.
    with open(kwg_filepath, 'rb') as f:
        text = f.read().decode('utf-8', 'ignore')

    # Accept both CRLF and LF line endings.  Splitting on '\r\n' only turned
    # an LF-terminated file into one single "header" line, so nothing loaded.
    lines = text.replace('\r\n', '\n').split('\n')

    knowledge_list = []
    for line in lines[1:]:
        fields = line.split(' ')
        if len(fields) < 2:
            continue
        code, word = fields[0], fields[1]
        # Duplicated lines are kept on purpose: the old duplicate check
        # compared a string with a list of tuples and never matched, and the
        # position of each entry is used later as its index.
        knowledge_list.append((code, word, self.splitor.split1list(word)))
        self.knowledge_code_dict[code] = word
    return knowledge_list
def statistics(self):
    """Group frames into sections based on their best knowledge match.

    Frames from ``self.frame_index_kwg_dict`` are processed in ascending
    frame-index order.  Grouping rule:

    * a frame whose best similarity is >= 0.8 while the previously accepted
      frame was < 0.8 (or there was none) starts a new section;
    * every later frame, whether above or below 0.8, belongs to the current
      section until the next section starts;
    * frames seen before the first section starts are ignored.

    :return: dict mapping the frame index that starts a section to the list
        of (shortened) knowledge codes of the frames in that section.
    """
    ordered_frames = sorted(
        ((int(key), value) for key, value in self.frame_index_kwg_dict.items()),
        key=lambda item: item[0])

    section_index_dict = {}
    pre_similarity = 0.0
    current_section_index = 1
    has_exist_kwg_point = False
    for frame_index, kwg_candidates in ordered_frames:
        if not kwg_candidates:
            # No match was recorded for this frame; nothing to classify.
            continue
        # Each candidate is (code, name, words, similarity); only the best
        # (first) one is used.
        best = kwg_candidates[0]
        k_code_part = self.getPartCode(best[0])
        cur_similarity = best[3]
        if cur_similarity >= 0.8 and pre_similarity < 0.8:
            # Remember that a section exists.  Without this flag every
            # follow-up frame was skipped and pre_similarity was never
            # updated again, so no second section could ever start.
            has_exist_kwg_point = True
            current_section_index = frame_index
        elif not has_exist_kwg_point:
            continue
        self.setFrameIndexSectionDict(section_index_dict, current_section_index, k_code_part)
        pre_similarity = cur_similarity
    print('统计分段完成')
    return section_index_dict
def frame2Time(self, frame_count, fps=25):
    """Convert a video frame index into an ``HH:MM:SS`` time string.

    The frame index is divided by the frame rate and rounded up to whole
    seconds; anything up to one second is reported as ``00:00:01``.

    :param frame_count: frame index (int, or a value ``int()`` accepts).
    :param fps: frames per second of the video; defaults to 25, the rate
        that was previously hard-coded.
    :return: zero-padded ``'HH:MM:SS'`` string.
    :raises ValueError: if ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError('fps must be positive')
    total_seconds = max(1, math.ceil(int(frame_count) / fps))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return '{:0>2d}:{:0>2d}:{:0>2d}'.format(hours, minutes, seconds)
def getMoreKwgCode(self, kwg_code_list):
    """Return the most frequent knowledge code of a section.

    :param kwg_code_list: non-empty list of knowledge codes.
    :return: ``(code, count)`` for the code that occurs most often; on a tie
        the code that appeared first in the list wins.
    """
    tally = {}
    for code in kwg_code_list:
        tally[code] = tally.get(code, 0) + 1

    ranked = list(tally.items())
    # Stable sort: equally frequent codes keep their first-seen order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[0]
def train(self, sentences_words, course_base_code):
    """Load the Doc2Vec model of a course, or train and save a new one.

    The model file is ``<curPath>/model_folder/<course_base_code>.model``.
    If it exists it is loaded; otherwise a model is trained on
    ``sentences_words`` and saved there.  In both cases
    ``self.index_catalog`` maps the position of each entry (also used as
    its document tag) to the entry itself.

    :param sentences_words: iterable of tuples whose item 2 is the word
        list of the sentence (only item 2 is read here).
    :param course_base_code: course identifier used as the model file name.
    """
    model_file = u'{}/model_folder/{}.model'.format(curPath, course_base_code)
    if FilePath.fileExist(model_file):
        # A model already exists, so it only needs to be loaded.
        print('model has exist, loading')
        self.model = Doc2Vec.load(model_file)
        print('has been loaded.')

        # Map each tag index to its entry.
        for count, words_tuple in enumerate(sentences_words):
            self.index_catalog[count] = words_tuple
        return

    # Make sure the target folder exists before the long training run, so
    # the final save() cannot fail on a missing directory.
    os.makedirs(os.path.dirname(model_file), exist_ok=True)

    # Build the training corpus; the running index is the tag of each
    # sentence, so tags and sentences correspond one to one.
    documents = []
    for count, words_tuple in enumerate(sentences_words):
        words = words_tuple[2]
        self.index_catalog[count] = words_tuple
        documents.append(gensim.models.doc2vec.TaggedDocument(words, [str(count)]))
        if (count + 1) % 100 == 0:
            print('{} has loaded...'.format(count + 1))

    # Train the model.
    print('start instance doc2vec')
    self.model = Doc2Vec(dm=1, vector_size=200, window=8, min_count=1, workers=4, epochs=2000)
    print('start build vocab')
    self.model.build_vocab(documents)
    print('start training')
    self.model.train(documents, total_examples=self.model.corpus_count, epochs=self.model.epochs)
    # Save the model.
    print('save model')
    self.model.save(model_file)
== "__main__": 100 | tv = TextVector() 101 | tv.train() 102 | tv.test_doc2vec() 103 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/254675123/ai-video/a26108f600dc3c72e38b3dc7c2c2d8053899909f/tools/__init__.py -------------------------------------------------------------------------------- /tools/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/254675123/ai-video/a26108f600dc3c72e38b3dc7c2c2d8053899909f/tools/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /tools/excel_xls/ExcelReader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # coding=utf-8 3 | """ 4 | create_author : zhangcl 5 | create_time : 2018-07-06 6 | program : *_* read excel data *_* 7 | 仅用来读取excel文件的内容 8 | 参数可以指定sheet的范围，默认是全部，指定的时候用序号的数组，如: [0,1,2] 是指读取第1，2，3个sheet 9 | 参数可以指定列的范围，可以使用名字，也可以用列的序号，都是以数组形式 10 | """ 11 | import hashlib 12 | import sys 13 | 14 | import xlrd 15 | 16 | 17 | from tools.file_util import FilePath 18 | # import xlwt 19 | # from datetime import date,datetime 20 | 21 | class ExcelReader: 22 | """ 23 | 该excel里面. 
24 | """ 25 | 26 | def __init__(self): 27 | """ 28 | initialize data 29 | """ 30 | # excel 文件的路径 31 | self.filepath = None 32 | # sheet的读取范围，默认为全部, 可以按序号，也可以按sheet的名称 33 | self.sheet_scope_indexes = None 34 | self.sheet_scope_names = None 35 | 36 | # 单个sheet中，列的范围，默认为全部 37 | self.column_scope_names = None 38 | self.column_scope_indexes = None 39 | 40 | # 是否获取第一行，第一行一般为数据头 41 | self.start_row_index = 1 42 | 43 | 44 | 45 | def readFile(self, filepath=None): 46 | """ 47 | read the excel data 48 | python操作excel主要用到xlrd和xlwt这两个库，即xlrd是读excel，xlwt是写excel的库。 49 | 可从这里下载https://pypi.python.org/pypi。下面分别记录python读和写excel. 50 | :param filepath:the excel full path 51 | :return: ture if read file ok, false otherwise 52 | """ 53 | result_list = [] 54 | 55 | # 如果filepath 是空的话，就先看看self.filepath 是否为空 56 | if filepath is None and self.filepath is None: 57 | print('请设置读取的文件名称.') 58 | return result_list 59 | 60 | if filepath is None: 61 | filepath = self.filepath 62 | 63 | # 检查文件是否存在 64 | if not FilePath.fileExist(filepath): 65 | return result_list 66 | 67 | # 打开文件 68 | workbook = xlrd.open_workbook(filepath) 69 | # 获取所有sheet 70 | #print workbook.sheet_names() # [u'sheet1', u'sheet2'] 71 | #sheet2_name = workbook.sheet_names()[1] 72 | local_sheet_scope_indexes = self.getSheetScope(workbook) 73 | 74 | 75 | totalcount = 0 76 | for index in local_sheet_scope_indexes: 77 | sheet = workbook.sheet_by_index(index) 78 | rowindex = self.start_row_index 79 | local_sheet_columns_indexes = self.getSheetColumnScope(sheet) 80 | while rowindex < sheet.nrows: 81 | row = sheet.row_values(rowindex) 82 | rowindex = rowindex + 1 83 | try: 84 | one_row = self.addOneRow(row, local_sheet_columns_indexes) 85 | result_list.append(one_row) 86 | totalcount = totalcount + 1 87 | if totalcount % 100 == 0: 88 | print('已经读取：{0}行'.format(totalcount)) 89 | except Exception: 90 | print('数据异常行数:' + str(rowindex)) 91 | #print('读取数据异常：' + ) 92 | 93 | 94 | print('共读取：{0}行'.format(totalcount)) 95 | 96 | return result_list 
def getSheetColumnScope(self, sheet):
    """Return the column indexes of ``sheet`` that should be read.

    Selection by name (``self.column_scope_names``) takes precedence over
    selection by index (``self.column_scope_indexes``).  When neither is set,
    or the sheet has no rows, all columns are selected.

    Side effect: when selecting by name, ``self.column_scope_names[name]`` is
    set to the position of that column within the result.
    NOTE(review): this item assignment needs a mapping; a plain list of names
    would raise TypeError here - confirm the intended type.

    :param sheet: object exposing ``ncols``, ``nrows`` and ``row_values(0)``.
    :return: ``range(ncols)`` when everything is selected, otherwise a list
        of column indexes in header order.
    """
    names = self.column_scope_names
    indexes = self.column_scope_indexes
    by_name = bool(names) and len(names) > 0
    by_index = (not by_name) and bool(indexes) and len(indexes) > 0

    total_columns = sheet.ncols
    if sheet.nrows == 0 or not (by_name or by_index):
        return range(total_columns)

    selected = []
    next_position = 0
    # The first row holds the column headers.
    for position, header in enumerate(sheet.row_values(0)):
        if by_name and header in names:
            names[header] = next_position
            next_position += 1
            selected.append(position)
        elif by_index and position in indexes:
            selected.append(position)
    return selected
len(self.sheet_scope_names) > 0: 167 | use_name = True 168 | elif self.sheet_scope_indexes and len(self.sheet_scope_indexes) > 0: 169 | use_index = True 170 | else: 171 | pass 172 | 173 | sheetlength = workbook.sheets().__len__() 174 | if use_index == False and use_name == False: 175 | result_scope = range(sheetlength) 176 | return result_scope 177 | 178 | index = 0 179 | result_scope = [] 180 | while index < sheetlength: 181 | sheet = workbook.sheet_by_index(index) 182 | if use_name and self.sheet_scope_names.__contains__(sheet.name): 183 | result_scope.append(index) 184 | elif use_index and self.sheet_scope_indexes.__contains__(index): 185 | result_scope.append(index) 186 | else: 187 | pass 188 | index = index + 1 189 | return result_scope 190 | 191 | 192 | def addOneRow(self, row, column_scope): 193 | # 定义rows_list 194 | row_item_list = [] 195 | for column_index in column_scope: 196 | row_item_list.append(row[column_index]) 197 | return row_item_list 198 | 199 | 200 | # def isChinese(self, ch): 201 | # res = False 202 | # s_unicode = UnicodeConvertor.stringToUnicode(ch) 203 | # if s_unicode >= u'\\u4e00' and s_unicode <= u'\\u9fa5': 204 | # res = True 205 | # return res 206 | def getMd5(self,text): 207 | 208 | md5 = hashlib.md5(text.encode('utf-8')).hexdigest() 209 | 210 | return md5 211 | 212 | 213 | 214 | if __name__ == '__main__': 215 | #read_excel() 216 | er = ExcelReader() 217 | er.readFile(u'D:/福师《比较教育学》-李婷婷建设、郭丽娜审核.xlsx') 218 | 219 | -------------------------------------------------------------------------------- /tools/excel_xls/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/254675123/ai-video/a26108f600dc3c72e38b3dc7c2c2d8053899909f/tools/excel_xls/__init__.py -------------------------------------------------------------------------------- /tools/excel_xls/__pycache__/ExcelReader.cpython-36.pyc: 
# ---- /tools/file_util/FilePath.py (relies on the module-level ``import os``) ----

def mkdir(path):
    """
    Create directory ``path``, including missing parents, if it is absent.

    :param path: directory path; surrounding whitespace and trailing
                 backslashes are stripped before use
    :return: True when the directory was created, False when the path
             already existed
    """
    path = path.strip()
    path = path.rstrip("\\")

    if os.path.exists(path):
        return False

    os.makedirs(path)
    return True


def fileExist(filepath):
    """
    Tell whether ``filepath`` names an existing regular file.

    ``os.path.exists`` alone would also accept a directory of the same name,
    so the check is done with ``os.path.isfile``. Readability or writability
    is not checked here; ``os.access`` with ``os.R_OK`` / ``os.W_OK`` /
    ``os.X_OK`` can be used for that.

    :param filepath: path to test
    :return: True for an existing regular file, False otherwise
    """
    return os.path.isfile(filepath)


def del_file(path):
    """
    Delete every file and sub-directory found directly inside ``path``.

    ``path`` itself is kept. Sub-directories are removed together with their
    content. ``os.removedirs`` is deliberately not used: it fails on
    non-empty directories and also prunes empty parent directories, which
    could remove ``path`` itself.

    :param path: directory whose content is removed
    :return: None
    """
    import shutil  # local import: only this function needs it

    for name in os.listdir(path):
        entry = os.path.join(path, name)
        if os.path.isdir(entry) and not os.path.islink(entry):
            shutil.rmtree(entry)
        else:
            # Regular files and symbolic links (including links to folders).
            os.remove(entry)


def rename(src_name, dst_name):
    """
    Rename a file or directory and print whether it worked.

    :param src_name: current name
    :param dst_name: new name
    :return: None; failures are printed, not raised
    """
    try:
        os.rename(src_name, dst_name)
    except Exception as e:
        print(e)
        print('rename dir fail\r\n')
    else:
        print('rename dir success\r\n')


# ---- /tools/file_util/JsonParser.py (relies on the module-level ``import json``) ----

class JsonParser:
    """
    Parser of a request parameter given as a JSON string.
    """

    def __init__(self):
        """Initialise the parsed data, the raw input and the status dict."""
        self.jsondata = None
        self.querystring = None
        self.result = {}

    def parseJson(self, queryparam):
        """
        Parse ``queryparam`` into ``self.jsondata``.

        ``self.result`` receives ``code`` 200 and ``message`` 'sucess' on
        success (the spelling is part of the runtime value and is kept), or
        ``code`` 500, ``message`` 'fail' and an empty ``data`` on failure.

        :param queryparam: JSON text
        :return: True when parsing succeeded, False otherwise
        """
        self.querystring = queryparam
        flag = True
        try:
            self.jsondata = json.loads(queryparam)
            self.result['code'] = 200
            self.result['message'] = 'sucess'
        except Exception as err:
            # Best effort by design: any parsing problem is reported through
            # the return value and ``self.result`` instead of being raised.
            flag = False
            print(err)
            self.result['code'] = 500
            self.result['message'] = 'fail'
            self.result['data'] = ''
        return flag
# ---- /video_convertor/video_image_convertor_1.py ----
# (relies on module-level ``import glob as gb`` and ``import cv2``)
# Not recommended: the generated video is large and poorly compressed;
# ffmpeg is the suggested alternative.

def images_video():
    """
    Assemble the ``*.jpg`` files of the hard-coded G: temp_picture folder
    into ``test.mp4`` (MJPG fourcc, 25 fps, 640x480).

    Files that OpenCV cannot read are skipped, and the writer is always
    released so that the output file gets finalised.
    """
    img_path = gb.glob("G:\\temp_picture\\*.jpg")
    videoWriter = cv2.VideoWriter('test.mp4', cv2.VideoWriter_fourcc(*'MJPG'), 25, (640, 480))
    try:
        for path in img_path:
            img = cv2.imread(path)
            if img is None:
                # imread reports an unreadable file by returning None.
                continue
            videoWriter.write(cv2.resize(img, (640, 480)))
    finally:
        videoWriter.release()


def camera_video_img():
    """
    Show the frames of capture device 0 in a window until 'q' is pressed
    or a frame can no longer be read.
    """
    cap = cv2.VideoCapture(0)
    try:
        while cv2.waitKey(30) != ord('q'):
            retval, image = cap.read()
            if not retval:
                break
            cv2.imshow("video", image)
    finally:
        cap.release()


# ---- /video_convertor/video_image_convertor_2.py ----
# (relies on module-level ``import os``, ``import cv2``, ``from PIL import Image``)

def video2jpg(sp):
    """
    Open the video at ``sp`` and print the shape of its first frames.

    Reading stops after two frames or at the first failed read. Writing the
    frames to disk is currently disabled.

    :param sp: path of the video
    """
    cap = cv2.VideoCapture(sp)
    suc = cap.isOpened()  # whether the video could be opened
    frame_count = 0
    try:
        while suc:
            suc, frame = cap.read()
            if not suc:
                # No frame was delivered, so there is nothing to inspect.
                break
            frame_count += 1
            print(frame.shape)
            if frame_count == 2:
                break
            # Disabled: cv2.imwrite('mv\\%d.jpg' % frame_count, frame, [2])
    finally:
        cap.release()
    print('unlock image: ', frame_count)


def jpg2video(sp, fps):
    """
    Build a video from the numbered pictures ``mv/1.jpg``, ``mv/2.jpg``, ...

    The frame size is taken from the first entry listed in ``mv`` and as many
    numbered pictures are read as the folder has entries. The working
    directory of the process is left untouched.

    :param sp: path of the video to write
    :param fps: frame rate
    """
    fourcc = cv2.VideoWriter_fourcc(*"MJPG")
    images = os.listdir('mv')
    if not images:
        print(sp, 'no images found in mv')
        return

    with Image.open(os.path.join('mv', images[0])) as im:
        frame_size = im.size
    vw = cv2.VideoWriter(sp, fourcc, fps, frame_size)
    try:
        for number in range(1, len(images) + 1):
            jpgfile = os.path.join('mv', str(number) + '.jpg')
            try:
                frame = cv2.imread(jpgfile)
                vw.write(frame)
            except Exception as exc:
                # Best effort: a picture that cannot be written is reported
                # and skipped.
                print(jpgfile, exc)
    finally:
        vw.release()
    print(sp, 'Synthetic success!')


if __name__ == '__main__':
    sp = "./../data/0547619.mp4"
    sp_new = './../data/0547619_new.mp4'
    video2jpg(sp)  # video -> pictures
    jpg2video(sp_new, 28)  # pictures -> video


# ---- /video_convertor/video_image_convertor_open.py ----
# (relies on module-level ``os``, ``cv2``, ``PIL.Image``,
#  ``image_processor.image_similarity_fundimental``,
#  ``tools.file_util.FilePath`` and ``time``)

class Video2Image:
    """
    Convert a video into pictures.

    Consecutive frames of a video are mostly identical, so frames are
    compared with the last saved one and only frames that differ enough are
    written to disk.
    """
    # Directory that contains this source file.
    curPath = os.path.abspath(os.path.dirname(__file__))

    def video2jpg(self, sp):
        """
        Yield ``(frame, frame_count)`` for the video at ``sp``.

        The result of a failed read is yielded as well before the loop ends;
        ``run`` stops when ``frameToImage`` turns such a frame into None.
        The capture is released even when the consumer stops iterating early.

        :param sp: path of the video
        """
        cap = cv2.VideoCapture(sp)
        suc = cap.isOpened()  # whether the video could be opened
        frame_count = 0
        try:
            while suc:
                frame_count += 1
                suc, frame = cap.read()
                yield frame, frame_count
        finally:
            cap.release()
        print('unlock image: ', frame_count)

    def frameToImage(self, frame):
        """Convert a frame array into a PIL image; None stays None."""
        if frame is None:
            return None
        return Image.fromarray(frame)

    def saveFrame(self, frame, frame_count, video_to_image_file_path):
        """Write ``frame`` to ``<video_to_image_file_path>/<frame_count>.jpg``."""
        # NOTE(review): a one-element parameter list is passed to imwrite as
        # in the original code - confirm it is what OpenCV expects.
        params = [2]
        cv2.imwrite(video_to_image_file_path + '/%d.jpg' % frame_count, frame, params)

    def loadImage(self, count, video_to_image_file_path):
        """Load picture number ``count`` as an 80x80 image in mode '1'."""
        path = video_to_image_file_path + '/%d.jpg' % count
        img = Image.open(path).resize((80, 80)).convert('1')
        return img

    def similary_calculate(self, img1, img2):
        """Compare the raw pixel data of two images with ``difference``."""
        hist1 = list(img1.getdata())
        hist2 = list(img2.getdata())
        return image_similarity_fundimental.difference(hist1, hist2)

    def similarity_mode_3(self, image1, image2):
        """
        Mode 3 (labelled "perceptual hash" in the original notes): compare
        the pixel data of both images after resizing to 128x128 and
        converting to mode '1'.

        :param image1: first PIL image
        :param image2: second PIL image
        :return: value returned by ``image_similarity_fundimental.difference``
        """
        img1 = image1.resize((128, 128)).convert('1')
        img2 = image2.resize((128, 128)).convert('1')
        hist1 = list(img1.getdata())
        hist2 = list(img2.getdata())
        return image_similarity_fundimental.difference(hist1, hist2)

    def similarity_mode_2(self, image1, image2):
        """
        Mode 2: compare the histograms of both images after resizing to
        256x256 and converting to RGB.

        :param image1: first PIL image
        :param image2: second PIL image
        :return: value returned by ``image_similarity_fundimental.difference``
        """
        img1 = image1.resize((256, 256)).convert('RGB')
        img2 = image2.resize((256, 256)).convert('RGB')
        return image_similarity_fundimental.difference(img1.histogram(), img2.histogram())

    def similarity_mode_1(self, image1, image2):
        """
        Mode 1: split both 256x256 RGB images into a 4x4 grid, compare the
        histograms tile by tile and return the mean of the 16 results.

        :param image1: first PIL image
        :param image2: second PIL image
        :return: mean of the per-tile ``difference`` values
        """
        total = 0
        img1 = image1.resize((256, 256)).convert('RGB')
        img2 = image2.resize((256, 256)).convert('RGB')
        for i in range(4):
            for j in range(4):
                # NOTE(review): the box spans 63 pixels per side although the
                # tiles are 64 apart - confirm whether +64 was intended.
                box = (i * 64, j * 64, i * 64 + 63, j * 64 + 63)
                hist1 = img1.crop(box).copy().histogram()
                hist2 = img2.crop(box).copy().histogram()
                total += image_similarity_fundimental.difference(hist1, hist2)
        return total / 16

    def similary_calculate_multiple(self, image1, image2):
        """Return the largest result of the three modes, never below 0."""
        return max(
            0,
            self.similarity_mode_1(image1, image2),
            self.similarity_mode_2(image1, image2),
            self.similarity_mode_3(image1, image2),
        )

    def run(self, need_processed_video):
        """
        Extract the distinct frames of one video into a picture folder.

        The video is read from ``<curPath>/../data/course_video/<item 1>`` and
        pictures go to ``<curPath>/img_folder/<item 2>``, whose previous
        content is deleted first.

        :param need_processed_video: indexable record; item 1 is the video
                                     file name, item 2 the output folder name
        :raises TypeError: when a plain string is passed, because indexing it
                           would pick single characters and clear the wrong
                           folder
        """
        if isinstance(need_processed_video, str):
            raise TypeError('need_processed_video must be a sequence, not a str')

        video_file = need_processed_video[1]
        video_file_path = '{}/../data/course_video/{}'.format(self.curPath, video_file)

        # Make sure the output folder exists, then empty it.
        video_to_image_file_path = '{}/img_folder/{}'.format(self.curPath, need_processed_video[2])
        FilePath.mkdir(video_to_image_file_path)
        print('正在清理文件夹')
        FilePath.del_file(video_to_image_file_path)
        print('清理完毕，开始进行转换...')

        pre_img = None
        for frame, frame_count in self.video2jpg(video_file_path):
            if frame_count % 100 == 0:
                print('已处理{}帧'.format(frame_count))
            cur_img = self.frameToImage(frame)
            if cur_img is None:
                break
            # A frame scoring above 0.95 against the last saved frame is
            # treated as a duplicate and skipped.
            if pre_img is not None and self.similary_calculate_multiple(pre_img, cur_img) > 0.95:
                continue
            pre_img = cur_img
            self.saveFrame(frame, frame_count, video_to_image_file_path)


if __name__ == '__main__':
    # NOTE(review): ``run`` needs a record whose item 1 is the video file name
    # and item 2 the output folder name; the values below replace the former
    # plain string "./../data/glx.mp4" - confirm the intended locations.
    video_record = (None, 'glx.mp4', 'glx')
    start_time = time.time()
    vv = Video2Image()
    vv.run(video_record)
    end_time = time.time()

    print('花费时间:{}秒'.format(end_time - start_time))


# ---- /video_convertor/video_to_img.py (relies on module-level ``import cv2``) ----

def video2image(video_filepath):
    """
    Save every frame of the video as a numbered JPEG under the hard-coded
    ``driveway-320x240`` folder.

    :param video_filepath: path of the video to read
    """
    vc = cv2.VideoCapture(video_filepath)
    c = 0
    try:
        rval = vc.isOpened()
        while rval:
            rval, frame = vc.read()
            if not rval:
                break
            c = c + 1
            cv2.imwrite('driveway-320x240/driveway-320x240' + str(c).zfill(8) + '.jpg', frame)
            cv2.waitKey(1)
    finally:
        vc.release()
# ---- /word_spliter/jieba_splitor.py ----
# (relies on module-level ``import jieba``, ``import re`` and
#  ``import jieba.posseg as pseg``)

class JiebaSplitor:
    """
    Sentence splitter built on jieba part-of-speech tagging.
    """

    def __init__(self, stopwords_path='./../data/stopwords.txt'):
        """
        Load the stop words, one per line, into a lookup dict.

        :param stopwords_path: UTF-8 stop-word file; both CRLF and LF line
                               endings are accepted
        """
        self.wordnetlist = []
        self.wordposlist = []
        # Stop words are kept in a dict for fast membership tests.
        self.stopwords = {}
        with open(stopwords_path, 'rb') as fstop:
            lines = fstop.read().decode('utf-8', 'ignore')
        for eachWord in lines.splitlines():
            self.stopwords[eachWord] = eachWord

    def _clean(self, sentence):
        """Strip ``sentence`` and replace digit/punctuation runs by a space."""
        line = sentence.strip()
        # NOTE(review): ``:-【`` inside the first class is a character range
        # (':' up to '【'), so ASCII letters are replaced as well - confirm
        # that this is intended.
        return re.sub(r"[0-9\s+\.\!\/_,$%^*()?;；:-【】\"\']+|[+—！，;:。？、~@#￥%…&*（）]+",
                      " ", line)

    def split1list(self, sentence):
        """
        Split ``sentence`` into a flat list of words.

        Words shorter than two characters, stop words and form words (see
        ``isFormWord``) are dropped.

        :param sentence: text to split
        :return: list of the remaining words, in order
        """
        poslist = []
        for w in pseg.cut(self._clean(sentence)):
            if len(w.word) < 2:
                continue
            if w.word in self.stopwords:
                continue
            if self.isFormWord(w):
                continue
            poslist.append(w.word)
        return poslist

    def split(self, sentence):
        """
        Split ``sentence`` and group neighbouring words that share a tag.

        :param sentence: text to split
        :return: ``self.wordnetlist`` as filled by ``process``
        """
        self.process(pseg.cut(self._clean(sentence)))
        return self.wordnetlist

    def process(self, wordList):
        """
        Rebuild ``self.wordnetlist`` from tagged words.

        Consecutive kept words with the same ``flag`` form one group; a stop
        word ends the current run of equal flags. Every kept word is also
        appended to ``self.wordposlist`` as ``'<word> <flag>'``; that list is
        not cleared between calls.

        :param wordList: iterable of objects exposing ``word`` and ``flag``
        """
        self.wordnetlist = []
        preflag = None
        poslist = []
        for w in wordList:
            if len(w.word) < 2:
                continue
            if w.word in self.stopwords:
                preflag = None
                continue
            if self.isFormWord(w):
                continue

            self.wordposlist.append(w.word + ' ' + w.flag)

            if w.flag != preflag and poslist:
                # The tag changed: close the group collected so far.
                self.wordnetlist.append(poslist)
                poslist = []
            poslist.append(w.word)
            preflag = w.flag

        if poslist:
            self.wordnetlist.append(poslist)

    def isFormWord(self, w):
        """
        Tell whether ``w`` is filtered out because of its tag.

        :param w: object exposing ``flag``
        :return: True when the flag is one of c, e, f, h, p, t, r, m or
                 contains the letter 'd'
        """
        flag = w.flag
        return flag in ('c', 'e', 'f', 'h', 'p', 't', 'r', 'm') or 'd' in str(flag)


if __name__ == "__main__":
    sr = JiebaSplitor()
    sr.split1list('外汇管制的作用')
    print('split over')