├── .gitignore ├── .readthedocs.yml ├── README.md ├── docs └── source │ ├── _static │ └── README.md │ ├── conf.py │ ├── index.rst │ ├── pyunit_newword.rst │ └── requirements.txt ├── img ├── 5.png └── weibo.png ├── pyunit_newword ├── __init__.py ├── auto.txt └── words.py ├── requirements.txt ├── setup.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | venv/* 3 | *.pyc 4 | build/* 5 | dist/* 6 | *.egg-info/* 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | # Required 5 | version: 2 6 | 7 | # Build documentation in the docs/ directory with Sphinx 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.7 22 | install: 23 | - requirements: docs/source/requirements.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # **pyUnit-NewWord** [![](https://gitee.com/tyoui/logo/raw/master/logo/photolog.png)][1] 2 | 3 | ## 无监督训练文本词库 4 | [![](https://img.shields.io/badge/Python-3.7-green.svg)](https://pypi.org/project/pyunit-newword/) 5 | 6 | ## 安装 7 | pip install pyunit-newword 8 | 9 | ## 注意事项 10 | 该算法采用Hash字典存储,大量消耗内存。100M的纯中文文本需要12G以上的内存,不然耗时太严重。 11 | 12 | ## 更新说明 13 | 新增加自动识别新词模型,无需手动设置参数 14 | 15 | ## 训练代码非模型(文本是UTF-8格式) 16 | ```python 17 | from pyunit_newword import NewWords 18 | 19 | if __name__ == '__main__': 20 | nw = NewWords(filter_cond=10, filter_free=2) 21 | nw.add_text(r'C:\Users\Administrator\Desktop\微博数据.txt') 22 | nw.analysis_data() 23 | with open('分析结果.txt', 'w', encoding='utf-8')as f: 24 | for word in nw.get_words(): 25 | print(word) 26 | f.write(word[0] + '\n') 27 | ``` 28 | 29 | ## 无监督训练新词模型 30 | ```python 31 | from pyunit_newword import NewWords 32 | 33 | if __name__ == '__main__': 34 | nw = NewWords(accuracy=0.01) 35 | nw.add_text(r'C:\Users\Administrator\Desktop\微博数据.txt') 36 | nw.analysis_data() 37 | with open('分析结果.txt', 'w', encoding='utf-8')as f: 38 | for word in nw.get_words(): 39 | print(word) 40 | f.write(word[0] + '\n') 41 | ``` 42 | 43 | ## 微博数据下载 44 | [点击下载微博数据](http://cdn.tyoui.cn/微博数据.7z) 45 | 46 | ## 爬虫的微博数据一部分截图(大概100M纯文本) 47 | ![微博数据](./img/weibo.png) 48 | 49 | ## 训练微博数据后的结果 50 | ![5个词语](./img/5.png) 51 | 52 | ### 训练后得到的词语视频 53 | [![词语视频](./img/5.png)](https://youtu.be/6PSM4dMArGo "YouTube视频") 54 | 55 | ### 算法实现来源 56 | [基于改进互信息和邻接熵的微博新词发现方法](http://xueshu.baidu.com/usercenter/paper/show?paperid=b31a76cd03eebaaa598faa3f904770b8) 57 | 58 | # TODO 59 | - [x] ~~自动寻找过滤参数~~ 60 | - [ ] 参数自动寻找最优解 61 | 62 | *** 63 | [1]: https://blog.jtyoui.com 64 | -------------------------------------------------------------------------------- /docs/source/_static/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyunits/pyunit-newword/4379dc37c3506fa4ec64c0a185df3ebbf148fff4/docs/source/_static/README.md -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath('../..')) 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'pyunit_newword' 23 | copyright = '2019, 张伟' 24 | author = '张伟' 25 | 26 | # The short X.Y version 27 | version = '' 28 | # The full version, including alpha/beta/rc tags 29 | release = '2.0' 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # If your documentation needs a minimal Sphinx version, state it here. 34 | # 35 | # needs_sphinx = '1.0' 36 | 37 | # Add any Sphinx extension module names here, as strings. They can be 38 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 39 | # ones. 40 | extensions = [ 41 | 'sphinx.ext.autodoc', 42 | 'sphinx.ext.doctest', 43 | 'sphinx.ext.intersphinx', 44 | 'sphinx.ext.todo', 45 | 'sphinx.ext.coverage', 46 | 'sphinx.ext.mathjax', 47 | 'sphinx.ext.napoleon', 48 | 'sphinx.ext.viewcode', 49 | 'recommonmark' 50 | ] 51 | 52 | # Add any paths that contain templates here, relative to this directory. 53 | templates_path = ['_templates'] 54 | 55 | # The suffix(es) of source filenames. 56 | # You can specify multiple suffix as a list of string: 57 | # 58 | # source_suffix = ['.rst', '.md'] 59 | source_suffix = '.rst' 60 | 61 | # The master toctree document. 62 | master_doc = 'index' 63 | 64 | # The language for content autogenerated by Sphinx. Refer to documentation 65 | # for a list of supported languages. 66 | # 67 | # This is also used if you do content translation via gettext catalogs. 68 | # Usually you set "language" from the command line for these cases. 69 | language = 'zh_CN' 70 | 71 | # List of patterns, relative to source directory, that match files and 72 | # directories to ignore when looking for source files. 73 | # This pattern also affects html_static_path and html_extra_path. 74 | exclude_patterns = [] 75 | 76 | # The name of the Pygments (syntax highlighting) style to use. 77 | pygments_style = None 78 | 79 | # -- Options for HTML output ------------------------------------------------- 80 | 81 | # The theme to use for HTML and HTML Help pages. See the documentation for 82 | # a list of builtin themes. 83 | # 84 | html_theme = 'sphinx_rtd_theme' 85 | 86 | # Theme options are theme-specific and customize the look and feel of a theme 87 | # further. For a list of options available for each theme, see the 88 | # documentation. 89 | # 90 | # html_theme_options = {} 91 | 92 | # Add any paths that contain custom static files (such as style sheets) here, 93 | # relative to this directory. They are copied after the builtin static files, 94 | # so a file named "default.css" will overwrite the builtin "default.css". 95 | html_static_path = ['_static'] 96 | 97 | # Custom sidebar templates, must be a dictionary that maps document names 98 | # to template names. 99 | # 100 | # The default sidebars (for documents that don't match any pattern) are 101 | # defined by theme itself. Builtin themes are using these templates by 102 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 103 | # 'searchbox.html']``. 104 | # 105 | # html_sidebars = {} 106 | 107 | 108 | # -- Options for HTMLHelp output --------------------------------------------- 109 | 110 | # Output file base name for HTML help builder. 111 | htmlhelp_basename = 'pyunit_newword-doc' 112 | 113 | # -- Options for LaTeX output ------------------------------------------------ 114 | 115 | latex_elements = { 116 | # The paper size ('letterpaper' or 'a4paper'). 117 | # 118 | # 'papersize': 'letterpaper', 119 | 120 | # The font size ('10pt', '11pt' or '12pt'). 121 | # 122 | # 'pointsize': '10pt', 123 | 124 | # Additional stuff for the LaTeX preamble. 125 | # 126 | # 'preamble': '', 127 | 128 | # Latex figure (float) alignment 129 | # 130 | # 'figure_align': 'htbp', 131 | } 132 | 133 | # Grouping the document tree into LaTeX files. List of tuples 134 | # (source start file, target name, title, 135 | # author, documentclass [howto, manual, or own class]). 136 | latex_documents = [ 137 | (master_doc, 'pyunit_newword.tex', 'pyunit_newword Documentation', 138 | '张伟', 'manual'), 139 | ] 140 | 141 | # -- Options for manual page output ------------------------------------------ 142 | 143 | # One entry per manual page. List of tuples 144 | # (source start file, name, description, authors, manual section). 145 | man_pages = [ 146 | (master_doc, 'pyunit_newword', 'pyunit_newword Documentation', 147 | [author], 1) 148 | ] 149 | 150 | # -- Options for Texinfo output ---------------------------------------------- 151 | 152 | # Grouping the document tree into Texinfo files. List of tuples 153 | # (source start file, target name, title, author, 154 | # dir menu entry, description, category) 155 | texinfo_documents = [ 156 | (master_doc, 'pyunit_newword', 'pyunit_newword Documentation', 157 | author, 'pyunit_newword', 'One line description of project.', 158 | 'Miscellaneous'), 159 | ] 160 | 161 | # -- Options for Epub output ------------------------------------------------- 162 | 163 | # Bibliographic Dublin Core info. 164 | epub_title = project 165 | 166 | # The unique identifier of the text. This can be a ISBN number 167 | # or the project homepage. 168 | # 169 | # epub_identifier = '' 170 | 171 | # A unique identification for the text. 172 | # 173 | # epub_uid = '' 174 | 175 | # A list of files that should not be packed into the epub file. 176 | epub_exclude_files = ['search.html'] 177 | 178 | # -- Extension configuration ------------------------------------------------- 179 | 180 | # -- Options for intersphinx extension --------------------------------------- 181 | 182 | # Example configuration for intersphinx: refer to the Python standard library. 183 | intersphinx_mapping = {'https://docs.python.org/': None} 184 | todo_include_todos = True 185 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. jtyoui documentation master file, created by 2 | sphinx-quickstart on Wed Dec 11 17:07:43 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 欢迎使用pyUnit-calendar文档 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | 14 | 15 | 目录 16 | ================== 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | * :ref:`search` 21 | -------------------------------------------------------------------------------- /docs/source/pyunit_newword.rst: -------------------------------------------------------------------------------- 1 | pyunit_newword package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | pyunit_newword.words module 8 | ---------------------------------- 9 | 10 | .. automodule:: pyunit_newword.words 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | 16 | Module contents 17 | --------------- 18 | 19 | .. automodule:: pyunit_newword 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | -------------------------------------------------------------------------------- /docs/source/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==1.8.* 2 | sphinx_rtd_theme==0.4.* 3 | pyunit_newword 4 | tqdm==4.42.1 5 | 6 | -------------------------------------------------------------------------------- /img/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyunits/pyunit-newword/4379dc37c3506fa4ec64c0a185df3ebbf148fff4/img/5.png -------------------------------------------------------------------------------- /img/weibo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pyunits/pyunit-newword/4379dc37c3506fa4ec64c0a185df3ebbf148fff4/img/weibo.png -------------------------------------------------------------------------------- /pyunit_newword/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/2/28 10:58 4 | # @Author: Jtyoui@qq.com 5 | from .words import NewWords 6 | 7 | __version__ = '2020.2.12' 8 | __author__ = 'Jtyoui' 9 | __description__ = '新词发现算法' 10 | __email__ = 'jtyoui@qq.com' 11 | __names__ = 'pyUnit_newWord' 12 | __url__ = 'https://github.com/PyUnit/pyunit-newword' 13 | -------------------------------------------------------------------------------- /pyunit_newword/auto.txt: -------------------------------------------------------------------------------- 1 | 52 1.51575216286174e-05 19 36 3.140404358743989 0.531077450166314 2 | 6 1.7935010695244712e-06 4 5 5.53484998743103 0.11927928289042984 3 | 60 1.793645294087577e-05 28 18 5.184359645539216 0.45250104003883174 4 | 15 4.5511904245380765e-06 11 12 7.734424091923545 0.3756397698438658 5 | 27 8.065631537455747e-06 21 8 5.563118091615323 0.057423544086562285 6 | 19 5.675814785617007e-06 16 11 9.145724751236456 0.11703244297573095 7 | 33 9.661974362975782e-06 20 21 7.268901594956713 0.25950782595178434 8 | 10658 0.003090821669043701 1777 26 7.074869576597444 0.6038269760032314 9 | 175 5.0846034430610124e-05 33 102 5.35898993223958 0.6210382942935951 10 | 685 0.00019860184302510328 182 268 9.039397094922295 2.304224191342023 11 | 1 0.0078125 1 0 11.345174658202035 0 12 | 1158 0.00033580888472591557 167 378 4.763993584460806 1.8597876632185548 13 | 434 0.0001258313859428366 134 120 5.4456940190565435 1.66410816296181 14 | 1589 0.0004608228108741423 316 426 5.030348168333266 3.89194300769366 15 | 142 4.151167998353565e-05 74 82 2.6774266278054855 0.8907699584314562 16 | 259 7.586971149297238e-05 101 84 7.890648365237997 1.5592259954847907 17 | 16 4.766366137351777e-06 8 15 4.063822827392764 0.2647821558821443 18 | 106 3.084588435877954e-05 72 64 6.332820486427797 1.0756706399766476 19 | 93 2.7254983559090564e-05 36 36 1.29193133549365 0.6099050518407093 20 | 193 5.666440987687088e-05 44 106 10.400540176480407 0.7759974777623433 21 | 49 1.451426530121988e-05 16 25 3.777290283724036 0.3811083148562267 22 | 2384 0.0006912565327221821 92 459 7.9972077941613335 1.3567025141519673 23 | 27 8.17344659106733e-06 19 19 4.5219363584997705 0.42408850658507713 24 | 388 0.00011278938616992826 177 169 9.10691161197994 2.1539942563957806 25 | 100 2.9337816151641303e-05 62 57 1.7365802679073923 0.6068010668582088 26 | 88 2.6348394035442183e-05 54 49 6.489692288555879 0.8162908056037211 27 | 16 4.782383611727839e-06 1 12 5.372755910458258 0.01124504836401891 28 | 1 0.005494505494505495 1 1 10.43184439272886 0.002292152796822366 29 | 168 4.906451779654755e-05 67 29 1.549539454503479 0.835052185890721 30 | 398 0.00011589213063314376 151 202 4.988218643870759 2.110743134896715 31 | 208 6.0999410827805995e-05 72 143 3.8941980440887365 1.263548776412275 32 | 268 7.787501176842529e-05 33 124 5.207687643425371 0.4756923504115048 33 | 1298 0.000376567629573498 235 314 8.33862695703825 2.739619821750043 34 | 6 2.037941027421176e-06 3 3 5.9476672194423275 0.06301211967252274 35 | 72 2.1659848326912092e-05 38 30 2.368492036665256 0.5108138794569917 36 | 107 3.130961995388415e-05 62 63 4.417450744097986 0.5400365572486565 37 | 40 1.1818158719053413e-05 17 21 6.785258596317582 0.32189538970732773 38 | 3649 0.0010580338373660785 694 17 8.533040017626089 0.3822403077607651 39 | 4134 0.001198660423039564 170 626 8.956357284867892 1.9182143572498738 40 | 705 0.00020488474288708468 256 302 3.9528423139682447 2.987861997949779 41 | 175 5.0846034430610124e-05 57 52 11.534772511525732 0.934281740440992 42 | 3667 0.0010650320104998738 572 613 7.897344716547689 5.054732217147231 43 | 1391 0.0004034081170585371 328 290 8.49121849191176 3.435098194282749 44 | 11 3.271034985206001e-06 9 6 4.359271006091434 0.09410848692947074 45 | 1 0.004464285714285714 1 1 8.019103100943179 0.00633325950757327 46 | 98 2.842092197470886e-05 47 44 4.823461787705275 0.8254081357264342 47 | 42 1.2323589286458311e-05 17 16 6.030161359388491 0.36904039476392875 48 | 244 7.15705182262831e-05 115 110 6.170740785286224 1.4413121422724462 49 | 14 4.082087276192276e-06 9 7 6.10926514635547 0.09232043752960617 50 | 18 5.351988382617218e-06 8 6 5.550097491874617 0.15006054212733216 51 | 44 1.2889217759232927e-05 20 19 10.137522771070417 0.46317272950248267 52 | 23 7.02141929748563e-06 9 1 8.777126459094339 0.009517415223070015 53 | 103 2.9890421135719925e-05 19 48 6.674638112130136 0.2769659854176842 54 | 23 7.02141929748563e-06 1 12 10.043388689538716 0.003789710282767936 55 | 217 6.361593564529865e-05 44 100 5.978023909722223 0.8903795261808797 56 | 45 1.3266094868497623e-05 17 16 7.299744395416031 0.28010229996977565 57 | 66 1.9456939140463182e-05 32 30 4.609670435558012 0.4427521520610538 58 | 5 1.4759350269786163e-06 4 3 4.765658926990652 0.02061023191826134 59 | 3 5.126943111439235e-06 3 1 9.703837860243162 0.022865474140012217 60 | 49 1.430940615672421e-05 30 11 4.099850514648258 0.21185848848741326 61 | 12 3.5043443649120512e-06 7 7 9.483766805171493 0.12885040230348455 62 | 7 2.04420087953203e-06 3 3 10.915836289191702 0.08068803330760811 63 | 48 1.3947122970039837e-05 20 26 4.956107615326269 0.1558543273688881 64 | 8 2.6315945291781335e-06 8 7 7.021002829374496 0.04736702547737448 65 | 1 0.003745318352059925 1 1 22.21449650648294 0.0033079676580661443 66 | 1 0.003745318352059925 1 1 20.581345068331093 0.0026849017095088045 67 | 177 5.142713196695996e-05 112 27 8.690233199124387 0.5123412693936878 68 | 10 2.9105650309000137e-06 6 6 10.290387004330634 0.10958703270703897 69 | 1 0.0033222591362126247 1 1 18.84706840092269 0.004497418451219507 70 | 1 0.0033222591362126247 1 1 11.251595977392945 0.00472845013554347 71 | 11 3.716403291922465e-06 9 7 2.073655112306792 0.02435094986724179 72 | 1 0.0033222591362126247 1 1 11.800631290748065 0.001895866291855079 73 | 1 0.0033222591362126247 1 1 8.233619676759702 0.001263528035170237 74 | 1 0.0033222591362126247 1 1 8.90001825422235 0.0016056804689516242 75 | 1 0.0033222591362126247 1 1 19.38848898462376 0.001263528035170237 76 | 1 0.0033222591362126247 1 1 10.169944248762794 0.0033079676580661443 77 | 1 0.0033222591362126247 1 1 16.71321871735648 0.0006154575319020823 78 | 1 0.0033222591362126247 1 1 10.73083520600657 0.0007470232763421683 79 | 455 0.00013219849880614587 211 126 5.82768107421978 1.580822374067774 80 | 661 0.00019246016205669742 173 221 5.604436306495126 2.185090709543422 81 | 5 1.6427903516951297e-06 3 3 5.7506088046511925 0.0022378147887976495 82 | 546 0.0001585387949657835 153 203 5.628989495640396 1.9924813373535923 83 | 1 0.0031545741324921135 1 1 8.019103100943179 0.009081428620129837 84 | 43 1.2566059482465379e-05 25 31 5.788483344305879 0.49333266746508775 85 | 70 2.078467492471494e-05 34 35 6.3496898115274645 0.7267279837163847 86 | 44 1.2833750430222315e-05 27 14 4.575944780984459 0.08698926835316927 87 | 3 1.4899480653769344e-06 3 2 9.409714854479276 0.0909127051211944 88 | 37 1.1145195998814391e-05 15 14 8.178561841990101 0.2396882907038032 89 | 443 0.00012908809315323165 149 189 8.197466484538301 2.021681868848889 90 | 31 9.479145421404584e-06 20 25 1.8243588656824523 0.42087302241861996 91 | 128 3.715751943367296e-05 76 58 6.615472648941102 0.9461573544007746 92 | 12 3.819321885764719e-06 9 9 8.598461888539344 0.11999434629115578 93 | 913 0.00026518234263185196 242 290 7.453826284126075 2.5783171310275623 94 | 17 6.461598340965621e-06 13 10 8.571555094035265 0.117274870606225 95 | 3 1.3229834865201213e-06 3 2 1.7687742079486455 0.00464242410980366 96 | 70 2.0327924266285354e-05 45 25 13.586177568827539 0.3223897994790666 97 | 263 7.637491545761497e-05 92 76 9.11862566197438 1.2984573354401832 98 | 148 4.2979039877289035e-05 27 35 10.931455714098135 0.5078595783030051 99 | 29 8.421568624603932e-06 2 2 4.222557897830428 0.006997957187695385 100 | 358 0.00010394711588854895 101 83 7.293630417021655 1.3000000357780566 101 | 97 2.8168695054709705e-05 2 3 5.689734432650421 0.024593207768652477 102 | 686 0.00020039360107593253 213 237 8.359001455056468 2.430071086382676 103 | 12 4.624751515955007e-06 8 9 -0.9491127346426379 0.15823850419293423 104 | 50 1.4738245806821685e-05 28 3 4.563570258118945 0.019857141321899283 105 | 34 9.870579286879677e-06 20 20 6.468547454062673 0.3675485681946152 106 | 5 1.608155535657311e-06 3 2 -0.6160294002346207 0.04149017387359199 107 | 61 1.7818058924028764e-05 24 2 6.736110402005261 0.02768042005600941 108 | 106 3.0962528622082766e-05 22 31 8.921596471940298 0.4516785446058934 109 | 85 2.467360449721435e-05 20 36 7.54210504365723 0.3650258392301292 110 | 241 6.995692569210186e-05 82 122 5.528547585749893 1.557295939060091 111 | 335 9.720279375337127e-05 95 132 10.539664465576642 1.3494163389099965 112 | 1921 0.0005573084872193295 285 548 7.216045075628494 3.500895465951496 113 | 258 7.539999257689996e-05 101 101 4.440203558923622 1.5788583229095523 114 | 233 6.796522047567487e-05 88 86 7.188270496744855 1.3078467354141436 115 | 193 5.613968950970271e-05 78 79 5.527953526747119 1.3224802069452541 116 | 5 1.4955431319125874e-06 5 1 2.4620464415771397 0.00848814959334345 117 | 22 6.5803897804153844e-06 15 15 5.691509594651773 0.15888864863710828 118 | 506 0.0001471829588894144 121 285 2.7398076705192027 2.0081243826571784 119 | 827 0.00024122327912566916 245 211 7.335385004010354 2.219537323229357 120 | 25 7.922205212367386e-06 17 12 9.199234956136971 0.3072784508031836 121 | 107 3.168169079558055e-05 48 54 5.393969493415333 0.7622271325213577 122 | 7 2.3861068587278165e-06 7 4 12.171421751143695 0.22337746251311463 123 | 8 2.7269792671175044e-06 2 5 6.639824113912228 0.004707410815237272 124 | 190 5.598726319229356e-05 93 95 9.924690362954737 1.3936622623028259 125 | 5 1.714088055445949e-06 4 4 9.780724178511154 0.09221007105239006 126 | 120 3.4805873143034155e-05 47 70 5.149053095205045 0.9111036564086964 127 | 18 5.326179682222366e-06 5 10 2.9030299293207786 0.1348685680050906 128 | 127 3.6989195950993686e-05 83 65 2.4088129265613607 1.016576359205755 129 | 40 1.1694983144605544e-05 17 4 3.0400147529117225 0.19113310666130195 130 | 21 6.599481846396746e-06 15 7 11.22552587475243 0.18706307145490397 131 | 4 1.940487198120832e-06 3 1 10.797731432660527 0.07297039689780697 132 | 4 1.940487198120832e-06 1 1 6.316412093589154 0.012117720601556569 133 | 1496 0.0004341326966053783 287 313 5.8855657127449845 3.339603499632228 134 | 24 7.125249717736201e-06 10 15 4.692616928037924 0.2803366371776372 135 | 452 0.00013104822342678349 114 190 2.697479784133337 1.899150969825857 136 | 218 6.320467413061681e-05 67 85 3.057840664982694 1.329362758990441 137 | 1723 0.0004999194848188988 290 440 8.829835595026767 3.636622909956102 138 | 19 6.33006912768649e-06 17 6 1.3024742404395213 0.19355064014722106 139 | 129 3.784330700045441e-05 50 65 4.4231318863845885 0.8472846780909605 140 | 186 5.398586963399322e-05 20 22 5.076192786967694 0.3266281487774754 141 | 6766 0.001962486516244173 716 567 8.19939610117464 5.117046763007849 142 | 608 0.00017738027998076824 86 155 4.9688049209892995 1.6338460140631417 143 | 592 0.0001723372010226117 172 275 7.577301273621361 2.306240919501176 144 | 647 0.00018816256780541374 197 180 10.284475248772324 2.3134594170457428 145 | 435 0.0001265955403446891 169 151 9.991124384688106 1.7318626141753575 146 | 33 9.693491789612455e-06 19 13 10.489302258957135 0.2103283483233303 147 | 13 3.9393330587921215e-06 5 8 11.400407333692971 0.0486086490340379 148 | 8 2.4032698890109884e-06 3 2 6.082008937115202 0.007647502363911332 149 | 7 2.801444424745399e-06 2 5 10.708760382745302 0.044843911324520404 150 | 119 3.456453765574011e-05 48 61 4.789157343968732 0.4710411282010664 151 | 8 3.2016507711375984e-06 2 6 6.9582545562369615 0.019532445556167392 152 | 10 3.469191497983012e-06 5 4 8.425741574601387 0.16391762637782872 153 | 273 7.934328176020019e-05 102 172 9.305547898160045 1.5453129364067928 154 | 444 0.00012955915455064858 122 129 5.495466546470885 2.0425164548130232 155 | 6 1.7993070868408576e-06 5 5 4.283534471660124 0.0400008042688214 156 | 2459 0.0007131236206611546 455 452 9.406281915032118 4.461400846956547 157 | 193 5.630446654872874e-05 69 34 8.794254376651917 0.5704147956502438 158 | 117 3.459876083207358e-05 64 81 2.9718113932220214 0.9981412901146793 159 | 7 2.3379375116061896e-06 6 5 7.843569395165333 0.08348672735234135 160 | 216 6.347129819080354e-05 58 64 1.3672400793618 1.122066055378876 161 | 13 6.378036006466348e-06 7 7 5.523130368169784 0.29352683140806113 162 | 4403 0.001277691415033811 423 719 8.500614453602026 4.125964731468104 163 | 344 0.00010002090553229003 140 128 7.31855082445765 1.7675664644818483 164 | 4391 0.0012732123486809885 676 672 6.16551797890615 5.62981050147528 165 | 18 5.678269263528477e-06 7 6 7.374618930434789 0.21847654970884273 166 | 502 0.00014603156318061399 82 201 9.956081635225333 1.133025918901613 167 | 7325 0.0021237947645922095 124 33 8.257329209295843 0.7611301985497237 168 | 11984 0.0034746152162284014 476 687 7.824905932177436 4.426864126102786 169 | 3 1.2308894160742669e-06 2 2 9.52613928249031 0.037805651092158124 170 | 61 1.7877123787652592e-05 26 26 4.094126528024782 0.6885680093393202 171 | 85 2.5167541806247828e-05 30 57 1.809846260100267 0.6206679594166024 172 | 31 9.469089531768796e-06 17 20 4.623854306362959 0.3097738153692877 173 | 533 0.00015490480039223175 154 152 5.1904827609690365 2.246882872935852 174 | 36 1.0591369093325848e-05 9 15 7.50438253444525 0.20996240377233147 175 | 39 1.1473983184436336e-05 4 1 3.531377996816523 0.018748792028679667 176 | 187 5.516753081047593e-05 95 55 5.849294657306634 0.8030185385684413 177 | 1 0.0016806722689075631 1 1 10.73083520600657 0.015889265721209953 178 | 485 0.00014072537110150827 157 106 6.598869370018531 1.654504302047754 179 | 42 1.2845391945021723e-05 16 11 10.135996916839456 0.22647704651491182 180 | 43 1.2515877700664448e-05 8 29 10.99350286817752 0.18732647398191557 181 | 94 2.7314036450872217e-05 42 61 5.739631713243284 0.8674821769588438 182 | 4 1.3258849536255163e-06 3 3 10.101699125198062 0.05482747114433971 183 | 6945 0.002013834419025443 721 1231 4.592741273830154 5.911139159872371 184 | 150 4.410400076211713e-05 46 67 6.77071403340344 0.8106712940119898 185 | 9 2.7889386972681416e-06 9 7 11.157403506341014 0.323368382598677 186 | 59 1.716989994313562e-05 27 39 9.497292452842713 0.4764305416306226 187 | 1033 0.00029987279355456377 307 264 4.489797604988358 3.155044044924548 188 | 851 0.00024673925807157756 173 265 6.176397322027728 2.403531395527803 189 | 28 8.239227136953904e-06 17 19 8.682731697083204 0.3001766172395694 190 | 155 4.52606414338104e-05 68 98 9.883788316506791 1.1370730346697488 191 | 3 1.0691078437591542e-06 2 1 4.273927022576643 0.010074272214716058 192 | 3 1.0691078437591542e-06 2 1 9.869303743234244 0.0166649173509935 193 | 51 1.4970626749692002e-05 19 15 3.7893149252042058 0.33877743978904246 194 | 18 6.414647062554925e-06 7 16 9.795513941448842 0.03991607625101075 195 | 20 7.127385625061028e-06 3 16 5.8869944198338215 0.014906624408541368 196 | 382 0.00011264834489502471 100 163 6.593003943431235 1.2732135742498434 197 | 740 0.00021455637425242327 143 277 5.014624284845863 2.0827948630861317 198 | 447 0.00012982897718872923 134 188 7.647775265170532 1.9460796429435696 199 | 4432 0.001285092864777031 545 695 5.550724906402197 5.038120332935743 200 | 144 4.1768840792842214e-05 55 62 7.241739942769785 1.026105011069379 201 | 83 2.4329060212372178e-05 53 33 2.564981974125267 0.7200671915381313 202 | 2 2.3120558685180067e-06 2 2 8.282954178543948 0.016867012953264537 203 | 71 2.0886401217177093e-05 35 49 7.514613786808403 0.559666163112184 204 | 14 4.063957400437398e-06 6 7 5.385729026473085 0.2739241464505674 205 | 505 0.00014658785561705794 184 170 4.423302565587679 2.227246647259215 206 | 69 2.0550711412018533e-05 41 36 2.183499229998326 0.7871473539111844 207 | 10 3.842473938420513e-06 8 6 0.36006467487054733 0.21016176436207637 208 | 3 1.6415015361718542e-06 2 2 5.011733688953262 0.04883139786573994 209 | 1394 0.0004042491984614171 318 350 4.705893646030068 2.9626228437635054 210 | 6 2.1237569827359796e-06 6 5 5.840760743717843 0.11215048039264927 211 | 1914 0.0005549072617761961 477 445 9.07349240638704 4.331249288643162 212 | 35 1.0165426339432879e-05 17 13 1.1489164405014618 0.3342206805087409 213 | 834 0.00024207853155278864 192 192 5.437420899094263 2.554289863952476 214 | 2003 0.0005813948425662298 148 379 8.31867882870628 2.2400021726919257 215 | 1765 0.0005126279282599723 244 79 6.232232307142635 1.0508382105491596 216 | 97 2.8293419679794326e-05 56 34 5.938155322544707 0.4786698174898741 217 | 722 0.00021001137610099629 266 222 8.363516655862565 2.0850334627329743 218 | 508 0.0001474403998305016 199 214 8.334204360152194 2.2499222973614184 219 | 7 3.401145700223018e-06 1 3 9.680875202876127 0.006453842579501051 220 | 248 7.217447130744636e-05 125 91 11.429493255975771 1.55158502688235 221 | 21 6.11154797361441e-06 1 11 7.064180028137493 0.0017424440879669708 222 | 1144 0.000331809253765629 329 341 5.773457429023858 3.57247227550192 223 | 3 9.178915419354049e-07 1 0 12.760374277431696 0 224 | 64 1.8692833518119783e-05 42 32 5.3276192755768665 0.5068770736382571 225 | 154 4.52409196480139e-05 68 81 3.508818150862341 1.1135773155217639 226 | 34 1.0112474510615719e-05 13 21 2.1074593480579633 0.18605560260601095 227 | 498 0.0001446919833087047 185 169 8.596240063359756 1.9391137994464083 228 | 13 4.6313795917902026e-06 9 6 4.121153038102905 0.19191844832235616 229 | 6 2.1375598115954782e-06 4 4 9.716888296437643 0.0546936633660538 230 | 69 2.023286562181021e-05 28 33 8.96971087990911 0.45288509317930814 231 | 561 0.00016302608969573403 201 152 8.177664388678068 2.326675028813932 232 | 92 2.6819522396779092e-05 40 25 6.840151366633746 0.44905277416014083 233 | 114 3.428715795552294e-05 43 12 5.437071789963219 0.10634971130961451 234 | 332 9.670743234484938e-05 139 121 4.612241838977305 1.8641960069683627 235 | 306 8.876947235251576e-05 128 103 11.266467698858456 1.995588896732662 236 | 398 0.00011541192924378798 175 134 6.902371745091238 1.9353888874698746 237 | 60 1.7575351389858786e-05 41 40 4.781152961730579 0.5336024919615796 238 | 2 0.0024600246002460025 1 0 18.259664800343806 0 239 | 214 6.24112862936941e-05 101 108 5.041127895075303 1.4695090171892353 240 | 108 3.148397072340545e-05 54 46 12.587271168445922 0.8562227895997992 241 | 185 5.3762737772973036e-05 99 101 7.04033050080774 1.4154067919748452 242 | 295 8.584807567900735e-05 77 91 5.47179543146326 1.1065550773042931 243 | 436 0.00012674451764554034 142 181 4.9731135289874935 2.120672176054881 244 | 8 2.462168018961156e-06 7 6 6.254116069394989 0.055292286216953944 245 | 3 1.4221480598345094e-06 3 2 10.58905210512791 0.09753480856150969 246 | 29 8.505064766068194e-06 16 15 4.844553645819836 0.35629617570924543 247 | 139 4.159102734924225e-05 57 50 5.1427116800555295 0.7102416850978678 248 | 17 5.536261863883512e-06 15 8 1.1314864013952992 0.24112515730174378 249 | 2157 0.0006259763649665667 531 254 4.92232016989946 2.894831276463482 250 | 37 1.0873372814452064e-05 11 14 3.1110242774754275 0.32096236736057016 251 | 73 2.20770980652599e-05 40 19 3.665394507706776 0.27375282394488004 252 | 39 1.2189431263015735e-05 24 28 9.586008287656773 0.45865685721326926 253 | 16 4.9271323319963355e-06 11 9 1.159364029329944 0.08939195073368593 254 | 94 2.7329021356467253e-05 38 35 4.563463553543742 0.8027078616619508 255 | 3521 0.0010210603484074822 538 891 5.956145005646289 4.987426132082937 256 | 60 1.843623260503045e-05 40 33 9.602401824422502 0.7881230844022118 257 | 4 1.8560089682353346e-06 4 3 3.9501364038284725 0.09303892889189394 258 | 2860 0.0008293941942406403 437 714 2.9805318629673985 4.27113393231958 259 | 870 0.0002525464373390198 312 124 7.054125805278314 1.6168301771949996 260 | 385 0.00011231438225047683 132 191 2.718778361794985 2.16870170629827 261 | 402 0.00011660058816585245 179 90 5.849599425110125 1.273221405285473 262 | 8 2.601575253816186e-06 7 2 7.4303847156641405 0.016322529632927983 263 | 1 0.0011668611435239206 0 1 16.285794811701813 0 264 | 23 7.324889752447389e-06 16 14 1.9335415149673731 0.2807273682933577 265 | 207 6.006690931376603e-05 59 45 4.907494158947437 0.8213113390747682 266 | 2580 0.000748016234851584 300 542 9.22786561180193 3.5905960830898156 267 | 650 0.00018847015009762754 149 271 6.279221032789875 2.104105466360044 268 | 2043 0.0005923873441938646 348 458 5.413784817829854 3.7459691281442398 269 | 48 1.4616588610023326e-05 17 19 4.050411137666863 0.3885694459757098 270 | 126 3.6838335258087624e-05 35 48 5.391423769129747 0.7622424343803493 271 | 36 1.062701138330036e-05 24 25 7.264728666255093 0.5616183274923258 272 | 1575 0.0004570395848513577 287 364 7.8749468214215685 3.191765081944127 273 | 2 0.002242152466367713 2 1 10.292901566308286 0.011607966376027705 274 | 4 1.2278420553093732e-06 2 1 8.248468606556743 0.002360352452339088 275 | 87 2.5759299773406987e-05 42 42 2.878175019958234 0.7951948196416186 276 | 3 1.6449641068831879e-06 3 1 1.5572795906864216 0.002288111000903864 277 | 1 0.0011210762331838565 1 1 14.95715545306865 0.008204756670407709 278 | 1 0.0011210762331838565 1 1 8.800899899920305 0.011607966376027705 279 | 785 0.0002276433892359766 144 223 4.23771601254885 2.1542959213936594 280 | 139 4.246990289363642e-05 78 58 6.340431808899542 1.1908152040034048 281 | 252 7.314456617145086e-05 103 94 7.606094203475365 1.5208502770946675 282 | 33 9.660693040553833e-06 17 23 8.140695809792419 0.5009984004017715 283 | 602 0.00017469093922786605 187 77 3.969869679645084 1.1893797690166679 284 | 288 8.378398742541989e-05 103 120 3.4673974528788785 1.8159301987219716 285 | 7 2.6008536744860714e-06 5 3 8.35752336244577 0.16670446391718896 286 | 285 8.32660874463372e-05 41 166 5.458667388966852 0.7608906040775102 287 | 10 6.760191664954085e-06 9 6 8.526302682492645 0.17424263389129457 288 | 74 2.151222824823018e-05 38 31 3.059619137307475 0.5450755659493832 289 | 124 3.6156652780373705e-05 74 75 4.31492087496837 0.8842617507548952 290 | 246 7.152817919495325e-05 99 91 2.17459421909915 1.459421694957219 291 | 78 2.4104405450933164e-05 21 42 7.360813493781494 0.5607819617901082 292 | 766 0.00022233180787397096 221 316 4.46798749849951 2.9187090972775684 293 | 80 2.3395958465154936e-05 31 50 3.5946534350490564 0.765447461068563 294 | 493 0.00014299661739036106 138 193 4.999261214556867 2.044245905216679 295 | 2086 0.0006049620811159607 414 461 6.989891042892845 4.20764662714932 296 | 360 0.00010506096162298174 127 137 4.541906972606963 1.5537643260402694 297 | 45 1.3197514292174783e-05 26 13 1.3326497169296418 0.3940669485370445 298 | 337 9.787557719725589e-05 145 113 8.211815185751972 1.6875361247286946 299 | 203 5.903591157409225e-05 68 56 5.9061555199165285 1.1525281880677654 300 | 1418 0.0004112894009735203 293 379 5.617429125048247 3.406896061044271 301 | 271 7.897552137686682e-05 113 57 5.832549366572857 0.9831673881963251 302 | 36 1.088398518326817e-05 29 15 5.15990444233633 0.4802683730812575 303 | 513 0.00014896326792306874 144 136 5.161901217447183 1.8931314406224469 304 | 736 0.0002136285742150311 211 233 4.546817674871497 2.4433105208960715 305 | 447 0.00012981521515637652 167 166 10.807646109995563 2.2779767377781623 306 | 54 1.5832847352879862e-05 28 29 9.110737499803355 0.53981689822975 307 | 2 3.225926123065856e-06 2 1 9.82000492962179 0.0030553449575416586 308 | 238 6.901037098293674e-05 83 91 2.713904439177852 1.4979499640246834 309 | 53 1.635752992656395e-05 27 42 8.348292281749304 0.49023552820625677 310 | 1 0.000992063492063492 1 0 10.390101189847117 0 311 | 532 0.0001548195740832033 149 207 6.955081531401752 1.6111038098843347 312 | 1 0.000980392156862745 0 1 15.933578950075866 0 313 | 22 6.6315955739525395e-06 14 14 5.139184281105427 0.3748339177974986 314 | 39 1.155825419386808e-05 30 17 3.1854750689513405 0.38477431992723304 315 | 10 4.507038190838814e-06 7 9 5.025846150733707 0.058178946500536996 316 | 12 3.6205877481120897e-06 7 8 7.133357702589102 0.11189691589308241 317 | 4 1.2472529254317366e-06 2 4 0.5376836734895041 0.021007484753180653 318 | 2 6.723109411210247e-07 1 2 10.585197399930298 0.02000663346790765 319 | 30 8.710945564430074e-06 15 7 3.4693816526077126 0.09245438452979565 320 | 1 0.0009596928982725527 1 1 12.949136487483969 0.0004385003649508474 321 | 2 1.4826412364041799e-06 2 2 7.8517034626340525 0.0035881602291153074 322 | 2 6.218596089746779e-06 1 2 9.45708893307872 0.0016374283591222376 323 | 12 4.04751512927426e-06 11 7 3.888039101140318 0.10032453153518568 324 | 177 5.1701580812345754e-05 88 92 6.183153797597252 1.2823648042820273 325 | 332 9.65375302733844e-05 132 119 5.350284839380468 1.4887597303051 326 | 8 2.3502149271550883e-06 3 4 0.5524619360713207 0.043343266900971414 327 | 142 4.1190247309725714e-05 82 58 4.932006953104294 1.2437106444195403 328 | 156 4.608952535765915e-05 68 81 6.895092620951216 1.0537633219260907 329 | 4 5.6019741356854154e-06 2 1 9.091571757430799 0.013130855711226498 330 | 88 2.5776932867200464e-05 41 51 2.450271372552476 0.6736005144889696 331 | 325 9.423327170960611e-05 114 135 4.043785664229671 1.9810124558301354 332 | 1121 0.0003250477567935271 292 327 9.14096106991354 2.612071957819731 333 | 10 2.9587707136140734e-06 5 3 2.4859763691284216 0.02090867764039114 334 | 192 5.595145744804237e-05 65 69 4.824417516853835 1.314009224802897 335 | 78 2.342457003150605e-05 51 55 4.03160947631333 0.8452756460356072 336 | 18 5.766090837073017e-06 14 7 10.183947129476401 0.31122932947826576 337 | 91 2.6673044748575776e-05 41 53 4.752796687693105 0.7221760119648479 338 | 246 7.195067342613021e-05 95 92 4.233751646056957 1.4788518983672754 339 | 18 5.522748045790945e-06 5 3 4.9581330965836266 0.09341528922854542 340 | 47 1.381653580643445e-05 22 23 4.276459771826968 0.5212596940493719 341 | 19 5.832591724411883e-06 17 18 2.962228053588098 0.14075637730793714 342 | 44 1.2876536258570214e-05 31 21 10.32244855173109 0.39120445432198625 343 | 70 2.06092931424638e-05 35 43 1.177523289892379 0.6615489455705643 344 | 96 2.7906287199298855e-05 45 45 2.864872897921405 1.0103828943234072 345 | 144 4.183512081634266e-05 36 91 5.813825248250721 0.6402261321703393 346 | 251 7.290448263813148e-05 108 105 5.135345826529244 1.8062783834662148 347 | 25 7.29834099951072e-06 16 18 2.9978127165657424 0.25516334815431846 348 | 150 4.4557124461415756e-05 60 70 6.233568875434697 1.0693502508028558 349 | 517 0.0001500138262259568 211 150 6.505458414167756 1.9625628946688845 350 | 1 0.0008920606601248885 1 1 10.700673748585855 0.008021742569369243 351 | 383 0.00011122278562838986 89 163 8.579178674561662 1.4201415798596468 352 | 135 3.930772973402643e-05 58 73 7.464025597054016 0.9247971464983009 353 | 48 1.3953021919325372e-05 36 22 3.793554841365092 0.4797087610633269 354 | 1 0.0008920606601248885 1 1 18.43393359419032 0.0019396928316665462 355 | 1 0.0008920606601248885 1 0 12.353647101805425 0 356 | 4 1.3199041881549818e-06 3 3 7.052155901216925 0.01711695214518527 357 | 1 0.0008802816901408451 0 1 11.695302406575525 0 358 | 6 1.7424572655095396e-06 6 4 4.758379960671293 0.10497737776794813 359 | 2 7.424161264661326e-07 2 2 1.892129987018231 0.007772177872396724 360 | 4 1.6101050190998708e-06 3 2 1.8475293488671216 0.018262894074849205 361 | 1 0.0008802816901408451 1 1 10.410803734923121 0.0006668666176152647 362 | 1 0.0008802816901408451 1 1 14.275079794019133 0.0008854925702232464 363 | 8 2.3233040171088106e-06 7 5 7.420799034545755 0.27042095204368227 364 | 336 9.77715079858371e-05 99 163 4.95704420398247 1.5292002910387268 365 | 1561 0.0004532549278770073 387 509 4.577314934152088 3.900859155683382 366 | 9 2.680360728903253e-06 6 7 14.544069727772998 0.08371504016098189 367 | 221 6.430815770804562e-05 93 73 8.439162683851714 1.2560945045648866 368 | 90 2.6516149218745858e-05 53 37 7.111759526859853 0.6417978157899568 369 | 18 5.420285962242288e-06 13 12 4.923420918970343 0.30121704328474963 370 | 1 0.0008620689655172414 1 0 11.588024847527844 0 371 | 255 7.422102847642565e-05 118 100 8.385167459641762 1.3057425368015707 372 | 2 9.153204971837877e-07 1 2 5.868797518286696 0.012142623827111492 373 | 1 0.0008561643835616438 1 0 17.38482229353022 0 374 | 142 4.119342576132413e-05 75 53 6.084540645115872 0.963873470434121 375 | 687 0.00019968672147248465 218 313 7.024586745763579 2.7413345672257 376 | 48 1.4895743758450619e-05 29 39 3.5176480478625947 0.4435676108444237 377 | 12 3.5142560188223554e-06 8 6 5.468563445744934 0.24457016930270428 378 | 54 1.7546156952973375e-05 13 31 7.957443179631184 0.4237560259268194 379 | 3 1.2893853500036534e-06 2 2 8.32089816788178 0.01753954995980538 380 | 40 1.1686226000508936e-05 30 17 5.910344199171338 0.17621014391346573 381 | 10 2.9812301748193374e-06 8 8 10.271174223692658 0.0999155556659583 382 | 7 2.0733976856735033e-06 1 6 -0.24506248008931467 0.03124948750813948 383 | 69 2.020442189878229e-05 41 47 5.517190729340721 0.5544212020616693 384 | 28 9.149393918005746e-06 20 16 5.053599462768225 0.37244100834250493 385 | 11 4.392402899944256e-06 9 2 3.5353686504532154 0.05220025075877641 386 | 1 0.0008223684210526315 1 0 14.327198809475679 0 387 | 56 1.6758785119754987e-05 34 41 10.340453169855216 0.4503837692357214 388 | 118 3.459295669811987e-05 32 25 6.534293778053465 0.409452373197387 389 | 2 6.32029351443081e-07 2 2 7.544181810933127 0.04277071271051654 390 | 291 8.530274707090593e-05 134 126 4.586167419412203 1.7370866652508685 391 | 190 5.535416028758525e-05 96 104 6.627572901567011 1.3565465934557583 392 | 70 2.0497041544867878e-05 30 35 8.09536700995703 0.7113095947239567 393 | 233 6.831880898911327e-05 112 50 3.910370371404775 0.8214114488657053 394 | 2 1.28567011376252e-06 1 1 11.573473036389043 0.004773119449208506 395 | 207 6.007841539261245e-05 96 72 10.245236749749195 1.217255013036027 396 | 60 1.7646394488678072e-05 28 24 10.931999063742218 0.3918904035538229 397 | 33 9.697853101035466e-06 25 12 5.781136644667741 0.20933065989292757 398 | 3 0.0023148148148148147 1 3 10.850610852134185 0.0005539074308921532 399 | 195 5.76890477490769e-05 77 80 7.6097647405400775 0.9877466978538482 400 | 190 5.563031489100557e-05 90 94 4.483363784499901 1.1868984315541784 401 | 134 3.8870222479809964e-05 53 80 3.7727776655881318 1.1033058579261854 402 | 276 8.020131692887073e-05 112 159 3.9175362768908744 1.4461221525271633 403 | 16 5.084612722527281e-06 8 14 6.944402958924185 0.16177316536395192 404 | 113 3.280214183471959e-05 63 50 4.80243555201489 0.7936055975370063 405 | 1 0.0007757951900698216 1 0 8.602259478323056 0 406 | 86 2.5108755948731426e-05 45 30 9.659912792697178 0.5376423322684055 407 | 2 7.474534261770149e-07 1 1 2.090865285177758 0.001407556579726448 408 | 45 1.3644066331992879e-05 31 17 6.764085802504677 0.2616887684188674 409 | 21 6.2856467257018595e-06 12 12 6.710322663919496 0.27848077326614395 410 | 748 0.00021721561524725845 205 307 3.8447496622917545 2.6768291052349866 411 | 566 0.00016521157882139345 216 168 4.439379125790572 2.429506557735454 412 | 599 0.0001740740654634729 216 273 10.689033013471656 2.5715649770521845 413 | 154 4.56348489042006e-05 53 58 7.703924231511095 0.8437516956062875 414 | 331 9.644716050526657e-05 110 154 9.232028627157343 1.5053236784665711 415 | 94 2.7772837101673994e-05 45 33 10.276684928951296 0.5940302833835809 416 | 10 3.3195274187985503e-06 3 6 9.155357493868737 0.03649005569347738 417 | 723 0.00020961327655123248 191 192 8.563528085705638 2.3276582085463398 418 | 20 5.984952632092393e-06 10 7 9.422969304074492 0.05213936947026195 419 | 139 4.159542079304213e-05 75 66 6.263795988907364 1.020553626288315 420 | 166 4.8529327500458985e-05 89 89 6.179756160893998 1.2198000326324858 421 | 144 4.197724483519559e-05 63 78 3.4801372981518317 0.9284253653298873 422 | 382 0.00011096738285170487 135 159 7.013371045809458 2.0264062721865086 423 | 1 0.0007326007326007326 1 1 11.331051478345831 0.009601247865953516 424 | 42 1.2267562549234547e-05 30 23 5.537870061491704 0.42039445979611045 425 | 374 0.00010843065757975235 169 106 4.299680138190404 1.6654570040615155 426 | 24 9.480144430000392e-06 8 8 7.702134451894313 0.14026727146764317 427 | 52 1.5608088111259256e-05 9 24 7.528894569877621 0.1143392274366474 428 | 34 1.0376818919016253e-05 14 11 10.516511943623764 0.11687980687519478 429 | 59 1.7958326941386456e-05 3 25 8.0146213014191 0.02180075443432171 430 | 135 4.027525601637741e-05 44 71 7.686187744056777 0.3846625075923414 431 | 140 4.0775182825725996e-05 69 69 6.826665413852652 0.6985062339685113 432 | 102 2.9956272652595636e-05 48 14 6.402026421355346 0.4505191489607818 433 | 71 2.085191527778716e-05 38 32 9.306443404770143 0.528373319554615 434 | 22 6.469779512855011e-06 11 14 5.688272739735989 0.12818939923274197 435 | 68 1.9997500312460943e-05 25 35 4.892200392472889 0.3820411496198052 436 | 2 1.1168005896707114e-06 2 2 2.172261817473259 0.0081866198389872 437 | 1 0.0006839945280437756 1 1 19.484609678683423 0.0011640368683812088 438 | 14 4.162114354091879e-06 9 10 2.3539659621779774 0.09536979984774471 439 | 395 0.00011531085910093145 118 185 11.053878865056536 1.5555879455676944 440 | 21 6.707334406293907e-06 4 12 3.9044712110174817 0.08878280983078346 441 | 14 4.155530833445137e-06 4 2 7.284590068806289 0.12051763881910724 442 | 383 0.00011126104077005422 100 127 5.93320579394249 1.6128789831390669 443 | 124 3.604091690417825e-05 66 66 2.9327053958152565 0.9296306496023395 444 | 2 0.0013201320132013201 0 1 9.942848140184145 0 445 | 13 4.323355029924268e-06 10 6 5.759209166697204 0.27789382432068765 446 | 8 3.2229473853839337e-06 5 5 9.68969554633008 0.11360721598016563 447 | 209 6.064788187997987e-05 96 87 6.546841132968576 1.3180624359678947 448 | 4 2.3683087137774585e-06 2 4 5.637884194838125 0.010776199290849716 449 | 2 2.4091189972283085e-06 1 2 2.563606018847749 0.0026369416305847154 450 | 1 0.0006565988181221273 1 0 8.557664107549673 0 451 | 11 3.3983163505048197e-06 10 4 1.3698332559105164 0.20331011380542124 452 | 174 5.101612989293063e-05 71 76 7.968951188494107 1.316047657900766 453 | 261 7.615239582439782e-05 117 76 7.82015960698036 1.163416933702307 454 | 1024 0.00029736046128041554 266 368 5.077959710414832 2.90081230860453 455 | 1355 0.00039334192589496173 277 275 9.046943877015693 2.9778476789629527 456 | 264 7.667459938973989e-05 90 47 3.997662644070105 0.9406971235709357 457 | 106 3.1057465687360314e-05 65 60 7.921148801124421 1.1648923798332902 458 | 104 3.0238201431778837e-05 44 48 3.9924346576133223 0.7676684716545512 459 | 3626 0.001051818447897712 566 664 6.9574023910311755 4.818474305833538 460 | 603 0.00017507414520701863 223 142 4.180339528773026 2.067220883630784 461 | 4 1.4201948152237784e-06 3 2 5.841937189881641 0.01907263883444763 462 | 11 3.4800358253869877e-06 8 8 8.548195885109314 0.23990575880831827 463 | 258 7.51989054770161e-05 130 111 11.055787696999024 1.6116434581435737 464 | 10 2.923567137819001e-06 7 5 5.047602411026022 0.2174840339373068 465 | 2 6.027787497585117e-07 2 1 2.0798659939570348 0.003145726586091111 466 | 61 1.7765191350040962e-05 22 30 3.970618059772539 0.36131038087179895 467 | 30 9.139748583795957e-06 22 19 6.35446162686797 0.323702139482084 468 | 282 8.187575615309372e-05 83 88 6.5753353773441825 1.6208294755945152 469 | 1 0.0006215040397762585 1 0 10.142822680895277 0 470 | 2 1.0913229999596211e-06 2 2 1.8492062013662927 0.015621561097046339 471 | 572 0.00016625930230969633 303 326 8.25456290531734 2.833297909711296 472 | 29 9.009859893571807e-06 22 19 3.4601502620382325 0.3740270229527405 473 | 58 1.696063640986606e-05 25 35 5.559310353207211 0.6578230958095815 474 | 74 2.1698499929919708e-05 33 46 5.817015242875751 0.7257316740195904 475 | 10 3.1059537094871046e-06 8 6 3.022050600404712 0.1144289676149503 476 | 1 0.0006090133982947625 1 1 9.688502443329616 0.005814116247480392 477 | 431 0.00012504253912366359 144 161 9.093320325206067 2.004161768954843 478 | 266 7.740793984297246e-05 124 112 6.092634335264909 1.9272139216004878 479 | 863 0.00025033910360615944 147 245 2.142884468698179 2.1079883262966677 480 | 64 1.9314579882235385e-05 35 49 7.453913953930341 0.7320336105258034 481 | 793 0.00023037237530187352 257 222 3.0608984080801362 2.736891615659746 482 | 62 1.8161552871356157e-05 35 39 4.853112157250513 0.7747280569976557 483 | 38 1.1051057338992639e-05 11 26 1.893690894040045 0.3554941861742642 484 | 7 2.158556456119014e-06 6 4 4.646585157275993 0.14227752106999783 485 | 1184 0.00034358377611338266 356 254 4.556641317246076 2.954225080557776 486 | 44 1.2766385873877972e-05 27 21 3.987909563934134 0.39927927512468214 487 | 173 5.022766487521329e-05 83 69 4.189185984642634 1.1626610355549747 488 | 1245 0.00036312070307168034 198 315 7.739663108709063 2.912110537591986 489 | 4641 0.0013455805030284985 617 755 8.398032111259912 4.913733602589574 490 | 68 1.9887181190937997e-05 33 30 3.785083058417741 0.7367530349898098 491 | 100 3.0270659043707502e-05 49 68 8.238100606041725 0.7784234812123026 492 | 3 2.2876297943954596e-06 3 3 12.065755348061634 0.027324479766004904 493 | 17 5.454968433381198e-06 10 15 9.498645252927085 0.17023217693765225 494 | 12 5.2599188920506845e-06 7 3 8.775697820274692 0.027605951832914005 495 | 10 4.38326574337557e-06 5 6 12.082084790180943 0.11229395051813261 496 | 118 3.439972480220158e-05 75 54 3.54017675540518 0.8450850227949565 497 | 504 0.00014666635238162585 116 211 2.8687268420316183 1.4620362529143525 498 | 12 3.6969761199989155e-06 6 5 10.511349227712339 0.06315194480509825 499 | 3 1.572969257364511e-06 2 3 10.86935230735932 0.08211935504986856 500 | 3 9.668748670547058e-07 1 2 0.6570655008985777 0.01967541381151597 501 | 10 4.610838328792426e-06 7 3 10.890306088030908 0.04209333021357983 502 | 39 1.1978221750055053e-05 18 22 4.609092516217055 0.3570565727557995 503 | 2 1.4622811878987458e-06 2 1 10.657577841701762 0.012501511758042838 504 | 20 6.161418062689964e-06 12 15 10.660445878115173 0.29895682199173146 505 | 1 0.0005390835579514825 1 1 13.777645056181209 0.0034686206376816162 506 | 278 8.247662470050305e-05 148 137 11.112829038957992 1.9822526718952194 507 | 358 0.00010433384732810634 98 85 12.095997680515433 1.3178594038342097 508 | 1 0.0005136106831022085 0 1 11.805073326021752 0 509 | 80 2.3237359167080098e-05 57 41 8.314208987591476 0.6429058639952547 510 | 11 3.268126291095574e-06 9 8 11.151254773201563 0.08886263641860367 511 | 50 1.5077052785967967e-05 27 33 10.859252760825333 0.5866338332787397 512 | 119 3.4797051315416264e-05 50 86 3.642543121530587 1.218207720242595 513 | 1 0.0004967709885742673 1 1 11.877977266043509 0.07353583962983049 514 | 5 1.7128367522696799e-06 3 2 5.047499977800973 0.011916717476614606 515 | 46 1.3583393769592569e-05 29 30 11.253881242005878 0.447654341344265 516 | 2 1.339848570314583e-06 1 2 4.017127614293567 0.025603652926969693 517 | 2 6.110872617332574e-07 2 2 -0.3739425475332278 0.0018611061895739392 518 | 2 8.272848469957358e-07 2 1 3.288425462971929 0.007945231460889407 519 | 31 1.0674372415768872e-05 24 19 1.8761208802582687 0.10905207812969187 520 | 48 1.421371084159678e-05 35 28 2.836227307053856 0.3302041473496044 521 | 7 2.7437627413482302e-06 3 6 9.694408117129282 0.03945271627992989 522 | 11 3.960937950106585e-06 6 2 8.609312171691649 0.15931405312125912 523 | 9 3.2407674137235697e-06 1 1 11.098429182596261 0.022044855162806794 524 | 9 3.2407674137235697e-06 1 6 7.507795981847664 0.004726591381336946 525 | 244 7.084581482704185e-05 91 72 4.329620771862123 1.535469730832677 526 | 16 5.115020834119235e-06 9 12 10.653040742750473 0.23774951765145866 527 | 109 3.1828850721522085e-05 42 56 10.130773651482116 0.753838503732761 528 | 25 8.140371259540109e-06 9 12 10.347847739891185 0.13847410653779135 529 | 8 2.557490794631571e-06 6 6 8.425741574601387 0.0813404693853892 530 | 1 0.00046446818392940084 1 1 18.692363506654186 0.010081923500520795 531 | 4 1.3833832158264574e-06 4 3 7.628168256087613 0.18335157107036976 532 | 12 3.7515533775704004e-06 4 8 10.73083520600657 0.037211966890601446 533 | 6 1.998207607775825e-06 4 3 8.076716846054117 0.0623037571848969 534 | 121 3.524225995506466e-05 46 62 8.298463111335678 0.9829200084723129 535 | 21 6.203437295065018e-06 20 4 11.146257351026275 0.0443404641953554 536 | 112 3.31359868355458e-05 40 68 7.0537474833222165 0.2723540570597098 537 | 3 1.0766568942109236e-06 1 2 5.29424031245437 0.004531756892662856 538 | 67 1.943851698621751e-05 24 5 3.4478760259172763 0.23239066332807262 539 | 40 1.223928037927082e-05 15 12 9.623705503469916 0.22013008617256394 540 | 18 5.503296321627313e-06 15 13 2.8860155743404854 0.16679626101296863 541 | 2 6.401413432085804e-06 2 0 9.869303743234244 0 542 | 35 1.1361496860980724e-05 16 20 5.101160166261818 0.2709424876820543 543 | 3 1.0766735095249717e-06 2 2 7.936800675572473 0.007308852877808588 544 | 3 8.980868952956412e-07 2 2 7.13528390293382 0.013598560601889905 545 | 312 9.08020968299591e-05 126 109 7.7599569985483114 1.6581637966341114 546 | 40 1.2161673640879763e-05 13 27 8.375875102280641 0.1494854633621982 547 | 4 1.2299748562390013e-06 2 3 8.024349670970492 0.07162738984690209 548 | 140 4.0756795467844346e-05 27 20 8.422340488447407 0.4662178323786709 549 | 2 0.0007836990595611285 1 1 10.698320816476796 0.0030955821080947852 550 | 2 0.0007836990595611285 1 1 13.74724503115768 0.0008559543357198936 551 | 7 2.5535944760644294e-06 5 6 2.761433673266324 0.09545557307809242 552 | 186 5.4843151535165956e-05 60 95 5.362047391336728 0.9964584486242031 553 | 1 0.00040404040404040404 1 1 9.459485749505918 0.00858946867135361 554 | 1 0.0004 1 0 17.517622334763946 0 555 | 342 9.931458417244961e-05 136 152 7.767819949033596 1.7044111017003365 556 | 73 2.1355738330771133e-05 32 42 3.9851384981607 0.6325740002065516 557 | 2 1.3558954333441805e-05 2 2 9.599805930045607 0.0458263942087133 558 | 62 1.799674200915628e-05 32 33 6.486468850327001 0.6163404815421282 559 | 2 6.680555541637731e-07 2 0 1.2207346434255655 0 560 | 174 5.067448024437914e-05 80 61 6.841101755500171 0.9923054583962502 561 | 67 1.981522744036356e-05 33 40 4.869174993222792 0.8088013098919357 562 | 6 1.7908384287661184e-06 4 3 8.787487596617972 0.05893357111083514 563 | 3 1.3123876268094545e-06 2 1 3.8763729897620465 0.05868578422033822 564 | 244 7.116366593208653e-05 105 104 4.573338194950237 1.555921563667592 565 | 492 0.0001427818134277009 165 192 7.3300609603719415 2.236413530143891 566 | 124 3.638206211767661e-05 61 57 3.2422938841324807 1.1346650037146144 567 | 8 2.395600479718996e-06 5 5 9.552377127988672 0.11698206640916084 568 | 2 4.190939189472361e-05 2 1 10.43184439272886 0.02583319537571619 569 | 745 0.00021693262509441666 162 201 10.532932985517132 2.176066564163594 570 | 1783 0.0005174678033738437 392 320 8.258908570101672 3.2770996809703647 571 | 73 2.1166831748623867e-05 43 46 4.316129234261433 0.8195175838749877 572 | 197 5.71263696192107e-05 98 97 3.964104882574507 0.9817060804317237 573 | 230 6.69654880255517e-05 133 80 8.321240288879135 1.4789556528951784 574 | 51 1.4848583750816309e-05 36 26 10.579526875530224 0.5774396456383949 575 | 91 2.7070941931937108e-05 58 39 6.549880507902123 0.5579730407364331 576 | 1247 0.00036226273896776325 302 337 6.7961096871397775 3.434762212461893 577 | 52 1.52356128935476e-05 39 26 1.7356470032946167 0.6157639121124556 578 | 4 2.464556595461519e-06 2 0 7.464662979872486 0 579 | 21 8.084326454341457e-06 16 9 8.941556495943075 0.26457654695625 580 | 174 5.0702772480050355e-05 91 63 1.5381112684449767 1.182982596331313 581 | 137 3.977385226307413e-05 61 80 3.6096786868190396 0.7255880528877562 582 | 71 2.0827674170691883e-05 40 33 6.5724893566760265 0.805211560364429 583 | 182 5.2768689031506094e-05 84 78 2.8745478483146134 1.1478327840140472 584 | 1017 0.00029525321664879763 140 287 9.188303982479379 2.201846209618636 585 | 6 2.0291947005551202e-06 5 4 8.999630053083097 0.11471792609476603 586 | 70 2.035374222450701e-05 26 30 5.905123481615 0.5737234247479331 587 | 454 0.00013166366024975495 85 122 7.543826787866907 1.3405948216465715 588 | 76 2.2117082011886186e-05 37 24 2.95182254976319 0.6680627432490096 589 | 184 5.350960482866021e-05 69 91 10.447113635637127 1.3831116226019793 590 | 8 2.8000116200482234e-06 6 5 10.01775630887889 0.18109731561022296 591 | 178 5.1866207330210286e-05 76 91 6.547672361646209 1.5106233302362833 592 | 21 6.25809454729242e-06 10 10 8.998442332904421 0.1053833528740223 593 | 62 1.8333979458254562e-05 34 29 13.957078941086928 0.6508897873107181 594 | 11 4.8591702039217036e-06 7 4 11.746911955772708 0.10663190624106215 595 | 18 5.713199299561766e-06 15 17 1.1497445574344478 0.3637931347076152 596 | 15 5.182367512774536e-06 9 9 5.914447873558177 0.28103160949764705 597 | 47 1.3693956390867471e-05 21 28 5.232515466240075 0.5249279563697654 598 | 2 1.2701057744088928e-06 2 2 4.573194809989908 0.01570986081693927 599 | 29 8.503506083525542e-06 21 16 7.826184369439859 0.4303325940416321 600 | 66 1.918253343137739e-05 35 33 5.645620117350178 0.611344404542531 601 | 58 1.7160403020941017e-05 41 38 8.249833638812502 0.6867479516504313 602 | 1 0.0003359086328518643 1 1 11.17398602612965 0.0019504210152227772 603 | 53 1.5470698934068844e-05 28 21 6.960581767087628 0.1911900747324427 604 | 119 3.471012958253923e-05 65 71 7.8910818476206925 1.2863279386091988 605 | 53 1.540966895669854e-05 32 27 6.900574621443286 0.6221651943066858 606 | 379 0.00011050393878023478 151 141 9.033110071176443 1.8586583809989292 607 | 65 1.8920214619271433e-05 41 32 8.63207478053263 0.7498432501629777 608 | 224 6.521913191588479e-05 76 94 3.571846798141831 1.3267473405143846 609 | 30 8.827377109669567e-06 15 21 -1.0949980820504734 0.20307516140999218 610 | 25 7.29781690186071e-06 19 18 7.40289683864991 0.4149799008800383 611 | 117 3.414704476298157e-05 53 75 8.285202881588603 1.0406506339365023 612 | 1091 0.00031646285607872646 277 289 9.780416759141392 3.0226295514856596 613 | 13 4.139524916278109e-06 6 7 3.9640986592208063 0.048109351921014024 614 | 86 2.4951967462634428e-05 40 57 12.365812471040883 0.7911515886246419 615 | 2 1.4574624994898881e-06 1 2 6.281021177390244 0.004178059095249025 616 | 62 1.834313556967714e-05 37 24 8.001497367487358 0.3423080719201206 617 | 44 1.279391102881654e-05 25 26 9.692923491573284 0.3367014864492951 618 | 132 3.8494087949659065e-05 68 62 8.844770929815057 1.0852708560022295 619 | 1 0.00029850746268656717 1 1 8.109408144234049 0.0028496835334778587 620 | 4 1.5128959248635367e-06 4 1 9.34425073017682 0.00014404303878078558 621 | 1 0.0002901073397156948 1 1 12.480607391144837 0.0017202088760133092 622 | 2 0.0005717552887364208 2 1 11.057663797975538 0.007290810480520451 623 | 18 5.639097744360902e-06 11 10 8.58033105198951 0.27326671623090226 624 | 284 8.264804330757469e-05 114 118 4.626153743338708 1.698827199410676 625 | 196 5.69343578806592e-05 52 81 9.317655336441321 1.0151327592722816 626 | 6 1.9103321908485536e-06 5 4 9.518615296000034 0.18339158828282606 627 | 5 1.5041108854610535e-06 4 3 5.238728926070323 0.045817969303845275 628 | 7 2.601902585510599e-06 3 6 7.653162406092276 0.017435094367164082 629 | 1 0.0002748007694421544 1 1 9.248658016746353 0.006348971446245847 630 | 7 2.2302944721514282e-06 6 1 11.853554279049213 0.15679291713665552 631 | 1 0.0002718129926610492 1 1 10.660993529323424 0.012145439803429246 632 | 1 0.0002718129926610492 1 1 13.02881174015953 0.006584439989242076 633 | 123 3.6151972310528396e-05 59 56 10.975451798727134 1.0726348741232175 634 | 47 1.492579024120077e-05 23 33 1.9707327431814248 0.4936050463015298 635 | 6 2.853010068272531e-06 6 5 7.0970666287286805 0.13871141186569647 636 | 699 0.0002030646788600373 201 218 5.641864080369379 2.494134743070469 637 | 313 9.085238403331757e-05 116 141 10.323572864371046 1.7837576904445824 638 | 23 7.441559492032678e-06 10 14 6.3029053239914425 0.421750605697907 639 | 1278 0.0003708969524633739 368 306 5.942952790279388 3.5058965362303938 640 | 260 7.572308997970913e-05 132 90 5.958047321705339 1.489725106654095 641 | -------------------------------------------------------------------------------- /pyunit_newword/words.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/2/28 10:58 4 | # @Author: Jtyoui@qq.com 5 | from sklearn.linear_model import LinearRegression 6 | import numpy as np 7 | from tqdm import tqdm 8 | import re 9 | import math 10 | import os 11 | 12 | 13 | class NewWords: 14 | def __init__(self, max_split=5, accuracy=0.1, filter_cond=None, filter_free=None): 15 | """初始化 16 | 17 | 当filter_cond=filter_free=None时,会启动预加载模型\n 18 | 0 < accuracy 越小识别的词语越精确,但是生成词语越少 19 | 20 | :param max_split: 最大候选词长度,限制长度为 n-gram 21 | :param accuracy: 自动寻找候选词和模型的精准度之差,默认是设置:0.1 22 | :param filter_cond: 过滤凝聚度,默认None为自动寻找 23 | :param filter_free: 过滤自由度,默认None为自动寻找 24 | """ 25 | self.vocab = {} 26 | self.max_split = max_split 27 | self.accuracy = accuracy 28 | self.all_words_len = 0 29 | self.cond = filter_cond 30 | self.free = filter_free 31 | self.auto = None 32 | if not (filter_free and filter_free): 33 | txt = os.path.dirname(__file__) + os.sep + 'auto.txt' 34 | d = [data.strip().split('\t') for data in open(txt, 'r', encoding='utf-8').readlines()] 35 | data = np.array(d, dtype=np.float32) 36 | x = data[:, :-1] 37 | y = data[:, -1:] 38 | self.auto = LinearRegression() 39 | self.auto.fit(x, y) 40 | 41 | def add_text(self, file, encoding='UTF-8'): 42 | """读取文本数据内容 43 | 44 | 统计:[关键字次数,关键字频率,关键字的左邻,关键字的右邻] 45 | 46 | :param file: 文件文本路径 47 | :param encoding: 文本格式 48 | """ 49 | with open(file=file, mode='r', encoding=encoding) as line: 50 | for word in tqdm(line.readlines(), desc='读取数据进度条'): 51 | words = word.strip() 52 | for lines in re.split('[^\u4e00-\u9fa50-9a-zA-Z]', words): 53 | match = re.findall(r'[\u4e00-\u9fa50-9]', lines) 54 | lens = len(match) 55 | self.all_words_len += lens 56 | for i in range(lens): 57 | for j in range(1, self.max_split + 1): 58 | if i + j <= lens: 59 | k = ''.join(match[i:i + j]) 60 | if k in self.vocab: 61 | w = self.vocab[k] 62 | else: 63 | w = [0, 0, set(), set()] 64 | self.vocab[k] = w 65 | w[0] += 1 66 | w[1] = w[0] / self.all_words_len 67 | if i != 0: 68 | w[2].add(match[i - 1]) 69 | if i + j != lens: 70 | w[3].add(match[i + j]) 71 | else: # 候选词的个数大于该句子的长度时立即停止 72 | break 73 | 74 | def analysis_data(self): 75 | """分析文本数据 76 | 77 | 分析:关键词每个片段凝固程度:solid\n 78 | 关键字的左邻自由程度:front_all\n 79 | 关键字的右邻自由程度:end_all 80 | """ 81 | for key in tqdm(self.vocab, desc='分析数据进度条'): 82 | key_len = len(key) 83 | if key_len != 1: 84 | attribute: list = self.vocab[key] 85 | solid, end_all, front_all = [], 0, 0 86 | for index in range(1, key_len): 87 | score = attribute[1] / (self.vocab[key[:index]][1] * self.vocab[key[index:]][1]) 88 | solid.append(math.log2(score)) 89 | for front in attribute[2]: 90 | front_all -= math.log2(self.vocab[front][1]) * self.vocab[front][1] # 左邻字集合自由程度 91 | for end in attribute[3]: 92 | end_all -= math.log2(self.vocab[end][1]) * self.vocab[end][1] # 右邻字集合自由程度 93 | attribute.append(min(solid)) 94 | attribute.append(min(end_all, front_all)) 95 | 96 | def _filter_algorithm(self, x): 97 | """自动筛选算法 98 | 99 | 自动寻找筛选过滤参数值\n 100 | attribute:[出现次数,出现频率,关键字的左邻,关键字的右邻,凝固程度,自由程度] 101 | 102 | ::param x: x为候选词属性 103 | """ 104 | if len(x[0]) == 1: 105 | return False 106 | attribute: list = x[1] 107 | if attribute[4] <= 0.1: 108 | return False 109 | elif len(attribute[2]) == len(attribute[3]) == 0 and attribute[0] > 2: 110 | return True 111 | elif attribute[0] > 100 and len(attribute[2]) >= attribute[0] * 0.1 and len(attribute[3]) >= attribute[0] * 0.1: 112 | return True 113 | if not (self.free and self.cond): 114 | ls = [attribute[0], attribute[1], len(attribute[2]), len(attribute[3]), attribute[4]] 115 | predict = self.auto.predict([ls])[0][0] 116 | if 0 < attribute[5] - predict <= self.accuracy and predict > 0: 117 | return True 118 | elif attribute[4] >= self.cond and attribute[5] >= self.free: 119 | return True 120 | return False 121 | 122 | def get_words(self): 123 | """新词筛选""" 124 | clean_text = filter(self._filter_algorithm, self.vocab.items()) 125 | return clean_text 126 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==1.2.0 2 | numpy==1.22.0 3 | scikit-learn==0.22.1 4 | scipy==1.4.1 5 | sklearn==0.0 6 | tqdm==4.42.1 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/5/9 11:50 4 | # @Author: Jtyoui@qq.com 5 | from setuptools import setup, find_packages 6 | from pyunit_newword import __version__, __author__, __description__, __email__, __names__, __url__ 7 | 8 | with open('README.md', encoding='utf-8') as f: 9 | long_text = f.read() 10 | 11 | with open('requirements.txt', encoding='utf-8') as f: 12 | install_requires = f.read().strip().splitlines() 13 | 14 | setup( 15 | name=__names__.lower(), 16 | version=__version__, 17 | description=__description__, 18 | long_description=long_text, 19 | long_description_content_type="text/markdown", 20 | url=__url__, 21 | author=__author__, 22 | author_email=__email__, 23 | license='MIT Licence', 24 | packages=find_packages(), 25 | platforms='any', 26 | package_data={'': ['*']}, 27 | install_requires=install_requires, 28 | classifiers=[ 29 | "Programming Language :: Python :: 3", 30 | "License :: OSI Approved :: MIT License", 31 | "Operating System :: OS Independent", 32 | ], 33 | zip_safe=True, 34 | ) 35 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3.7 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2018/2/28 10:58 4 | # @Author: Jtyoui@qq.com 5 | from pyunit_newword import NewWords 6 | 7 | 8 | def test(): 9 | """测试""" 10 | nw = NewWords(accuracy=0.01) 11 | nw.add_text(r'C:\Users\Administrator\Desktop\西游记.txt') 12 | nw.analysis_data() 13 | for word in nw.get_words(): 14 | print(word[0]) 15 | 16 | 17 | if __name__ == '__main__': 18 | test() 19 | --------------------------------------------------------------------------------