├── .gitignore ├── Makefile ├── README.md ├── docs ├── .gitignore ├── Makefile ├── advanced.rst ├── basic.rst ├── conf.py ├── getting_started.rst ├── index.rst ├── make.bat └── tech_report.rst ├── isan.py ├── isan.sh └── isan ├── Makefile ├── README.md ├── __init__.py ├── annotation ├── __init__.py └── seg │ ├── __init__.py │ ├── anno.py │ ├── http_server.py │ └── sample.html ├── common ├── Chinese.py ├── __init__.py ├── common.hpp ├── decoder.hpp ├── decoder.py ├── feature_dict.cc ├── first_order_linear │ ├── decoder.h │ └── first_order_linear.cc ├── general_types.hpp ├── parameters.py ├── perceptrons.py ├── python_interface.cc ├── searcher.hpp ├── smart_string.hpp ├── task.py ├── updater.py └── weights.py ├── data ├── __init__.py └── lattice.py ├── parsing ├── __init__.py ├── char_dep.py ├── codec.py ├── default_dep.py ├── default_dep2.py ├── dep_codec.py ├── dep_unlabeled_eval.py ├── eval.py ├── lat_dep.py ├── lat_tag.py ├── lattice_dep.py ├── ldep_eval.py ├── make_cython.sh ├── seq_dep.py └── setup.py ├── sentence ├── README.md └── __init__.py ├── tagging ├── .exrc ├── PA_segger.py ├── __init__.py ├── cb_cws.py ├── cb_subsymbolic.py ├── cb_symbolic.py ├── cws.py ├── eval.py ├── ss.py ├── tagging_dag.py ├── wb_tag.py └── wb_tag_symbolic.py └── utls ├── Makefile ├── __init__.py ├── average.py ├── cdat2 ├── Makefile ├── cdat.cc ├── dat.h ├── dat_builder.cc └── test2.dat ├── count.py ├── dat.cc ├── dat.hpp ├── divde.py ├── divide.py ├── draw.py ├── indexer.py ├── pydat.py ├── segconv.py ├── shuffle_lines.py ├── times.py └── to_full.py /.gitignore: -------------------------------------------------------------------------------- 1 | test 2 | build/ 3 | /data* 4 | test.* 5 | test* 6 | test_ctb.zsh 7 | *.swp 8 | *.swo 9 | *.raw 10 | *.json 11 | scripts/test 12 | *.so 13 | *.bin 14 | datasets/* 15 | __pycache__ 16 | README.html 17 | *tags 18 | *.tmp 19 | *.pyc 20 | 21 | doc/html* 22 | doc/latex* 23 | doc/build* 24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | make -C isan 3 | 4 | 5 | test_cws: 6 | ./cws.sh model.bin --train ~/data/seg/ctb5.test.seg --dev ~/data/seg/ctb5.test.seg 7 | 8 | test_dag: 9 | ./tag_path.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=5 10 | #./tag_path.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 11 | 12 | test_dep: 13 | ./parsing.sh model.bin --train test/ctb5.test.txt --dev test/ctb5.test.txt 14 | 15 | test_dep2: 16 | #./dep2.sh model.bin --train test/train1000.dlat --dev test/test.dlat 17 | ./dep2.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 18 | 19 | test_lat_dep: 20 | ./lat_dep.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=20 21 | #./lat_dep.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 22 | 23 | test_seq_dep: 24 | ./seq_dep.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=20 25 | #./seq_dep.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 26 | 27 | test_lat_tag: 28 | #./lat_tag.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=5 29 | ./lat_tag.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=5 30 | 31 | basic_test: 32 | ./cws.sh model.bin --train ~/data/seg/ctb5.test.seg --dev ~/data/seg/ctb5.test.seg --iteration=1 33 | ./parsing.sh model.bin --train test/ctb5.test.txt --dev test/ctb5.test.txt 
--iteration=1 34 | 35 | test_msr: 36 | ./cws.sh model.bin --train ~/data/seg/msr.training.seg --dev ~/data/seg/msr.test.seg --iteration=30 37 | 38 | test_ctb5_parsing: 39 | ./parsing.sh model.bin --train test/ctb5.training.txt --dev test/ctb5.test.txt --iteration=30 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | isan 2 | ==== 3 | 4 | > “举一隅不以三隅反,则不复也” ——《论语·述而》 5 | 6 | 一个数据驱动的中文处理 **实验环境** ,可进行 **中文分词** , **词性标注** 和 **依存句法分析** 等任务。 7 | 8 | 文档: [https://isan.readthedocs.org/en/latest/](https://isan.readthedocs.org/en/latest/) 9 | 10 | 作者: [张开旭](http://weibo.com/zhangkaixu)。 11 | 12 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/isan.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/isan.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/isan" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/isan" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 
149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/advanced.rst: -------------------------------------------------------------------------------- 1 | 进阶 2 | ================= 3 | 4 | 5 | 6 | 如何修改已有算法 7 | ---------------------------------- 8 | 9 | 10 | 11 | 如何写新工具处理新任务 12 | ---------------------------------------- 13 | -------------------------------------------------------------------------------- /docs/basic.rst: -------------------------------------------------------------------------------- 1 | 基本操作 2 | ================ 3 | 4 | 模型的训练、测试和使用 5 | --------------------------------------- 6 | 7 | 命令行及参数 8 | ++++++++++++++++++++++++ 9 | 10 | 主要命令均通过调用 ``./isan.py`` 完成。 11 | 12 | 许多已实现的模型有一些固定的参数,可以使用 ``./isan.sh`` 更方便地调用, 基本操作使用后者即可。 13 | 14 | .. code-block:: bash 15 | 16 | ./isan.sh model-name [model-file] [ other args ] 17 | 18 | 其中 ``model-name`` 是模型名字, 如 ``seg`` 是一个基于字标注的模型,可用于进行分词或者分词词性标注, ``cws`` 是一个基于词的分词模型, ``dep`` 是一个依存句法分析模型。 19 | 20 | ``model-file`` 是模型参数文件。 如果是训练任务,可为空,表示训练之后不保存模型参数。 21 | 22 | 本小节将涉及的其它参数有: 23 | 24 | * ``--train training-data`` 使用指定的训练集文件训练模型 25 | * ``--test test-data`` 训练完后使用测试集测试模型效果 26 | * ``--dev dev-data`` 每次训练迭代后使用开发集评价模型效果 27 | * ``--iteration iter`` 指定训练迭代次数 28 | 29 | 主要使用场合: 30 | 31 | * **训练模型** : 指定了 ``--train`` 参数,则训练一个新模型保存在 ``model-file`` , 可同时再使用 ``--test`` ``--dev`` 等参数 32 | * **测试模型** : 不指定 ``--train`` 参数, 但指定 ``--test`` 参数 33 | * **使用模型** : 不指定 ``--train`` 参数, 也不指定 ``--test`` 参数, 则从标准输入流中读入输入,将结果输出到标准输出流。 34 | 35 | 实例 36 | ++++++++++++++++++++++++ 37 | 38 | 可以用中文分词任务试试isan如何工作。下载一个可供实验用的SIGHAN05中文分词语料库:: 39 | 40 | wget http://www.sighan.org/bakeoff2005/data/icwb2-data.rar 41 | sudo apt-get install unrar 42 | mkdir sighan05; unrar e icwb2-data.rar sighan05 43 | ln -s sighan05/msr_training.utf8 train.seg 44 | ln -s sighan05/msr_test_gold.utf8 test.seg 45 | 46 | 47 | 试着训练和测试:: 48 | 49 | ./isan.sh seg model.gz --train test.seg 50 | ./isan.sh seg model.gz --test test.seg 51 | 52 | 接下来就可以试着真枪实弹地来一次,在MSR的训练集上迭代30次训练模型,每次迭代都将测试集作为开发集检查一下模型性能:: 53 | 54 | ./isan.sh seg model.gz --train train.seg \ 55 | --dev test.seg --iteration 30 56 | 57 | 需要一些耐心等待程序结束。 58 | 59 | 会得到类似这样的结果:: 60 | 61 | 标准: 8008 输出: 8057 seg正确: 7811 正确: 7811 seg_f1: 0.9724 tag_f1: 0.9724 ol: 11 时间: 0.2762 (49733字/秒) 62 | 63 | 可以看到分词F值为0.9724。 64 | 65 | 还可以使用 ``./isan/tagging/eval.py`` 这个工具, 直接比较两个分词结果:: 66 | 67 | sed 's/\ //g' test.seg | ./isan.sh seg ctb.seg.gz > result.seg 68 | ./isan/tagging/eval.py test.seg result.seg 69 | 70 | 71 | 已实现的模型 72 | -------------------------------- 73 | 74 | .. _trained_model_parameter_list: 75 | 76 | 已训练模型列表 77 | ++++++++++++++++++++++++++++++++ 78 | 79 | 中文分词 使用 ``wget http://t.cn/zQxy95O -O ctb.seg.gz`` 获取,使用 ``./isan.sh seg ctb.seg.gz`` 启动 80 | 81 | 中文分词词性标注 使用 ``wget http://t.cn/zQxg4lX -O ctb.tag.gz`` 获取, 使用 ``./isan.sh seg ctb.tag.gz`` 启动 82 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # isan documentation build configuration file, created by 4 | # sphinx-quickstart on Sat May 4 16:12:46 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir.
7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'isan' 44 | copyright = u'2013, ZHANG, Kaixu' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | #html_theme = 'default' 95 | html_theme = 'haiku' 96 | html_theme = 'pyramid' 97 | html_theme = 'nature' 98 | 99 | 100 | 101 | # Theme options are theme-specific and customize the look and feel of a theme 102 | # further. 
For a list of options available for each theme, see the 103 | # documentation. 104 | #html_theme_options = {} 105 | 106 | # Add any paths that contain custom themes here, relative to this directory. 107 | #html_theme_path = [] 108 | 109 | # The name for this set of Sphinx documents. If None, it defaults to 110 | # " v documentation". 111 | #html_title = None 112 | 113 | # A shorter title for the navigation bar. Default is the same as html_title. 114 | #html_short_title = None 115 | 116 | # The name of an image file (relative to this directory) to place at the top 117 | # of the sidebar. 118 | #html_logo = None 119 | 120 | # The name of an image file (within the static path) to use as favicon of the 121 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 122 | # pixels large. 123 | #html_favicon = None 124 | 125 | # Add any paths that contain custom static files (such as style sheets) here, 126 | # relative to this directory. They are copied after the builtin static files, 127 | # so a file named "default.css" will overwrite the builtin "default.css". 128 | html_static_path = ['_static'] 129 | 130 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 131 | # using the given strftime format. 132 | #html_last_updated_fmt = '%b %d, %Y' 133 | 134 | # If true, SmartyPants will be used to convert quotes and dashes to 135 | # typographically correct entities. 136 | #html_use_smartypants = True 137 | 138 | # Custom sidebar templates, maps document names to template names. 139 | #html_sidebars = {} 140 | 141 | # Additional templates that should be rendered to pages, maps page names to 142 | # template names. 143 | #html_additional_pages = {} 144 | 145 | # If false, no module index is generated. 146 | #html_domain_indices = True 147 | 148 | # If false, no index is generated. 149 | #html_use_index = True 150 | 151 | # If true, the index is split into individual pages for each letter. 152 | #html_split_index = False 153 | 154 | # If true, links to the reST sources are added to the pages. 155 | #html_show_sourcelink = True 156 | 157 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 158 | #html_show_sphinx = True 159 | 160 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 161 | #html_show_copyright = True 162 | 163 | # If true, an OpenSearch description file will be output, and all pages will 164 | # contain a tag referring to it. The value of this option must be the 165 | # base URL from which the finished HTML is served. 166 | #html_use_opensearch = '' 167 | 168 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 169 | #html_file_suffix = None 170 | 171 | # Output file base name for HTML help builder. 172 | htmlhelp_basename = 'isandoc' 173 | 174 | 175 | # -- Options for LaTeX output -------------------------------------------------- 176 | 177 | latex_elements = { 178 | # The paper size ('letterpaper' or 'a4paper'). 179 | #'papersize': 'letterpaper', 180 | 181 | # The font size ('10pt', '11pt' or '12pt'). 182 | #'pointsize': '10pt', 183 | 184 | # Additional stuff for the LaTeX preamble. 185 | #'preamble': '', 186 | } 187 | 188 | # Grouping the document tree into LaTeX files. List of tuples 189 | # (source start file, target name, title, author, documentclass [howto/manual]). 
190 | latex_documents = [ 191 | ('index', 'isan.tex', u'isan Documentation', 192 | u'ZHANG, Kaixu', 'manual'), 193 | ] 194 | 195 | # The name of an image file (relative to this directory) to place at the top of 196 | # the title page. 197 | #latex_logo = None 198 | 199 | # For "manual" documents, if this is true, then toplevel headings are parts, 200 | # not chapters. 201 | #latex_use_parts = False 202 | 203 | # If true, show page references after internal links. 204 | #latex_show_pagerefs = False 205 | 206 | # If true, show URL addresses after external links. 207 | #latex_show_urls = False 208 | 209 | # Documents to append as an appendix to all manuals. 210 | #latex_appendices = [] 211 | 212 | # If false, no module index is generated. 213 | #latex_domain_indices = True 214 | 215 | 216 | # -- Options for manual page output -------------------------------------------- 217 | 218 | # One entry per manual page. List of tuples 219 | # (source start file, name, description, authors, manual section). 220 | man_pages = [ 221 | ('index', 'isan', u'isan Documentation', 222 | [u'ZHANG, Kaixu'], 1) 223 | ] 224 | 225 | # If true, show URL addresses after external links. 226 | #man_show_urls = False 227 | 228 | 229 | # -- Options for Texinfo output ------------------------------------------------ 230 | 231 | # Grouping the document tree into Texinfo files. List of tuples 232 | # (source start file, target name, title, author, 233 | # dir menu entry, description, category) 234 | texinfo_documents = [ 235 | ('index', 'isan', u'isan Documentation', 236 | u'ZHANG, Kaixu', 'isan', 'One line description of project.', 237 | 'Miscellaneous'), 238 | ] 239 | 240 | # Documents to append as an appendix to all manuals. 241 | #texinfo_appendices = [] 242 | 243 | # If false, no module index is generated. 244 | #texinfo_domain_indices = True 245 | 246 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 247 | #texinfo_show_urls = 'footnote' 248 | -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | 上手 2 | ============= 3 | 4 | 在此以Ubuntu操作系统为例,介绍如何安装和使用isan的基本功能。 5 | 6 | 下载与编译 7 | ---------------------- 8 | 9 | 首先,需要安装必要的软件包,在命令行下安装 10 | 11 | .. code-block:: bash 12 | 13 | sudo apt-get install gcc make python3 python3-dev git python3-numpy 14 | 15 | .. note:: 16 | 17 | 本工具包使用的是python3,与最常用的python版本python2不完全兼容。 18 | 19 | 为了提高速度,解码核心算法使用c++编写,因此还需要gcc进行编译。 20 | 21 | 22 | 23 | 然后选好路径,下载isan源代码,编译:: 24 | 25 | git clone https://github.com/zhangkaixu/isan.git 26 | cd isan 27 | make 28 | 29 | 编译正确后,就可以使用了。 30 | 31 | 32 | 使用训练好的模型 33 | ---------------------- 34 | 35 | 以中文分词为例, 下载一个训练好的模型文件:: 36 | 37 | wget http://t.cn/zQxy95O -O ctb.seg.gz 38 | 39 | .. seealso:: 40 | 41 | 在这里有一份已经训练好的模型参数的列表 :ref:`trained_model_parameter_list` 42 | 43 | 这是一个在中文树库5上训练的分词模型参数文件,试试分词:: 44 | 45 | echo '厦门大学' | ./isan.sh seg ctb.seg.gz 46 | 47 | 其中 ``isan.sh`` 是用来启动isan及其常用任务的脚本。 用 ``seg`` 来指明一个基于字标注的模型。 ``ctb.seg.gz`` 是刚才下载的对应的参数文件。 运行后将会得到这样的输出:: 48 | 49 | 厦门 大学 50 | 51 | 程序从标准输入流读入输入数据,将结果输出到标准输出流。一般地,可以这样执行:: 52 | 53 | ./isan.sh seg ctb.seg.gz < input_file > output_file 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. isan documentation master file, created by 2 | sphinx-quickstart on Sat May 4 16:12:46 2013. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | Isan 一三 8 | ================================ 9 | 10 | .. sidebar:: 舉一隅不以三隅反,則不復也 11 | 12 | ——《論語》 13 | 14 | 15 | 一三(isan)是一个基于统计的开源中文自然语言处理实验环境, 可进行 **中文分词** 、 **词性标注** 、 **句法分析** 等任务。 所有任务均使用结构感知器(structured perceptron)这一统一的框架进行参数学习。 16 | 17 | 源码 ``_ 18 | 19 | 文档目录: 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | 24 | 上手 —— 把isan当作现成的中文自然语言处理工具 25 | 基本功能 —— 希望根据自己的语料库训练模型并使用 26 | 技术简介 —— 了解isan所使用的技术 27 | 进阶 —— 基于isan的框架编写新模型,完成新任务 28 | 29 | 30 | 31 | 32 | .. Indices and tables 33 | ================== 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 
83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\isan.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\isan.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/tech_report.rst: -------------------------------------------------------------------------------- 1 | 理论介绍 2 | ================== 3 | 4 | .. 
warning:: 5 | 6 | Chrome系列的浏览器可能无法直接正确显示本页公式。 这是因为用来显示公式用的js没有正确加载。 可点击地址栏右边盾牌状的图标以加载用来显示公式的js。 7 | 8 | 一个工具,可以抽象为一个函数 :math:`z=\text{Function}(x)` , 根据输入产生输出, 能够进行中文分词、词性标注、句法分析等任务的isan也不例外。 9 | 10 | 结构分类问题 11 | +++++++++++++++++++++++++++ 12 | 13 | Isan处理的是一类叫做结构分类的问题。 所谓“分类”, 就是说输出是离散的量, 所谓“结构”,就是说输出不是一个量,而是一组具有内部关联结构的量。 例如,一个函数的输出可以是一个词的序列如“厦门 的 鼓浪屿”,可被看作有三个离散量线性连接而成的结构。 语言中的句法树, 是另一种层次性的结构。 14 | 15 | 16 | 通常一个输入可以有多个候选的输出, 需要建立一个标准选择最好的那个, 因此定义一个评价函数 :math:`f(\mathbf{x};\mathbf{y})` 给所有可能的输入输出对打分。 那么根据输入产生输出的过程就可以在数学上抽象为 17 | 18 | .. math:: 19 | :label: argmax_z 20 | 21 | \mathbf{z}=\arg\max_{\mathbf{z}}{f(\mathbf{x};\mathbf{z})} 22 | 23 | 设计工具的问题就变成了如何找到合适的 :math:`f()` 使得对于给定的输入能得到期望得到的输出。 有一种方法论(统计机器学习)的来法是这样的: 24 | 25 | 1. 为了描述所谓“期望的输出”, 我们直接构建一个数据集 :math:`\{(\mathbf{x}_i,\mathbf{y}_i)\}` ,其中 :math:`\mathbf{x}_i` 是输入样本, :math:`\mathbf{y}_i` 是其“期望”的输出。 26 | 2. :math:`f()` 不能漫无目的地寻找, 最好是人给出一个恰当的范围,然后让计算机在这个范围内参考上面的数据集找到一个最佳的函数。不妨将 :math:`f()` 写为 :math:`f(\mathbf{x},\mathbf{w};\mathbf{y})` , 其中 :math:`\mathbf{w}` 被称为模型的参数, 其不同的取值会得到不同的评价函数。 然后根据数据集自动地确定参数最佳的取值。 27 | 28 | 以上的路线图中, 就有一大一小两个搜索问题: 小的搜索问题是根据输入搜索最佳的输出; 大的搜索问题是根据已有的输入输出对组成的数据集, 搜索最佳的评价函数参数, 使得小的搜索问题能最好地完成。 29 | 30 | 继续,为了搜索最佳的参数, 同样需要评价参数的好坏, 因此再引入损失函数, 刻画在一定的参数下, 对数据集进行处理产生的损失 31 | 32 | .. math:: 33 | 34 | \text{loss}(\mathbf{w})=\sum_{i}{f(\mathbf{x}_i,\mathbf{w};\mathbf{z}_i)-f(\mathbf{x}_i,\mathbf{w};\mathbf{y}_i)} 35 | 36 | .. note:: 37 | 38 | 还可以设计其它的损失函数。 39 | 40 | 这个损失函数是非负的, 当小搜索问题的搜索结果与期望的结果相同时, 损失为0。 41 | 42 | 参数的搜索也就是以下最优化问题 43 | 44 | .. math:: 45 | :label: argmax_w 46 | 47 | \mathbf{w}^*=\arg\min_{\mathbf{w}}{\text{loss}(\mathbf{w})} 48 | 49 | 50 | 这就是整个问题的大框架。 接下来的问题就是以上的两个含有 :math:`\arg\min` 、 :math:`\arg\max` 的问题如何求解。 在大的思路上这两个问题很类似, 都是为了确定某组量的取值而设计的优化问题。 但细看却很不一样, 搜索最优输出的问题 :eq:`argmax_z` , 搜索空间是离散的, 并且是有约束的, 搜索最优参数的问题 :eq:`argmax_w` , 搜索空间一般是整个欧式空间,连续的且无约束。 下面就分别介绍这两个问题的具体处理方法。 51 | 52 | 随机梯度下降算法 53 | +++++++++++++++++++++++++++ 54 | 55 | 56 | 1. 得到一个训练样本 :math:`(\mathbf{x}_t,\mathbf{y}_t)` 57 | 2. 解码得到当前权重下的最优输出 :math:`\mathbf{z}_t=\arg\max_{\mathbf{z}}{f(\mathbf{x}_t,\mathbf{w};\mathbf{z})}` 58 | 3. 如果 :math:`\mathbf{z}_t\not=\mathbf{y}_t` 则 :math:`\mathbf{w}\leftarrow \mathbf{w}-\eta \left. \frac{\partial \text{loss}}{\partial \mathbf{w}} \right|_{\mathbf{w}}` 59 | 4. 判断是否停止,如不停止跳到步骤1。 60 | 61 | 感知器算法 62 | 63 | 平均感知器 64 | ---------------------------- 65 | 66 | 67 | Early-update 68 | ---------------------------- 69 | 70 | 71 | 解码器 72 | +++++++++++++++++++++++++++ 73 | 74 | 75 | 类隐马尔可夫解码器 76 | ----------------------------- 77 | 78 | 一阶解码器适合解决当目标函数可按以下形式分解的情况: 79 | 80 | .. math:: 81 | 82 | f(\mathbf{x};\mathbf{z})=\sum_{i}{g(\mathbf{x};z_i)}+\sum_{i}{h(z_i,z_{i+1})} 83 | 84 | 一般线性解码器 85 | ----------------------------- 86 | 87 | .. math:: 88 | 89 | f(\mathbf{x};\mathbf{z})=\sum_{i}{h(\mathbf{x};z_i,z_{i+1})} 90 | 91 | 一般二叉树解码器 92 | ----------------------------- 93 | 94 | .. math:: 95 | 96 | f(\mathbf{x};\mathbf{z})=\sum_{p}{h(\mathbf{x};z_{p},z_{l},z_{r})}+\sum_{l}{g(\mathbf{x};z_{l})} 97 | 98 | 已实现的模型 99 | +++++++++++++++++++++++++++ 100 | 101 | 基于字标注的分词词性标注 102 | ----------------------------- 103 | 104 | 105 | 基于词的中文分词 106 | ----------------------------- 107 | 108 | 109 | 基于词图的分词词性标注 110 | ----------------------------- 111 | 112 | 113 | 移进-归约依存句法分析 114 | ----------------------------- 115 | -------------------------------------------------------------------------------- /isan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | """! 
4 | @mainpage 5 | 6 | Isan 7 | ==== 8 | 一个中文处理的实验环境 9 | 10 | 11 | ls test/*.train | sed 's/^\([^\.]*\)\.train/shuffle -m 20 -d .\/\1 -p 5 seg --train \1.train --dev \1.test --iteration 15 --yaml args.yaml /g' | xargs -n 16 -P 1 ./isan.sh 12 | 13 | 14 | seq 0 9 | awk '{print "test/" $1 "/model.gz --input test/" $1 ".test"}' | xargs -d "\n" -n 1 ./isan.sh seg --threshold 20 --yaml args.yaml --output t.lat --append 15 | """ 16 | 17 | 18 | from isan import * 19 | 20 | if __name__ == '__main__': 21 | this,*argv=sys.argv 22 | """ 23 | if len(argv)==0 : 24 | exit() 25 | if argv[0]=='seg': 26 | argv[0:1]= (['--model', 'isan.common.perceptrons.Model']+ 27 | ['--decoder', 'isan.common.decoder.First_Order_Linear']+ 28 | ['--task', 'isan.tagging.cb_cws.Task']) 29 | 30 | """ 31 | 32 | isan(**get_args(argv)) 33 | 34 | -------------------------------------------------------------------------------- /isan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ $# = 0 ]; then 4 | echo “举一隅不以三隅反,则不复也” ——《论语·述而》 5 | exit 6 | fi 7 | 8 | if [ $1 = 'link' ] ; then 9 | echo 'link' 10 | src=$(dirname $0) 11 | ln -s ${src}/isan . 12 | ln -s ${src}/isan.py . 13 | ln -s ${src}/isan.sh . 14 | fi 15 | 16 | 17 | if [ $1 = 'stack' ] ; then 18 | src=$2 19 | batch=$3 # batchsize 20 | fold=$4 21 | dst=$5 22 | for tgt in `seq 0 $(expr $fold - 1 ) `; do 23 | cat $src | awk "(NR-NR%${batch})/${batch}%${fold}==${tgt} {print}" > ${dst}/${tgt}.test 24 | cat $src | awk "(NR-NR%${batch})/${batch}%${fold}!=${tgt} {print}" > ${dst}/${tgt}.train 25 | done; 26 | 27 | exit 28 | fi 29 | 30 | if [ $1 = 'shuffle' ] ; then 31 | cmd=$0 32 | shift 33 | if [ $# = 0 ]; then 34 | echo "usage:" 35 | echo " " ${cmd} shuffle -p processor-number=1 -m model-number=1 -d dir=. blabla 36 | exit 37 | fi 38 | dst='.' 
# dir 39 | nm='1' # number of models 40 | np='1' 41 | while [ `echo $1 | grep '\-'` ] ; do 42 | if [ `echo $1 | grep '\-p'` ] ; then 43 | np=$2 44 | shift;shift 45 | fi 46 | if [ `echo $1 | grep '\-d'` ] ; then 47 | dst=$2 48 | shift;shift 49 | fi 50 | if [ `echo $1 | grep '\-m'` ] ; then 51 | nm=$2 52 | shift;shift 53 | fi 54 | done 55 | echo "train [\033[34m$nm\033[0m] model(s)" "into [\033[34m$dst\033[0m]" 56 | echo "using [\033[34m$np\033[0m] processor(s)" 57 | echo "the command line is:\033[34m${cmd} $*\033[0m" 58 | 59 | #exit 60 | 61 | mkdir $dst -p 62 | echo `for i in $(seq $nm); do echo "${dst}/model.$i.gz --seed $i"; done` | xargs -n 3 -P $np ${cmd} $* 63 | ${cmd} $1 ${dst}/model.gz --append_model `for i in $(seq $nm); do echo "${dst}/model.$i.gz"; done` 64 | exit 65 | fi 66 | 67 | 68 | 69 | # 70 | # 中文分词模型 71 | # 72 | if [ $1 = 'seg' ] ; then 73 | shift 74 | ./isan.py \ 75 | --model isan.common.perceptrons.Model \ 76 | --decoder isan.common.decoder.First_Order_Linear \ 77 | --task isan.tagging.cb_cws.Task \ 78 | $@ 79 | fi 80 | 81 | if [ $1 = 'dep' ] ; then 82 | shift 83 | ./isan.py \ 84 | --model isan.common.perceptrons.Model \ 85 | --decoder isan.common.decoder.Push_Down \ 86 | --task isan.parsing.default_dep.Dep \ 87 | $@ 88 | fi 89 | 90 | if [ $1 = 'cws' ] ; then 91 | shift 92 | ./isan.py \ 93 | --model isan.common.perceptrons.Model \ 94 | --decoder isan.common.decoder.DFA \ 95 | --task isan.tagging.cws.Task \ 96 | $@ 97 | fi 98 | 99 | if [ $1 = 'tag' ] ; then 100 | shift 101 | ./isan.py \ 102 | --model isan.common.perceptrons.Model \ 103 | --decoder isan.common.decoder.DFA \ 104 | --task isan.tagging.wb_tag.Path_Finding \ 105 | $* 106 | fi 107 | 108 | # 实验性模型 109 | 110 | 111 | if [ $1 = 'lattice_dep' ] ; then 112 | shift 113 | ./isan.py \ 114 | --model isan.common.perceptrons.Model \ 115 | --decoder isan.common.decoder.Push_Down \ 116 | --task isan.parsing.lattice_dep.Dep \ 117 | $@ 118 | fi 119 | 120 | if [ $1 = 'lat_dep' ] ; then 121 | shift 122 | ./isan.py \ 123 | --model isan.common.perceptrons.Model \ 124 | --decoder isan.common.decoder.Push_Down \ 125 | --task isan.parsing.lat_dep.Dep \ 126 | $@ 127 | fi 128 | 129 | if [ $1 = 'pa_cws' ] ; then 130 | shift 131 | ./isan.py \ 132 | --model isan.common.perceptrons.Model_PA \ 133 | --decoder isan.common.decoder.DFA \ 134 | --task isan.tagging.PA_segger.Segger \ 135 | $* 136 | fi 137 | 138 | if [ $1 = 'pa_parsing' ] ; then 139 | shift 140 | ./isan.py \ 141 | --model isan.common.perceptrons.Model_PA \ 142 | --decoder isan.common.decoder.Push_Down \ 143 | --task isan.parsing.default_dep.PA_Dep \ 144 | $* 145 | fi 146 | 147 | if [ $1 = 'seg_dep' ] ; then 148 | shift 149 | ./isan.py \ 150 | --model isan.common.perceptrons.Model \ 151 | --decoder isan.common.decoder.Push_Down \ 152 | --task isan.parsing.seq_dep.Dep \ 153 | $* 154 | fi 155 | 156 | if [ $1 = 'tagpath' ] ; then 157 | shift 158 | ./isan.py \ 159 | --model isan.common.perceptrons.Model \ 160 | --decoder isan.common.decoder.DFA \ 161 | --task isan.tagging.tagging_dag.Path_Finding \ 162 | $* 163 | fi 164 | 165 | if [ $1 = 'dep' ] ; then 166 | shift 167 | ./isan.py \ 168 | --model isan.common.perceptrons.Model \ 169 | --decoder isan.common.decoder.Push_Down \ 170 | --task isan.parsing.default_dep2.Dep \ 171 | $* 172 | fi 173 | -------------------------------------------------------------------------------- /isan/Makefile: -------------------------------------------------------------------------------- 1 | all: common/pushdown.so common/dfabeam.so 
common/first_order_linear.so common/feature_dict.so 2 | 3 | gcc= g++ -I /usr/include/python3.2mu -shared -fPIC -O3 -std=c++0x -I .. -Wno-deprecated -g 4 | 5 | headers=common/*.hpp utls/*.hpp 6 | 7 | common/pushdown.so: common/python_interface.cc ${headers} 8 | ${gcc} common/python_interface.cc -o common/pushdown.so \ 9 | -D REDUCE -D __MODULE_NAME=pushdown 10 | 11 | common/dfabeam.so: common/python_interface.cc ${headers} 12 | ${gcc} common/python_interface.cc -o common/dfabeam.so \ 13 | -D __MODULE_NAME=dfabeam 14 | 15 | common/first_order_linear.so: common/first_order_linear/first_order_linear.cc \ 16 | common/first_order_linear/decoder.h 17 | ${gcc} $< -o $@ 18 | 19 | common/feature_dict.so: common/feature_dict.cc 20 | ${gcc} $< -o $@ 21 | -------------------------------------------------------------------------------- /isan/README.md: -------------------------------------------------------------------------------- 1 | * `common` 目录下是最基本的模型和解码器 2 | * `tagging` 目录下是序列类的任务 3 | * `parsing` 目录下是树类的任务 4 | -------------------------------------------------------------------------------- /isan/annotation/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | is it ok? 4 | """ 5 | -------------------------------------------------------------------------------- /isan/annotation/seg/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | is it ok? 4 | """ 5 | -------------------------------------------------------------------------------- /isan/annotation/seg/anno.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | 4 | -------------------------------------------------------------------------------- /isan/annotation/seg/http_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import http.server 3 | import cgi 4 | import urllib.parse 5 | 6 | import sys 7 | import webbrowser 8 | import multiprocessing 9 | 10 | import time 11 | import subprocess 12 | import json 13 | 14 | html=''' 15 | 21 | 22 | 103 | $(sen_id) 104 |
105 | $(sequence) 106 |
107 | 提交 108 | 继续 109 | 排除 110 | 终止 111 | 112 | ''' 113 | ''' 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | ''' 122 | 123 | class MyHttpHandler(http.server.BaseHTTPRequestHandler): 124 | def do_GET(self): 125 | path=urllib.parse.unquote(self.path) 126 | if path.endswith('.ico'): 127 | return 128 | if len(path)>1: 129 | print(path) 130 | rtn=html 131 | sen=anno() 132 | raw,seq=sen['raw'],sen['anno'] 133 | s=[] 134 | for id,c in enumerate(raw): 135 | s.append('''%s'''%(id,id,c)) 136 | if id?'''%(id+1)) 138 | rtn=rtn.replace('$(sequence)',''.join(s)) 139 | rtn=rtn.replace('$(url)',url) 140 | rtn=rtn.replace('$(sen_id)',sen['id']) 141 | self.send_response(200) 142 | self.send_header( "Content-type", "text/html" ) 143 | self.end_headers() 144 | 145 | self.wfile.write(rtn.encode('utf8')) 146 | 147 | 148 | def run(server_class=http.server.HTTPServer, handler_class=http.server.BaseHTTPRequestHandler 149 | ,addr=('', 8082)): 150 | server_address = addr 151 | httpd = server_class(server_address, handler_class) 152 | httpd.serve_forever() 153 | return httpd; 154 | 155 | 156 | class Anno: 157 | def __init__(self): 158 | self.data=[] 159 | for line in open("sample.json"): 160 | sen=json.loads(line) 161 | self.data.append(sen) 162 | self.ind=0 163 | def __call__(self,string=""): 164 | if string=='stop': 165 | 166 | return 167 | if self.ind>=len(self.data): 168 | return '' 169 | sen=self.data[self.ind] 170 | self.ind+=1 171 | return sen 172 | 173 | anno=Anno() 174 | 175 | 176 | if __name__=="__main__": 177 | 178 | 179 | 180 | lock=multiprocessing.Lock() 181 | 182 | 183 | print('server started') 184 | 185 | url="http://166.111.138.130:8082/" 186 | port=8082 187 | if len(sys.argv)>1: 188 | url="http://166.111.138.130:"+sys.argv[1]+"/" 189 | port=int(sys.argv[1]) 190 | 191 | print(url) 192 | run(handler_class=MyHttpHandler, addr=('',port)) 193 | 194 | -------------------------------------------------------------------------------- /isan/annotation/seg/sample.html: -------------------------------------------------------------------------------- 1 | 7 | 8 | 71 |
72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 |
92 | 提交 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /isan/common/Chinese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 知识点: 4 | 》unicode编码中,汉字的大体范围为:“一”-“鿋” 5 | 》半角转全角,只需要内码加上65248,全角空格为12288 6 | 7 | """ 8 | 9 | #汉字集合 10 | chinese_characters=set(chr(i) for i in range(ord('一'),ord('鿋')+1)) 11 | #阿拉伯数字集合 12 | number_characters=set(chr(x) for x in range(ord('0'),ord('9')+1)) 13 | #拉丁字母 14 | latin_characters=set(chr(x) for x in range(ord('a'),ord('z')+1)) 15 | latin_characters.update(chr(x) for x in range(ord('A'),ord('Z')+1)) 16 | 17 | #内容字符,汉字、阿拉伯数字、拉丁字母的集合 18 | content_characters=set() 19 | content_characters.update(chinese_characters) 20 | content_characters.update(number_characters) 21 | content_characters.update(latin_characters) 22 | 23 | #句末符号 24 | full_stops=set('。?!') 25 | 26 | def test(): 27 | print("测试") 28 | 29 | def to_full(text,ignore=set()): 30 | """ 31 | 半角转全角的程序 32 | 空格变成全角 33 | 大于空格的直接加上偏移量 34 | 否则不变 35 | """ 36 | 37 | return ''.join(chr(12288) if x==32 else chr(x+65248) if (x<128 and x>32 and (x not in ignore)) else chr(x) 38 | for x in map(ord,text)) 39 | 40 | def seg_sentence(text): 41 | """ 42 | 切分句子 43 | """ 44 | cache=[] 45 | sentences=[] 46 | has_non=False 47 | for c in text: 48 | cache.append(c) 49 | if c in full_stops and has_non: 50 | cache=''.join(cache) 51 | 52 | cache=cache.strip() 53 | if cache: 54 | sentences.append(cache) 55 | cache=[] 56 | has_non=False 57 | 58 | elif c in content_characters: 59 | has_non=True 60 | if cache: 61 | if not sentences:sentences.append('') 62 | sentences[-1]+=''.join(cache) 63 | return sentences 64 | 65 | def seg_by_punctuations(text): 66 | pass 67 | if __name__=="__main__": 68 | print(seg_sentence(to_full('。“hello world?!wo23。”'))) 69 | -------------------------------------------------------------------------------- /isan/common/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | def test(): 4 | print("this is a test function") 5 | -------------------------------------------------------------------------------- /isan/common/decoder.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "isan/common/common.hpp" 4 | #include "isan/common/searcher.hpp" 5 | 6 | 7 | 8 | /** 9 | * searcher data 10 | * provide information for searcher 11 | * */ 12 | 13 | namespace isan{ 14 | 15 | 16 | 17 | class General_Searcher_Data : 18 | public Searcher_Data{ 19 | public: 20 | 21 | Feature_Generator * feature_generator; 22 | State_Generator * shifted_state_generator; 23 | Reduced_State_Generator * reduced_state_generator; 24 | Early_Stop_Checker * early_stop_checker; 25 | 26 | size_t learning_step; 27 | 28 | // init this object 29 | General_Searcher_Data( 30 | Early_Stop_Checker * early_stop_checker, 31 | State_Generator *shifted_state_generator, 32 | Reduced_State_Generator *reduced_state_generator, 33 | Feature_Generator* feature_generator){ 34 | this->early_stop_checker=early_stop_checker; 35 | if(this->early_stop_checker)this->use_early_stop=true; 36 | this->feature_generator=feature_generator; 37 | this->shifted_state_generator=shifted_state_generator; 38 | 39 | this->reduced_state_generator=reduced_state_generator; 40 | //std::cout<& last_alphas, 49 | const std::vector& states 50 | ){ 51 | return (*early_stop_checker)( 52 | step, 53 | last_alphas, 54 | states); 55 | }; 56 | 57 | 
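// shift(): expand a beam state at position `ind` into successor states. The candidate actions and states come from the configured state generator (either a native C++ generator or a Python callback); cal_weights() then scores each candidate action with the current feature weights.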
inline void shift( 58 | const int& ind, 59 | State_Type& state, 60 | std::vector& next_actions, 61 | std::vector& next_inds, 62 | std::vector& next_states, 63 | std::vector& scores 64 | ){ 65 | next_inds.clear();// clear the vector first 66 | (*shifted_state_generator)(ind,state,next_actions,next_inds,next_states); 67 | 68 | cal_weights(state,next_actions,scores,learning_step); 69 | }; 70 | void reduce( 71 | const int state_ind, 72 | const State_Type& state, 73 | const std::vector& pred_alphas, 74 | std::vector& next_actions, 75 | std::vector& next_inds, 76 | std::vector& next_states, 77 | std::vector& reduce_pred_alphas, 78 | std::vector& scores 79 | ){ 80 | if(this->reduced_state_generator){ 81 | (*reduced_state_generator)( 82 | state_ind, 83 | state, 84 | pred_alphas, 85 | next_actions, 86 | next_inds, 87 | next_states, 88 | reduce_pred_alphas 89 | ); 90 | cal_weights(state,next_actions,scores,learning_step); 91 | } 92 | }; 93 | 94 | // calculate the sum of the weights according to the list of symbolic features 95 | inline void cal_weights( 96 | const STATE& state, 97 | const std::vector& next_actions, 98 | std::vector& scores, 99 | size_t step 100 | ){ 101 | scores.resize(next_actions.size()); 102 | for(int i=0;i My_Searcher; 115 | public: 116 | State_Type init_state; 117 | int beam_width; 118 | General_Searcher_Data * data; 119 | 120 | My_Searcher * push_down; 121 | 122 | State_Generator * shifted_state_generator; 123 | Reduced_State_Generator * reduced_state_generator; 124 | Feature_Generator * feature_generator; 125 | Early_Stop_Checker * early_stop_checker; 126 | 127 | Chinese* raw; 128 | 129 | Interface(int beam_width, 130 | PyObject * py_early_stop_callback, 131 | PyObject * py_shift_callback, 132 | PyObject * py_reduce_callback, 133 | PyObject * py_feature_cb 134 | ){ 135 | if(PyLong_Check(py_shift_callback)){ 136 | shifted_state_generator=(State_Generator *) PyLong_AsUnsignedLong(py_shift_callback); 137 | }else{ 138 | shifted_state_generator=new Python_State_Generator(py_shift_callback); 139 | }; 140 | 141 | reduced_state_generator=NULL; 142 | if(py_reduce_callback!=Py_None){ 143 | reduced_state_generator=new Python_Reduced_State_Generator(py_reduce_callback); 144 | }; 145 | 146 | if(PyLong_Check( py_feature_cb)){ 147 | feature_generator=(Feature_Generator*) PyLong_AsUnsignedLong( py_feature_cb); 148 | }else{ 149 | feature_generator=new Python_Feature_Generator( py_feature_cb); 150 | }; 151 | early_stop_checker=NULL; 152 | if(py_early_stop_callback!=Py_None){ 153 | early_stop_checker=new Python_Early_Stop_Checker(py_early_stop_callback); 154 | } 155 | 156 | raw=NULL; 157 | this->beam_width=beam_width; 158 | this->data=new General_Searcher_Data( 159 | early_stop_checker, 160 | shifted_state_generator, 161 | reduced_state_generator, 162 | feature_generator); 163 | this->push_down=new My_Searcher(this->data,beam_width); 164 | 165 | }; 166 | 167 | void set_raw(Chinese& raw){ 168 | if(this->raw)delete this->raw; 169 | this->raw=new Chinese(raw); 170 | this->shifted_state_generator->raw=this->raw; 171 | this->feature_generator->set_raw(this->raw); 172 | } 173 | 174 | ~Interface(){ 175 | delete this->data; 176 | delete this->push_down; 177 | delete feature_generator; 178 | delete shifted_state_generator; 179 | delete early_stop_checker; 180 | if(reduced_state_generator) 181 | delete reduced_state_generator; 182 | }; 183 | }; 184 | 185 | };//isan 186 | -------------------------------------------------------------------------------- /isan/common/decoder.py: 
-------------------------------------------------------------------------------- 1 | import isan.common.pushdown as pushdown 2 | import isan.common.dfabeam as dfabeam 3 | import isan.common.first_order_linear as first_order_linear 4 | 5 | 6 | class Searcher: 7 | def search(self): 8 | return self.searcher.search(self.handler,self.get_init_states()) 9 | def get_states(self): 10 | return self.searcher.get_states(self.handler) 11 | def __del__(self): 12 | self.searcher.delete(self.handler) 13 | 14 | def __init__(self,schema,beam_width): 15 | self.get_init_states=schema.get_init_states 16 | self.handler=self.searcher.new( 17 | beam_width, 18 | schema.early_stop if hasattr(schema,'early_stop') else None, 19 | schema.shift, 20 | schema.reduce, 21 | schema.gen_features, 22 | ) 23 | 24 | class DFA(Searcher): 25 | name='状态转移' 26 | searcher=dfabeam 27 | class Push_Down(Searcher): 28 | name='Shift-Reduce' 29 | searcher=pushdown 30 | class First_Order_Linear(Searcher): 31 | name='first order linear' 32 | searcher=first_order_linear 33 | def cal_margins(self): 34 | return self.searcher.cal_margins(self.handler) 35 | def __init__(self,schema,beam_width): 36 | self.get_init_states=schema.get_init_states 37 | self.handler=self.searcher.new( 38 | 1, 39 | schema.emission, 40 | schema.transition 41 | ) 42 | -------------------------------------------------------------------------------- /isan/common/feature_dict.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | #define __MODULE_NAME feature_dict 7 | #define __INIT_FUNC(a,b) a##b 8 | #define INIT_FUNC(a,b) __INIT_FUNC(a,b) 9 | #define PYINIT PyInit_ 10 | #define STR(x) #x 11 | 12 | struct Hash{ 13 | size_t operator()(const PyObject* key) const{ 14 | size_t size=PyUnicode_GET_SIZE(key); 15 | auto* data=PyUnicode_AS_UNICODE(key); 16 | size_t hk=0; 17 | for(int i=0;i Dict; 38 | 39 | static PyObject * 40 | module_new(PyObject *self, PyObject *arg) 41 | { 42 | Dict* dict=new Dict(); 43 | return PyLong_FromLong((long)dict); 44 | }; 45 | 46 | static PyObject * 47 | dict_size(PyObject *self, PyObject *arg){ 48 | Dict* dict=(Dict*)PyLong_AsLong(arg); 49 | return PyLong_FromLong(dict->size()); 50 | }; 51 | 52 | static PyObject * 53 | set_weights(PyObject *self, PyObject *arg){ 54 | Dict* dict; 55 | PyObject * py_dict; 56 | 57 | PyArg_ParseTuple(arg, "LO", &dict,&py_dict); 58 | 59 | PyObject *key, *value; 60 | Py_ssize_t pos = 0; 61 | 62 | size_t length; 63 | while (PyDict_Next(py_dict, &pos, &key, &value)) { 64 | Py_INCREF(key); 65 | (*dict)[key]=PyFloat_AsDouble(value); 66 | }; 67 | 68 | Py_INCREF(Py_None); 69 | return Py_None; 70 | }; 71 | 72 | static PyObject * 73 | to_dict(PyObject *self, PyObject *arg){ 74 | Dict* dict=(Dict*)PyLong_AsLong(arg); 75 | PyObject * py_dict=PyDict_New(); 76 | for(auto it=dict->begin();it!=dict->end();++it){ 77 | PyObject * key=it->first; 78 | PyObject * value=PyFloat_FromDouble(it->second); 79 | PyDict_SetItem(py_dict,key,value); 80 | Py_DECREF(value); 81 | }; 82 | return py_dict; 83 | }; 84 | 85 | static PyObject * 86 | clear(PyObject *self, PyObject *arg){ 87 | Dict* dict=(Dict*)PyLong_AsLong(arg); 88 | for(auto it=dict->begin();it!=dict->end();++it){ 89 | Py_DECREF(it->first); 90 | } 91 | dict->clear(); 92 | Py_INCREF(Py_None); 93 | return Py_None; 94 | }; 95 | static PyObject * 96 | cal_fv(PyObject *self, PyObject *arg){ 97 | Dict* dict; 98 | PyObject * py_fv; 99 | 100 | PyArg_ParseTuple(arg, "LO", &dict,&py_fv); 101 | 102 | long 
size=PySequence_Size(py_fv); 103 | 104 | double score=0; 105 | 106 | for(int i=0;ifind(key); 109 | if (got!=dict->end()){ 110 | score+=got->second; 111 | }; 112 | Py_DECREF(key); 113 | } 114 | 115 | return PyFloat_FromDouble(score); 116 | Py_INCREF(Py_None); 117 | return Py_None; 118 | }; 119 | static PyObject * 120 | get(PyObject *self, PyObject *arg){ 121 | Dict* dict; 122 | PyObject * key; 123 | 124 | PyArg_ParseTuple(arg, "LO", &dict,&key); 125 | 126 | 127 | auto got=dict->find(key); 128 | if (got!=dict->end()){ 129 | return PyFloat_FromDouble(got->second); 130 | }; 131 | 132 | return PyFloat_FromDouble(0); 133 | }; 134 | 135 | static PyObject * 136 | update_fv(PyObject *self, PyObject *arg){ 137 | Dict* dict; 138 | PyObject * py_fv; 139 | double delta; 140 | 141 | 142 | PyArg_ParseTuple(arg, "LOd", &dict,&py_fv,&delta); 143 | 144 | long size=PySequence_Size(py_fv); 145 | 146 | for(int i=0;ifind(key); 149 | if (got!=dict->end()){ 150 | got->second+=delta; 151 | }else{ 152 | Py_INCREF(key); 153 | (*dict)[key]=delta; 154 | }; 155 | Py_DECREF(key); 156 | } 157 | 158 | Py_INCREF(Py_None); 159 | return Py_None; 160 | }; 161 | 162 | static PyObject * 163 | interface_delete(PyObject *self, PyObject *arg){ 164 | Dict* dict=(Dict*)PyLong_AsLong(arg); 165 | for(auto it=dict->begin();it!=dict->end();++it){ 166 | Py_DECREF(it->first); 167 | } 168 | dict->clear(); 169 | delete dict; 170 | Py_INCREF(Py_None); 171 | return Py_None; 172 | }; 173 | 174 | /** stuffs about the module def */ 175 | static PyMethodDef interfaceMethods[] = { 176 | {"new", module_new, METH_VARARGS,""}, 177 | {"delete", interface_delete, METH_O,""}, 178 | {"size", dict_size, METH_O,""}, 179 | {"set_weights", set_weights, METH_VARARGS,""}, 180 | {"cal_fv", cal_fv, METH_VARARGS,""}, 181 | {"update_fv", update_fv, METH_VARARGS,""}, 182 | {"get", get, METH_VARARGS,""}, 183 | {"to_dict", to_dict, METH_O,""}, 184 | {"clear", clear, METH_O,""}, 185 | //{"set_raw", set_raw, METH_VARARGS,""}, 186 | {NULL, NULL, 0, NULL} /* Sentinel */ 187 | }; 188 | 189 | static struct PyModuleDef module_struct = { 190 | PyModuleDef_HEAD_INIT, 191 | STR(__MODULE_NAME), /* name of module */ 192 | NULL, /* module documentation, may be NULL */ 193 | -1, /* size of per-interpreter state of the module, 194 | or -1 if the module keeps state in global variables. 
*/ 195 | interfaceMethods 196 | }; 197 | 198 | PyMODINIT_FUNC 199 | INIT_FUNC(PYINIT,__MODULE_NAME) (void) 200 | { 201 | return PyModule_Create(&module_struct); 202 | } 203 | -------------------------------------------------------------------------------- /isan/common/first_order_linear/decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | namespace isan{ 3 | 4 | typedef double Score_Type; 5 | typedef size_t Tag_Type; 6 | 7 | //a structure for alphas and betas 8 | struct Alpha_Beta{ 9 | Score_Type value; 10 | Tag_Type tag_id; 11 | }; 12 | 13 | /** The DP algorithm(s) for path labeling */ 14 | inline Score_Type dp_decode( 15 | const size_t tagset_size, 16 | const size_t node_size, 17 | const Score_Type* transitions, 18 | const Score_Type* emissions, 19 | Alpha_Beta* alphas, 20 | Tag_Type* tags 21 | ){ 22 | 23 | Tag_Type max_tag_id; 24 | Score_Type max_value; 25 | // scores of the first item 26 | for(Tag_Type j=0;j max_value){ 36 | max_value=value; 37 | max_tag_id=k; 38 | } 39 | }; 40 | 41 | alphas[i*tagset_size+j].value=emissions[i*tagset_size+j]+max_value; 42 | alphas[i*tagset_size+j].tag_id=max_tag_id; 43 | 44 | }; 45 | }; 46 | 47 | max_tag_id=0; 48 | max_value=alphas[(node_size-1)*tagset_size].value; 49 | for(Tag_Type k=1;k max_value){ 52 | max_value=value; 53 | max_tag_id=k; 54 | } 55 | }; 56 | 57 | size_t node_id=node_size-1; 58 | size_t tag_id=max_tag_id; 59 | tags[node_id]=tag_id; 60 | while (node_id>0) { 61 | tag_id=alphas[(node_id)*tagset_size+tag_id].tag_id; 62 | node_id--; 63 | tags[node_id]=tag_id; 64 | 65 | }; 66 | return max_value; 67 | }; 68 | 69 | 70 | 71 | /** cal beta */ 72 | inline void dp_cal_beta( 73 | const size_t tagset_size, 74 | const size_t node_size, 75 | const Score_Type* transitions, 76 | const Score_Type* emissions, 77 | Alpha_Beta* betas 78 | ){ 79 | 80 | Tag_Type max_tag_id; 81 | Score_Type max_value; 82 | // scores of the first item 83 | for(Tag_Type j=0;j=0;--i){ 88 | for(Tag_Type j=0;j max_value){ 96 | max_value=value; 97 | max_tag_id=k; 98 | } 99 | }; 100 | 101 | betas[i*tagset_size+j].value=emissions[i*tagset_size+j]+max_value; 102 | betas[i*tagset_size+j].tag_id=max_tag_id; 103 | 104 | }; 105 | }; 106 | }; 107 | 108 | 109 | }//end of namespace 110 | -------------------------------------------------------------------------------- /isan/common/general_types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "isan/common/searcher.hpp" 5 | #include "isan/common/smart_string.hpp" 6 | namespace isan{ 7 | typedef long Action_Type; 8 | 9 | typedef double Score_Type; 10 | 11 | typedef Smart_Chars State_Type; 12 | 13 | typedef unsigned short Chinese_Character; 14 | typedef Smart_String Chinese; 15 | 16 | 17 | typedef Alpha Alpha_Type; 18 | typedef State_Info State_Info_Type; 19 | 20 | 21 | template 22 | inline static PyObject * 23 | pack_alpha(Alpha alpha){ 24 | PyObject * py_step=PyLong_FromLong(alpha->ind1); 25 | PyObject * py_state=alpha->state1.pack(); 26 | PyObject * py_action=PyLong_FromLong(alpha->action); 27 | PyObject * py_move=PyTuple_Pack(3,py_step,py_state,py_action); 28 | Py_DECREF( py_step); 29 | Py_DECREF( py_state); 30 | Py_DECREF( py_action); 31 | return py_move; 32 | }; 33 | 34 | };//end of isan 35 | -------------------------------------------------------------------------------- /isan/common/parameters.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | """ 3 | 4 | """ 5 | 6 | class Para_Dict (dict): 7 | def __call__(self,keys): 8 | return sum(self.get(k,0) for k in keys) 9 | 10 | class Parameters : 11 | def __init__(self,para_class): 12 | self.para_class=para_class 13 | self._list=list() 14 | self._dirty=list() 15 | 16 | def add(self,value): 17 | if type(value)== dict : 18 | p=self.para_class.d(value) 19 | p.init(self) 20 | else : 21 | p=value.view(self.para_class.ndarray) 22 | p.init(self) 23 | self._list.append(p) 24 | return p 25 | 26 | def update(self,step=0) : 27 | for p in self._dirty : 28 | p._update(step) 29 | del self._dirty[:] 30 | 31 | def final(self,step): 32 | for p in self._list : 33 | if hasattr(p,'final') : 34 | p.final(step) 35 | 36 | def un_final(self): 37 | for p in self._list : 38 | if hasattr(p,'un_final') : 39 | p.un_final() 40 | 41 | 42 | class _Base_Dict (Para_Dict): 43 | def init(self,paras): 44 | self._delta={} 45 | self._paras=paras 46 | 47 | def output_obj(self): 48 | for k,v in self.items(): 49 | if hasattr(v,'output_obj') : 50 | self[k]=v.output_obj() 51 | return Para_Dict(self) 52 | 53 | def add_delta(self,keys,delta): 54 | for f in keys : 55 | if f not in self._delta : 56 | self._delta[f]=.0 57 | self._delta[f]+=delta 58 | self._paras._dirty.append(self) 59 | 60 | def add_model(self,model): 61 | for k,v in model.items(): 62 | if k not in self : 63 | self[k]=0 64 | self._delta[k]=0 65 | self[k]=(self[k]*self._delta[k]+v)/(self._delta[k]+1) 66 | self._delta[k]+=1 67 | 68 | class _Base_ndarray(np.ndarray): 69 | def init(self,paras): 70 | self._s=0 71 | self._delta=0 72 | self.paras=paras 73 | 74 | def add_delta(self,delta) : 75 | self._delta+=delta 76 | self.paras._dirty.append(self) 77 | 78 | def output_obj(self): 79 | return np.array(self) 80 | -------------------------------------------------------------------------------- /isan/common/perceptrons.py: -------------------------------------------------------------------------------- 1 | """ 2 | ZHANG Kaixu 3 | """ 4 | import logging 5 | import sys 6 | import pickle 7 | import random 8 | import gzip 9 | from isan.common.parameters import Parameters 10 | 11 | class Model(object): 12 | """感知器模型 """ 13 | name="感知器" #: 模型的名字 14 | 15 | def __init__(self,model_file,Task=None,Searcher=None, 16 | Updater=None, 17 | beam_width=8,logger=None,cmd_args={},**conf): 18 | """ 19 | 初始化 20 | 如果不设置,则读取已有模型。如果设置,就是学习新模型 21 | """ 22 | if logger==None : 23 | logger=logging.getLogger(__name__) 24 | console=logging.StreamHandler() 25 | console.setLevel(logging.INFO) 26 | logger.addHandler(console) 27 | logger.setLevel(logging.INFO) 28 | self.result_logger=logger 29 | 30 | self.beam_width=beam_width#:搜索宽度 31 | self.conf=conf 32 | 33 | if model_file!=None: 34 | file=gzip.open(model_file,"rb") 35 | self.task=Task(model=pickle.load(file),logger=logger) 36 | file.close() 37 | else : # new model to train 38 | self.paras=Parameters(Updater) 39 | #self.paras=Parameters(Ada_Grad) 40 | self.task=Task(logger=logger,paras=self.paras) 41 | if hasattr(self.task,'init'): 42 | self.task.init() 43 | self.searcher=Searcher(self.task,beam_width) 44 | self.step=0 45 | 46 | def __del__(self): 47 | del self.searcher 48 | def test(self,test_file): 49 | """ 50 | 测试 51 | """ 52 | eval=self.task.Eval() 53 | for line in open(test_file): 54 | arg=self.task.codec.decode(line.strip()) 55 | raw=arg.get('raw') 56 | Y=arg.get('Y_a',None) 57 | y=arg.get('y',None) 58 | hat_y=self(raw) 59 | eval(y,hat_y) 60 | if hasattr(eval,'get_result'): 61 | self.result_logger.info(eval.get_result()) 62 | else : 63 
| eval.print_result()#打印评测结果 64 | return eval 65 | 66 | def develop(self,dev_file): 67 | """ 68 | @brief 预测开发集 69 | """ 70 | 71 | self.paras.final(self.step) 72 | eval=self.task.Eval() 73 | for line in open(dev_file): 74 | arg=self.task.codec.decode(line.strip()) 75 | if not arg:continue 76 | raw=arg.get('raw') 77 | y=arg.get('y',None) 78 | hat_y=self(raw) 79 | eval(y,hat_y) 80 | if hasattr(eval,'get_result'): 81 | self.result_logger.info(eval.get_result()) 82 | else : 83 | eval.print_result()#打印评测结果 84 | self.paras.un_final() 85 | 86 | if hasattr(eval,'get_scaler'): 87 | return eval.get_scaler() 88 | 89 | 90 | def save(self,model_file=None): 91 | """ 92 | 保存模型 93 | """ 94 | 95 | if model_file==None : model_file=self.model_file 96 | if model_file==None : return 97 | if model_file=='/dev/null' : return 98 | 99 | #self.task.average_weights(self.step) 100 | self.paras.final(self.step) 101 | 102 | file=gzip.open(model_file,'wb') 103 | data=self.task.dump_weights() 104 | pickle.dump(data,file) 105 | file.close() 106 | 107 | def search(self,raw,Y=None): 108 | """ 109 | 搜索 110 | """ 111 | self.task.set_raw(raw,Y) 112 | #self.searcher.set_raw(raw) 113 | return self.searcher.search() 114 | 115 | def __call__(self,raw,Y=None,threshold=0): 116 | """ 117 | 解码,读入生句子,返回词的数组 118 | """ 119 | rst_moves=self.search(raw,Y) 120 | 121 | hat_y=self.task.moves_to_result(rst_moves,raw) 122 | if threshold==0 : 123 | return hat_y 124 | else: 125 | margins=self.searcher.cal_margins() 126 | return self.task.gen_candidates(margins,threshold) 127 | 128 | def _learn_sentence(self,arg): 129 | """ 130 | 学习,根据生句子和标准分词结果 131 | """ 132 | raw=arg.get('raw') 133 | self.raw=raw 134 | y=arg.get('y',None) 135 | Y_a=arg.get('Y_a',None) 136 | 137 | #self.logger.debug('get training example') 138 | #self.logger.debug("raw: %s"%raw) 139 | #self.logger.debug("y: %s"%y) 140 | #self.logger.debug("Y_a: %s"%Y_a) 141 | 142 | 143 | #学习步数加一 144 | self.step+=1 145 | 146 | #set oracle, get standard actions 147 | if hasattr(self.task,'set_oracle'): 148 | std_moves=self.task.set_oracle(raw,y) 149 | 150 | #self.logger.debug(std_moves) 151 | 152 | #get result actions 153 | #self.searcher.set_step(self.step) 154 | rst_moves=self.search(raw,Y_a)#得到解码后动作 155 | 156 | #update 157 | if not self.task.check(std_moves,rst_moves):#check 158 | self.update(std_moves,rst_moves)#update 159 | 160 | #clean oracle 161 | if hasattr(self.task,'remove_oracle'): 162 | self.task.remove_oracle() 163 | 164 | hat_y=self.task.moves_to_result(rst_moves,raw)#得到解码后结果 165 | return y,hat_y 166 | 167 | def update(self,std_moves,rst_moves): 168 | #self.task.cal_delta(std_moves,rst_moves,self.step) 169 | self.task.cal_delta(std_moves,rst_moves) 170 | if self.step%self.batch_size==0 : 171 | self.paras.update(self.step) 172 | 173 | 174 | def train(self,training_file, 175 | iteration=5,peek=-1, 176 | dev_files=None,keep_data=True,batch_size=1): 177 | """ 178 | 训练 179 | """ 180 | if iteration<=0 and peek <=0 : peek=5 181 | self.batch_size=batch_size 182 | 183 | if type(training_file)==str:training_file=[training_file] 184 | #random.seed(123) 185 | 186 | if keep_data : 187 | training_data=[] 188 | for t_file in training_file : 189 | for line in open(t_file):#迭代每个句子 190 | rtn=self.task.codec.decode(line.strip())#得到标准输出 191 | if not rtn:continue 192 | training_data.append(rtn) 193 | random.shuffle(training_data) 194 | 195 | 196 | def gen_data(): 197 | if keep_data : 198 | perc=0 199 | print(perc,end='%\r') 200 | #random.shuffle(training_data) 201 | for i,e in enumerate(training_data) : 
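# progress display: the body below prints the percentage of cached training
# examples already consumed to stderr before yielding the next one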
202 | p=int(i*100/len(training_data)) 203 | if p != perc : 204 | print("%i"%(p),end='%\r',file=sys.stderr) 205 | perc=p 206 | yield e 207 | else : 208 | for t_file in training_file: 209 | for line in open(t_file):#迭代每个句子 210 | rtn=self.task.codec.decode(line.strip())#得到标准输出 211 | if not rtn:continue 212 | yield rtn 213 | 214 | it=0 215 | best_it=None 216 | best_scaler=None 217 | 218 | while True : 219 | if it == iteration : break 220 | self.result_logger.info("训练集第 \033[33;01m%i\033[1;m 次迭代"%(it+1)) 221 | eval=self.task.Eval()#: 测试用的对象 222 | 223 | for rtn in gen_data(): 224 | if rtn is None : continue 225 | y,hat_y=self._learn_sentence(rtn)#根据(输入,输出)学习参数,顺便得到解码结果 226 | eval(y,hat_y)#根据解码结果和标准输出,评价效果 227 | 228 | if hasattr(eval,'get_result'): 229 | self.result_logger.info(eval.get_result()) 230 | else : 231 | eval.print_result()#打印评测结果 232 | 233 | if hasattr(self.task,'report'): 234 | self.task.report() 235 | 236 | if dev_files: 237 | #self.result_logger.info("使用开发集 %s 评价当前模型效果"%(dev_file)) 238 | for dev_id,dev_file in enumerate(dev_files) : 239 | scaler=self.develop(dev_file) 240 | if dev_id==0 : 241 | if best_scaler==None or (scaler and best_scaler=0 and it-best_it>peek : break 246 | def __del__(self): 247 | self.task.__del__() 248 | del self.task 249 | 250 | 251 | class Model_PA(Model) : 252 | name="局部标注平均感知器" 253 | def _learn_sentence(self,arg): 254 | """ 255 | 学习,根据生句子和标准分词结果 256 | """ 257 | raw=arg.get('raw') 258 | self.raw=raw 259 | y=arg.get('y',None) 260 | Y_a=arg.get('Y_a',None) 261 | Y_b=arg.get('Y_b',None) 262 | #print(arg) 263 | 264 | #学习步数加一 265 | self.step+=1 266 | 267 | #get standard actions 268 | if hasattr(self.task,'set_oracle'): 269 | std_moves=self.task.set_oracle(raw,y,Y_b) 270 | 271 | #get result actions 272 | rst_moves=self.search(raw,Y_a)#得到解码后动作 273 | 274 | #clean the early-update data 275 | if hasattr(self.task,'remove_oracle'): 276 | self.task.remove_oracle() 277 | 278 | if not self.task.is_belong(raw,rst_moves,Y_b): #不一致,则更新 279 | if y and not Y_b : 280 | std_moves=self.task.result_to_moves(y)#得到标准动作 281 | else : 282 | #print('yb',Y_b) 283 | std_moves=self.search(raw,Y_b) 284 | self.update(std_moves,rst_moves) 285 | hat_y=self.task.moves_to_result(rst_moves,raw)#得到解码后结果 286 | return y,hat_y 287 | 288 | -------------------------------------------------------------------------------- /isan/common/python_interface.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "isan/common/common.hpp" 3 | #include "isan/common/decoder.hpp" 4 | 5 | #define __INIT_FUNC(a,b) a##b 6 | #define INIT_FUNC(a,b) __INIT_FUNC(a,b) 7 | #define PYINIT PyInit_ 8 | #define STR(x) #x 9 | 10 | 11 | namespace isan{ 12 | 13 | static PyObject * 14 | interface_delete(PyObject *self, PyObject *arg){ 15 | delete (Interface*)PyLong_AsLong(arg); 16 | Py_INCREF(Py_None); 17 | return Py_None; 18 | }; 19 | 20 | static PyObject * 21 | set_raw(PyObject *self, PyObject *arg) 22 | { 23 | Interface* interface; 24 | PyObject *new_raw; 25 | PyArg_ParseTuple(arg, "LO", &interface,&new_raw); 26 | if(!PyUnicode_Check(new_raw)){ 27 | Py_INCREF(Py_None); 28 | return Py_None; 29 | 30 | }; 31 | long raw_size=PySequence_Size(new_raw); 32 | 33 | Chinese raw(raw_size); 34 | for(int i=0;iset_raw(raw); 40 | Py_INCREF(Py_None); 41 | 42 | return Py_None; 43 | }; 44 | static PyObject * 45 | set_step(PyObject *self, PyObject *arg) 46 | { 47 | Interface* interface; 48 | PyObject *new_raw; 49 | long step=0; 50 | 51 | PyArg_ParseTuple(arg, "LL", &interface,&step); 52 | 53 
| interface->data->learning_step=step; 54 | Py_INCREF(Py_None); 55 | 56 | return Py_None; 57 | }; 58 | static PyObject * 59 | do_nothing(PyObject *self, PyObject *arg) 60 | { 61 | Py_INCREF(Py_None); 62 | return Py_None; 63 | }; 64 | 65 | 66 | 67 | static PyObject * 68 | search(PyObject *self, PyObject *arg) 69 | { 70 | 71 | Interface* interface; 72 | PyObject *py_init_states; 73 | PyArg_ParseTuple(arg, "LO", &interface,&py_init_states); 74 | 75 | std::vector init_states; 76 | for(int i=0;i result_alphas; 81 | 82 | (*interface->push_down)( 83 | init_states, 84 | result_alphas); 85 | PyObject * rtn_list=PyList_New(result_alphas.size()); 86 | for(int i=0;ipush_down->cal_betas(); 98 | 99 | std::vector states; 100 | std::vector scores; 101 | 102 | interface->push_down->get_states(states,scores); 103 | 104 | PyObject * list=PyList_New(states.size()); 105 | for(int i=0;i 3 | #include 4 | template 5 | class Smart_String{ 6 | public: 7 | typedef size_t SIZE_T; 8 | ITEM* pt; 9 | SIZE_T length; 10 | SIZE_T* _ref_count; 11 | Smart_String(){ 12 | pt=NULL; 13 | length=0; 14 | _ref_count=new SIZE_T(); 15 | *_ref_count=1; 16 | }; 17 | Smart_String(ITEM* buffer, SIZE_T length){ 18 | _ref_count=new SIZE_T(); 19 | *_ref_count=1; 20 | pt=new ITEM[length]; 21 | this->length=length; 22 | memcpy(pt,buffer,length*sizeof(ITEM)); 23 | }; 24 | Smart_String(SIZE_T length){ 25 | _ref_count=new SIZE_T(); 26 | *_ref_count=1; 27 | pt=new ITEM[length]; 28 | this->length=length; 29 | }; 30 | Smart_String(const Smart_String& other){ 31 | pt=other.pt; 32 | length=other.length; 33 | _ref_count=other._ref_count; 34 | (*_ref_count)++; 35 | }; 36 | inline void operator=(const Smart_String& other){ 37 | (*_ref_count)--; 38 | if(!*_ref_count){ 39 | delete _ref_count; 40 | if(pt)delete[] pt; 41 | } 42 | pt=other.pt; 43 | length=other.length; 44 | _ref_count=other._ref_count; 45 | (*_ref_count)++; 46 | }; 47 | ~Smart_String(){ 48 | (*_ref_count)--; 49 | if(!*_ref_count){ 50 | delete _ref_count; 51 | if(pt)delete[] pt; 52 | } 53 | }; 54 | 55 | inline bool operator==(const Smart_String&next) const{ 56 | if(length!=next.length) 57 | return false; 58 | if(pt==next.pt)return true; 59 | for(int i=0;inext.length)return 0; 67 | for(int i=0;inext.pt[i])return 0; 70 | } 71 | return 0; 72 | }; 73 | inline const size_t& size() const{ 74 | return length; 75 | }; 76 | 77 | class HASH{ 78 | public: 79 | inline SIZE_T operator()(const Smart_String& cx) const{ 80 | SIZE_T value=0; 81 | for(int i=0;istr==next.str; 153 | }; 154 | inline bool operator<(const Smart_Chars& next)const{ 155 | if(this->str.length()str.length()>next.str.length())return 0; 157 | for(int i=0;istr.length();i++){ 158 | if((Char)this->str[i]<(Char)next.str[i])return 1; 159 | if((Char)this->str[i]>(Char)next.str[i])return 0; 160 | } 161 | return 0; 162 | }; 163 | }; 164 | -------------------------------------------------------------------------------- /isan/common/task.py: -------------------------------------------------------------------------------- 1 | 2 | #not finished !! 
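# The Lattice class below is a plain list of (begin, end, data) items over a
# sentence: `length` is the largest end offset and `begins` maps each begin
# offset to the indices of the items starting there.  Illustrative example:
#   Lattice([(0, 1, 'a'), (1, 3, 'bc'), (1, 2, 'b'), (2, 3, 'c')])
#   -> length == 3, begins == {0: [0], 1: [1, 2], 2: [3]}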
3 | class Lattice (list) : 4 | def __init__(self,l,w=None): 5 | self.weights=w 6 | self.extend(list(l)) # items= [ (begin,end,data) * ] 7 | self.length=max(l for _,l,_ in self) 8 | self.begins={} 9 | for i in range(len(self)) : 10 | b=self[i][0] 11 | if b not in self.begins : self.begins[b]=[] 12 | self.begins[b].append(i) 13 | def __str__(self): 14 | return ' '.join("%i:(%i,%i):%s"%(i,it[0],it[1],it[2]) for i,it in enumerate(self)) 15 | 16 | def gen_sentence(self,to_str): 17 | o=0 18 | s=[] 19 | while True : 20 | if o not in self.begins : break 21 | nind=self.begins[o][0] 22 | b,o,x=self[nind] 23 | s.append(to_str(x)) 24 | return ''.join(s) 25 | 26 | 27 | 28 | class Base_Task : 29 | def get_init_states(self) : 30 | return [self.State.init_state] 31 | 32 | #def reduce(self,last_ind,stat,pred_inds,predictors): 33 | # pass 34 | reduce = None 35 | 36 | 37 | def actions_to_moves(self,actions,lattice): 38 | #print(lattice) 39 | state=self.State(lattice) 40 | stack=[state] 41 | moves=[[None,None,action] for action in actions] 42 | moves[0][0]=0 43 | moves[0][1]=self.State.init_state 44 | for i in range(len(moves)-1) : 45 | move=moves[i] 46 | step,state,action=move 47 | ind,label=action 48 | 49 | if ind >=0 : # shift 50 | rst=[[nstep,ns] for a,nstep,ns in self.shift(step,state) if a==self.Action.encode(action)] 51 | moves[i+1][0],moves[i+1][1]=rst[0] 52 | stack.append(rst[0][1]) 53 | else : # reduce 54 | s0=stack.pop() 55 | s1=stack.pop() 56 | rst=[[nstep,ns] for a,nstep,ns,_ in self.reduce(step,s0,[0],[s1]) if a==self.Action.encode(action)] 57 | #print(i) 58 | moves[i+1][0],moves[i+1][1]=rst[0] 59 | stack.append(rst[0][1]) 60 | pass 61 | #input() 62 | for move in moves: 63 | move[2]=self.Action.encode(move[2]) 64 | 65 | moves=list(map(tuple,moves)) 66 | return moves 67 | 68 | def moves_to_result(self,moves,_): 69 | actions=[self.Action.decode(a) for ind,state,a in moves] 70 | #print(actions) 71 | #input() 72 | return self.actions_to_result(actions) 73 | 74 | 75 | def check(self,std_moves,rst_moves): 76 | if len(std_moves)!=len(rst_moves) :return False 77 | return all( 78 | std_move[2]==rst_move[2] 79 | for std_move,rst_move in zip(std_moves,rst_moves) 80 | ) 81 | 82 | def set_oracle(self,raw,y) : 83 | self.oracle=[None] 84 | self.set_raw(raw,y) 85 | std_actions=self.result_to_actions(y) 86 | moves=self.actions_to_moves(std_actions,raw) 87 | return moves 88 | 89 | def remove_oracle(self): 90 | self.oracle=None 91 | 92 | early_stop=None 93 | 94 | def _update(self,move,delta): 95 | self.gen_features(move[1],[move[2]],delta) 96 | def update_moves(self,std_moves,rst_moves,step) : 97 | for s,r in zip(std_moves,rst_moves) : 98 | if s!= r: 99 | self._update(s,1,step) 100 | self._update(r,-1,step) 101 | #yield s, 1 102 | #yield r, -1 103 | 104 | def average_weights(self,step): 105 | self.weights.average_weights(step) 106 | 107 | def un_average_weights(self): 108 | self.weights.un_average_weights() 109 | 110 | class Early_Stop_Pointwise : 111 | def set_oracle(self,raw,y) : 112 | self.set_raw(raw,y) 113 | self.stop_step=None 114 | std_actions=self.result_to_actions(y) 115 | moves=self.actions_to_moves(std_actions,raw) 116 | 117 | self.oracle={} 118 | for step,state,action in moves : 119 | self.oracle[step]=self.State.decode(state) 120 | return moves 121 | 122 | def remove_oracle(self): 123 | self.stop_step=None 124 | self.oracle=None 125 | 126 | def early_stop(self,step,next_states,moves): 127 | #print('early') 128 | #return False 129 | if not moves : return False 130 | if not hasattr(self,'oracle') or 
self.oracle==None : return False 131 | last_steps,last_states,actions=zip(*moves) 132 | self.stop_step=None 133 | if step in self.oracle : 134 | next_states=[self.State.decode(x) for x in next_states] 135 | if not (self.oracle[step]in next_states) : 136 | self.stop_step=step 137 | return True 138 | return False 139 | #def early_stop(self,step,next_states,moves): 140 | # return False 141 | 142 | def update_moves(self,std_moves,rst_moves,step) : 143 | for move in rst_moves : 144 | if self.stop_step is not None and move[0]>=self.stop_step : break 145 | self._update(move,-1,step) 146 | #yield move, -1 147 | for move in std_moves : 148 | if self.stop_step is not None and move[0]>=self.stop_step : break 149 | #yield move, 1 150 | self._update(move,1,step) 151 | -------------------------------------------------------------------------------- /isan/common/updater.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | 4 | from isan.common.parameters import _Base_Dict 5 | from isan.common.parameters import _Base_ndarray 6 | 7 | 8 | 9 | class Ada_Grad : 10 | name='Ada Grad' 11 | class d(_Base_Dict): 12 | def __init__(self,dic): 13 | self.update(dic) 14 | self._s=dict(dic) 15 | 16 | def _update(self,step): 17 | for k,v in self._delta.items(): 18 | if np.all(v==0) : continue 19 | if k not in self : 20 | self[k]=0 21 | if k not in self._s : 22 | self._s[k]=0 23 | self._s[k]+=v**2 24 | _s=self._s[k] 25 | _delta=np.where(_s,1/np.sqrt(_s+(_s==0)),0)*v 26 | self[k]+=_delta 27 | self[k]*=0.99 28 | self._delta.clear() 29 | 30 | class ndarray(_Base_ndarray): 31 | def init(self,paras): 32 | self._s=0 33 | self._delta=0 34 | self.paras=paras 35 | def _update(self,step) : 36 | if np.all(self._delta==0) : return 37 | self._s+=self._delta**2 38 | delta=np.where(self._s,1/np.sqrt(self._s+(self._s==0)),0)*self._delta 39 | self+=delta 40 | self*=0.99 41 | self._delta=0 42 | 43 | class Default : 44 | name='naive' 45 | class d(_Base_Dict): 46 | def __init__(self,dic): 47 | self.update(dic) 48 | self._s=dict(dic) 49 | def _update(self,step): 50 | for k,v in self._delta.items(): 51 | if k not in self : 52 | self[k]=0 53 | self._s[k]=0 54 | self[k]+=v 55 | self._s[k]+=v*step 56 | self._delta.clear() 57 | 58 | 59 | class ndarray(_Base_ndarray): 60 | def init(self,paras): 61 | self._delta=0 62 | self.paras=paras 63 | def _update(self,step) : 64 | self+=self._delta 65 | self._delta=0 66 | 67 | class Averaged : 68 | name='Averaged' 69 | class d(_Base_Dict): 70 | def __init__(self,dic): 71 | self.update(dic) 72 | self._s=dict(dic) 73 | 74 | def _update(self,step): 75 | for k,v in self._delta.items(): 76 | if k not in self : 77 | self[k]=0 78 | self._s[k]=0 79 | self[k]+=v 80 | self._s[k]+=v*step 81 | self._delta.clear() 82 | 83 | def final(self,step): 84 | self._backup=dict(self) 85 | for k,v in self._backup.items(): 86 | self[k]=self[k]-self._s[k]/step 87 | 88 | def un_final(self): 89 | self.clear() 90 | self.update(self._backup) 91 | self._backup.clear() 92 | 93 | class ndarray(_Base_ndarray): 94 | def _update(self,step) : 95 | self+=self._delta 96 | self._s+=self._delta*step 97 | self._delta=0 98 | 99 | def final(self,step): 100 | self._d=self*1 101 | self-=self._s/step 102 | 103 | def un_final(self): 104 | self*=0 105 | self+=self._d 106 | -------------------------------------------------------------------------------- /isan/common/weights.py: -------------------------------------------------------------------------------- 1 | import collections 
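# FD below is a thin Python wrapper around the isan.common.feature_dict C
# extension (isan/common/feature_dict.cc), which keeps a hash map from Python
# str feature keys to double weights: cal_fv(fv) returns the summed weight of
# a feature vector, update_fv(fv, delta) adds delta to every feature in fv,
# and to_dict()/set_weights() convert to and from an ordinary Python dict.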
2 | import isan.common.feature_dict as feature_dict 3 | 4 | 5 | class FD : 6 | def __init__(self): 7 | self.fd=feature_dict.new() 8 | 9 | def size(self): 10 | return feature_dict.size(self.fd) 11 | 12 | def set_weights(self,d): 13 | return feature_dict.set_weights(self.fd,d) 14 | 15 | def cal_fv(self,fv): 16 | return feature_dict.cal_fv(self.fd,fv) 17 | 18 | def update_fv(self,fv,delta): 19 | feature_dict.update_fv(self.fd,fv,delta) 20 | 21 | def to_dict(self): 22 | return feature_dict.to_dict(self.fd) 23 | 24 | def get(self,key): 25 | return feature_dict.get(self.fd,key) 26 | 27 | def clear(self): 28 | feature_dict.clear(self.fd) 29 | 30 | def __del__(self): 31 | feature_dict.delete(self.fd) 32 | 33 | class Weights : 34 | def items(self): 35 | for k,v in self.data.items(): 36 | yield k,v 37 | 38 | def __init__(self): 39 | self.data=dict() 40 | self.s=dict() 41 | 42 | def add_model(self,model): 43 | for k,v in model.items(): 44 | if v==0 : continue 45 | if k not in self.data : 46 | self.data[k]=0 47 | self.s[k]=0 48 | self.data[k]=(self.data[k]*self.s[k]+v)/(self.s[k]+1) 49 | self.s[k]+=1 50 | 51 | def __call__(self,keys): 52 | return float(sum(self.data.get(k,0) for k in keys)) 53 | 54 | def update_weights(self,keys,delta,step): 55 | for f in keys : 56 | if f not in self.data : 57 | self.data[f]=0 58 | self.s[f]=0 59 | self.data[f]+=delta 60 | self.s[f]+=delta*(step) 61 | 62 | def average_weights(self,step): 63 | self._backup=dict(self.data) 64 | for k,v in self._backup.items(): 65 | self.data[k]=self.data[k]-self.s[k]/step 66 | 67 | def un_average_weights(self): 68 | self.data.clear() 69 | self.data.update(self._backup) 70 | self._backup.clear() 71 | -------------------------------------------------------------------------------- /isan/data/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | def test(): 4 | print("this is a test function") 5 | -------------------------------------------------------------------------------- /isan/data/lattice.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | item= begin,end,* 5 | weights = [] 6 | gold= 7 | 8 | eval 9 | 10 | 11 | [item] 12 | 13 | """ 14 | class Lattice : 15 | def __init__(self,l,w): 16 | self.weights=w 17 | self.items=l 18 | chars={} 19 | begins={} 20 | for i,item in enumerate(self.items) : 21 | begin=item[0] 22 | for j,c in enumerate(item[2]): 23 | o=j+begin 24 | if o not in chars: chars[o]=c 25 | if begin not in begins : begins[begin]=[] 26 | begins[begin].append(i) 27 | self.begins=begins 28 | self.sentence=''.join(x[1] for x in sorted(list(chars.items()))) 29 | self.length=len(self.sentence) 30 | def __str__(self): 31 | items=' '.join('_'.join(map(str,item)) for item in self.items) 32 | s='lattice of: %s\nitems: %s'%(self.sentence,items) 33 | return s 34 | 35 | """ 36 | [{'start': ,'end':,'key','info','gold'}] 37 | """ 38 | class Data : 39 | @staticmethod 40 | def to_train(data): 41 | train=[] 42 | for item in data: 43 | train.append(item['key']) 44 | lattice=Lattice(train,None) 45 | return lattice 46 | pass 47 | 48 | -------------------------------------------------------------------------------- /isan/parsing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangkaixu/isan/6dd4a2d7c16158e9d5e559aa79d1e9b9ace2b6de/isan/parsing/__init__.py -------------------------------------------------------------------------------- 
/isan/parsing/char_dep.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | import time 4 | import isan.parsing.dep_codec as codec 5 | import isan.parsing.eval as eval 6 | class Dep: 7 | shift_action=ord('s') 8 | left_reduce=ord('l') 9 | right_reduce=ord('r') 10 | #init=(0,(0,0),(None,None,None)) 11 | def init(self): 12 | pass 13 | init_stat=pickle.dumps((0,(0,0),(None,None,None))) 14 | class Eval: 15 | def __init__(self): 16 | self.start_time=time.time() 17 | def __call__(self,a,b,**_): 18 | raw=''.join(a) 19 | 20 | mer=[i for i in range(len(raw))] 21 | for i,r in enumerate(b): 22 | if r!=-1 : 23 | if abs(i-r)==1 : 24 | mer[i]=r 25 | 26 | lx=-1 27 | sen=[] 28 | for x,c in zip(mer,raw): 29 | if x!=lx: 30 | sen.append('') 31 | lx=x 32 | sen[-1]+=c 33 | print(' '.join(sen)) 34 | 35 | 36 | pass 37 | def print_result(self): 38 | duration=time.time()-self.start_time 39 | print("历时:%.2f 现时:%s"%( 40 | duration, 41 | time.strftime("%H:%M:%S"))) 42 | pass 43 | pass 44 | class codec: 45 | def decode(line): 46 | data=json.loads(line.strip()) 47 | return data 48 | def shift(self,stat): 49 | stat=pickle.loads(stat) 50 | raw=self.raw 51 | ind,span,stack_top=stat 52 | l,r=span 53 | if self.intervals: 54 | if self.intervals[l][1]!=-1 and self.intervals[l][1]<=r: return [] 55 | 56 | if ind>=len(raw): return [] 57 | rtn= [ 58 | (self.shift_action, 59 | pickle.dumps( 60 | (ind+1, 61 | (ind,ind+1), 62 | (raw[ind][0], 63 | stack_top[0], 64 | stack_top[1]) 65 | ))) 66 | ] 67 | return rtn 68 | 69 | def reduce(self,stat,predictor): 70 | stat=pickle.loads(stat) 71 | ind,span,stack_top=stat 72 | predictor=pickle.loads(predictor) 73 | _,p_span,_=predictor 74 | s0,s1,s2=stack_top 75 | if s0==None or s1==None:return [] 76 | l,r=p_span[0],span[1] 77 | if self.intervals: 78 | if self.intervals[l][1]!=-1 and self.intervals[l][1]l: return [] 80 | s01=s1+s0 81 | rtn= [ 82 | (self.left_reduce,pickle.dumps((ind, 83 | (p_span[0],span[1]), 84 | (s01 if len(s01)<=2 else s1,predictor[2][1],predictor[2][2])))), 85 | (self.right_reduce,pickle.dumps((ind, 86 | (p_span[0],span[1]), 87 | (s01 if len(s01)<=2 else s0,predictor[2][1],predictor[2][2])))), 88 | ] 89 | return rtn 90 | def actions_to_stats(self,actions): 91 | sn=sum(1 if a==self.shift_action else 0 for a in actions) 92 | assert(sn*2-1==len(actions)) 93 | stat=None 94 | stack=[] 95 | ind=0 96 | for action in actions: 97 | stat=(ind,(0,0)if not stack else (stack[-1][4],stack[-1][5]), 98 | ( 99 | stack[-1][0] if len(stack)>0 else None, 100 | stack[-2][0] if len(stack)>1 else None, 101 | stack[-3][0] if len(stack)>2 else None, 102 | )) 103 | yield pickle.dumps(stat) 104 | if action==self.shift_action: 105 | stack.append([self.raw[ind],self.raw[ind],None,None,ind,ind+1]) 106 | ind+=1 107 | else: 108 | s01=stack[-1][0]+stack[-2][0] 109 | if action==self.left_reduce: 110 | if len(s01)<=2 : 111 | stack[-2][0]=s01 112 | stack[-2][3]=stack[-1][1] 113 | stack[-2][5]=stack[-1][5] 114 | stack.pop() 115 | if action==self.right_reduce: 116 | stack[-1][2]=stack[-2][1] 117 | stack[-1][4]=stack[-2][4] 118 | stack[-2]=stack[-1] 119 | if len(s01)<=2 : 120 | stack[-2][0]=s01 121 | stack.pop() 122 | def set_raw(self,raw,Y): 123 | """ 124 | 对需要处理的句子做必要的预处理(如缓存特征) 125 | """ 126 | if Y: 127 | self.intervals=Y[1] 128 | else: 129 | self.intervals=None 130 | self.raw=raw 131 | self.f_raw=[w.encode() for w in self.raw] 132 | def gen_features(self,stat): 133 | stat=pickle.loads(stat) 134 | ind,_,stack_top=stat 135 | s0,s1,s2=stack_top 
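# feature atoms: s0/s1/s2 are short strings representing the top three stack
# items, q0/q1 the next two characters of the raw sentence, and c0 the
# character just before the current position; each is byte-encoded before
# being concatenated into the feature strings below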
136 | 137 | q0=self.f_raw[ind] if ind=0 else b'' 140 | 141 | s0=(s0.encode() if s0 else b'') 142 | s1=(s1.encode() if s1 else b'') 143 | s2=(s2.encode() if s2 else b'') 144 | 145 | 146 | 147 | fv=[ 148 | b'0'+s0, 149 | b'1'+s1, 150 | b'2'+s2, 151 | b'3'+s1+b' '+s2, 152 | b'4'+s0+b' '+q0, 153 | b'5'+q0+b' '+q1, 154 | b'6'+q0, 155 | b'6'+c0, 156 | ] 157 | return fv 158 | def actions_to_result(self,actions,raw): 159 | ind=0 160 | stack=[] 161 | arcs=[] 162 | for a in actions: 163 | if a==self.shift_action: 164 | stack.append(ind) 165 | ind+=1 166 | elif a==self.left_reduce: 167 | arcs.append((stack[-1],stack[-2])) 168 | stack.pop() 169 | elif a==self.right_reduce: 170 | arcs.append((stack[-2],stack[-1])) 171 | stack[-2]=stack[-1] 172 | stack.pop() 173 | arcs.append((stack[-1],-1)) 174 | arcs.sort() 175 | arcs=[x for _,x in arcs] 176 | return arcs 177 | 178 | sen=[] 179 | cache='' 180 | for c,a in zip(raw,actions[1:]): 181 | cache+=c 182 | if a==self.shift_action: 183 | sen.append(cache) 184 | cache='' 185 | if cache: 186 | sen.append(cache) 187 | return sen 188 | def result_to_actions(self,result): 189 | """ 190 | 将依存树转化为shift-reduce的动作序列(与动态规划用的状态空间无关) 191 | 在一对多中选择了一个(没搞清楚相关工作怎么弄的) 192 | """ 193 | stack=[] 194 | actions=[] 195 | result=[ind for _,_,ind,_ in result] 196 | record=[[ind,head,0] for ind,head in enumerate(result)] 197 | for ind,head,_ in record: 198 | if head!=-1 : 199 | record[head][2]+=1 200 | for ind,head in enumerate(result): 201 | actions.append(self.shift_action) 202 | stack.append([ind,result[ind],record[ind][2]]) 203 | while len(stack)>=2: 204 | if stack[-1][2]==0 and stack[-1][1]!=-1 and stack[-1][1]==stack[-2][0]: 205 | actions.append(self.left_reduce) 206 | stack.pop() 207 | stack[-1][2]-=1 208 | elif stack[-2][1]!=-1 and stack[-2][1]==stack[-1][0]: 209 | actions.append(self.right_reduce) 210 | stack[-2]=stack[-1] 211 | stack.pop() 212 | stack[-1][2]-=1 213 | else: 214 | break 215 | assert(len(actions)==2*len(result)-1) 216 | return actions 217 | -------------------------------------------------------------------------------- /isan/parsing/codec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | 4 | class Prefix: 5 | def __init__(self): 6 | self.content='' 7 | def __call__(self,arg=''): 8 | tmp=self.content 9 | self.content=arg 10 | return tmp 11 | 12 | 13 | def act_pop(stack): 14 | if len(stack)>1: 15 | sub=stack.pop() 16 | stack[-1].append(sub) 17 | 18 | def normalize(tree,tag='s'): 19 | if type(tree)==str: 20 | assert(1==2) 21 | tree[0][1]=tag 22 | 23 | structure=[] 24 | for sub in tree[1:]: 25 | if type(sub)==str: 26 | 27 | if tree[0][0]=='m' and len(tree[1:])>1: 28 | structure.append([['f','-fix'],[sub,'f']]) 29 | else: 30 | structure.append([[tree[0][0],'head'],[sub,tree[0][0]]]) 31 | else: 32 | t= 'm' if not sub[0][1] else sub[0][1] 33 | #if t=='m' and tag=='m': t='f' 34 | structure.append(normalize(sub,t)) 35 | print(tree[0]) 36 | for struct in structure: 37 | print(' ',struct) 38 | print("") 39 | return [tree[0],structure] 40 | 41 | def decode(line=None): 42 | #line='[(少将)称([(中方){[{完全}有(实力)]}{在(((黄岩)岛)[对峙])中}奉陪{到底}])]' 43 | #line='[{((实际)上)}({(中国)的}((国防)科技)部门){一直}{高度}关注({((博弈)论)的}(军事)应用)]' 44 | #line='[{曾经}纠结([{是否}让((搜狗)(输入)法)支持((火星)文)])]' 45 | line='''[[只要]a({[这样]的}大师)[在((我们)旁边)]出现]''' 46 | stack=[] 47 | 48 | prefix=Prefix() 49 | 50 | actions={ 51 | '[':lambda stack: stack.append([['v',prefix('')]]), 52 | ']':lambda stack: act_pop(stack), 53 | '(':lambda stack: stack.append([['n',prefix('')]]), 
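# each opening bracket pushes a new constituent with a one-letter category
# ('[' -> 'v', '(' -> 'n', '{' -> 'm', '<' -> 'c'), the matching closing
# bracket pops it into its parent, and a bare 'a' is remembered and attached
# to the next constituent that is opened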
54 | ')':lambda stack: act_pop(stack), 55 | '{':lambda stack: stack.append([['m',prefix('')]]), 56 | '}':lambda stack: act_pop(stack), 57 | '<':lambda stack: stack.append([['c',prefix('')]]), 58 | '>':lambda stack: act_pop(stack), 59 | 'a':lambda stack: prefix('a'), 60 | } 61 | for token in line: 62 | if token in actions: 63 | actions[token](stack) 64 | else: 65 | if len(stack[-1])<2 or type(stack[-1][-1])!=str: 66 | stack[-1].append('') 67 | stack[-1][-1]+=token 68 | 69 | stack[0]=normalize(stack[0]) 70 | return stack[0] 71 | if __name__=="__main__": 72 | tree=decode() 73 | print(tree) 74 | -------------------------------------------------------------------------------- /isan/parsing/default_dep2.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | #import marshal as pickle 4 | import isan.parsing.eval as eval 5 | from isan.data.lattice import Lattice as Lattice 6 | from isan.common.lattice import Lattice_Task as Base_Task 7 | 8 | from isan.parsing.seq_dep import Action as Base_Action 9 | from isan.parsing.seq_dep import State as Base_State 10 | from isan.parsing.seq_dep import codec as base_codec 11 | from isan.parsing.seq_dep import Dep as Base_Dep 12 | 13 | 14 | class codec (base_codec): 15 | class Json_Lattice_Data : 16 | def __init__(self,line): 17 | self.lattice=json.loads(line) 18 | self.lattice=[[k,v] for k,v in self.lattice if 'dep' in v] 19 | def make_raw(self): 20 | lat=self.lattice 21 | raw=[] 22 | for i in range(len(lat)): 23 | k,v =lat[i] 24 | k=tuple(k) 25 | lat[i][0]=k 26 | #if not ('is_test' in v and v['is_test']) : 27 | if True: 28 | raw.append([k,v.get('tag-weight',None)]) 29 | if 'dep' in v and v['dep'][1]!=None : 30 | v['dep'][1]=tuple(v['dep'][1]) 31 | l,w=zip(*raw) 32 | lattice=Lattice(l,w) 33 | return lattice 34 | 35 | def make_gold(self): 36 | lat=self.lattice 37 | gold=[] 38 | for k,v in lat : 39 | if 'tag-weight' in v : del v['tag-weight'] 40 | if not v : v=None 41 | else : 42 | v=[v['dep'][1]] 43 | gold.append([k,v]) 44 | return gold 45 | 46 | @staticmethod 47 | def arcs_to_result(arcs,lattice): 48 | return arcs 49 | @staticmethod 50 | def result_to_arcs(result): 51 | result=[ind for _,_,ind,_ in result] 52 | return result 53 | @staticmethod 54 | def encode(raw,result): 55 | return ' '.join(['_'.join([item[0],item[1],str(head)]) for item,head in zip(raw,result)]) 56 | 57 | @staticmethod 58 | def decode(line): 59 | data=codec.Json_Lattice_Data(line) 60 | lattice=data.make_raw() 61 | lat=data.make_gold() 62 | #raw=[(w,t)for b,e,w,t in lattice.items] 63 | raw=lattice 64 | inds={} 65 | for i,it in enumerate(lat): 66 | inds[it[0]]=i 67 | lat=[tuple([word[2],word[3]]+([inds[head[0]],'DEP'] if head[0] else [-1,'ROOT'])) 68 | for word,head in lat] 69 | return {'raw':raw,'y':lat} 70 | 71 | @staticmethod 72 | def to_raw(line): 73 | return [(w,t)for w,t,*_ in line] 74 | 75 | class Action (Base_Action): 76 | @staticmethod 77 | def actions_to_arcs(actions): 78 | ind=0 79 | stack=[] 80 | arcs=[] 81 | for a in actions: 82 | is_shift,*rest=Action.parse_action(a) 83 | if is_shift : 84 | sind=rest[0] 85 | stack.append(sind) 86 | ind+=1 87 | elif a==Action.left_reduce: 88 | arcs.append((stack[-1],stack[-2])) 89 | stack.pop() 90 | elif a==Action.right_reduce: 91 | arcs.append((stack[-2],stack[-1])) 92 | stack[-2]=stack[-1] 93 | stack.pop() 94 | arcs.append((stack[-1],-1)) 95 | arcs.sort() 96 | arcs=[x for _,x in arcs] 97 | return arcs 98 | @staticmethod 99 | def arcs_to_actions(arcs): 100 | result=arcs 101 | 
stack=[] 102 | actions=[] 103 | record=[[ind,head,0] for ind,head in enumerate(result)]# [ind, ind_of_head, 是head的次数] 104 | for ind,head,_ in record: 105 | if head!=-1 : 106 | record[head][2]+=1 107 | for ind,head in enumerate(result): 108 | #actions.append(self.shift_action) 109 | actions.append(Action.shift_action(ind)) 110 | stack.append([ind,result[ind],record[ind][2]]) 111 | while len(stack)>=2: 112 | if stack[-1][2]==0 and stack[-1][1]!=-1 and stack[-1][1]==stack[-2][0]: 113 | actions.append(Action.left_reduce) 114 | stack.pop() 115 | stack[-1][2]-=1 116 | elif stack[-2][1]!=-1 and stack[-2][1]==stack[-1][0]: 117 | actions.append(Action.right_reduce) 118 | stack[-2]=stack[-1] 119 | stack.pop() 120 | stack[-1][2]-=1 121 | else: 122 | break 123 | 124 | return actions 125 | 126 | class State(Action,Base_State) : 127 | init_stat=pickle.dumps((0,(0,0),(None,None,None))) 128 | shift_cost=1 129 | reduce_cost=1 130 | def __init__(self,bt,lattice): 131 | self.lattice=lattice 132 | state=pickle.loads(bt) 133 | self.ind,self.span,self.stack_top=state 134 | #self.stop_step=2*len(self.lattice.items)-1 135 | #self.stop_step=2*self.lattice.length-1 136 | self.stop_step=self.lattice.length*State.shift_cost+(self.lattice.length-1)*State.reduce_cost 137 | 138 | def shift(self,shift_ind): 139 | item=self.lattice.items[shift_ind] 140 | #next_ind=self.ind+1 141 | next_ind=self.ind+2*len(item[2])-1 142 | #next_ind=self.ind+len(item[2])*State.shift_cost+(len(item[2])-1)*State.reduce_cost 143 | #if next_ind==self.stop_step : next_ind=-1 144 | if item[1]==self.lattice.length and self.stack_top[0]==None : next_ind=-1 145 | 146 | state=( 147 | next_ind, 148 | (item[0],item[1]), 149 | ((item[2],item[3],None,None), 150 | self.stack_top[0], 151 | self.stack_top[1][1] if self.stack_top[1] else None) 152 | ) 153 | return [(self.shift_action(shift_ind),next_ind,pickle.dumps(state))] 154 | pass 155 | def reduce(self,pre_state,alpha_ind): 156 | next_ind=self.ind+1 157 | #next_ind=self.ind+State.reduce_cost 158 | if self.span[1]==self.lattice.length and pre_state.stack_top[1]==None : next_ind=-1 159 | 160 | s0,s1,s2=self.stack_top 161 | 162 | if s0==None or s1==None: return [] 163 | 164 | reduce_state1=( 165 | next_ind, 166 | (pre_state.span[0],self.span[1]), 167 | ((s1[0],s1[1],s1[2],s0[1]),pre_state.stack_top[1],pre_state.stack_top[2])) 168 | 169 | reduce_state2=(next_ind, 170 | (pre_state.span[0],self.span[1]), 171 | ((s0[0],s0[1],s1[1],s0[3]),pre_state.stack_top[1],pre_state.stack_top[2])) 172 | 173 | reduce_state1=pickle.dumps(reduce_state1) 174 | reduce_state2=pickle.dumps(reduce_state2) 175 | return [ 176 | (self.left_reduce,next_ind,reduce_state1,alpha_ind), 177 | (self.right_reduce,next_ind,reduce_state2,alpha_ind), 178 | ] 179 | 180 | class Dep (Base_Dep): 181 | pass 182 | name="依存句法分析" 183 | Action=Action 184 | State=State 185 | 186 | Eval=eval.Eval 187 | codec=codec 188 | 189 | 190 | def gen_features(self,span,actions): 191 | fvs=[] 192 | fv=self.gen_features_one(span) 193 | for action in actions: 194 | is_shift,*_=self.Action.parse_action(action) 195 | if is_shift : 196 | action='s'.encode() 197 | else : 198 | action=chr(action).encode() 199 | fvs.append([action+x for x in fv]) 200 | return fvs 201 | 202 | def gen_features_one(self,stat): 203 | stat=State.load(stat) 204 | _,span,stack_top=stat 205 | s0,s1,s2_t=stack_top 206 | 207 | s2_t=b'~' if s2_t is None else s2_t.encode() 208 | 209 | if s0: 210 | s0_w,s0_t,s0l_t,s0r_t=s0 211 | s0l_t=b'~' if s0l_t is None else s0l_t.encode() 212 | s0r_t=b'~' if s0r_t is 
None else s0r_t.encode() 213 | s0_w=s0_w.encode() 214 | s0_t=s0_t.encode() 215 | else: 216 | s0_w,s0_t,s0l_t,s0r_t=b'~',b'~',b'~',b'~' 217 | 218 | if s1: 219 | s1_w,s1_t,s1l_t,s1r_t=s1 220 | s1l_t=b'~' if s1l_t is None else s1l_t.encode() 221 | s1r_t=b'~' if s1r_t is None else s1r_t.encode() 222 | s1_w=s1_w.encode() 223 | s1_t=s1_t.encode() 224 | else: 225 | s1_w,s1_t,s1l_t,s1r_t=b'~',b'~',b'~',b'~' 226 | 227 | ind=self.lattice.begins.get(span[1],[len(self.f_raw)])[0] 228 | q0_w,q0_t=self.f_raw[ind] if ind=2: 27 | if stack[-1][2]==0 and stack[-1][1]==stack[-2][0] : 28 | stack.pop() 29 | stack[-1][2]-=1 30 | #actions.append(Action.left_reduce) 31 | elif stack[-2][1] == stack[-1][0] : 32 | stack[-2]=stack[-1] 33 | stack.pop() 34 | stack[-1][2]-=1 35 | #actions.append(Action.right_reduce) 36 | else : 37 | break 38 | return actions 39 | 40 | class State (Action,Base_State): 41 | """ 42 | """ 43 | 44 | def __init__(self,bt,lattice): 45 | self.lattice=lattice 46 | state=pickle.loads(bt) 47 | self.ind,self.span,self.stack_top,self.sequence=state 48 | #self.stop_step=2*self.lattice.length-1 49 | self.stop_step=2*self.lattice.length 50 | def shift(self,shift_ind): 51 | item=self.lattice.items[shift_ind] 52 | #next_ind=self.ind+2*len(item[2])-1 53 | next_ind=self.ind+2*len(item[2]) 54 | if next_ind==self.stop_step : next_ind=-1 55 | state=( 56 | next_ind, 57 | (item[0],item[1]), 58 | ( 59 | (shift_ind,None,None), 60 | self.stack_top[0], 61 | self.lattice.items[self.stack_top[1][0]][3] if self.stack_top[1] else None 62 | ), 63 | (shift_ind,self.sequence[0]), 64 | ) 65 | return [(self.shift_action(shift_ind),next_ind,pickle.dumps(state))] 66 | 67 | 68 | class Dep (Base_Dep): 69 | name="依存句法分析" 70 | State=State 71 | Action=Action 72 | Eval=eval.Eval 73 | codec=codec 74 | reduce=None 75 | 76 | 77 | def set_raw(self,raw,_): 78 | """ 79 | 对需要处理的句子做必要的预处理(如缓存特征) 80 | """ 81 | self.lattice=raw 82 | self.cb_fvs=[] 83 | for i,item in enumerate(self.lattice.items): 84 | fv=[] 85 | 86 | for j,c in enumerate(item[2]): 87 | 88 | o=item[0]+j 89 | if item[0]+1==item[1]: 90 | pos=b's' 91 | elif o == item[0] : 92 | pos=b'b' 93 | elif o==item[1]-1 : 94 | pos=b'e' 95 | else : 96 | pos=b'm' 97 | l2=self.lattice.sentence[o-2] if o-2>=0 else '#' 98 | l1=self.lattice.sentence[o-1] if o-1>=0 else '#' 99 | r1=self.lattice.sentence[o+1] if o+1=0 else '#' 83 | l1=self.lattice.sentence[o-1] if o-1>=0 else '#' 84 | r1=self.lattice.sentence[o+1] if o+1 =BackwardsSnippet() 5 | imap IMAP_JumpBack =IMAP_Jumpfunc('b', 0) 6 | imap IMAP_JumpForward =IMAP_Jumpfunc('', 0) 7 | snoremap  i=TriggerSnippet() 8 | vmap IMAP_JumpForward 9 | nmap IMAP_JumpForward 10 | snoremap  b 11 | snoremap % b% 12 | snoremap ' b' 13 | nmap ;ihn :IHN 14 | nmap ;is :IHS :A 15 | nmap ;ih :IHS 16 | map ;fl :NERDTree 17 | map ;tl :TlistToggle 18 | snoremap U bU 19 | snoremap \ b\ 20 | snoremap ^ b^ 21 | snoremap ` b` 22 | nmap gx NetrwBrowseX 23 | snoremap bi 24 | snoremap a 25 | snoremap b 26 | snoremap i=BackwardsSnippet() 27 | nnoremap NetrwBrowseX :call netrw#NetrwBrowseX(expand(""),0) 28 | vmap IMAP_JumpBack ` IMAP_JumpForward i=IMAP_Jumpfunc('', 0) 30 | vmap IMAP_DeleteAndJumpBack "_i=IMAP_Jumpfunc('b', 0) 31 | vmap IMAP_DeleteAndJumpForward "_i=IMAP_Jumpfunc('', 0) 32 | nmap IMAP_JumpBack i=IMAP_Jumpfunc('b', 0) 33 | nmap IMAP_JumpForward i=IMAP_Jumpfunc('', 0) 34 | map :!ctags -R --c++-kinds=+p --fields=+iaS --extra=+q --languages=c++ . 
35 | inoremap  =TriggerSnippet() 36 | imap IMAP_JumpForward 37 | inoremap  =ShowAvailableSnips() 38 | inoremap  omni#cpp#maycomplete#Complete() 39 | inoremap . omni#cpp#maycomplete#Dot() 40 | inoremap : omni#cpp#maycomplete#Scope() 41 | imap ;ihn :IHN 42 | imap ;is :IHS :A 43 | imap ;ih :IHS 44 | inoremap > omni#cpp#maycomplete#Arrow() 45 | let &cpo=s:cpo_save 46 | unlet s:cpo_save 47 | set autoindent 48 | set background=dark 49 | set backspace=indent,eol,start 50 | set completeopt=preview,menuone 51 | set expandtab 52 | set fileencodings=ucs-bom,utf-8,default,latin1 53 | set grepprg=grep\ -nH\ $* 54 | set helplang=cn 55 | set hlsearch 56 | set incsearch 57 | set iskeyword=@,48-57,_,192-255,: 58 | set langmenu=zh_CN.UTF-8 59 | set omnifunc=omni#cpp#complete#Main 60 | set printoptions=paper:a4 61 | set ruler 62 | set runtimepath=~/.vim,/var/lib/vim/addons,/usr/share/vim/vimfiles,/usr/share/vim/vim73,/usr/share/vim/vimfiles/after,/var/lib/vim/addons/after,~/.vim/after 63 | set shiftwidth=4 64 | set smarttab 65 | set suffixes=.bak,~,.swp,.o,.info,.aux,.log,.dvi,.bbl,.blg,.brf,.cb,.ind,.idx,.ilg,.inx,.out,.toc 66 | set tabstop=4 67 | set wildignore=*.pyc 68 | " vim: set ft=vim : 69 | -------------------------------------------------------------------------------- /isan/tagging/PA_segger.py: -------------------------------------------------------------------------------- 1 | from struct import Struct 2 | import isan.tagging.eval as tagging_eval 3 | import isan.tagging.cwstask as cwstask 4 | import isan.tagging.cws as cws 5 | import json 6 | import random 7 | 8 | """ 9 | """ 10 | class Segger(cws.Task): 11 | """告诉isan,这是个什么task""" 12 | name='局部标注中文分词' 13 | 14 | class codec(cws.Task.codec): 15 | """ 16 | 任务的输入和输出是什么,如何从数据文件中获得 17 | """ 18 | @staticmethod 19 | def decode(line): 20 | """ 21 | 编码、解码 22 | 从一行文本中,得到输入(raw)和输出(y) 23 | """ 24 | line=line.strip() 25 | if not line: return [] 26 | if line[0]=='{': 27 | data=json.loads(line) 28 | #if data['Y_b'][1] : return [] 29 | return data 30 | seq=[word for word in line.split()] 31 | raw=''.join(seq) 32 | 33 | return {'raw':raw, 34 | 'y':seq, 35 | 'Y_a' : None, 36 | 'Y_b' : None, 37 | } 38 | 39 | def set_oracle(self,raw,y,Y) : 40 | self.early_stop=None 41 | std_moves=self.result_to_moves(y)#得到标准动作 42 | self.std_moves=std_moves 43 | return std_moves 44 | 45 | def is_belong(self,raw,moves,Y): 46 | 47 | if not Y : 48 | return self.check(self.std_moves,moves) 49 | 50 | return True 51 | seq,intervals=Y 52 | 53 | if intervals : 54 | offset=0 55 | y=self.moves_to_result(moves,raw) 56 | for w in y: 57 | r=intervals[offset][1] 58 | if r!=-1 and offset+len(w)>r : 59 | #print(y) 60 | return False 61 | l=intervals[offset+len(w)][0] 62 | if l!=-1 and l>offset : return False 63 | offset+=len(w) 64 | return True 65 | if seq: 66 | actions=[x[2] for x in moves] 67 | for a,s in zip(actions,seq): 68 | if s and ((s=='s' and a!=self.sep) or (s=='c' and a!=self.com)) : 69 | return False 70 | return True 71 | 72 | def shift(self,last_ind,stat): 73 | """ 74 | 根据当前状态,能产生什么动作,并且后续的状态是什么,就由这个函数决定了 75 | """ 76 | 77 | ind,last,_,wordl,lwordl=self.stat_fmt.unpack(stat) 78 | next_ind=last_ind+1 if last_ind+1 <= len(self.raw) else -1 79 | if self.actions and self.actions[ind]: 80 | if self.actions[ind]=='s': 81 | return [(self.sep,next_ind,self.stat_fmt.pack(ind+1,b'1',last,1,wordl))] 82 | else : 83 | return [(self.com,next_ind,self.stat_fmt.pack(ind+1,b'2',last,wordl+1,lwordl))] 84 | if self.intervals : 85 | rtn=[] 86 | ll,lr=self.intervals[ind-wordl] 87 | rl,rr=self.intervals[ind] 
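# partial-annotation constraints (a reading of the checks below): intervals[i]
# holds (left, right) bounds on the word covering position i, with -1 meaning
# unconstrained; when the bounds already force the decision at this position,
# only that single action (sep or com) is returned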
88 | if lr!=-1 and lr<=ind : 89 | return [(self.sep,next_ind,self.stat_fmt.pack(ind+1,b'1',last,1,wordl))] 90 | if rl!=-1 and ind-wordl=0 : emissions[i-2]+=v[0] 96 | if i-1 >=0 and i-1 < l: emissions[i-1]+=v[1] 97 | if i< l: emissions[i]+=v[2] 98 | 99 | for i,k in enumerate(self.bi) : 100 | # : ## #x xx xx x# ## 101 | # : x x x 102 | if k not in self.bi_d : continue 103 | v=self.bi_d[k] 104 | if i-3 >=0 : emissions[i-3]+=v[0] 105 | if i-2 >=0 and i-2 < l: emissions[i-2]+=v[1] 106 | if i-1 >=0 and i-1 < l: emissions[i-1]+=v[2] 107 | if i< l: emissions[i]+=v[3] 108 | 109 | 110 | def cal_delta(self,std_tags,rst_tags,delta): 111 | l=len(self.raw) 112 | dv=[self._new_vec() for i in range(len(std_tags))] 113 | for i in range(len(std_tags)) : 114 | dv[i][std_tags[i]]+=1 115 | dv[i][rst_tags[i]]-=1 116 | for i,k in enumerate(self.uni) : 117 | if chr(0) in k : continue # used for dropout 118 | if k not in self.uni_d : 119 | self.uni_d[k]=[self.paras.add(self._new_vec()) for v in range(3)] 120 | v_para=self.uni_d[k] 121 | if i-2 >=0 : 122 | v_para[0].add_delta(dv[i-2]) 123 | if i-1 >=0 and i-1=0 : 135 | v_para[0].add_delta(dv[i-3]) 136 | if i-2 >=0 and i-2=0 and i-1"+t+"") 36 | continue 37 | html.append(""+w+"_"+t+"") 38 | print(' '.join(html),"
",file=self.html) 39 | html=[] 40 | for b,w,t in sorted(std): 41 | if (b,w,t) in rst: 42 | continue 43 | if (b,w) in seg_rst: 44 | html.append(w+"_"+t+"") 45 | continue 46 | html.append(""+w+"_"+t+"") 47 | print(' '.join(html),"

",file=self.html) 48 | 49 | """ 50 | tagging_eval std rst 51 | """ 52 | 53 | def str_to_list_old(string): 54 | offset=0 55 | li=[] 56 | #print(string) 57 | for word, tag in [x.split('_') for x in string.split()]: 58 | li.append((offset,offset+len(word),tag)) 59 | offset+=len(word) 60 | return li 61 | 62 | def str_to_list(string): 63 | offset=0 64 | li=[] 65 | #print(string) 66 | for word, tag in [x.split('_') for x in string.split()]: 67 | li.append((offset,(word),tag)) 68 | offset+=len(word) 69 | return li 70 | 71 | class CrossBoundaryErrors(object): 72 | def __init__(self): 73 | self.value=0 74 | def __call__(self,std,rst): 75 | max_ind=max(e for b,e,t in std) 76 | boundaries=[0 for i in range(max_ind+1)] 77 | for b,e,t in std: 78 | boundaries[b]=1 79 | boundaries[e]=1 80 | for b,e,t in rst: 81 | if boundaries[b]==1 and boundaries[e]==1: 82 | continue 83 | if any(boundaries[i]==1 for i in range(b+1,e)): 84 | self.value+=1 85 | 86 | 87 | class TaggingEval: 88 | """ 89 | 分词词性标注评测类 90 | """ 91 | def get_prf(self,seg=False): 92 | """ 93 | 得到评测的结果,准确度、精确度和F1 94 | """ 95 | cor=self.cor if seg==False else self.seg_cor 96 | p=cor/self.rst if self.rst else 0 97 | r=cor/self.std if self.std else 0 98 | f=2*p*r/(r+p) if (r+p) else 0 99 | return p,r,f 100 | def __init__(self,plugins=[],sep='_'): 101 | """ 102 | 初始化 103 | """ 104 | self.otime=time.time() 105 | self.plugins=plugins 106 | self.std,self.rst=0,0 107 | self.cor,self.seg_cor=0,0 108 | self.sep=sep 109 | self.characters=0 110 | self.overlaps=0 111 | self.with_tags=False 112 | def print_result(self): 113 | """ 114 | 打印结果 115 | """ 116 | time_used=time.time()-self.otime 117 | speed=self.characters/time_used 118 | 119 | cor=self.cor 120 | p=cor/self.rst if self.rst else 0 121 | r=cor/self.std if self.std else 0 122 | f=2*p*r/(r+p) if (r+p) else 0 123 | 124 | if self.with_tags : 125 | seg_cor=self.seg_cor 126 | p=seg_cor/self.rst if self.rst else 0 127 | r=seg_cor/self.std if self.std else 0 128 | seg_f=2*p*r/(r+p) if (r+p) else 0 129 | 130 | if self.with_tags : 131 | line=("标准: %d 输出: %d seg正确: %d 正确: %d seg_f1: \033[32;01m%.4f\033[1;m tag_f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 132 | %(self.std,self.rst,self.seg_cor,self.cor,seg_f,f,self.overlaps,time_used,speed)) 133 | else : 134 | line=("标准: %d 输出: %d 正确: %d f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 135 | %(self.std,self.rst,self.cor,f,self.overlaps,time_used,speed)) 136 | print(line,file=sys.stderr) 137 | sys.stderr.flush() 138 | 139 | def get_scaler(self): 140 | 141 | if self.with_tags : 142 | cor=self.cor 143 | p=cor/self.rst if self.rst else 0 144 | r=cor/self.std if self.std else 0 145 | f=2*p*r/(r+p) if (r+p) else 0 146 | return f 147 | else : 148 | seg_cor=self.seg_cor 149 | p=seg_cor/self.rst if self.rst else 0 150 | r=seg_cor/self.std if self.std else 0 151 | seg_f=2*p*r/(r+p) if (r+p) else 0 152 | return seg_f 153 | 154 | def get_result(self): 155 | time_used=time.time()-self.otime 156 | speed=self.characters/time_used 157 | 158 | cor=self.cor 159 | p=cor/self.rst if self.rst else 0 160 | r=cor/self.std if self.std else 0 161 | f=2*p*r/(r+p) if (r+p) else 0 162 | 163 | if self.with_tags : 164 | seg_cor=self.seg_cor 165 | p=seg_cor/self.rst if self.rst else 0 166 | r=seg_cor/self.std if self.std else 0 167 | seg_f=2*p*r/(r+p) if (r+p) else 0 168 | 169 | if self.with_tags : 170 | line=("标准: %d 输出: %d seg正确: %d 正确: %d seg_f1: \033[32;01m%.4f\033[1;m tag_f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 171 | 
%(self.std,self.rst,self.seg_cor,self.cor,seg_f,f,self.overlaps,time_used,speed)) 172 | else : 173 | line=("标准: %d 输出: %d 正确: %d f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 174 | %(self.std,self.rst,self.cor,f,self.overlaps,time_used,speed)) 175 | return line 176 | 177 | def _set_based(self,std,rst): 178 | self.std+=len(std) 179 | self.rst+=len(rst) 180 | self.cor+=len(std&rst) 181 | self.characters+=sum(len(w)for _,w,_ in std) 182 | self.seg_cor+=len({(b,e) for b,e,t in std}&{(b,e) for b,e,t in rst}) 183 | 184 | 185 | std=sorted(list(std)) 186 | rst=sorted(list(rst)) 187 | std_ind=0 188 | rst_ind=0 189 | while rst_ind < len(rst): 190 | b=rst[rst_ind][0] 191 | e=b+len(rst[rst_ind][1]) 192 | while std_ind < len(std) and std[std_ind][0]b and e >ee : # overlap bb b ee e 196 | self.overlaps+=1 197 | std_ind += 1 198 | while std_ind < len(std) and std[std_ind][0]b and e>bb and ee>e : # overlap b bb e ee 202 | self.overlaps+=1 203 | break 204 | std_ind += 1 205 | rst_ind+=1 206 | 207 | def _to_set(self,seq): 208 | s=set() 209 | if type(seq[0])!=str:#word with tag 210 | self.with_tags=True 211 | offset=0 212 | for word,tag in seq: 213 | s.add((offset,word,tag)) 214 | offset+=len(word) 215 | else:#only word 216 | offset=0 217 | for word in seq: 218 | s.add((offset,word,'')) 219 | offset+=len(word) 220 | return s 221 | def __call__(self,std,rst,raw=None): 222 | if not std:return 223 | if not rst:return 224 | self._set_based(self._to_set(std),self._to_set(rst)) 225 | for plugin in self.plugins: 226 | plugin(std,rst) 227 | 228 | def eval_files(self,std_file,rst_file,sep): 229 | for g,r in zip(open(std_file),open(rst_file)): 230 | gl=sum(len(x.partition(self.sep)[0])for x in g.split()) 231 | rl=sum(len(x.partition(self.sep)[0])for x in r.split()) 232 | if(gl!=rl): 233 | print("---") 234 | print(g.strip()) 235 | print(r.strip()) 236 | assert(gl==rl) 237 | g=g.split() 238 | r=r.split() 239 | #print(g) 240 | #print(r) 241 | if all(sep in x for x in g) and all(sep in x for x in r) : 242 | g=[x.split(sep) for x in g] 243 | r=[x.split(sep) for x in r] 244 | self(g,r) 245 | else : 246 | self(g,r) 247 | if __name__=="__main__": 248 | import argparse 249 | parser=argparse.ArgumentParser(description="用于分词词性标注的评测和比较") 250 | parser.add_argument('std',help='被比较的标注结果') 251 | parser.add_argument('rst',help='用以比较的标注结果',nargs='?',default='-') 252 | parser.add_argument('-s','--separator',help='词和词性间的分隔符',dest='sep',default='_') 253 | parser.add_argument('-d','--diff',help='指定以html格式输出的显示差异的文件的名字',dest='diff_file') 254 | args=parser.parse_args() 255 | 256 | plugins=[] 257 | if args.diff_file!=None: 258 | plugins.append(DiffToHTML(args.diff_file)) 259 | eval=TaggingEval(plugins,sep=args.sep) 260 | eval.eval_files(args.std,args.rst,args.sep) 261 | p,r,f=eval.get_prf() 262 | sp,sr,sf=eval.get_prf(True) 263 | print(eval.std,eval.rst,eval.cor,"%.4f|%.4f|%.4f"%(p,r,f),"%.4f|%.4f|%.4f"%(sp,sr,sf)) 264 | 265 | -------------------------------------------------------------------------------- /isan/tagging/ss.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import gzip 4 | import numpy as np 5 | import pickle 6 | import random 7 | 8 | from isan.common.parameters import Para_Dict 9 | 10 | class Word : 11 | def close(self): 12 | if self.use_hidden : 13 | if 'save' in self.use_hidden : 14 | fi=open(self.use_hidden['save'],'w') 15 | print(json.dumps(self.M.tolist()),file=fi) 16 | print(json.dumps(self.b.tolist()),file=fi) 17 | fi.close() 18 | pass 19 | 
pass 20 | def __init__(self,args={},model=None,paras=None): 21 | self.paras=paras 22 | 23 | #print(args) 24 | self.s={} ## ?? 25 | if model == None : 26 | words={} 27 | size=0 28 | for line in open(args['words']) : 29 | word,*vs = line.split() 30 | vs=list(map(float,vs)) 31 | size=len(vs) 32 | words[word]=np.array(vs) 33 | 34 | 35 | self.use_hidden=args.get('hidden',None) 36 | 37 | 38 | self.words=words 39 | self.zw=np.zeros(size) 40 | self.size=size 41 | 42 | self.d=self.paras.add({}) 43 | 44 | self.d_hidden=np.zeros(self.size) 45 | 46 | np.random.seed(0) 47 | 48 | 49 | self.M=np.random.uniform(-0.8,0.8,(self.size,self.size)) 50 | self.b=np.zeros(self.size) 51 | if self.use_hidden and 'load' in self.use_hidden : 52 | m,b=open(self.use_hidden['load']).read().splitlines() 53 | self.M=np.array(json.loads(m)) 54 | self.b=np.array(json.loads(b)) 55 | 56 | 57 | if self.use_hidden and 'update' in self.use_hidden : 58 | self.M=self.paras.add(self.M) 59 | self.b=self.paras.add(self.b) 60 | 61 | 62 | self.s={k:v.copy()for k,v in self.d.items()} 63 | 64 | else : 65 | for k,v in model.items(): 66 | setattr(self,k,v) 67 | 68 | def dump_weights(self): 69 | d={} 70 | self.d=self.d.output_obj() 71 | if not self.use_hidden : 72 | for k in ['use_hidden','size','d','words','zw']: 73 | d[k]=getattr(self,k) 74 | else : 75 | if 'update' in self.use_hidden : 76 | self.M=self.M.output_obj() 77 | self.b=self.b.output_obj() 78 | for k in ['use_hidden','size','d','words','zw','M','b']: 79 | d[k]=getattr(self,k) 80 | return d 81 | 82 | def add_model(self,model): 83 | for k,v in model.items(): 84 | if k not in ['d'] : 85 | setattr(self,k,v) 86 | else : 87 | getattr(self,k).add_model(v) 88 | 89 | 90 | def set_raw(self,atoms): 91 | self.atoms=atoms 92 | self.sen_word_vecs=[] 93 | self.sen_hidden_vecs=[] 94 | for w,*_ in atoms : 95 | wv=self.words.get(w,self.zw) 96 | self.sen_word_vecs.append(wv) 97 | if self.use_hidden : 98 | hidden=np.dot(self.M,wv)+self.b 99 | hidden=np.tanh(hidden) 100 | self.sen_hidden_vecs.append(hidden) 101 | else : 102 | self.sen_hidden_vecs.append(wv) 103 | 104 | 105 | def __call__(self,ind1,ind2,ind3,delta=0) : 106 | word2,t2,*_=self.atoms[ind2] # word on the top of the stack 107 | word3,t3,*_=self.atoms[ind3] # next word 108 | # get the vector 109 | w2=self.sen_hidden_vecs[ind2] 110 | w3=self.sen_hidden_vecs[ind3] 111 | 112 | wv2=self.sen_word_vecs[ind2] 113 | wv3=self.sen_word_vecs[ind3] 114 | 115 | score=0 116 | 117 | if delta ==0 : # cal the network, not update 118 | if t3 in self.d : 119 | score+=np.dot(w3,self.d([t3])) 120 | if t2!='~' : 121 | if t2 in self.d : 122 | score+=np.dot(w3,self.d(['l'+t2])) 123 | if t3 in self.d : 124 | score+=np.dot(w2,self.d(['r'+t3])) 125 | else : # calculate the grad 126 | self.d.add_delta([t3],w3*delta) 127 | 128 | if self.use_hidden and 'update' in self.use_hidden : 129 | d_hidden=self.d([t3])*(1-w3**2) 130 | dM=(d_hidden[:,np.newaxis]*wv3) 131 | self.b.add_delta(d_hidden*delta) 132 | self.M.add_delta(dM*delta) 133 | 134 | # grad of 135 | if t2!='~' : 136 | self.d.add_delta(['l'+t2],w3*delta) 137 | if self.use_hidden and 'update' in self.use_hidden : 138 | d_hidden=self.d(['l'+t2])*(1-w3**2) 139 | dM=(d_hidden[:,np.newaxis]*wv3) 140 | self.b.add_delta(d_hidden*delta) 141 | self.M.add_delta(dM*delta) 142 | 143 | self.d.add_delta(['r'+t3],w2*delta) 144 | if self.use_hidden and 'update' in self.use_hidden : 145 | d_hidden=self.d(['r'+t3])*(1-w2**2) 146 | dM=(d_hidden[:,np.newaxis]*wv2) 147 | self.b.add_delta(d_hidden*delta) 148 | self.M.add_delta(dM*delta) 
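                    # the hidden vector here is h=tanh(M·wv+b), so the three update branches
                    # above all push the gradient back through tanh: d_hidden = output_weights*(1-h**2),
                    # dM = outer(d_hidden, wv), each scaled by the perceptron update direction delta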
149 | return score 150 | -------------------------------------------------------------------------------- /isan/tagging/wb_tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import pickle 3 | import time 4 | import math 5 | import sys 6 | 7 | import numpy as np 8 | import gzip 9 | 10 | from isan.common.parameters import Para_Dict 11 | from isan.common.task import Lattice, Base_Task, Early_Stop_Pointwise 12 | from isan.tagging.eval import TaggingEval as Eval 13 | from isan.tagging.ss import Word as Word 14 | from isan.tagging.wb_tag_symbolic import Base_Features 15 | 16 | 17 | """ 18 | word-based tagging 19 | """ 20 | 21 | class codec : 22 | @staticmethod 23 | def decode(line): 24 | """ 25 | 编码、解码 26 | 从一行文本中,得到输入(raw)和输出(y) 27 | """ 28 | if not line: return [] 29 | log2=math.log(2) 30 | line=list(map(lambda x:x.split(','), line.split())) 31 | for i,it in enumerate(line): 32 | if len(it)!=6 : 33 | l=it[:3] 34 | r=it[-2:] 35 | m=it[3:-2] 36 | line[i]=l+[','.join(m)]+r 37 | 38 | line=[[int(label),int(b),int(e),w,t,float(conf)] for label,b,e,w,t,conf in line] 39 | items2=[] 40 | gold=[] 41 | for l,b,e,w,t,conf in line : 42 | if conf <= -1 : 43 | conf = None 44 | else : 45 | pass 46 | #conf = conf/1000 47 | items2.append((b,e,(w,t,conf))) 48 | if l ==1 : 49 | gold.append((w,t)) 50 | raw=Lattice(items2) 51 | return {'raw':raw, 52 | 'y':gold, } 53 | @staticmethod 54 | def encode(y): 55 | return ' '.join(y) 56 | 57 | class State (list): 58 | init_state=pickle.dumps((-1,-1)) 59 | 60 | decode=pickle.loads 61 | encode=pickle.dumps 62 | 63 | def __init__(self,lattice,bt=init_state): 64 | self.extend(pickle.loads(bt)) 65 | self.lattice=lattice 66 | 67 | def shift(self,showall=False): 68 | begin=0 if self[1]==-1 else self.lattice[self[1]][1] 69 | 70 | if begin not in self.lattice.begins : return [] 71 | 72 | b=self.lattice.begins[begin] 73 | 74 | return [[n,pickle.dumps((self[1],n))] 75 | for n in self.lattice.begins[begin] 76 | if (self.lattice[n][2][-1] is not None or showall) 77 | ] 78 | 79 | def dumps(self): 80 | return pickle.dumps(tuple(self)) 81 | 82 | @staticmethod 83 | def load(bt): 84 | return pickle.loads(bt) 85 | 86 | 87 | 88 | 89 | class Path_Finding (Early_Stop_Pointwise, Base_Task): 90 | """ 91 | finding path in a DAG 92 | """ 93 | name='joint chinese seg&tag from a word-tag lattice' 94 | codec=codec 95 | State=State 96 | Eval=Eval 97 | 98 | 99 | #~~~~~~~~~~~~~~~~~~~~~~~~~~ 100 | # init and weights 101 | 102 | def __init__(self,cmd_args,model=None,paras=None,logger=None): 103 | self.models={} 104 | self.build_ins={'word':Word,'base': Base_Features } 105 | 106 | if model==None : 107 | self.paras=paras 108 | 109 | self.models['base']=self.build_ins['base'](args=None,paras=self.paras) 110 | 111 | if hasattr(cmd_args,'task_features'): 112 | for k,v in cmd_args.task_features.items(): 113 | self.models[k]=self.build_ins[k](args=v,paras=self.paras) 114 | else : 115 | for k,v in model.items(): 116 | self.models[k]=self.build_ins[k](model=v) 117 | 118 | def dump_weights(self) : 119 | d={k:v.dump_weights() for k,v in self.models.items()} 120 | return d 121 | 122 | def add_model(self,model): 123 | for k,v in model.items(): 124 | self.models[k].add_model(v) 125 | 126 | 127 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 128 | # actions 129 | 130 | class Action : 131 | @staticmethod 132 | def encode(action): 133 | return action[0] 134 | @staticmethod 135 | def decode(action): 136 | return (action,None) 137 | 138 | def result_to_actions(self,result): 
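        # result_to_actions maps the gold (word,tag) sequence onto lattice edges:
        # at each character offset it keeps the edge in self.lattice.begins[offset]
        # whose (word,tag) matches the gold item, emits it as a shift action,
        # and advances the offset by the word length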
139 | offset=0 140 | actions=[] 141 | for g in result : 142 | nex=[[ind,self.lattice[ind]] for ind in self.lattice.begins[offset]] 143 | nex=[ind for ind, it in nex if (it[2][0],it[2][1])==g] 144 | actions.append((nex[0],None)) 145 | offset+=len(g[0]) 146 | return actions 147 | 148 | def actions_to_result(self,actions): 149 | seq=[self.lattice[action[0]] for action in actions] 150 | seq=[(it[0],it[1])for _,_,it in seq] 151 | return seq 152 | 153 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 154 | # states 155 | 156 | def _next_ind(self,last_ind,action): 157 | next_ind=last_ind+len(self.lattice[action][2][0]) 158 | return next_ind if next_ind != self.lattice.length else -1 159 | 160 | def shift(self,last_ind,stat,showall=False): 161 | rtn= [(a,self._next_ind(last_ind,a),s) 162 | for a,s in self.State(self.lattice,stat).shift(showall)] 163 | return rtn 164 | 165 | reduce=None 166 | 167 | 168 | def actions_to_moves(self,actions,lattice): 169 | state=self.State(lattice) 170 | stack=[state] 171 | moves=[[None,None,action] for action in actions] 172 | moves[0][0]=0 173 | moves[0][1]=self.State.init_state 174 | for i in range(len(moves)-1) : 175 | move=moves[i] 176 | step,state,action=move 177 | ind,label=action 178 | if ind >=0 : # shift 179 | rst=[[nstep,ns] for a,nstep,ns in self.shift(step,state,True) if a==self.Action.encode(action)] 180 | moves[i+1][0],moves[i+1][1]=rst[0] 181 | stack.append(rst[0][1]) 182 | else : # reduce 183 | s0=stack.pop() 184 | s1=stack.pop() 185 | rst=[[nstep,ns] for a,nstep,ns,_ in self.reduce(step,s0,[0],[s1]) if a==self.Action.encode(action)] 186 | moves[i+1][0],moves[i+1][1]=rst[0] 187 | stack.append(rst[0][1]) 188 | pass 189 | for move in moves: 190 | move[2]=self.Action.encode(move[2]) 191 | 192 | moves=list(map(tuple,moves)) 193 | return moves 194 | 195 | # feature related 196 | 197 | def set_raw(self,raw,Y): 198 | self.lattice=raw 199 | self.atoms=[] 200 | for ind in range(len(self.lattice)): 201 | data=self.lattice[ind] 202 | b=data[0] 203 | e=data[1] 204 | w,t,m=data[2] 205 | self.atoms.append((w,t,m,str(len(w)))) 206 | self.atoms.append(('~','~','','0')) 207 | 208 | for model in self.models.values() : 209 | model.set_raw(self.atoms) 210 | 211 | def gen_features(self,state,actions,delta=0): 212 | ind1,ind2=self.State(self.lattice,state) 213 | scores=[[sum(model(ind1,ind2,ind3,delta) for model in self.models.values())] 214 | for ind3 in actions] 215 | return scores 216 | 217 | def cal_delta(self,std_moves,rst_moves) : 218 | delta=0.0001 #### TODO: delta 219 | dirty=set() 220 | for b,e,data in self.lattice : 221 | if data[-1]==None : 222 | for x in range(b,e) : 223 | dirty.add(x) 224 | 225 | max_step=max(x[0] for x in rst_moves) 226 | std_moves=set(x for x in std_moves if x[0]<=max_step) 227 | rst_moves=set(rst_moves) 228 | for m in std_moves-rst_moves : 229 | a,b=pickle.loads(m[1]) 230 | c=m[-1] 231 | flag=True 232 | for x in [a,b,c] : 233 | if x==-1 : continue 234 | l,r,_=self.lattice[x] 235 | for ind in range(l,r): 236 | if ind in dirty : flag=False 237 | if flag : 238 | pass 239 | self._update(m,delta) 240 | for m in rst_moves-std_moves : 241 | a,b=pickle.loads(m[1]) 242 | c=m[-1] 243 | flag=True 244 | for x in [a,b,c] : 245 | if x==-1 : continue 246 | l,r,_=self.lattice[x] 247 | for ind in range(l,r): 248 | if ind in dirty : flag=False 249 | if flag : 250 | pass 251 | self._update(m,-delta) 252 | 253 | def __del__(self): 254 | for model in self.models.values() : 255 | if hasattr(model,'close') : 256 | model.close() 257 | 
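The lattice-line format consumed by codec.decode above is easiest to see on a toy line (the tokens below are made up for illustration; each whitespace-separated token is "label,begin,end,word,tag,confidence", and label 1 marks an edge on the gold path):

    line = '1,0,2,你好,IJ,0.9 0,0,1,你,PN,0.2 1,2,3,吗,SP,0.8'
    data = codec.decode(line)
    # data['raw'] is a Lattice of (begin, end, (word, tag, conf)) edges;
    # a confidence of -1 or lower is mapped to None
    # data['y'] is the gold word/tag sequence: [('你好', 'IJ'), ('吗', 'SP')]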
-------------------------------------------------------------------------------- /isan/tagging/wb_tag_symbolic.py: -------------------------------------------------------------------------------- 1 | import math 2 | class Base_Features : 3 | def __init__(self,args={},model=None,paras=None): 4 | if model == None : 5 | self.w=paras.add({}) 6 | else : 7 | self.w=model 8 | 9 | def dump_weights(self): 10 | return self.w.output_obj() 11 | 12 | def add_model(self,model): 13 | self.w.add_model(model) 14 | pass 15 | 16 | def set_raw(self,atoms): 17 | self.atoms=atoms 18 | 19 | def __call__(self,ind1,ind2,ind3,delta=0) : 20 | strm=lambda x:'x' if x=='' else str(math.floor(math.log((x if x>0 else 0)*2+1))) 21 | w1,t1,m1,len1=self.atoms[ind1] 22 | w2,t2,m2,len2=self.atoms[ind2] 23 | w3,t3,m3,len3=self.atoms[ind3] 24 | fv=( 25 | (['m3~'+strm(m3), ] if m3 is not None else []) + 26 | ([ 'm3m2~'+strm(m3)+'~'+strm(m2), ] if m3 is not None and m2 is not None else [])+ 27 | [ 28 | 'w3~'+w3, 't3~'+t3, 'l3~'+len3, 'w3t3~'+w3+t3, 'l3t3~'+len3+t3, 29 | 30 | 'w3w2~'+w3+"~"+w2, 'w3t2~'+w3+t2, 't3w2~'+t3+w2, 't3t2~'+t3+t2, 31 | 32 | 'l3w2~'+len3+'~'+w2, 'w3l2~'+w3+'~'+len2, 'l3t2~'+len3+'~'+t2, 't3l2~'+t3+'~'+len2, 33 | 'l3l2~'+len3+'~'+len2, 34 | 35 | 't3t1~'+t3+'~'+t1, 't3t2t1~'+t3+'~'+t2+'~'+t1, 36 | 'l3l1~'+len3+'~'+len1, 'l3l2l1~'+len3+'~'+len2+'~'+len1, 37 | ]) 38 | 39 | if delta==0 : 40 | v= float(self.w(fv)) 41 | return v 42 | else : 43 | self.w.add_delta(fv,delta*10) 44 | return 0 45 | 46 | -------------------------------------------------------------------------------- /isan/utls/Makefile: -------------------------------------------------------------------------------- 1 | cxx=g++ -O3 -I ~/isan/ -I /usr/include/python3.2mu -shared -fPIC 2 | 3 | all: dat 4 | 5 | 6 | dat: dat.cc *.hpp ../common/*.hpp 7 | $(cxx) -g -o dat.so dat.cc 8 | -------------------------------------------------------------------------------- /isan/utls/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 在 `isan.utls` 下面有很多实用工具 4 | """ 5 | 6 | def test(): 7 | print("this is a test function") 8 | -------------------------------------------------------------------------------- /isan/utls/average.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import argparse 3 | import gzip 4 | import pickle 5 | import math 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | argv=sys.argv[1:] 11 | dst=argv[-1] 12 | models=argv[:-1] 13 | 14 | weights={} 15 | numbers={} 16 | 17 | #models=['model_train_'+str(x)+'.txt' for x in [0,1,2,3,4]] 18 | #models=['model_'+str(x)+'.gz' for x in [1,2,3,4,5]] 19 | for model in models: 20 | print(model) 21 | for k,v in pickle.load(gzip.open(model)).items(): 22 | if k not in weights : 23 | weights[k]=0 24 | numbers[k]=0 25 | weights[k]+=v 26 | if v!=0 : numbers[k]+=1 27 | 28 | 29 | for k,n in numbers.items(): 30 | if n!=0 : 31 | #weights[k]=round(weights[k]/max(n-0.5,1)) 32 | weights[k]=round(weights[k]/n) 33 | #weights[k]=round(weights[k]/len(models)) 34 | 35 | pickle.dump(weights,gzip.open(dst,'wb')) 36 | 37 | -------------------------------------------------------------------------------- /isan/utls/cdat2/Makefile: -------------------------------------------------------------------------------- 1 | cdat.so: cdat.cc 2 | g++ cdat.cc -I /usr/include/python3.2mu -shared -o cdat.so -fPIC 3 | -------------------------------------------------------------------------------- 
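A note on the weight-averaging utility a few files above (isan/utls/average.py): it sums several gzipped, pickled weight dictionaries and divides each feature's total by the number of models in which that feature was non-zero (rounding to an integer), so a feature fired by only some of the models is averaged over those models only. A hypothetical invocation (the model file names are made up):

    python3 isan/utls/average.py model_fold1.gz model_fold2.gz model_fold3.gz model_avg.gz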
/isan/utls/cdat2/cdat.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "dat.h" 4 | /** 5 | * g++ spammodule.c -I /usr/include/python3.2mu -shared -o spam.so -fPIC 6 | * */ 7 | 8 | 9 | using namespace dat; 10 | 11 | static PyObject * 12 | cdat_open(PyObject *self, PyObject *args) 13 | { 14 | char* filename=NULL; 15 | PyArg_ParseTuple(args,"s",&filename); 16 | DAT* dat=new DAT(filename); 17 | return PyLong_FromLong((size_t)dat); 18 | 19 | } 20 | static PyObject * 21 | cdat_close(PyObject *self, PyObject *args) 22 | { 23 | PyObject* handler=NULL; 24 | PyArg_ParseTuple(args,"O",&handler); 25 | delete (DAT*)PyLong_AsLong(handler); 26 | return Py_None; 27 | } 28 | static PyObject * 29 | cdat_get(PyObject *self, PyObject *args) 30 | { 31 | PyObject* handler=NULL; 32 | PyObject* py_key=NULL; 33 | int no_prefix=0; 34 | PyArg_ParseTuple(args,"OOi",&handler,&py_key,&no_prefix); 35 | DAT* dat=(DAT*)PyLong_AsLong(handler); 36 | 37 | Word key; 38 | size_t key_size=PySequence_Size(py_key); 39 | for(size_t i=0;ip_match(key); 45 | }else{ 46 | ind=dat->match(key); 47 | }; 48 | if (ind==-1) return Py_None; 49 | return PyLong_FromLong(dat->dat[ind].base); 50 | } 51 | static PyObject * 52 | cdat_set(PyObject *self, PyObject *args) 53 | { 54 | PyObject* handler=NULL; 55 | PyObject* py_key=NULL; 56 | int value=0; 57 | PyArg_ParseTuple(args,"OOi",&handler,&py_key,&value); 58 | DAT* dat=(DAT*)PyLong_AsLong(handler); 59 | 60 | Word key; 61 | size_t key_size=PySequence_Size(py_key); 62 | for(size_t i=0;ip_match(key); 66 | if (ind==-1) return Py_None; 67 | dat->dat[ind].base=value; 68 | return PyLong_FromLong(value); 69 | } 70 | static PyObject * 71 | cdat_inc(PyObject *self, PyObject *args) 72 | { 73 | PyObject* handler=NULL; 74 | PyObject* py_key=NULL; 75 | int value=0; 76 | int no_prefix=0; 77 | PyArg_ParseTuple(args,"OOii",&handler,&py_key,&no_prefix,&value); 78 | DAT* dat=(DAT*)PyLong_AsLong(handler); 79 | 80 | Word key; 81 | size_t key_size=PySequence_Size(py_key); 82 | for(size_t i=0;ip_match(key); 86 | int ind=0; 87 | if(no_prefix){ 88 | ind=dat->p_match(key); 89 | }else{ 90 | ind=dat->match(key); 91 | }; 92 | if (ind==-1) return Py_None; 93 | dat->dat[ind].base+=value; 94 | return PyLong_FromLong(dat->dat[ind].base); 95 | } 96 | static PyObject * 97 | cdat_build(PyObject *self, PyObject *args) 98 | { 99 | PyObject* list=NULL; 100 | char* filename=NULL; 101 | PyArg_ParseTuple(args,"sO",&filename,&list); 102 | DATMaker dm; 103 | std::vector lexicon; 104 | size_t size=PySequence_Size(list); 105 | for(size_t i=0;ivalue=(int)PyLong_AsLong(PySequence_GetItem(line,1)); 112 | size_t key_size=PySequence_Size(py_key); 113 | for(size_t i=0;ikey.push_back(*(int*)PyUnicode_AS_UNICODE(PySequence_GetItem(py_key,i))); 115 | } 116 | } 117 | dm.make_dat(lexicon,true); 118 | dm.shrink(); 119 | dm.save_as(filename); 120 | fprintf(stderr,"size of DAT %d\n",(int)dm.dat_size); 121 | return Py_None; 122 | }; 123 | 124 | 125 | static PyMethodDef cdatMethods[] = { 126 | {"build", cdat_build, METH_VARARGS,""}, 127 | {"open", cdat_open, METH_VARARGS,""}, 128 | {"close", cdat_close, METH_VARARGS,""}, 129 | {"get", cdat_get, METH_VARARGS,""}, 130 | {"set", cdat_set, METH_VARARGS,""}, 131 | {"inc", cdat_inc, METH_VARARGS,""}, 132 | {NULL, NULL, 0, NULL} /* Sentinel */ 133 | }; 134 | static struct PyModuleDef cdatmodule = { 135 | PyModuleDef_HEAD_INIT, 136 | "spam", /* name of module */ 137 | NULL, /* module documentation, may be NULL */ 138 | -1, /* size of 
per-interpreter state of the module, 139 | or -1 if the module keeps state in global variables. */ 140 | cdatMethods 141 | }; 142 | PyMODINIT_FUNC 143 | PyInit_cdat(void) 144 | { 145 | return PyModule_Create(&cdatmodule); 146 | } 147 | -------------------------------------------------------------------------------- /isan/utls/cdat2/dat_builder.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include"dat.h" 9 | 10 | using namespace dat; 11 | 12 | 13 | void showhelp(){ 14 | printf("双数组TRIE树构建器\n\t作者:张开旭\n"); 15 | printf(" get words and make DAT\n"); 16 | printf("-f filename\n use filename instead of stdin\n"); 17 | printf("-s\n save base array and check array Seperately\n"); 18 | printf("-P\n 申明没有一个词是另一个词的前缀,将编号存在base,而不是base指向的节点\n"); 19 | 20 | } 21 | int main(int argc,char **argv){ 22 | int c; 23 | int is_old_style=false; 24 | char* lexicon_filename=NULL; 25 | int no_prefix=0; 26 | char separator=0; 27 | while ( (c = getopt(argc, argv, "f:shPi")) != -1) { 28 | switch (c) { 29 | case 'i':// the index is 30 | separator=' '; 31 | break; 32 | case 's'://seperated two arrays 33 | is_old_style=true; 34 | break; 35 | case 'P'://prefix free 36 | no_prefix=true; 37 | break; 38 | case 'f' : //specify the file 39 | lexicon_filename = optarg; 40 | break; 41 | case 'h' : 42 | case '?' : 43 | default : 44 | showhelp(); 45 | return 1; 46 | } 47 | } 48 | char* dat_filename=argv[optind]; 49 | 50 | //输入文件名 51 | FILE* inputFile=stdin; 52 | std::istream* is=&std::cin; 53 | std::cout<<"begin\n"; 54 | std::string str; 55 | if(lexicon_filename){ 56 | std::cout<<"file\n"; 57 | is=new std::ifstream(lexicon_filename,std::ifstream::in); 58 | } 59 | 60 | 61 | DATMaker dm; 62 | fprintf(stderr,"Double Array Trie Builder, author ZHANG, Kaixu\n"); 63 | std::vector lexicon; 64 | lexicon.push_back(DATMaker::KeyValue()); 65 | int end_character=0; 66 | 67 | //load wordlist 68 | int id=0; 69 | 70 | void* rtn; 71 | do{ 72 | rtn=std::getline(*is,str); 73 | if(str.length()==0)continue; 74 | if(separator){//to find a score as value instread of id 75 | int sep_ind=str.rfind(separator); 76 | //thulac::string_to_raw(str.substr(0,sep_ind),lexicon.back().key); 77 | //std::cout<0){ 95 | if(separator){//to find a score as value instread of id 96 | int sep_ind=lexicon.back().key.rfind(separator); 97 | std::cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "isan/common/common.hpp" 7 | #include "dat.hpp" 8 | 9 | using namespace isan; 10 | 11 | static PyObject * 12 | make_dat(PyObject *self, PyObject *arg){ 13 | std::cout<<"hello\n"; 14 | 15 | PyObject * py_list; 16 | PyArg_ParseTuple(arg, "O", &py_list); 17 | std::vector > list; 18 | list.clear(); 19 | 20 | 21 | long size=PySequence_Size(py_list); 22 | std::cerr<<"list size: "<()); 27 | list.back().first=Dict_Item(PyTuple_GET_ITEM(tri,0)); 28 | list.back().second=PyLong_AsLong(PyTuple_GET_ITEM(tri,1)); 29 | }; 30 | 31 | std::sort(list.begin(),list.end(),item_cmp); 32 | DATMaker dm; 33 | dm.make_dat(list,0); 34 | dm.shrink(); 35 | std::cout< 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "isan/common/general_types.hpp" 10 | 11 | 12 | namespace isan{ 13 | 14 | typedef Feature_String Dict_Item; 15 | typedef Feature_String Word; 16 | typedef char Character; 17 | 18 | 19 | 20 | class DAT{ 21 | public: 22 | struct Entry{ 23 | int base; 24 | int check; 25 | }; 26 | 27 | void* mmap_ptr; 28 | Entry* dat; 
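        // base/check double-array trie: the child of node `base` for character c
        // sits at dat[dat[base].base + c] and is valid only when its .check equals base;
        // after a full match, dat[base].base points at the cell that stores the value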
29 | size_t dat_size; 30 | DAT():mmap_ptr(NULL){}; 31 | DAT(const char* filename){ 32 | 33 | FILE * pFile=fopen(filename,"r+b"); 34 | if(!pFile){ 35 | fprintf(stderr,"[ERROR] DAT file %s not found\n",filename); 36 | } 37 | fseek(pFile,0,SEEK_END); 38 | dat_size=ftell(pFile)/sizeof(Entry); 39 | rewind(pFile); 40 | int rtn; 41 | fclose(pFile); 42 | 43 | int fd=open(filename,O_RDWR); 44 | mmap_ptr=mmap(NULL,sizeof(Entry)*dat_size,PROT_READ|PROT_WRITE,MAP_SHARED,fd,0); 45 | dat=(Entry*)mmap_ptr; 46 | close(fd); 47 | } 48 | 49 | void save_as(const char* filename){ 50 | FILE * pFile=fopen(filename,"wb"); 51 | fwrite(dat,sizeof(Entry),dat_size,pFile); 52 | fclose(pFile); 53 | } 54 | inline int get(const Word& word)const{ 55 | register int ind=0; 56 | register int base=0; 57 | for(int i=0;i=dat_size)||dat[ind].check!=base)return 0; 60 | base=ind; 61 | } 62 | ind=dat[base].base; 63 | if((indmmap_ptr){ 70 | msync(this->mmap_ptr,sizeof(Entry)*this->dat_size,MS_ASYNC); 71 | munmap(this->mmap_ptr,sizeof(Entry)*this->dat_size); 72 | }else{ 73 | free(dat); 74 | } 75 | } 76 | inline int get_index(int base,const Character& character){ 77 | int ind=dat[base].base+character; 78 | if((ind>=dat_size)||dat[ind].check!=base)return -1; 79 | return ind; 80 | }; 81 | /*return -base or number of matched characters*/ 82 | int get_info(Word& prefix){ 83 | register int ind=0; 84 | register int base=0; 85 | for(size_t i=0;i=dat_size)||dat[ind].check!=base)return i; 88 | base=ind; 89 | } 90 | return -base; 91 | } 92 | }; 93 | 94 | 95 | class DATMaker: public DAT{ 96 | public: 97 | typedef std::pair KeyValue; 98 | static bool compare_words (const DATMaker::KeyValue& first, const DATMaker::KeyValue& second) 99 | { 100 | const Word& first_word=first.first; 101 | const Word& second_word=second.first; 102 | size_t min_size=(first_word.size()second_word[i])return false; 105 | if(first_word[i]=0)printf("cell reused!!\n"); 125 | if(dat[ind].base==1){ 126 | head=dat[ind].check; 127 | }else{ 128 | dat[-dat[ind].base].check=dat[ind].check; 129 | }; 130 | if(dat[ind].check==-dat_size){ 131 | tail=dat[ind].base; 132 | }else{ 133 | dat[-dat[ind].check].base=dat[ind].base; 134 | }; 135 | dat[ind].check=ind; 136 | }; 137 | void extends(){ 138 | int old_size=dat_size; 139 | dat_size*=2; 140 | dat=(Entry*)realloc(dat,sizeof(Entry)*dat_size); 141 | for(int i=0;i=0)dat[-tail].check=-old_size; 147 | tail=-(old_size*2-1); 148 | } 149 | void shrink(){//thrink之后双向链表就不需要保持了 150 | int last=dat_size-1; 151 | while(dat[last].check<0)last--; 152 | dat_size=last+1; 153 | dat=(Entry*)realloc(dat,sizeof(Entry)*dat_size); 154 | } 155 | 156 | int alloc(std::vector& offsets){ 157 | size_t size=offsets.size(); 158 | register size_t base=-head; 159 | while(1){ 160 | if(base==dat_size)extends(); 161 | if(size) 162 | while(base+offsets[size-1]>=dat_size) 163 | extends(); 164 | register int flag=true; 165 | if(dat[base].check>=0){ 166 | flag=false; 167 | }else{ 168 | for(register int i=0;i=0){//used 170 | flag=false; 171 | break; 172 | } 173 | } 174 | } 175 | if(flag){ 176 | use(base); 177 | for(int i=0;i& lexicon,int start,const Word& prefix,std::vector&children){ 185 | children.clear(); 186 | size_t l=prefix.size(); 187 | for(size_t ind=start;indl){ 192 | if(children.empty()||word[l]!=children.back()) 193 | children.push_back(word[l]); 194 | } 195 | } 196 | } 197 | int assign(int check,std::vector& offsets,int is_word=false){ 198 | int base=alloc(offsets); 199 | //base 200 | dat[base].base=0; 201 | if(is_word){//如果是词 202 | dat[base].check=check; 203 | 
}else{//如果不是词 204 | dat[base].check=base; 205 | } 206 | 207 | for(int i=0;i<(int)offsets.size();i++){ 208 | dat[base+offsets[i]].base=0; 209 | dat[base+offsets[i]].check=check; 210 | } 211 | dat[check].base=base; 212 | //printf("is_word %d base %d\n",is_word,base); 213 | return base; 214 | } 215 | void make_dat(std::vector& lexicon,int no_prefix=0){ 216 | std::sort(lexicon.begin(),lexicon.end(),&compare_words); 217 | 218 | int size=(int)lexicon.size(); 219 | //std::cout< children; 221 | Word prefix; 222 | //prefix.clear(); 223 | gen_children(lexicon,0,prefix,children); 224 | //std::cout<<"children size: "<get_info(word); 231 | //std::cout<<"off: "<get_info(prefix); 241 | //std::cout<<"p_base "<get_info(word); 247 | if(no_prefix){ 248 | dat[off].base=lexicon[i].second; 249 | }else{ 250 | dat[dat[off].base].base=lexicon[i].second; 251 | } 252 | //if(i&&(i%100000==0))printf("%f\n",(double)i/size); 253 | //int zkx; 254 | //std::cin>>zkx; 255 | } 256 | 257 | } 258 | 259 | void print(){ 260 | printf("head %d, tail %d\n",head,tail); 261 | for(int i=0;i<(int)dat_size;i++) 262 | printf("[%d,%d,%d] ",i,dat[i].base,dat[i].check); 263 | printf("\n"); 264 | } 265 | }; 266 | 267 | 268 | };//end of isan 269 | -------------------------------------------------------------------------------- /isan/utls/divde.py: -------------------------------------------------------------------------------- 1 | divide.py -------------------------------------------------------------------------------- /isan/utls/divide.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 分割文件:: 4 | 5 | ./divide.py number:filename [number:filename ...] 6 | 7 | 8 | """ 9 | import itertools 10 | import sys 11 | if __name__=='__main__': 12 | cycle=[] 13 | file_dict={} 14 | for item in sys.argv[1:]: 15 | n,_,file=item.rpartition(':') 16 | if not n:n='1' 17 | if file not in file_dict: 18 | 19 | file_dict[file]=open(file,'w') if file else None 20 | cycle+=[file_dict[file]]*int(n) 21 | for out_file,line in zip(itertools.cycle(cycle),sys.stdin): 22 | if out_file: 23 | print(line.strip(),file=out_file) 24 | -------------------------------------------------------------------------------- /isan/utls/draw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def exp_tree(line,node,head,src): 7 | if len(node[head])==1:return 8 | node[head].sort() 9 | for ind in node[head]: 10 | if ind!=head: 11 | src.append('"%d"[label="%s"];'%(ind,line[ind][1][0])) 12 | src.append("%d->%d;"%(head,ind)) 13 | else: 14 | src.append('"~%d"[label="~",shape="point"];'%(head)) 15 | src.append('%d->"~%d";'%(head,head)) 16 | for ind in node[head]: 17 | if ind !=head: 18 | exp_tree(line,node,ind,src) 19 | def encode(line,T='png'): 20 | line=[[ind,item.split('_')] for ind,item in enumerate(line.split())] 21 | node=[[ind] for ind in range(len(line))] 22 | head=-1 23 | for ind,item in line: 24 | item[2]=int(item[2]) 25 | if item[2]==-1: 26 | head=ind 27 | else: 28 | node[item[2]].append(ind) 29 | src=["digraph unix {", 30 | "node[shape=box];", 31 | "rankdir=TD;"] 32 | src.append('"%d"[label="%s"];'%(head,line[head][1][0])) 33 | exp_tree(line,node,head,src) 34 | src.append("}") 35 | src='\n'.join(src) 36 | dot=subprocess.Popen(["dot","-T"+T],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 37 | stdout,stderr=dot.communicate(src.encode()) 38 | return stdout 39 | 40 | if __name__=="__main__": 41 | 
line='''导弹_NN_2 不_AD_2 必_VV_22 带_VV_2 弹头_NN_3 ,_PU_22 目标_NN_8 不_AD_8 必_VV_13 在_VV_8 有_VE_11 居民_NN_9 之_DEC_8 地_NN_22 ,_PU_22 例如_AD_22 次要_JJ_17 外岛_NN_20 或_CC_20 领海_NN_20 边缘_NN_22 皆_AD_22 可_VV_-1 。_PU_22''' 42 | if len(sys.argv)<2: 43 | print('请输入要保存的文件名') 44 | exit() 45 | filename=sys.argv[1] 46 | 47 | for line in sys.stdin: 48 | open(filename,'wb').write(encode(line)) 49 | exit() 50 | -------------------------------------------------------------------------------- /isan/utls/indexer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | class Indexer(list) : 3 | def __init__(self): 4 | self.d=dict() 5 | pass 6 | def __call__(self,key): 7 | if key not in self.d : 8 | self.d[key]=len(self) 9 | self.append(key) 10 | return self.d[key] 11 | 12 | -------------------------------------------------------------------------------- /isan/utls/pydat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import dat 3 | 4 | if __name__ == '__main__': 5 | abc=[ 6 | (b'ab',33), 7 | (b'sa',44), 8 | (b'a',11), 9 | (b'aa',22), 10 | 11 | ] 12 | dat.make(abc) 13 | pass 14 | 15 | -------------------------------------------------------------------------------- /isan/utls/segconv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import re 3 | import sys 4 | import argparse 5 | import json 6 | 7 | def wiki_reader(line): 8 | line=re.sub(r'^([^\[]*)\]\]',r'\1',line) 9 | line=re.sub(r'\[\[([^\]]*)$',r'\1',line) 10 | if '[[' not in line : return None 11 | line=re.split('(\[\[[^\]]+\]\])',line) 12 | line=[x for x in line if x] 13 | word_start_at=-1 14 | intervals=[[-1,-1]] 15 | offset=0 16 | raw=[] 17 | for w in line: 18 | in_word=0 19 | if w[:2]=='[[': 20 | w=w[2:-2] 21 | in_word=1 22 | else : 23 | pass 24 | raw.append(w) 25 | for c in w[:-1]: 26 | if in_word: 27 | intervals.append([offset,offset+len(w)]) 28 | else : 29 | intervals.append([-1,-1]) 30 | intervals.append([-1,-1]) 31 | offset+=len(w) 32 | raw=''.join(raw) 33 | if not(len(raw)+1==len(intervals)): 34 | input('assert') 35 | return None 36 | return {'raw':raw,'Y':[None,intervals]} 37 | 38 | def seg_reader(line) : 39 | y=line.split() 40 | return {'seg' : y, 'Y' : None} 41 | 42 | def raw_reader(data) : 43 | data=data.strip() 44 | return {'raw' : data,'Y': None} 45 | def raw_writer(data) : 46 | if 'raw' in data : return data['raw'] 47 | if 'seg' in data : return ''.join(data['seg']) 48 | 49 | def seg_writer(data) : 50 | if 'seg' in data : return data['seg'] 51 | return None 52 | 53 | def raw_with_Ya_writer(data) : 54 | raw=raw_writer(data) 55 | Y=None 56 | if 'Y' in data : 57 | Y=data['Y'] 58 | return json.dumps({'raw' : raw,'y': seg_writer(data), 59 | 'Y_a' : Y},ensure_ascii=False) 60 | def raw_with_Ya_reader(data) : 61 | data=json.loads(data) 62 | raw=data.get('raw') 63 | Ya=data.get('Y_a',None) 64 | return {'raw':raw,'Y':Ya} 65 | 66 | def raw_with_Yb_writer(data) : 67 | raw=raw_writer(data) 68 | Y=None 69 | if 'Y' in data : 70 | Y=data['Y'] 71 | return json.dumps({'raw' : raw,'y': seg_writer(data), 72 | 'Y_b' : Y},ensure_ascii=False) 73 | def raw_with_Yb_reader(data) : 74 | data=json.loads(data) 75 | raw=data.get('raw') 76 | Ya=data.get('Y_b',None) 77 | return {'raw':raw,'Y':Ya} 78 | 79 | if __name__ == '__main__': 80 | readers={'raw':raw_reader, 81 | 'seg':seg_reader, 82 | 'wiki':wiki_reader, 83 | 'Ya': raw_with_Ya_reader, 84 | 'Yb': raw_with_Yb_reader, 85 | } 86 | writers={'raw': 
raw_writer, 87 | 'Ya': raw_with_Ya_writer, 88 | 'Yb':raw_with_Yb_writer, 89 | } 90 | 91 | parser=argparse.ArgumentParser(description="分词相关的格式转换") 92 | parser.add_argument('-f','--from',dest='reader', 93 | choices=readers, 94 | metavar='源格式') 95 | parser.add_argument('-t','--to', 96 | dest='writer',default='raw', 97 | choices=writers, 98 | metavar='目标格式') 99 | args=parser.parse_args() 100 | reader=readers[args.reader] 101 | writer=writers[args.writer] 102 | 103 | 104 | for line in sys.stdin : 105 | line=line.strip() 106 | data=reader(line) 107 | if data : 108 | print(writer(data)) 109 | 110 | 111 | -------------------------------------------------------------------------------- /isan/utls/shuffle_lines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 命令行工具,用于打乱文件中行的顺序 4 | 5 | usage: shuffle_lines.py [-h] [-i] filename [filename ...] 6 | 7 | 如果提供一个文件,则打乱该文件顺序。 8 | 9 | 如果提供多个文件,则同步地打乱多个文件中行的顺序。 10 | 这里多个文件中对应行的数据有对应关系。 11 | 多个文件需要有相同数目的行。 12 | 13 | 给出 `-i` 参数, 则会将打乱顺序的内容写回文件。 14 | """ 15 | import argparse 16 | import random 17 | import sys 18 | 19 | if __name__ == '__main__': 20 | parser=argparse.ArgumentParser(description="随机化文件的行") 21 | parser.add_argument('filename',help='要操作的文件名',nargs='+') 22 | parser.add_argument('-i',help='不设定会输出到标准输出流,设定后写回原文件',action='store_true',dest='i') 23 | args=parser.parse_args() 24 | 25 | data=[] 26 | for lines in zip(*[open(fn) for fn in args.filename]): 27 | data.append([line.strip() for line in lines]) 28 | random.shuffle(data) 29 | out_file=[sys.stdout if not args.i else open(fn,'w') for fn in args.filename] 30 | for ls in data: 31 | for l,f in zip(ls,out_file) : 32 | print(l,file=f) 33 | exit() 34 | 35 | -------------------------------------------------------------------------------- /isan/utls/times.py: -------------------------------------------------------------------------------- 1 | import time 2 | class Times (dict) : 3 | def __call__(self,key): 4 | if key not in self : 5 | self[key]=[0,None] 6 | data=self[key] 7 | if data[1]==None : 8 | data[1]=time.time() 9 | else : 10 | data[0]+=time.time()-data[1] 11 | data[1]=None 12 | def __repr__(self): 13 | return '\n'.join(str(k)+":"+str(v[0]) for k,v in self.items()) 14 | -------------------------------------------------------------------------------- /isan/utls/to_full.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import argparse 4 | 5 | def to_full(text,ignore=set()): 6 | """ 7 | 半角转全角的程序 8 | 空格变成全角 9 | 大于空格的直接加上偏移量 10 | 否则不变 11 | """ 12 | 13 | return ''.join( 14 | chr(x) if (x<32 or x>128 or x in ignore) else 15 | chr(12288) if x==32 else chr(x+65248) 16 | for x in map(ord,text)) 17 | 18 | 19 | if __name__ == '__main__': 20 | parser=argparse.ArgumentParser(description="") 21 | parser.add_argument('--ignore',help='忽略的',dest='ignore',type=str) 22 | parser.add_argument('--check',help='只显示改变了的',action='store_true') 23 | args=parser.parse_args() 24 | ignore=set() 25 | if args.ignore : 26 | for c in sys.argv[1] : 27 | ignore.add(ord(c)) 28 | 29 | for line in sys.stdin : 30 | line=line.strip() 31 | if args.check : 32 | rtn=to_full(line,ignore) 33 | if line!=rtn : 34 | print(rtn) 35 | else : 36 | print(to_full(line,ignore)) 37 | pass 38 | 39 | --------------------------------------------------------------------------------
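To make the offset rule in isan/utls/to_full.py concrete: printable half-width ASCII (except the space) is shifted up by 65248 into the full-width block, the half-width space becomes U+3000, and any code point listed in `ignore` is left untouched. A quick check, worked out by hand from that rule:

    >>> to_full('isan 2013')
    'ｉｓａｎ　２０１３'
    >>> to_full('isan 2013', ignore={ord('2')})
    'ｉｓａｎ　2０１３'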