├── .gitignore ├── Makefile ├── README.md ├── docs ├── .gitignore ├── Makefile ├── advanced.rst ├── basic.rst ├── conf.py ├── getting_started.rst ├── index.rst ├── make.bat └── tech_report.rst ├── isan.py ├── isan.sh └── isan ├── Makefile ├── README.md ├── __init__.py ├── annotation ├── __init__.py └── seg │ ├── __init__.py │ ├── anno.py │ ├── http_server.py │ └── sample.html ├── common ├── Chinese.py ├── __init__.py ├── common.hpp ├── decoder.hpp ├── decoder.py ├── feature_dict.cc ├── first_order_linear │ ├── decoder.h │ └── first_order_linear.cc ├── general_types.hpp ├── parameters.py ├── perceptrons.py ├── python_interface.cc ├── searcher.hpp ├── smart_string.hpp ├── task.py ├── updater.py └── weights.py ├── data ├── __init__.py └── lattice.py ├── parsing ├── __init__.py ├── char_dep.py ├── codec.py ├── default_dep.py ├── default_dep2.py ├── dep_codec.py ├── dep_unlabeled_eval.py ├── eval.py ├── lat_dep.py ├── lat_tag.py ├── lattice_dep.py ├── ldep_eval.py ├── make_cython.sh ├── seq_dep.py └── setup.py ├── sentence ├── README.md └── __init__.py ├── tagging ├── .exrc ├── PA_segger.py ├── __init__.py ├── cb_cws.py ├── cb_subsymbolic.py ├── cb_symbolic.py ├── cws.py ├── eval.py ├── ss.py ├── tagging_dag.py ├── wb_tag.py └── wb_tag_symbolic.py └── utls ├── Makefile ├── __init__.py ├── average.py ├── cdat2 ├── Makefile ├── cdat.cc ├── dat.h ├── dat_builder.cc └── test2.dat ├── count.py ├── dat.cc ├── dat.hpp ├── divde.py ├── divide.py ├── draw.py ├── indexer.py ├── pydat.py ├── segconv.py ├── shuffle_lines.py ├── times.py └── to_full.py /.gitignore: -------------------------------------------------------------------------------- 1 | test 2 | build/ 3 | /data* 4 | test.* 5 | test* 6 | test_ctb.zsh 7 | *.swp 8 | *.swo 9 | *.raw 10 | *.json 11 | scripts/test 12 | *.so 13 | *.bin 14 | datasets/* 15 | __pycache__ 16 | README.html 17 | *tags 18 | *.tmp 19 | *.pyc 20 | 21 | doc/html* 22 | doc/latex* 23 | doc/build* 24 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | make -C isan 3 | 4 | 5 | test_cws: 6 | ./cws.sh model.bin --train ~/data/seg/ctb5.test.seg --dev ~/data/seg/ctb5.test.seg 7 | 8 | test_dag: 9 | ./tag_path.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=5 10 | #./tag_path.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 11 | 12 | test_dep: 13 | ./parsing.sh model.bin --train test/ctb5.test.txt --dev test/ctb5.test.txt 14 | 15 | test_dep2: 16 | #./dep2.sh model.bin --train test/train1000.dlat --dev test/test.dlat 17 | ./dep2.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 18 | 19 | test_lat_dep: 20 | ./lat_dep.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=20 21 | #./lat_dep.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 22 | 23 | test_seq_dep: 24 | ./seq_dep.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=20 25 | #./seq_dep.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=20 26 | 27 | test_lat_tag: 28 | #./lat_tag.sh model.bin --train test/train.dlat --dev test/test.dlat --iteration=5 29 | ./lat_tag.sh model.bin --train test/train1000.dlat --dev test/test.dlat --iteration=5 30 | 31 | basic_test: 32 | ./cws.sh model.bin --train ~/data/seg/ctb5.test.seg --dev ~/data/seg/ctb5.test.seg --iteration=1 33 | ./parsing.sh model.bin --train test/ctb5.test.txt --dev test/ctb5.test.txt 
--iteration=1 34 | 35 | test_msr: 36 | ./cws.sh model.bin --train ~/data/seg/msr.training.seg --dev ~/data/seg/msr.test.seg --iteration=30 37 | 38 | test_ctb5_parsing: 39 | ./parsing.sh model.bin --train test/ctb5.training.txt --dev test/ctb5.test.txt --iteration=30 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | isan 2 | ==== 3 | 4 | > “举一隅不以三隅反,则不复也” ——《论语·述而》 5 | 6 | 一个数据驱动的中文处理 **实验环境** ,可进行 **中文分词** , **词性标注** 和 **依存句法分析** 等任务。 7 | 8 | 文档: [https://isan.readthedocs.org/en/latest/](https://isan.readthedocs.org/en/latest/) 9 | 10 | 作者: [张开旭](http://weibo.com/zhangkaixu)。 11 | 12 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | # the i18n builder cannot share the environment and doctrees with the others 15 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 16 | 17 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 18 | 19 | help: 20 | @echo "Please use \`make ' where is one of" 21 | @echo " html to make standalone HTML files" 22 | @echo " dirhtml to make HTML files named index.html in directories" 23 | @echo " singlehtml to make a single large HTML file" 24 | @echo " pickle to make pickle files" 25 | @echo " json to make JSON files" 26 | @echo " htmlhelp to make HTML files and a HTML help project" 27 | @echo " qthelp to make HTML files and a qthelp project" 28 | @echo " devhelp to make HTML files and a Devhelp project" 29 | @echo " epub to make an epub" 30 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 31 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 32 | @echo " text to make text files" 33 | @echo " man to make manual pages" 34 | @echo " texinfo to make Texinfo files" 35 | @echo " info to make Texinfo files and run them through makeinfo" 36 | @echo " gettext to make PO message catalogs" 37 | @echo " changes to make an overview of all changed/added/deprecated items" 38 | @echo " linkcheck to check all external links for integrity" 39 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 40 | 41 | clean: 42 | -rm -rf $(BUILDDIR)/* 43 | 44 | html: 45 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 46 | @echo 47 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 48 | 49 | dirhtml: 50 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 51 | @echo 52 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 53 | 54 | singlehtml: 55 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 56 | @echo 57 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 
58 | 59 | pickle: 60 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 61 | @echo 62 | @echo "Build finished; now you can process the pickle files." 63 | 64 | json: 65 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 66 | @echo 67 | @echo "Build finished; now you can process the JSON files." 68 | 69 | htmlhelp: 70 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 71 | @echo 72 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 73 | ".hhp project file in $(BUILDDIR)/htmlhelp." 74 | 75 | qthelp: 76 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 77 | @echo 78 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 79 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 80 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/isan.qhcp" 81 | @echo "To view the help file:" 82 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/isan.qhc" 83 | 84 | devhelp: 85 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 86 | @echo 87 | @echo "Build finished." 88 | @echo "To view the help file:" 89 | @echo "# mkdir -p $$HOME/.local/share/devhelp/isan" 90 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/isan" 91 | @echo "# devhelp" 92 | 93 | epub: 94 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 95 | @echo 96 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 97 | 98 | latex: 99 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 100 | @echo 101 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 102 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 103 | "(use \`make latexpdf' here to do that automatically)." 104 | 105 | latexpdf: 106 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 107 | @echo "Running LaTeX files through pdflatex..." 108 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 109 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 110 | 111 | text: 112 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 113 | @echo 114 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 115 | 116 | man: 117 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 118 | @echo 119 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 120 | 121 | texinfo: 122 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 123 | @echo 124 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 125 | @echo "Run \`make' in that directory to run these through makeinfo" \ 126 | "(use \`make info' here to do that automatically)." 127 | 128 | info: 129 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 130 | @echo "Running Texinfo files through makeinfo..." 131 | make -C $(BUILDDIR)/texinfo info 132 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 133 | 134 | gettext: 135 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 136 | @echo 137 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 138 | 139 | changes: 140 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 141 | @echo 142 | @echo "The overview file is in $(BUILDDIR)/changes." 143 | 144 | linkcheck: 145 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 146 | @echo 147 | @echo "Link check complete; look for any errors in the above output " \ 148 | "or in $(BUILDDIR)/linkcheck/output.txt." 
149 | 150 | doctest: 151 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 152 | @echo "Testing of doctests in the sources finished, look at the " \ 153 | "results in $(BUILDDIR)/doctest/output.txt." 154 | -------------------------------------------------------------------------------- /docs/advanced.rst: -------------------------------------------------------------------------------- 1 | 进阶 2 | ================= 3 | 4 | 5 | 6 | 如何修改已有算法 7 | ---------------------------------- 8 | 9 | 10 | 11 | 如何写新工具处理新任务 12 | ---------------------------------------- 13 | -------------------------------------------------------------------------------- /docs/basic.rst: -------------------------------------------------------------------------------- 1 | 基本操作 2 | ================ 3 | 4 | 模型的训练、测试和使用 5 | --------------------------------------- 6 | 7 | 命令行及参数 8 | ++++++++++++++++++++++++ 9 | 10 | 主要命令均通过调用 ``./isan.py`` 完成。 11 | 12 | 许多已实现的模型有一些固定的参数,可以使用 ``./isan.sh`` 更方便地调用, 基本操作使用后者即可。 13 | 14 | .. code-block:: bash 15 | 16 | ./isan.sh model-name [model-file] [ other args ] 17 | 18 | 其中 ``model-name`` 是模型名字, 如 ``seg`` 是一个基于字标注的模型,可用于进行分词或者分词词性标注, ``cws`` 是一个基于词的分词模型, ``dep`` 是一个依存句法分析模型。 19 | 20 | ``model-file`` 是模型参数文件。 如果是训练任务,可为空,表示训练之后不保存模型参数。 21 | 22 | 本小节将涉及的其它参数有: 23 | 24 | * ``--train training-data`` 使用指定的训练集文件训练模型 25 | * ``--test test-data`` 训练完后使用测试集测试模型效果 26 | * ``--dev dev-data`` 每次训练迭代后使用开发集评价模型效果 27 | * ``--iteration iter`` 指定训练迭代次数 28 | 29 | 主要使用场合: 30 | 31 | * **训练模型** : 指定了 ``--train`` 参数,则训练一个新模型保存在 ``model-file`` , 可同时再使用 ``--test`` ``--dev`` 等参数 32 | * **测试模型** : 不指定 ``--train`` 参数, 但指定 ``--test`` 参数 33 | * **使用模型** : 不指定 ``--train`` 参数, 也不指定 ``--test`` 参数, 则从标准输入流中读入输入,将结果输出到标准输出流。 34 | 35 | 实例 36 | ++++++++++++++++++++++++ 37 | 38 | 可以用中文分词任务试试isan如何工作。下载一个可供实验用的SIGHAN05中文分词语料库:: 39 | 40 | wget http://www.sighan.org/bakeoff2005/data/icwb2-data.rar 41 | sudo apt-get install unrar 42 | mkdir sighan05; unrar e icwb2-data.rar sighan05 43 | ln -s sighan05/msr_training.utf8 train.seg 44 | ln -s sighan05/msr_test_gold.utf8 test.seg 45 | 46 | 47 | 试着训练和测试:: 48 | 49 | ./isan.sh seg model.gz --train test.seg 50 | ./isan.sh seg model.gz --test test.seg 51 | 52 | 接下来就可以试着真枪实弹地来一次,在MSR的训练集上迭代30次训练模型,每次迭代都将测试集作为开发集检查一下模型性能:: 53 | 54 | ./isan.sh seg model.gz --train train.seg \ 55 | --dev test.seg --iteration 30 56 | 57 | 需要一些耐心等待程序结束。 58 | 59 | 会得到类似这样的结果:: 60 | 61 | 标准: 8008 输出: 8057 seg正确: 7811 正确: 7811 seg_f1: 0.9724 tag_f1: 0.9724 ol: 11 时间: 0.2762 (49733字/秒) 62 | 63 | 可以看到分词F值为0.9724。 64 | 65 | 还可以使用 ``./isan/tagging/eval.py`` 这个工具, 直接比较两个分词结果:: 66 | 67 | sed 's/\ //g' test.seg | ./isan.sh seg ctb.seg.gz > result.seg 68 | ./isan/tagging/eval.py test.seg result.seg 69 | 70 | 71 | 已实现的模型 72 | -------------------------------- 73 | 74 | .. _trained_model_parameter_list: 75 | 76 | 已训练模型列表 77 | ++++++++++++++++++++++++++++++++ 78 | 79 | 中文分词 使用 ``wget http://t.cn/zQxy95O -O ctb.seg.gz`` 获取,使用 ``./isan.sh seg ctb.seg.gz`` 启动 80 | 81 | 中文分词词性标注 使用 ``wget http://t.cn/zQxg4lX -O ctb.tag.gz`` 获取, 使用 ``./isan.sh seg ctb.tag.gz`` 启动 82 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # isan documentation build configuration file, created by 4 | # sphinx-quickstart on Sat May 4 16:12:46 2013. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir.
7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.insert(0, os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # If your documentation needs a minimal Sphinx version, state it here. 24 | #needs_sphinx = '1.0' 25 | 26 | # Add any Sphinx extension module names here, as strings. They can be extensions 27 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 28 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.mathjax'] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ['_templates'] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = '.rst' 35 | 36 | # The encoding of source files. 37 | #source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = 'index' 41 | 42 | # General information about the project. 43 | project = u'isan' 44 | copyright = u'2013, ZHANG, Kaixu' 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = '0.1' 52 | # The full version, including alpha/beta/rc tags. 53 | release = '0.1' 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | #language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | #today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | #today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ['_build'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. See the documentation for 93 | # a list of builtin themes. 94 | #html_theme = 'default' 95 | html_theme = 'haiku' 96 | html_theme = 'pyramid' 97 | html_theme = 'nature' 98 | 99 | 100 | 101 | # Theme options are theme-specific and customize the look and feel of a theme 102 | # further. 
For a list of options available for each theme, see the 103 | # documentation. 104 | #html_theme_options = {} 105 | 106 | # Add any paths that contain custom themes here, relative to this directory. 107 | #html_theme_path = [] 108 | 109 | # The name for this set of Sphinx documents. If None, it defaults to 110 | # " v documentation". 111 | #html_title = None 112 | 113 | # A shorter title for the navigation bar. Default is the same as html_title. 114 | #html_short_title = None 115 | 116 | # The name of an image file (relative to this directory) to place at the top 117 | # of the sidebar. 118 | #html_logo = None 119 | 120 | # The name of an image file (within the static path) to use as favicon of the 121 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 122 | # pixels large. 123 | #html_favicon = None 124 | 125 | # Add any paths that contain custom static files (such as style sheets) here, 126 | # relative to this directory. They are copied after the builtin static files, 127 | # so a file named "default.css" will overwrite the builtin "default.css". 128 | html_static_path = ['_static'] 129 | 130 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 131 | # using the given strftime format. 132 | #html_last_updated_fmt = '%b %d, %Y' 133 | 134 | # If true, SmartyPants will be used to convert quotes and dashes to 135 | # typographically correct entities. 136 | #html_use_smartypants = True 137 | 138 | # Custom sidebar templates, maps document names to template names. 139 | #html_sidebars = {} 140 | 141 | # Additional templates that should be rendered to pages, maps page names to 142 | # template names. 143 | #html_additional_pages = {} 144 | 145 | # If false, no module index is generated. 146 | #html_domain_indices = True 147 | 148 | # If false, no index is generated. 149 | #html_use_index = True 150 | 151 | # If true, the index is split into individual pages for each letter. 152 | #html_split_index = False 153 | 154 | # If true, links to the reST sources are added to the pages. 155 | #html_show_sourcelink = True 156 | 157 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 158 | #html_show_sphinx = True 159 | 160 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 161 | #html_show_copyright = True 162 | 163 | # If true, an OpenSearch description file will be output, and all pages will 164 | # contain a tag referring to it. The value of this option must be the 165 | # base URL from which the finished HTML is served. 166 | #html_use_opensearch = '' 167 | 168 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 169 | #html_file_suffix = None 170 | 171 | # Output file base name for HTML help builder. 172 | htmlhelp_basename = 'isandoc' 173 | 174 | 175 | # -- Options for LaTeX output -------------------------------------------------- 176 | 177 | latex_elements = { 178 | # The paper size ('letterpaper' or 'a4paper'). 179 | #'papersize': 'letterpaper', 180 | 181 | # The font size ('10pt', '11pt' or '12pt'). 182 | #'pointsize': '10pt', 183 | 184 | # Additional stuff for the LaTeX preamble. 185 | #'preamble': '', 186 | } 187 | 188 | # Grouping the document tree into LaTeX files. List of tuples 189 | # (source start file, target name, title, author, documentclass [howto/manual]). 
190 | latex_documents = [ 191 | ('index', 'isan.tex', u'isan Documentation', 192 | u'ZHANG, Kaixu', 'manual'), 193 | ] 194 | 195 | # The name of an image file (relative to this directory) to place at the top of 196 | # the title page. 197 | #latex_logo = None 198 | 199 | # For "manual" documents, if this is true, then toplevel headings are parts, 200 | # not chapters. 201 | #latex_use_parts = False 202 | 203 | # If true, show page references after internal links. 204 | #latex_show_pagerefs = False 205 | 206 | # If true, show URL addresses after external links. 207 | #latex_show_urls = False 208 | 209 | # Documents to append as an appendix to all manuals. 210 | #latex_appendices = [] 211 | 212 | # If false, no module index is generated. 213 | #latex_domain_indices = True 214 | 215 | 216 | # -- Options for manual page output -------------------------------------------- 217 | 218 | # One entry per manual page. List of tuples 219 | # (source start file, name, description, authors, manual section). 220 | man_pages = [ 221 | ('index', 'isan', u'isan Documentation', 222 | [u'ZHANG, Kaixu'], 1) 223 | ] 224 | 225 | # If true, show URL addresses after external links. 226 | #man_show_urls = False 227 | 228 | 229 | # -- Options for Texinfo output ------------------------------------------------ 230 | 231 | # Grouping the document tree into Texinfo files. List of tuples 232 | # (source start file, target name, title, author, 233 | # dir menu entry, description, category) 234 | texinfo_documents = [ 235 | ('index', 'isan', u'isan Documentation', 236 | u'ZHANG, Kaixu', 'isan', 'One line description of project.', 237 | 'Miscellaneous'), 238 | ] 239 | 240 | # Documents to append as an appendix to all manuals. 241 | #texinfo_appendices = [] 242 | 243 | # If false, no module index is generated. 244 | #texinfo_domain_indices = True 245 | 246 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 247 | #texinfo_show_urls = 'footnote' 248 | -------------------------------------------------------------------------------- /docs/getting_started.rst: -------------------------------------------------------------------------------- 1 | 上手 2 | ============= 3 | 4 | 在此以Ubuntu操作系统为例,介绍如何安装和使用isan的基本功能。 5 | 6 | 下载与编译 7 | ---------------------- 8 | 9 | 首先,需要安装必要的软件包,在命令行下安装 10 | 11 | .. code-block:: bash 12 | 13 | sudo apt-get install gcc make python3 python3-dev git python3-numpy 14 | 15 | .. note:: 16 | 17 | 本工具包使用的是python3,与最常用的python版本python2不完全兼容。 18 | 19 | 为了提高速度,解码核心算法使用c++编写,因此还需要gcc进行编译。 20 | 21 | 22 | 23 | 然后选好路径,下载isan源代码,编译:: 24 | 25 | git clone https://github.com/zhangkaixu/isan.git 26 | cd isan 27 | make 28 | 29 | 编译正确后,就可以使用了。 30 | 31 | 32 | 使用训练好的模型 33 | ---------------------- 34 | 35 | 以中文分词为例, 下载一个训练好的模型文件:: 36 | 37 | wget http://t.cn/zQxy95O -O ctb.seg.gz 38 | 39 | .. seealso:: 40 | 41 | 在这里有一份已经训练好的模型参数的列表 :ref:`trained_model_parameter_list` 42 | 43 | 这是一个在中文树库5上训练的分词模型参数文件,试试分词:: 44 | 45 | echo '厦门大学' | ./isan.sh seg ctb.seg.gz 46 | 47 | 其中 ``isan.sh`` 是用来启动isan及其常用任务的脚本。 用 ``seg`` 来指明一个基于字标注的模型。 ``ctb.seg.gz`` 是刚才下载的对应的参数文件。 运行后将会得到这样的输出:: 48 | 49 | 厦门 大学 50 | 51 | 程序从标准输入流读入输入数据,将结果输出到标准输出流。一般地,可以这样执行:: 52 | 53 | ./isan.sh seg ctb.seg.gz < input_file > output_file 54 | 55 | 56 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. isan documentation master file, created by 2 | sphinx-quickstart on Sat May 4 16:12:46 2013. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | Isan 一三 8 | ================================ 9 | 10 | .. sidebar:: 舉一隅不以三隅反,則不復也 11 | 12 | ——《論語》 13 | 14 | 15 | 一三(isan)是一个基于统计的开源中文自然语言处理实验环境, 可进行 **中文分词** 、 **词性标注** 、 **句法分析** 等任务。 所有任务均使用结构感知器(structured perceptron)这一统一的框架进行参数学习。 16 | 17 | 源码 ``_ 18 | 19 | 文档目录: 20 | 21 | .. toctree:: 22 | :maxdepth: 2 23 | 24 | 上手 —— 把isan当作现成的中文自然语言处理工具 25 | 基本功能 —— 希望根据自己的语料库训练模型并使用 26 | 技术简介 —— 了解isan所使用的技术 27 | 进阶 —— 基于isan的框架编写新模型,完成新任务 28 | 29 | 30 | 31 | 32 | .. Indices and tables 33 | ================== 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. linkcheck to check all external links for integrity 37 | echo. doctest to run all doctests embedded in the documentation if enabled 38 | goto end 39 | ) 40 | 41 | if "%1" == "clean" ( 42 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 43 | del /q /s %BUILDDIR%\* 44 | goto end 45 | ) 46 | 47 | if "%1" == "html" ( 48 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 49 | if errorlevel 1 exit /b 1 50 | echo. 51 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 52 | goto end 53 | ) 54 | 55 | if "%1" == "dirhtml" ( 56 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 57 | if errorlevel 1 exit /b 1 58 | echo. 59 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 60 | goto end 61 | ) 62 | 63 | if "%1" == "singlehtml" ( 64 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 68 | goto end 69 | ) 70 | 71 | if "%1" == "pickle" ( 72 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished; now you can process the pickle files. 76 | goto end 77 | ) 78 | 79 | if "%1" == "json" ( 80 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 81 | if errorlevel 1 exit /b 1 82 | echo. 
83 | echo.Build finished; now you can process the JSON files. 84 | goto end 85 | ) 86 | 87 | if "%1" == "htmlhelp" ( 88 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can run HTML Help Workshop with the ^ 92 | .hhp project file in %BUILDDIR%/htmlhelp. 93 | goto end 94 | ) 95 | 96 | if "%1" == "qthelp" ( 97 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 98 | if errorlevel 1 exit /b 1 99 | echo. 100 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 101 | .qhcp project file in %BUILDDIR%/qthelp, like this: 102 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\isan.qhcp 103 | echo.To view the help file: 104 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\isan.ghc 105 | goto end 106 | ) 107 | 108 | if "%1" == "devhelp" ( 109 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 110 | if errorlevel 1 exit /b 1 111 | echo. 112 | echo.Build finished. 113 | goto end 114 | ) 115 | 116 | if "%1" == "epub" ( 117 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 118 | if errorlevel 1 exit /b 1 119 | echo. 120 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 121 | goto end 122 | ) 123 | 124 | if "%1" == "latex" ( 125 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 129 | goto end 130 | ) 131 | 132 | if "%1" == "text" ( 133 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The text files are in %BUILDDIR%/text. 137 | goto end 138 | ) 139 | 140 | if "%1" == "man" ( 141 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 145 | goto end 146 | ) 147 | 148 | if "%1" == "texinfo" ( 149 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 150 | if errorlevel 1 exit /b 1 151 | echo. 152 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 153 | goto end 154 | ) 155 | 156 | if "%1" == "gettext" ( 157 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 158 | if errorlevel 1 exit /b 1 159 | echo. 160 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 161 | goto end 162 | ) 163 | 164 | if "%1" == "changes" ( 165 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 166 | if errorlevel 1 exit /b 1 167 | echo. 168 | echo.The overview file is in %BUILDDIR%/changes. 169 | goto end 170 | ) 171 | 172 | if "%1" == "linkcheck" ( 173 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 174 | if errorlevel 1 exit /b 1 175 | echo. 176 | echo.Link check complete; look for any errors in the above output ^ 177 | or in %BUILDDIR%/linkcheck/output.txt. 178 | goto end 179 | ) 180 | 181 | if "%1" == "doctest" ( 182 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 183 | if errorlevel 1 exit /b 1 184 | echo. 185 | echo.Testing of doctests in the sources finished, look at the ^ 186 | results in %BUILDDIR%/doctest/output.txt. 187 | goto end 188 | ) 189 | 190 | :end 191 | -------------------------------------------------------------------------------- /docs/tech_report.rst: -------------------------------------------------------------------------------- 1 | 理论介绍 2 | ================== 3 | 4 | .. 
warning:: 5 | 6 | Chrome系列的浏览器可能无法直接正确显示本页公式。 这是因为用来显示公式用的js没有正确加载。 可点击地址栏右边盾牌状的图标以加载用来显示公式的js。 7 | 8 | 一个工具,可以抽象为一个函数 :math:`z=\text{Function}(x)` , 根据输入产生输出, 能够进行中文分词、词性标注、句法分析等任务的isan也不例外。 9 | 10 | 结构分类问题 11 | +++++++++++++++++++++++++++ 12 | 13 | Isan处理的是一类叫做结构分类的问题。 所谓“分类”, 就是说输出是离散的量, 所谓“结构”,就是说输出不是一个量,而是一组具有内部关联结构的量。 例如,一个函数的输出可以是一个词的序列如“厦门 的 鼓浪屿”,可被看作有三个离散量线性连接而成的结构。 语言中的句法树, 是另一种层次性的结构。 14 | 15 | 16 | 通常一个输入可以有多个候选的输出, 需要建立一个标准选择最好的那个, 因此定义一个评价函数 :math:`f(\mathbf{x};\mathbf{y})` 给所有可能的输入输出对打分。 那么根据输入产生输出的过程就可以在数学上抽象为 17 | 18 | .. math:: 19 | :label: argmax_z 20 | 21 | \mathbf{z}=\arg\max_{\mathbf{z}}{f(\mathbf{x};\mathbf{z})} 22 | 23 | 设计工具的问题就变成了如何找到合适的 :math:`f()` 使得对于给定的输入能得到期望得到的输出。 有一种方法论(统计机器学习)的来法是这样的: 24 | 25 | 1. 为了描述所谓“期望的输出”, 我们直接构建一个数据集 :math:`\{(\mathbf{x}_i,\mathbf{y}_i)\}` ,其中 :math:`\mathbf{x}_i` 是输入样本, :math:`\mathbf{y}_i` 是其“期望”的输出。 26 | 2. :math:`f()` 不能漫无目的地寻找, 最好是人给出一个恰当的范围,然后让计算机在这个范围内参考上面的数据集找到一个最佳的函数。不妨将 :math:`f()` 写为 :math:`f(\mathbf{x},\mathbf{w};\mathbf{y})` , 其中 :math:`\mathbf{w}` 被称为模型的参数, 其不同的取值会得到不同的评价函数。 然后根据数据集自动地确定参数最佳的取值。 27 | 28 | 以上的路线图中, 就有一大一小两个搜索问题: 小的搜索问题是根据输入搜索最佳的输出; 大的搜索问题是根据已有的输入输出对组成的数据集, 搜索最佳的评价函数参数, 使得小的搜索问题能最好地完成。 29 | 30 | 继续,为了搜索最佳的参数, 同样需要评价参数的好坏, 因此再引入损失函数, 刻画在一定的参数下, 对数据集进行处理产生的损失 31 | 32 | .. math:: 33 | 34 | \text{loss}(\mathbf{w})=\sum_{i}{f(\mathbf{x}_i,\mathbf{w};\mathbf{z}_i)-f(\mathbf{x}_i,\mathbf{w};\mathbf{y}_i)} 35 | 36 | .. note:: 37 | 38 | 还可以设计其它的损失函数。 39 | 40 | 这个损失函数是非负的, 当小搜索问题的搜索结果与期望的结果相同时, 损失为0。 41 | 42 | 参数的搜索也就是以下最优化问题 43 | 44 | .. math:: 45 | :label: argmax_w 46 | 47 | \mathbf{w}^*=\arg\min_{\mathbf{w}}{\text{loss}(\mathbf{w})} 48 | 49 | 50 | 这就是整个问题的大框架。 接下来的问题就是以上的两个含有 :math:`\arg\min` 、 :math:`\arg\max` 的问题如何求解。 在大的思路上这两个问题很类似, 都是为了确定某组量的取值而设计的优化问题。 但细看却很不一样, 搜索最优输出的问题 :eq:`argmax_z` , 搜索空间是离散的, 并且是有约束的, 搜索最优参数的问题 :eq:`argmax_w` , 搜索空间一般是整个欧式空间,连续的且无约束。 下面就分别介绍这两个问题的具体处理方法。 51 | 52 | 随机梯度下降算法 53 | +++++++++++++++++++++++++++ 54 | 55 | 56 | 1. 得到一个训练样本 :math:`(\mathbf{x}_t,\mathbf{y}_t)` 57 | 2. 解码得到当前权重下的最优输出 :math:`\mathbf{z}_t=\arg\max_{\mathbf{z}}{f(\mathbf{x}_t,\mathbf{w};\mathbf{z})}` 58 | 3. 如果 :math:`\mathbf{z}_t\not=\mathbf{y}_t` 则 :math:`\mathbf{w}\leftarrow \mathbf{w}-\eta \left. \frac{\partial \text{loss}}{\partial \mathbf{w}} \right|_{\mathbf{w}}` 59 | 4. 判断是否停止,如不停止跳到步骤1。 60 | 61 | 感知器算法 62 | 63 | 平均感知器 64 | ---------------------------- 65 | 66 | 67 | Early-update 68 | ---------------------------- 69 | 70 | 71 | 解码器 72 | +++++++++++++++++++++++++++ 73 | 74 | 75 | 类隐马尔可夫解码器 76 | ----------------------------- 77 | 78 | 一阶解码器适合解决当目标函数可按以下形式分解的情况: 79 | 80 | .. math:: 81 | 82 | f(\mathbf{x};\mathbf{z})=\sum_{i}{g(\mathbf{x};z_i)}+\sum_{i}{h(z_i,z_{i+1})} 83 | 84 | 一般线性解码器 85 | ----------------------------- 86 | 87 | .. math:: 88 | 89 | f(\mathbf{x};\mathbf{z})=\sum_{i}{h(\mathbf{x};z_i,z_{i+1})} 90 | 91 | 一般二叉树解码器 92 | ----------------------------- 93 | 94 | .. math:: 95 | 96 | f(\mathbf{x};\mathbf{z})=\sum_{p}{h(\mathbf{x};z_{p},z_{l},z_{r})}+\sum_{l}{g(\mathbf{x};z_{l})} 97 | 98 | 已实现的模型 99 | +++++++++++++++++++++++++++ 100 | 101 | 基于字标注的分词词性标注 102 | ----------------------------- 103 | 104 | 105 | 基于词的中文分词 106 | ----------------------------- 107 | 108 | 109 | 基于词图的分词词性标注 110 | ----------------------------- 111 | 112 | 113 | 移进-归约依存句法分析 114 | ----------------------------- 115 | -------------------------------------------------------------------------------- /isan.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | """! 
4 | @mainpage 5 | 6 | Isan 7 | ==== 8 | 一个中文处理的实验环境 9 | 10 | 11 | ls test/*.train | sed 's/^\([^\.]*\)\.train/shuffle -m 20 -d .\/\1 -p 5 seg --train \1.train --dev \1.test --iteration 15 --yaml args.yaml /g' | xargs -n 16 -P 1 ./isan.sh 12 | 13 | 14 | seq 0 9 | awk '{print "test/" $1 "/model.gz --input test/" $1 ".test"}' | xargs -d "\n" -n 1 ./isan.sh seg --threshold 20 --yaml args.yaml --output t.lat --append 15 | """ 16 | 17 | 18 | from isan import * 19 | 20 | if __name__ == '__main__': 21 | this,*argv=sys.argv 22 | """ 23 | if len(argv)==0 : 24 | exit() 25 | if argv[0]=='seg': 26 | argv[0:1]= (['--model', 'isan.common.perceptrons.Model']+ 27 | ['--decoder', 'isan.common.decoder.First_Order_Linear']+ 28 | ['--task', 'isan.tagging.cb_cws.Task']) 29 | 30 | """ 31 | 32 | isan(**get_args(argv)) 33 | 34 | -------------------------------------------------------------------------------- /isan.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ $# = 0 ]; then 4 | echo “举一隅不以三隅反,则不复也” ——《论语·述而》 5 | exit 6 | fi 7 | 8 | if [ $1 = 'link' ] ; then 9 | echo 'link' 10 | src=$(dirname $0) 11 | ln -s ${src}/isan . 12 | ln -s ${src}/isan.py . 13 | ln -s ${src}/isan.sh . 14 | fi 15 | 16 | 17 | if [ $1 = 'stack' ] ; then 18 | src=$2 19 | batch=$3 # batchsize 20 | fold=$4 21 | dst=$5 22 | for tgt in `seq 0 $(expr $fold - 1 ) `; do 23 | cat $src | awk "(NR-NR%${batch})/${batch}%${fold}==${tgt} {print}" > ${dst}/${tgt}.test 24 | cat $src | awk "(NR-NR%${batch})/${batch}%${fold}!=${tgt} {print}" > ${dst}/${tgt}.train 25 | done; 26 | 27 | exit 28 | fi 29 | 30 | if [ $1 = 'shuffle' ] ; then 31 | cmd=$0 32 | shift 33 | if [ $# = 0 ]; then 34 | echo "usage:" 35 | echo " " ${cmd} shuffle -p processor-number=1 -m model-number=1 -d dir=. blabla 36 | exit 37 | fi 38 | dst='.' 
# dir 39 | nm='1' # number of models 40 | np='1' 41 | while [ `echo $1 | grep '\-'` ] ; do 42 | if [ `echo $1 | grep '\-p'` ] ; then 43 | np=$2 44 | shift;shift 45 | fi 46 | if [ `echo $1 | grep '\-d'` ] ; then 47 | dst=$2 48 | shift;shift 49 | fi 50 | if [ `echo $1 | grep '\-m'` ] ; then 51 | nm=$2 52 | shift;shift 53 | fi 54 | done 55 | echo "train [\033[34m$nm\033[0m] model(s)" "into [\033[34m$dst\033[0m]" 56 | echo "using [\033[34m$np\033[0m] processor(s)" 57 | echo "the command line is:\033[34m${cmd} $*\033[0m" 58 | 59 | #exit 60 | 61 | mkdir $dst -p 62 | echo `for i in $(seq $nm); do echo "${dst}/model.$i.gz --seed $i"; done` | xargs -n 3 -P $np ${cmd} $* 63 | ${cmd} $1 ${dst}/model.gz --append_model `for i in $(seq $nm); do echo "${dst}/model.$i.gz"; done` 64 | exit 65 | fi 66 | 67 | 68 | 69 | # 70 | # 中文分词模型 71 | # 72 | if [ $1 = 'seg' ] ; then 73 | shift 74 | ./isan.py \ 75 | --model isan.common.perceptrons.Model \ 76 | --decoder isan.common.decoder.First_Order_Linear \ 77 | --task isan.tagging.cb_cws.Task \ 78 | $@ 79 | fi 80 | 81 | if [ $1 = 'dep' ] ; then 82 | shift 83 | ./isan.py \ 84 | --model isan.common.perceptrons.Model \ 85 | --decoder isan.common.decoder.Push_Down \ 86 | --task isan.parsing.default_dep.Dep \ 87 | $@ 88 | fi 89 | 90 | if [ $1 = 'cws' ] ; then 91 | shift 92 | ./isan.py \ 93 | --model isan.common.perceptrons.Model \ 94 | --decoder isan.common.decoder.DFA \ 95 | --task isan.tagging.cws.Task \ 96 | $@ 97 | fi 98 | 99 | if [ $1 = 'tag' ] ; then 100 | shift 101 | ./isan.py \ 102 | --model isan.common.perceptrons.Model \ 103 | --decoder isan.common.decoder.DFA \ 104 | --task isan.tagging.wb_tag.Path_Finding \ 105 | $* 106 | fi 107 | 108 | # 实验性模型 109 | 110 | 111 | if [ $1 = 'lattice_dep' ] ; then 112 | shift 113 | ./isan.py \ 114 | --model isan.common.perceptrons.Model \ 115 | --decoder isan.common.decoder.Push_Down \ 116 | --task isan.parsing.lattice_dep.Dep \ 117 | $@ 118 | fi 119 | 120 | if [ $1 = 'lat_dep' ] ; then 121 | shift 122 | ./isan.py \ 123 | --model isan.common.perceptrons.Model \ 124 | --decoder isan.common.decoder.Push_Down \ 125 | --task isan.parsing.lat_dep.Dep \ 126 | $@ 127 | fi 128 | 129 | if [ $1 = 'pa_cws' ] ; then 130 | shift 131 | ./isan.py \ 132 | --model isan.common.perceptrons.Model_PA \ 133 | --decoder isan.common.decoder.DFA \ 134 | --task isan.tagging.PA_segger.Segger \ 135 | $* 136 | fi 137 | 138 | if [ $1 = 'pa_parsing' ] ; then 139 | shift 140 | ./isan.py \ 141 | --model isan.common.perceptrons.Model_PA \ 142 | --decoder isan.common.decoder.Push_Down \ 143 | --task isan.parsing.default_dep.PA_Dep \ 144 | $* 145 | fi 146 | 147 | if [ $1 = 'seg_dep' ] ; then 148 | shift 149 | ./isan.py \ 150 | --model isan.common.perceptrons.Model \ 151 | --decoder isan.common.decoder.Push_Down \ 152 | --task isan.parsing.seq_dep.Dep \ 153 | $* 154 | fi 155 | 156 | if [ $1 = 'tagpath' ] ; then 157 | shift 158 | ./isan.py \ 159 | --model isan.common.perceptrons.Model \ 160 | --decoder isan.common.decoder.DFA \ 161 | --task isan.tagging.tagging_dag.Path_Finding \ 162 | $* 163 | fi 164 | 165 | if [ $1 = 'dep' ] ; then 166 | shift 167 | ./isan.py \ 168 | --model isan.common.perceptrons.Model \ 169 | --decoder isan.common.decoder.Push_Down \ 170 | --task isan.parsing.default_dep2.Dep \ 171 | $* 172 | fi 173 | -------------------------------------------------------------------------------- /isan/Makefile: -------------------------------------------------------------------------------- 1 | all: common/pushdown.so common/dfabeam.so 
common/first_order_linear.so common/feature_dict.so 2 | 3 | gcc= g++ -I /usr/include/python3.2mu -shared -fPIC -O3 -std=c++0x -I .. -Wno-deprecated -g 4 | 5 | headers=common/*.hpp utls/*.hpp 6 | 7 | common/pushdown.so: common/python_interface.cc ${headers} 8 | ${gcc} common/python_interface.cc -o common/pushdown.so \ 9 | -D REDUCE -D __MODULE_NAME=pushdown 10 | 11 | common/dfabeam.so: common/python_interface.cc ${headers} 12 | ${gcc} common/python_interface.cc -o common/dfabeam.so \ 13 | -D __MODULE_NAME=dfabeam 14 | 15 | common/first_order_linear.so: common/first_order_linear/first_order_linear.cc \ 16 | common/first_order_linear/decoder.h 17 | ${gcc} $< -o $@ 18 | 19 | common/feature_dict.so: common/feature_dict.cc 20 | ${gcc} $< -o $@ 21 | -------------------------------------------------------------------------------- /isan/README.md: -------------------------------------------------------------------------------- 1 | * `common` 目录下是最基本的模型和解码器 2 | * `tagging` 目录下是序列类的任务 3 | * `parsing` 目录下是树类的任务 4 | -------------------------------------------------------------------------------- /isan/annotation/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | is it ok? 4 | """ 5 | -------------------------------------------------------------------------------- /isan/annotation/seg/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | is it ok? 4 | """ 5 | -------------------------------------------------------------------------------- /isan/annotation/seg/anno.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | 4 | -------------------------------------------------------------------------------- /isan/annotation/seg/http_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import http.server 3 | import cgi 4 | import urllib.parse 5 | 6 | import sys 7 | import webbrowser 8 | import multiprocessing 9 | 10 | import time 11 | import subprocess 12 | import json 13 | 14 | html=''' 15 | 21 | 22 | 103 | $(sen_id) 104 |
105 | $(sequence) 106 |
107 | 提交 108 | 继续 109 | 排除 110 | 终止 111 | 112 | ''' 113 | ''' 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | ''' 122 | 123 | class MyHttpHandler(http.server.BaseHTTPRequestHandler): 124 | def do_GET(self): 125 | path=urllib.parse.unquote(self.path) 126 | if path.endswith('.ico'): 127 | return 128 | if len(path)>1: 129 | print(path) 130 | rtn=html 131 | sen=anno() 132 | raw,seq=sen['raw'],sen['anno'] 133 | s=[] 134 | for id,c in enumerate(raw): 135 | s.append('''%s'''%(id,id,c)) 136 | if id?'''%(id+1)) 138 | rtn=rtn.replace('$(sequence)',''.join(s)) 139 | rtn=rtn.replace('$(url)',url) 140 | rtn=rtn.replace('$(sen_id)',sen['id']) 141 | self.send_response(200) 142 | self.send_header( "Content-type", "text/html" ) 143 | self.end_headers() 144 | 145 | self.wfile.write(rtn.encode('utf8')) 146 | 147 | 148 | def run(server_class=http.server.HTTPServer, handler_class=http.server.BaseHTTPRequestHandler 149 | ,addr=('', 8082)): 150 | server_address = addr 151 | httpd = server_class(server_address, handler_class) 152 | httpd.serve_forever() 153 | return httpd; 154 | 155 | 156 | class Anno: 157 | def __init__(self): 158 | self.data=[] 159 | for line in open("sample.json"): 160 | sen=json.loads(line) 161 | self.data.append(sen) 162 | self.ind=0 163 | def __call__(self,string=""): 164 | if string=='stop': 165 | 166 | return 167 | if self.ind>=len(self.data): 168 | return '' 169 | sen=self.data[self.ind] 170 | self.ind+=1 171 | return sen 172 | 173 | anno=Anno() 174 | 175 | 176 | if __name__=="__main__": 177 | 178 | 179 | 180 | lock=multiprocessing.Lock() 181 | 182 | 183 | print('server started') 184 | 185 | url="http://166.111.138.130:8082/" 186 | port=8082 187 | if len(sys.argv)>1: 188 | url="http://166.111.138.130:"+sys.argv[1]+"/" 189 | port=int(sys.argv[1]) 190 | 191 | print(url) 192 | run(handler_class=MyHttpHandler, addr=('',port)) 193 | 194 | -------------------------------------------------------------------------------- /isan/annotation/seg/sample.html: -------------------------------------------------------------------------------- 1 | 7 | 8 | 71 |
72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 |
92 | 提交 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /isan/common/Chinese.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 知识点: 4 | 》unicode编码中,汉字的大体范围为:“一”-“鿋” 5 | 》半角转全角,只需要内码加上65248,全角空格为12288 6 | 7 | """ 8 | 9 | #汉字集合 10 | chinese_characters=set(chr(i) for i in range(ord('一'),ord('鿋')+1)) 11 | #阿拉伯数字集合 12 | number_characters=set(chr(x) for x in range(ord('0'),ord('9')+1)) 13 | #拉丁字母 14 | latin_characters=set(chr(x) for x in range(ord('a'),ord('z')+1)) 15 | latin_characters.update(chr(x) for x in range(ord('A'),ord('Z')+1)) 16 | 17 | #内容字符,汉字、阿拉伯数字、拉丁字母的集合 18 | content_characters=set() 19 | content_characters.update(chinese_characters) 20 | content_characters.update(number_characters) 21 | content_characters.update(latin_characters) 22 | 23 | #句末符号 24 | full_stops=set('。?!') 25 | 26 | def test(): 27 | print("测试") 28 | 29 | def to_full(text,ignore=set()): 30 | """ 31 | 半角转全角的程序 32 | 空格变成全角 33 | 大于空格的直接加上偏移量 34 | 否则不变 35 | """ 36 | 37 | return ''.join(chr(12288) if x==32 else chr(x+65248) if (x<128 and x>32 and (x not in ignore)) else chr(x) 38 | for x in map(ord,text)) 39 | 40 | def seg_sentence(text): 41 | """ 42 | 切分句子 43 | """ 44 | cache=[] 45 | sentences=[] 46 | has_non=False 47 | for c in text: 48 | cache.append(c) 49 | if c in full_stops and has_non: 50 | cache=''.join(cache) 51 | 52 | cache=cache.strip() 53 | if cache: 54 | sentences.append(cache) 55 | cache=[] 56 | has_non=False 57 | 58 | elif c in content_characters: 59 | has_non=True 60 | if cache: 61 | if not sentences:sentences.append('') 62 | sentences[-1]+=''.join(cache) 63 | return sentences 64 | 65 | def seg_by_punctuations(text): 66 | pass 67 | if __name__=="__main__": 68 | print(seg_sentence(to_full('。“hello world?!wo23。”'))) 69 | -------------------------------------------------------------------------------- /isan/common/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | def test(): 4 | print("this is a test function") 5 | -------------------------------------------------------------------------------- /isan/common/decoder.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "isan/common/common.hpp" 4 | #include "isan/common/searcher.hpp" 5 | 6 | 7 | 8 | /** 9 | * searcher data 10 | * provide information for searcher 11 | * */ 12 | 13 | namespace isan{ 14 | 15 | 16 | 17 | class General_Searcher_Data : 18 | public Searcher_Data{ 19 | public: 20 | 21 | Feature_Generator * feature_generator; 22 | State_Generator * shifted_state_generator; 23 | Reduced_State_Generator * reduced_state_generator; 24 | Early_Stop_Checker * early_stop_checker; 25 | 26 | size_t learning_step; 27 | 28 | // init this object 29 | General_Searcher_Data( 30 | Early_Stop_Checker * early_stop_checker, 31 | State_Generator *shifted_state_generator, 32 | Reduced_State_Generator *reduced_state_generator, 33 | Feature_Generator* feature_generator){ 34 | this->early_stop_checker=early_stop_checker; 35 | if(this->early_stop_checker)this->use_early_stop=true; 36 | this->feature_generator=feature_generator; 37 | this->shifted_state_generator=shifted_state_generator; 38 | 39 | this->reduced_state_generator=reduced_state_generator; 40 | //std::cout<& last_alphas, 49 | const std::vector& states 50 | ){ 51 | return (*early_stop_checker)( 52 | step, 53 | last_alphas, 54 | states); 55 | }; 56 | 57 | 
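// shift(): expand a beam state at position `ind` into successor states. The candidate actions and states come from the configured state generator (either a native C++ generator or a Python callback); cal_weights() then scores each candidate action with the current feature weights.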
inline void shift( 58 | const int& ind, 59 | State_Type& state, 60 | std::vector& next_actions, 61 | std::vector& next_inds, 62 | std::vector& next_states, 63 | std::vector& scores 64 | ){ 65 | next_inds.clear();// clear the vector first 66 | (*shifted_state_generator)(ind,state,next_actions,next_inds,next_states); 67 | 68 | cal_weights(state,next_actions,scores,learning_step); 69 | }; 70 | void reduce( 71 | const int state_ind, 72 | const State_Type& state, 73 | const std::vector& pred_alphas, 74 | std::vector& next_actions, 75 | std::vector& next_inds, 76 | std::vector& next_states, 77 | std::vector& reduce_pred_alphas, 78 | std::vector& scores 79 | ){ 80 | if(this->reduced_state_generator){ 81 | (*reduced_state_generator)( 82 | state_ind, 83 | state, 84 | pred_alphas, 85 | next_actions, 86 | next_inds, 87 | next_states, 88 | reduce_pred_alphas 89 | ); 90 | cal_weights(state,next_actions,scores,learning_step); 91 | } 92 | }; 93 | 94 | // calculate the sum of the weights according to the list of symbolic features 95 | inline void cal_weights( 96 | const STATE& state, 97 | const std::vector& next_actions, 98 | std::vector& scores, 99 | size_t step 100 | ){ 101 | scores.resize(next_actions.size()); 102 | for(int i=0;i My_Searcher; 115 | public: 116 | State_Type init_state; 117 | int beam_width; 118 | General_Searcher_Data * data; 119 | 120 | My_Searcher * push_down; 121 | 122 | State_Generator * shifted_state_generator; 123 | Reduced_State_Generator * reduced_state_generator; 124 | Feature_Generator * feature_generator; 125 | Early_Stop_Checker * early_stop_checker; 126 | 127 | Chinese* raw; 128 | 129 | Interface(int beam_width, 130 | PyObject * py_early_stop_callback, 131 | PyObject * py_shift_callback, 132 | PyObject * py_reduce_callback, 133 | PyObject * py_feature_cb 134 | ){ 135 | if(PyLong_Check(py_shift_callback)){ 136 | shifted_state_generator=(State_Generator *) PyLong_AsUnsignedLong(py_shift_callback); 137 | }else{ 138 | shifted_state_generator=new Python_State_Generator(py_shift_callback); 139 | }; 140 | 141 | reduced_state_generator=NULL; 142 | if(py_reduce_callback!=Py_None){ 143 | reduced_state_generator=new Python_Reduced_State_Generator(py_reduce_callback); 144 | }; 145 | 146 | if(PyLong_Check( py_feature_cb)){ 147 | feature_generator=(Feature_Generator*) PyLong_AsUnsignedLong( py_feature_cb); 148 | }else{ 149 | feature_generator=new Python_Feature_Generator( py_feature_cb); 150 | }; 151 | early_stop_checker=NULL; 152 | if(py_early_stop_callback!=Py_None){ 153 | early_stop_checker=new Python_Early_Stop_Checker(py_early_stop_callback); 154 | } 155 | 156 | raw=NULL; 157 | this->beam_width=beam_width; 158 | this->data=new General_Searcher_Data( 159 | early_stop_checker, 160 | shifted_state_generator, 161 | reduced_state_generator, 162 | feature_generator); 163 | this->push_down=new My_Searcher(this->data,beam_width); 164 | 165 | }; 166 | 167 | void set_raw(Chinese& raw){ 168 | if(this->raw)delete this->raw; 169 | this->raw=new Chinese(raw); 170 | this->shifted_state_generator->raw=this->raw; 171 | this->feature_generator->set_raw(this->raw); 172 | } 173 | 174 | ~Interface(){ 175 | delete this->data; 176 | delete this->push_down; 177 | delete feature_generator; 178 | delete shifted_state_generator; 179 | delete early_stop_checker; 180 | if(reduced_state_generator) 181 | delete reduced_state_generator; 182 | }; 183 | }; 184 | 185 | };//isan 186 | -------------------------------------------------------------------------------- /isan/common/decoder.py: 
-------------------------------------------------------------------------------- 1 | import isan.common.pushdown as pushdown 2 | import isan.common.dfabeam as dfabeam 3 | import isan.common.first_order_linear as first_order_linear 4 | 5 | 6 | class Searcher: 7 | def search(self): 8 | return self.searcher.search(self.handler,self.get_init_states()) 9 | def get_states(self): 10 | return self.searcher.get_states(self.handler) 11 | def __del__(self): 12 | self.searcher.delete(self.handler) 13 | 14 | def __init__(self,schema,beam_width): 15 | self.get_init_states=schema.get_init_states 16 | self.handler=self.searcher.new( 17 | beam_width, 18 | schema.early_stop if hasattr(schema,'early_stop') else None, 19 | schema.shift, 20 | schema.reduce, 21 | schema.gen_features, 22 | ) 23 | 24 | class DFA(Searcher): 25 | name='状态转移' 26 | searcher=dfabeam 27 | class Push_Down(Searcher): 28 | name='Shift-Reduce' 29 | searcher=pushdown 30 | class First_Order_Linear(Searcher): 31 | name='first order linear' 32 | searcher=first_order_linear 33 | def cal_margins(self): 34 | return self.searcher.cal_margins(self.handler) 35 | def __init__(self,schema,beam_width): 36 | self.get_init_states=schema.get_init_states 37 | self.handler=self.searcher.new( 38 | 1, 39 | schema.emission, 40 | schema.transition 41 | ) 42 | -------------------------------------------------------------------------------- /isan/common/feature_dict.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | #define __MODULE_NAME feature_dict 7 | #define __INIT_FUNC(a,b) a##b 8 | #define INIT_FUNC(a,b) __INIT_FUNC(a,b) 9 | #define PYINIT PyInit_ 10 | #define STR(x) #x 11 | 12 | struct Hash{ 13 | size_t operator()(const PyObject* key) const{ 14 | size_t size=PyUnicode_GET_SIZE(key); 15 | auto* data=PyUnicode_AS_UNICODE(key); 16 | size_t hk=0; 17 | for(int i=0;i Dict; 38 | 39 | static PyObject * 40 | module_new(PyObject *self, PyObject *arg) 41 | { 42 | Dict* dict=new Dict(); 43 | return PyLong_FromLong((long)dict); 44 | }; 45 | 46 | static PyObject * 47 | dict_size(PyObject *self, PyObject *arg){ 48 | Dict* dict=(Dict*)PyLong_AsLong(arg); 49 | return PyLong_FromLong(dict->size()); 50 | }; 51 | 52 | static PyObject * 53 | set_weights(PyObject *self, PyObject *arg){ 54 | Dict* dict; 55 | PyObject * py_dict; 56 | 57 | PyArg_ParseTuple(arg, "LO", &dict,&py_dict); 58 | 59 | PyObject *key, *value; 60 | Py_ssize_t pos = 0; 61 | 62 | size_t length; 63 | while (PyDict_Next(py_dict, &pos, &key, &value)) { 64 | Py_INCREF(key); 65 | (*dict)[key]=PyFloat_AsDouble(value); 66 | }; 67 | 68 | Py_INCREF(Py_None); 69 | return Py_None; 70 | }; 71 | 72 | static PyObject * 73 | to_dict(PyObject *self, PyObject *arg){ 74 | Dict* dict=(Dict*)PyLong_AsLong(arg); 75 | PyObject * py_dict=PyDict_New(); 76 | for(auto it=dict->begin();it!=dict->end();++it){ 77 | PyObject * key=it->first; 78 | PyObject * value=PyFloat_FromDouble(it->second); 79 | PyDict_SetItem(py_dict,key,value); 80 | Py_DECREF(value); 81 | }; 82 | return py_dict; 83 | }; 84 | 85 | static PyObject * 86 | clear(PyObject *self, PyObject *arg){ 87 | Dict* dict=(Dict*)PyLong_AsLong(arg); 88 | for(auto it=dict->begin();it!=dict->end();++it){ 89 | Py_DECREF(it->first); 90 | } 91 | dict->clear(); 92 | Py_INCREF(Py_None); 93 | return Py_None; 94 | }; 95 | static PyObject * 96 | cal_fv(PyObject *self, PyObject *arg){ 97 | Dict* dict; 98 | PyObject * py_fv; 99 | 100 | PyArg_ParseTuple(arg, "LO", &dict,&py_fv); 101 | 102 | long 
size=PySequence_Size(py_fv); 103 | 104 | double score=0; 105 | 106 | for(int i=0;ifind(key); 109 | if (got!=dict->end()){ 110 | score+=got->second; 111 | }; 112 | Py_DECREF(key); 113 | } 114 | 115 | return PyFloat_FromDouble(score); 116 | Py_INCREF(Py_None); 117 | return Py_None; 118 | }; 119 | static PyObject * 120 | get(PyObject *self, PyObject *arg){ 121 | Dict* dict; 122 | PyObject * key; 123 | 124 | PyArg_ParseTuple(arg, "LO", &dict,&key); 125 | 126 | 127 | auto got=dict->find(key); 128 | if (got!=dict->end()){ 129 | return PyFloat_FromDouble(got->second); 130 | }; 131 | 132 | return PyFloat_FromDouble(0); 133 | }; 134 | 135 | static PyObject * 136 | update_fv(PyObject *self, PyObject *arg){ 137 | Dict* dict; 138 | PyObject * py_fv; 139 | double delta; 140 | 141 | 142 | PyArg_ParseTuple(arg, "LOd", &dict,&py_fv,&delta); 143 | 144 | long size=PySequence_Size(py_fv); 145 | 146 | for(int i=0;ifind(key); 149 | if (got!=dict->end()){ 150 | got->second+=delta; 151 | }else{ 152 | Py_INCREF(key); 153 | (*dict)[key]=delta; 154 | }; 155 | Py_DECREF(key); 156 | } 157 | 158 | Py_INCREF(Py_None); 159 | return Py_None; 160 | }; 161 | 162 | static PyObject * 163 | interface_delete(PyObject *self, PyObject *arg){ 164 | Dict* dict=(Dict*)PyLong_AsLong(arg); 165 | for(auto it=dict->begin();it!=dict->end();++it){ 166 | Py_DECREF(it->first); 167 | } 168 | dict->clear(); 169 | delete dict; 170 | Py_INCREF(Py_None); 171 | return Py_None; 172 | }; 173 | 174 | /** stuffs about the module def */ 175 | static PyMethodDef interfaceMethods[] = { 176 | {"new", module_new, METH_VARARGS,""}, 177 | {"delete", interface_delete, METH_O,""}, 178 | {"size", dict_size, METH_O,""}, 179 | {"set_weights", set_weights, METH_VARARGS,""}, 180 | {"cal_fv", cal_fv, METH_VARARGS,""}, 181 | {"update_fv", update_fv, METH_VARARGS,""}, 182 | {"get", get, METH_VARARGS,""}, 183 | {"to_dict", to_dict, METH_O,""}, 184 | {"clear", clear, METH_O,""}, 185 | //{"set_raw", set_raw, METH_VARARGS,""}, 186 | {NULL, NULL, 0, NULL} /* Sentinel */ 187 | }; 188 | 189 | static struct PyModuleDef module_struct = { 190 | PyModuleDef_HEAD_INIT, 191 | STR(__MODULE_NAME), /* name of module */ 192 | NULL, /* module documentation, may be NULL */ 193 | -1, /* size of per-interpreter state of the module, 194 | or -1 if the module keeps state in global variables. 
*/ 195 | interfaceMethods 196 | }; 197 | 198 | PyMODINIT_FUNC 199 | INIT_FUNC(PYINIT,__MODULE_NAME) (void) 200 | { 201 | return PyModule_Create(&module_struct); 202 | } 203 | -------------------------------------------------------------------------------- /isan/common/first_order_linear/decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | namespace isan{ 3 | 4 | typedef double Score_Type; 5 | typedef size_t Tag_Type; 6 | 7 | //a structure for alphas and betas 8 | struct Alpha_Beta{ 9 | Score_Type value; 10 | Tag_Type tag_id; 11 | }; 12 | 13 | /** The DP algorithm(s) for path labeling */ 14 | inline Score_Type dp_decode( 15 | const size_t tagset_size, 16 | const size_t node_size, 17 | const Score_Type* transitions, 18 | const Score_Type* emissions, 19 | Alpha_Beta* alphas, 20 | Tag_Type* tags 21 | ){ 22 | 23 | Tag_Type max_tag_id; 24 | Score_Type max_value; 25 | // scores of the first item 26 | for(Tag_Type j=0;j max_value){ 36 | max_value=value; 37 | max_tag_id=k; 38 | } 39 | }; 40 | 41 | alphas[i*tagset_size+j].value=emissions[i*tagset_size+j]+max_value; 42 | alphas[i*tagset_size+j].tag_id=max_tag_id; 43 | 44 | }; 45 | }; 46 | 47 | max_tag_id=0; 48 | max_value=alphas[(node_size-1)*tagset_size].value; 49 | for(Tag_Type k=1;k max_value){ 52 | max_value=value; 53 | max_tag_id=k; 54 | } 55 | }; 56 | 57 | size_t node_id=node_size-1; 58 | size_t tag_id=max_tag_id; 59 | tags[node_id]=tag_id; 60 | while (node_id>0) { 61 | tag_id=alphas[(node_id)*tagset_size+tag_id].tag_id; 62 | node_id--; 63 | tags[node_id]=tag_id; 64 | 65 | }; 66 | return max_value; 67 | }; 68 | 69 | 70 | 71 | /** cal beta */ 72 | inline void dp_cal_beta( 73 | const size_t tagset_size, 74 | const size_t node_size, 75 | const Score_Type* transitions, 76 | const Score_Type* emissions, 77 | Alpha_Beta* betas 78 | ){ 79 | 80 | Tag_Type max_tag_id; 81 | Score_Type max_value; 82 | // scores of the first item 83 | for(Tag_Type j=0;j=0;--i){ 88 | for(Tag_Type j=0;j max_value){ 96 | max_value=value; 97 | max_tag_id=k; 98 | } 99 | }; 100 | 101 | betas[i*tagset_size+j].value=emissions[i*tagset_size+j]+max_value; 102 | betas[i*tagset_size+j].tag_id=max_tag_id; 103 | 104 | }; 105 | }; 106 | }; 107 | 108 | 109 | }//end of namespace 110 | -------------------------------------------------------------------------------- /isan/common/general_types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "isan/common/searcher.hpp" 5 | #include "isan/common/smart_string.hpp" 6 | namespace isan{ 7 | typedef long Action_Type; 8 | 9 | typedef double Score_Type; 10 | 11 | typedef Smart_Chars State_Type; 12 | 13 | typedef unsigned short Chinese_Character; 14 | typedef Smart_String Chinese; 15 | 16 | 17 | typedef Alpha Alpha_Type; 18 | typedef State_Info State_Info_Type; 19 | 20 | 21 | template 22 | inline static PyObject * 23 | pack_alpha(Alpha alpha){ 24 | PyObject * py_step=PyLong_FromLong(alpha->ind1); 25 | PyObject * py_state=alpha->state1.pack(); 26 | PyObject * py_action=PyLong_FromLong(alpha->action); 27 | PyObject * py_move=PyTuple_Pack(3,py_step,py_state,py_action); 28 | Py_DECREF( py_step); 29 | Py_DECREF( py_state); 30 | Py_DECREF( py_action); 31 | return py_move; 32 | }; 33 | 34 | };//end of isan 35 | -------------------------------------------------------------------------------- /isan/common/parameters.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | """ 3 | 4 | """ 5 | 6 | class Para_Dict (dict): 7 | def __call__(self,keys): 8 | return sum(self.get(k,0) for k in keys) 9 | 10 | class Parameters : 11 | def __init__(self,para_class): 12 | self.para_class=para_class 13 | self._list=list() 14 | self._dirty=list() 15 | 16 | def add(self,value): 17 | if type(value)== dict : 18 | p=self.para_class.d(value) 19 | p.init(self) 20 | else : 21 | p=value.view(self.para_class.ndarray) 22 | p.init(self) 23 | self._list.append(p) 24 | return p 25 | 26 | def update(self,step=0) : 27 | for p in self._dirty : 28 | p._update(step) 29 | del self._dirty[:] 30 | 31 | def final(self,step): 32 | for p in self._list : 33 | if hasattr(p,'final') : 34 | p.final(step) 35 | 36 | def un_final(self): 37 | for p in self._list : 38 | if hasattr(p,'un_final') : 39 | p.un_final() 40 | 41 | 42 | class _Base_Dict (Para_Dict): 43 | def init(self,paras): 44 | self._delta={} 45 | self._paras=paras 46 | 47 | def output_obj(self): 48 | for k,v in self.items(): 49 | if hasattr(v,'output_obj') : 50 | self[k]=v.output_obj() 51 | return Para_Dict(self) 52 | 53 | def add_delta(self,keys,delta): 54 | for f in keys : 55 | if f not in self._delta : 56 | self._delta[f]=.0 57 | self._delta[f]+=delta 58 | self._paras._dirty.append(self) 59 | 60 | def add_model(self,model): 61 | for k,v in model.items(): 62 | if k not in self : 63 | self[k]=0 64 | self._delta[k]=0 65 | self[k]=(self[k]*self._delta[k]+v)/(self._delta[k]+1) 66 | self._delta[k]+=1 67 | 68 | class _Base_ndarray(np.ndarray): 69 | def init(self,paras): 70 | self._s=0 71 | self._delta=0 72 | self.paras=paras 73 | 74 | def add_delta(self,delta) : 75 | self._delta+=delta 76 | self.paras._dirty.append(self) 77 | 78 | def output_obj(self): 79 | return np.array(self) 80 | -------------------------------------------------------------------------------- /isan/common/perceptrons.py: -------------------------------------------------------------------------------- 1 | """ 2 | ZHANG Kaixu 3 | """ 4 | import logging 5 | import sys 6 | import pickle 7 | import random 8 | import gzip 9 | from isan.common.parameters import Parameters 10 | 11 | class Model(object): 12 | """感知器模型 """ 13 | name="感知器" #: 模型的名字 14 | 15 | def __init__(self,model_file,Task=None,Searcher=None, 16 | Updater=None, 17 | beam_width=8,logger=None,cmd_args={},**conf): 18 | """ 19 | 初始化 20 | 如果不设置,则读取已有模型。如果设置,就是学习新模型 21 | """ 22 | if logger==None : 23 | logger=logging.getLogger(__name__) 24 | console=logging.StreamHandler() 25 | console.setLevel(logging.INFO) 26 | logger.addHandler(console) 27 | logger.setLevel(logging.INFO) 28 | self.result_logger=logger 29 | 30 | self.beam_width=beam_width#:搜索宽度 31 | self.conf=conf 32 | 33 | if model_file!=None: 34 | file=gzip.open(model_file,"rb") 35 | self.task=Task(model=pickle.load(file),logger=logger) 36 | file.close() 37 | else : # new model to train 38 | self.paras=Parameters(Updater) 39 | #self.paras=Parameters(Ada_Grad) 40 | self.task=Task(logger=logger,paras=self.paras) 41 | if hasattr(self.task,'init'): 42 | self.task.init() 43 | self.searcher=Searcher(self.task,beam_width) 44 | self.step=0 45 | 46 | def __del__(self): 47 | del self.searcher 48 | def test(self,test_file): 49 | """ 50 | 测试 51 | """ 52 | eval=self.task.Eval() 53 | for line in open(test_file): 54 | arg=self.task.codec.decode(line.strip()) 55 | raw=arg.get('raw') 56 | Y=arg.get('Y_a',None) 57 | y=arg.get('y',None) 58 | hat_y=self(raw) 59 | eval(y,hat_y) 60 | if hasattr(eval,'get_result'): 61 | self.result_logger.info(eval.get_result()) 62 | else : 63 
| eval.print_result()#打印评测结果 64 | return eval 65 | 66 | def develop(self,dev_file): 67 | """ 68 | @brief 预测开发集 69 | """ 70 | 71 | self.paras.final(self.step) 72 | eval=self.task.Eval() 73 | for line in open(dev_file): 74 | arg=self.task.codec.decode(line.strip()) 75 | if not arg:continue 76 | raw=arg.get('raw') 77 | y=arg.get('y',None) 78 | hat_y=self(raw) 79 | eval(y,hat_y) 80 | if hasattr(eval,'get_result'): 81 | self.result_logger.info(eval.get_result()) 82 | else : 83 | eval.print_result()#打印评测结果 84 | self.paras.un_final() 85 | 86 | if hasattr(eval,'get_scaler'): 87 | return eval.get_scaler() 88 | 89 | 90 | def save(self,model_file=None): 91 | """ 92 | 保存模型 93 | """ 94 | 95 | if model_file==None : model_file=self.model_file 96 | if model_file==None : return 97 | if model_file=='/dev/null' : return 98 | 99 | #self.task.average_weights(self.step) 100 | self.paras.final(self.step) 101 | 102 | file=gzip.open(model_file,'wb') 103 | data=self.task.dump_weights() 104 | pickle.dump(data,file) 105 | file.close() 106 | 107 | def search(self,raw,Y=None): 108 | """ 109 | 搜索 110 | """ 111 | self.task.set_raw(raw,Y) 112 | #self.searcher.set_raw(raw) 113 | return self.searcher.search() 114 | 115 | def __call__(self,raw,Y=None,threshold=0): 116 | """ 117 | 解码,读入生句子,返回词的数组 118 | """ 119 | rst_moves=self.search(raw,Y) 120 | 121 | hat_y=self.task.moves_to_result(rst_moves,raw) 122 | if threshold==0 : 123 | return hat_y 124 | else: 125 | margins=self.searcher.cal_margins() 126 | return self.task.gen_candidates(margins,threshold) 127 | 128 | def _learn_sentence(self,arg): 129 | """ 130 | 学习,根据生句子和标准分词结果 131 | """ 132 | raw=arg.get('raw') 133 | self.raw=raw 134 | y=arg.get('y',None) 135 | Y_a=arg.get('Y_a',None) 136 | 137 | #self.logger.debug('get training example') 138 | #self.logger.debug("raw: %s"%raw) 139 | #self.logger.debug("y: %s"%y) 140 | #self.logger.debug("Y_a: %s"%Y_a) 141 | 142 | 143 | #学习步数加一 144 | self.step+=1 145 | 146 | #set oracle, get standard actions 147 | if hasattr(self.task,'set_oracle'): 148 | std_moves=self.task.set_oracle(raw,y) 149 | 150 | #self.logger.debug(std_moves) 151 | 152 | #get result actions 153 | #self.searcher.set_step(self.step) 154 | rst_moves=self.search(raw,Y_a)#得到解码后动作 155 | 156 | #update 157 | if not self.task.check(std_moves,rst_moves):#check 158 | self.update(std_moves,rst_moves)#update 159 | 160 | #clean oracle 161 | if hasattr(self.task,'remove_oracle'): 162 | self.task.remove_oracle() 163 | 164 | hat_y=self.task.moves_to_result(rst_moves,raw)#得到解码后结果 165 | return y,hat_y 166 | 167 | def update(self,std_moves,rst_moves): 168 | #self.task.cal_delta(std_moves,rst_moves,self.step) 169 | self.task.cal_delta(std_moves,rst_moves) 170 | if self.step%self.batch_size==0 : 171 | self.paras.update(self.step) 172 | 173 | 174 | def train(self,training_file, 175 | iteration=5,peek=-1, 176 | dev_files=None,keep_data=True,batch_size=1): 177 | """ 178 | 训练 179 | """ 180 | if iteration<=0 and peek <=0 : peek=5 181 | self.batch_size=batch_size 182 | 183 | if type(training_file)==str:training_file=[training_file] 184 | #random.seed(123) 185 | 186 | if keep_data : 187 | training_data=[] 188 | for t_file in training_file : 189 | for line in open(t_file):#迭代每个句子 190 | rtn=self.task.codec.decode(line.strip())#得到标准输出 191 | if not rtn:continue 192 | training_data.append(rtn) 193 | random.shuffle(training_data) 194 | 195 | 196 | def gen_data(): 197 | if keep_data : 198 | perc=0 199 | print(perc,end='%\r') 200 | #random.shuffle(training_data) 201 | for i,e in enumerate(training_data) : 
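# progress display: the body below prints the percentage of cached training
# examples already consumed to stderr before yielding the next one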
202 | p=int(i*100/len(training_data)) 203 | if p != perc : 204 | print("%i"%(p),end='%\r',file=sys.stderr) 205 | perc=p 206 | yield e 207 | else : 208 | for t_file in training_file: 209 | for line in open(t_file):#迭代每个句子 210 | rtn=self.task.codec.decode(line.strip())#得到标准输出 211 | if not rtn:continue 212 | yield rtn 213 | 214 | it=0 215 | best_it=None 216 | best_scaler=None 217 | 218 | while True : 219 | if it == iteration : break 220 | self.result_logger.info("训练集第 \033[33;01m%i\033[1;m 次迭代"%(it+1)) 221 | eval=self.task.Eval()#: 测试用的对象 222 | 223 | for rtn in gen_data(): 224 | if rtn is None : continue 225 | y,hat_y=self._learn_sentence(rtn)#根据(输入,输出)学习参数,顺便得到解码结果 226 | eval(y,hat_y)#根据解码结果和标准输出,评价效果 227 | 228 | if hasattr(eval,'get_result'): 229 | self.result_logger.info(eval.get_result()) 230 | else : 231 | eval.print_result()#打印评测结果 232 | 233 | if hasattr(self.task,'report'): 234 | self.task.report() 235 | 236 | if dev_files: 237 | #self.result_logger.info("使用开发集 %s 评价当前模型效果"%(dev_file)) 238 | for dev_id,dev_file in enumerate(dev_files) : 239 | scaler=self.develop(dev_file) 240 | if dev_id==0 : 241 | if best_scaler==None or (scaler and best_scaler=0 and it-best_it>peek : break 246 | def __del__(self): 247 | self.task.__del__() 248 | del self.task 249 | 250 | 251 | class Model_PA(Model) : 252 | name="局部标注平均感知器" 253 | def _learn_sentence(self,arg): 254 | """ 255 | 学习,根据生句子和标准分词结果 256 | """ 257 | raw=arg.get('raw') 258 | self.raw=raw 259 | y=arg.get('y',None) 260 | Y_a=arg.get('Y_a',None) 261 | Y_b=arg.get('Y_b',None) 262 | #print(arg) 263 | 264 | #学习步数加一 265 | self.step+=1 266 | 267 | #get standard actions 268 | if hasattr(self.task,'set_oracle'): 269 | std_moves=self.task.set_oracle(raw,y,Y_b) 270 | 271 | #get result actions 272 | rst_moves=self.search(raw,Y_a)#得到解码后动作 273 | 274 | #clean the early-update data 275 | if hasattr(self.task,'remove_oracle'): 276 | self.task.remove_oracle() 277 | 278 | if not self.task.is_belong(raw,rst_moves,Y_b): #不一致,则更新 279 | if y and not Y_b : 280 | std_moves=self.task.result_to_moves(y)#得到标准动作 281 | else : 282 | #print('yb',Y_b) 283 | std_moves=self.search(raw,Y_b) 284 | self.update(std_moves,rst_moves) 285 | hat_y=self.task.moves_to_result(rst_moves,raw)#得到解码后结果 286 | return y,hat_y 287 | 288 | -------------------------------------------------------------------------------- /isan/common/python_interface.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include "isan/common/common.hpp" 3 | #include "isan/common/decoder.hpp" 4 | 5 | #define __INIT_FUNC(a,b) a##b 6 | #define INIT_FUNC(a,b) __INIT_FUNC(a,b) 7 | #define PYINIT PyInit_ 8 | #define STR(x) #x 9 | 10 | 11 | namespace isan{ 12 | 13 | static PyObject * 14 | interface_delete(PyObject *self, PyObject *arg){ 15 | delete (Interface*)PyLong_AsLong(arg); 16 | Py_INCREF(Py_None); 17 | return Py_None; 18 | }; 19 | 20 | static PyObject * 21 | set_raw(PyObject *self, PyObject *arg) 22 | { 23 | Interface* interface; 24 | PyObject *new_raw; 25 | PyArg_ParseTuple(arg, "LO", &interface,&new_raw); 26 | if(!PyUnicode_Check(new_raw)){ 27 | Py_INCREF(Py_None); 28 | return Py_None; 29 | 30 | }; 31 | long raw_size=PySequence_Size(new_raw); 32 | 33 | Chinese raw(raw_size); 34 | for(int i=0;iset_raw(raw); 40 | Py_INCREF(Py_None); 41 | 42 | return Py_None; 43 | }; 44 | static PyObject * 45 | set_step(PyObject *self, PyObject *arg) 46 | { 47 | Interface* interface; 48 | PyObject *new_raw; 49 | long step=0; 50 | 51 | PyArg_ParseTuple(arg, "LL", &interface,&step); 52 | 53 
| interface->data->learning_step=step; 54 | Py_INCREF(Py_None); 55 | 56 | return Py_None; 57 | }; 58 | static PyObject * 59 | do_nothing(PyObject *self, PyObject *arg) 60 | { 61 | Py_INCREF(Py_None); 62 | return Py_None; 63 | }; 64 | 65 | 66 | 67 | static PyObject * 68 | search(PyObject *self, PyObject *arg) 69 | { 70 | 71 | Interface* interface; 72 | PyObject *py_init_states; 73 | PyArg_ParseTuple(arg, "LO", &interface,&py_init_states); 74 | 75 | std::vector init_states; 76 | for(int i=0;i result_alphas; 81 | 82 | (*interface->push_down)( 83 | init_states, 84 | result_alphas); 85 | PyObject * rtn_list=PyList_New(result_alphas.size()); 86 | for(int i=0;ipush_down->cal_betas(); 98 | 99 | std::vector states; 100 | std::vector scores; 101 | 102 | interface->push_down->get_states(states,scores); 103 | 104 | PyObject * list=PyList_New(states.size()); 105 | for(int i=0;i 3 | #include 4 | template 5 | class Smart_String{ 6 | public: 7 | typedef size_t SIZE_T; 8 | ITEM* pt; 9 | SIZE_T length; 10 | SIZE_T* _ref_count; 11 | Smart_String(){ 12 | pt=NULL; 13 | length=0; 14 | _ref_count=new SIZE_T(); 15 | *_ref_count=1; 16 | }; 17 | Smart_String(ITEM* buffer, SIZE_T length){ 18 | _ref_count=new SIZE_T(); 19 | *_ref_count=1; 20 | pt=new ITEM[length]; 21 | this->length=length; 22 | memcpy(pt,buffer,length*sizeof(ITEM)); 23 | }; 24 | Smart_String(SIZE_T length){ 25 | _ref_count=new SIZE_T(); 26 | *_ref_count=1; 27 | pt=new ITEM[length]; 28 | this->length=length; 29 | }; 30 | Smart_String(const Smart_String& other){ 31 | pt=other.pt; 32 | length=other.length; 33 | _ref_count=other._ref_count; 34 | (*_ref_count)++; 35 | }; 36 | inline void operator=(const Smart_String& other){ 37 | (*_ref_count)--; 38 | if(!*_ref_count){ 39 | delete _ref_count; 40 | if(pt)delete[] pt; 41 | } 42 | pt=other.pt; 43 | length=other.length; 44 | _ref_count=other._ref_count; 45 | (*_ref_count)++; 46 | }; 47 | ~Smart_String(){ 48 | (*_ref_count)--; 49 | if(!*_ref_count){ 50 | delete _ref_count; 51 | if(pt)delete[] pt; 52 | } 53 | }; 54 | 55 | inline bool operator==(const Smart_String&next) const{ 56 | if(length!=next.length) 57 | return false; 58 | if(pt==next.pt)return true; 59 | for(int i=0;inext.length)return 0; 67 | for(int i=0;inext.pt[i])return 0; 70 | } 71 | return 0; 72 | }; 73 | inline const size_t& size() const{ 74 | return length; 75 | }; 76 | 77 | class HASH{ 78 | public: 79 | inline SIZE_T operator()(const Smart_String& cx) const{ 80 | SIZE_T value=0; 81 | for(int i=0;istr==next.str; 153 | }; 154 | inline bool operator<(const Smart_Chars& next)const{ 155 | if(this->str.length()str.length()>next.str.length())return 0; 157 | for(int i=0;istr.length();i++){ 158 | if((Char)this->str[i]<(Char)next.str[i])return 1; 159 | if((Char)this->str[i]>(Char)next.str[i])return 0; 160 | } 161 | return 0; 162 | }; 163 | }; 164 | -------------------------------------------------------------------------------- /isan/common/task.py: -------------------------------------------------------------------------------- 1 | 2 | #not finished !! 
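# The Lattice class below is a plain list of (begin, end, data) items over a
# sentence: `length` is the largest end offset and `begins` maps each begin
# offset to the indices of the items starting there.  Illustrative example:
#   Lattice([(0, 1, 'a'), (1, 3, 'bc'), (1, 2, 'b'), (2, 3, 'c')])
#   -> length == 3, begins == {0: [0], 1: [1, 2], 2: [3]}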
3 | class Lattice (list) : 4 | def __init__(self,l,w=None): 5 | self.weights=w 6 | self.extend(list(l)) # items= [ (begin,end,data) * ] 7 | self.length=max(l for _,l,_ in self) 8 | self.begins={} 9 | for i in range(len(self)) : 10 | b=self[i][0] 11 | if b not in self.begins : self.begins[b]=[] 12 | self.begins[b].append(i) 13 | def __str__(self): 14 | return ' '.join("%i:(%i,%i):%s"%(i,it[0],it[1],it[2]) for i,it in enumerate(self)) 15 | 16 | def gen_sentence(self,to_str): 17 | o=0 18 | s=[] 19 | while True : 20 | if o not in self.begins : break 21 | nind=self.begins[o][0] 22 | b,o,x=self[nind] 23 | s.append(to_str(x)) 24 | return ''.join(s) 25 | 26 | 27 | 28 | class Base_Task : 29 | def get_init_states(self) : 30 | return [self.State.init_state] 31 | 32 | #def reduce(self,last_ind,stat,pred_inds,predictors): 33 | # pass 34 | reduce = None 35 | 36 | 37 | def actions_to_moves(self,actions,lattice): 38 | #print(lattice) 39 | state=self.State(lattice) 40 | stack=[state] 41 | moves=[[None,None,action] for action in actions] 42 | moves[0][0]=0 43 | moves[0][1]=self.State.init_state 44 | for i in range(len(moves)-1) : 45 | move=moves[i] 46 | step,state,action=move 47 | ind,label=action 48 | 49 | if ind >=0 : # shift 50 | rst=[[nstep,ns] for a,nstep,ns in self.shift(step,state) if a==self.Action.encode(action)] 51 | moves[i+1][0],moves[i+1][1]=rst[0] 52 | stack.append(rst[0][1]) 53 | else : # reduce 54 | s0=stack.pop() 55 | s1=stack.pop() 56 | rst=[[nstep,ns] for a,nstep,ns,_ in self.reduce(step,s0,[0],[s1]) if a==self.Action.encode(action)] 57 | #print(i) 58 | moves[i+1][0],moves[i+1][1]=rst[0] 59 | stack.append(rst[0][1]) 60 | pass 61 | #input() 62 | for move in moves: 63 | move[2]=self.Action.encode(move[2]) 64 | 65 | moves=list(map(tuple,moves)) 66 | return moves 67 | 68 | def moves_to_result(self,moves,_): 69 | actions=[self.Action.decode(a) for ind,state,a in moves] 70 | #print(actions) 71 | #input() 72 | return self.actions_to_result(actions) 73 | 74 | 75 | def check(self,std_moves,rst_moves): 76 | if len(std_moves)!=len(rst_moves) :return False 77 | return all( 78 | std_move[2]==rst_move[2] 79 | for std_move,rst_move in zip(std_moves,rst_moves) 80 | ) 81 | 82 | def set_oracle(self,raw,y) : 83 | self.oracle=[None] 84 | self.set_raw(raw,y) 85 | std_actions=self.result_to_actions(y) 86 | moves=self.actions_to_moves(std_actions,raw) 87 | return moves 88 | 89 | def remove_oracle(self): 90 | self.oracle=None 91 | 92 | early_stop=None 93 | 94 | def _update(self,move,delta): 95 | self.gen_features(move[1],[move[2]],delta) 96 | def update_moves(self,std_moves,rst_moves,step) : 97 | for s,r in zip(std_moves,rst_moves) : 98 | if s!= r: 99 | self._update(s,1,step) 100 | self._update(r,-1,step) 101 | #yield s, 1 102 | #yield r, -1 103 | 104 | def average_weights(self,step): 105 | self.weights.average_weights(step) 106 | 107 | def un_average_weights(self): 108 | self.weights.un_average_weights() 109 | 110 | class Early_Stop_Pointwise : 111 | def set_oracle(self,raw,y) : 112 | self.set_raw(raw,y) 113 | self.stop_step=None 114 | std_actions=self.result_to_actions(y) 115 | moves=self.actions_to_moves(std_actions,raw) 116 | 117 | self.oracle={} 118 | for step,state,action in moves : 119 | self.oracle[step]=self.State.decode(state) 120 | return moves 121 | 122 | def remove_oracle(self): 123 | self.stop_step=None 124 | self.oracle=None 125 | 126 | def early_stop(self,step,next_states,moves): 127 | #print('early') 128 | #return False 129 | if not moves : return False 130 | if not hasattr(self,'oracle') or 
self.oracle==None : return False 131 | last_steps,last_states,actions=zip(*moves) 132 | self.stop_step=None 133 | if step in self.oracle : 134 | next_states=[self.State.decode(x) for x in next_states] 135 | if not (self.oracle[step]in next_states) : 136 | self.stop_step=step 137 | return True 138 | return False 139 | #def early_stop(self,step,next_states,moves): 140 | # return False 141 | 142 | def update_moves(self,std_moves,rst_moves,step) : 143 | for move in rst_moves : 144 | if self.stop_step is not None and move[0]>=self.stop_step : break 145 | self._update(move,-1,step) 146 | #yield move, -1 147 | for move in std_moves : 148 | if self.stop_step is not None and move[0]>=self.stop_step : break 149 | #yield move, 1 150 | self._update(move,1,step) 151 | -------------------------------------------------------------------------------- /isan/common/updater.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import collections 3 | 4 | from isan.common.parameters import _Base_Dict 5 | from isan.common.parameters import _Base_ndarray 6 | 7 | 8 | 9 | class Ada_Grad : 10 | name='Ada Grad' 11 | class d(_Base_Dict): 12 | def __init__(self,dic): 13 | self.update(dic) 14 | self._s=dict(dic) 15 | 16 | def _update(self,step): 17 | for k,v in self._delta.items(): 18 | if np.all(v==0) : continue 19 | if k not in self : 20 | self[k]=0 21 | if k not in self._s : 22 | self._s[k]=0 23 | self._s[k]+=v**2 24 | _s=self._s[k] 25 | _delta=np.where(_s,1/np.sqrt(_s+(_s==0)),0)*v 26 | self[k]+=_delta 27 | self[k]*=0.99 28 | self._delta.clear() 29 | 30 | class ndarray(_Base_ndarray): 31 | def init(self,paras): 32 | self._s=0 33 | self._delta=0 34 | self.paras=paras 35 | def _update(self,step) : 36 | if np.all(self._delta==0) : return 37 | self._s+=self._delta**2 38 | delta=np.where(self._s,1/np.sqrt(self._s+(self._s==0)),0)*self._delta 39 | self+=delta 40 | self*=0.99 41 | self._delta=0 42 | 43 | class Default : 44 | name='naive' 45 | class d(_Base_Dict): 46 | def __init__(self,dic): 47 | self.update(dic) 48 | self._s=dict(dic) 49 | def _update(self,step): 50 | for k,v in self._delta.items(): 51 | if k not in self : 52 | self[k]=0 53 | self._s[k]=0 54 | self[k]+=v 55 | self._s[k]+=v*step 56 | self._delta.clear() 57 | 58 | 59 | class ndarray(_Base_ndarray): 60 | def init(self,paras): 61 | self._delta=0 62 | self.paras=paras 63 | def _update(self,step) : 64 | self+=self._delta 65 | self._delta=0 66 | 67 | class Averaged : 68 | name='Averaged' 69 | class d(_Base_Dict): 70 | def __init__(self,dic): 71 | self.update(dic) 72 | self._s=dict(dic) 73 | 74 | def _update(self,step): 75 | for k,v in self._delta.items(): 76 | if k not in self : 77 | self[k]=0 78 | self._s[k]=0 79 | self[k]+=v 80 | self._s[k]+=v*step 81 | self._delta.clear() 82 | 83 | def final(self,step): 84 | self._backup=dict(self) 85 | for k,v in self._backup.items(): 86 | self[k]=self[k]-self._s[k]/step 87 | 88 | def un_final(self): 89 | self.clear() 90 | self.update(self._backup) 91 | self._backup.clear() 92 | 93 | class ndarray(_Base_ndarray): 94 | def _update(self,step) : 95 | self+=self._delta 96 | self._s+=self._delta*step 97 | self._delta=0 98 | 99 | def final(self,step): 100 | self._d=self*1 101 | self-=self._s/step 102 | 103 | def un_final(self): 104 | self*=0 105 | self+=self._d 106 | -------------------------------------------------------------------------------- /isan/common/weights.py: -------------------------------------------------------------------------------- 1 | import collections 
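# FD below is a thin Python wrapper around the isan.common.feature_dict C
# extension (isan/common/feature_dict.cc), which keeps a hash map from Python
# str feature keys to double weights: cal_fv(fv) returns the summed weight of
# a feature vector, update_fv(fv, delta) adds delta to every feature in fv,
# and to_dict()/set_weights() convert to and from an ordinary Python dict.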
2 | import isan.common.feature_dict as feature_dict 3 | 4 | 5 | class FD : 6 | def __init__(self): 7 | self.fd=feature_dict.new() 8 | 9 | def size(self): 10 | return feature_dict.size(self.fd) 11 | 12 | def set_weights(self,d): 13 | return feature_dict.set_weights(self.fd,d) 14 | 15 | def cal_fv(self,fv): 16 | return feature_dict.cal_fv(self.fd,fv) 17 | 18 | def update_fv(self,fv,delta): 19 | feature_dict.update_fv(self.fd,fv,delta) 20 | 21 | def to_dict(self): 22 | return feature_dict.to_dict(self.fd) 23 | 24 | def get(self,key): 25 | return feature_dict.get(self.fd,key) 26 | 27 | def clear(self): 28 | feature_dict.clear(self.fd) 29 | 30 | def __del__(self): 31 | feature_dict.delete(self.fd) 32 | 33 | class Weights : 34 | def items(self): 35 | for k,v in self.data.items(): 36 | yield k,v 37 | 38 | def __init__(self): 39 | self.data=dict() 40 | self.s=dict() 41 | 42 | def add_model(self,model): 43 | for k,v in model.items(): 44 | if v==0 : continue 45 | if k not in self.data : 46 | self.data[k]=0 47 | self.s[k]=0 48 | self.data[k]=(self.data[k]*self.s[k]+v)/(self.s[k]+1) 49 | self.s[k]+=1 50 | 51 | def __call__(self,keys): 52 | return float(sum(self.data.get(k,0) for k in keys)) 53 | 54 | def update_weights(self,keys,delta,step): 55 | for f in keys : 56 | if f not in self.data : 57 | self.data[f]=0 58 | self.s[f]=0 59 | self.data[f]+=delta 60 | self.s[f]+=delta*(step) 61 | 62 | def average_weights(self,step): 63 | self._backup=dict(self.data) 64 | for k,v in self._backup.items(): 65 | self.data[k]=self.data[k]-self.s[k]/step 66 | 67 | def un_average_weights(self): 68 | self.data.clear() 69 | self.data.update(self._backup) 70 | self._backup.clear() 71 | -------------------------------------------------------------------------------- /isan/data/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | def test(): 4 | print("this is a test function") 5 | -------------------------------------------------------------------------------- /isan/data/lattice.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | item= begin,end,* 5 | weights = [] 6 | gold= 7 | 8 | eval 9 | 10 | 11 | [item] 12 | 13 | """ 14 | class Lattice : 15 | def __init__(self,l,w): 16 | self.weights=w 17 | self.items=l 18 | chars={} 19 | begins={} 20 | for i,item in enumerate(self.items) : 21 | begin=item[0] 22 | for j,c in enumerate(item[2]): 23 | o=j+begin 24 | if o not in chars: chars[o]=c 25 | if begin not in begins : begins[begin]=[] 26 | begins[begin].append(i) 27 | self.begins=begins 28 | self.sentence=''.join(x[1] for x in sorted(list(chars.items()))) 29 | self.length=len(self.sentence) 30 | def __str__(self): 31 | items=' '.join('_'.join(map(str,item)) for item in self.items) 32 | s='lattice of: %s\nitems: %s'%(self.sentence,items) 33 | return s 34 | 35 | """ 36 | [{'start': ,'end':,'key','info','gold'}] 37 | """ 38 | class Data : 39 | @staticmethod 40 | def to_train(data): 41 | train=[] 42 | for item in data: 43 | train.append(item['key']) 44 | lattice=Lattice(train,None) 45 | return lattice 46 | pass 47 | 48 | -------------------------------------------------------------------------------- /isan/parsing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangkaixu/isan/6dd4a2d7c16158e9d5e559aa79d1e9b9ace2b6de/isan/parsing/__init__.py -------------------------------------------------------------------------------- 
/isan/parsing/char_dep.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | import time 4 | import isan.parsing.dep_codec as codec 5 | import isan.parsing.eval as eval 6 | class Dep: 7 | shift_action=ord('s') 8 | left_reduce=ord('l') 9 | right_reduce=ord('r') 10 | #init=(0,(0,0),(None,None,None)) 11 | def init(self): 12 | pass 13 | init_stat=pickle.dumps((0,(0,0),(None,None,None))) 14 | class Eval: 15 | def __init__(self): 16 | self.start_time=time.time() 17 | def __call__(self,a,b,**_): 18 | raw=''.join(a) 19 | 20 | mer=[i for i in range(len(raw))] 21 | for i,r in enumerate(b): 22 | if r!=-1 : 23 | if abs(i-r)==1 : 24 | mer[i]=r 25 | 26 | lx=-1 27 | sen=[] 28 | for x,c in zip(mer,raw): 29 | if x!=lx: 30 | sen.append('') 31 | lx=x 32 | sen[-1]+=c 33 | print(' '.join(sen)) 34 | 35 | 36 | pass 37 | def print_result(self): 38 | duration=time.time()-self.start_time 39 | print("历时:%.2f 现时:%s"%( 40 | duration, 41 | time.strftime("%H:%M:%S"))) 42 | pass 43 | pass 44 | class codec: 45 | def decode(line): 46 | data=json.loads(line.strip()) 47 | return data 48 | def shift(self,stat): 49 | stat=pickle.loads(stat) 50 | raw=self.raw 51 | ind,span,stack_top=stat 52 | l,r=span 53 | if self.intervals: 54 | if self.intervals[l][1]!=-1 and self.intervals[l][1]<=r: return [] 55 | 56 | if ind>=len(raw): return [] 57 | rtn= [ 58 | (self.shift_action, 59 | pickle.dumps( 60 | (ind+1, 61 | (ind,ind+1), 62 | (raw[ind][0], 63 | stack_top[0], 64 | stack_top[1]) 65 | ))) 66 | ] 67 | return rtn 68 | 69 | def reduce(self,stat,predictor): 70 | stat=pickle.loads(stat) 71 | ind,span,stack_top=stat 72 | predictor=pickle.loads(predictor) 73 | _,p_span,_=predictor 74 | s0,s1,s2=stack_top 75 | if s0==None or s1==None:return [] 76 | l,r=p_span[0],span[1] 77 | if self.intervals: 78 | if self.intervals[l][1]!=-1 and self.intervals[l][1]l: return [] 80 | s01=s1+s0 81 | rtn= [ 82 | (self.left_reduce,pickle.dumps((ind, 83 | (p_span[0],span[1]), 84 | (s01 if len(s01)<=2 else s1,predictor[2][1],predictor[2][2])))), 85 | (self.right_reduce,pickle.dumps((ind, 86 | (p_span[0],span[1]), 87 | (s01 if len(s01)<=2 else s0,predictor[2][1],predictor[2][2])))), 88 | ] 89 | return rtn 90 | def actions_to_stats(self,actions): 91 | sn=sum(1 if a==self.shift_action else 0 for a in actions) 92 | assert(sn*2-1==len(actions)) 93 | stat=None 94 | stack=[] 95 | ind=0 96 | for action in actions: 97 | stat=(ind,(0,0)if not stack else (stack[-1][4],stack[-1][5]), 98 | ( 99 | stack[-1][0] if len(stack)>0 else None, 100 | stack[-2][0] if len(stack)>1 else None, 101 | stack[-3][0] if len(stack)>2 else None, 102 | )) 103 | yield pickle.dumps(stat) 104 | if action==self.shift_action: 105 | stack.append([self.raw[ind],self.raw[ind],None,None,ind,ind+1]) 106 | ind+=1 107 | else: 108 | s01=stack[-1][0]+stack[-2][0] 109 | if action==self.left_reduce: 110 | if len(s01)<=2 : 111 | stack[-2][0]=s01 112 | stack[-2][3]=stack[-1][1] 113 | stack[-2][5]=stack[-1][5] 114 | stack.pop() 115 | if action==self.right_reduce: 116 | stack[-1][2]=stack[-2][1] 117 | stack[-1][4]=stack[-2][4] 118 | stack[-2]=stack[-1] 119 | if len(s01)<=2 : 120 | stack[-2][0]=s01 121 | stack.pop() 122 | def set_raw(self,raw,Y): 123 | """ 124 | 对需要处理的句子做必要的预处理(如缓存特征) 125 | """ 126 | if Y: 127 | self.intervals=Y[1] 128 | else: 129 | self.intervals=None 130 | self.raw=raw 131 | self.f_raw=[w.encode() for w in self.raw] 132 | def gen_features(self,stat): 133 | stat=pickle.loads(stat) 134 | ind,_,stack_top=stat 135 | s0,s1,s2=stack_top 
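# feature atoms: s0/s1/s2 are short strings representing the top three stack
# items, q0/q1 the next two characters of the raw sentence, and c0 the
# character just before the current position; each is byte-encoded before
# being concatenated into the feature strings below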
136 | 137 | q0=self.f_raw[ind] if ind=0 else b'' 140 | 141 | s0=(s0.encode() if s0 else b'') 142 | s1=(s1.encode() if s1 else b'') 143 | s2=(s2.encode() if s2 else b'') 144 | 145 | 146 | 147 | fv=[ 148 | b'0'+s0, 149 | b'1'+s1, 150 | b'2'+s2, 151 | b'3'+s1+b' '+s2, 152 | b'4'+s0+b' '+q0, 153 | b'5'+q0+b' '+q1, 154 | b'6'+q0, 155 | b'6'+c0, 156 | ] 157 | return fv 158 | def actions_to_result(self,actions,raw): 159 | ind=0 160 | stack=[] 161 | arcs=[] 162 | for a in actions: 163 | if a==self.shift_action: 164 | stack.append(ind) 165 | ind+=1 166 | elif a==self.left_reduce: 167 | arcs.append((stack[-1],stack[-2])) 168 | stack.pop() 169 | elif a==self.right_reduce: 170 | arcs.append((stack[-2],stack[-1])) 171 | stack[-2]=stack[-1] 172 | stack.pop() 173 | arcs.append((stack[-1],-1)) 174 | arcs.sort() 175 | arcs=[x for _,x in arcs] 176 | return arcs 177 | 178 | sen=[] 179 | cache='' 180 | for c,a in zip(raw,actions[1:]): 181 | cache+=c 182 | if a==self.shift_action: 183 | sen.append(cache) 184 | cache='' 185 | if cache: 186 | sen.append(cache) 187 | return sen 188 | def result_to_actions(self,result): 189 | """ 190 | 将依存树转化为shift-reduce的动作序列(与动态规划用的状态空间无关) 191 | 在一对多中选择了一个(没搞清楚相关工作怎么弄的) 192 | """ 193 | stack=[] 194 | actions=[] 195 | result=[ind for _,_,ind,_ in result] 196 | record=[[ind,head,0] for ind,head in enumerate(result)] 197 | for ind,head,_ in record: 198 | if head!=-1 : 199 | record[head][2]+=1 200 | for ind,head in enumerate(result): 201 | actions.append(self.shift_action) 202 | stack.append([ind,result[ind],record[ind][2]]) 203 | while len(stack)>=2: 204 | if stack[-1][2]==0 and stack[-1][1]!=-1 and stack[-1][1]==stack[-2][0]: 205 | actions.append(self.left_reduce) 206 | stack.pop() 207 | stack[-1][2]-=1 208 | elif stack[-2][1]!=-1 and stack[-2][1]==stack[-1][0]: 209 | actions.append(self.right_reduce) 210 | stack[-2]=stack[-1] 211 | stack.pop() 212 | stack[-1][2]-=1 213 | else: 214 | break 215 | assert(len(actions)==2*len(result)-1) 216 | return actions 217 | -------------------------------------------------------------------------------- /isan/parsing/codec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | 4 | class Prefix: 5 | def __init__(self): 6 | self.content='' 7 | def __call__(self,arg=''): 8 | tmp=self.content 9 | self.content=arg 10 | return tmp 11 | 12 | 13 | def act_pop(stack): 14 | if len(stack)>1: 15 | sub=stack.pop() 16 | stack[-1].append(sub) 17 | 18 | def normalize(tree,tag='s'): 19 | if type(tree)==str: 20 | assert(1==2) 21 | tree[0][1]=tag 22 | 23 | structure=[] 24 | for sub in tree[1:]: 25 | if type(sub)==str: 26 | 27 | if tree[0][0]=='m' and len(tree[1:])>1: 28 | structure.append([['f','-fix'],[sub,'f']]) 29 | else: 30 | structure.append([[tree[0][0],'head'],[sub,tree[0][0]]]) 31 | else: 32 | t= 'm' if not sub[0][1] else sub[0][1] 33 | #if t=='m' and tag=='m': t='f' 34 | structure.append(normalize(sub,t)) 35 | print(tree[0]) 36 | for struct in structure: 37 | print(' ',struct) 38 | print("") 39 | return [tree[0],structure] 40 | 41 | def decode(line=None): 42 | #line='[(少将)称([(中方){[{完全}有(实力)]}{在(((黄岩)岛)[对峙])中}奉陪{到底}])]' 43 | #line='[{((实际)上)}({(中国)的}((国防)科技)部门){一直}{高度}关注({((博弈)论)的}(军事)应用)]' 44 | #line='[{曾经}纠结([{是否}让((搜狗)(输入)法)支持((火星)文)])]' 45 | line='''[[只要]a({[这样]的}大师)[在((我们)旁边)]出现]''' 46 | stack=[] 47 | 48 | prefix=Prefix() 49 | 50 | actions={ 51 | '[':lambda stack: stack.append([['v',prefix('')]]), 52 | ']':lambda stack: act_pop(stack), 53 | '(':lambda stack: stack.append([['n',prefix('')]]), 
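# each opening bracket pushes a new constituent with a one-letter category
# ('[' -> 'v', '(' -> 'n', '{' -> 'm', '<' -> 'c'), the matching closing
# bracket pops it into its parent, and a bare 'a' is remembered and attached
# to the next constituent that is opened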
54 | ')':lambda stack: act_pop(stack), 55 | '{':lambda stack: stack.append([['m',prefix('')]]), 56 | '}':lambda stack: act_pop(stack), 57 | '<':lambda stack: stack.append([['c',prefix('')]]), 58 | '>':lambda stack: act_pop(stack), 59 | 'a':lambda stack: prefix('a'), 60 | } 61 | for token in line: 62 | if token in actions: 63 | actions[token](stack) 64 | else: 65 | if len(stack[-1])<2 or type(stack[-1][-1])!=str: 66 | stack[-1].append('') 67 | stack[-1][-1]+=token 68 | 69 | stack[0]=normalize(stack[0]) 70 | return stack[0] 71 | if __name__=="__main__": 72 | tree=decode() 73 | print(tree) 74 | -------------------------------------------------------------------------------- /isan/parsing/default_dep2.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import json 3 | #import marshal as pickle 4 | import isan.parsing.eval as eval 5 | from isan.data.lattice import Lattice as Lattice 6 | from isan.common.lattice import Lattice_Task as Base_Task 7 | 8 | from isan.parsing.seq_dep import Action as Base_Action 9 | from isan.parsing.seq_dep import State as Base_State 10 | from isan.parsing.seq_dep import codec as base_codec 11 | from isan.parsing.seq_dep import Dep as Base_Dep 12 | 13 | 14 | class codec (base_codec): 15 | class Json_Lattice_Data : 16 | def __init__(self,line): 17 | self.lattice=json.loads(line) 18 | self.lattice=[[k,v] for k,v in self.lattice if 'dep' in v] 19 | def make_raw(self): 20 | lat=self.lattice 21 | raw=[] 22 | for i in range(len(lat)): 23 | k,v =lat[i] 24 | k=tuple(k) 25 | lat[i][0]=k 26 | #if not ('is_test' in v and v['is_test']) : 27 | if True: 28 | raw.append([k,v.get('tag-weight',None)]) 29 | if 'dep' in v and v['dep'][1]!=None : 30 | v['dep'][1]=tuple(v['dep'][1]) 31 | l,w=zip(*raw) 32 | lattice=Lattice(l,w) 33 | return lattice 34 | 35 | def make_gold(self): 36 | lat=self.lattice 37 | gold=[] 38 | for k,v in lat : 39 | if 'tag-weight' in v : del v['tag-weight'] 40 | if not v : v=None 41 | else : 42 | v=[v['dep'][1]] 43 | gold.append([k,v]) 44 | return gold 45 | 46 | @staticmethod 47 | def arcs_to_result(arcs,lattice): 48 | return arcs 49 | @staticmethod 50 | def result_to_arcs(result): 51 | result=[ind for _,_,ind,_ in result] 52 | return result 53 | @staticmethod 54 | def encode(raw,result): 55 | return ' '.join(['_'.join([item[0],item[1],str(head)]) for item,head in zip(raw,result)]) 56 | 57 | @staticmethod 58 | def decode(line): 59 | data=codec.Json_Lattice_Data(line) 60 | lattice=data.make_raw() 61 | lat=data.make_gold() 62 | #raw=[(w,t)for b,e,w,t in lattice.items] 63 | raw=lattice 64 | inds={} 65 | for i,it in enumerate(lat): 66 | inds[it[0]]=i 67 | lat=[tuple([word[2],word[3]]+([inds[head[0]],'DEP'] if head[0] else [-1,'ROOT'])) 68 | for word,head in lat] 69 | return {'raw':raw,'y':lat} 70 | 71 | @staticmethod 72 | def to_raw(line): 73 | return [(w,t)for w,t,*_ in line] 74 | 75 | class Action (Base_Action): 76 | @staticmethod 77 | def actions_to_arcs(actions): 78 | ind=0 79 | stack=[] 80 | arcs=[] 81 | for a in actions: 82 | is_shift,*rest=Action.parse_action(a) 83 | if is_shift : 84 | sind=rest[0] 85 | stack.append(sind) 86 | ind+=1 87 | elif a==Action.left_reduce: 88 | arcs.append((stack[-1],stack[-2])) 89 | stack.pop() 90 | elif a==Action.right_reduce: 91 | arcs.append((stack[-2],stack[-1])) 92 | stack[-2]=stack[-1] 93 | stack.pop() 94 | arcs.append((stack[-1],-1)) 95 | arcs.sort() 96 | arcs=[x for _,x in arcs] 97 | return arcs 98 | @staticmethod 99 | def arcs_to_actions(arcs): 100 | result=arcs 101 | 
stack=[] 102 | actions=[] 103 | record=[[ind,head,0] for ind,head in enumerate(result)]# [ind, ind_of_head, 是head的次数] 104 | for ind,head,_ in record: 105 | if head!=-1 : 106 | record[head][2]+=1 107 | for ind,head in enumerate(result): 108 | #actions.append(self.shift_action) 109 | actions.append(Action.shift_action(ind)) 110 | stack.append([ind,result[ind],record[ind][2]]) 111 | while len(stack)>=2: 112 | if stack[-1][2]==0 and stack[-1][1]!=-1 and stack[-1][1]==stack[-2][0]: 113 | actions.append(Action.left_reduce) 114 | stack.pop() 115 | stack[-1][2]-=1 116 | elif stack[-2][1]!=-1 and stack[-2][1]==stack[-1][0]: 117 | actions.append(Action.right_reduce) 118 | stack[-2]=stack[-1] 119 | stack.pop() 120 | stack[-1][2]-=1 121 | else: 122 | break 123 | 124 | return actions 125 | 126 | class State(Action,Base_State) : 127 | init_stat=pickle.dumps((0,(0,0),(None,None,None))) 128 | shift_cost=1 129 | reduce_cost=1 130 | def __init__(self,bt,lattice): 131 | self.lattice=lattice 132 | state=pickle.loads(bt) 133 | self.ind,self.span,self.stack_top=state 134 | #self.stop_step=2*len(self.lattice.items)-1 135 | #self.stop_step=2*self.lattice.length-1 136 | self.stop_step=self.lattice.length*State.shift_cost+(self.lattice.length-1)*State.reduce_cost 137 | 138 | def shift(self,shift_ind): 139 | item=self.lattice.items[shift_ind] 140 | #next_ind=self.ind+1 141 | next_ind=self.ind+2*len(item[2])-1 142 | #next_ind=self.ind+len(item[2])*State.shift_cost+(len(item[2])-1)*State.reduce_cost 143 | #if next_ind==self.stop_step : next_ind=-1 144 | if item[1]==self.lattice.length and self.stack_top[0]==None : next_ind=-1 145 | 146 | state=( 147 | next_ind, 148 | (item[0],item[1]), 149 | ((item[2],item[3],None,None), 150 | self.stack_top[0], 151 | self.stack_top[1][1] if self.stack_top[1] else None) 152 | ) 153 | return [(self.shift_action(shift_ind),next_ind,pickle.dumps(state))] 154 | pass 155 | def reduce(self,pre_state,alpha_ind): 156 | next_ind=self.ind+1 157 | #next_ind=self.ind+State.reduce_cost 158 | if self.span[1]==self.lattice.length and pre_state.stack_top[1]==None : next_ind=-1 159 | 160 | s0,s1,s2=self.stack_top 161 | 162 | if s0==None or s1==None: return [] 163 | 164 | reduce_state1=( 165 | next_ind, 166 | (pre_state.span[0],self.span[1]), 167 | ((s1[0],s1[1],s1[2],s0[1]),pre_state.stack_top[1],pre_state.stack_top[2])) 168 | 169 | reduce_state2=(next_ind, 170 | (pre_state.span[0],self.span[1]), 171 | ((s0[0],s0[1],s1[1],s0[3]),pre_state.stack_top[1],pre_state.stack_top[2])) 172 | 173 | reduce_state1=pickle.dumps(reduce_state1) 174 | reduce_state2=pickle.dumps(reduce_state2) 175 | return [ 176 | (self.left_reduce,next_ind,reduce_state1,alpha_ind), 177 | (self.right_reduce,next_ind,reduce_state2,alpha_ind), 178 | ] 179 | 180 | class Dep (Base_Dep): 181 | pass 182 | name="依存句法分析" 183 | Action=Action 184 | State=State 185 | 186 | Eval=eval.Eval 187 | codec=codec 188 | 189 | 190 | def gen_features(self,span,actions): 191 | fvs=[] 192 | fv=self.gen_features_one(span) 193 | for action in actions: 194 | is_shift,*_=self.Action.parse_action(action) 195 | if is_shift : 196 | action='s'.encode() 197 | else : 198 | action=chr(action).encode() 199 | fvs.append([action+x for x in fv]) 200 | return fvs 201 | 202 | def gen_features_one(self,stat): 203 | stat=State.load(stat) 204 | _,span,stack_top=stat 205 | s0,s1,s2_t=stack_top 206 | 207 | s2_t=b'~' if s2_t is None else s2_t.encode() 208 | 209 | if s0: 210 | s0_w,s0_t,s0l_t,s0r_t=s0 211 | s0l_t=b'~' if s0l_t is None else s0l_t.encode() 212 | s0r_t=b'~' if s0r_t is 
None else s0r_t.encode() 213 | s0_w=s0_w.encode() 214 | s0_t=s0_t.encode() 215 | else: 216 | s0_w,s0_t,s0l_t,s0r_t=b'~',b'~',b'~',b'~' 217 | 218 | if s1: 219 | s1_w,s1_t,s1l_t,s1r_t=s1 220 | s1l_t=b'~' if s1l_t is None else s1l_t.encode() 221 | s1r_t=b'~' if s1r_t is None else s1r_t.encode() 222 | s1_w=s1_w.encode() 223 | s1_t=s1_t.encode() 224 | else: 225 | s1_w,s1_t,s1l_t,s1r_t=b'~',b'~',b'~',b'~' 226 | 227 | ind=self.lattice.begins.get(span[1],[len(self.f_raw)])[0] 228 | q0_w,q0_t=self.f_raw[ind] if ind=2: 27 | if stack[-1][2]==0 and stack[-1][1]==stack[-2][0] : 28 | stack.pop() 29 | stack[-1][2]-=1 30 | #actions.append(Action.left_reduce) 31 | elif stack[-2][1] == stack[-1][0] : 32 | stack[-2]=stack[-1] 33 | stack.pop() 34 | stack[-1][2]-=1 35 | #actions.append(Action.right_reduce) 36 | else : 37 | break 38 | return actions 39 | 40 | class State (Action,Base_State): 41 | """ 42 | """ 43 | 44 | def __init__(self,bt,lattice): 45 | self.lattice=lattice 46 | state=pickle.loads(bt) 47 | self.ind,self.span,self.stack_top,self.sequence=state 48 | #self.stop_step=2*self.lattice.length-1 49 | self.stop_step=2*self.lattice.length 50 | def shift(self,shift_ind): 51 | item=self.lattice.items[shift_ind] 52 | #next_ind=self.ind+2*len(item[2])-1 53 | next_ind=self.ind+2*len(item[2]) 54 | if next_ind==self.stop_step : next_ind=-1 55 | state=( 56 | next_ind, 57 | (item[0],item[1]), 58 | ( 59 | (shift_ind,None,None), 60 | self.stack_top[0], 61 | self.lattice.items[self.stack_top[1][0]][3] if self.stack_top[1] else None 62 | ), 63 | (shift_ind,self.sequence[0]), 64 | ) 65 | return [(self.shift_action(shift_ind),next_ind,pickle.dumps(state))] 66 | 67 | 68 | class Dep (Base_Dep): 69 | name="依存句法分析" 70 | State=State 71 | Action=Action 72 | Eval=eval.Eval 73 | codec=codec 74 | reduce=None 75 | 76 | 77 | def set_raw(self,raw,_): 78 | """ 79 | 对需要处理的句子做必要的预处理(如缓存特征) 80 | """ 81 | self.lattice=raw 82 | self.cb_fvs=[] 83 | for i,item in enumerate(self.lattice.items): 84 | fv=[] 85 | 86 | for j,c in enumerate(item[2]): 87 | 88 | o=item[0]+j 89 | if item[0]+1==item[1]: 90 | pos=b's' 91 | elif o == item[0] : 92 | pos=b'b' 93 | elif o==item[1]-1 : 94 | pos=b'e' 95 | else : 96 | pos=b'm' 97 | l2=self.lattice.sentence[o-2] if o-2>=0 else '#' 98 | l1=self.lattice.sentence[o-1] if o-1>=0 else '#' 99 | r1=self.lattice.sentence[o+1] if o+1=0 else '#' 83 | l1=self.lattice.sentence[o-1] if o-1>=0 else '#' 84 | r1=self.lattice.sentence[o+1] if o+1 =BackwardsSnippet() 5 | imap IMAP_JumpBack =IMAP_Jumpfunc('b', 0) 6 | imap IMAP_JumpForward =IMAP_Jumpfunc('', 0) 7 | snoremap  i=TriggerSnippet() 8 | vmap IMAP_JumpForward 9 | nmap IMAP_JumpForward 10 | snoremap  b 11 | snoremap % b% 12 | snoremap ' b' 13 | nmap ;ihn :IHN 14 | nmap ;is :IHS :A 15 | nmap ;ih :IHS 16 | map ;fl :NERDTree 17 | map ;tl :TlistToggle 18 | snoremap U bU 19 | snoremap \ b\ 20 | snoremap ^ b^ 21 | snoremap ` b` 22 | nmap gx NetrwBrowseX 23 | snoremap bi 24 | snoremap a 25 | snoremap b 26 | snoremap i=BackwardsSnippet() 27 | nnoremap NetrwBrowseX :call netrw#NetrwBrowseX(expand(""),0) 28 | vmap IMAP_JumpBack ` IMAP_JumpForward i=IMAP_Jumpfunc('', 0) 30 | vmap IMAP_DeleteAndJumpBack "_i=IMAP_Jumpfunc('b', 0) 31 | vmap IMAP_DeleteAndJumpForward "_i=IMAP_Jumpfunc('', 0) 32 | nmap IMAP_JumpBack i=IMAP_Jumpfunc('b', 0) 33 | nmap IMAP_JumpForward i=IMAP_Jumpfunc('', 0) 34 | map :!ctags -R --c++-kinds=+p --fields=+iaS --extra=+q --languages=c++ . 
35 | inoremap  =TriggerSnippet() 36 | imap IMAP_JumpForward 37 | inoremap  =ShowAvailableSnips() 38 | inoremap  omni#cpp#maycomplete#Complete() 39 | inoremap . omni#cpp#maycomplete#Dot() 40 | inoremap : omni#cpp#maycomplete#Scope() 41 | imap ;ihn :IHN 42 | imap ;is :IHS :A 43 | imap ;ih :IHS 44 | inoremap > omni#cpp#maycomplete#Arrow() 45 | let &cpo=s:cpo_save 46 | unlet s:cpo_save 47 | set autoindent 48 | set background=dark 49 | set backspace=indent,eol,start 50 | set completeopt=preview,menuone 51 | set expandtab 52 | set fileencodings=ucs-bom,utf-8,default,latin1 53 | set grepprg=grep\ -nH\ $* 54 | set helplang=cn 55 | set hlsearch 56 | set incsearch 57 | set iskeyword=@,48-57,_,192-255,: 58 | set langmenu=zh_CN.UTF-8 59 | set omnifunc=omni#cpp#complete#Main 60 | set printoptions=paper:a4 61 | set ruler 62 | set runtimepath=~/.vim,/var/lib/vim/addons,/usr/share/vim/vimfiles,/usr/share/vim/vim73,/usr/share/vim/vimfiles/after,/var/lib/vim/addons/after,~/.vim/after 63 | set shiftwidth=4 64 | set smarttab 65 | set suffixes=.bak,~,.swp,.o,.info,.aux,.log,.dvi,.bbl,.blg,.brf,.cb,.ind,.idx,.ilg,.inx,.out,.toc 66 | set tabstop=4 67 | set wildignore=*.pyc 68 | " vim: set ft=vim : 69 | -------------------------------------------------------------------------------- /isan/tagging/PA_segger.py: -------------------------------------------------------------------------------- 1 | from struct import Struct 2 | import isan.tagging.eval as tagging_eval 3 | import isan.tagging.cwstask as cwstask 4 | import isan.tagging.cws as cws 5 | import json 6 | import random 7 | 8 | """ 9 | """ 10 | class Segger(cws.Task): 11 | """告诉isan,这是个什么task""" 12 | name='局部标注中文分词' 13 | 14 | class codec(cws.Task.codec): 15 | """ 16 | 任务的输入和输出是什么,如何从数据文件中获得 17 | """ 18 | @staticmethod 19 | def decode(line): 20 | """ 21 | 编码、解码 22 | 从一行文本中,得到输入(raw)和输出(y) 23 | """ 24 | line=line.strip() 25 | if not line: return [] 26 | if line[0]=='{': 27 | data=json.loads(line) 28 | #if data['Y_b'][1] : return [] 29 | return data 30 | seq=[word for word in line.split()] 31 | raw=''.join(seq) 32 | 33 | return {'raw':raw, 34 | 'y':seq, 35 | 'Y_a' : None, 36 | 'Y_b' : None, 37 | } 38 | 39 | def set_oracle(self,raw,y,Y) : 40 | self.early_stop=None 41 | std_moves=self.result_to_moves(y)#得到标准动作 42 | self.std_moves=std_moves 43 | return std_moves 44 | 45 | def is_belong(self,raw,moves,Y): 46 | 47 | if not Y : 48 | return self.check(self.std_moves,moves) 49 | 50 | return True 51 | seq,intervals=Y 52 | 53 | if intervals : 54 | offset=0 55 | y=self.moves_to_result(moves,raw) 56 | for w in y: 57 | r=intervals[offset][1] 58 | if r!=-1 and offset+len(w)>r : 59 | #print(y) 60 | return False 61 | l=intervals[offset+len(w)][0] 62 | if l!=-1 and l>offset : return False 63 | offset+=len(w) 64 | return True 65 | if seq: 66 | actions=[x[2] for x in moves] 67 | for a,s in zip(actions,seq): 68 | if s and ((s=='s' and a!=self.sep) or (s=='c' and a!=self.com)) : 69 | return False 70 | return True 71 | 72 | def shift(self,last_ind,stat): 73 | """ 74 | 根据当前状态,能产生什么动作,并且后续的状态是什么,就由这个函数决定了 75 | """ 76 | 77 | ind,last,_,wordl,lwordl=self.stat_fmt.unpack(stat) 78 | next_ind=last_ind+1 if last_ind+1 <= len(self.raw) else -1 79 | if self.actions and self.actions[ind]: 80 | if self.actions[ind]=='s': 81 | return [(self.sep,next_ind,self.stat_fmt.pack(ind+1,b'1',last,1,wordl))] 82 | else : 83 | return [(self.com,next_ind,self.stat_fmt.pack(ind+1,b'2',last,wordl+1,lwordl))] 84 | if self.intervals : 85 | rtn=[] 86 | ll,lr=self.intervals[ind-wordl] 87 | rl,rr=self.intervals[ind] 
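# partial-annotation constraints (a reading of the checks below): intervals[i]
# holds (left, right) bounds on the word covering position i, with -1 meaning
# unconstrained; when the bounds already force the decision at this position,
# only that single action (sep or com) is returned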
88 | if lr!=-1 and lr<=ind : 89 | return [(self.sep,next_ind,self.stat_fmt.pack(ind+1,b'1',last,1,wordl))] 90 | if rl!=-1 and ind-wordl=0 : emissions[i-2]+=v[0] 96 | if i-1 >=0 and i-1 < l: emissions[i-1]+=v[1] 97 | if i< l: emissions[i]+=v[2] 98 | 99 | for i,k in enumerate(self.bi) : 100 | # : ## #x xx xx x# ## 101 | # : x x x 102 | if k not in self.bi_d : continue 103 | v=self.bi_d[k] 104 | if i-3 >=0 : emissions[i-3]+=v[0] 105 | if i-2 >=0 and i-2 < l: emissions[i-2]+=v[1] 106 | if i-1 >=0 and i-1 < l: emissions[i-1]+=v[2] 107 | if i< l: emissions[i]+=v[3] 108 | 109 | 110 | def cal_delta(self,std_tags,rst_tags,delta): 111 | l=len(self.raw) 112 | dv=[self._new_vec() for i in range(len(std_tags))] 113 | for i in range(len(std_tags)) : 114 | dv[i][std_tags[i]]+=1 115 | dv[i][rst_tags[i]]-=1 116 | for i,k in enumerate(self.uni) : 117 | if chr(0) in k : continue # used for dropout 118 | if k not in self.uni_d : 119 | self.uni_d[k]=[self.paras.add(self._new_vec()) for v in range(3)] 120 | v_para=self.uni_d[k] 121 | if i-2 >=0 : 122 | v_para[0].add_delta(dv[i-2]) 123 | if i-1 >=0 and i-1=0 : 135 | v_para[0].add_delta(dv[i-3]) 136 | if i-2 >=0 and i-2=0 and i-1"+t+"") 36 | continue 37 | html.append(""+w+"_"+t+"") 38 | print(' '.join(html),"
",file=self.html) 39 | html=[] 40 | for b,w,t in sorted(std): 41 | if (b,w,t) in rst: 42 | continue 43 | if (b,w) in seg_rst: 44 | html.append(w+"_"+t+"") 45 | continue 46 | html.append(""+w+"_"+t+"") 47 | print(' '.join(html),"

",file=self.html) 48 | 49 | """ 50 | tagging_eval std rst 51 | """ 52 | 53 | def str_to_list_old(string): 54 | offset=0 55 | li=[] 56 | #print(string) 57 | for word, tag in [x.split('_') for x in string.split()]: 58 | li.append((offset,offset+len(word),tag)) 59 | offset+=len(word) 60 | return li 61 | 62 | def str_to_list(string): 63 | offset=0 64 | li=[] 65 | #print(string) 66 | for word, tag in [x.split('_') for x in string.split()]: 67 | li.append((offset,(word),tag)) 68 | offset+=len(word) 69 | return li 70 | 71 | class CrossBoundaryErrors(object): 72 | def __init__(self): 73 | self.value=0 74 | def __call__(self,std,rst): 75 | max_ind=max(e for b,e,t in std) 76 | boundaries=[0 for i in range(max_ind+1)] 77 | for b,e,t in std: 78 | boundaries[b]=1 79 | boundaries[e]=1 80 | for b,e,t in rst: 81 | if boundaries[b]==1 and boundaries[e]==1: 82 | continue 83 | if any(boundaries[i]==1 for i in range(b+1,e)): 84 | self.value+=1 85 | 86 | 87 | class TaggingEval: 88 | """ 89 | 分词词性标注评测类 90 | """ 91 | def get_prf(self,seg=False): 92 | """ 93 | 得到评测的结果,准确度、精确度和F1 94 | """ 95 | cor=self.cor if seg==False else self.seg_cor 96 | p=cor/self.rst if self.rst else 0 97 | r=cor/self.std if self.std else 0 98 | f=2*p*r/(r+p) if (r+p) else 0 99 | return p,r,f 100 | def __init__(self,plugins=[],sep='_'): 101 | """ 102 | 初始化 103 | """ 104 | self.otime=time.time() 105 | self.plugins=plugins 106 | self.std,self.rst=0,0 107 | self.cor,self.seg_cor=0,0 108 | self.sep=sep 109 | self.characters=0 110 | self.overlaps=0 111 | self.with_tags=False 112 | def print_result(self): 113 | """ 114 | 打印结果 115 | """ 116 | time_used=time.time()-self.otime 117 | speed=self.characters/time_used 118 | 119 | cor=self.cor 120 | p=cor/self.rst if self.rst else 0 121 | r=cor/self.std if self.std else 0 122 | f=2*p*r/(r+p) if (r+p) else 0 123 | 124 | if self.with_tags : 125 | seg_cor=self.seg_cor 126 | p=seg_cor/self.rst if self.rst else 0 127 | r=seg_cor/self.std if self.std else 0 128 | seg_f=2*p*r/(r+p) if (r+p) else 0 129 | 130 | if self.with_tags : 131 | line=("标准: %d 输出: %d seg正确: %d 正确: %d seg_f1: \033[32;01m%.4f\033[1;m tag_f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 132 | %(self.std,self.rst,self.seg_cor,self.cor,seg_f,f,self.overlaps,time_used,speed)) 133 | else : 134 | line=("标准: %d 输出: %d 正确: %d f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 135 | %(self.std,self.rst,self.cor,f,self.overlaps,time_used,speed)) 136 | print(line,file=sys.stderr) 137 | sys.stderr.flush() 138 | 139 | def get_scaler(self): 140 | 141 | if self.with_tags : 142 | cor=self.cor 143 | p=cor/self.rst if self.rst else 0 144 | r=cor/self.std if self.std else 0 145 | f=2*p*r/(r+p) if (r+p) else 0 146 | return f 147 | else : 148 | seg_cor=self.seg_cor 149 | p=seg_cor/self.rst if self.rst else 0 150 | r=seg_cor/self.std if self.std else 0 151 | seg_f=2*p*r/(r+p) if (r+p) else 0 152 | return seg_f 153 | 154 | def get_result(self): 155 | time_used=time.time()-self.otime 156 | speed=self.characters/time_used 157 | 158 | cor=self.cor 159 | p=cor/self.rst if self.rst else 0 160 | r=cor/self.std if self.std else 0 161 | f=2*p*r/(r+p) if (r+p) else 0 162 | 163 | if self.with_tags : 164 | seg_cor=self.seg_cor 165 | p=seg_cor/self.rst if self.rst else 0 166 | r=seg_cor/self.std if self.std else 0 167 | seg_f=2*p*r/(r+p) if (r+p) else 0 168 | 169 | if self.with_tags : 170 | line=("标准: %d 输出: %d seg正确: %d 正确: %d seg_f1: \033[32;01m%.4f\033[1;m tag_f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 171 | 
%(self.std,self.rst,self.seg_cor,self.cor,seg_f,f,self.overlaps,time_used,speed)) 172 | else : 173 | line=("标准: %d 输出: %d 正确: %d f1: \033[32;01m%.4f\033[1;m ol: %d 时间: %.4f (%.0f字/秒)" 174 | %(self.std,self.rst,self.cor,f,self.overlaps,time_used,speed)) 175 | return line 176 | 177 | def _set_based(self,std,rst): 178 | self.std+=len(std) 179 | self.rst+=len(rst) 180 | self.cor+=len(std&rst) 181 | self.characters+=sum(len(w)for _,w,_ in std) 182 | self.seg_cor+=len({(b,e) for b,e,t in std}&{(b,e) for b,e,t in rst}) 183 | 184 | 185 | std=sorted(list(std)) 186 | rst=sorted(list(rst)) 187 | std_ind=0 188 | rst_ind=0 189 | while rst_ind < len(rst): 190 | b=rst[rst_ind][0] 191 | e=b+len(rst[rst_ind][1]) 192 | while std_ind < len(std) and std[std_ind][0]b and e >ee : # overlap bb b ee e 196 | self.overlaps+=1 197 | std_ind += 1 198 | while std_ind < len(std) and std[std_ind][0]b and e>bb and ee>e : # overlap b bb e ee 202 | self.overlaps+=1 203 | break 204 | std_ind += 1 205 | rst_ind+=1 206 | 207 | def _to_set(self,seq): 208 | s=set() 209 | if type(seq[0])!=str:#word with tag 210 | self.with_tags=True 211 | offset=0 212 | for word,tag in seq: 213 | s.add((offset,word,tag)) 214 | offset+=len(word) 215 | else:#only word 216 | offset=0 217 | for word in seq: 218 | s.add((offset,word,'')) 219 | offset+=len(word) 220 | return s 221 | def __call__(self,std,rst,raw=None): 222 | if not std:return 223 | if not rst:return 224 | self._set_based(self._to_set(std),self._to_set(rst)) 225 | for plugin in self.plugins: 226 | plugin(std,rst) 227 | 228 | def eval_files(self,std_file,rst_file,sep): 229 | for g,r in zip(open(std_file),open(rst_file)): 230 | gl=sum(len(x.partition(self.sep)[0])for x in g.split()) 231 | rl=sum(len(x.partition(self.sep)[0])for x in r.split()) 232 | if(gl!=rl): 233 | print("---") 234 | print(g.strip()) 235 | print(r.strip()) 236 | assert(gl==rl) 237 | g=g.split() 238 | r=r.split() 239 | #print(g) 240 | #print(r) 241 | if all(sep in x for x in g) and all(sep in x for x in r) : 242 | g=[x.split(sep) for x in g] 243 | r=[x.split(sep) for x in r] 244 | self(g,r) 245 | else : 246 | self(g,r) 247 | if __name__=="__main__": 248 | import argparse 249 | parser=argparse.ArgumentParser(description="用于分词词性标注的评测和比较") 250 | parser.add_argument('std',help='被比较的标注结果') 251 | parser.add_argument('rst',help='用以比较的标注结果',nargs='?',default='-') 252 | parser.add_argument('-s','--separator',help='词和词性间的分隔符',dest='sep',default='_') 253 | parser.add_argument('-d','--diff',help='指定以html格式输出的显示差异的文件的名字',dest='diff_file') 254 | args=parser.parse_args() 255 | 256 | plugins=[] 257 | if args.diff_file!=None: 258 | plugins.append(DiffToHTML(args.diff_file)) 259 | eval=TaggingEval(plugins,sep=args.sep) 260 | eval.eval_files(args.std,args.rst,args.sep) 261 | p,r,f=eval.get_prf() 262 | sp,sr,sf=eval.get_prf(True) 263 | print(eval.std,eval.rst,eval.cor,"%.4f|%.4f|%.4f"%(p,r,f),"%.4f|%.4f|%.4f"%(sp,sr,sf)) 264 | 265 | -------------------------------------------------------------------------------- /isan/tagging/ss.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import gzip 4 | import numpy as np 5 | import pickle 6 | import random 7 | 8 | from isan.common.parameters import Para_Dict 9 | 10 | class Word : 11 | def close(self): 12 | if self.use_hidden : 13 | if 'save' in self.use_hidden : 14 | fi=open(self.use_hidden['save'],'w') 15 | print(json.dumps(self.M.tolist()),file=fi) 16 | print(json.dumps(self.b.tolist()),file=fi) 17 | fi.close() 18 | pass 19 | 
pass 20 | def __init__(self,args={},model=None,paras=None): 21 | self.paras=paras 22 | 23 | #print(args) 24 | self.s={} ## ?? 25 | if model == None : 26 | words={} 27 | size=0 28 | for line in open(args['words']) : 29 | word,*vs = line.split() 30 | vs=list(map(float,vs)) 31 | size=len(vs) 32 | words[word]=np.array(vs) 33 | 34 | 35 | self.use_hidden=args.get('hidden',None) 36 | 37 | 38 | self.words=words 39 | self.zw=np.zeros(size) 40 | self.size=size 41 | 42 | self.d=self.paras.add({}) 43 | 44 | self.d_hidden=np.zeros(self.size) 45 | 46 | np.random.seed(0) 47 | 48 | 49 | self.M=np.random.uniform(-0.8,0.8,(self.size,self.size)) 50 | self.b=np.zeros(self.size) 51 | if self.use_hidden and 'load' in self.use_hidden : 52 | m,b=open(self.use_hidden['load']).read().splitlines() 53 | self.M=np.array(json.loads(m)) 54 | self.b=np.array(json.loads(b)) 55 | 56 | 57 | if self.use_hidden and 'update' in self.use_hidden : 58 | self.M=self.paras.add(self.M) 59 | self.b=self.paras.add(self.b) 60 | 61 | 62 | self.s={k:v.copy()for k,v in self.d.items()} 63 | 64 | else : 65 | for k,v in model.items(): 66 | setattr(self,k,v) 67 | 68 | def dump_weights(self): 69 | d={} 70 | self.d=self.d.output_obj() 71 | if not self.use_hidden : 72 | for k in ['use_hidden','size','d','words','zw']: 73 | d[k]=getattr(self,k) 74 | else : 75 | if 'update' in self.use_hidden : 76 | self.M=self.M.output_obj() 77 | self.b=self.b.output_obj() 78 | for k in ['use_hidden','size','d','words','zw','M','b']: 79 | d[k]=getattr(self,k) 80 | return d 81 | 82 | def add_model(self,model): 83 | for k,v in model.items(): 84 | if k not in ['d'] : 85 | setattr(self,k,v) 86 | else : 87 | getattr(self,k).add_model(v) 88 | 89 | 90 | def set_raw(self,atoms): 91 | self.atoms=atoms 92 | self.sen_word_vecs=[] 93 | self.sen_hidden_vecs=[] 94 | for w,*_ in atoms : 95 | wv=self.words.get(w,self.zw) 96 | self.sen_word_vecs.append(wv) 97 | if self.use_hidden : 98 | hidden=np.dot(self.M,wv)+self.b 99 | hidden=np.tanh(hidden) 100 | self.sen_hidden_vecs.append(hidden) 101 | else : 102 | self.sen_hidden_vecs.append(wv) 103 | 104 | 105 | def __call__(self,ind1,ind2,ind3,delta=0) : 106 | word2,t2,*_=self.atoms[ind2] # word on the top of the stack 107 | word3,t3,*_=self.atoms[ind3] # next word 108 | # get the vector 109 | w2=self.sen_hidden_vecs[ind2] 110 | w3=self.sen_hidden_vecs[ind3] 111 | 112 | wv2=self.sen_word_vecs[ind2] 113 | wv3=self.sen_word_vecs[ind3] 114 | 115 | score=0 116 | 117 | if delta ==0 : # cal the network, not update 118 | if t3 in self.d : 119 | score+=np.dot(w3,self.d([t3])) 120 | if t2!='~' : 121 | if t2 in self.d : 122 | score+=np.dot(w3,self.d(['l'+t2])) 123 | if t3 in self.d : 124 | score+=np.dot(w2,self.d(['r'+t3])) 125 | else : # calculate the grad 126 | self.d.add_delta([t3],w3*delta) 127 | 128 | if self.use_hidden and 'update' in self.use_hidden : 129 | d_hidden=self.d([t3])*(1-w3**2) 130 | dM=(d_hidden[:,np.newaxis]*wv3) 131 | self.b.add_delta(d_hidden*delta) 132 | self.M.add_delta(dM*delta) 133 | 134 | # grad of 135 | if t2!='~' : 136 | self.d.add_delta(['l'+t2],w3*delta) 137 | if self.use_hidden and 'update' in self.use_hidden : 138 | d_hidden=self.d(['l'+t2])*(1-w3**2) 139 | dM=(d_hidden[:,np.newaxis]*wv3) 140 | self.b.add_delta(d_hidden*delta) 141 | self.M.add_delta(dM*delta) 142 | 143 | self.d.add_delta(['r'+t3],w2*delta) 144 | if self.use_hidden and 'update' in self.use_hidden : 145 | d_hidden=self.d(['r'+t3])*(1-w2**2) 146 | dM=(d_hidden[:,np.newaxis]*wv2) 147 | self.b.add_delta(d_hidden*delta) 148 | self.M.add_delta(dM*delta) 
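                    # the hidden vector here is h=tanh(M·wv+b), so the three update branches
                    # above all push the gradient back through tanh: d_hidden = output_weights*(1-h**2),
                    # dM = outer(d_hidden, wv), each scaled by the perceptron update direction delta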
149 | return score 150 | -------------------------------------------------------------------------------- /isan/tagging/wb_tag.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import pickle 3 | import time 4 | import math 5 | import sys 6 | 7 | import numpy as np 8 | import gzip 9 | 10 | from isan.common.parameters import Para_Dict 11 | from isan.common.task import Lattice, Base_Task, Early_Stop_Pointwise 12 | from isan.tagging.eval import TaggingEval as Eval 13 | from isan.tagging.ss import Word as Word 14 | from isan.tagging.wb_tag_symbolic import Base_Features 15 | 16 | 17 | """ 18 | word-based tagging 19 | """ 20 | 21 | class codec : 22 | @staticmethod 23 | def decode(line): 24 | """ 25 | 编码、解码 26 | 从一行文本中,得到输入(raw)和输出(y) 27 | """ 28 | if not line: return [] 29 | log2=math.log(2) 30 | line=list(map(lambda x:x.split(','), line.split())) 31 | for i,it in enumerate(line): 32 | if len(it)!=6 : 33 | l=it[:3] 34 | r=it[-2:] 35 | m=it[3:-2] 36 | line[i]=l+[','.join(m)]+r 37 | 38 | line=[[int(label),int(b),int(e),w,t,float(conf)] for label,b,e,w,t,conf in line] 39 | items2=[] 40 | gold=[] 41 | for l,b,e,w,t,conf in line : 42 | if conf <= -1 : 43 | conf = None 44 | else : 45 | pass 46 | #conf = conf/1000 47 | items2.append((b,e,(w,t,conf))) 48 | if l ==1 : 49 | gold.append((w,t)) 50 | raw=Lattice(items2) 51 | return {'raw':raw, 52 | 'y':gold, } 53 | @staticmethod 54 | def encode(y): 55 | return ' '.join(y) 56 | 57 | class State (list): 58 | init_state=pickle.dumps((-1,-1)) 59 | 60 | decode=pickle.loads 61 | encode=pickle.dumps 62 | 63 | def __init__(self,lattice,bt=init_state): 64 | self.extend(pickle.loads(bt)) 65 | self.lattice=lattice 66 | 67 | def shift(self,showall=False): 68 | begin=0 if self[1]==-1 else self.lattice[self[1]][1] 69 | 70 | if begin not in self.lattice.begins : return [] 71 | 72 | b=self.lattice.begins[begin] 73 | 74 | return [[n,pickle.dumps((self[1],n))] 75 | for n in self.lattice.begins[begin] 76 | if (self.lattice[n][2][-1] is not None or showall) 77 | ] 78 | 79 | def dumps(self): 80 | return pickle.dumps(tuple(self)) 81 | 82 | @staticmethod 83 | def load(bt): 84 | return pickle.loads(bt) 85 | 86 | 87 | 88 | 89 | class Path_Finding (Early_Stop_Pointwise, Base_Task): 90 | """ 91 | finding path in a DAG 92 | """ 93 | name='joint chinese seg&tag from a word-tag lattice' 94 | codec=codec 95 | State=State 96 | Eval=Eval 97 | 98 | 99 | #~~~~~~~~~~~~~~~~~~~~~~~~~~ 100 | # init and weights 101 | 102 | def __init__(self,cmd_args,model=None,paras=None,logger=None): 103 | self.models={} 104 | self.build_ins={'word':Word,'base': Base_Features } 105 | 106 | if model==None : 107 | self.paras=paras 108 | 109 | self.models['base']=self.build_ins['base'](args=None,paras=self.paras) 110 | 111 | if hasattr(cmd_args,'task_features'): 112 | for k,v in cmd_args.task_features.items(): 113 | self.models[k]=self.build_ins[k](args=v,paras=self.paras) 114 | else : 115 | for k,v in model.items(): 116 | self.models[k]=self.build_ins[k](model=v) 117 | 118 | def dump_weights(self) : 119 | d={k:v.dump_weights() for k,v in self.models.items()} 120 | return d 121 | 122 | def add_model(self,model): 123 | for k,v in model.items(): 124 | self.models[k].add_model(v) 125 | 126 | 127 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 128 | # actions 129 | 130 | class Action : 131 | @staticmethod 132 | def encode(action): 133 | return action[0] 134 | @staticmethod 135 | def decode(action): 136 | return (action,None) 137 | 138 | def result_to_actions(self,result): 
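        # result_to_actions maps the gold (word,tag) sequence onto lattice edges:
        # at each character offset it keeps the edge in self.lattice.begins[offset]
        # whose (word,tag) matches the gold item, emits it as a shift action,
        # and advances the offset by the word length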
139 | offset=0 140 | actions=[] 141 | for g in result : 142 | nex=[[ind,self.lattice[ind]] for ind in self.lattice.begins[offset]] 143 | nex=[ind for ind, it in nex if (it[2][0],it[2][1])==g] 144 | actions.append((nex[0],None)) 145 | offset+=len(g[0]) 146 | return actions 147 | 148 | def actions_to_result(self,actions): 149 | seq=[self.lattice[action[0]] for action in actions] 150 | seq=[(it[0],it[1])for _,_,it in seq] 151 | return seq 152 | 153 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 154 | # states 155 | 156 | def _next_ind(self,last_ind,action): 157 | next_ind=last_ind+len(self.lattice[action][2][0]) 158 | return next_ind if next_ind != self.lattice.length else -1 159 | 160 | def shift(self,last_ind,stat,showall=False): 161 | rtn= [(a,self._next_ind(last_ind,a),s) 162 | for a,s in self.State(self.lattice,stat).shift(showall)] 163 | return rtn 164 | 165 | reduce=None 166 | 167 | 168 | def actions_to_moves(self,actions,lattice): 169 | state=self.State(lattice) 170 | stack=[state] 171 | moves=[[None,None,action] for action in actions] 172 | moves[0][0]=0 173 | moves[0][1]=self.State.init_state 174 | for i in range(len(moves)-1) : 175 | move=moves[i] 176 | step,state,action=move 177 | ind,label=action 178 | if ind >=0 : # shift 179 | rst=[[nstep,ns] for a,nstep,ns in self.shift(step,state,True) if a==self.Action.encode(action)] 180 | moves[i+1][0],moves[i+1][1]=rst[0] 181 | stack.append(rst[0][1]) 182 | else : # reduce 183 | s0=stack.pop() 184 | s1=stack.pop() 185 | rst=[[nstep,ns] for a,nstep,ns,_ in self.reduce(step,s0,[0],[s1]) if a==self.Action.encode(action)] 186 | moves[i+1][0],moves[i+1][1]=rst[0] 187 | stack.append(rst[0][1]) 188 | pass 189 | for move in moves: 190 | move[2]=self.Action.encode(move[2]) 191 | 192 | moves=list(map(tuple,moves)) 193 | return moves 194 | 195 | # feature related 196 | 197 | def set_raw(self,raw,Y): 198 | self.lattice=raw 199 | self.atoms=[] 200 | for ind in range(len(self.lattice)): 201 | data=self.lattice[ind] 202 | b=data[0] 203 | e=data[1] 204 | w,t,m=data[2] 205 | self.atoms.append((w,t,m,str(len(w)))) 206 | self.atoms.append(('~','~','','0')) 207 | 208 | for model in self.models.values() : 209 | model.set_raw(self.atoms) 210 | 211 | def gen_features(self,state,actions,delta=0): 212 | ind1,ind2=self.State(self.lattice,state) 213 | scores=[[sum(model(ind1,ind2,ind3,delta) for model in self.models.values())] 214 | for ind3 in actions] 215 | return scores 216 | 217 | def cal_delta(self,std_moves,rst_moves) : 218 | delta=0.0001 #### TODO: delta 219 | dirty=set() 220 | for b,e,data in self.lattice : 221 | if data[-1]==None : 222 | for x in range(b,e) : 223 | dirty.add(x) 224 | 225 | max_step=max(x[0] for x in rst_moves) 226 | std_moves=set(x for x in std_moves if x[0]<=max_step) 227 | rst_moves=set(rst_moves) 228 | for m in std_moves-rst_moves : 229 | a,b=pickle.loads(m[1]) 230 | c=m[-1] 231 | flag=True 232 | for x in [a,b,c] : 233 | if x==-1 : continue 234 | l,r,_=self.lattice[x] 235 | for ind in range(l,r): 236 | if ind in dirty : flag=False 237 | if flag : 238 | pass 239 | self._update(m,delta) 240 | for m in rst_moves-std_moves : 241 | a,b=pickle.loads(m[1]) 242 | c=m[-1] 243 | flag=True 244 | for x in [a,b,c] : 245 | if x==-1 : continue 246 | l,r,_=self.lattice[x] 247 | for ind in range(l,r): 248 | if ind in dirty : flag=False 249 | if flag : 250 | pass 251 | self._update(m,-delta) 252 | 253 | def __del__(self): 254 | for model in self.models.values() : 255 | if hasattr(model,'close') : 256 | model.close() 257 | 
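The lattice-line format consumed by codec.decode above is easiest to see on a toy line (the tokens below are made up for illustration; each whitespace-separated token is "label,begin,end,word,tag,confidence", and label 1 marks an edge on the gold path):

    line = '1,0,2,你好,IJ,0.9 0,0,1,你,PN,0.2 1,2,3,吗,SP,0.8'
    data = codec.decode(line)
    # data['raw'] is a Lattice of (begin, end, (word, tag, conf)) edges;
    # a confidence of -1 or lower is mapped to None
    # data['y'] is the gold word/tag sequence: [('你好', 'IJ'), ('吗', 'SP')]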
-------------------------------------------------------------------------------- /isan/tagging/wb_tag_symbolic.py: -------------------------------------------------------------------------------- 1 | import math 2 | class Base_Features : 3 | def __init__(self,args={},model=None,paras=None): 4 | if model == None : 5 | self.w=paras.add({}) 6 | else : 7 | self.w=model 8 | 9 | def dump_weights(self): 10 | return self.w.output_obj() 11 | 12 | def add_model(self,model): 13 | self.w.add_model(model) 14 | pass 15 | 16 | def set_raw(self,atoms): 17 | self.atoms=atoms 18 | 19 | def __call__(self,ind1,ind2,ind3,delta=0) : 20 | strm=lambda x:'x' if x=='' else str(math.floor(math.log((x if x>0 else 0)*2+1))) 21 | w1,t1,m1,len1=self.atoms[ind1] 22 | w2,t2,m2,len2=self.atoms[ind2] 23 | w3,t3,m3,len3=self.atoms[ind3] 24 | fv=( 25 | (['m3~'+strm(m3), ] if m3 is not None else []) + 26 | ([ 'm3m2~'+strm(m3)+'~'+strm(m2), ] if m3 is not None and m2 is not None else [])+ 27 | [ 28 | 'w3~'+w3, 't3~'+t3, 'l3~'+len3, 'w3t3~'+w3+t3, 'l3t3~'+len3+t3, 29 | 30 | 'w3w2~'+w3+"~"+w2, 'w3t2~'+w3+t2, 't3w2~'+t3+w2, 't3t2~'+t3+t2, 31 | 32 | 'l3w2~'+len3+'~'+w2, 'w3l2~'+w3+'~'+len2, 'l3t2~'+len3+'~'+t2, 't3l2~'+t3+'~'+len2, 33 | 'l3l2~'+len3+'~'+len2, 34 | 35 | 't3t1~'+t3+'~'+t1, 't3t2t1~'+t3+'~'+t2+'~'+t1, 36 | 'l3l1~'+len3+'~'+len1, 'l3l2l1~'+len3+'~'+len2+'~'+len1, 37 | ]) 38 | 39 | if delta==0 : 40 | v= float(self.w(fv)) 41 | return v 42 | else : 43 | self.w.add_delta(fv,delta*10) 44 | return 0 45 | 46 | -------------------------------------------------------------------------------- /isan/utls/Makefile: -------------------------------------------------------------------------------- 1 | cxx=g++ -O3 -I ~/isan/ -I /usr/include/python3.2mu -shared -fPIC 2 | 3 | all: dat 4 | 5 | 6 | dat: dat.cc *.hpp ../common/*.hpp 7 | $(cxx) -g -o dat.so dat.cc 8 | -------------------------------------------------------------------------------- /isan/utls/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 在 `isan.utls` 下面有很多实用工具 4 | """ 5 | 6 | def test(): 7 | print("this is a test function") 8 | -------------------------------------------------------------------------------- /isan/utls/average.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import argparse 3 | import gzip 4 | import pickle 5 | import math 6 | import sys 7 | 8 | 9 | if __name__ == '__main__': 10 | argv=sys.argv[1:] 11 | dst=argv[-1] 12 | models=argv[:-1] 13 | 14 | weights={} 15 | numbers={} 16 | 17 | #models=['model_train_'+str(x)+'.txt' for x in [0,1,2,3,4]] 18 | #models=['model_'+str(x)+'.gz' for x in [1,2,3,4,5]] 19 | for model in models: 20 | print(model) 21 | for k,v in pickle.load(gzip.open(model)).items(): 22 | if k not in weights : 23 | weights[k]=0 24 | numbers[k]=0 25 | weights[k]+=v 26 | if v!=0 : numbers[k]+=1 27 | 28 | 29 | for k,n in numbers.items(): 30 | if n!=0 : 31 | #weights[k]=round(weights[k]/max(n-0.5,1)) 32 | weights[k]=round(weights[k]/n) 33 | #weights[k]=round(weights[k]/len(models)) 34 | 35 | pickle.dump(weights,gzip.open(dst,'wb')) 36 | 37 | -------------------------------------------------------------------------------- /isan/utls/cdat2/Makefile: -------------------------------------------------------------------------------- 1 | cdat.so: cdat.cc 2 | g++ cdat.cc -I /usr/include/python3.2mu -shared -o cdat.so -fPIC 3 | -------------------------------------------------------------------------------- 
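A note on the weight-averaging utility a few files above (isan/utls/average.py): it sums several gzipped, pickled weight dictionaries and divides each feature's total by the number of models in which that feature was non-zero (rounding to an integer), so a feature fired by only some of the models is averaged over those models only. A hypothetical invocation (the model file names are made up):

    python3 isan/utls/average.py model_fold1.gz model_fold2.gz model_fold3.gz model_avg.gz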
/isan/utls/cdat2/cdat.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "dat.h" 4 | /** 5 | * g++ spammodule.c -I /usr/include/python3.2mu -shared -o spam.so -fPIC 6 | * */ 7 | 8 | 9 | using namespace dat; 10 | 11 | static PyObject * 12 | cdat_open(PyObject *self, PyObject *args) 13 | { 14 | char* filename=NULL; 15 | PyArg_ParseTuple(args,"s",&filename); 16 | DAT* dat=new DAT(filename); 17 | return PyLong_FromLong((size_t)dat); 18 | 19 | } 20 | static PyObject * 21 | cdat_close(PyObject *self, PyObject *args) 22 | { 23 | PyObject* handler=NULL; 24 | PyArg_ParseTuple(args,"O",&handler); 25 | delete (DAT*)PyLong_AsLong(handler); 26 | return Py_None; 27 | } 28 | static PyObject * 29 | cdat_get(PyObject *self, PyObject *args) 30 | { 31 | PyObject* handler=NULL; 32 | PyObject* py_key=NULL; 33 | int no_prefix=0; 34 | PyArg_ParseTuple(args,"OOi",&handler,&py_key,&no_prefix); 35 | DAT* dat=(DAT*)PyLong_AsLong(handler); 36 | 37 | Word key; 38 | size_t key_size=PySequence_Size(py_key); 39 | for(size_t i=0;ip_match(key); 45 | }else{ 46 | ind=dat->match(key); 47 | }; 48 | if (ind==-1) return Py_None; 49 | return PyLong_FromLong(dat->dat[ind].base); 50 | } 51 | static PyObject * 52 | cdat_set(PyObject *self, PyObject *args) 53 | { 54 | PyObject* handler=NULL; 55 | PyObject* py_key=NULL; 56 | int value=0; 57 | PyArg_ParseTuple(args,"OOi",&handler,&py_key,&value); 58 | DAT* dat=(DAT*)PyLong_AsLong(handler); 59 | 60 | Word key; 61 | size_t key_size=PySequence_Size(py_key); 62 | for(size_t i=0;ip_match(key); 66 | if (ind==-1) return Py_None; 67 | dat->dat[ind].base=value; 68 | return PyLong_FromLong(value); 69 | } 70 | static PyObject * 71 | cdat_inc(PyObject *self, PyObject *args) 72 | { 73 | PyObject* handler=NULL; 74 | PyObject* py_key=NULL; 75 | int value=0; 76 | int no_prefix=0; 77 | PyArg_ParseTuple(args,"OOii",&handler,&py_key,&no_prefix,&value); 78 | DAT* dat=(DAT*)PyLong_AsLong(handler); 79 | 80 | Word key; 81 | size_t key_size=PySequence_Size(py_key); 82 | for(size_t i=0;ip_match(key); 86 | int ind=0; 87 | if(no_prefix){ 88 | ind=dat->p_match(key); 89 | }else{ 90 | ind=dat->match(key); 91 | }; 92 | if (ind==-1) return Py_None; 93 | dat->dat[ind].base+=value; 94 | return PyLong_FromLong(dat->dat[ind].base); 95 | } 96 | static PyObject * 97 | cdat_build(PyObject *self, PyObject *args) 98 | { 99 | PyObject* list=NULL; 100 | char* filename=NULL; 101 | PyArg_ParseTuple(args,"sO",&filename,&list); 102 | DATMaker dm; 103 | std::vector lexicon; 104 | size_t size=PySequence_Size(list); 105 | for(size_t i=0;ivalue=(int)PyLong_AsLong(PySequence_GetItem(line,1)); 112 | size_t key_size=PySequence_Size(py_key); 113 | for(size_t i=0;ikey.push_back(*(int*)PyUnicode_AS_UNICODE(PySequence_GetItem(py_key,i))); 115 | } 116 | } 117 | dm.make_dat(lexicon,true); 118 | dm.shrink(); 119 | dm.save_as(filename); 120 | fprintf(stderr,"size of DAT %d\n",(int)dm.dat_size); 121 | return Py_None; 122 | }; 123 | 124 | 125 | static PyMethodDef cdatMethods[] = { 126 | {"build", cdat_build, METH_VARARGS,""}, 127 | {"open", cdat_open, METH_VARARGS,""}, 128 | {"close", cdat_close, METH_VARARGS,""}, 129 | {"get", cdat_get, METH_VARARGS,""}, 130 | {"set", cdat_set, METH_VARARGS,""}, 131 | {"inc", cdat_inc, METH_VARARGS,""}, 132 | {NULL, NULL, 0, NULL} /* Sentinel */ 133 | }; 134 | static struct PyModuleDef cdatmodule = { 135 | PyModuleDef_HEAD_INIT, 136 | "spam", /* name of module */ 137 | NULL, /* module documentation, may be NULL */ 138 | -1, /* size of 
per-interpreter state of the module, 139 | or -1 if the module keeps state in global variables. */ 140 | cdatMethods 141 | }; 142 | PyMODINIT_FUNC 143 | PyInit_cdat(void) 144 | { 145 | return PyModule_Create(&cdatmodule); 146 | } 147 | -------------------------------------------------------------------------------- /isan/utls/cdat2/dat_builder.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include"dat.h" 9 | 10 | using namespace dat; 11 | 12 | 13 | void showhelp(){ 14 | printf("双数组TRIE树构建器\n\t作者:张开旭\n"); 15 | printf(" get words and make DAT\n"); 16 | printf("-f filename\n use filename instead of stdin\n"); 17 | printf("-s\n save base array and check array Seperately\n"); 18 | printf("-P\n 申明没有一个词是另一个词的前缀,将编号存在base,而不是base指向的节点\n"); 19 | 20 | } 21 | int main(int argc,char **argv){ 22 | int c; 23 | int is_old_style=false; 24 | char* lexicon_filename=NULL; 25 | int no_prefix=0; 26 | char separator=0; 27 | while ( (c = getopt(argc, argv, "f:shPi")) != -1) { 28 | switch (c) { 29 | case 'i':// the index is 30 | separator=' '; 31 | break; 32 | case 's'://seperated two arrays 33 | is_old_style=true; 34 | break; 35 | case 'P'://prefix free 36 | no_prefix=true; 37 | break; 38 | case 'f' : //specify the file 39 | lexicon_filename = optarg; 40 | break; 41 | case 'h' : 42 | case '?' : 43 | default : 44 | showhelp(); 45 | return 1; 46 | } 47 | } 48 | char* dat_filename=argv[optind]; 49 | 50 | //输入文件名 51 | FILE* inputFile=stdin; 52 | std::istream* is=&std::cin; 53 | std::cout<<"begin\n"; 54 | std::string str; 55 | if(lexicon_filename){ 56 | std::cout<<"file\n"; 57 | is=new std::ifstream(lexicon_filename,std::ifstream::in); 58 | } 59 | 60 | 61 | DATMaker dm; 62 | fprintf(stderr,"Double Array Trie Builder, author ZHANG, Kaixu\n"); 63 | std::vector lexicon; 64 | lexicon.push_back(DATMaker::KeyValue()); 65 | int end_character=0; 66 | 67 | //load wordlist 68 | int id=0; 69 | 70 | void* rtn; 71 | do{ 72 | rtn=std::getline(*is,str); 73 | if(str.length()==0)continue; 74 | if(separator){//to find a score as value instread of id 75 | int sep_ind=str.rfind(separator); 76 | //thulac::string_to_raw(str.substr(0,sep_ind),lexicon.back().key); 77 | //std::cout<0){ 95 | if(separator){//to find a score as value instread of id 96 | int sep_ind=lexicon.back().key.rfind(separator); 97 | std::cout< 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "isan/common/common.hpp" 7 | #include "dat.hpp" 8 | 9 | using namespace isan; 10 | 11 | static PyObject * 12 | make_dat(PyObject *self, PyObject *arg){ 13 | std::cout<<"hello\n"; 14 | 15 | PyObject * py_list; 16 | PyArg_ParseTuple(arg, "O", &py_list); 17 | std::vector > list; 18 | list.clear(); 19 | 20 | 21 | long size=PySequence_Size(py_list); 22 | std::cerr<<"list size: "<()); 27 | list.back().first=Dict_Item(PyTuple_GET_ITEM(tri,0)); 28 | list.back().second=PyLong_AsLong(PyTuple_GET_ITEM(tri,1)); 29 | }; 30 | 31 | std::sort(list.begin(),list.end(),item_cmp); 32 | DATMaker dm; 33 | dm.make_dat(list,0); 34 | dm.shrink(); 35 | std::cout< 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "isan/common/general_types.hpp" 10 | 11 | 12 | namespace isan{ 13 | 14 | typedef Feature_String Dict_Item; 15 | typedef Feature_String Word; 16 | typedef char Character; 17 | 18 | 19 | 20 | class DAT{ 21 | public: 22 | struct Entry{ 23 | int base; 24 | int check; 25 | }; 26 | 27 | void* mmap_ptr; 28 | Entry* dat; 
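        // base/check double-array trie: the child of node `base` for character c
        // sits at dat[dat[base].base + c] and is valid only when its .check equals base;
        // after a full match, dat[base].base points at the cell that stores the value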
29 | size_t dat_size; 30 | DAT():mmap_ptr(NULL){}; 31 | DAT(const char* filename){ 32 | 33 | FILE * pFile=fopen(filename,"r+b"); 34 | if(!pFile){ 35 | fprintf(stderr,"[ERROR] DAT file %s not found\n",filename); 36 | } 37 | fseek(pFile,0,SEEK_END); 38 | dat_size=ftell(pFile)/sizeof(Entry); 39 | rewind(pFile); 40 | int rtn; 41 | fclose(pFile); 42 | 43 | int fd=open(filename,O_RDWR); 44 | mmap_ptr=mmap(NULL,sizeof(Entry)*dat_size,PROT_READ|PROT_WRITE,MAP_SHARED,fd,0); 45 | dat=(Entry*)mmap_ptr; 46 | close(fd); 47 | } 48 | 49 | void save_as(const char* filename){ 50 | FILE * pFile=fopen(filename,"wb"); 51 | fwrite(dat,sizeof(Entry),dat_size,pFile); 52 | fclose(pFile); 53 | } 54 | inline int get(const Word& word)const{ 55 | register int ind=0; 56 | register int base=0; 57 | for(int i=0;i=dat_size)||dat[ind].check!=base)return 0; 60 | base=ind; 61 | } 62 | ind=dat[base].base; 63 | if((indmmap_ptr){ 70 | msync(this->mmap_ptr,sizeof(Entry)*this->dat_size,MS_ASYNC); 71 | munmap(this->mmap_ptr,sizeof(Entry)*this->dat_size); 72 | }else{ 73 | free(dat); 74 | } 75 | } 76 | inline int get_index(int base,const Character& character){ 77 | int ind=dat[base].base+character; 78 | if((ind>=dat_size)||dat[ind].check!=base)return -1; 79 | return ind; 80 | }; 81 | /*return -base or number of matched characters*/ 82 | int get_info(Word& prefix){ 83 | register int ind=0; 84 | register int base=0; 85 | for(size_t i=0;i=dat_size)||dat[ind].check!=base)return i; 88 | base=ind; 89 | } 90 | return -base; 91 | } 92 | }; 93 | 94 | 95 | class DATMaker: public DAT{ 96 | public: 97 | typedef std::pair KeyValue; 98 | static bool compare_words (const DATMaker::KeyValue& first, const DATMaker::KeyValue& second) 99 | { 100 | const Word& first_word=first.first; 101 | const Word& second_word=second.first; 102 | size_t min_size=(first_word.size()second_word[i])return false; 105 | if(first_word[i]=0)printf("cell reused!!\n"); 125 | if(dat[ind].base==1){ 126 | head=dat[ind].check; 127 | }else{ 128 | dat[-dat[ind].base].check=dat[ind].check; 129 | }; 130 | if(dat[ind].check==-dat_size){ 131 | tail=dat[ind].base; 132 | }else{ 133 | dat[-dat[ind].check].base=dat[ind].base; 134 | }; 135 | dat[ind].check=ind; 136 | }; 137 | void extends(){ 138 | int old_size=dat_size; 139 | dat_size*=2; 140 | dat=(Entry*)realloc(dat,sizeof(Entry)*dat_size); 141 | for(int i=0;i=0)dat[-tail].check=-old_size; 147 | tail=-(old_size*2-1); 148 | } 149 | void shrink(){//thrink之后双向链表就不需要保持了 150 | int last=dat_size-1; 151 | while(dat[last].check<0)last--; 152 | dat_size=last+1; 153 | dat=(Entry*)realloc(dat,sizeof(Entry)*dat_size); 154 | } 155 | 156 | int alloc(std::vector& offsets){ 157 | size_t size=offsets.size(); 158 | register size_t base=-head; 159 | while(1){ 160 | if(base==dat_size)extends(); 161 | if(size) 162 | while(base+offsets[size-1]>=dat_size) 163 | extends(); 164 | register int flag=true; 165 | if(dat[base].check>=0){ 166 | flag=false; 167 | }else{ 168 | for(register int i=0;i=0){//used 170 | flag=false; 171 | break; 172 | } 173 | } 174 | } 175 | if(flag){ 176 | use(base); 177 | for(int i=0;i& lexicon,int start,const Word& prefix,std::vector&children){ 185 | children.clear(); 186 | size_t l=prefix.size(); 187 | for(size_t ind=start;indl){ 192 | if(children.empty()||word[l]!=children.back()) 193 | children.push_back(word[l]); 194 | } 195 | } 196 | } 197 | int assign(int check,std::vector& offsets,int is_word=false){ 198 | int base=alloc(offsets); 199 | //base 200 | dat[base].base=0; 201 | if(is_word){//如果是词 202 | dat[base].check=check; 203 | 
}else{//如果不是词 204 | dat[base].check=base; 205 | } 206 | 207 | for(int i=0;i<(int)offsets.size();i++){ 208 | dat[base+offsets[i]].base=0; 209 | dat[base+offsets[i]].check=check; 210 | } 211 | dat[check].base=base; 212 | //printf("is_word %d base %d\n",is_word,base); 213 | return base; 214 | } 215 | void make_dat(std::vector& lexicon,int no_prefix=0){ 216 | std::sort(lexicon.begin(),lexicon.end(),&compare_words); 217 | 218 | int size=(int)lexicon.size(); 219 | //std::cout< children; 221 | Word prefix; 222 | //prefix.clear(); 223 | gen_children(lexicon,0,prefix,children); 224 | //std::cout<<"children size: "<get_info(word); 231 | //std::cout<<"off: "<get_info(prefix); 241 | //std::cout<<"p_base "<get_info(word); 247 | if(no_prefix){ 248 | dat[off].base=lexicon[i].second; 249 | }else{ 250 | dat[dat[off].base].base=lexicon[i].second; 251 | } 252 | //if(i&&(i%100000==0))printf("%f\n",(double)i/size); 253 | //int zkx; 254 | //std::cin>>zkx; 255 | } 256 | 257 | } 258 | 259 | void print(){ 260 | printf("head %d, tail %d\n",head,tail); 261 | for(int i=0;i<(int)dat_size;i++) 262 | printf("[%d,%d,%d] ",i,dat[i].base,dat[i].check); 263 | printf("\n"); 264 | } 265 | }; 266 | 267 | 268 | };//end of isan 269 | -------------------------------------------------------------------------------- /isan/utls/divde.py: -------------------------------------------------------------------------------- 1 | divide.py -------------------------------------------------------------------------------- /isan/utls/divide.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 分割文件:: 4 | 5 | ./divide.py number:filename [number:filename ...] 6 | 7 | 8 | """ 9 | import itertools 10 | import sys 11 | if __name__=='__main__': 12 | cycle=[] 13 | file_dict={} 14 | for item in sys.argv[1:]: 15 | n,_,file=item.rpartition(':') 16 | if not n:n='1' 17 | if file not in file_dict: 18 | 19 | file_dict[file]=open(file,'w') if file else None 20 | cycle+=[file_dict[file]]*int(n) 21 | for out_file,line in zip(itertools.cycle(cycle),sys.stdin): 22 | if out_file: 23 | print(line.strip(),file=out_file) 24 | -------------------------------------------------------------------------------- /isan/utls/draw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import os 3 | import sys 4 | import subprocess 5 | 6 | def exp_tree(line,node,head,src): 7 | if len(node[head])==1:return 8 | node[head].sort() 9 | for ind in node[head]: 10 | if ind!=head: 11 | src.append('"%d"[label="%s"];'%(ind,line[ind][1][0])) 12 | src.append("%d->%d;"%(head,ind)) 13 | else: 14 | src.append('"~%d"[label="~",shape="point"];'%(head)) 15 | src.append('%d->"~%d";'%(head,head)) 16 | for ind in node[head]: 17 | if ind !=head: 18 | exp_tree(line,node,ind,src) 19 | def encode(line,T='png'): 20 | line=[[ind,item.split('_')] for ind,item in enumerate(line.split())] 21 | node=[[ind] for ind in range(len(line))] 22 | head=-1 23 | for ind,item in line: 24 | item[2]=int(item[2]) 25 | if item[2]==-1: 26 | head=ind 27 | else: 28 | node[item[2]].append(ind) 29 | src=["digraph unix {", 30 | "node[shape=box];", 31 | "rankdir=TD;"] 32 | src.append('"%d"[label="%s"];'%(head,line[head][1][0])) 33 | exp_tree(line,node,head,src) 34 | src.append("}") 35 | src='\n'.join(src) 36 | dot=subprocess.Popen(["dot","-T"+T],stdin=subprocess.PIPE,stdout=subprocess.PIPE) 37 | stdout,stderr=dot.communicate(src.encode()) 38 | return stdout 39 | 40 | if __name__=="__main__": 41 | 
line='''导弹_NN_2 不_AD_2 必_VV_22 带_VV_2 弹头_NN_3 ,_PU_22 目标_NN_8 不_AD_8 必_VV_13 在_VV_8 有_VE_11 居民_NN_9 之_DEC_8 地_NN_22 ,_PU_22 例如_AD_22 次要_JJ_17 外岛_NN_20 或_CC_20 领海_NN_20 边缘_NN_22 皆_AD_22 可_VV_-1 。_PU_22''' 42 | if len(sys.argv)<2: 43 | print('请输入要保存的文件名') 44 | exit() 45 | filename=sys.argv[1] 46 | 47 | for line in sys.stdin: 48 | open(filename,'wb').write(encode(line)) 49 | exit() 50 | -------------------------------------------------------------------------------- /isan/utls/indexer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | class Indexer(list) : 3 | def __init__(self): 4 | self.d=dict() 5 | pass 6 | def __call__(self,key): 7 | if key not in self.d : 8 | self.d[key]=len(self) 9 | self.append(key) 10 | return self.d[key] 11 | 12 | -------------------------------------------------------------------------------- /isan/utls/pydat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import dat 3 | 4 | if __name__ == '__main__': 5 | abc=[ 6 | (b'ab',33), 7 | (b'sa',44), 8 | (b'a',11), 9 | (b'aa',22), 10 | 11 | ] 12 | dat.make(abc) 13 | pass 14 | 15 | -------------------------------------------------------------------------------- /isan/utls/segconv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import re 3 | import sys 4 | import argparse 5 | import json 6 | 7 | def wiki_reader(line): 8 | line=re.sub(r'^([^\[]*)\]\]',r'\1',line) 9 | line=re.sub(r'\[\[([^\]]*)$',r'\1',line) 10 | if '[[' not in line : return None 11 | line=re.split('(\[\[[^\]]+\]\])',line) 12 | line=[x for x in line if x] 13 | word_start_at=-1 14 | intervals=[[-1,-1]] 15 | offset=0 16 | raw=[] 17 | for w in line: 18 | in_word=0 19 | if w[:2]=='[[': 20 | w=w[2:-2] 21 | in_word=1 22 | else : 23 | pass 24 | raw.append(w) 25 | for c in w[:-1]: 26 | if in_word: 27 | intervals.append([offset,offset+len(w)]) 28 | else : 29 | intervals.append([-1,-1]) 30 | intervals.append([-1,-1]) 31 | offset+=len(w) 32 | raw=''.join(raw) 33 | if not(len(raw)+1==len(intervals)): 34 | input('assert') 35 | return None 36 | return {'raw':raw,'Y':[None,intervals]} 37 | 38 | def seg_reader(line) : 39 | y=line.split() 40 | return {'seg' : y, 'Y' : None} 41 | 42 | def raw_reader(data) : 43 | data=data.strip() 44 | return {'raw' : data,'Y': None} 45 | def raw_writer(data) : 46 | if 'raw' in data : return data['raw'] 47 | if 'seg' in data : return ''.join(data['seg']) 48 | 49 | def seg_writer(data) : 50 | if 'seg' in data : return data['seg'] 51 | return None 52 | 53 | def raw_with_Ya_writer(data) : 54 | raw=raw_writer(data) 55 | Y=None 56 | if 'Y' in data : 57 | Y=data['Y'] 58 | return json.dumps({'raw' : raw,'y': seg_writer(data), 59 | 'Y_a' : Y},ensure_ascii=False) 60 | def raw_with_Ya_reader(data) : 61 | data=json.loads(data) 62 | raw=data.get('raw') 63 | Ya=data.get('Y_a',None) 64 | return {'raw':raw,'Y':Ya} 65 | 66 | def raw_with_Yb_writer(data) : 67 | raw=raw_writer(data) 68 | Y=None 69 | if 'Y' in data : 70 | Y=data['Y'] 71 | return json.dumps({'raw' : raw,'y': seg_writer(data), 72 | 'Y_b' : Y},ensure_ascii=False) 73 | def raw_with_Yb_reader(data) : 74 | data=json.loads(data) 75 | raw=data.get('raw') 76 | Ya=data.get('Y_b',None) 77 | return {'raw':raw,'Y':Ya} 78 | 79 | if __name__ == '__main__': 80 | readers={'raw':raw_reader, 81 | 'seg':seg_reader, 82 | 'wiki':wiki_reader, 83 | 'Ya': raw_with_Ya_reader, 84 | 'Yb': raw_with_Yb_reader, 85 | } 86 | writers={'raw': 
raw_writer, 87 | 'Ya': raw_with_Ya_writer, 88 | 'Yb':raw_with_Yb_writer, 89 | } 90 | 91 | parser=argparse.ArgumentParser(description="分词相关的格式转换") 92 | parser.add_argument('-f','--from',dest='reader', 93 | choices=readers, 94 | metavar='源格式') 95 | parser.add_argument('-t','--to', 96 | dest='writer',default='raw', 97 | choices=writers, 98 | metavar='目标格式') 99 | args=parser.parse_args() 100 | reader=readers[args.reader] 101 | writer=writers[args.writer] 102 | 103 | 104 | for line in sys.stdin : 105 | line=line.strip() 106 | data=reader(line) 107 | if data : 108 | print(writer(data)) 109 | 110 | 111 | -------------------------------------------------------------------------------- /isan/utls/shuffle_lines.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | """ 3 | 命令行工具,用于打乱文件中行的顺序 4 | 5 | usage: shuffle_lines.py [-h] [-i] filename [filename ...] 6 | 7 | 如果提供一个文件,则打乱该文件顺序。 8 | 9 | 如果提供多个文件,则同步地打乱多个文件中行的顺序。 10 | 这里多个文件中对应行的数据有对应关系。 11 | 多个文件需要有相同数目的行。 12 | 13 | 给出 `-i` 参数, 则会将打乱顺序的内容写回文件。 14 | """ 15 | import argparse 16 | import random 17 | import sys 18 | 19 | if __name__ == '__main__': 20 | parser=argparse.ArgumentParser(description="随机化文件的行") 21 | parser.add_argument('filename',help='要操作的文件名',nargs='+') 22 | parser.add_argument('-i',help='不设定会输出到标准输出流,设定后写回原文件',action='store_true',dest='i') 23 | args=parser.parse_args() 24 | 25 | data=[] 26 | for lines in zip(*[open(fn) for fn in args.filename]): 27 | data.append([line.strip() for line in lines]) 28 | random.shuffle(data) 29 | out_file=[sys.stdout if not args.i else open(fn,'w') for fn in args.filename] 30 | for ls in data: 31 | for l,f in zip(ls,out_file) : 32 | print(l,file=f) 33 | exit() 34 | 35 | -------------------------------------------------------------------------------- /isan/utls/times.py: -------------------------------------------------------------------------------- 1 | import time 2 | class Times (dict) : 3 | def __call__(self,key): 4 | if key not in self : 5 | self[key]=[0,None] 6 | data=self[key] 7 | if data[1]==None : 8 | data[1]=time.time() 9 | else : 10 | data[0]+=time.time()-data[1] 11 | data[1]=None 12 | def __repr__(self): 13 | return '\n'.join(str(k)+":"+str(v[0]) for k,v in self.items()) 14 | -------------------------------------------------------------------------------- /isan/utls/to_full.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import sys 3 | import argparse 4 | 5 | def to_full(text,ignore=set()): 6 | """ 7 | 半角转全角的程序 8 | 空格变成全角 9 | 大于空格的直接加上偏移量 10 | 否则不变 11 | """ 12 | 13 | return ''.join( 14 | chr(x) if (x<32 or x>128 or x in ignore) else 15 | chr(12288) if x==32 else chr(x+65248) 16 | for x in map(ord,text)) 17 | 18 | 19 | if __name__ == '__main__': 20 | parser=argparse.ArgumentParser(description="") 21 | parser.add_argument('--ignore',help='忽略的',dest='ignore',type=str) 22 | parser.add_argument('--check',help='只显示改变了的',action='store_true') 23 | args=parser.parse_args() 24 | ignore=set() 25 | if args.ignore : 26 | for c in sys.argv[1] : 27 | ignore.add(ord(c)) 28 | 29 | for line in sys.stdin : 30 | line=line.strip() 31 | if args.check : 32 | rtn=to_full(line,ignore) 33 | if line!=rtn : 34 | print(rtn) 35 | else : 36 | print(to_full(line,ignore)) 37 | pass 38 | 39 | --------------------------------------------------------------------------------
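To make the offset rule in isan/utls/to_full.py concrete: printable half-width ASCII (except the space) is shifted up by 65248 into the full-width block, the half-width space becomes U+3000, and any code point listed in `ignore` is left untouched. A quick check, worked out by hand from that rule:

    >>> to_full('isan 2013')
    'ｉｓａｎ　２０１３'
    >>> to_full('isan 2013', ignore={ord('2')})
    'ｉｓａｎ　2０１３'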