├── .circleci
    └── config.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── conda-recipe
    └── python-zpar
    │   └── meta.yaml
├── examples
    ├── test.txt
    ├── test_tagged.txt
    ├── test_tokenized.txt
    ├── zpar_client.py
    └── zpar_example.py
├── setup.py
├── src
    ├── Makefile
    ├── Makefile.lib.zpar
    ├── reader.h
    └── zpar.lib.cpp
├── tests
    ├── test_depparser.py
    ├── test_depparser_no_wordnet.py
    ├── test_parser.py
    └── test_tagger.py
└── zpar
    ├── DepParser.py
    ├── Parser.py
    ├── Tagger.py
    ├── __init__.py
    └── zpar_server.py


/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | jobs:
 3 | 
 4 |   build:
 5 |     environment:
 6 |       ZPAR_MODEL_DIR: /root/english-models
 7 |       NLTK_DATA: /root/nltk/data
 8 |     docker:
 9 |       - image: buildpack-deps:trusty
10 |     working_directory: ~/repo
11 |     parallelism: 4
12 |     steps:
13 |       - checkout
14 |       - restore_cache:
15 |           keys:
16 |           - deps-and-models
17 |       - run: mkdir -p ~/repo/artifacts
18 |       - run: rm -r ~/repo/artifacts
19 |       - run:
20 |           name: Install miniconda and dependencies
21 |           command: |
22 |             wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
23 |             chmod +x miniconda.sh
24 |             ./miniconda.sh -b -f
25 |             ~/miniconda3/bin/conda config --add channels desilinguist
26 |             ~/miniconda3/bin/conda update --yes conda
27 |             ~/miniconda3/bin/conda install --yes six nose nltk
28 |             mkdir -p ~/nltk/data
29 |             ~/miniconda3/bin/python -m nltk.downloader wordnet -d ~/nltk/data
30 |       - run:
31 |           name: Download ZPar models
32 |           command: |
33 |             if [ ! -d ~/english-models ]; then wget https://github.com/frcchang/zpar/releases/download/v0.7.5/english-models.zip -O ~/english-models.zip; fi
34 |             if [ ! -d ~/english-models ]; then unzip ~/english-models.zip -d ${HOME}; fi
35 |       - run:
36 |           name: Install python-zpar in editable mode
37 |           command: |
38 |             make python-zpar
39 |             ~/miniconda3/bin/pip install -e .
40 | 
41 |       - save_cache:
42 |           paths:
43 |             - "~/miniconda3/pkgs"
44 |             - "~/english-models"
45 |           key: deps-and-models
46 | 
47 |       - run:
48 |           name: Run tests
49 |           command: |
50 |             TESTFILES=$(circleci tests glob "tests/test_*.py" | circleci tests split)
51 |             if [[ ${TESTFILES} == *_no_wordnet.py ]]; then NLTK_DATA= ; fi
52 |             ~/miniconda3/bin/nosetests -v ${TESTFILES}
53 | 
54 |       - store_artifacts:
55 |           path:  ~/repo/artifacts
56 |           destination:  artifacts
57 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | zpar.egg-info
2 | dist
3 | build
4 | build.sh
5 | python_zpar.egg-info
6 | zpar/__pycache__
7 | *.pyc
8 | test_twice.py


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2014 Nitin Madnani
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include Makefile
2 | include src/*
3 | include zpar/*
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all: python-zpar
 2 | 
 3 | clean:
 4 | 	rm -rf /tmp/zpar
 5 | 	rm -f /tmp/zpar.tar.gz
 6 | 
 7 | python-zpar: clean /tmp/zpar.tar.gz
 8 | 	tar -C /tmp/zpar -zxf /tmp/zpar.tar.gz --strip-components=1
 9 | 	cp src/zpar.lib.cpp /tmp/zpar/src/english
10 | 	cp src/Makefile.lib.zpar /tmp/zpar
11 | 	cp src/Makefile /tmp/zpar
12 | 	cp src/reader.h /tmp/zpar/src/include/reader.h
13 | 	make -C /tmp/zpar zpar.so
14 | 	mkdir -p zpar/dist
15 | 	cp /tmp/zpar/dist/zpar.so zpar/dist/
16 | 
17 | /tmp/zpar.tar.gz:
18 | 	wget -N https://github.com/frcchang/zpar/archive/v0.7.5.tar.gz -O /tmp/zpar.tar.gz
19 | 	touch $@
20 | 	mkdir /tmp/zpar
21 | 
22 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | NOTE
  2 | ~~~~
  3 | This project is no longer under active development since there are now
  4 | really nice pure Python parsers such as `Stanza <https://stanfordnlp.github.io/stanza/index.html>`__ and `Spacy <https://spacy.io>`__. The repository will remain here for archival purposes and the `PyPI <https://pypi.org/project/python-zpar/>`__ package will continue to be available.
  5 | 
  6 | Introduction
  7 | ~~~~~~~~~~~~
  8 | 
  9 | .. image:: https://circleci.com/gh/EducationalTestingService/python-zpar.svg?style=shield
 10 |    :alt: CircleCI Build status
 11 |    :target: https://circleci.com/gh/EducationalTestingService/python-zpar
 12 | 
 13 | **python-zpar** is a python wrapper around the `ZPar
 14 | parser <http://www.sutd.edu.sg/cmsresource/faculty/yuezhang/zpar.html>`__.
 15 | ZPar was written by `Yue Zhang <http://www.sutd.edu.sg/yuezhang.aspx>`__
 16 | while he was at Oxford University. According to its home page: *ZPar is
 17 | a statistical natural language parser, which performs syntactic analysis
 18 | tasks including word segmentation, part-of-speech tagging and parsing.
 19 | ZPar supports multiple languages and multiple grammar formalisms. ZPar
 20 | has been most heavily developed for Chinese and English, while it
 21 | provides generic support for other languages. ZPar is fast, processing
 22 | above 50 sentences per second using the standard Penn Teebank (Wall
 23 | Street Journal) data.*
 24 | 
 25 | I wrote python-zpar since I needed a fast and efficient parser for my
 26 | NLP work which is primarily done in Python and not C++. I wanted to be
 27 | able to use this parser directly from Python without having to create a
 28 | bunch of files and running them through subprocesses. python-zpar not
 29 | only provides a simply python wrapper but also provides an XML-RPC ZPar
 30 | server to make batch-processing of large files easier.
 31 | 
 32 | python-zpar uses
 33 | `ctypes <https://docs.python.org/3.4/library/ctypes.html>`__, a very
 34 | cool foreign function library bundled with Python that allows calling
 35 | functions in C DLLs or shared libraries directly.
 36 | 
 37 | **IMPORTANT**: As of now, python-zpar only works with the English zpar models
 38 | since the interface to the Chinese models is different than the English ones.
 39 | Pull requests are welcome!
 40 | 
 41 | Installation
 42 | ~~~~~~~~~~~~
 43 | 
 44 | Currently, python-zpar only works on 64-bit linux and OS X systems.
 45 | Those are the two platforms I use everyday. I am happy to try to get
 46 | python-zpar working on other platforms over time. Pull requests are
 47 | welcome!
 48 | 
 49 | Please make sure that ``make`` and ``wget`` are installed as they are both needed to properly build python-zpar.
 50 | 
 51 | In order for python-zpar to work, it requires C functions that can be
 52 | called directly. Since the only user-exposed entry point in ZPar is the
 53 | command line client, I needed to write a shared library that would have
 54 | functions built on top of the ZPar functionality but expose them in a
 55 | way that ctypes could understand.
 56 | 
 57 | Therefore, in order to build python-zpar from scratch, we need to
 58 | download the ZPar source, patch it with new functionality and compile
 59 | the shared library. All of this happens automatically when you install
 60 | with pip:
 61 | 
 62 | .. code-block:: bash
 63 | 
 64 |     pip install python-zpar
 65 | 
 66 | 
 67 | IF YOU ARE USING macOS
 68 | ======================
 69 | 
 70 | 1. On macOS, the installation will only work with ``gcc`` installed using either `macports <http://www.macports.org>`__ or `homebrew <http://brew.sh/>`__. The zpar source cannot be compiled with ``clang``. If you are having trouble compiling the code after cloning the repository or installing the package using pip, you can try to explicitly override the C++ compiler:
 71 | 
 72 |     .. code-block:: bash
 73 | 
 74 |         CXX=<path to c++ compiler> make -e
 75 | 
 76 |     or
 77 | 
 78 |     .. code-block:: bash
 79 | 
 80 |         CXX=<path to c++ compiler> pip install python-zpar
 81 | 
 82 | 
 83 |     If you are curious about what the C functions in the shared library
 84 |     module look like, see ``src/zpar.lib.cpp``.
 85 | 
 86 | 2. If you are using macOS Mojave, you will need an extra step before running the ``pip`` install command above. Starting with Mojave, Apple has stopped installing the C/C++ system header files into ``/usr/include``. As a workaround, they have provided the package ``/Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg`` that you must install to get the system headers back in the usual place before python-zpar can be compiled. For more details, please read the Command Line Tools section of the `Xcode 10 release notes <https://developer.apple.com/documentation/xcode_release_notes/xcode_10_release_notes>`__
 87 | 
 88 | 3. If you are using macOS Catalina, python-zpar is currently `broken <https://github.com/EducationalTestingService/python-zpar/issues/29>`__. I have not yet upgraded to Catalina on my production machine and cannot figure out a fix yet. If you have a suggested fix, please reply in the issue. 
 89 | 
 90 | Usage
 91 | ~~~~~
 92 | 
 93 | To use python-zpar, you need the English models for ZPar. They can be
 94 | downloaded from the ZPar release page `here <https://github.com/frcchang/zpar/releases/tag/v0.7.5>`__.
 95 | There are three models: a part-of-speech tagger, a constituency parser, and a
 96 | dependency parser. For the purpose of the examples below, the models are
 97 | in the ``english-models`` directory in the current directory.
 98 | 
 99 | Here's a small example of how to use python-zpar:
100 | 
101 | .. code-block:: python
102 | 
103 |     from six import print_
104 |     from zpar import ZPar
105 | 
106 |     # use the zpar wrapper as a context manager
107 |     with ZPar('english-models') as z:
108 | 
109 |         # get the parser and the dependency parser models
110 |         tagger = z.get_tagger()
111 |         depparser = z.get_depparser()
112 | 
113 |         # tag a sentence
114 |         tagged_sent = tagger.tag_sentence("I am going to the market.")
115 |         print_(tagged_sent)
116 | 
117 |         # tag an already tokenized sentence
118 |         tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?", tokenize=False)
119 |         print_(tagged_sent)
120 | 
121 |         # get the dependency parse of an already tagged sentence
122 |         dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
123 |         print_(dep_parsed_sent)
124 | 
125 |         # get the dependency parse of an already tokenized sentence
126 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False)
127 |         print_(dep_parsed_sent)
128 | 
129 |         # get the dependency parse of an already tokenized sentence
130 |         # and include lemma information (assuming you have NLTK as well
131 |         # as its WordNet corpus installed)
132 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False, with_lemmas=True)
133 |         print_(dep_parsed_sent)
134 | 
135 | 
136 | The above code sample produces the following output:
137 | 
138 | .. code-block::
139 | 
140 |     I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.
141 | 
142 |     Do/VBP n't/RB you/PRP want/VBP to/TO come/VB with/IN me/PRP to/TO the/DT market/NN ?/.
143 | 
144 |     I       PRP   1    SUB
145 |     am      VBP   -1   ROOT
146 |     going   VBG   1    VC
147 |     to      TO    2    VMOD
148 |     the     DT    5    NMOD
149 |     market  NN    3    PMOD
150 |     .       .     1    P
151 | 
152 |     Do      VBP  -1  ROOT
153 |     n't     RB   0   VMOD
154 |     you     PRP  0   SUB
155 |     want    VBP  0   VMOD
156 |     to      TO   5   VMOD
157 |     come    VB   3   VMOD
158 |     with    IN   5   VMOD
159 |     me      PRP  6   PMOD
160 |     to      TO   5   VMOD
161 |     the     DT   10  NMOD
162 |     market  NN   8   PMOD
163 |     ?       .    0   P
164 | 
165 |     Do      VBP  -1  ROOT   do
166 |     n't     RB   0   VMOD   n't
167 |     you     PRP  0   SUB    you
168 |     want    VBP  0   VMOD   want
169 |     to      TO   5   VMOD   to
170 |     come    VB   3   VMOD   come
171 |     with    IN   5   VMOD   with
172 |     me      PRP  6   PMOD   me
173 |     to      TO   5   VMOD   to
174 |     the     DT   10  NMOD   the
175 |     market  NN   8   PMOD   market
176 |     ?       .    0   P      ?
177 | 
178 | 
179 | Detailed usage with comments is shown in the included file
180 | ``examples/zpar_example.py``. Run ``python zpar_example.py -h`` to see a
181 | list of all available options.
182 | 
183 | ZPar Server
184 | ~~~~~~~~~~~
185 | 
186 | The package also provides an python XML-RPC implementation of a ZPar
187 | server that makes it easier to process multiple sentences and files by
188 | loading the models just once (via the ctypes interface) and allowing
189 | clients to connect and request analyses. The implementation is in the
190 | executable ``zpar_server`` that is installed when you install the
191 | package. The server is quite flexible and allows loading only the
192 | models that you need. Here's an example of how to start the server
193 | with only the tagger and the dependency parser models loaded:
194 | 
195 | .. code-block::
196 | 
197 |     $> zpar_server --modeldir english-models --models tagger parser depparser
198 |     INFO:Initializing server ...
199 |     Loading tagger from english-models/tagger
200 |     Loading model... done.
201 |     Loading constituency parser from english-models/conparser
202 |     Loading scores... done. (65.9334s)
203 |     Loading dependency parser from english-models/depparser
204 |     Loading scores... done. (14.9623s)
205 |     INFO:Registering introspection ...
206 |     INFO:Starting server on port 8859...
207 | 
208 | Run ``zpar_server -h`` to see a list of all options.
209 | 
210 | Once the server is running, you can connect to it using a client. An
211 | example client is included in the file ``examples/zpar_client.py`` which
212 | can be run as follows (note that if you specified a custom host and port
213 | when running the server, you'd need to specify the same here):
214 | 
215 | .. code-block::
216 | 
217 |     $> cd examples
218 |     $> python zpar_client.py
219 | 
220 |     INFO:Attempting connection to http://localhost:8859
221 |     INFO:Tagging "Don't you want to come with me to the market?"
222 |     INFO:Output: Do/VBP n't/RB you/PRP want/VBP to/TO come/VB with/IN me/PRP to/TO the/DT market/NN ?/.
223 |     INFO:Tagging "Do n't you want to come to the market with me ?"
224 |     INFO:Output: Do/VBP n't/RB you/PRP want/VBP to/TO come/VB to/TO the/DT market/NN with/IN me/PRP ?/.
225 |     INFO:Parsing "Don't you want to come with me to the market?"
226 |     INFO:Output: (SQ (VBP Do) (RB n't) (NP (PRP you)) (VP (VBP want) (S (VP (TO to) (VP (VB come) (PP (IN with) (NP (PRP me))) (PP (TO to) (NP (DT the) (NN market))))))) (. ?))
227 |     INFO:Dep Parsing "Do n't you want to come to the market with me ?"
228 |     INFO:Output: Do VBP -1  ROOT
229 |     n't RB  0   VMOD
230 |     you PRP 0   SUB
231 |     want    VBP 0   VMOD
232 |     to  TO  5   VMOD
233 |     come    VB  3   VMOD
234 |     to  TO  5   VMOD
235 |     the DT  8   NMOD
236 |     market  NN  6   PMOD
237 |     with    IN  5   VMOD
238 |     me  PRP 9   PMOD
239 |     ?   .   0   P
240 | 
241 |     INFO:Tagging file /Users/nmadnani/work/python-zpar/examples/test.txt into test.tag
242 |     INFO:Parsing file /Users/nmadnani/work/python-zpar/examples/test_tokenized.txt into test.parse
243 | 
244 | 
245 | Note that python-zpar and all of the example scripts should work with
246 | both Python 2.7 and Python 3.4. I have tested python-zpar on both Linux
247 | and Mac but not on Windows.
248 | 
249 | Node.js version
250 | ~~~~~~~~~~~~~~~
251 | 
252 | If you want to use ZPar in your node.js app, check out my other project
253 | `node-zpar <http://github.com/EducationalTestingService/node-zpar>`__.
254 | 
255 | License
256 | ~~~~~~~
257 | 
258 | Although python-zpar is licensed under the MIT license - which means
259 | that you can do whatever you want with the wrapper code - ZPar itself is
260 | licensed under GPL v3.
261 | 
262 | ToDo
263 | ~~~~
264 | 
265 | 1. Improve error handling on both the python and C side.
266 | 2. Expose more functionality, e.g., Chinese word segmentation, parsing
267 |    etc.
268 | 3. May be look into using `CFFI <https://cffi.readthedocs.org/>`__
269 |    instead of ctypes.
270 | 
271 | 


--------------------------------------------------------------------------------
/conda-recipe/python-zpar/meta.yaml:
--------------------------------------------------------------------------------
 1 | package:
 2 |     name: python-zpar
 3 |     version: "0.9.5"
 4 | 
 5 | source:
 6 |   path: ../../../python-zpar
 7 | 
 8 | build:
 9 |     number: {{environ.get('BINSTAR_BUILD', 0)}}
10 |     script:
11 |       - cd $SRC_DIR
12 |       - "{{ PYTHON }} setup.py install"
13 | 
14 | requirements:
15 |     build:
16 |       - python
17 |       - setuptools
18 |     run:
19 |       - python
20 |       - six
21 | 
22 | about:
23 |     home: https://github.com/EducationalTestingService/python-zpar
24 |     license: MIT
25 | 


--------------------------------------------------------------------------------
/examples/test.txt:
--------------------------------------------------------------------------------
1 | I am going to the market.
2 | Are you going to come with me?
3 | 


--------------------------------------------------------------------------------
/examples/test_tagged.txt:
--------------------------------------------------------------------------------
1 | I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.
2 | Are/VBP you/PRP going/VBG to/TO come/VB with/IN me/PRP ?/.
3 | 


--------------------------------------------------------------------------------
/examples/test_tokenized.txt:
--------------------------------------------------------------------------------
1 | I am going to the market .
2 | Are you going to come with me ?
3 | 


--------------------------------------------------------------------------------
/examples/zpar_client.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import argparse
  4 | import logging
  5 | import os
  6 | import six
  7 | import socket
  8 | import sys
  9 | 
 10 | if __name__ == '__main__':
 11 | 
 12 |     # set up an argument parser
 13 |     parser = argparse.ArgumentParser(prog='zpar_client.py')
 14 | 
 15 |    # set up the logging
 16 |     logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
 17 | 
 18 |     parser.add_argument('--host', dest='hostname',
 19 |                         help="Hostname or IP address",
 20 |                         default="localhost",
 21 |                         required=False)
 22 | 
 23 |     parser.add_argument('--port', dest='port', type=int,
 24 |                         help="Port number",
 25 |                         default=8859,
 26 |                         required=False)
 27 | 
 28 |     # parse given command line arguments
 29 |     args = parser.parse_args()
 30 | 
 31 |     # set up the logging
 32 |     logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
 33 | 
 34 |     # instantiate the client appropriately and connect
 35 |     logging.info('Attempting connection to http://{}:{}'.format(args.hostname,
 36 |                                                                 args.port))
 37 |     if six.PY2:
 38 |         import xmlrpclib
 39 |         proxy = xmlrpclib.ServerProxy('http://{}:{}'.format(args.hostname,
 40 |                                                             args.port),
 41 |                                       allow_none=True)
 42 |         fault = xmlrpclib.Fault
 43 |     else:
 44 |         import xmlrpc.client
 45 |         proxy = xmlrpc.client.ServerProxy('http://{}:{}'.format(args.hostname,
 46 |                                                                 args.port),
 47 |                                           use_builtin_types=True,
 48 |                                           allow_none=True)
 49 |         fault = xmlrpc.client.Fault
 50 | 
 51 |     # Make the remote procedure calls on the server
 52 |     try:
 53 | 
 54 |         test_sentence = "Don't you want to come with me to the market?"
 55 |         test_tagged_sentence = "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./."
 56 |         test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test.txt')
 57 |         tokenized_test_sentence = "Do n't you want to come to the market with me ?"
 58 |         tokenized_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_tokenized.txt')
 59 |         tagged_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_tagged.txt')
 60 |         tag_outfile = os.path.join(os.getcwd(), 'test.tag')
 61 |         parse_outfile = os.path.join(os.getcwd(), 'test.parse')
 62 |         parse_outfile2 = os.path.join(os.getcwd(), 'test_tagged.parse')
 63 | 
 64 |         logging.info('Tagging "{}"'.format(test_sentence))
 65 |         tagged_sent = proxy.tag_sentence(test_sentence)
 66 |         logging.info("Output: {}".format(tagged_sent))
 67 | 
 68 |         logging.info('Tagging "{}"'.format(tokenized_test_sentence))
 69 |         tagged_sent = proxy.tag_sentence(tokenized_test_sentence, False)
 70 |         logging.info("Output: {}".format(tagged_sent))
 71 | 
 72 |         logging.info('Parsing "{}"'.format(test_sentence))
 73 |         parsed_sent = proxy.parse_sentence(test_sentence)
 74 |         logging.info("Output: {}".format(parsed_sent))
 75 | 
 76 |         logging.info('Parsing "{}"'.format(test_tagged_sentence))
 77 |         parsed_sent = proxy.parse_tagged_sentence(test_tagged_sentence)
 78 |         logging.info("Output: {}".format(parsed_sent))
 79 | 
 80 |         logging.info('Dep Parsing "{}"'.format(tokenized_test_sentence))
 81 |         parsed_sent = proxy.dep_parse_sentence(tokenized_test_sentence, False)
 82 |         logging.info("Output: {}".format(parsed_sent))
 83 | 
 84 |         logging.info('Tagging file {} into {}'.format(test_file, tag_outfile))
 85 |         proxy.tag_file(test_file, tag_outfile)
 86 | 
 87 |         logging.info('Parsing file {} into {}'.format(tokenized_test_file, parse_outfile))
 88 |         proxy.parse_file(tokenized_test_file, parse_outfile, False)
 89 | 
 90 |         logging.info('Parsing tagged file {} into {}'.format(tagged_test_file, parse_outfile2))
 91 |         proxy.parse_tagged_file(tagged_test_file, parse_outfile2)
 92 | 
 93 |     except socket.error as err:
 94 |         sys.stderr.write("{}\n".format(err))
 95 |         sys.exit(1)
 96 |     except fault as flt:
 97 |         sys.stderr.write("Fault {}: {}\n".format(flt.faultCode,
 98 |                                                  flt.faultString))
 99 |         sys.exit(1)
100 | 
101 |     # Stop the server
102 |     # NOTE: You will probably do this in the last client (if you know
103 |     # which one that is) or in a clean-up script when you are absolutely sure
104 |     # that all clients are finished.
105 |     proxy.stop_server()
106 | 
107 | 


--------------------------------------------------------------------------------
/examples/zpar_example.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import os
 5 | from six import print_
 6 | 
 7 | from zpar import ZPar
 8 | 
 9 | def main():
10 |     # set up an argument parser
11 |     parser = argparse.ArgumentParser(prog='zpar_example.py')
12 |     parser.add_argument('--modeldir', dest='modeldir',
13 |                         help="Path to directory containing zpar English models",
14 |                         required=True)
15 | 
16 |     # parse given command line arguments
17 |     args = parser.parse_args()
18 | 
19 |     # use the zpar wrapper as a context manager
20 |     with ZPar(args.modeldir) as z:
21 | 
22 |         # get the parser and the dependency parser models
23 |         tagger = z.get_tagger()
24 |         depparser = z.get_depparser()
25 | 
26 |         # tag a sentence
27 |         tagged_sent = tagger.tag_sentence("I am going to the market.")
28 |         print_(tagged_sent)
29 | 
30 |         # tag an already tokenized sentence
31 |         tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?", tokenize=False)
32 |         print_(tagged_sent)
33 | 
34 |         # get the dependency parse of an already tagged sentence
35 |         dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
36 |         print_(dep_parsed_sent)
37 | 
38 |         # get the dependency parse of an already tokenized sentence
39 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False)
40 |         print_(dep_parsed_sent)
41 | 
42 |         # get the dependency parse of an already tokenized sentence
43 |         # and include lemma information (assuming you have NLTK as well
44 |         # as its WordNet corpus installed)
45 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False, with_lemmas=True)
46 |         print_(dep_parsed_sent)
47 | 
48 |         # compute POS tags for all sentences in "test.txt"
49 |         # and write the output to "test.tag". Note that the
50 |         # file contains a single sentence per line.
51 |         # The sentences need not be word tokenized
52 |         test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test.txt')
53 |         tagger.tag_file(test_file, "test.tag")
54 | 
55 |         # compute dependency parses for all sentences in "test_tokenized.txt"
56 |         tokenized_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_tokenized.txt')
57 |         depparser.dep_parse_file(tokenized_test_file, "test.dep")
58 | 
59 | if __name__ == '__main__':
60 |     main()
61 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Adapted from https://github.com/Turbo87/py-xcsoar/blob/master/setup.py
  4 | 
  5 | import os
  6 | from setuptools import setup
  7 | from setuptools.command.install import install
  8 | from distutils.command.build import build
  9 | from subprocess import call
 10 | 
 11 | import sys
 12 | 
 13 | BASEPATH = os.path.dirname(os.path.abspath(__file__))
 14 | ZPAR_PATH = os.path.join(BASEPATH, 'zpar')
 15 | ZPAR_LIB_PATH = os.path.join(ZPAR_PATH, 'dist')
 16 | 
 17 | def readme():
 18 |     with open('README.rst') as f:
 19 |         return f.read()
 20 | 
 21 | class build_zpar(build):
 22 |     def run(self):
 23 | 
 24 |         # run original build code
 25 |         build.run(self)
 26 | 
 27 |         # get a copy of the user environment
 28 |         env = os.environ.copy()
 29 | 
 30 |         sys.stderr.write('running build_zpar\n')
 31 | 
 32 |         # for now the compilation is just calling make
 33 |         # with the option to override the CXX defined
 34 |         # in the zpar Makefile with the CXX environment
 35 |         # variable if defined.
 36 |         if os.environ.get('CXX'):
 37 |             cmd = ['make', '-e']
 38 |             env['CXX'] = os.environ.get('CXX')
 39 |         else:
 40 |             cmd = ['make']
 41 | 
 42 |         # compile the shared library path
 43 |         def compile():
 44 |             sys.stderr.write('*' * 80 + '\n')
 45 |             ret = call(cmd, env=env)
 46 |             # if something went wrong, raise an error
 47 |             if ret:
 48 |                 raise RuntimeError('ZPar shared library compilation failed')
 49 |             sys.stderr.write('*' * 80 + '\n')
 50 |         self.execute(compile, [], 'compiling zpar library')
 51 | 
 52 |         # copy resulting tool to library build folder
 53 |         self.mkpath(self.build_lib)
 54 | 
 55 |         if not self.dry_run:
 56 |             self.copy_tree(ZPAR_PATH, self.build_lib)
 57 | 
 58 | class install_zpar(install):
 59 | 
 60 |     def initialize_options(self):
 61 |         install.initialize_options(self)
 62 |         self.build_scripts = None
 63 | 
 64 |     def finalize_options(self):
 65 |         install.finalize_options(self)
 66 |         self.set_undefined_options('build', ('build_scripts', 'build_scripts'))
 67 | 
 68 |     def run(self):
 69 |         # run original install code
 70 |         install.run(self)
 71 | 
 72 |         # install ZPar executables
 73 |         sys.stderr.write('running install_zpar\n')
 74 |         install_path = os.path.join(self.install_lib, 'zpar')
 75 |         self.mkpath(install_path)
 76 |         self.copy_tree(self.build_lib, install_path)
 77 | 
 78 | 
 79 | def read(fname):
 80 |     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 81 | 
 82 | 
 83 | setup(
 84 |     name='python-zpar',
 85 |     version='0.9.5',
 86 |     description='A Wrapper around the ZPar statistical tagger/parser for English',
 87 |     maintainer='Nitin Madnani',
 88 |     maintainer_email='nmadnani@ets.org',
 89 |     license='MIT',
 90 |     url='http://www.github.com/EducationalTestingService/python-zpar',
 91 |     long_description=readme(),
 92 |     classifiers=['Intended Audience :: Science/Research',
 93 |                  'Intended Audience :: Developers',
 94 |                  'License :: OSI Approved :: MIT License',
 95 |                  'Programming Language :: Python',
 96 |                  'Topic :: Software Development',
 97 |                  'Topic :: Scientific/Engineering',
 98 |                  'Operating System :: POSIX',
 99 |                  'Operating System :: Unix',
100 |                  'Operating System :: MacOS',
101 |                  'Programming Language :: Python :: 2',
102 |                  'Programming Language :: Python :: 2.7',
103 |                  'Programming Language :: Python :: 3',
104 |                  'Programming Language :: Python :: 3.3',
105 |                 ],
106 |     cmdclass={
107 |         'build': build_zpar,
108 |         'install': install_zpar,
109 |     },
110 |     entry_points={'console_scripts':
111 |                   ['zpar_server = zpar.zpar_server:main']}
112 | )
113 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
  1 | #****************************************************************
  2 | #
  3 | # Makefile
  4 | #
  5 | # Yue Zhang
  6 | # Computing lab, Oxford. 2006.10 - 2008.2
  7 | #
  8 | #****************************************************************
  9 | 
 10 | # Makable targe systems include:
 11 | #
 12 | # === Chinese ===
 13 | # segmentor - Chinese word segmentor
 14 | # chinese.postagger - Chinese POS tagger (joint / single)
 15 | # chinese.depparser - Chinese dependency parser
 16 | #
 17 | # === English ===
 18 | # english.postagger - English POS tagger
 19 | # english.depparser - English dependency parser
 20 | 
 21 | #================================================================
 22 | #
 23 | # Configurations
 24 | #
 25 | #================================================================
 26 | 
 27 | # the generic tagger
 28 | TAGGER_IMPL = collins
 29 | 
 30 | # the generic depparser
 31 | DEPPARSER_IMPL = arceager
 32 | 
 33 | # the generic conparser
 34 | CONPARSER_IMPL = srnew
 35 | 
 36 | # choose between agenda, agendachart etc ## NO SPACE AFTER NAME ###
 37 | #
 38 | # agenda: the single agenda method - reproduce paper
 39 | # agendaplus: try to improve upon the decoding algorithm of agenda
 40 | # viterbi: dynamic programming
 41 | SEGMENTOR_IMPL = agenda
 42 | 
 43 | # Chinese postagger implementations
 44 | #
 45 | # joint taggers include the following implementations
 46 | # agendachart: combining agenda and chart, this is the best system - reproduce paper
 47 | #
 48 | # taggers on segmented sentences include the following implementations
 49 | # segmented: the unidirectional trigram tagger
 50 | CHINESE_TAGGER_IMPL = agenda
 51 | 
 52 | # Chinese dependency parser
 53 | #
 54 | # currently support eisner, covington, nivre, combined and joint implementations
 55 | CHINESE_DEPPARSER_IMPL = arceager
 56 | CHINESE_DEPPARSER_LABELED = true
 57 | CHINESE_DEPLABELER_IMPL = naive
 58 | 
 59 | # currently support sr implementations
 60 | CHINESE_CONPARSER_IMPL = acl13
 61 | CHINESE_CONPARSER_JOINT_OR_CASCADE = JOINT_CONPARSER
 62 | 
 63 | # currently support only agenda
 64 | ENGLISH_TAGGER_IMPL = collins
 65 | 
 66 | # currently support eisner, covington, nivre, combined implementations
 67 | ENGLISH_DEPPARSER_IMPL = arceager
 68 | ENGLISH_DEPPARSER_LABELED = true
 69 | ENGLISH_DEPLABELER_IMPL = naive
 70 | 
 71 | # currently support sr implementations
 72 | ENGLISH_CONPARSER_IMPL = muhua
 73 | 
 74 | # Spanish pos tagger
 75 | SPANISH_TAGGER_IMPL = collins
 76 | 
 77 | # Spanish dependency parser
 78 | SPANISH_DEPPARSER_IMPL = arceager
 79 | SPANISH_DEPPARSER_LABELED = true
 80 | SPANISH_DEPLABELER_IMPL = naive
 81 | 
 82 | # Spanish annotation. Supported: ES06_DEPENDENCIES, ES09_DEPENDENCIES
 83 | SPANISH_ANNOTATION = ES09_DEPENDENCIES
 84 | 
 85 | #no Spanish constituency parser at the moment
 86 | 
 87 | #================================================================
 88 | #
 89 | # Debug mode or the run mode (empty)
 90 | #
 91 | #================================================================
 92 | 
 93 | #DEBUG = -DDEBUG -g
 94 | DEBUG = -DNDEBUG
 95 | 
 96 | #================================================================
 97 | #
 98 | # directory configurations
 99 | #
100 | #================================================================
101 | 
102 | BASE_DIR = .
103 | include Makefile.common
104 | 
105 | #================================================================
106 | #
107 | # cross platform configurations
108 | #
109 | #================================================================
110 | 
111 | ifeq ($(OS),Windows_NT)
112 | #use good old GNU mkdir instead of MSDOS mkdir on Windows
113 | MKDIR=gmkdir -p
114 | else
115 | MKDIR=mkdir -p
116 | endif
117 | 
118 | #================================================================
119 | #
120 | # compiler commands
121 | #
122 | #================================================================
123 | 
124 | INCLUDES = -I$(SRC_INCLUDES)
125 | 
126 | CXX = g++
127 | CXXFLAGS = -w -W -O3 $(INCLUDES) $(DEBUG) -fPIC
128 | 
129 | LD=$(CXX)
130 | LDFLAGS =
131 | 
132 | #================================================================
133 | #
134 | # Shared objects
135 | #
136 | #================================================================
137 | 
138 | # the objects
139 | LINGUISTICS_OBJECTS = $(OBJECT_DIR)/linguistics/lemma.o $(OBJECT_DIR)/linguistics/conll.o
140 | LEARNING_OBJECTS = $(OBJECT_DIR)/learning/dbn.o
141 | OBJECTS = $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(LINGUISTICS_OBJECTS) $(LEARNING_OBJECTS)
142 | 
143 | $(OBJECT_DIR)/%.o: $(SRC_LIBS)/%.cpp $(SRC_INCLUDES)/%.h
144 | 	$(MKDIR) $(OBJECT_DIR)
145 | 	$(MKDIR) $(OBJECT_DIR)/linguistics
146 | 	$(MKDIR) $(OBJECT_DIR)/learning
147 | 	$(CXX) $(CXXFLAGS) -c $< -o $@
148 | 
149 | all: zpar
150 | 
151 | # the directories
152 | $(OBJECT_DIR):
153 | 	$(MKDIR) $(OBJECT_DIR)
154 | $(DIST_DIR):
155 | 	$(MKDIR) $(DIST_DIR)
156 | 
157 | #  tagger
158 | SRC_TAGGER = $(SRC_CHINESE)/tagger
159 | DIST_TAGGER = $(DIST_DIR)/chinese.postagger
160 | OBJECT_TAGGER = $(OBJECT_DIR)/chinese.postagger
161 | $(DIST_TAGGER):
162 | 	$(MKDIR) $(DIST_TAGGER)
163 | $(OBJECT_TAGGER):
164 | 	$(MKDIR) $(OBJECT_TAGGER)
165 | 
166 | SRC_ENGLISH_TAGGER = $(SRC_COMMON)/tagger
167 | DIST_ENGLISH_TAGGER = $(DIST_DIR)/english.postagger
168 | OBJECT_ENGLISH_TAGGER = $(OBJECT_DIR)/english.postagger
169 | $(DIST_ENGLISH_TAGGER):
170 | 	$(MKDIR) $(DIST_ENGLISH_TAGGER)
171 | $(OBJECT_ENGLISH_TAGGER):
172 | 	$(MKDIR) $(OBJECT_ENGLISH_TAGGER)
173 | 
174 | SRC_SPANISH_TAGGER = $(SRC_COMMON)/tagger
175 | DIST_SPANISH_TAGGER = $(DIST_DIR)/spanish.postagger
176 | OBJECT_SPANISH_TAGGER = $(OBJECT_DIR)/spanish.postagger
177 | $(DIST_SPANISH_TAGGER):
178 | 	$(MKDIR) $(DIST_SPANISH_TAGGER)
179 | $(OBJECT_SPANISH_TAGGER):
180 | 	$(MKDIR) $(OBJECT_SPANISH_TAGGER)
181 | 
182 | #  depparser
183 | SRC_COMMON_DEPPARSER = $(SRC_COMMON)/depparser
184 | #ifeq ($(CHINESE_DEPPARSER_IMPL), joint)
185 | #	SRC_CHINESE_DEPPARSER = $(SRC_CHINESE)/depparser
186 | #else
187 | #	SRC_CHINESE_DEPPARSER = $(SRC_COMMON_DEPPARSER)
188 | #endif
189 | SRC_CHINESE_DEPPARSER = $(SRC_COMMON_DEPPARSER)
190 | DIST_DEPPARSER = $(DIST_DIR)/chinese.depparser
191 | OBJECT_DEPPARSER = $(OBJECT_DIR)/chinese.depparser
192 | DIST_ENGLISH_DEPPARSER = $(DIST_DIR)/english.depparser
193 | OBJECT_ENGLISH_DEPPARSER = $(OBJECT_DIR)/english.depparser
194 | DIST_SPANISH_DEPPARSER = $(DIST_DIR)/spanish.depparser
195 | OBJECT_SPANISH_DEPPARSER = $(OBJECT_DIR)/spanish.depparser
196 | 
197 | # deplabeler
198 | SRC_COMMON_DEPLABELER = $(SRC_COMMON)/deplabeler
199 | SRC_CHINESE_DEPLABELER = $(SRC_COMMON_DEPLABELER)
200 | DIST_DEPLABELER = $(DIST_DIR)/chinese.deplabeler
201 | OBJECT_DEPLABELER = $(OBJECT_DIR)/chinese.deplabeler
202 | SRC_ENGLISH_DEPLABELER = $(SRC_COMMON_DEPLABELER)
203 | DIST_ENGLISH_DEPLABELER = $(DIST_DIR)/english.deplabeler
204 | OBJECT_ENGLISH_DEPLABELER = $(OBJECT_DIR)/english.deplabeler
205 | SRC_SPANISH_DEPLABELER = $(SRC_COMMON_DEPLABELER)
206 | DIST_SPANISH_DEPLABELER = $(DIST_DIR)/spanish.deplabeler
207 | OBJECT_SPANISH_DEPLABELER = $(OBJECT_DIR)/spanish.deplabeler
208 | 
209 | #  conparser
210 | SRC_COMMON_CONPARSER = $(SRC_COMMON)/conparser
211 | SRC_CHINESE_CONPARSER = $(SRC_COMMON_CONPARSER)
212 | ifeq ($(CHINESE_CONPARSER_IMPL), jcad)
213 | 	SRC_CHINESE_CONPARSER = $(SRC_CHINESE)/conparser
214 | else
215 | 	ifeq ($(CHINESE_CONPARSER_IMPL), acl13)
216 | 		SRC_CHINESE_CONPARSER = $(SRC_CHINESE)/conparser
217 | 	else
218 | 		SRC_CHINESE_CONPARSER = $(SRC_COMMON_CONPARSER)
219 | 	endif
220 | endif
221 | SRC_ENGLISH_CONPARSER = $(SRC_COMMON_CONPARSER)
222 | DIST_CONPARSER = $(DIST_DIR)/chinese.conparser
223 | OBJECT_CONPARSER = $(OBJECT_DIR)/chinese.conparser
224 | DIST_ENGLISH_CONPARSER = $(DIST_DIR)/english.conparser
225 | OBJECT_ENGLISH_CONPARSER = $(OBJECT_DIR)/english.conparser
226 | 
227 | #----------------------------------------------------------------
228 | #
229 | # zpar general
230 | #
231 | #----------------------------------------------------------------
232 | 
233 | 
234 | ifeq ($(CHINESE_CONPARSER_IMPL), jcad)
235 | 	OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o $(OBJECT_CONPARSER)/jointconstituent.o
236 | else
237 | 	ifeq ($(CHINESE_CONPARSER_IMPL), acl13)
238 | 		OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o $(OBJECT_CONPARSER)/jointconstituent.o
239 | 	else
240 | 		OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o
241 | 	endif
242 | endif
243 | 
244 | $(DIST_CONPARSER):
245 | 	$(MKDIR) $(DIST_CONPARSER)
246 | $(OBJECT_CONPARSER):
247 | 	$(MKDIR) $(OBJECT_CONPARSER)
248 | 
249 | $(DIST_DEPLABELER):
250 | 	$(MKDIR) $(DIST_DEPLABELER)
251 | $(OBJECT_DEPLABELER):
252 | 	$(MKDIR) $(OBJECT_DEPLABELER)
253 | 
254 | # the flags for train
255 | ifeq ($(CHINESE_TAGGER_IMPL), segmented) # if segmented
256 | 	TAGGER_TRAIN_FLAGS = -DSEGMENTED
257 | 	TAGGER_TEST_FLAGS = -DSEGMENTED
258 | else
259 | 	ifeq ($(CHINESE_TAGGER_IMPL), bidirectional) # else if bidirectional
260 | 		TAGGER_TRAIN_FLAGS = -DSEGMENTED -DAUTO
261 | 		TAGGER_TEST_FLAGS = -DSEGMENTED
262 | 	endif
263 | endif
264 | 
265 | 
266 | ifeq ($(CHINESE_DEPPARSER_LABELED), true)
267 | 	CHINESE_DEPPARSER_D = -DLABELED
268 | endif
269 | 
270 | ifeq ($(ENGLISH_DEPPARSER_LABELED), true)
271 | 	ENGLISH_DEPPARSER_D = -DLABELED
272 | endif
273 | 
274 | ifeq ($(CHINESE_DEPPARSER_IMPL), combined)
275 | 	CHINESE_DEPPARSER_D := $(CHINESE_DEPPARSER_D) -DCOMBINED
276 | 	CHINESE_DEPPARSER_IMPL = nivre
277 | endif
278 | 
279 | ifeq ($(ENGLISH_DEPPARSER_IMPL), combined)
280 | 	ENGLISH_DEPPARSER_D := $(ENGLISH_DEPPARSER_D) -DCOMBINED
281 | 	ENGLISH_DEPPARSER_IMPL = nivre
282 | endif
283 | 
284 | #====================================================
285 | 
286 | $(DIST_DEPPARSER):
287 | 	$(MKDIR) $(DIST_DEPPARSER)
288 | $(OBJECT_DEPPARSER):
289 | 	$(MKDIR) $(OBJECT_DEPPARSER)
290 | 
291 | SRC_SEGMENTOR = $(SRC_CHINESE)/segmentor
292 | DIST_SEGMENTOR = $(DIST_DIR)/segmentor
293 | OBJECT_SEGMENTOR = $(OBJECT_DIR)/segmentor
294 | $(DIST_SEGMENTOR):
295 | 	$(MKDIR) $(DIST_SEGMENTOR)
296 | $(OBJECT_SEGMENTOR):
297 | 	$(MKDIR) $(OBJECT_SEGMENTOR)
298 | 
299 | include Makefile.zpar.zh
300 | include Makefile.zpar.en
301 | include Makefile.zpar.ge
302 | include Makefile.zpar.es
303 | include Makefile.zpar.mvt
304 | include Makefile.lib.zpar
305 | 
306 | zpar: zpar.ge
307 | 
308 | #----------------------------------------------------------------
309 | #
310 | # The sentence boundary detector
311 | #
312 | #----------------------------------------------------------------
313 | 
314 | include Makefile.doc2snt
315 | 
316 | #----------------------------------------------------------------
317 | #
318 | # The ccgparser
319 | #
320 | #----------------------------------------------------------------
321 | 
322 | include Makefile.ccg
323 | 
324 | #----------------------------------------------------------------
325 | #
326 | # Miscelaneous
327 | #
328 | #----------------------------------------------------------------
329 | 
330 | include Makefile.misc
331 | #include Makefile.rr
332 | 
333 | 


--------------------------------------------------------------------------------
/src/Makefile.lib.zpar:
--------------------------------------------------------------------------------
 1 | ifeq ($(ENGLISH_DEPPARSER_LABELED), true)
 2 | 	ENGLISH_DEPPARSER_D = -DLABELED
 3 | endif
 4 | 
 5 | ifeq ($(ENGLISH_DEPPARSER_IMPL), combined)
 6 | 	ENGLISH_DEPPARSER_D := $(ENGLISH_DEPPARSER_D) -DCOMBINED
 7 | 	ENGLISH_DEPPARSER_IMPL = nivre
 8 | endif
 9 | 
10 | zpar.so: $(OBJECT_DIR) $(DIST_DIR) $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS)
11 | 	$(CXX) $(CXXFLAGS) -DTARGET_LANGUAGE=english $(ENGLISH_DEPPARSER_D) -I$(SRC_ENGLISH) -I$(SRC_ENGLISH_TAGGER) -I$(SRC_ENGLISH_TAGGER)/implementations/$(ENGLISH_TAGGER_IMPL) -I$(SRC_ENGLISH_CONPARSER) -I$(SRC_ENGLISH_CONPARSER)/implementations/$(ENGLISH_CONPARSER_IMPL) -I$(SRC_COMMON_DEPPARSER) -I$(SRC_COMMON_DEPPARSER)/implementations/$(ENGLISH_DEPPARSER_IMPL) -I$(SRC_COMMON_DEPLABELER) -I$(SRC_COMMON_DEPLABELER)/implementations/$(ENGLISH_DEPLABELER_IMPL) -c $(SRC_ENGLISH)/zpar.lib.cpp -o $(OBJECT_DIR)/zpar.lib.o
12 | 	$(CXX) -shared $(OBJECT_DIR)/zpar.lib.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS) -o $(DIST_DIR)/zpar.so
13 | 	@echo zpar.so compiled successfully into $(DIST_DIR).
14 | 
15 | zpar.exe: $(OBJECT_DIR) $(DIST_DIR) $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS)
16 | 	$(CXX) $(CXXFLAGS) -DTARGET_LANGUAGE=english $(ENGLISH_DEPPARSER_D) -I$(SRC_ENGLISH) -I$(SRC_ENGLISH_TAGGER) -I$(SRC_ENGLISH_TAGGER)/implementations/$(ENGLISH_TAGGER_IMPL) -I$(SRC_ENGLISH_CONPARSER) -I$(SRC_ENGLISH_CONPARSER)/implementations/$(ENGLISH_CONPARSER_IMPL) -I$(SRC_COMMON_DEPPARSER) -I$(SRC_COMMON_DEPPARSER)/implementations/$(ENGLISH_DEPPARSER_IMPL) -I$(SRC_COMMON_DEPLABELER) -I$(SRC_COMMON_DEPLABELER)/implementations/$(ENGLISH_DEPLABELER_IMPL) -c $(SRC_ENGLISH)/zpar.lib.cpp -o $(OBJECT_DIR)/zpar.lib.o
17 | 	$(LD) $(LDFLAGS) -fPIE -pie -o $(DIST_DIR)/zpar.exe $(OBJECT_DIR)/zpar.lib.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS)
18 | 	@echo zpar.exe system compiled successfully into $(DIST_DIR).
19 | 
20 | 


--------------------------------------------------------------------------------
/src/reader.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) University of Oxford 2010
 2 | /****************************************************************
 3 |  *                                                              *
 4 |  * reader.h - the sentence reader classes                       *
 5 |  *                                                              *
 6 |  * this file is specifically designed for sentence_string       *
 7 |  *                                                              *
 8 |  * Author: Yue Zhang                                            *
 9 |  *                                                              *
10 |  * Computing Laboratory, Oxford. 2006.10                        *
11 |  *                                                              *
12 |  ****************************************************************/
13 | 
14 | #ifndef READER_H
15 | #define READER_H
16 | 
17 | #include "definitions.h"
18 | #include "file_utils.h"
19 | #include "linguistics/sentence_string.h"
20 | 
21 | #include <sstream>
22 | 
23 | /*===============================================================
24 |  *
25 |  * CSentenceReader - read sentence
26 |  *
27 |  * Specify a file name in the constructor. If no file name is specified,
28 |  * the reader will read from the standard input.
29 |  *
30 |  * readRawSentence:
31 |  *  - The input file should contain tokenised sentences each in a line,
32 |  *    with space separated words and punctuations.
33 |  *    In the Chinese case, each character should be separated by space.
34 |  *
35 |  *==============================================================*/
36 | 
37 | class CSentenceReader {
38 |    protected:
39 |         std::istream *m_iStream;
40 |         bool m_fileMode;
41 |         int m_nLine;
42 |    public:
43 |         // constructor and destructor method
44 |         CSentenceReader(const std::string &sFileName="", bool fileMode=true) {
45 |             m_fileMode = fileMode;
46 |             if (m_fileMode) {
47 |                 if (sFileName.empty())
48 | 	               m_iStream = &std::cin;
49 |                 else {
50 |                     if (!FileExists(sFileName)) THROW("File " << sFileName << " not found.");
51 |                     m_iStream=new std::ifstream(sFileName.c_str());
52 |                 }
53 |                 m_nLine = 0;
54 |             }
55 |             else {
56 |                 m_iStream = new std::istringstream(sFileName);
57 |                 m_nLine = 0;
58 |             }
59 |         };
60 |         virtual ~CSentenceReader() {
61 |             if (m_fileMode) {
62 |                 if (m_iStream != &std::cin) {
63 |                     ((std::ifstream*)m_iStream)->close();
64 |                     delete m_iStream;
65 |                 }
66 |             }
67 |             else {
68 |                 delete m_iStream;
69 |             }
70 |         };
71 |       bool readRawCharacter(std::string *retval);
72 |       bool readRawSentence(CStringVector *retval, bool bSkipEmptyLines=false, bool bIgnoreSpace=false);
73 |       bool readSegmentedSentence(CStringVector *retval, bool bSkipEmptyLines=false);
74 |       bool readTaggedSentence(CTwoStringVector *retval, bool bSkipEmptyLines=false, const char separator='_');
75 |       bool readSegmentedSentenceAndTokenize(CStringVector *vReturn, bool bSkipEmptyLines=false);
76 | };
77 | 
78 | #endif
79 | 


--------------------------------------------------------------------------------
/src/zpar.lib.cpp:
--------------------------------------------------------------------------------
  1 | /****************************************************************
  2 |  *                                                              *
  3 |  * zpar.lib.cpp - a library that can be used by python          *
  4 |  *                                                              *
  5 |  * Author: Nitin Madnani                                        *
  6 |  * Educational Testing Service, Princeton, NJ                   *
  7 |  *                                                              *
  8 |  ****************************************************************/
  9 | 
 10 | #define SIMPLE_HASH
 11 | 
 12 | #include "definitions.h"
 13 | #include "options.h"
 14 | #include "tagger.h"
 15 | #include "conparser.h"
 16 | #include "depparser.h"
 17 | #include "reader.h"
 18 | #include "writer.h"
 19 | #include "stdlib.h"
 20 | #include <cstring>
 21 | #include <iterator>
 22 | #include <sstream>
 23 | 
 24 | using namespace english;
 25 | 
 26 | #define MAX_SENTENCE_SIZE 512
 27 | 
 28 | 
 29 | // define a container structure with a container and a destructor
 30 | struct zparSession_t
 31 | {
 32 |     CTagger* tagger;
 33 |     CConParser* conparser;
 34 |     CDepParser* depparser;
 35 |     char *output_buffer;
 36 | 
 37 |     zparSession_t() {
 38 |         tagger = NULL;
 39 |         conparser = NULL;
 40 |         depparser = NULL;
 41 |         output_buffer = NULL;
 42 |     };
 43 | 
 44 |     ~zparSession_t() {
 45 |         if (tagger) {
 46 |             delete tagger;
 47 |             tagger= NULL;
 48 |         }
 49 |         if (conparser) {
 50 |             delete conparser;
 51 |             conparser = NULL;
 52 |         }
 53 |         if (depparser) {
 54 |             delete depparser;
 55 |             depparser = NULL;
 56 |         }
 57 |         if (output_buffer) {
 58 |             delete output_buffer;
 59 |             output_buffer = NULL;
 60 |         }
 61 |     };
 62 | };
 63 | 
 64 | // instantiate the container
 65 | // zparSession_t *zps = new zparSession_t();
 66 | 
 67 | extern "C" void* initialize() {
 68 |    zparSession_t* zps = new zparSession_t;
 69 |    return (void *)zps;
 70 | }
 71 | 
 72 | // a utility function to output tagged data in the usual
 73 | // "WORD/TAG" format as expected
 74 | std::string format_tagged_vector(CTwoStringVector *tagged_sent)
 75 | {
 76 | 
 77 |     CTwoStringVector::const_iterator it;
 78 |     CStringVector formatted_tagged_sent[1];
 79 |     for (it = tagged_sent->begin(); it != tagged_sent->end(); ++it)
 80 |     {
 81 |         std::stringstream tmpss;
 82 |         tmpss << it->first << "/" << it->second;
 83 |         std::string tmpstr(tmpss.str());
 84 |         formatted_tagged_sent->push_back(tmpstr);
 85 |     }
 86 | 
 87 |     int i;
 88 |     std::stringstream oss;
 89 |     for (i = 0; i < formatted_tagged_sent->size(); ++i)
 90 |     {
 91 |         oss << formatted_tagged_sent->at(i);
 92 |         if (i != formatted_tagged_sent->size() - 1)
 93 |         {
 94 |             oss << " ";
 95 |         }
 96 |     }
 97 | 
 98 |     std::string outstr(oss.str());
 99 |     return outstr;
100 | 
101 | }
102 | 
103 | // A utility function to format the dependncy output
104 | // in CoNLL format
105 | std::string format_dependency_tree(CDependencyParse *parsed_sent)
106 | {
107 | 
108 |     int i;
109 |     std::stringstream oss;
110 |     std::copy(parsed_sent->begin(), parsed_sent->end(), std::ostream_iterator<CLabeledDependencyTreeNode>(oss, "\n"));
111 | 
112 |     std::string outstr(oss.str());
113 |     return outstr;
114 | 
115 | }
116 | 
117 | // The function to load the tagger model
118 | extern "C" int load_tagger(void* vzps, const char* sFeaturePath) {
119 | 
120 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
121 | 
122 |     std::string sTaggerFeatureFile = std::string(sFeaturePath) + "/tagger";
123 |     std::cerr << "Loading tagger from " << sTaggerFeatureFile << std::endl;
124 |     if (!FileExists(sTaggerFeatureFile)) {
125 |         return 1;
126 |     }
127 | 
128 |     CTagger* tagger = new CTagger(sTaggerFeatureFile, false);
129 |     zps->tagger = tagger;
130 |     return 0;
131 | }
132 | 
133 | // The function to load the constituency parser model
134 | extern "C" int load_parser(void* vzps, const char *sFeaturePath) {
135 | 
136 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
137 | 
138 |     // If the tagger is not already loaded, then we need to load
139 |     // it since the parser requires the tagger
140 |     if (!zps->tagger) {
141 |         if (load_tagger(zps, sFeaturePath)) {
142 |             return 1;
143 |         }
144 |     }
145 | 
146 |     CConParser *conparser;
147 |     std::string sConParserFeatureFile = std::string(sFeaturePath) + "/conparser";
148 |     std::cerr << "Loading constituency parser from " << sConParserFeatureFile << std::endl;
149 |     if (!FileExists(sConParserFeatureFile)) {
150 |         return 1;
151 |     }
152 |     conparser = new CConParser(sConParserFeatureFile, false);
153 |     zps->conparser = conparser;
154 |     return 0;
155 | }
156 | 
157 | 
158 | 
159 | // The function to load the dependency parser model
160 | extern "C" int load_depparser(void* vzps, const char *sFeaturePath) {
161 | 
162 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
163 | 
164 |     // If the tagger is not already loaded, then we need to load
165 |     // it since the parser requires the tagger
166 |     if (!zps->tagger) {
167 |         if (load_tagger(zps, sFeaturePath)) {
168 |             return 1;
169 |         }
170 |     }
171 | 
172 |     CDepParser *depparser;
173 |     std::string sDepParserFeatureFile = std::string(sFeaturePath) + "/depparser";
174 |     std::cerr << "Loading dependency parser from " << sDepParserFeatureFile << std::endl;
175 |     if (!FileExists(sDepParserFeatureFile)) {
176 |         return 1;
177 |     }
178 |     depparser = new CDepParser(sDepParserFeatureFile, false);
179 |     zps->depparser = depparser;
180 |     return 0;
181 | }
182 | 
183 | // The function to load all three models
184 | extern "C" int load_models(void* vzps, const char *sFeaturePath) {
185 | 
186 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
187 | 
188 |     if (load_tagger(zps, sFeaturePath)) {
189 |         return 1;
190 |     }
191 |     if (load_parser(zps, sFeaturePath)) {
192 |         return 1;
193 |     }
194 |     if (load_depparser(zps, sFeaturePath)) {
195 |         return 1;
196 |     }
197 |     return 0;
198 | }
199 | 
200 | // Function to tag a sentence
201 | extern "C" char* tag_sentence(void* vzps, const char *input_sentence, bool tokenize)
202 | {
203 | 
204 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
205 | 
206 |     try {
207 |         // create a temporary string stream from the input char *
208 |         CSentenceReader input_reader(std::string(input_sentence), false);
209 | 
210 |         // tokenize the sentence
211 |         CStringVector input_sent[1];
212 |         if (tokenize) {
213 |             input_reader.readSegmentedSentenceAndTokenize(input_sent);
214 |         }
215 |         else {
216 |             input_reader.readSegmentedSentence(input_sent);
217 |         }
218 | 
219 |         // initialize the variable that will hold the tagged sentence
220 |         CTwoStringVector tagged_sent[1];
221 | 
222 |         // get the tagger that was stored earlier
223 |         CTagger *tagger = zps->tagger;
224 | 
225 |         // tag the sentence
226 |         tagger->tag(input_sent, tagged_sent);
227 | 
228 |         // format the tagged sentence properly and return
229 |         std::string tagvec = format_tagged_vector(tagged_sent);
230 |         int tagveclen = tagvec.length();
231 | 
232 |         if (zps->output_buffer != NULL) {
233 |             delete zps->output_buffer;
234 |             zps->output_buffer = NULL;
235 |         }
236 |         zps->output_buffer = new char[tagveclen + 1];
237 |         strcpy(zps->output_buffer, tagvec.c_str());
238 |     } catch (const std::string &e) {
239 |         std::cerr << e << std::endl;
240 |         zps->output_buffer = new char[1];
241 |         strcpy(zps->output_buffer, "");
242 |     }
243 |     return zps->output_buffer;
244 | }
245 | 
246 | // Function to constituency parse a sentence
247 | extern "C" char* parse_sentence(void* vzps, const char *input_sentence, bool tokenize)
248 | {
249 | 
250 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
251 | 
252 |     try {
253 | 
254 |         // create a temporary string stream from the input char *
255 |         CSentenceReader input_reader(std::string(input_sentence), false);
256 | 
257 |         // tokenize the sentence
258 |         CStringVector tokenized_sent[1];
259 |         if (tokenize) {
260 |             input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
261 |         }
262 |         else {
263 |             input_reader.readSegmentedSentence(tokenized_sent);
264 |         }
265 | 
266 |         if (zps->output_buffer != NULL) {
267 |             delete zps->output_buffer;
268 |             zps->output_buffer = NULL;
269 |         }
270 | 
271 |         if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){
272 |             // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
273 |             std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl;
274 |             zps->output_buffer = new char[1];
275 |             strcpy(zps->output_buffer, "");
276 |         } else {
277 |             // initialize the variables that will hold the tagged and parsed sentences
278 |             CTwoStringVector tagged_sent[1];
279 |             english::CCFGTree parsed_sent[1];
280 | 
281 |             // get the tagger and parser that were stored earlier
282 |             CTagger *tagger = zps->tagger;
283 |             CConParser *conparser = zps->conparser;
284 | 
285 |             // tag and parse the sentence
286 |             tagger->tag(tokenized_sent, tagged_sent);
287 |             conparser->parse(*tagged_sent, parsed_sent);
288 | 
289 |             // now put the parsed sentence into a string stream
290 |             std::string parse = parsed_sent->str_unbinarized();
291 |             int parselen = parse.length();
292 |             zps->output_buffer = new char[parselen + 1];
293 |             strcpy(zps->output_buffer, parse.c_str());
294 |         }
295 |     } catch (const std::string &e) {
296 |         std::cerr << e << std::endl;
297 |         zps->output_buffer = new char[1];
298 |         strcpy(zps->output_buffer, "");
299 |     }
300 | 
301 |     return zps->output_buffer;
302 | }
303 | 
304 | extern "C" char* parse_tagged_sentence(void* vzps, const char *input_tagged_sentence, const char seperator='/')
305 | {
306 | 
307 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
308 | 
309 |     try {
310 |         // create a temporary string stream from the input char *
311 |         CSentenceReader input_reader(std::string(input_tagged_sentence), false);
312 | 
313 |         // read the tagged sentence into a CTwoStringVector
314 |         CTwoStringVector tagged_sent[1];
315 |         input_reader.readTaggedSentence(tagged_sent, false, seperator);
316 | 
317 |         if (zps->output_buffer != NULL) {
318 |             delete zps->output_buffer;
319 |             zps->output_buffer = NULL;
320 |         }
321 | 
322 |         if(tagged_sent->size() >= MAX_SENTENCE_SIZE){
323 |             // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
324 |             std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_tagged_sentence << std::endl;
325 |             zps->output_buffer = new char[1];
326 |             strcpy(zps->output_buffer, "");
327 |         } else {
328 |             // initialize the variable that will hold the parsed sentence
329 |             english::CCFGTree parsed_sent[1];
330 | 
331 |             // get the parser that was stored earlier
332 |             CConParser *conparser = zps->conparser;
333 | 
334 |             // parse the tagged sentence
335 |             conparser->parse(*tagged_sent, parsed_sent);
336 | 
337 |             // now put the parsed sentence into a string stream
338 |             std::string parse = parsed_sent->str_unbinarized();
339 |             int parselen = parse.length();
340 |             zps->output_buffer = new char[parselen + 1];
341 |             strcpy(zps->output_buffer, parse.c_str());
342 |         }
343 | 
344 |     } catch (const std::string &e) {
345 |         std::cerr << e << std::endl;
346 |         zps->output_buffer = new char[1];
347 |         strcpy(zps->output_buffer, "");
348 |     }
349 |     return zps->output_buffer;
350 | }
351 | 
352 | // Function to dependency parse a sentence
353 | extern "C" char* dep_parse_sentence(void* vzps, const char *input_sentence, bool tokenize)
354 | {
355 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
356 | 
357 |     try {
358 | 
359 |         // create a temporary string stream from the input char *
360 |         CSentenceReader input_reader(std::string(input_sentence), false);
361 | 
362 |         // tokenize the sentence
363 |         CStringVector tokenized_sent[1];
364 |         if (tokenize) {
365 |             input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
366 |         }
367 |         else {
368 |             input_reader.readSegmentedSentence(tokenized_sent);
369 |         }
370 | 
371 |         if (zps->output_buffer != NULL) {
372 |             delete zps->output_buffer;
373 |             zps->output_buffer = NULL;
374 |         }
375 | 
376 |         if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){
377 |             // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
378 |             std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl;
379 |             zps->output_buffer = new char[1];
380 |             strcpy(zps->output_buffer, "");
381 |         } else {
382 | 
383 |             // initialize the variable that will hold the tagged and parsed sentences
384 |             CTwoStringVector tagged_sent[1];
385 |             CDependencyParse parsed_sent[1];
386 | 
387 |             // get the tagger and parser that were stored earlier
388 |             CTagger *tagger = zps->tagger;
389 |             CDepParser *depparser = zps->depparser;
390 | 
391 |             // tag and parse the sentence
392 |             tagger->tag(tokenized_sent, tagged_sent);
393 |             depparser->parse(*tagged_sent, parsed_sent);
394 | 
395 |             // now output the formatted dependency tree
396 |             std::string deptree = format_dependency_tree(parsed_sent);
397 |             int deptreelen = deptree.length();
398 |             zps->output_buffer = new char[deptreelen + 1];
399 |             strcpy(zps->output_buffer, deptree.c_str());
400 |         }
401 | 
402 |     } catch (const std::string &e) {
403 |         std::cerr << e << std::endl;
404 |         zps->output_buffer = new char[1];
405 |         strcpy(zps->output_buffer, "");
406 |     }
407 | 
408 |     return zps->output_buffer;
409 | }
410 | 
411 | // Function to dependency parse a sentence
412 | extern "C" char* dep_parse_tagged_sentence(void* vzps, const char *input_tagged_sentence, const char seperator='/')
413 | {
414 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
415 | 
416 |     try {
417 |         // create a temporary string stream from the input char *
418 |         CSentenceReader input_reader(std::string(input_tagged_sentence), false);
419 | 
420 |         // read the tagged sentence into a CTwoStringVector
421 |         CTwoStringVector tagged_sent[1];
422 |         input_reader.readTaggedSentence(tagged_sent, false, seperator);
423 | 
424 |         if (zps->output_buffer != NULL) {
425 |             delete zps->output_buffer;
426 |             zps->output_buffer = NULL;
427 |         }
428 | 
429 |         if(tagged_sent->size() >= MAX_SENTENCE_SIZE){
430 |             // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
431 |             std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_tagged_sentence << std::endl;
432 |             zps->output_buffer = new char[1];
433 |             strcpy(zps->output_buffer, "");
434 |         } else {
435 | 
436 |             // initialize the variable that will hold the parsed sentence
437 |             CDependencyParse parsed_sent[1];
438 | 
439 |             // get the parser that was stored earlier
440 |             CDepParser *depparser = zps->depparser;
441 | 
442 |             // parse the sentence
443 |             depparser->parse(*tagged_sent, parsed_sent);
444 | 
445 |             // now output the formatted dependency tree
446 |             std::string deptree = format_dependency_tree(parsed_sent);
447 |             int deptreelen = deptree.length();
448 |             zps->output_buffer = new char[deptreelen + 1];
449 |             strcpy(zps->output_buffer, deptree.c_str());
450 |         }
451 | 
452 |     } catch (const std::string &e) {
453 |         std::cerr << e << std::endl;
454 |         zps->output_buffer = new char[1];
455 |         strcpy(zps->output_buffer, "");
456 |     }
457 | 
458 |     return zps->output_buffer;
459 | }
460 | 
461 | 
462 | // Function to tag all sentence in the given input file
463 | // and write tagged sentences to the given output file
464 | extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize)
465 | {
466 | 
467 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
468 | 
469 |     std::cerr << "Processing file " <<  sInputFile << std::endl;
470 | 
471 |     // initialize the input reader
472 |     CSentenceReader input_reader(sInputFile);
473 | 
474 |     // initialize the temporary sentence variables
475 |     CStringVector tokenized_sent[1];
476 |     CTwoStringVector tagged_sent[1];
477 | 
478 |     // get the tagger and the parser that were stored earlier
479 |     CTagger *tagger = zps->tagger;
480 | 
481 |     // initialize the output file writer
482 |     std::string outputFileName = std::string(sOutputFile);
483 |     CSentenceWriter output_writer(outputFileName);
484 | 
485 |     // read in and tokenize the given input file if asked
486 |     bool readSomething;
487 |     if (tokenize) {
488 |         readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
489 |     }
490 |     else {
491 |         readSomething = input_reader.readSegmentedSentence(tokenized_sent);
492 |     }
493 |     while ( readSomething )
494 |     {
495 |         if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
496 |         {
497 |             tokenized_sent->pop_back();
498 |         }
499 | 
500 |         // tag the sentence
501 |         tagger->tag(tokenized_sent, tagged_sent);
502 | 
503 |         // write the formatted sentence to the output file
504 |         output_writer.writeSentence(tagged_sent, '/', true);
505 | 
506 |         if (tokenize) {
507 |             readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
508 |         }
509 |         else {
510 |             readSomething = input_reader.readSegmentedSentence(tokenized_sent);
511 |         }
512 |     }
513 | 
514 |     // close the output file
515 |     std::cerr << "Wrote output to " << sOutputFile << std::endl;
516 | }
517 | 
518 | // Function to constituency parse all sentence in the given input file
519 | // and write parsed sentences to the given output file
520 | extern "C" void parse_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize)
521 | {
522 | 
523 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
524 | 
525 |     std::cerr << "Processing file " <<  sInputFile << std::endl;
526 | 
527 |     // initialize the input reader
528 |     CSentenceReader input_reader(sInputFile);
529 | 
530 |     // open the output file
531 |     FILE *outfp = NULL;
532 |     outfp = fopen(sOutputFile, "w");
533 | 
534 |     // initialize the temporary sentence variables
535 |     CStringVector tokenized_sent[1];
536 |     CTwoStringVector tagged_sent[1];
537 |     english::CCFGTree parsed_sent[1];
538 | 
539 |     // get the tagger and the parser that were stored earlier
540 |     CTagger *tagger = zps->tagger;
541 |     CConParser *conparser = zps->conparser;
542 | 
543 |     // read in and tokenize the given input file if asked
544 |     bool readSomething;
545 |     if (tokenize) {
546 |         readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
547 |     }
548 |     else {
549 |         readSomething = input_reader.readSegmentedSentence(tokenized_sent);
550 |     }
551 | 
552 |     while ( readSomething )
553 |     {
554 |         if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
555 |         {
556 |             tokenized_sent->pop_back();
557 |         }
558 | 
559 |         std::string parse = "";
560 |         if(tokenized_sent->size() < MAX_SENTENCE_SIZE){
561 |             tagger->tag(tokenized_sent, tagged_sent);
562 |             conparser->parse(*tagged_sent, parsed_sent);
563 |             parse = parsed_sent->str_unbinarized();
564 |         } else {
565 |             std::cerr << "Sentence too long. Writing empty string. Sentence: " << tokenized_sent << std::endl;
566 |         }
567 | 
568 |         fprintf(outfp, "%s\n", parse.c_str());
569 | 
570 |         if (tokenize) {
571 |             readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
572 |         }
573 |         else {
574 |             readSomething = input_reader.readSegmentedSentence(tokenized_sent);
575 |         }
576 |     }
577 | 
578 |     // close the output file
579 |     std::cerr << "Wrote output to " << sOutputFile << std::endl;
580 |     fclose(outfp);
581 | }
582 | 
583 | extern "C" void parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char seperator='/')
584 | {
585 | 
586 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
587 | 
588 |     std::cerr << "Processing file " <<  sInputFile << std::endl;
589 | 
590 |     // initialize the input reader
591 |     CSentenceReader input_reader(sInputFile);
592 | 
593 |     // open the output file
594 |     FILE *outfp = NULL;
595 |     outfp = fopen(sOutputFile, "w");
596 | 
597 |     // initialize the temporary sentence variables
598 |     CTwoStringVector tagged_sent[1];
599 |     english::CCFGTree parsed_sent[1];
600 | 
601 |     // get the parser that was stored earlier
602 |     CConParser *conparser = zps->conparser;
603 | 
604 |     // read in and tokenize the given input file if asked
605 |     bool readSomething;
606 |     readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
607 | 
608 |     while ( readSomething )
609 |     {
610 |         std::string parse = "";
611 |         if(tagged_sent->size() < MAX_SENTENCE_SIZE){
612 |             conparser->parse(*tagged_sent, parsed_sent);
613 |             parse = parsed_sent->str_unbinarized();
614 |         } else {
615 |             std::cerr << "Sentence too long. Writing empty string. Sentence: " << tagged_sent << std::endl;
616 |         }
617 | 
618 |         fprintf(outfp, "%s\n", parse.c_str());
619 | 
620 |         readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
621 |     }
622 | 
623 |     // close the output file
624 |     std::cerr << "Wrote output to " << sOutputFile << std::endl;
625 |     fclose(outfp);
626 | }
627 | 
628 | // Function to dependency parse all sentence in the given input file
629 | // and write parsed sentences to the given output file
630 | extern "C" void dep_parse_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize)
631 | {
632 | 
633 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
634 | 
635 |     std::cerr << "Processing file " <<  sInputFile << std::endl;
636 | 
637 |     // initialize the input reader
638 |     CSentenceReader input_reader(sInputFile);
639 | 
640 |     // open the output file
641 |     FILE *outfp = NULL;
642 |     outfp = fopen(sOutputFile, "w");
643 | 
644 |     // initialize the temporary sentence variables
645 |     CStringVector tokenized_sent[1];
646 |     CTwoStringVector tagged_sent[1];
647 |     CDependencyParse parsed_sent[1];
648 | 
649 |     // get the tagger and the parser that were stored earlier
650 |     CTagger *tagger = zps->tagger;
651 |     CDepParser *depparser = zps->depparser;
652 | 
653 |     // read in and tokenize the given input file if asked
654 |     bool readSomething;
655 |     if (tokenize) {
656 |         readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
657 |     }
658 |     else {
659 |         readSomething = input_reader.readSegmentedSentence(tokenized_sent);
660 |     }
661 | 
662 |     while ( readSomething )
663 |     {
664 |         if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
665 |         {
666 |             tokenized_sent->pop_back();
667 |         }
668 | 
669 |         std::string deptree = "";
670 |         if(tokenized_sent->size() < MAX_SENTENCE_SIZE){
671 |             tagger->tag(tokenized_sent, tagged_sent);
672 |             depparser->parse(*tagged_sent, parsed_sent);
673 |             deptree = format_dependency_tree(parsed_sent);
674 |         } else {
675 |             std::cerr << "Sentence too long. Writing empty string. Input:" << tokenized_sent << std::endl;
676 |         }
677 | 
678 |         fprintf(outfp, "%s\n", deptree.c_str());
679 | 
680 |         if (tokenize) {
681 |             readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
682 |         }
683 |         else {
684 |             readSomething = input_reader.readSegmentedSentence(tokenized_sent);
685 |         }
686 |     }
687 | 
688 |     // close the output file
689 |     std::cerr << "Wrote output to " << sOutputFile << std::endl;
690 |     fclose(outfp);
691 | }
692 | 
693 | extern "C" void dep_parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char seperator='/')
694 | {
695 | 
696 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
697 | 
698 |     std::cerr << "Processing file " <<  sInputFile << std::endl;
699 | 
700 |     // initialize the input reader
701 |     CSentenceReader input_reader(sInputFile);
702 | 
703 |     // open the output file
704 |     FILE *outfp = NULL;
705 |     outfp = fopen(sOutputFile, "w");
706 | 
707 |     // initialize the temporary sentence variables
708 |     CTwoStringVector tagged_sent[1];
709 |     CDependencyParse parsed_sent[1];
710 | 
711 |     // get the parser that was stored earlier
712 |     CDepParser *depparser = zps->depparser;
713 | 
714 |     // read in and tokenize the given input file if asked
715 |     bool readSomething;
716 |     readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
717 | 
718 |     while ( readSomething )
719 |     {
720 |         std::string deptree = "";
721 |         if(tagged_sent->size() < MAX_SENTENCE_SIZE){
722 |             depparser->parse(*tagged_sent, parsed_sent);
723 |             deptree = format_dependency_tree(parsed_sent);
724 |         } else {
725 |             std::cerr << "Sentence too long. Writing empty string. Sentence: " << tagged_sent << std::endl;
726 |         }
727 | 
728 |         fprintf(outfp, "%s\n", deptree.c_str());
729 | 
730 |         readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator);
731 |     }
732 | 
733 |     // close the output file
734 |     std::cerr << "Wrote output to " << sOutputFile << std::endl;
735 |     fclose(outfp);
736 | }
737 | 
738 | // Function to unload all the models
739 | extern "C" void unload_models(void* vzps)
740 | {
741 | 
742 |     zparSession_t* zps = static_cast<zparSession_t *>(vzps);
743 | 
744 |     // just delete the container itself and its destructor
745 |     // will take care of everything else
746 |     delete zps;
747 |     zps = NULL;
748 | }
749 | 
750 | // A main function for testing
751 | // extern "C" int main(int argc, char *argv[])
752 | // {
753 | //        void* vzps = initialize();
754 | //        load_tagger(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models");
755 | //        load_parser(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models");
756 | //        load_depparser(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models");
757 | //        parse_tagged_file(vzps, "/Users/nmadnani/work/python-zpar/examples/test_tagged.txt", "/Users/nmadnani/work/python-zpar/examples/test_tagged.parse");
758 | //        dep_parse_tagged_file(vzps, "/Users/nmadnani/work/python-zpar/examples/test_tagged.txt", "/Users/nmadnani/work/python-zpar/examples/test_tagged.dep");
759 | //        std::cout << std::string(parse_tagged_sentence(vzps, "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")) << std::endl;
760 | //        std::cout << std::string(dep_parse_tagged_sentence(vzps, "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")) << std::endl;
761 | //        unload_models(vzps);
762 | //      return 0;
763 | // }
764 | 


--------------------------------------------------------------------------------
/tests/test_depparser.py:
--------------------------------------------------------------------------------
  1 | """ "
  2 | Run unit tests for the ZPar dependency parser.
  3 | 
  4 | :author: Nitin Madnani (nmadnani@ets.org)
  5 | """
  6 | 
  7 | from __future__ import (absolute_import, division, print_function,
  8 |                         unicode_literals)
  9 | 
 10 | import glob
 11 | import os
 12 | 
 13 | from io import open
 14 | from itertools import product
 15 | from os.path import abspath, dirname, join
 16 | 
 17 | from nose.tools import assert_equal
 18 | from zpar import ZPar
 19 | 
 20 | _my_dir = abspath(dirname(__file__))
 21 | 
 22 | z = None
 23 | depparser = None
 24 | 
 25 | 
 26 | def setUp():
 27 |     """
 28 |     set up things we need for the tests
 29 |     """
 30 |     global z, depparser
 31 | 
 32 |     assert 'ZPAR_MODEL_DIR' in os.environ
 33 | 
 34 |     model_dir = os.environ['ZPAR_MODEL_DIR']
 35 | 
 36 |     z = ZPar(model_dir)
 37 |     depparser = z.get_depparser()
 38 | 
 39 | 
 40 | def tearDown():
 41 |     """
 42 |     Clean up after the tests
 43 |     """
 44 |     global z, depparser
 45 | 
 46 |     if z:
 47 |         z.close()
 48 |         del depparser
 49 |         del z
 50 | 
 51 |     # delete all the files we may have created
 52 |     data_dir = abspath(join(_my_dir, '..', 'examples'))
 53 |     for f in glob.glob(join(data_dir, 'test*.dep')):
 54 |         os.unlink(f)
 55 | 
 56 | 
 57 | def check_dep_parse_sentence(tokenize=False,
 58 |                              with_lemmas=False,
 59 |                              tagged=False):
 60 |     """
 61 |     Check dep_parse_sentence method with and without tokenization,
 62 |     with and without lemmas, and without pre-tagged output.
 63 |     """
 64 |     global depparser
 65 | 
 66 |     if tagged:
 67 |         sentence = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./."
 68 |     else:
 69 |         if tokenize:
 70 |             sentence = "I'm going to the market."
 71 |         else:
 72 |             sentence = "I 'm going to the market ."
 73 | 
 74 |     correct_output = "I\tPRP\t1\tSUB\n'm\tVBP\t-1\tROOT\ngoing\tVBG\t1\tVC\nto\tTO\t2\tVMOD\nthe\tDT\t5\tNMOD\nmarket\tNN\t3\tPMOD\n.\t.\t1\tP\n"
 75 |     correct_output_with_lemmas = "I\tPRP\t1\tSUB\ti\n'm\tVBP\t-1\tROOT\t'm\ngoing\tVBG\t1\tVC\tgo\nto\tTO\t2\tVMOD\tto\nthe\tDT\t5\tNMOD\tthe\nmarket\tNN\t3\tPMOD\tmarket\n.\t.\t1\tP\t.\n"
 76 |     if not tagged:
 77 |         parsed_sentence = depparser.dep_parse_sentence(sentence,
 78 |                                                        tokenize=tokenize,
 79 |                                                        with_lemmas=with_lemmas)
 80 |     else:
 81 |         parsed_sentence = depparser.dep_parse_tagged_sentence(sentence,
 82 |                                                               with_lemmas=with_lemmas)
 83 | 
 84 |     if with_lemmas:
 85 |         assert_equal(parsed_sentence, correct_output_with_lemmas)
 86 |     else:
 87 |         assert_equal(parsed_sentence, correct_output)
 88 | 
 89 | 
 90 | def test_dep_parse_sentence():
 91 |     for (tokenize, with_lemmas, tagged) in product([True, False],
 92 |                                                        [True, False],
 93 |                                                        [True, False]):
 94 |         yield (check_dep_parse_sentence,
 95 |                tokenize,
 96 |                with_lemmas,
 97 |                tagged)
 98 | 
 99 | 
100 | def check_dep_parse_file(tokenize=False,
101 |                          with_lemmas=False,
102 |                          tagged=False):
103 |     """
104 |     Check parse_file method with and without tokenization,
105 |     with and without lemmas, with and without access
106 |     to wordnet, and with and without pre-tagged output.
107 |     """
108 |     global depparser
109 | 
110 |     if tagged:
111 |         prefix = 'test_tagged'
112 |     else:
113 |         if tokenize:
114 |             prefix = 'test'
115 |         else:
116 |             prefix = 'test_tokenized'
117 | 
118 |     correct_output = ['I\tPRP\t1\tSUB', 'am\tVBP\t-1\tROOT',
119 |                       'going\tVBG\t1\tVC', 'to\tTO\t2\tVMOD',
120 |                       'the\tDT\t5\tNMOD', 'market\tNN\t3\tPMOD',
121 |                       '.\t.\t1\tP', '', 'Are\tVBP\t-1\tROOT',
122 |                       'you\tPRP\t0\tSUB', 'going\tVBG\t0\tVMOD',
123 |                       'to\tTO\t4\tVMOD', 'come\tVB\t2\tVMOD',
124 |                       'with\tIN\t4\tVMOD', 'me\tPRP\t5\tPMOD',
125 |                       '?\t.\t0\tP', '']
126 | 
127 |     correct_output_with_lemmas = ['I\tPRP\t1\tSUB\ti', 'am\tVBP\t-1\tROOT\tbe',
128 |                                   'going\tVBG\t1\tVC\tgo', 'to\tTO\t2\tVMOD\tto',
129 |                                   'the\tDT\t5\tNMOD\tthe', 'market\tNN\t3\tPMOD\tmarket',
130 |                                   '.\t.\t1\tP\t.', '', 'Are\tVBP\t-1\tROOT\tbe',
131 |                                   'you\tPRP\t0\tSUB\tyou', 'going\tVBG\t0\tVMOD\tgo',
132 |                                   'to\tTO\t4\tVMOD\tto', 'come\tVB\t2\tVMOD\tcome',
133 |                                   'with\tIN\t4\tVMOD\twith', 'me\tPRP\t5\tPMOD\tme',
134 |                                   '?\t.\t0\tP\t?','']
135 | 
136 |     input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix)))
137 |     output_file = abspath(join(_my_dir, '..', 'examples', '{}.dep'.format(prefix)))
138 | 
139 |     # dependency parse the file
140 |     if not tagged:
141 |         depparser.dep_parse_file(input_file,
142 |                                  output_file,
143 |                                  tokenize=tokenize,
144 |                                  with_lemmas=with_lemmas)
145 |     else:
146 |         depparser.dep_parse_tagged_file(input_file,
147 |                                         output_file,
148 |                                         with_lemmas=with_lemmas)
149 | 
150 |     # read the output file and make sure we have the expected output
151 |     with open(output_file, 'r') as outf:
152 |         output = [l.strip() for l in outf.readlines()]
153 | 
154 |     if with_lemmas:
155 |         assert_equal(output, correct_output_with_lemmas)
156 |     else:
157 |         assert_equal(output, correct_output)
158 | 
159 | 
160 | def test_dep_parse_file():
161 |     for (tokenize, with_lemmas, tagged) in product([True, False],
162 |                                                    [True, False],
163 |                                                    [True, False]):
164 |         yield (check_dep_parse_file,
165 |                tokenize,
166 |                with_lemmas,
167 |                tagged)
168 | 
169 | 


--------------------------------------------------------------------------------
/tests/test_depparser_no_wordnet.py:
--------------------------------------------------------------------------------
  1 | """ "
  2 | Run unit tests for the ZPar dependency parser without wordnet access.
  3 | 
  4 | :author: Nitin Madnani (nmadnani@ets.org)
  5 | """
  6 | 
  7 | from __future__ import (absolute_import, division, print_function,
  8 |                         unicode_literals)
  9 | 
 10 | import glob
 11 | import os
 12 | 
 13 | from io import open
 14 | from itertools import product
 15 | from os.path import abspath, dirname, join
 16 | 
 17 | from nose.tools import assert_equal
 18 | from zpar import ZPar
 19 | 
 20 | _my_dir = abspath(dirname(__file__))
 21 | 
 22 | z = None
 23 | depparser = None
 24 | 
 25 | 
 26 | def setUp():
 27 |     """
 28 |     set up things we need for the tests
 29 |     """
 30 |     global z, depparser
 31 | 
 32 |     assert 'ZPAR_MODEL_DIR' in os.environ
 33 | 
 34 |     model_dir = os.environ['ZPAR_MODEL_DIR']
 35 | 
 36 |     z = ZPar(model_dir)
 37 |     depparser = z.get_depparser()
 38 | 
 39 | 
 40 | def tearDown():
 41 |     """
 42 |     Clean up after the tests
 43 |     """
 44 |     global z, depparser
 45 | 
 46 |     if z:
 47 |         z.close()
 48 |         del depparser
 49 |         del z
 50 | 
 51 |     # delete all the files we may have created
 52 |     data_dir = abspath(join(_my_dir, '..', 'examples'))
 53 |     for f in glob.glob(join(data_dir, 'test*.dep')):
 54 |         os.unlink(f)
 55 | 
 56 | 
 57 | def check_dep_parse_sentence_no_wordnet(tokenize=False,
 58 |                                         with_lemmas=False,
 59 |                                         tagged=False):
 60 |     """
 61 |     Check dep_parse_sentence method with and without tokenization,
 62 |     with and without lemmas, and with and without pre-tagged output,
 63 |     all under the condition that there is no wordnet corpus
 64 |     accessible to nltk.
 65 |     """
 66 |     global depparser
 67 | 
 68 |     if tagged:
 69 |         sentence = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./."
 70 |     else:
 71 |         if tokenize:
 72 |             sentence = "I'm going to the market."
 73 |         else:
 74 |             sentence = "I 'm going to the market ."
 75 | 
 76 |     correct_output = "I\tPRP\t1\tSUB\n'm\tVBP\t-1\tROOT\ngoing\tVBG\t1\tVC\nto\tTO\t2\tVMOD\nthe\tDT\t5\tNMOD\nmarket\tNN\t3\tPMOD\n.\t.\t1\tP\n"
 77 |     if not tagged:
 78 |         parsed_sentence = depparser.dep_parse_sentence(sentence,
 79 |                                                        tokenize=tokenize,
 80 |                                                        with_lemmas=with_lemmas)
 81 |     else:
 82 |         parsed_sentence = depparser.dep_parse_tagged_sentence(sentence,
 83 |                                                               with_lemmas=with_lemmas)
 84 | 
 85 |     assert_equal(parsed_sentence, correct_output)
 86 | 
 87 | 
 88 | def test_dep_parse_sentence_no_wordnet():
 89 |     for (tokenize, with_lemmas, tagged) in product([True, False],
 90 |                                                        [True, False],
 91 |                                                        [True, False]):
 92 |         yield (check_dep_parse_sentence_no_wordnet,
 93 |                tokenize,
 94 |                with_lemmas,
 95 |                tagged)
 96 | 
 97 | 
 98 | def check_dep_parse_file_no_wordnet(tokenize=False,
 99 |                                     with_lemmas=False,
100 |                                     tagged=False):
101 |     """
102 |     Check parse_file method with and without tokenization,
103 |     with and without lemmas, and with and without pre-tagged output,
104 |     all under the condition that there is no wordnet corpus
105 |     accessible to nltk.
106 |     """
107 |     global depparser
108 | 
109 |     if tagged:
110 |         prefix = 'test_tagged'
111 |     else:
112 |         if tokenize:
113 |             prefix = 'test'
114 |         else:
115 |             prefix = 'test_tokenized'
116 | 
117 |     correct_output = ['I\tPRP\t1\tSUB', 'am\tVBP\t-1\tROOT',
118 |                       'going\tVBG\t1\tVC', 'to\tTO\t2\tVMOD',
119 |                       'the\tDT\t5\tNMOD', 'market\tNN\t3\tPMOD',
120 |                       '.\t.\t1\tP', '', 'Are\tVBP\t-1\tROOT',
121 |                       'you\tPRP\t0\tSUB', 'going\tVBG\t0\tVMOD',
122 |                       'to\tTO\t4\tVMOD', 'come\tVB\t2\tVMOD',
123 |                       'with\tIN\t4\tVMOD', 'me\tPRP\t5\tPMOD',
124 |                       '?\t.\t0\tP', '']
125 | 
126 |     input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix)))
127 |     output_file = abspath(join(_my_dir, '..', 'examples', '{}.dep'.format(prefix)))
128 | 
129 |     # dependency parse the file
130 |     if not tagged:
131 |         depparser.dep_parse_file(input_file,
132 |                                  output_file,
133 |                                  tokenize=tokenize,
134 |                                  with_lemmas=with_lemmas)
135 |     else:
136 |         depparser.dep_parse_tagged_file(input_file,
137 |                                         output_file,
138 |                                         with_lemmas=with_lemmas)
139 | 
140 |     # read the output file and make sure we have the expected output
141 |     with open(output_file, 'r') as outf:
142 |         output = [l.strip() for l in outf.readlines()]
143 | 
144 |     assert_equal(output, correct_output)
145 | 
146 | 
147 | def test_dep_parse_file_no_wordnet():
148 |     for (tokenize, with_lemmas, tagged) in product([True, False],
149 |                                                    [True, False],
150 |                                                    [True, False]):
151 |         yield (check_dep_parse_file_no_wordnet,
152 |                tokenize,
153 |                with_lemmas,
154 |                tagged)
155 | 
156 | 


--------------------------------------------------------------------------------
/tests/test_parser.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Run unit tests for the ZPar constituency parser.
  3 | 
  4 | :author: Nitin Madnani (nmadnani@ets.org)
  5 | """
  6 | 
  7 | from __future__ import (absolute_import, division, print_function,
  8 |                         unicode_literals)
  9 | 
 10 | import glob
 11 | import os
 12 | 
 13 | from io import open
 14 | from itertools import product
 15 | from os.path import abspath, dirname, join
 16 | 
 17 | from nose.tools import assert_equal
 18 | from zpar import ZPar
 19 | 
 20 | _my_dir = abspath(dirname(__file__))
 21 | 
 22 | z = None
 23 | parser = None
 24 | 
 25 | 
 26 | def setUp():
 27 |     """
 28 |     set up things we need for the tests
 29 |     """
 30 |     global z, parser
 31 | 
 32 |     assert 'ZPAR_MODEL_DIR' in os.environ
 33 | 
 34 |     model_dir = os.environ['ZPAR_MODEL_DIR']
 35 | 
 36 |     z = ZPar(model_dir)
 37 |     parser = z.get_parser()
 38 | 
 39 | 
 40 | def tearDown():
 41 |     """
 42 |     Clean up after the tests
 43 |     """
 44 |     global z, parser
 45 | 
 46 |     if z:
 47 |         z.close()
 48 |         del parser
 49 |         del z
 50 | 
 51 |     # delete all the files we may have created
 52 |     data_dir = abspath(join(_my_dir, '..', 'examples'))
 53 |     for f in glob.glob(join(data_dir, 'test*.parse')):
 54 |         os.unlink(f)
 55 | 
 56 | 
 57 | def check_parse_sentence(tokenize=False, tagged=False):
 58 |     """
 59 |     Check parse_sentence method with and without tokenization
 60 |     and with and without pre-tagged output.
 61 |     """
 62 |     global parser
 63 | 
 64 |     if tagged:
 65 |         sentence = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./."
 66 |     else:
 67 |         if tokenize:
 68 |             sentence = "I'm going to the market."
 69 |         else:
 70 |             sentence = "I 'm going to the market ."
 71 | 
 72 |     correct_output = "(S (NP (PRP I)) (VP (VBP 'm) (VP (VBG going) (PP (TO to) (NP (DT the) (NN market))))) (. .))"
 73 | 
 74 |     if not tagged:
 75 |         parsed_sentence = parser.parse_sentence(sentence, tokenize=tokenize)
 76 |     else:
 77 |         parsed_sentence = parser.parse_tagged_sentence(sentence)
 78 | 
 79 |     assert_equal(parsed_sentence, correct_output)
 80 | 
 81 | 
 82 | def test_parse_sentence():
 83 |     for (tokenize, tagged) in product([True, False], [True, False]):
 84 |         yield check_parse_sentence, tokenize, tagged
 85 | 
 86 | 
 87 | def check_parse_file(tokenize=False, tagged=False):
 88 |     """
 89 |     Check parse_file method with and without tokenization
 90 |     and with and without pre-tagged output
 91 |     """
 92 |     global parser
 93 | 
 94 |     if tagged:
 95 |         prefix = 'test_tagged'
 96 |     else:
 97 |         if tokenize:
 98 |             prefix = 'test'
 99 |         else:
100 |             prefix = 'test_tokenized'
101 | 
102 |     correct_output = ["(S (NP (PRP I)) (VP (VBP am) (VP (VBG going) (PP (TO to) (NP (DT the) (NN market))))) (. .))",
103 |                       "(SQ (VBP Are) (NP (PRP you)) (VP (VBG going) (S (VP (TO to) (VP (VB come) (PP (IN with) (NP (PRP me))))))) (. ?))"]
104 | 
105 |     input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix)))
106 |     output_file = abspath(join(_my_dir, '..', 'examples', '{}.parse'.format(prefix)))
107 | 
108 |     # parse the file
109 |     if not tagged:
110 |         parser.parse_file(input_file, output_file, tokenize=tokenize)
111 |     else:
112 |         parser.parse_tagged_file(input_file, output_file)
113 | 
114 |     # read the output file and make sure we have the expected output
115 |     with open(output_file, 'r') as outf:
116 |         output = [l.strip() for l in outf.readlines()]
117 | 
118 |     assert_equal(output, correct_output)
119 | 
120 | 
121 | def test_parse_file():
122 |     for (tokenize, tagged) in product([True, False], [True, False]):
123 |         yield check_parse_file, tokenize, tagged
124 | 
125 | 


--------------------------------------------------------------------------------
/tests/test_tagger.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Run unit tests for the ZPar tagger.
  3 | 
  4 | :author: Nitin Madnani (nmadnani@ets.org)
  5 | """
  6 | 
  7 | from __future__ import (absolute_import, division, print_function,
  8 |                         unicode_literals)
  9 | 
 10 | import glob
 11 | import os
 12 | 
 13 | from io import open
 14 | from os.path import abspath, dirname, join
 15 | 
 16 | from nose.tools import assert_equal
 17 | from zpar import ZPar
 18 | 
 19 | _my_dir = abspath(dirname(__file__))
 20 | 
 21 | z = None
 22 | tagger = None
 23 | 
 24 | def setUp():
 25 |     """
 26 |     set up things we need for the tests
 27 |     """
 28 |     global z, tagger
 29 | 
 30 |     assert 'ZPAR_MODEL_DIR' in os.environ
 31 | 
 32 |     model_dir = os.environ['ZPAR_MODEL_DIR']
 33 | 
 34 |     z = ZPar(model_dir)
 35 |     tagger = z.get_tagger()
 36 | 
 37 | def tearDown():
 38 |     """
 39 |     Clean up after the tests
 40 |     """
 41 |     global z, tagger
 42 | 
 43 |     if z:
 44 |         z.close()
 45 |         del tagger
 46 |         del z
 47 | 
 48 |     # delete all the files we may have created
 49 |     data_dir = abspath(join(_my_dir, '..', 'examples'))
 50 |     for f in glob.glob(join(data_dir, 'test*.tag')):
 51 |         os.unlink(f)
 52 | 
 53 | 
 54 | def check_tag_sentence(tokenize=False):
 55 |     """
 56 |     Check tag_sentence method with and without tokenization
 57 |     """
 58 |     global tagger
 59 | 
 60 |     sentence = "I'm going to the market." if tokenize else "I 'm going to the market ."
 61 |     correct_output = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./."
 62 |     tagged_sentence = tagger.tag_sentence(sentence, tokenize=tokenize)
 63 | 
 64 |     assert_equal(tagged_sentence, correct_output)
 65 | 
 66 | 
 67 | def test_tag_sentence():
 68 |     yield check_tag_sentence, False
 69 |     yield check_tag_sentence, True
 70 | 
 71 | 
 72 | def check_tag_file(tokenize=False):
 73 |     """
 74 |     Check tag_file method with and without tokenization
 75 |     """
 76 | 
 77 |     global tagger
 78 | 
 79 |     prefix = 'test' if tokenize else 'test_tokenized'
 80 | 
 81 |     correct_output = ['I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.',
 82 |                       'Are/VBP you/PRP going/VBG to/TO come/VB with/IN me/PRP ?/.']
 83 | 
 84 |     input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix)))
 85 |     output_file = abspath(join(_my_dir, '..', 'examples', '{}.tag'.format(prefix)))
 86 | 
 87 |     # tag the file
 88 |     tagger.tag_file(input_file, output_file, tokenize=tokenize)
 89 | 
 90 |     # read the output file and make sure we have the expected output
 91 |     with open(output_file, 'r') as outf:
 92 |         output = [l.strip() for l in outf.readlines()]
 93 | 
 94 |     assert_equal(output, correct_output)
 95 | 
 96 | 
 97 | def test_tag_file():
 98 |     yield check_tag_file, False
 99 |     yield check_tag_file, True
100 | 


--------------------------------------------------------------------------------
/zpar/DepParser.py:
--------------------------------------------------------------------------------
  1 | # License: MIT
  2 | '''
  3 | :author: Nitin Madnani (nmadnani@ets.org)
  4 | :organization: ETS
  5 | '''
  6 | 
  7 | import ctypes as c
  8 | import logging
  9 | import os
 10 | 
 11 | # do we have nltk installed and if so, do we have its
 12 | # wordnet corpus installed?
 13 | try:
 14 |     import nltk
 15 |     nltk.data.find('corpora/wordnet')
 16 | except (ImportError, LookupError):
 17 |     _HAS_LEMMATIZER = False
 18 | else:
 19 |     _HAS_LEMMATIZER = True
 20 |     from nltk.stem.wordnet import WordNetLemmatizer
 21 | 
 22 | 
 23 | class DepParser(object):
 24 |     """The ZPar English Dependency Parser"""
 25 | 
 26 |     def __init__(self, modelpath, libptr, zpar_session_obj):
 27 |         super(DepParser, self).__init__()
 28 | 
 29 |         # save the zpar session object
 30 |         self._zpar_session_obj = zpar_session_obj
 31 | 
 32 |         # set up a logger
 33 |         self.logger = logging.getLogger(__name__)
 34 | 
 35 |         # get the library method that loads the parser models
 36 |         self._load_depparser = libptr.load_depparser
 37 |         self._load_depparser.restype = c.c_int
 38 |         self._load_depparser.argtypes = [c.c_void_p, c.c_char_p]
 39 | 
 40 |         # get the library methods that parse sentences and files
 41 |         self._dep_parse_sentence = libptr.dep_parse_sentence
 42 |         self._dep_parse_sentence.restype = c.c_char_p
 43 |         self._dep_parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]
 44 | 
 45 |         self._dep_parse_file = libptr.dep_parse_file
 46 |         self._dep_parse_file.restype = None
 47 |         self._dep_parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]
 48 | 
 49 |         self._dep_parse_tagged_sentence = libptr.dep_parse_tagged_sentence
 50 |         self._dep_parse_tagged_sentence.restype = c.c_char_p
 51 |         self._dep_parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char]
 52 | 
 53 |         self._dep_parse_tagged_file = libptr.dep_parse_tagged_file
 54 |         self._dep_parse_tagged_file.restype = None
 55 |         self._dep_parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char]
 56 | 
 57 |         if self._load_depparser(self._zpar_session_obj, modelpath.encode('utf-8')):
 58 |             raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath))
 59 | 
 60 |         # set up the wordnet lemmatizer if we have it
 61 |         if _HAS_LEMMATIZER:
 62 |             self.lemmatizer = WordNetLemmatizer()
 63 |         else:
 64 |             self.lemmatizer = None
 65 | 
 66 |     def annotate_parse_with_lemmas(self, parse):
 67 |         if not parse.strip():
 68 |             return parse
 69 |         else:
 70 |             new_parse_lines = []
 71 |             for line in parse.strip().split('\n'):
 72 |                 fields = line.strip().split('\t')
 73 |                 word, pos = fields[:2]
 74 |                 if pos.startswith('J'):
 75 |                     param = 'a'
 76 |                 elif pos.startswith('R'):
 77 |                     param = 'r'
 78 |                 elif pos.startswith('V'):
 79 |                     param = 'v'
 80 |                 else:
 81 |                     param = 'n'
 82 |                 lemma = self.lemmatizer.lemmatize(word.lower(), param)
 83 |                 new_parse_line = '\t'.join(fields + [lemma])
 84 |                 new_parse_lines.append(new_parse_line)
 85 |             return '\n'.join(new_parse_lines) + '\n'
 86 | 
 87 |     def dep_parse_sentence(self,
 88 |                            sentence,
 89 |                            tokenize=True,
 90 |                            with_lemmas=False):
 91 |         if not sentence.strip():
 92 |             # return empty string if the input is empty
 93 |             ans = ""
 94 |         else:
 95 |             zpar_compatible_sentence = sentence.strip() + "\n "
 96 |             zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
 97 |             zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
 98 |             parsed_sent = self._dep_parse_sentence(self._zpar_session_obj,
 99 |                                                    zpar_compatible_sentence,
100 |                                                    tokenize)
101 |             ans = parsed_sent.decode('utf-8')
102 | 
103 |             # if we are asked to add lemma information, then we need
104 |             # to add another field to each of the lines in the
105 |             # parse returned from zpar
106 |             if with_lemmas:
107 |                 if self.lemmatizer:
108 |                     ans = self.annotate_parse_with_lemmas(ans)
109 |                 else:
110 |                     self.logger.warning('No lemmatizer available. Please '
111 |                                         'install NLTK and its Wordnet corpus.')
112 |         return ans
113 | 
114 |     def dep_parse_file(self,
115 |                        inputfile,
116 |                        outputfile,
117 |                        tokenize=True,
118 |                        with_lemmas=False):
119 | 
120 | 
121 |         if not os.path.exists(inputfile):
122 |             raise OSError('File {} does not exist.'.format(inputfile))
123 |         else:
124 |             parsed = False
125 | 
126 |             # if we want lemmas, we have to individually parse
127 |             # each sentence and then annotate its parse with lemmas
128 |             if with_lemmas:
129 |                 if self.lemmatizer:
130 |                     with open(inputfile, 'r') as inputf, open(outputfile, 'w') as outf:
131 |                         for sentence in inputf:
132 |                             outf.write(self.dep_parse_sentence(sentence,
133 |                                                                 tokenize=tokenize,
134 |                                                                 with_lemmas=True) + '\n')
135 |                     parsed = True
136 |                 else:
137 |                     self.logger.warning('No lemmatizer available. Please '
138 |                                         'install NLTK and its Wordnet corpus.')
139 | 
140 |             # otherwise we can just parse the whole file in C++ space
141 |             if not parsed:
142 |                     self._dep_parse_file(self._zpar_session_obj,
143 |                                          inputfile.encode('utf-8'),
144 |                                          outputfile.encode('utf-8'),
145 |                                          tokenize)
146 | 
147 |     def dep_parse_tagged_sentence(self,
148 |                                   tagged_sentence,
149 |                                   sep='/',
150 |                                   with_lemmas=False):
151 |         if not tagged_sentence.strip():
152 |             # return empty string if the input is empty
153 |             ans = ""
154 |         else:
155 |             zpar_compatible_sentence = tagged_sentence.strip().encode('utf-8')
156 |             parsed_sent = self._dep_parse_tagged_sentence(self._zpar_session_obj,
157 |                                                           zpar_compatible_sentence,
158 |                                                           sep.encode('utf-8'))
159 |             ans = parsed_sent.decode('utf-8')
160 | 
161 |         # if we are asked to add lemma information, then we need
162 |         # to add another field to each of the lines in the
163 |         # parse returned from zpar
164 |         if with_lemmas:
165 |             if self.lemmatizer:
166 |                 ans = self.annotate_parse_with_lemmas(ans)
167 |             else:
168 |                 self.logger.warning('No lemmatizer available. Please '
169 |                                     'install NLTK and its Wordnet corpus.')
170 |         return ans
171 | 
172 |     def dep_parse_tagged_file(self, inputfile, outputfile, sep='/', with_lemmas=False):
173 | 
174 |         if not os.path.exists(inputfile):
175 |             raise OSError('File {} does not exist.'.format(inputfile))
176 |         else:
177 | 
178 |             parsed = False
179 | 
180 |             # if we want lemmas, we have to individually parse
181 |             # each sentence and then annotate its parse with lemmas
182 |             if with_lemmas:
183 |                 if self.lemmatizer:
184 |                     with open(inputfile, 'r') as inputf, open(outputfile, 'w') as outf:
185 |                         for sentence in inputf:
186 |                             outf.write(self.dep_parse_tagged_sentence(sentence,
187 |                                                                       sep=sep,
188 |                                                                       with_lemmas=with_lemmas) + '\n')
189 | 
190 |                     parsed = True
191 |                 else:
192 |                     self.logger.warning('No lemmatizer available. Please '
193 |                                         'install NLTK and its Wordnet corpus.')
194 | 
195 |             # otherwise we can just parse the whole file in C++ space
196 |             if not parsed:
197 |                 self._dep_parse_tagged_file(self._zpar_session_obj,
198 |                                             inputfile.encode('utf-8'),
199 |                                             outputfile.encode('utf-8'),
200 |                                             sep.encode('utf-8'))
201 | 
202 |     def cleanup(self):
203 |         self._load_depparser = None
204 |         self._dep_parse_sentence = None
205 |         self._dep_parse_file = None
206 |         self._dep_parse_tagged_sentence = None
207 |         self._dep_parse_tagged_file = None
208 |         self._zpar_session_obj = None
209 | 


--------------------------------------------------------------------------------
/zpar/Parser.py:
--------------------------------------------------------------------------------
 1 | # License: MIT
 2 | '''
 3 | :author: Nitin Madnani (nmadnani@ets.org)
 4 | :organization: ETS
 5 | '''
 6 | 
 7 | import ctypes as c
 8 | import logging
 9 | import os
10 | 
11 | 
12 | class Parser(object):
13 |     """The ZPar English Constituency Parser"""
14 | 
15 |     def __init__(self, modelpath, libptr, zpar_session_obj):
16 |         super(Parser, self).__init__()
17 | 
18 |         # save the zpar session object
19 |         self._zpar_session_obj = zpar_session_obj
20 | 
21 |         # set up a logger
22 |         self.logger = logging.getLogger(__name__)
23 | 
24 |         # get the library method that loads the parser models
25 |         self._load_parser = libptr.load_parser
26 |         self._load_parser.restype = c.c_int
27 |         self._load_parser.argtypes = [c.c_void_p, c.c_char_p]
28 | 
29 |         # get the library methods that parse sentences and files
30 |         self._parse_sentence = libptr.parse_sentence
31 |         self._parse_sentence.restype = c.c_char_p
32 |         self._parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]
33 | 
34 |         self._parse_file = libptr.parse_file
35 |         self._parse_file.restype = None
36 |         self._parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]
37 | 
38 |         self._parse_tagged_sentence = libptr.parse_tagged_sentence
39 |         self._parse_tagged_sentence.restype = c.c_char_p
40 |         self._parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char]
41 | 
42 |         self._parse_tagged_file = libptr.parse_tagged_file
43 |         self._parse_tagged_file.restype = None
44 |         self._parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char]
45 | 
46 |         if self._load_parser(self._zpar_session_obj, modelpath.encode('utf-8')):
47 |             raise OSError('Cannot find parser model at {}\n'.format(modelpath))
48 | 
49 |     def parse_sentence(self, sentence, tokenize=True):
50 |         if not sentence.strip():
51 |             # return empty string if the input is empty
52 |             ans = ""
53 |         else:
54 |             zpar_compatible_sentence = sentence.strip() + "\n "
55 |             zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n "
56 |             zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
57 |             parsed_sent = self._parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
58 |             ans = parsed_sent.decode('utf-8')
59 | 
60 |         return ans
61 | 
62 |     def parse_file(self, inputfile, outputfile, tokenize=True):
63 |         if os.path.exists(inputfile):
64 |             self._parse_file(self._zpar_session_obj, inputfile.encode('utf-8'), outputfile.encode('utf-8'), tokenize)
65 |         else:
66 |             raise OSError('File {} does not exist.'.format(inputfile))
67 | 
68 |     def parse_tagged_sentence(self, tagged_sentence, sep='/'):
69 |         if not tagged_sentence.strip():
70 |             # return empty string if the input is empty
71 |             ans = ""
72 |         else:
73 |             zpar_compatible_sentence = tagged_sentence.strip().encode('utf-8')
74 |             parsed_sent = self._parse_tagged_sentence(self._zpar_session_obj, zpar_compatible_sentence, sep.encode('utf-8'))
75 |             ans = parsed_sent.decode('utf-8')
76 |         return ans
77 | 
78 |     def parse_tagged_file(self, inputfile, outputfile, sep='/'):
79 |         if os.path.exists(inputfile):
80 |             self._parse_tagged_file(self._zpar_session_obj, inputfile.encode('utf-8'), outputfile.encode('utf-8'), sep.encode('utf-8'))
81 |         else:
82 |             raise OSError('File {} does not exist.'.format(inputfile))
83 | 
84 |     def cleanup(self):
85 |         self._load_parser = None
86 |         self._parse_sentence = None
87 |         self._parse_file = None
88 |         self._parse_tagged_sentence = None
89 |         self._parse_tagged_file = None
90 |         self._zpar_session_obj = None
91 | 


--------------------------------------------------------------------------------
/zpar/Tagger.py:
--------------------------------------------------------------------------------
 1 | # License: MIT
 2 | '''
 3 | :author: Nitin Madnani (nmadnani@ets.org)
 4 | :organization: ETS
 5 | '''
 6 | import ctypes as c
 7 | import logging
 8 | import os
 9 | 
10 | 
11 | class Tagger(object):
12 |     """The ZPar English POS Tagger"""
13 | 
14 |     def __init__(self, modelpath, libptr, zpar_session_obj):
15 |         super(Tagger, self).__init__()
16 | 
17 |         # save the zpar session object
18 |         self._zpar_session_obj = zpar_session_obj
19 | 
20 |         # set up a logger
21 |         self.logger = logging.getLogger(__name__)
22 | 
23 |         # get the library method that loads the tagger models
24 |         self._load_tagger = libptr.load_tagger
25 |         self._load_tagger.restype = c.c_int
26 |         self._load_tagger.argtypes = [c.c_void_p, c.c_char_p]
27 | 
28 |         # get the library methods that tag sentences and files
29 |         self._tag_sentence = libptr.tag_sentence
30 |         self._tag_sentence.restype = c.c_char_p
31 |         self._tag_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]
32 | 
33 |         self._tag_file = libptr.tag_file
34 |         self._tag_file.restype = None
35 |         self._tag_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]
36 | 
37 |         if self._load_tagger(self._zpar_session_obj, modelpath.encode('utf-8')):
38 |             raise OSError('Cannot find tagger model at {}\n'.format(modelpath))
39 | 
40 |     def tag_sentence(self, sentence, tokenize=True):
41 |         if not sentence.strip():
42 |             # return empty string if the input is empty
43 |             ans = ""
44 |         else:
45 |             zpar_compatible_sentence = sentence.strip() + "\n "
46 |             zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
47 |             tagged_sent = self._tag_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
48 |             ans = tagged_sent.decode('utf-8')
49 |             return ans
50 | 
51 |         return ans
52 | 
53 |     def tag_file(self, inputfile, outputfile, tokenize=True):
54 |         if os.path.exists(inputfile):
55 |             self._tag_file(self._zpar_session_obj, inputfile.encode('utf-8'), outputfile.encode('utf-8'), tokenize)
56 |         else:
57 |             raise OSError('File {} does not exist.'.format(inputfile))
58 | 
59 |     def cleanup(self):
60 |         self._load_tagger = None
61 |         self._tag_sentence = None
62 |         self._tag_file = None
63 |         self._zpar_session_obj = None
64 | 
65 | 


--------------------------------------------------------------------------------
/zpar/__init__.py:
--------------------------------------------------------------------------------
  1 | # License: MIT
  2 | '''
  3 | :author: Nitin Madnani (nmadnani@ets.org)
  4 | :organization: ETS
  5 | '''
  6 | 
  7 | import _ctypes
  8 | import ctypes as c
  9 | import os
 10 | 
 11 | from .Tagger import Tagger
 12 | from .Parser import Parser
 13 | from .DepParser import DepParser
 14 | 
 15 | __all__ = ['Tagger', 'Parser', 'DepParser']
 16 | 
 17 | class ZPar(object):
 18 |     """The ZPar wrapper object"""
 19 | 
 20 |     def __init__(self, modelpath):
 21 |         super(ZPar, self).__init__()
 22 | 
 23 |         # get a pointer to the zpar shared library
 24 |         base_path = os.path.dirname(os.path.abspath(__file__))
 25 |         zpar_path = os.path.join(base_path, 'dist', 'zpar.so')
 26 |         self.libptr = c.cdll.LoadLibrary(zpar_path)
 27 | 
 28 |         # call the library's initialize method to instantiate
 29 |         # the session object associated with this session
 30 |         self._initialize = self.libptr.initialize
 31 |         self._initialize.restype = c.c_void_p
 32 |         self._initialize.argtypes = None
 33 |         self._zpar_session_obj = self._initialize()
 34 | 
 35 |         self.modelpath = modelpath
 36 |         self.tagger = None
 37 |         self.parser = None
 38 |         self.depparser = None
 39 | 
 40 |     def close(self):
 41 | 
 42 |         # unload the models on the C++ side
 43 |         _unload_models = self.libptr.unload_models
 44 |         _unload_models.restype = None
 45 |         _unload_models.argtypes = [c.c_void_p]
 46 |         self.libptr.unload_models(self._zpar_session_obj)
 47 | 
 48 |         # clean up the data structures on the python side
 49 |         if self.tagger:
 50 |             self.tagger.cleanup()
 51 | 
 52 |         if self.parser:
 53 |             self.parser.cleanup()
 54 | 
 55 |         if self.depparser:
 56 |             self.depparser.cleanup()
 57 | 
 58 |         # set all the fields to none to enable clean reuse
 59 |         self.tagger = None
 60 |         self.parser = None
 61 |         self.depparser = None
 62 |         self.modelpath = None
 63 | 
 64 |         # clean up the CDLL object too so that upon reuse, we get a new one
 65 |         _ctypes.dlclose(self.libptr._handle)
 66 |         # pretty sure once the old object libptr was pointed to should
 67 |         # get garbage collected at some point after this
 68 |         self.libptr = None
 69 |         self._zpar_session_obj = None
 70 | 
 71 |     def __enter__(self):
 72 |         """Enable ZPar to be used as a ContextManager"""
 73 |         return self
 74 | 
 75 |     def __exit__(self, type, value, traceback):
 76 |         """Clean up when done"""
 77 |         self.close()
 78 | 
 79 |     def get_tagger(self):
 80 |         if not self.libptr:
 81 |             raise Exception('Cannot get tagger from uninitialized ZPar environment.')
 82 |             return None
 83 |         else:
 84 |             self.tagger = Tagger(self.modelpath, self.libptr, self._zpar_session_obj)
 85 |             return self.tagger
 86 | 
 87 |     def get_parser(self):
 88 |         if not self.libptr:
 89 |             raise Exception('Cannot get parser from uninitialized ZPar environment.')
 90 |             return None
 91 |         else:
 92 |             self.parser = Parser(self.modelpath, self.libptr, self._zpar_session_obj)
 93 |             return self.parser
 94 | 
 95 |     def get_depparser(self):
 96 |         if not self.libptr:
 97 |             raise Exception('Cannot get parser from uninitialized ZPar environment.')
 98 |             return None
 99 |         else:
100 |             self.depparser = DepParser(self.modelpath, self.libptr, self._zpar_session_obj)
101 |             return self.depparser
102 | 
103 | 


--------------------------------------------------------------------------------
/zpar/zpar_server.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import argparse
  4 | import logging
  5 | import os
  6 | import six
  7 | import sys
  8 | 
  9 | from zpar import ZPar
 10 | 
 11 | if six.PY2:
 12 |     from SimpleXMLRPCServer import SimpleXMLRPCServer
 13 | else:
 14 |     from xmlrpc.server import SimpleXMLRPCServer
 15 | 
 16 | class ModelNotFoundError(Exception):
 17 | 
 18 |     def __init__(self, model_name, model_path):
 19 |         Exception.__init__(self)
 20 |         self.model_name = model_name
 21 |         self.model_path = model_path
 22 | 
 23 |     def __str__(self):
 24 |         if self.model_name != 'all':
 25 |             return "No {} model could be found at {}".format(self.model_name,
 26 |                                                              self.model_path)
 27 |         else:
 28 |             return "No models could be found at {}".format(self.model_path)
 29 | 
 30 | 
 31 | _baseclass = SimpleXMLRPCServer
 32 | class StoppableServer(_baseclass):
 33 | 
 34 |     allow_reuse_address = True
 35 | 
 36 |     def __init__(self, addr, zpar_model_path, model_list, *args, **kwds):
 37 | 
 38 |         # store the hostname and port number
 39 |         self.myhost, self.myport = addr
 40 | 
 41 |         # store the link to the loaded zpar object
 42 |         self.z = ZPar(zpar_model_path)
 43 | 
 44 |         # initialize the parent class
 45 |         _baseclass.__init__(self, addr, *args, **kwds)
 46 | 
 47 |         # Call the individual loading functions
 48 |         # and only register the appropriate methods
 49 |         if 'tagger' in model_list:
 50 |             tagger = self.z.get_tagger()
 51 |             self.register_function(tagger.tag_sentence)
 52 |             self.register_function(tagger.tag_file)
 53 |         if 'parser' in model_list:
 54 |             parser = self.z.get_parser()
 55 |             self.register_function(parser.parse_sentence)
 56 |             self.register_function(parser.parse_file)
 57 |             self.register_function(parser.parse_tagged_sentence)
 58 |             self.register_function(parser.parse_tagged_file)
 59 |         if 'depparser' in model_list:
 60 |             parser = self.z.get_depparser()
 61 |             self.register_function(parser.dep_parse_sentence)
 62 |             self.register_function(parser.dep_parse_file)
 63 |             self.register_function(parser.dep_parse_tagged_sentence)
 64 |             self.register_function(parser.dep_parse_tagged_file)
 65 | 
 66 |         # register the function to remotely stop the server
 67 |         self.register_function(self.stop_server)
 68 | 
 69 |         self.quit = False
 70 | 
 71 |     def serve_forever(self):
 72 |         while not self.quit:
 73 |             try:
 74 |                 self.handle_request()
 75 |             except KeyboardInterrupt:
 76 |                 print("\nKeyboard interrupt received, exiting.")
 77 |                 break
 78 |         self.z.close()
 79 |         self.server_close()
 80 | 
 81 |     def stop_server(self):
 82 |         self.quit = True
 83 |         return 0, "Server terminated on host %r, port %r" % (self.myhost, self.myport)
 84 | 
 85 | 
 86 | def main():
 87 |     # set up an argument parser
 88 |     parser = argparse.ArgumentParser(prog='zpar_server.py', \
 89 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 90 |     parser.add_argument('--modeldir', dest='modeldir',
 91 |                         help="Path to directory containing zpar English models",
 92 |                         required=True)
 93 | 
 94 |     parser.add_argument('--models', dest='models', nargs='+',
 95 |                         help="Load only these models",
 96 |                         required=True)
 97 | 
 98 |     parser.add_argument('--host', dest='hostname',
 99 |                         help="Hostname or IP address",
100 |                         default="localhost",
101 |                         required=False)
102 | 
103 |     parser.add_argument('--port', dest='port', type=int,
104 |                         help="Port number",
105 |                         default=8859,
106 |                         required=False)
107 | 
108 |     parser.add_argument('--log', dest='log', action="store_true",
109 |                         default=False,
110 |                         help="Log server requests")
111 | 
112 | 
113 |     # parse given command line arguments
114 |     args = parser.parse_args()
115 | 
116 |     # check to make sure that the specified models
117 |     # are those we know about
118 |     if set(args.models).difference(['tagger', 'parser', 'depparser']):
119 |         sys.stderr.write('Error: invalid model(s) specified. Choices are: "tagger", "parser", and "depparser".\n')
120 |         sys.exit(1)
121 | 
122 |     # set up the logging
123 |     logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
124 | 
125 |     # Create a server that is built on top of this ZPAR data structure
126 |     logging.info('Initializing server ...')
127 |     server = StoppableServer((args.hostname, args.port),
128 |                              args.modeldir, args.models,
129 |                              logRequests=args.log,
130 |                              allow_none=True)
131 | 
132 |     # Register introspection functions with the server
133 |     logging.info('Registering introspection ...')
134 |     server.register_introspection_functions()
135 | 
136 |     # Start the server
137 |     logging.info('Starting server on port {}...'.format(args.port))
138 |     server.serve_forever()
139 | 
140 | 
141 | if __name__ == '__main__':
142 |     main()
143 | 


--------------------------------------------------------------------------------