├── .circleci └── config.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── conda-recipe └── python-zpar │ └── meta.yaml ├── examples ├── test.txt ├── test_tagged.txt ├── test_tokenized.txt ├── zpar_client.py └── zpar_example.py ├── setup.py ├── src ├── Makefile ├── Makefile.lib.zpar ├── reader.h └── zpar.lib.cpp ├── tests ├── test_depparser.py ├── test_depparser_no_wordnet.py ├── test_parser.py └── test_tagger.py └── zpar ├── DepParser.py ├── Parser.py ├── Tagger.py ├── __init__.py └── zpar_server.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | 4 | build: 5 | environment: 6 | ZPAR_MODEL_DIR: /root/english-models 7 | NLTK_DATA: /root/nltk/data 8 | docker: 9 | - image: buildpack-deps:trusty 10 | working_directory: ~/repo 11 | parallelism: 4 12 | steps: 13 | - checkout 14 | - restore_cache: 15 | keys: 16 | - deps-and-models 17 | - run: mkdir -p ~/repo/artifacts 18 | - run: rm -r ~/repo/artifacts 19 | - run: 20 | name: Install miniconda and dependencies 21 | command: | 22 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 23 | chmod +x miniconda.sh 24 | ./miniconda.sh -b -f 25 | ~/miniconda3/bin/conda config --add channels desilinguist 26 | ~/miniconda3/bin/conda update --yes conda 27 | ~/miniconda3/bin/conda install --yes six nose nltk 28 | mkdir -p ~/nltk/data 29 | ~/miniconda3/bin/python -m nltk.downloader wordnet -d ~/nltk/data 30 | - run: 31 | name: Download ZPar models 32 | command: | 33 | if [ ! -d ~/english-models ]; then wget https://github.com/frcchang/zpar/releases/download/v0.7.5/english-models.zip -O ~/english-models.zip; fi 34 | if [ ! -d ~/english-models ]; then unzip ~/english-models.zip -d ${HOME}; fi 35 | - run: 36 | name: Install python-zpar in editable mode 37 | command: | 38 | make python-zpar 39 | ~/miniconda3/bin/pip install -e . 40 | 41 | - save_cache: 42 | paths: 43 | - "~/miniconda3/pkgs" 44 | - "~/english-models" 45 | key: deps-and-models 46 | 47 | - run: 48 | name: Run tests 49 | command: | 50 | TESTFILES=$(circleci tests glob "tests/test_*.py" | circleci tests split) 51 | if [[ ${TESTFILES} == *_no_wordnet.py ]]; then NLTK_DATA= ; fi 52 | ~/miniconda3/bin/nosetests -v ${TESTFILES} 53 | 54 | - store_artifacts: 55 | path: ~/repo/artifacts 56 | destination: artifacts 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | zpar.egg-info 2 | dist 3 | build 4 | build.sh 5 | python_zpar.egg-info 6 | zpar/__pycache__ 7 | *.pyc 8 | test_twice.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Nitin Madnani 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include Makefile
2 | include src/*
3 | include zpar/*
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | all: python-zpar
 2 | 
 3 | clean:
 4 | 	rm -rf /tmp/zpar
 5 | 	rm -f /tmp/zpar.tar.gz
 6 | 
 7 | python-zpar: clean /tmp/zpar.tar.gz
 8 | 	tar -C /tmp/zpar -zxf /tmp/zpar.tar.gz --strip-components=1
 9 | 	cp src/zpar.lib.cpp /tmp/zpar/src/english
10 | 	cp src/Makefile.lib.zpar /tmp/zpar
11 | 	cp src/Makefile /tmp/zpar
12 | 	cp src/reader.h /tmp/zpar/src/include/reader.h
13 | 	make -C /tmp/zpar zpar.so
14 | 	mkdir -p zpar/dist
15 | 	cp /tmp/zpar/dist/zpar.so zpar/dist/
16 | 
17 | /tmp/zpar.tar.gz:
18 | 	wget -N https://github.com/frcchang/zpar/archive/v0.7.5.tar.gz -O /tmp/zpar.tar.gz
19 | 	touch $@
20 | 	mkdir /tmp/zpar
21 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | NOTE
 2 | ~~~~
 3 | This project is no longer under active development since there are now
 4 | really nice pure Python parsers such as `Stanza <https://stanfordnlp.github.io/stanza/>`__ and `spaCy <https://spacy.io>`__. The repository will remain here for archival purposes and the `PyPI <https://pypi.org/project/python-zpar/>`__ package will continue to be available.
 5 | 
 6 | Introduction
 7 | ~~~~~~~~~~~~
 8 | 
 9 | .. image:: https://circleci.com/gh/EducationalTestingService/python-zpar.svg?style=shield
10 |    :alt: CircleCI Build status
11 |    :target: https://circleci.com/gh/EducationalTestingService/python-zpar
12 | 
13 | **python-zpar** is a Python wrapper around the `ZPar
14 | parser <https://github.com/frcchang/zpar>`__.
15 | ZPar was written by `Yue Zhang <https://frcchang.github.io>`__
16 | while he was at Oxford University. According to its home page: *ZPar is
17 | a statistical natural language parser, which performs syntactic analysis
18 | tasks including word segmentation, part-of-speech tagging and parsing.
19 | ZPar supports multiple languages and multiple grammar formalisms. ZPar
20 | has been most heavily developed for Chinese and English, while it
21 | provides generic support for other languages. ZPar is fast, processing
22 | above 50 sentences per second using the standard Penn Treebank (Wall
23 | Street Journal) data.*
24 | 
25 | I wrote python-zpar since I needed a fast and efficient parser for my
26 | NLP work, which is primarily done in Python and not C++. I wanted to be
27 | able to use this parser directly from Python without having to create a
28 | bunch of files and run them through subprocesses. python-zpar not
29 | only provides a simple Python wrapper but also an XML-RPC ZPar
30 | server to make batch-processing of large files easier.
31 | 
32 | python-zpar uses
33 | `ctypes <https://docs.python.org/3/library/ctypes.html>`__, a very
34 | cool foreign function library bundled with Python that allows calling
35 | functions in C DLLs or shared libraries directly.
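For the curious, here is a rough sketch of what that looks like at the
ctypes level. The C entry points shown (``initialize``, ``load_tagger``,
and ``tag_sentence``) are the ones defined in ``src/zpar.lib.cpp``, but
the exact snippet below is illustrative only; the ``ZPar`` class shown
in the Usage section is the supported way to do this:

.. code-block:: python

    import ctypes
    import os

    # load the shared library that `make` builds into zpar/dist
    lib = ctypes.cdll.LoadLibrary(os.path.join('zpar', 'dist', 'zpar.so'))

    # declare the signatures of the C functions we want to call
    lib.initialize.restype = ctypes.c_void_p
    lib.load_tagger.restype = ctypes.c_int
    lib.load_tagger.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
    lib.tag_sentence.restype = ctypes.c_char_p
    lib.tag_sentence.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_bool]

    # create a ZPar session and load the tagger model
    # (assumes the English models live in ./english-models)
    session = lib.initialize()
    if lib.load_tagger(session, b'english-models'):
        raise RuntimeError('could not load the tagger model')

    # returns a byte string like b'I/PRP am/VBP ...'
    print(lib.tag_sentence(session, b'I am going to the market.', True))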
36 | 
37 | **IMPORTANT**: As of now, python-zpar only works with the English zpar models
38 | since the interface to the Chinese models is different from the English ones.
39 | Pull requests are welcome!
40 | 
41 | Installation
42 | ~~~~~~~~~~~~
43 | 
44 | Currently, python-zpar only works on 64-bit Linux and OS X systems.
45 | Those are the two platforms I use every day. I am happy to try to get
46 | python-zpar working on other platforms over time. Pull requests are
47 | welcome!
48 | 
49 | Please make sure that ``make`` and ``wget`` are installed, as they are both needed to properly build python-zpar.
50 | 
51 | In order for python-zpar to work, it requires C functions that can be
52 | called directly. Since the only user-exposed entry point in ZPar is the
53 | command line client, I needed to write a shared library that would have
54 | functions built on top of the ZPar functionality but expose them in a
55 | way that ctypes could understand.
56 | 
57 | Therefore, in order to build python-zpar from scratch, we need to
58 | download the ZPar source, patch it with new functionality and compile
59 | the shared library. All of this happens automatically when you install
60 | with pip:
61 | 
62 | .. code-block:: bash
63 | 
64 |     pip install python-zpar
65 | 
66 | 
67 | IF YOU ARE USING macOS
68 | ======================
69 | 
70 | 1. On macOS, the installation will only work with ``gcc`` installed using either `macports <https://www.macports.org>`__ or `homebrew <https://brew.sh>`__. The zpar source cannot be compiled with ``clang``. If you are having trouble compiling the code after cloning the repository or installing the package using pip, you can try to explicitly override the C++ compiler:
71 | 
72 | .. code-block:: bash
73 | 
74 |     CXX=<path to g++> make -e
75 | 
76 | or
77 | 
78 | .. code-block:: bash
79 | 
80 |     CXX=<path to g++> pip install python-zpar
81 | 
82 | 
83 | If you are curious about what the C functions in the shared library
84 | module look like, see ``src/zpar.lib.cpp``.
85 | 
86 | 2. If you are using macOS Mojave, you will need an extra step before running the ``pip`` install command above. Starting with Mojave, Apple has stopped installing the C/C++ system header files into ``/usr/include``. As a workaround, they have provided the package ``/Library/Developer/CommandLineTools/Packages/macOS_SDK_headers_for_macOS_10.14.pkg`` that you must install to get the system headers back in the usual place before python-zpar can be compiled. For more details, please read the Command Line Tools section of the `Xcode 10 release notes <https://developer.apple.com/documentation/xcode_release_notes/xcode_10_release_notes>`__.
87 | 
88 | 3. If you are using macOS Catalina, python-zpar is currently `broken `__. I have not yet upgraded to Catalina on my production machine and so have not been able to figure out a fix yet. If you have a suggested fix, please reply in the issue.
89 | 
90 | Usage
91 | ~~~~~
92 | 
93 | To use python-zpar, you need the English models for ZPar. They can be
94 | downloaded from the ZPar release page `here <https://github.com/frcchang/zpar/releases>`__.
95 | There are three models: a part-of-speech tagger, a constituency parser, and a
96 | dependency parser. For the purpose of the examples below, the models are
97 | assumed to be in the ``english-models`` directory under the current directory.
98 | 
99 | Here's a small example of how to use python-zpar:
100 | 
101 | .. code-block:: python
102 | 
103 |     from six import print_
104 |     from zpar import ZPar
105 | 
106 |     # use the zpar wrapper as a context manager
107 |     with ZPar('english-models') as z:
108 | 
109 |         # get the tagger and the dependency parser models
110 |         tagger = z.get_tagger()
111 |         depparser = z.get_depparser()
112 | 
113 |         # tag a sentence
114 |         tagged_sent = tagger.tag_sentence("I am going to the market.")
115 |         print_(tagged_sent)
116 | 
117 |         # tag an already tokenized sentence
118 |         tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?", tokenize=False)
119 |         print_(tagged_sent)
120 | 
121 |         # get the dependency parse of an already tagged sentence
122 |         dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
123 |         print_(dep_parsed_sent)
124 | 
125 |         # get the dependency parse of an already tokenized sentence
126 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False)
127 |         print_(dep_parsed_sent)
128 | 
129 |         # get the dependency parse of an already tokenized sentence
130 |         # and include lemma information (assuming you have NLTK as well
131 |         # as its WordNet corpus installed)
132 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False, with_lemmas=True)
133 |         print_(dep_parsed_sent)
134 | 
135 | 
136 | The above code sample produces the following output:
137 | 
138 | .. code-block::
139 | 
140 |     I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.
141 | 
142 |     Do/VBP n't/RB you/PRP want/VBP to/TO come/VB with/IN me/PRP to/TO the/DT market/NN ?/.
143 | 
144 |     I PRP 1 SUB
145 |     am VBP -1 ROOT
146 |     going VBG 1 VC
147 |     to TO 2 VMOD
148 |     the DT 5 NMOD
149 |     market NN 3 PMOD
150 |     . . 1 P
151 | 
152 |     Do VBP -1 ROOT
153 |     n't RB 0 VMOD
154 |     you PRP 0 SUB
155 |     want VBP 0 VMOD
156 |     to TO 5 VMOD
157 |     come VB 3 VMOD
158 |     with IN 5 VMOD
159 |     me PRP 6 PMOD
160 |     to TO 5 VMOD
161 |     the DT 10 NMOD
162 |     market NN 8 PMOD
163 |     ? . 0 P
164 | 
165 |     Do VBP -1 ROOT do
166 |     n't RB 0 VMOD n't
167 |     you PRP 0 SUB you
168 |     want VBP 0 VMOD want
169 |     to TO 5 VMOD to
170 |     come VB 3 VMOD come
171 |     with IN 5 VMOD with
172 |     me PRP 6 PMOD me
173 |     to TO 5 VMOD to
174 |     the DT 10 NMOD the
175 |     market NN 8 PMOD market
176 |     ? . 0 P ?
177 | 
178 | 
179 | Detailed usage with comments is shown in the included file
180 | ``examples/zpar_example.py``. Run ``python zpar_example.py -h`` to see a
181 | list of all available options.
182 | 
183 | ZPar Server
184 | ~~~~~~~~~~~
185 | 
186 | The package also provides a Python XML-RPC implementation of a ZPar
187 | server that makes it easier to process multiple sentences and files by
188 | loading the models just once (via the ctypes interface) and allowing
189 | clients to connect and request analyses. The implementation is in the
190 | executable ``zpar_server`` that is installed when you install the
191 | package. The server is quite flexible and allows loading only the
192 | models that you need. Here's an example of how to start the server
193 | with the tagger, constituency parser, and dependency parser models loaded:
194 | 
195 | .. code-block::
196 | 
197 |     $> zpar_server --modeldir english-models --models tagger parser depparser
198 |     INFO:Initializing server ...
199 |     Loading tagger from english-models/tagger
200 |     Loading model... done.
201 |     Loading constituency parser from english-models/conparser
202 |     Loading scores... done. (65.9334s)
203 |     Loading dependency parser from english-models/depparser
204 |     Loading scores... done. (14.9623s)
205 |     INFO:Registering introspection ...
206 |     INFO:Starting server on port 8859...
207 | 
208 | Run ``zpar_server -h`` to see a list of all options.
209 | 
210 | Once the server is running, you can connect to it using a client. An
211 | example client is included in the file ``examples/zpar_client.py`` which
212 | can be run as follows (note that if you specified a custom host and port
213 | when running the server, you'd need to specify the same here):
214 | 
215 | .. code-block::
216 | 
217 |     $> cd examples
218 |     $> python zpar_client.py
219 | 
220 |     INFO:Attempting connection to http://localhost:8859
221 |     INFO:Tagging "Don't you want to come with me to the market?"
222 |     INFO:Output: Do/VBP n't/RB you/PRP want/VBP to/TO come/VB with/IN me/PRP to/TO the/DT market/NN ?/.
223 |     INFO:Tagging "Do n't you want to come to the market with me ?"
224 |     INFO:Output: Do/VBP n't/RB you/PRP want/VBP to/TO come/VB to/TO the/DT market/NN with/IN me/PRP ?/.
225 |     INFO:Parsing "Don't you want to come with me to the market?"
226 |     INFO:Output: (SQ (VBP Do) (RB n't) (NP (PRP you)) (VP (VBP want) (S (VP (TO to) (VP (VB come) (PP (IN with) (NP (PRP me))) (PP (TO to) (NP (DT the) (NN market))))))) (. ?))
227 |     INFO:Dep Parsing "Do n't you want to come to the market with me ?"
228 |     INFO:Output: Do VBP -1 ROOT
229 |     n't RB 0 VMOD
230 |     you PRP 0 SUB
231 |     want VBP 0 VMOD
232 |     to TO 5 VMOD
233 |     come VB 3 VMOD
234 |     to TO 5 VMOD
235 |     the DT 8 NMOD
236 |     market NN 6 PMOD
237 |     with IN 5 VMOD
238 |     me PRP 9 PMOD
239 |     ? . 0 P
240 | 
241 |     INFO:Tagging file /Users/nmadnani/work/python-zpar/examples/test.txt into test.tag
242 |     INFO:Parsing file /Users/nmadnani/work/python-zpar/examples/test_tokenized.txt into test.parse
243 | 
244 | 
245 | Note that python-zpar and all of the example scripts should work with
246 | both Python 2.7 and Python 3.4. I have tested python-zpar on both Linux
247 | and Mac but not on Windows.
248 | 
249 | Node.js version
250 | ~~~~~~~~~~~~~~~
251 | 
252 | If you want to use ZPar in your node.js app, check out my other project
253 | `node-zpar `__.
254 | 
255 | License
256 | ~~~~~~~
257 | 
258 | Although python-zpar is licensed under the MIT license - which means
259 | that you can do whatever you want with the wrapper code - ZPar itself is
260 | licensed under GPL v3.
261 | 
262 | ToDo
263 | ~~~~
264 | 
265 | 1. Improve error handling on both the Python and C side.
266 | 2. Expose more functionality, e.g., Chinese word segmentation, parsing,
267 |    etc.
268 | 3. Maybe look into using `CFFI <https://cffi.readthedocs.io>`__
269 |    instead of ctypes.
270 | 
--------------------------------------------------------------------------------
/conda-recipe/python-zpar/meta.yaml:
--------------------------------------------------------------------------------
 1 | package:
 2 |   name: python-zpar
 3 |   version: "0.9.5"
 4 | 
 5 | source:
 6 |   path: ../../../python-zpar
 7 | 
 8 | build:
 9 |   number: {{environ.get('BINSTAR_BUILD', 0)}}
10 |   script:
11 |     - cd $SRC_DIR
12 |     - "{{ PYTHON }} setup.py install"
13 | 
14 | requirements:
15 |   build:
16 |     - python
17 |     - setuptools
18 |   run:
19 |     - python
20 |     - six
21 | 
22 | about:
23 |   home: https://github.com/EducationalTestingService/python-zpar
24 |   license: MIT
--------------------------------------------------------------------------------
/examples/test.txt:
--------------------------------------------------------------------------------
1 | I am going to the market.
2 | Are you going to come with me?
--------------------------------------------------------------------------------
/examples/test_tagged.txt:
--------------------------------------------------------------------------------
1 | I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.
2 | Are/VBP you/PRP going/VBG to/TO come/VB with/IN me/PRP ?/.
--------------------------------------------------------------------------------
/examples/test_tokenized.txt:
--------------------------------------------------------------------------------
1 | I am going to the market .
2 | Are you going to come with me ?
--------------------------------------------------------------------------------
/examples/zpar_client.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import logging
 5 | import os
 6 | import six
 7 | import socket
 8 | import sys
 9 | 
10 | if __name__ == '__main__':
11 | 
12 |     # set up an argument parser
13 |     parser = argparse.ArgumentParser(prog='zpar_client.py')
14 | 
15 |     # add the command line options; logging is configured
16 |     # below, after the arguments have been parsed
17 | 
18 |     parser.add_argument('--host', dest='hostname',
19 |                         help="Hostname or IP address",
20 |                         default="localhost",
21 |                         required=False)
22 | 
23 |     parser.add_argument('--port', dest='port', type=int,
24 |                         help="Port number",
25 |                         default=8859,
26 |                         required=False)
27 | 
28 |     # parse given command line arguments
29 |     args = parser.parse_args()
30 | 
31 |     # set up the logging
32 |     logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
33 | 
34 |     # instantiate the client appropriately and connect
35 |     logging.info('Attempting connection to http://{}:{}'.format(args.hostname,
36 |                                                                 args.port))
37 |     if six.PY2:
38 |         import xmlrpclib
39 |         proxy = xmlrpclib.ServerProxy('http://{}:{}'.format(args.hostname,
40 |                                                             args.port),
41 |                                       allow_none=True)
42 |         fault = xmlrpclib.Fault
43 |     else:
44 |         import xmlrpc.client
45 |         proxy = xmlrpc.client.ServerProxy('http://{}:{}'.format(args.hostname,
46 |                                                                 args.port),
47 |                                           use_builtin_types=True,
48 |                                           allow_none=True)
49 |         fault = xmlrpc.client.Fault
50 | 
51 |     # Make the remote procedure calls on the server
52 |     try:
53 | 
54 |         test_sentence = "Don't you want to come with me to the market?"
55 |         test_tagged_sentence = "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./."
56 |         test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test.txt')
57 |         tokenized_test_sentence = "Do n't you want to come to the market with me ?"
58 | tokenized_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_tokenized.txt') 59 | tagged_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_tagged.txt') 60 | tag_outfile = os.path.join(os.getcwd(), 'test.tag') 61 | parse_outfile = os.path.join(os.getcwd(), 'test.parse') 62 | parse_outfile2 = os.path.join(os.getcwd(), 'test_tagged.parse') 63 | 64 | logging.info('Tagging "{}"'.format(test_sentence)) 65 | tagged_sent = proxy.tag_sentence(test_sentence) 66 | logging.info("Output: {}".format(tagged_sent)) 67 | 68 | logging.info('Tagging "{}"'.format(tokenized_test_sentence)) 69 | tagged_sent = proxy.tag_sentence(tokenized_test_sentence, False) 70 | logging.info("Output: {}".format(tagged_sent)) 71 | 72 | logging.info('Parsing "{}"'.format(test_sentence)) 73 | parsed_sent = proxy.parse_sentence(test_sentence) 74 | logging.info("Output: {}".format(parsed_sent)) 75 | 76 | logging.info('Parsing "{}"'.format(test_tagged_sentence)) 77 | parsed_sent = proxy.parse_tagged_sentence(test_tagged_sentence) 78 | logging.info("Output: {}".format(parsed_sent)) 79 | 80 | logging.info('Dep Parsing "{}"'.format(tokenized_test_sentence)) 81 | parsed_sent = proxy.dep_parse_sentence(tokenized_test_sentence, False) 82 | logging.info("Output: {}".format(parsed_sent)) 83 | 84 | logging.info('Tagging file {} into {}'.format(test_file, tag_outfile)) 85 | proxy.tag_file(test_file, tag_outfile) 86 | 87 | logging.info('Parsing file {} into {}'.format(tokenized_test_file, parse_outfile)) 88 | proxy.parse_file(tokenized_test_file, parse_outfile, False) 89 | 90 | logging.info('Parsing tagged file {} into {}'.format(tagged_test_file, parse_outfile2)) 91 | proxy.parse_tagged_file(tagged_test_file, parse_outfile2) 92 | 93 | except socket.error as err: 94 | sys.stderr.write("{}\n".format(err)) 95 | sys.exit(1) 96 | except fault as flt: 97 | sys.stderr.write("Fault {}: {}\n".format(flt.faultCode, 98 | flt.faultString)) 99 | sys.exit(1) 100 | 101 | # Stop the server 102 | # NOTE: You will probably do this in the last client (if you know 103 | # which one that is) or in a clean-up script when you are absolutely sure 104 | # that all clients are finished. 
105 |     proxy.stop_server()
106 | 
107 | 
--------------------------------------------------------------------------------
/examples/zpar_example.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import argparse
 4 | import os
 5 | from six import print_
 6 | 
 7 | from zpar import ZPar
 8 | 
 9 | def main():
10 |     # set up an argument parser
11 |     parser = argparse.ArgumentParser(prog='zpar_example.py')
12 |     parser.add_argument('--modeldir', dest='modeldir',
13 |                         help="Path to directory containing zpar English models",
14 |                         required=True)
15 | 
16 |     # parse given command line arguments
17 |     args = parser.parse_args()
18 | 
19 |     # use the zpar wrapper as a context manager
20 |     with ZPar(args.modeldir) as z:
21 | 
22 |         # get the tagger and the dependency parser models
23 |         tagger = z.get_tagger()
24 |         depparser = z.get_depparser()
25 | 
26 |         # tag a sentence
27 |         tagged_sent = tagger.tag_sentence("I am going to the market.")
28 |         print_(tagged_sent)
29 | 
30 |         # tag an already tokenized sentence
31 |         tagged_sent = tagger.tag_sentence("Do n't you want to come with me to the market ?", tokenize=False)
32 |         print_(tagged_sent)
33 | 
34 |         # get the dependency parse of an already tagged sentence
35 |         dep_parsed_sent = depparser.dep_parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")
36 |         print_(dep_parsed_sent)
37 | 
38 |         # get the dependency parse of an already tokenized sentence
39 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False)
40 |         print_(dep_parsed_sent)
41 | 
42 |         # get the dependency parse of an already tokenized sentence
43 |         # and include lemma information (assuming you have NLTK as well
44 |         # as its WordNet corpus installed)
45 |         dep_parsed_sent = depparser.dep_parse_sentence("Do n't you want to come with me to the market ?", tokenize=False, with_lemmas=True)
46 |         print_(dep_parsed_sent)
47 | 
48 |         # compute POS tags for all sentences in "test.txt"
49 |         # and write the output to "test.tag". Note that the
50 |         # file contains a single sentence per line.
51 | # The sentences need not be word tokenized 52 | test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test.txt') 53 | tagger.tag_file(test_file, "test.tag") 54 | 55 | # compute dependency parses for all sentences in "test_tokenized.txt" 56 | tokenized_test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'test_tokenized.txt') 57 | depparser.dep_parse_file(tokenized_test_file, "test.dep") 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Adapted from https://github.com/Turbo87/py-xcsoar/blob/master/setup.py 4 | 5 | import os 6 | from setuptools import setup 7 | from setuptools.command.install import install 8 | from distutils.command.build import build 9 | from subprocess import call 10 | 11 | import sys 12 | 13 | BASEPATH = os.path.dirname(os.path.abspath(__file__)) 14 | ZPAR_PATH = os.path.join(BASEPATH, 'zpar') 15 | ZPAR_LIB_PATH = os.path.join(ZPAR_PATH, 'dist') 16 | 17 | def readme(): 18 | with open('README.rst') as f: 19 | return f.read() 20 | 21 | class build_zpar(build): 22 | def run(self): 23 | 24 | # run original build code 25 | build.run(self) 26 | 27 | # get a copy of the user environment 28 | env = os.environ.copy() 29 | 30 | sys.stderr.write('running build_zpar\n') 31 | 32 | # for now the compilation is just calling make 33 | # with the option to override the CXX defined 34 | # in the zpar Makefile with the CXX environment 35 | # variable if defined. 36 | if os.environ.get('CXX'): 37 | cmd = ['make', '-e'] 38 | env['CXX'] = os.environ.get('CXX') 39 | else: 40 | cmd = ['make'] 41 | 42 | # compile the shared library path 43 | def compile(): 44 | sys.stderr.write('*' * 80 + '\n') 45 | ret = call(cmd, env=env) 46 | # if something went wrong, raise an error 47 | if ret: 48 | raise RuntimeError('ZPar shared library compilation failed') 49 | sys.stderr.write('*' * 80 + '\n') 50 | self.execute(compile, [], 'compiling zpar library') 51 | 52 | # copy resulting tool to library build folder 53 | self.mkpath(self.build_lib) 54 | 55 | if not self.dry_run: 56 | self.copy_tree(ZPAR_PATH, self.build_lib) 57 | 58 | class install_zpar(install): 59 | 60 | def initialize_options(self): 61 | install.initialize_options(self) 62 | self.build_scripts = None 63 | 64 | def finalize_options(self): 65 | install.finalize_options(self) 66 | self.set_undefined_options('build', ('build_scripts', 'build_scripts')) 67 | 68 | def run(self): 69 | # run original install code 70 | install.run(self) 71 | 72 | # install ZPar executables 73 | sys.stderr.write('running install_zpar\n') 74 | install_path = os.path.join(self.install_lib, 'zpar') 75 | self.mkpath(install_path) 76 | self.copy_tree(self.build_lib, install_path) 77 | 78 | 79 | def read(fname): 80 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 81 | 82 | 83 | setup( 84 | name='python-zpar', 85 | version='0.9.5', 86 | description='A Wrapper around the ZPar statistical tagger/parser for English', 87 | maintainer='Nitin Madnani', 88 | maintainer_email='nmadnani@ets.org', 89 | license='MIT', 90 | url='http://www.github.com/EducationalTestingService/python-zpar', 91 | long_description=readme(), 92 | classifiers=['Intended Audience :: Science/Research', 93 | 'Intended Audience :: Developers', 94 | 'License :: OSI Approved :: MIT License', 95 | 'Programming Language :: Python', 96 | 'Topic :: Software 
Development',
 97 |                  'Topic :: Scientific/Engineering',
 98 |                  'Operating System :: POSIX',
 99 |                  'Operating System :: Unix',
100 |                  'Operating System :: MacOS',
101 |                  'Programming Language :: Python :: 2',
102 |                  'Programming Language :: Python :: 2.7',
103 |                  'Programming Language :: Python :: 3',
104 |                  'Programming Language :: Python :: 3.3',
105 |                  ],
106 |     cmdclass={
107 |         'build': build_zpar,
108 |         'install': install_zpar,
109 |     },
110 |     entry_points={'console_scripts':
111 |                   ['zpar_server = zpar.zpar_server:main']}
112 | )
113 | 
--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | #****************************************************************
 2 | #
 3 | # Makefile
 4 | #
 5 | # Yue Zhang
 6 | # Computing lab, Oxford. 2006.10 - 2008.2
 7 | #
 8 | #****************************************************************
 9 | 
10 | # Makeable target systems include:
11 | #
12 | # === Chinese ===
13 | # segmentor - Chinese word segmentor
14 | # chinese.postagger - Chinese POS tagger (joint / single)
15 | # chinese.depparser - Chinese dependency parser
16 | #
17 | # === English ===
18 | # english.postagger - English POS tagger
19 | # english.depparser - English dependency parser
20 | 
21 | #================================================================
22 | #
23 | # Configurations
24 | #
25 | #================================================================
26 | 
27 | # the generic tagger
28 | TAGGER_IMPL = collins
29 | 
30 | # the generic depparser
31 | DEPPARSER_IMPL = arceager
32 | 
33 | # the generic conparser
34 | CONPARSER_IMPL = srnew
35 | 
36 | # choose between agenda, agendachart etc ## NO SPACE AFTER NAME ###
37 | #
38 | # agenda: the single agenda method - reproduce paper
39 | # agendaplus: try to improve upon the decoding algorithm of agenda
40 | # viterbi: dynamic programming
41 | SEGMENTOR_IMPL = agenda
42 | 
43 | # Chinese postagger implementations
44 | #
45 | # joint taggers include the following implementations
46 | # agendachart: combining agenda and chart, this is the best system - reproduce paper
47 | #
48 | # taggers on segmented sentences include the following implementations
49 | # segmented: the unidirectional trigram tagger
50 | CHINESE_TAGGER_IMPL = agenda
51 | 
52 | # Chinese dependency parser
53 | #
54 | # currently support eisner, covington, nivre, combined and joint implementations
55 | CHINESE_DEPPARSER_IMPL = arceager
56 | CHINESE_DEPPARSER_LABELED = true
57 | CHINESE_DEPLABELER_IMPL = naive
58 | 
59 | # currently support sr implementations
60 | CHINESE_CONPARSER_IMPL = acl13
61 | CHINESE_CONPARSER_JOINT_OR_CASCADE = JOINT_CONPARSER
62 | 
63 | # currently support only agenda
64 | ENGLISH_TAGGER_IMPL = collins
65 | 
66 | # currently support eisner, covington, nivre, combined implementations
67 | ENGLISH_DEPPARSER_IMPL = arceager
68 | ENGLISH_DEPPARSER_LABELED = true
69 | ENGLISH_DEPLABELER_IMPL = naive
70 | 
71 | # currently support sr implementations
72 | ENGLISH_CONPARSER_IMPL = muhua
73 | 
74 | # Spanish pos tagger
75 | SPANISH_TAGGER_IMPL = collins
76 | 
77 | # Spanish dependency parser
78 | SPANISH_DEPPARSER_IMPL = arceager
79 | SPANISH_DEPPARSER_LABELED = true
80 | SPANISH_DEPLABELER_IMPL = naive
81 | 
82 | # Spanish annotation. 
Supported: ES06_DEPENDENCIES, ES09_DEPENDENCIES 83 | SPANISH_ANNOTATION = ES09_DEPENDENCIES 84 | 85 | #no Spanish constituency parser at the moment 86 | 87 | #================================================================ 88 | # 89 | # Debug mode or the run mode (empty) 90 | # 91 | #================================================================ 92 | 93 | #DEBUG = -DDEBUG -g 94 | DEBUG = -DNDEBUG 95 | 96 | #================================================================ 97 | # 98 | # directory configurations 99 | # 100 | #================================================================ 101 | 102 | BASE_DIR = . 103 | include Makefile.common 104 | 105 | #================================================================ 106 | # 107 | # cross platform configurations 108 | # 109 | #================================================================ 110 | 111 | ifeq ($(OS),Windows_NT) 112 | #use good old GNU mkdir instead of MSDOS mkdir on Windows 113 | MKDIR=gmkdir -p 114 | else 115 | MKDIR=mkdir -p 116 | endif 117 | 118 | #================================================================ 119 | # 120 | # compiler commands 121 | # 122 | #================================================================ 123 | 124 | INCLUDES = -I$(SRC_INCLUDES) 125 | 126 | CXX = g++ 127 | CXXFLAGS = -w -W -O3 $(INCLUDES) $(DEBUG) -fPIC 128 | 129 | LD=$(CXX) 130 | LDFLAGS = 131 | 132 | #================================================================ 133 | # 134 | # Shared objects 135 | # 136 | #================================================================ 137 | 138 | # the objects 139 | LINGUISTICS_OBJECTS = $(OBJECT_DIR)/linguistics/lemma.o $(OBJECT_DIR)/linguistics/conll.o 140 | LEARNING_OBJECTS = $(OBJECT_DIR)/learning/dbn.o 141 | OBJECTS = $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(LINGUISTICS_OBJECTS) $(LEARNING_OBJECTS) 142 | 143 | $(OBJECT_DIR)/%.o: $(SRC_LIBS)/%.cpp $(SRC_INCLUDES)/%.h 144 | $(MKDIR) $(OBJECT_DIR) 145 | $(MKDIR) $(OBJECT_DIR)/linguistics 146 | $(MKDIR) $(OBJECT_DIR)/learning 147 | $(CXX) $(CXXFLAGS) -c $< -o $@ 148 | 149 | all: zpar 150 | 151 | # the directories 152 | $(OBJECT_DIR): 153 | $(MKDIR) $(OBJECT_DIR) 154 | $(DIST_DIR): 155 | $(MKDIR) $(DIST_DIR) 156 | 157 | # tagger 158 | SRC_TAGGER = $(SRC_CHINESE)/tagger 159 | DIST_TAGGER = $(DIST_DIR)/chinese.postagger 160 | OBJECT_TAGGER = $(OBJECT_DIR)/chinese.postagger 161 | $(DIST_TAGGER): 162 | $(MKDIR) $(DIST_TAGGER) 163 | $(OBJECT_TAGGER): 164 | $(MKDIR) $(OBJECT_TAGGER) 165 | 166 | SRC_ENGLISH_TAGGER = $(SRC_COMMON)/tagger 167 | DIST_ENGLISH_TAGGER = $(DIST_DIR)/english.postagger 168 | OBJECT_ENGLISH_TAGGER = $(OBJECT_DIR)/english.postagger 169 | $(DIST_ENGLISH_TAGGER): 170 | $(MKDIR) $(DIST_ENGLISH_TAGGER) 171 | $(OBJECT_ENGLISH_TAGGER): 172 | $(MKDIR) $(OBJECT_ENGLISH_TAGGER) 173 | 174 | SRC_SPANISH_TAGGER = $(SRC_COMMON)/tagger 175 | DIST_SPANISH_TAGGER = $(DIST_DIR)/spanish.postagger 176 | OBJECT_SPANISH_TAGGER = $(OBJECT_DIR)/spanish.postagger 177 | $(DIST_SPANISH_TAGGER): 178 | $(MKDIR) $(DIST_SPANISH_TAGGER) 179 | $(OBJECT_SPANISH_TAGGER): 180 | $(MKDIR) $(OBJECT_SPANISH_TAGGER) 181 | 182 | # depparser 183 | SRC_COMMON_DEPPARSER = $(SRC_COMMON)/depparser 184 | #ifeq ($(CHINESE_DEPPARSER_IMPL), joint) 185 | # SRC_CHINESE_DEPPARSER = $(SRC_CHINESE)/depparser 186 | #else 187 | # SRC_CHINESE_DEPPARSER = $(SRC_COMMON_DEPPARSER) 188 | #endif 189 | SRC_CHINESE_DEPPARSER = $(SRC_COMMON_DEPPARSER) 190 | DIST_DEPPARSER = $(DIST_DIR)/chinese.depparser 191 | OBJECT_DEPPARSER = $(OBJECT_DIR)/chinese.depparser 
192 | DIST_ENGLISH_DEPPARSER = $(DIST_DIR)/english.depparser 193 | OBJECT_ENGLISH_DEPPARSER = $(OBJECT_DIR)/english.depparser 194 | DIST_SPANISH_DEPPARSER = $(DIST_DIR)/spanish.depparser 195 | OBJECT_SPANISH_DEPPARSER = $(OBJECT_DIR)/spanish.depparser 196 | 197 | # deplabeler 198 | SRC_COMMON_DEPLABELER = $(SRC_COMMON)/deplabeler 199 | SRC_CHINESE_DEPLABELER = $(SRC_COMMON_DEPLABELER) 200 | DIST_DEPLABELER = $(DIST_DIR)/chinese.deplabeler 201 | OBJECT_DEPLABELER = $(OBJECT_DIR)/chinese.deplabeler 202 | SRC_ENGLISH_DEPLABELER = $(SRC_COMMON_DEPLABELER) 203 | DIST_ENGLISH_DEPLABELER = $(DIST_DIR)/english.deplabeler 204 | OBJECT_ENGLISH_DEPLABELER = $(OBJECT_DIR)/english.deplabeler 205 | SRC_SPANISH_DEPLABELER = $(SRC_COMMON_DEPLABELER) 206 | DIST_SPANISH_DEPLABELER = $(DIST_DIR)/spanish.deplabeler 207 | OBJECT_SPANISH_DEPLABELER = $(OBJECT_DIR)/spanish.deplabeler 208 | 209 | # conparser 210 | SRC_COMMON_CONPARSER = $(SRC_COMMON)/conparser 211 | SRC_CHINESE_CONPARSER = $(SRC_COMMON_CONPARSER) 212 | ifeq ($(CHINESE_CONPARSER_IMPL), jcad) 213 | SRC_CHINESE_CONPARSER = $(SRC_CHINESE)/conparser 214 | else 215 | ifeq ($(CHINESE_CONPARSER_IMPL), acl13) 216 | SRC_CHINESE_CONPARSER = $(SRC_CHINESE)/conparser 217 | else 218 | SRC_CHINESE_CONPARSER = $(SRC_COMMON_CONPARSER) 219 | endif 220 | endif 221 | SRC_ENGLISH_CONPARSER = $(SRC_COMMON_CONPARSER) 222 | DIST_CONPARSER = $(DIST_DIR)/chinese.conparser 223 | OBJECT_CONPARSER = $(OBJECT_DIR)/chinese.conparser 224 | DIST_ENGLISH_CONPARSER = $(DIST_DIR)/english.conparser 225 | OBJECT_ENGLISH_CONPARSER = $(OBJECT_DIR)/english.conparser 226 | 227 | #---------------------------------------------------------------- 228 | # 229 | # zpar general 230 | # 231 | #---------------------------------------------------------------- 232 | 233 | 234 | ifeq ($(CHINESE_CONPARSER_IMPL), jcad) 235 | OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o $(OBJECT_CONPARSER)/jointconstituent.o 236 | else 237 | ifeq ($(CHINESE_CONPARSER_IMPL), acl13) 238 | OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o $(OBJECT_CONPARSER)/jointconstituent.o 239 | else 240 | OBJ_CHINESE_CONSTITUENT = $(OBJECT_CONPARSER)/constituent.o 241 | endif 242 | endif 243 | 244 | $(DIST_CONPARSER): 245 | $(MKDIR) $(DIST_CONPARSER) 246 | $(OBJECT_CONPARSER): 247 | $(MKDIR) $(OBJECT_CONPARSER) 248 | 249 | $(DIST_DEPLABELER): 250 | $(MKDIR) $(DIST_DEPLABELER) 251 | $(OBJECT_DEPLABELER): 252 | $(MKDIR) $(OBJECT_DEPLABELER) 253 | 254 | # the flags for train 255 | ifeq ($(CHINESE_TAGGER_IMPL), segmented) # if segmented 256 | TAGGER_TRAIN_FLAGS = -DSEGMENTED 257 | TAGGER_TEST_FLAGS = -DSEGMENTED 258 | else 259 | ifeq ($(CHINESE_TAGGER_IMPL), bidirectional) # else if bidirectional 260 | TAGGER_TRAIN_FLAGS = -DSEGMENTED -DAUTO 261 | TAGGER_TEST_FLAGS = -DSEGMENTED 262 | endif 263 | endif 264 | 265 | 266 | ifeq ($(CHINESE_DEPPARSER_LABELED), true) 267 | CHINESE_DEPPARSER_D = -DLABELED 268 | endif 269 | 270 | ifeq ($(ENGLISH_DEPPARSER_LABELED), true) 271 | ENGLISH_DEPPARSER_D = -DLABELED 272 | endif 273 | 274 | ifeq ($(CHINESE_DEPPARSER_IMPL), combined) 275 | CHINESE_DEPPARSER_D := $(CHINESE_DEPPARSER_D) -DCOMBINED 276 | CHINESE_DEPPARSER_IMPL = nivre 277 | endif 278 | 279 | ifeq ($(ENGLISH_DEPPARSER_IMPL), combined) 280 | ENGLISH_DEPPARSER_D := $(ENGLISH_DEPPARSER_D) -DCOMBINED 281 | ENGLISH_DEPPARSER_IMPL = nivre 282 | endif 283 | 284 | #==================================================== 285 | 286 | $(DIST_DEPPARSER): 287 | $(MKDIR) $(DIST_DEPPARSER) 288 | $(OBJECT_DEPPARSER): 289 | $(MKDIR) 
$(OBJECT_DEPPARSER)
290 | 
291 | SRC_SEGMENTOR = $(SRC_CHINESE)/segmentor
292 | DIST_SEGMENTOR = $(DIST_DIR)/segmentor
293 | OBJECT_SEGMENTOR = $(OBJECT_DIR)/segmentor
294 | $(DIST_SEGMENTOR):
295 | 	$(MKDIR) $(DIST_SEGMENTOR)
296 | $(OBJECT_SEGMENTOR):
297 | 	$(MKDIR) $(OBJECT_SEGMENTOR)
298 | 
299 | include Makefile.zpar.zh
300 | include Makefile.zpar.en
301 | include Makefile.zpar.ge
302 | include Makefile.zpar.es
303 | include Makefile.zpar.mvt
304 | include Makefile.lib.zpar
305 | 
306 | zpar: zpar.ge
307 | 
308 | #----------------------------------------------------------------
309 | #
310 | # The sentence boundary detector
311 | #
312 | #----------------------------------------------------------------
313 | 
314 | include Makefile.doc2snt
315 | 
316 | #----------------------------------------------------------------
317 | #
318 | # The ccgparser
319 | #
320 | #----------------------------------------------------------------
321 | 
322 | include Makefile.ccg
323 | 
324 | #----------------------------------------------------------------
325 | #
326 | # Miscellaneous
327 | #
328 | #----------------------------------------------------------------
329 | 
330 | include Makefile.misc
331 | #include Makefile.rr
332 | 
--------------------------------------------------------------------------------
/src/Makefile.lib.zpar:
--------------------------------------------------------------------------------
 1 | ifeq ($(ENGLISH_DEPPARSER_LABELED), true)
 2 | 	ENGLISH_DEPPARSER_D = -DLABELED
 3 | endif
 4 | 
 5 | ifeq ($(ENGLISH_DEPPARSER_IMPL), combined)
 6 | 	ENGLISH_DEPPARSER_D := $(ENGLISH_DEPPARSER_D) -DCOMBINED
 7 | 	ENGLISH_DEPPARSER_IMPL = nivre
 8 | endif
 9 | 
10 | zpar.so: $(OBJECT_DIR) $(DIST_DIR) $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS)
11 | 	$(CXX) $(CXXFLAGS) -DTARGET_LANGUAGE=english $(ENGLISH_DEPPARSER_D) -I$(SRC_ENGLISH) -I$(SRC_ENGLISH_TAGGER) -I$(SRC_ENGLISH_TAGGER)/implementations/$(ENGLISH_TAGGER_IMPL) -I$(SRC_ENGLISH_CONPARSER) -I$(SRC_ENGLISH_CONPARSER)/implementations/$(ENGLISH_CONPARSER_IMPL) -I$(SRC_COMMON_DEPPARSER) -I$(SRC_COMMON_DEPPARSER)/implementations/$(ENGLISH_DEPPARSER_IMPL) -I$(SRC_COMMON_DEPLABELER) -I$(SRC_COMMON_DEPLABELER)/implementations/$(ENGLISH_DEPLABELER_IMPL) -c $(SRC_ENGLISH)/zpar.lib.cpp -o $(OBJECT_DIR)/zpar.lib.o
12 | 	$(CXX) -shared $(OBJECT_DIR)/zpar.lib.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS) -o $(DIST_DIR)/zpar.so
13 | 	@echo zpar.so compiled successfully into $(DIST_DIR).
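# NOTE: the zpar.so rule above is not run from this directory directly;
# the top-level python-zpar Makefile copies this file into a freshly
# unpacked ZPar v0.7.5 source tree under /tmp/zpar and then runs
# `make -C /tmp/zpar zpar.so`, so variables such as $(OBJECT_DIR) and
# $(SRC_ENGLISH) are defined by the surrounding ZPar makefiles
# (src/Makefile and Makefile.common). The zpar.exe rule below links the
# same objects into a standalone position-independent executable, which
# is presumably useful for exercising zpar.lib.cpp outside of Python.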
14 | 
15 | zpar.exe: $(OBJECT_DIR) $(DIST_DIR) $(OBJECT_DIR)/reader.o $(OBJECT_DIR)/writer.o $(OBJECT_DIR)/options.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS)
16 | 	$(CXX) $(CXXFLAGS) -DTARGET_LANGUAGE=english $(ENGLISH_DEPPARSER_D) -I$(SRC_ENGLISH) -I$(SRC_ENGLISH_TAGGER) -I$(SRC_ENGLISH_TAGGER)/implementations/$(ENGLISH_TAGGER_IMPL) -I$(SRC_ENGLISH_CONPARSER) -I$(SRC_ENGLISH_CONPARSER)/implementations/$(ENGLISH_CONPARSER_IMPL) -I$(SRC_COMMON_DEPPARSER) -I$(SRC_COMMON_DEPPARSER)/implementations/$(ENGLISH_DEPPARSER_IMPL) -I$(SRC_COMMON_DEPLABELER) -I$(SRC_COMMON_DEPLABELER)/implementations/$(ENGLISH_DEPLABELER_IMPL) -c $(SRC_ENGLISH)/zpar.lib.cpp -o $(OBJECT_DIR)/zpar.lib.o
17 | 	$(LD) $(LDFLAGS) -fPIE -pie -o $(DIST_DIR)/zpar.exe $(OBJECT_DIR)/zpar.lib.o $(OBJECT_ENGLISH_TAGGER)/weight.o $(OBJECT_DIR)/english.postagger.o $(OBJECT_DIR)/english.depparser.o $(OBJECT_ENGLISH_DEPPARSER)/weight.o $(OBJECT_DIR)/english.conparser.o $(OBJECT_ENGLISH_CONPARSER)/constituent.o $(OBJECT_ENGLISH_CONPARSER)/weight.o $(OBJECT_DIR)/english.deplabeler.o $(OBJECT_ENGLISH_DEPLABELER)/weight.o $(OBJECTS)
18 | 	@echo zpar.exe system compiled successfully into $(DIST_DIR).
19 | 
--------------------------------------------------------------------------------
/src/reader.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) University of Oxford 2010
 2 | /****************************************************************
 3 |  *                                                              *
 4 |  * reader.h - the sentence reader classes                       *
 5 |  *                                                              *
 6 |  * this file is specifically designed for sentence_string       *
 7 |  *                                                              *
 8 |  * Author: Yue Zhang                                            *
 9 |  *                                                              *
10 |  * Computing Laboratory, Oxford. 2006.10                        *
11 |  *                                                              *
12 |  ****************************************************************/
13 | 
14 | #ifndef READER_H
15 | #define READER_H
16 | 
17 | #include "definitions.h"
18 | #include "file_utils.h"
19 | #include "linguistics/sentence_string.h"
20 | 
21 | #include <sstream>  // for std::istringstream in string mode
22 | 
23 | /*===============================================================
24 |  *
25 |  * CSentenceReader - read sentence
26 |  *
27 |  * Specify a file name in the constructor. If no file name is specified,
28 |  * the reader will read from the standard input.
29 |  *
30 |  * readRawSentence:
31 |  *  - The input file should contain tokenised sentences each in a line,
32 |  *    with space separated words and punctuations.
33 |  *    In the Chinese case, each character should be separated by space.
34 |  *
35 |  *==============================================================*/
36 | 
37 | class CSentenceReader {
38 | protected:
39 |     std::istream *m_iStream;
40 |     bool m_fileMode;
41 |     int m_nLine;
42 | public:
43 |     // constructor and destructor method
44 |     CSentenceReader(const std::string &sFileName="", bool fileMode=true) {
45 |         m_fileMode = fileMode;
46 |         if (m_fileMode) {
47 |             if (sFileName.empty())
48 |                 m_iStream = &std::cin;
49 |             else {
50 |                 if (!FileExists(sFileName)) THROW("File " << sFileName << " not found.");
51 |                 m_iStream = new std::ifstream(sFileName.c_str());
52 |             }
53 |             m_nLine = 0;
54 |         }
55 |         else {
56 |             m_iStream = new std::istringstream(sFileName);
57 |             m_nLine = 0;
58 |         }
59 |     };
60 |     virtual ~CSentenceReader() {
61 |         if (m_fileMode) {
62 |             if (m_iStream != &std::cin) {
63 |                 ((std::ifstream*)m_iStream)->close();
64 |                 delete m_iStream;
65 |             }
66 |         }
67 |         else {
68 |             delete m_iStream;
69 |         }
70 |     };
71 |     bool readRawCharacter(std::string *retval);
72 |     bool readRawSentence(CStringVector *retval, bool bSkipEmptyLines=false, bool bIgnoreSpace=false);
73 |     bool readSegmentedSentence(CStringVector *retval, bool bSkipEmptyLines=false);
74 |     bool readTaggedSentence(CTwoStringVector *retval, bool bSkipEmptyLines=false, const char separator='_');
75 |     bool readSegmentedSentenceAndTokenize(CStringVector *vReturn, bool bSkipEmptyLines=false);
76 | };
77 | 
78 | #endif
79 | 
--------------------------------------------------------------------------------
/src/zpar.lib.cpp:
--------------------------------------------------------------------------------
 1 | /****************************************************************
 2 |  *                                                              *
 3 |  * zpar.lib.cpp - a library that can be used by python          *
 4 |  *                                                              *
 5 |  * Author: Nitin Madnani                                        *
 6 |  * Educational Testing Service, Princeton, NJ                   *
 7 |  *                                                              *
 8 |  ****************************************************************/
 9 | 
10 | #define SIMPLE_HASH
11 | 
12 | #include "definitions.h"
13 | #include "options.h"
14 | #include "tagger.h"
15 | #include "conparser.h"
16 | #include "depparser.h"
17 | #include "reader.h"
18 | #include "writer.h"
19 | #include "stdlib.h"
20 | #include <iostream>  // for std::cerr
21 | #include <sstream>   // for std::stringstream
22 | #include <cstring>   // for strcpy
23 | 
24 | using namespace english;
25 | 
26 | #define MAX_SENTENCE_SIZE 512
27 | 
28 | 
29 | // define a container structure with a constructor and a destructor
30 | struct zparSession_t
31 | {
32 |     CTagger* tagger;
33 |     CConParser* conparser;
34 |     CDepParser* depparser;
35 |     char *output_buffer;
36 | 
37 |     zparSession_t() {
38 |         tagger = NULL;
39 |         conparser = NULL;
40 |         depparser = NULL;
41 |         output_buffer = NULL;
42 |     };
43 | 
44 |     ~zparSession_t() {
45 |         if (tagger) {
46 |             delete tagger;
47 |             tagger = NULL;
48 |         }
49 |         if (conparser) {
50 |             delete conparser;
51 |             conparser = NULL;
52 |         }
53 |         if (depparser) {
54 |             delete depparser;
55 |             depparser = NULL;
56 |         }
57 |         if (output_buffer) {
58 |             delete[] output_buffer;
59 |             output_buffer = NULL;
60 |         }
61 |     };
62 | };
63 | 
64 | // instantiate the container
65 | // zparSession_t *zps = new zparSession_t();
66 | 
67 | extern "C" void* initialize() {
68 |     zparSession_t* zps = new zparSession_t;
69 |     return (void *)zps;
70 | }
71 | 
72 | // a utility function to output tagged data in the usual
73 | // "WORD/TAG" format as expected
74 | std::string format_tagged_vector(CTwoStringVector *tagged_sent)
75 | {
76 | 
77 |     CTwoStringVector::const_iterator it;
78 |     CStringVector formatted_tagged_sent[1];
79 |     for (it = tagged_sent->begin(); it != tagged_sent->end(); ++it)
80 |     {
81 |         std::stringstream tmpss;
82 |         tmpss << it->first << "/" << it->second;
 83 |         std::string tmpstr(tmpss.str());
 84 |         formatted_tagged_sent->push_back(tmpstr);
 85 |     }
 86 | 
 87 |     int i;
 88 |     std::stringstream oss;
 89 |     for (i = 0; i < formatted_tagged_sent->size(); ++i)
 90 |     {
 91 |         oss << formatted_tagged_sent->at(i);
 92 |         if (i != formatted_tagged_sent->size() - 1)
 93 |         {
 94 |             oss << " ";
 95 |         }
 96 |     }
 97 | 
 98 |     std::string outstr(oss.str());
 99 |     return outstr;
100 | 
101 | }
102 | 
103 | // A utility function to format the dependency output
104 | // in CoNLL format
105 | std::string format_dependency_tree(CDependencyParse *parsed_sent)
106 | {
107 | 
108 |     int i;
109 |     std::stringstream oss;
110 |     std::copy(parsed_sent->begin(), parsed_sent->end(), std::ostream_iterator<CDependencyParse::value_type>(oss, "\n"));
111 | 
112 |     std::string outstr(oss.str());
113 |     return outstr;
114 | 
115 | }
116 | 
117 | // The function to load the tagger model
118 | extern "C" int load_tagger(void* vzps, const char* sFeaturePath) {
119 | 
120 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
121 | 
122 |     std::string sTaggerFeatureFile = std::string(sFeaturePath) + "/tagger";
123 |     std::cerr << "Loading tagger from " << sTaggerFeatureFile << std::endl;
124 |     if (!FileExists(sTaggerFeatureFile)) {
125 |         return 1;
126 |     }
127 | 
128 |     CTagger* tagger = new CTagger(sTaggerFeatureFile, false);
129 |     zps->tagger = tagger;
130 |     return 0;
131 | }
132 | 
133 | // The function to load the constituency parser model
134 | extern "C" int load_parser(void* vzps, const char *sFeaturePath) {
135 | 
136 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
137 | 
138 |     // If the tagger is not already loaded, then we need to load
139 |     // it since the parser requires the tagger
140 |     if (!zps->tagger) {
141 |         if (load_tagger(zps, sFeaturePath)) {
142 |             return 1;
143 |         }
144 |     }
145 | 
146 |     CConParser *conparser;
147 |     std::string sConParserFeatureFile = std::string(sFeaturePath) + "/conparser";
148 |     std::cerr << "Loading constituency parser from " << sConParserFeatureFile << std::endl;
149 |     if (!FileExists(sConParserFeatureFile)) {
150 |         return 1;
151 |     }
152 |     conparser = new CConParser(sConParserFeatureFile, false);
153 |     zps->conparser = conparser;
154 |     return 0;
155 | }
156 | 
157 | 
158 | 
159 | // The function to load the dependency parser model
160 | extern "C" int load_depparser(void* vzps, const char *sFeaturePath) {
161 | 
162 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
163 | 
164 |     // If the tagger is not already loaded, then we need to load
165 |     // it since the parser requires the tagger
166 |     if (!zps->tagger) {
167 |         if (load_tagger(zps, sFeaturePath)) {
168 |             return 1;
169 |         }
170 |     }
171 | 
172 |     CDepParser *depparser;
173 |     std::string sDepParserFeatureFile = std::string(sFeaturePath) + "/depparser";
174 |     std::cerr << "Loading dependency parser from " << sDepParserFeatureFile << std::endl;
175 |     if (!FileExists(sDepParserFeatureFile)) {
176 |         return 1;
177 |     }
178 |     depparser = new CDepParser(sDepParserFeatureFile, false);
179 |     zps->depparser = depparser;
180 |     return 0;
181 | }
182 | 
183 | // The function to load all three models
184 | extern "C" int load_models(void* vzps, const char *sFeaturePath) {
185 | 
186 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
187 | 
188 |     if (load_tagger(zps, sFeaturePath)) {
189 |         return 1;
190 |     }
191 |     if (load_parser(zps, sFeaturePath)) {
192 |         return 1;
193 |     }
194 |     if (load_depparser(zps, sFeaturePath)) {
195 |         return 1;
196 |     }
197 |     return 0;
198 | }
199 | 
200 | // Function to tag a sentence
201 | extern "C" char* tag_sentence(void* vzps, const char *input_sentence, bool tokenize)
202 | {
203 | 
204 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
205 | 
206 |     try {
207 |         // create a temporary string stream from the input char *
208 |         CSentenceReader input_reader(std::string(input_sentence), false);
209 | 
210 |         // tokenize the sentence
211 |         CStringVector input_sent[1];
212 |         if (tokenize) {
213 |             input_reader.readSegmentedSentenceAndTokenize(input_sent);
214 |         }
215 |         else {
216 |             input_reader.readSegmentedSentence(input_sent);
217 |         }
218 | 
219 |         // initialize the variable that will hold the tagged sentence
220 |         CTwoStringVector tagged_sent[1];
221 | 
222 |         // get the tagger that was stored earlier
223 |         CTagger *tagger = zps->tagger;
224 | 
225 |         // tag the sentence
226 |         tagger->tag(input_sent, tagged_sent);
227 | 
228 |         // format the tagged sentence properly and return
229 |         std::string tagvec = format_tagged_vector(tagged_sent);
230 |         int tagveclen = tagvec.length();
231 | 
232 |         if (zps->output_buffer != NULL) {
233 |             delete[] zps->output_buffer;
234 |             zps->output_buffer = NULL;
235 |         }
236 |         zps->output_buffer = new char[tagveclen + 1];
237 |         strcpy(zps->output_buffer, tagvec.c_str());
238 |     } catch (const std::string &e) {
239 |         std::cerr << e << std::endl;
240 |         zps->output_buffer = new char[1];
241 |         strcpy(zps->output_buffer, "");
242 |     }
243 |     return zps->output_buffer;
244 | }
245 | 
246 | // Function to constituency parse a sentence
247 | extern "C" char* parse_sentence(void* vzps, const char *input_sentence, bool tokenize)
248 | {
249 | 
250 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
251 | 
252 |     try {
253 | 
254 |         // create a temporary string stream from the input char *
255 |         CSentenceReader input_reader(std::string(input_sentence), false);
256 | 
257 |         // tokenize the sentence
258 |         CStringVector tokenized_sent[1];
259 |         if (tokenize) {
260 |             input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
261 |         }
262 |         else {
263 |             input_reader.readSegmentedSentence(tokenized_sent);
264 |         }
265 | 
266 |         if (zps->output_buffer != NULL) {
267 |             delete[] zps->output_buffer;
268 |             zps->output_buffer = NULL;
269 |         }
270 | 
271 |         if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){
272 |             // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
273 |             std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl;
274 |             zps->output_buffer = new char[1];
275 |             strcpy(zps->output_buffer, "");
276 |         } else {
277 |             // initialize the variables that will hold the tagged and parsed sentences
278 |             CTwoStringVector tagged_sent[1];
279 |             english::CCFGTree parsed_sent[1];
280 | 
281 |             // get the tagger and parser that were stored earlier
282 |             CTagger *tagger = zps->tagger;
283 |             CConParser *conparser = zps->conparser;
284 | 
285 |             // tag and parse the sentence
286 |             tagger->tag(tokenized_sent, tagged_sent);
287 |             conparser->parse(*tagged_sent, parsed_sent);
288 | 
289 |             // now put the parsed sentence into a string stream
290 |             std::string parse = parsed_sent->str_unbinarized();
291 |             int parselen = parse.length();
292 |             zps->output_buffer = new char[parselen + 1];
293 |             strcpy(zps->output_buffer, parse.c_str());
294 |         }
295 |     } catch (const std::string &e) {
296 |         std::cerr << e << std::endl;
297 |         zps->output_buffer = new char[1];
298 |         strcpy(zps->output_buffer, "");
299 |     }
300 | 
301 |     return zps->output_buffer;
302 | }
303 | 
304 | extern "C" char* parse_tagged_sentence(void* vzps, const char *input_tagged_sentence, const char separator='/')
305 | {
306 | 
307 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
308 | 
309 |     try {
310 |         // create a temporary string stream from the input char *
311 |         CSentenceReader input_reader(std::string(input_tagged_sentence), false);
312 | 
313 |         // read the tagged sentence into a CTwoStringVector
314 |         CTwoStringVector tagged_sent[1];
315 |         input_reader.readTaggedSentence(tagged_sent, false, separator);
316 | 
317 |         if (zps->output_buffer != NULL) {
318 |             delete[] zps->output_buffer;
319 |             zps->output_buffer = NULL;
320 |         }
321 | 
322 |         if(tagged_sent->size() >= MAX_SENTENCE_SIZE){
323 |             // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
324 |             std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_tagged_sentence << std::endl;
325 |             zps->output_buffer = new char[1];
326 |             strcpy(zps->output_buffer, "");
327 |         } else {
328 |             // initialize the variable that will hold the parsed sentence
329 |             english::CCFGTree parsed_sent[1];
330 | 
331 |             // get the parser that was stored earlier
332 |             CConParser *conparser = zps->conparser;
333 | 
334 |             // parse the tagged sentence
335 |             conparser->parse(*tagged_sent, parsed_sent);
336 | 
337 |             // now put the parsed sentence into a string stream
338 |             std::string parse = parsed_sent->str_unbinarized();
339 |             int parselen = parse.length();
340 |             zps->output_buffer = new char[parselen + 1];
341 |             strcpy(zps->output_buffer, parse.c_str());
342 |         }
343 | 
344 |     } catch (const std::string &e) {
345 |         std::cerr << e << std::endl;
346 |         zps->output_buffer = new char[1];
347 |         strcpy(zps->output_buffer, "");
348 |     }
349 |     return zps->output_buffer;
350 | }
351 | 
352 | // Function to dependency parse a sentence
353 | extern "C" char* dep_parse_sentence(void* vzps, const char *input_sentence, bool tokenize)
354 | {
355 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
356 | 
357 |     try {
358 | 
359 |         // create a temporary string stream from the input char *
360 |         CSentenceReader input_reader(std::string(input_sentence), false);
361 | 
362 |         // tokenize the sentence
363 |         CStringVector tokenized_sent[1];
364 |         if (tokenize) {
365 |             input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
366 |         }
367 |         else {
368 |             input_reader.readSegmentedSentence(tokenized_sent);
369 |         }
370 | 
371 |         if (zps->output_buffer != NULL) {
372 |             delete[] zps->output_buffer;
373 |             zps->output_buffer = NULL;
374 |         }
375 | 
376 |         if(tokenized_sent->size() >= MAX_SENTENCE_SIZE){
377 |             // The ZPar code asserts that length < MAX_SENTENCE_SIZE...
378 |             std::cerr << "Sentence too long. Returning empty string. Sentence: " << input_sentence << std::endl;
Sentence: " << input_sentence << std::endl; 379 | zps->output_buffer = new char[1]; 380 | strcpy(zps->output_buffer, ""); 381 | } else { 382 | 383 | // initialize the variable that will hold the tagged and parsed sentences 384 | CTwoStringVector tagged_sent[1]; 385 | CDependencyParse parsed_sent[1]; 386 | 387 | // get the tagger and parser that were stored earlier 388 | CTagger *tagger = zps->tagger; 389 | CDepParser *depparser = zps->depparser; 390 | 391 | // tag and parse the sentence 392 | tagger->tag(tokenized_sent, tagged_sent); 393 | depparser->parse(*tagged_sent, parsed_sent); 394 | 395 | // now output the formatted dependency tree 396 | std::string deptree = format_dependency_tree(parsed_sent); 397 | int deptreelen = deptree.length(); 398 | zps->output_buffer = new char[deptreelen + 1]; 399 | strcpy(zps->output_buffer, deptree.c_str()); 400 | } 401 | 402 | } catch (const std::string &e) { 403 | std::cerr << e << std::endl; 404 | zps->output_buffer = new char[1]; 405 | strcpy(zps->output_buffer, ""); 406 | } 407 | 408 | return zps->output_buffer; 409 | } 410 | 411 | // Function to dependency parse a sentence 412 | extern "C" char* dep_parse_tagged_sentence(void* vzps, const char *input_tagged_sentence, const char seperator='/') 413 | { 414 | zparSession_t* zps = static_cast(vzps); 415 | 416 | try { 417 | // create a temporary string stream from the input char * 418 | CSentenceReader input_reader(std::string(input_tagged_sentence), false); 419 | 420 | // read the tagged sentence into a CTwoStringVector 421 | CTwoStringVector tagged_sent[1]; 422 | input_reader.readTaggedSentence(tagged_sent, false, seperator); 423 | 424 | if (zps->output_buffer != NULL) { 425 | delete zps->output_buffer; 426 | zps->output_buffer = NULL; 427 | } 428 | 429 | if(tagged_sent->size() >= MAX_SENTENCE_SIZE){ 430 | // The ZPar code asserts that length < MAX_SENTENCE_SIZE... 431 | std::cerr << "Sentence too long. Returning empty string. 
Sentence: " << input_tagged_sentence << std::endl; 432 | zps->output_buffer = new char[1]; 433 | strcpy(zps->output_buffer, ""); 434 | } else { 435 | 436 | // initialize the variable that will hold the parsed sentence 437 | CDependencyParse parsed_sent[1]; 438 | 439 | // get the parser that was stored earlier 440 | CDepParser *depparser = zps->depparser; 441 | 442 | // parse the sentence 443 | depparser->parse(*tagged_sent, parsed_sent); 444 | 445 | // now output the formatted dependency tree 446 | std::string deptree = format_dependency_tree(parsed_sent); 447 | int deptreelen = deptree.length(); 448 | zps->output_buffer = new char[deptreelen + 1]; 449 | strcpy(zps->output_buffer, deptree.c_str()); 450 | } 451 | 452 | } catch (const std::string &e) { 453 | std::cerr << e << std::endl; 454 | zps->output_buffer = new char[1]; 455 | strcpy(zps->output_buffer, ""); 456 | } 457 | 458 | return zps->output_buffer; 459 | } 460 | 461 | 462 | // Function to tag all sentence in the given input file 463 | // and write tagged sentences to the given output file 464 | extern "C" void tag_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize) 465 | { 466 | 467 | zparSession_t* zps = static_cast(vzps); 468 | 469 | std::cerr << "Processing file " << sInputFile << std::endl; 470 | 471 | // initialize the input reader 472 | CSentenceReader input_reader(sInputFile); 473 | 474 | // initialize the temporary sentence variables 475 | CStringVector tokenized_sent[1]; 476 | CTwoStringVector tagged_sent[1]; 477 | 478 | // get the tagger and the parser that were stored earlier 479 | CTagger *tagger = zps->tagger; 480 | 481 | // initialize the output file writer 482 | std::string outputFileName = std::string(sOutputFile); 483 | CSentenceWriter output_writer(outputFileName); 484 | 485 | // read in and tokenize the given input file if asked 486 | bool readSomething; 487 | if (tokenize) { 488 | readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); 489 | } 490 | else { 491 | readSomething = input_reader.readSegmentedSentence(tokenized_sent); 492 | } 493 | while ( readSomething ) 494 | { 495 | if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" ) 496 | { 497 | tokenized_sent->pop_back(); 498 | } 499 | 500 | // tag the sentence 501 | tagger->tag(tokenized_sent, tagged_sent); 502 | 503 | // write the formatted sentence to the output file 504 | output_writer.writeSentence(tagged_sent, '/', true); 505 | 506 | if (tokenize) { 507 | readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); 508 | } 509 | else { 510 | readSomething = input_reader.readSegmentedSentence(tokenized_sent); 511 | } 512 | } 513 | 514 | // close the output file 515 | std::cerr << "Wrote output to " << sOutputFile << std::endl; 516 | } 517 | 518 | // Function to constituency parse all sentence in the given input file 519 | // and write parsed sentences to the given output file 520 | extern "C" void parse_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize) 521 | { 522 | 523 | zparSession_t* zps = static_cast(vzps); 524 | 525 | std::cerr << "Processing file " << sInputFile << std::endl; 526 | 527 | // initialize the input reader 528 | CSentenceReader input_reader(sInputFile); 529 | 530 | // open the output file 531 | FILE *outfp = NULL; 532 | outfp = fopen(sOutputFile, "w"); 533 | 534 | // initialize the temporary sentence variables 535 | CStringVector tokenized_sent[1]; 536 | CTwoStringVector tagged_sent[1]; 537 | english::CCFGTree parsed_sent[1]; 
538 | 
539 |     // get the tagger and the parser that were stored earlier
540 |     CTagger *tagger = zps->tagger;
541 |     CConParser *conparser = zps->conparser;
542 | 
543 |     // read in and tokenize the given input file if asked
544 |     bool readSomething;
545 |     if (tokenize) {
546 |         readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
547 |     }
548 |     else {
549 |         readSomething = input_reader.readSegmentedSentence(tokenized_sent);
550 |     }
551 | 
552 |     while ( readSomething )
553 |     {
554 |         if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" )
555 |         {
556 |             tokenized_sent->pop_back();
557 |         }
558 | 
559 |         std::string parse = "";
560 |         if (tokenized_sent->size() < MAX_SENTENCE_SIZE) {
561 |             tagger->tag(tokenized_sent, tagged_sent);
562 |             conparser->parse(*tagged_sent, parsed_sent);
563 |             parse = parsed_sent->str_unbinarized();
564 |         } else {
565 |             std::cerr << "Sentence too long. Writing empty string." << std::endl;
566 |         }
567 | 
568 |         fprintf(outfp, "%s\n", parse.c_str());
569 | 
570 |         if (tokenize) {
571 |             readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent);
572 |         }
573 |         else {
574 |             readSomething = input_reader.readSegmentedSentence(tokenized_sent);
575 |         }
576 |     }
577 | 
578 |     // close the output file
579 |     std::cerr << "Wrote output to " << sOutputFile << std::endl;
580 |     fclose(outfp);
581 | }
582 | 
583 | extern "C" void parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char separator='/')
584 | {
585 | 
586 |     zparSession_t* zps = static_cast<zparSession_t*>(vzps);
587 | 
588 |     std::cerr << "Processing file " << sInputFile << std::endl;
589 | 
590 |     // initialize the input reader
591 |     CSentenceReader input_reader(sInputFile);
592 | 
593 |     // open the output file
594 |     FILE *outfp = NULL;
595 |     outfp = fopen(sOutputFile, "w");
596 | 
597 |     // initialize the temporary sentence variables
598 |     CTwoStringVector tagged_sent[1];
599 |     english::CCFGTree parsed_sent[1];
600 | 
601 |     // get the parser that was stored earlier
602 |     CConParser *conparser = zps->conparser;
603 | 
604 |     // read in the given pre-tagged input file
605 |     bool readSomething;
606 |     readSomething = input_reader.readTaggedSentence(tagged_sent, false, separator);
607 | 
608 |     while ( readSomething )
609 |     {
610 |         std::string parse = "";
611 |         if (tagged_sent->size() < MAX_SENTENCE_SIZE) {
612 |             conparser->parse(*tagged_sent, parsed_sent);
613 |             parse = parsed_sent->str_unbinarized();
614 |         } else {
615 |             std::cerr << "Sentence too long. Writing empty string." << std::endl;
Sentence: " << tagged_sent << std::endl; 616 | } 617 | 618 | fprintf(outfp, "%s\n", parse.c_str()); 619 | 620 | readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator); 621 | } 622 | 623 | // close the output file 624 | std::cerr << "Wrote output to " << sOutputFile << std::endl; 625 | fclose(outfp); 626 | } 627 | 628 | // Function to dependency parse all sentence in the given input file 629 | // and write parsed sentences to the given output file 630 | extern "C" void dep_parse_file(void* vzps, const char *sInputFile, const char *sOutputFile, bool tokenize) 631 | { 632 | 633 | zparSession_t* zps = static_cast(vzps); 634 | 635 | std::cerr << "Processing file " << sInputFile << std::endl; 636 | 637 | // initialize the input reader 638 | CSentenceReader input_reader(sInputFile); 639 | 640 | // open the output file 641 | FILE *outfp = NULL; 642 | outfp = fopen(sOutputFile, "w"); 643 | 644 | // initialize the temporary sentence variables 645 | CStringVector tokenized_sent[1]; 646 | CTwoStringVector tagged_sent[1]; 647 | CDependencyParse parsed_sent[1]; 648 | 649 | // get the tagger and the parser that were stored earlier 650 | CTagger *tagger = zps->tagger; 651 | CDepParser *depparser = zps->depparser; 652 | 653 | // read in and tokenize the given input file if asked 654 | bool readSomething; 655 | if (tokenize) { 656 | readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); 657 | } 658 | else { 659 | readSomething = input_reader.readSegmentedSentence(tokenized_sent); 660 | } 661 | 662 | while ( readSomething ) 663 | { 664 | if ( !tokenized_sent->empty() && tokenized_sent->back() == "\n" ) 665 | { 666 | tokenized_sent->pop_back(); 667 | } 668 | 669 | std::string deptree = ""; 670 | if(tokenized_sent->size() < MAX_SENTENCE_SIZE){ 671 | tagger->tag(tokenized_sent, tagged_sent); 672 | depparser->parse(*tagged_sent, parsed_sent); 673 | deptree = format_dependency_tree(parsed_sent); 674 | } else { 675 | std::cerr << "Sentence too long. Writing empty string. 
Input:" << tokenized_sent << std::endl; 676 | } 677 | 678 | fprintf(outfp, "%s\n", deptree.c_str()); 679 | 680 | if (tokenize) { 681 | readSomething = input_reader.readSegmentedSentenceAndTokenize(tokenized_sent); 682 | } 683 | else { 684 | readSomething = input_reader.readSegmentedSentence(tokenized_sent); 685 | } 686 | } 687 | 688 | // close the output file 689 | std::cerr << "Wrote output to " << sOutputFile << std::endl; 690 | fclose(outfp); 691 | } 692 | 693 | extern "C" void dep_parse_tagged_file(void* vzps, const char *sInputFile, const char *sOutputFile, const char seperator='/') 694 | { 695 | 696 | zparSession_t* zps = static_cast(vzps); 697 | 698 | std::cerr << "Processing file " << sInputFile << std::endl; 699 | 700 | // initialize the input reader 701 | CSentenceReader input_reader(sInputFile); 702 | 703 | // open the output file 704 | FILE *outfp = NULL; 705 | outfp = fopen(sOutputFile, "w"); 706 | 707 | // initialize the temporary sentence variables 708 | CTwoStringVector tagged_sent[1]; 709 | CDependencyParse parsed_sent[1]; 710 | 711 | // get the parser that was stored earlier 712 | CDepParser *depparser = zps->depparser; 713 | 714 | // read in and tokenize the given input file if asked 715 | bool readSomething; 716 | readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator); 717 | 718 | while ( readSomething ) 719 | { 720 | std::string deptree = ""; 721 | if(tagged_sent->size() < MAX_SENTENCE_SIZE){ 722 | depparser->parse(*tagged_sent, parsed_sent); 723 | deptree = format_dependency_tree(parsed_sent); 724 | } else { 725 | std::cerr << "Sentence too long. Writing empty string. Sentence: " << tagged_sent << std::endl; 726 | } 727 | 728 | fprintf(outfp, "%s\n", deptree.c_str()); 729 | 730 | readSomething = input_reader.readTaggedSentence(tagged_sent, false, seperator); 731 | } 732 | 733 | // close the output file 734 | std::cerr << "Wrote output to " << sOutputFile << std::endl; 735 | fclose(outfp); 736 | } 737 | 738 | // Function to unload all the models 739 | extern "C" void unload_models(void* vzps) 740 | { 741 | 742 | zparSession_t* zps = static_cast(vzps); 743 | 744 | // just delete the container itself and its destructor 745 | // will take care of everything else 746 | delete zps; 747 | zps = NULL; 748 | } 749 | 750 | // A main function for testing 751 | // extern "C" int main(int argc, char *argv[]) 752 | // { 753 | // void* vzps = initialize(); 754 | // load_tagger(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models"); 755 | // load_parser(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models"); 756 | // load_depparser(vzps, "/Users/nmadnani/work/NLPTools/zpar/english-models"); 757 | // parse_tagged_file(vzps, "/Users/nmadnani/work/python-zpar/examples/test_tagged.txt", "/Users/nmadnani/work/python-zpar/examples/test_tagged.parse"); 758 | // dep_parse_tagged_file(vzps, "/Users/nmadnani/work/python-zpar/examples/test_tagged.txt", "/Users/nmadnani/work/python-zpar/examples/test_tagged.dep"); 759 | // std::cout << std::string(parse_tagged_sentence(vzps, "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")) << std::endl; 760 | // std::cout << std::string(dep_parse_tagged_sentence(vzps, "I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.")) << std::endl; 761 | // unload_models(vzps); 762 | // return 0; 763 | // } 764 | -------------------------------------------------------------------------------- /tests/test_depparser.py: -------------------------------------------------------------------------------- 1 | """ " 2 | Run unit tests for the ZPar 
--------------------------------------------------------------------------------
/tests/test_depparser.py:
--------------------------------------------------------------------------------
1 | """
2 | Run unit tests for the ZPar dependency parser.
3 | 
4 | :author: Nitin Madnani (nmadnani@ets.org)
5 | """
6 | 
7 | from __future__ import (absolute_import, division, print_function,
8 |                         unicode_literals)
9 | 
10 | import glob
11 | import os
12 | 
13 | from io import open
14 | from itertools import product
15 | from os.path import abspath, dirname, join
16 | 
17 | from nose.tools import assert_equal
18 | from zpar import ZPar
19 | 
20 | _my_dir = abspath(dirname(__file__))
21 | 
22 | z = None
23 | depparser = None
24 | 
25 | 
26 | def setUp():
27 |     """
28 |     Set up things we need for the tests.
29 |     """
30 |     global z, depparser
31 | 
32 |     assert 'ZPAR_MODEL_DIR' in os.environ
33 | 
34 |     model_dir = os.environ['ZPAR_MODEL_DIR']
35 | 
36 |     z = ZPar(model_dir)
37 |     depparser = z.get_depparser()
38 | 
39 | 
40 | def tearDown():
41 |     """
42 |     Clean up after the tests.
43 |     """
44 |     global z, depparser
45 | 
46 |     if z:
47 |         z.close()
48 |     del depparser
49 |     del z
50 | 
51 |     # delete all the files we may have created
52 |     data_dir = abspath(join(_my_dir, '..', 'examples'))
53 |     for f in glob.glob(join(data_dir, 'test*.dep')):
54 |         os.unlink(f)
55 | 
56 | 
57 | def check_dep_parse_sentence(tokenize=False,
58 |                              with_lemmas=False,
59 |                              tagged=False):
60 |     """
61 |     Check dep_parse_sentence method with and without tokenization,
62 |     with and without lemmas, and with and without pre-tagged output.
63 |     """
64 |     global depparser
65 | 
66 |     if tagged:
67 |         sentence = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./."
68 |     else:
69 |         if tokenize:
70 |             sentence = "I'm going to the market."
71 |         else:
72 |             sentence = "I 'm going to the market ."
73 | 
74 |     correct_output = "I\tPRP\t1\tSUB\n'm\tVBP\t-1\tROOT\ngoing\tVBG\t1\tVC\nto\tTO\t2\tVMOD\nthe\tDT\t5\tNMOD\nmarket\tNN\t3\tPMOD\n.\t.\t1\tP\n"
75 |     correct_output_with_lemmas = "I\tPRP\t1\tSUB\ti\n'm\tVBP\t-1\tROOT\t'm\ngoing\tVBG\t1\tVC\tgo\nto\tTO\t2\tVMOD\tto\nthe\tDT\t5\tNMOD\tthe\nmarket\tNN\t3\tPMOD\tmarket\n.\t.\t1\tP\t.\n"
76 |     if not tagged:
77 |         parsed_sentence = depparser.dep_parse_sentence(sentence,
78 |                                                        tokenize=tokenize,
79 |                                                        with_lemmas=with_lemmas)
80 |     else:
81 |         parsed_sentence = depparser.dep_parse_tagged_sentence(sentence,
82 |                                                               with_lemmas=with_lemmas)
83 | 
84 |     if with_lemmas:
85 |         assert_equal(parsed_sentence, correct_output_with_lemmas)
86 |     else:
87 |         assert_equal(parsed_sentence, correct_output)
88 | 
89 | 
90 | def test_dep_parse_sentence():
91 |     for (tokenize, with_lemmas, tagged) in product([True, False],
92 |                                                    [True, False],
93 |                                                    [True, False]):
94 |         yield (check_dep_parse_sentence,
95 |                tokenize,
96 |                with_lemmas,
97 |                tagged)
98 | 
99 | 
100 | def check_dep_parse_file(tokenize=False,
101 |                          with_lemmas=False,
102 |                          tagged=False):
103 |     """
104 |     Check dep_parse_file method with and without tokenization,
105 |     with and without lemmas, and with and without
106 |     pre-tagged output.
107 |     """
108 |     global depparser
109 | 
110 |     if tagged:
111 |         prefix = 'test_tagged'
112 |     else:
113 |         if tokenize:
114 |             prefix = 'test'
115 |         else:
116 |             prefix = 'test_tokenized'
117 | 
118 |     correct_output = ['I\tPRP\t1\tSUB', 'am\tVBP\t-1\tROOT',
119 |                       'going\tVBG\t1\tVC', 'to\tTO\t2\tVMOD',
120 |                       'the\tDT\t5\tNMOD', 'market\tNN\t3\tPMOD',
121 |                       '.\t.\t1\tP', '', 'Are\tVBP\t-1\tROOT',
122 |                       'you\tPRP\t0\tSUB', 'going\tVBG\t0\tVMOD',
123 |                       'to\tTO\t4\tVMOD', 'come\tVB\t2\tVMOD',
124 |                       'with\tIN\t4\tVMOD', 'me\tPRP\t5\tPMOD',
125 |                       '?\t.\t0\tP', '']
126 | 
127 |     correct_output_with_lemmas = ['I\tPRP\t1\tSUB\ti', 'am\tVBP\t-1\tROOT\tbe',
128 |                                   'going\tVBG\t1\tVC\tgo', 'to\tTO\t2\tVMOD\tto',
129 |                                   'the\tDT\t5\tNMOD\tthe', 'market\tNN\t3\tPMOD\tmarket',
130 |                                   '.\t.\t1\tP\t.', '', 'Are\tVBP\t-1\tROOT\tbe',
131 |                                   'you\tPRP\t0\tSUB\tyou', 'going\tVBG\t0\tVMOD\tgo',
132 |                                   'to\tTO\t4\tVMOD\tto', 'come\tVB\t2\tVMOD\tcome',
133 |                                   'with\tIN\t4\tVMOD\twith', 'me\tPRP\t5\tPMOD\tme',
134 |                                   '?\t.\t0\tP\t?', '']
135 | 
136 |     input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix)))
137 |     output_file = abspath(join(_my_dir, '..', 'examples', '{}.dep'.format(prefix)))
138 | 
139 |     # dependency parse the file
140 |     if not tagged:
141 |         depparser.dep_parse_file(input_file,
142 |                                  output_file,
143 |                                  tokenize=tokenize,
144 |                                  with_lemmas=with_lemmas)
145 |     else:
146 |         depparser.dep_parse_tagged_file(input_file,
147 |                                         output_file,
148 |                                         with_lemmas=with_lemmas)
149 | 
150 |     # read the output file and make sure we have the expected output
151 |     with open(output_file, 'r') as outf:
152 |         output = [l.strip() for l in outf.readlines()]
153 | 
154 |     if with_lemmas:
155 |         assert_equal(output, correct_output_with_lemmas)
156 |     else:
157 |         assert_equal(output, correct_output)
158 | 
159 | 
160 | def test_dep_parse_file():
161 |     for (tokenize, with_lemmas, tagged) in product([True, False],
162 |                                                    [True, False],
163 |                                                    [True, False]):
164 |         yield (check_dep_parse_file,
165 |                tokenize,
166 |                with_lemmas,
167 |                tagged)
168 | 
--------------------------------------------------------------------------------
/tests/test_depparser_no_wordnet.py:
--------------------------------------------------------------------------------
1 | """
2 | Run unit tests for the ZPar dependency parser without wordnet access.
3 | 4 | :author: Nitin Madnani (nmadnani@ets.org) 5 | """ 6 | 7 | from __future__ import (absolute_import, division, print_function, 8 | unicode_literals) 9 | 10 | import glob 11 | import os 12 | 13 | from io import open 14 | from itertools import product 15 | from os.path import abspath, dirname, join 16 | 17 | from nose.tools import assert_equal 18 | from zpar import ZPar 19 | 20 | _my_dir = abspath(dirname(__file__)) 21 | 22 | z = None 23 | depparser = None 24 | 25 | 26 | def setUp(): 27 | """ 28 | set up things we need for the tests 29 | """ 30 | global z, depparser 31 | 32 | assert 'ZPAR_MODEL_DIR' in os.environ 33 | 34 | model_dir = os.environ['ZPAR_MODEL_DIR'] 35 | 36 | z = ZPar(model_dir) 37 | depparser = z.get_depparser() 38 | 39 | 40 | def tearDown(): 41 | """ 42 | Clean up after the tests 43 | """ 44 | global z, depparser 45 | 46 | if z: 47 | z.close() 48 | del depparser 49 | del z 50 | 51 | # delete all the files we may have created 52 | data_dir = abspath(join(_my_dir, '..', 'examples')) 53 | for f in glob.glob(join(data_dir, 'test*.dep')): 54 | os.unlink(f) 55 | 56 | 57 | def check_dep_parse_sentence_no_wordnet(tokenize=False, 58 | with_lemmas=False, 59 | tagged=False): 60 | """ 61 | Check dep_parse_sentence method with and without tokenization, 62 | with and without lemmas, and with and without pre-tagged output, 63 | all under the condition that there is no wordnet corpus 64 | accessible to nltk. 65 | """ 66 | global depparser 67 | 68 | if tagged: 69 | sentence = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./." 70 | else: 71 | if tokenize: 72 | sentence = "I'm going to the market." 73 | else: 74 | sentence = "I 'm going to the market ." 75 | 76 | correct_output = "I\tPRP\t1\tSUB\n'm\tVBP\t-1\tROOT\ngoing\tVBG\t1\tVC\nto\tTO\t2\tVMOD\nthe\tDT\t5\tNMOD\nmarket\tNN\t3\tPMOD\n.\t.\t1\tP\n" 77 | if not tagged: 78 | parsed_sentence = depparser.dep_parse_sentence(sentence, 79 | tokenize=tokenize, 80 | with_lemmas=with_lemmas) 81 | else: 82 | parsed_sentence = depparser.dep_parse_tagged_sentence(sentence, 83 | with_lemmas=with_lemmas) 84 | 85 | assert_equal(parsed_sentence, correct_output) 86 | 87 | 88 | def test_dep_parse_sentence_no_wordnet(): 89 | for (tokenize, with_lemmas, tagged) in product([True, False], 90 | [True, False], 91 | [True, False]): 92 | yield (check_dep_parse_sentence_no_wordnet, 93 | tokenize, 94 | with_lemmas, 95 | tagged) 96 | 97 | 98 | def check_dep_parse_file_no_wordnet(tokenize=False, 99 | with_lemmas=False, 100 | tagged=False): 101 | """ 102 | Check parse_file method with and without tokenization, 103 | with and without lemmas, and with and without pre-tagged output, 104 | all under the condition that there is no wordnet corpus 105 | accessible to nltk. 
106 | """ 107 | global depparser 108 | 109 | if tagged: 110 | prefix = 'test_tagged' 111 | else: 112 | if tokenize: 113 | prefix = 'test' 114 | else: 115 | prefix = 'test_tokenized' 116 | 117 | correct_output = ['I\tPRP\t1\tSUB', 'am\tVBP\t-1\tROOT', 118 | 'going\tVBG\t1\tVC', 'to\tTO\t2\tVMOD', 119 | 'the\tDT\t5\tNMOD', 'market\tNN\t3\tPMOD', 120 | '.\t.\t1\tP', '', 'Are\tVBP\t-1\tROOT', 121 | 'you\tPRP\t0\tSUB', 'going\tVBG\t0\tVMOD', 122 | 'to\tTO\t4\tVMOD', 'come\tVB\t2\tVMOD', 123 | 'with\tIN\t4\tVMOD', 'me\tPRP\t5\tPMOD', 124 | '?\t.\t0\tP', ''] 125 | 126 | input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix))) 127 | output_file = abspath(join(_my_dir, '..', 'examples', '{}.dep'.format(prefix))) 128 | 129 | # dependency parse the file 130 | if not tagged: 131 | depparser.dep_parse_file(input_file, 132 | output_file, 133 | tokenize=tokenize, 134 | with_lemmas=with_lemmas) 135 | else: 136 | depparser.dep_parse_tagged_file(input_file, 137 | output_file, 138 | with_lemmas=with_lemmas) 139 | 140 | # read the output file and make sure we have the expected output 141 | with open(output_file, 'r') as outf: 142 | output = [l.strip() for l in outf.readlines()] 143 | 144 | assert_equal(output, correct_output) 145 | 146 | 147 | def test_dep_parse_file_no_wordnet(): 148 | for (tokenize, with_lemmas, tagged) in product([True, False], 149 | [True, False], 150 | [True, False]): 151 | yield (check_dep_parse_file_no_wordnet, 152 | tokenize, 153 | with_lemmas, 154 | tagged) 155 | 156 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run unit tests for the ZPar constituency parser. 3 | 4 | :author: Nitin Madnani (nmadnani@ets.org) 5 | """ 6 | 7 | from __future__ import (absolute_import, division, print_function, 8 | unicode_literals) 9 | 10 | import glob 11 | import os 12 | 13 | from io import open 14 | from itertools import product 15 | from os.path import abspath, dirname, join 16 | 17 | from nose.tools import assert_equal 18 | from zpar import ZPar 19 | 20 | _my_dir = abspath(dirname(__file__)) 21 | 22 | z = None 23 | parser = None 24 | 25 | 26 | def setUp(): 27 | """ 28 | set up things we need for the tests 29 | """ 30 | global z, parser 31 | 32 | assert 'ZPAR_MODEL_DIR' in os.environ 33 | 34 | model_dir = os.environ['ZPAR_MODEL_DIR'] 35 | 36 | z = ZPar(model_dir) 37 | parser = z.get_parser() 38 | 39 | 40 | def tearDown(): 41 | """ 42 | Clean up after the tests 43 | """ 44 | global z, parser 45 | 46 | if z: 47 | z.close() 48 | del parser 49 | del z 50 | 51 | # delete all the files we may have created 52 | data_dir = abspath(join(_my_dir, '..', 'examples')) 53 | for f in glob.glob(join(data_dir, 'test*.parse')): 54 | os.unlink(f) 55 | 56 | 57 | def check_parse_sentence(tokenize=False, tagged=False): 58 | """ 59 | Check parse_sentence method with and without tokenization 60 | and with and without pre-tagged output. 61 | """ 62 | global parser 63 | 64 | if tagged: 65 | sentence = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./." 66 | else: 67 | if tokenize: 68 | sentence = "I'm going to the market." 69 | else: 70 | sentence = "I 'm going to the market ." 71 | 72 | correct_output = "(S (NP (PRP I)) (VP (VBP 'm) (VP (VBG going) (PP (TO to) (NP (DT the) (NN market))))) (. 
.))" 73 | 74 | if not tagged: 75 | parsed_sentence = parser.parse_sentence(sentence, tokenize=tokenize) 76 | else: 77 | parsed_sentence = parser.parse_tagged_sentence(sentence) 78 | 79 | assert_equal(parsed_sentence, correct_output) 80 | 81 | 82 | def test_parse_sentence(): 83 | for (tokenize, tagged) in product([True, False], [True, False]): 84 | yield check_parse_sentence, tokenize, tagged 85 | 86 | 87 | def check_parse_file(tokenize=False, tagged=False): 88 | """ 89 | Check parse_file method with and without tokenization 90 | and with and without pre-tagged output 91 | """ 92 | global parser 93 | 94 | if tagged: 95 | prefix = 'test_tagged' 96 | else: 97 | if tokenize: 98 | prefix = 'test' 99 | else: 100 | prefix = 'test_tokenized' 101 | 102 | correct_output = ["(S (NP (PRP I)) (VP (VBP am) (VP (VBG going) (PP (TO to) (NP (DT the) (NN market))))) (. .))", 103 | "(SQ (VBP Are) (NP (PRP you)) (VP (VBG going) (S (VP (TO to) (VP (VB come) (PP (IN with) (NP (PRP me))))))) (. ?))"] 104 | 105 | input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix))) 106 | output_file = abspath(join(_my_dir, '..', 'examples', '{}.parse'.format(prefix))) 107 | 108 | # parse the file 109 | if not tagged: 110 | parser.parse_file(input_file, output_file, tokenize=tokenize) 111 | else: 112 | parser.parse_tagged_file(input_file, output_file) 113 | 114 | # read the output file and make sure we have the expected output 115 | with open(output_file, 'r') as outf: 116 | output = [l.strip() for l in outf.readlines()] 117 | 118 | assert_equal(output, correct_output) 119 | 120 | 121 | def test_parse_file(): 122 | for (tokenize, tagged) in product([True, False], [True, False]): 123 | yield check_parse_file, tokenize, tagged 124 | 125 | -------------------------------------------------------------------------------- /tests/test_tagger.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run unit tests for the ZPar tagger. 3 | 4 | :author: Nitin Madnani (nmadnani@ets.org) 5 | """ 6 | 7 | from __future__ import (absolute_import, division, print_function, 8 | unicode_literals) 9 | 10 | import glob 11 | import os 12 | 13 | from io import open 14 | from os.path import abspath, dirname, join 15 | 16 | from nose.tools import assert_equal 17 | from zpar import ZPar 18 | 19 | _my_dir = abspath(dirname(__file__)) 20 | 21 | z = None 22 | tagger = None 23 | 24 | def setUp(): 25 | """ 26 | set up things we need for the tests 27 | """ 28 | global z, tagger 29 | 30 | assert 'ZPAR_MODEL_DIR' in os.environ 31 | 32 | model_dir = os.environ['ZPAR_MODEL_DIR'] 33 | 34 | z = ZPar(model_dir) 35 | tagger = z.get_tagger() 36 | 37 | def tearDown(): 38 | """ 39 | Clean up after the tests 40 | """ 41 | global z, tagger 42 | 43 | if z: 44 | z.close() 45 | del tagger 46 | del z 47 | 48 | # delete all the files we may have created 49 | data_dir = abspath(join(_my_dir, '..', 'examples')) 50 | for f in glob.glob(join(data_dir, 'test*.tag')): 51 | os.unlink(f) 52 | 53 | 54 | def check_tag_sentence(tokenize=False): 55 | """ 56 | Check tag_sentence method with and without tokenization 57 | """ 58 | global tagger 59 | 60 | sentence = "I'm going to the market." if tokenize else "I 'm going to the market ." 61 | correct_output = "I/PRP 'm/VBP going/VBG to/TO the/DT market/NN ./." 
62 | tagged_sentence = tagger.tag_sentence(sentence, tokenize=tokenize) 63 | 64 | assert_equal(tagged_sentence, correct_output) 65 | 66 | 67 | def test_tag_sentence(): 68 | yield check_tag_sentence, False 69 | yield check_tag_sentence, True 70 | 71 | 72 | def check_tag_file(tokenize=False): 73 | """ 74 | Check tag_file method with and without tokenization 75 | """ 76 | 77 | global tagger 78 | 79 | prefix = 'test' if tokenize else 'test_tokenized' 80 | 81 | correct_output = ['I/PRP am/VBP going/VBG to/TO the/DT market/NN ./.', 82 | 'Are/VBP you/PRP going/VBG to/TO come/VB with/IN me/PRP ?/.'] 83 | 84 | input_file = abspath(join(_my_dir, '..', 'examples', '{}.txt'.format(prefix))) 85 | output_file = abspath(join(_my_dir, '..', 'examples', '{}.tag'.format(prefix))) 86 | 87 | # tag the file 88 | tagger.tag_file(input_file, output_file, tokenize=tokenize) 89 | 90 | # read the output file and make sure we have the expected output 91 | with open(output_file, 'r') as outf: 92 | output = [l.strip() for l in outf.readlines()] 93 | 94 | assert_equal(output, correct_output) 95 | 96 | 97 | def test_tag_file(): 98 | yield check_tag_file, False 99 | yield check_tag_file, True 100 | -------------------------------------------------------------------------------- /zpar/DepParser.py: -------------------------------------------------------------------------------- 1 | # License: MIT 2 | ''' 3 | :author: Nitin Madnani (nmadnani@ets.org) 4 | :organization: ETS 5 | ''' 6 | 7 | import ctypes as c 8 | import logging 9 | import os 10 | 11 | # do we have nltk installed and if so, do we have its 12 | # wordnet corpus installed? 13 | try: 14 | import nltk 15 | nltk.data.find('corpora/wordnet') 16 | except (ImportError, LookupError): 17 | _HAS_LEMMATIZER = False 18 | else: 19 | _HAS_LEMMATIZER = True 20 | from nltk.stem.wordnet import WordNetLemmatizer 21 | 22 | 23 | class DepParser(object): 24 | """The ZPar English Dependency Parser""" 25 | 26 | def __init__(self, modelpath, libptr, zpar_session_obj): 27 | super(DepParser, self).__init__() 28 | 29 | # save the zpar session object 30 | self._zpar_session_obj = zpar_session_obj 31 | 32 | # set up a logger 33 | self.logger = logging.getLogger(__name__) 34 | 35 | # get the library method that loads the parser models 36 | self._load_depparser = libptr.load_depparser 37 | self._load_depparser.restype = c.c_int 38 | self._load_depparser.argtypes = [c.c_void_p, c.c_char_p] 39 | 40 | # get the library methods that parse sentences and files 41 | self._dep_parse_sentence = libptr.dep_parse_sentence 42 | self._dep_parse_sentence.restype = c.c_char_p 43 | self._dep_parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool] 44 | 45 | self._dep_parse_file = libptr.dep_parse_file 46 | self._dep_parse_file.restype = None 47 | self._dep_parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool] 48 | 49 | self._dep_parse_tagged_sentence = libptr.dep_parse_tagged_sentence 50 | self._dep_parse_tagged_sentence.restype = c.c_char_p 51 | self._dep_parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char] 52 | 53 | self._dep_parse_tagged_file = libptr.dep_parse_tagged_file 54 | self._dep_parse_tagged_file.restype = None 55 | self._dep_parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char] 56 | 57 | if self._load_depparser(self._zpar_session_obj, modelpath.encode('utf-8')): 58 | raise OSError('Cannot find dependency parser model at {}\n'.format(modelpath)) 59 | 60 | # set up the wordnet lemmatizer if we have it 61 | if _HAS_LEMMATIZER: 62 | 
self.lemmatizer = WordNetLemmatizer() 63 | else: 64 | self.lemmatizer = None 65 | 66 | def annotate_parse_with_lemmas(self, parse): 67 | if not parse.strip(): 68 | return parse 69 | else: 70 | new_parse_lines = [] 71 | for line in parse.strip().split('\n'): 72 | fields = line.strip().split('\t') 73 | word, pos = fields[:2] 74 | if pos.startswith('J'): 75 | param = 'a' 76 | elif pos.startswith('R'): 77 | param = 'r' 78 | elif pos.startswith('V'): 79 | param = 'v' 80 | else: 81 | param = 'n' 82 | lemma = self.lemmatizer.lemmatize(word.lower(), param) 83 | new_parse_line = '\t'.join(fields + [lemma]) 84 | new_parse_lines.append(new_parse_line) 85 | return '\n'.join(new_parse_lines) + '\n' 86 | 87 | def dep_parse_sentence(self, 88 | sentence, 89 | tokenize=True, 90 | with_lemmas=False): 91 | if not sentence.strip(): 92 | # return empty string if the input is empty 93 | ans = "" 94 | else: 95 | zpar_compatible_sentence = sentence.strip() + "\n " 96 | zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n " 97 | zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8') 98 | parsed_sent = self._dep_parse_sentence(self._zpar_session_obj, 99 | zpar_compatible_sentence, 100 | tokenize) 101 | ans = parsed_sent.decode('utf-8') 102 | 103 | # if we are asked to add lemma information, then we need 104 | # to add another field to each of the lines in the 105 | # parse returned from zpar 106 | if with_lemmas: 107 | if self.lemmatizer: 108 | ans = self.annotate_parse_with_lemmas(ans) 109 | else: 110 | self.logger.warning('No lemmatizer available. Please ' 111 | 'install NLTK and its Wordnet corpus.') 112 | return ans 113 | 114 | def dep_parse_file(self, 115 | inputfile, 116 | outputfile, 117 | tokenize=True, 118 | with_lemmas=False): 119 | 120 | 121 | if not os.path.exists(inputfile): 122 | raise OSError('File {} does not exist.'.format(inputfile)) 123 | else: 124 | parsed = False 125 | 126 | # if we want lemmas, we have to individually parse 127 | # each sentence and then annotate its parse with lemmas 128 | if with_lemmas: 129 | if self.lemmatizer: 130 | with open(inputfile, 'r') as inputf, open(outputfile, 'w') as outf: 131 | for sentence in inputf: 132 | outf.write(self.dep_parse_sentence(sentence, 133 | tokenize=tokenize, 134 | with_lemmas=True) + '\n') 135 | parsed = True 136 | else: 137 | self.logger.warning('No lemmatizer available. Please ' 138 | 'install NLTK and its Wordnet corpus.') 139 | 140 | # otherwise we can just parse the whole file in C++ space 141 | if not parsed: 142 | self._dep_parse_file(self._zpar_session_obj, 143 | inputfile.encode('utf-8'), 144 | outputfile.encode('utf-8'), 145 | tokenize) 146 | 147 | def dep_parse_tagged_sentence(self, 148 | tagged_sentence, 149 | sep='/', 150 | with_lemmas=False): 151 | if not tagged_sentence.strip(): 152 | # return empty string if the input is empty 153 | ans = "" 154 | else: 155 | zpar_compatible_sentence = tagged_sentence.strip().encode('utf-8') 156 | parsed_sent = self._dep_parse_tagged_sentence(self._zpar_session_obj, 157 | zpar_compatible_sentence, 158 | sep.encode('utf-8')) 159 | ans = parsed_sent.decode('utf-8') 160 | 161 | # if we are asked to add lemma information, then we need 162 | # to add another field to each of the lines in the 163 | # parse returned from zpar 164 | if with_lemmas: 165 | if self.lemmatizer: 166 | ans = self.annotate_parse_with_lemmas(ans) 167 | else: 168 | self.logger.warning('No lemmatizer available. 
Please ' 169 | 'install NLTK and its Wordnet corpus.') 170 | return ans 171 | 172 | def dep_parse_tagged_file(self, inputfile, outputfile, sep='/', with_lemmas=False): 173 | 174 | if not os.path.exists(inputfile): 175 | raise OSError('File {} does not exist.'.format(inputfile)) 176 | else: 177 | 178 | parsed = False 179 | 180 | # if we want lemmas, we have to individually parse 181 | # each sentence and then annotate its parse with lemmas 182 | if with_lemmas: 183 | if self.lemmatizer: 184 | with open(inputfile, 'r') as inputf, open(outputfile, 'w') as outf: 185 | for sentence in inputf: 186 | outf.write(self.dep_parse_tagged_sentence(sentence, 187 | sep=sep, 188 | with_lemmas=with_lemmas) + '\n') 189 | 190 | parsed = True 191 | else: 192 | self.logger.warning('No lemmatizer available. Please ' 193 | 'install NLTK and its Wordnet corpus.') 194 | 195 | # otherwise we can just parse the whole file in C++ space 196 | if not parsed: 197 | self._dep_parse_tagged_file(self._zpar_session_obj, 198 | inputfile.encode('utf-8'), 199 | outputfile.encode('utf-8'), 200 | sep.encode('utf-8')) 201 | 202 | def cleanup(self): 203 | self._load_depparser = None 204 | self._dep_parse_sentence = None 205 | self._dep_parse_file = None 206 | self._dep_parse_tagged_sentence = None 207 | self._dep_parse_tagged_file = None 208 | self._zpar_session_obj = None 209 | -------------------------------------------------------------------------------- /zpar/Parser.py: -------------------------------------------------------------------------------- 1 | # License: MIT 2 | ''' 3 | :author: Nitin Madnani (nmadnani@ets.org) 4 | :organization: ETS 5 | ''' 6 | 7 | import ctypes as c 8 | import logging 9 | import os 10 | 11 | 12 | class Parser(object): 13 | """The ZPar English Constituency Parser""" 14 | 15 | def __init__(self, modelpath, libptr, zpar_session_obj): 16 | super(Parser, self).__init__() 17 | 18 | # save the zpar session object 19 | self._zpar_session_obj = zpar_session_obj 20 | 21 | # set up a logger 22 | self.logger = logging.getLogger(__name__) 23 | 24 | # get the library method that loads the parser models 25 | self._load_parser = libptr.load_parser 26 | self._load_parser.restype = c.c_int 27 | self._load_parser.argtypes = [c.c_void_p, c.c_char_p] 28 | 29 | # get the library methods that parse sentences and files 30 | self._parse_sentence = libptr.parse_sentence 31 | self._parse_sentence.restype = c.c_char_p 32 | self._parse_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool] 33 | 34 | self._parse_file = libptr.parse_file 35 | self._parse_file.restype = None 36 | self._parse_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool] 37 | 38 | self._parse_tagged_sentence = libptr.parse_tagged_sentence 39 | self._parse_tagged_sentence.restype = c.c_char_p 40 | self._parse_tagged_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_char] 41 | 42 | self._parse_tagged_file = libptr.parse_tagged_file 43 | self._parse_tagged_file.restype = None 44 | self._parse_tagged_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_char] 45 | 46 | if self._load_parser(self._zpar_session_obj, modelpath.encode('utf-8')): 47 | raise OSError('Cannot find parser model at {}\n'.format(modelpath)) 48 | 49 | def parse_sentence(self, sentence, tokenize=True): 50 | if not sentence.strip(): 51 | # return empty string if the input is empty 52 | ans = "" 53 | else: 54 | zpar_compatible_sentence = sentence.strip() + "\n " 55 | zpar_compatible_sentence = zpar_compatible_sentence.strip() + "\n " 56 | zpar_compatible_sentence = 
zpar_compatible_sentence.encode('utf-8')
57 |             parsed_sent = self._parse_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
58 |             ans = parsed_sent.decode('utf-8')
59 | 
60 |         return ans
61 | 
62 |     def parse_file(self, inputfile, outputfile, tokenize=True):
63 |         if os.path.exists(inputfile):
64 |             self._parse_file(self._zpar_session_obj, inputfile.encode('utf-8'), outputfile.encode('utf-8'), tokenize)
65 |         else:
66 |             raise OSError('File {} does not exist.'.format(inputfile))
67 | 
68 |     def parse_tagged_sentence(self, tagged_sentence, sep='/'):
69 |         if not tagged_sentence.strip():
70 |             # return empty string if the input is empty
71 |             ans = ""
72 |         else:
73 |             zpar_compatible_sentence = tagged_sentence.strip().encode('utf-8')
74 |             parsed_sent = self._parse_tagged_sentence(self._zpar_session_obj, zpar_compatible_sentence, sep.encode('utf-8'))
75 |             ans = parsed_sent.decode('utf-8')
76 |         return ans
77 | 
78 |     def parse_tagged_file(self, inputfile, outputfile, sep='/'):
79 |         if os.path.exists(inputfile):
80 |             self._parse_tagged_file(self._zpar_session_obj, inputfile.encode('utf-8'), outputfile.encode('utf-8'), sep.encode('utf-8'))
81 |         else:
82 |             raise OSError('File {} does not exist.'.format(inputfile))
83 | 
84 |     def cleanup(self):
85 |         self._load_parser = None
86 |         self._parse_sentence = None
87 |         self._parse_file = None
88 |         self._parse_tagged_sentence = None
89 |         self._parse_tagged_file = None
90 |         self._zpar_session_obj = None
91 | 
--------------------------------------------------------------------------------
/zpar/Tagger.py:
--------------------------------------------------------------------------------
1 | # License: MIT
2 | '''
3 | :author: Nitin Madnani (nmadnani@ets.org)
4 | :organization: ETS
5 | '''
6 | import ctypes as c
7 | import logging
8 | import os
9 | 
10 | 
11 | class Tagger(object):
12 |     """The ZPar English POS Tagger"""
13 | 
14 |     def __init__(self, modelpath, libptr, zpar_session_obj):
15 |         super(Tagger, self).__init__()
16 | 
17 |         # save the zpar session object
18 |         self._zpar_session_obj = zpar_session_obj
19 | 
20 |         # set up a logger
21 |         self.logger = logging.getLogger(__name__)
22 | 
23 |         # get the library method that loads the tagger models
24 |         self._load_tagger = libptr.load_tagger
25 |         self._load_tagger.restype = c.c_int
26 |         self._load_tagger.argtypes = [c.c_void_p, c.c_char_p]
27 | 
28 |         # get the library methods that tag sentences and files
29 |         self._tag_sentence = libptr.tag_sentence
30 |         self._tag_sentence.restype = c.c_char_p
31 |         self._tag_sentence.argtypes = [c.c_void_p, c.c_char_p, c.c_bool]
32 | 
33 |         self._tag_file = libptr.tag_file
34 |         self._tag_file.restype = None
35 |         self._tag_file.argtypes = [c.c_void_p, c.c_char_p, c.c_char_p, c.c_bool]
36 | 
37 |         if self._load_tagger(self._zpar_session_obj, modelpath.encode('utf-8')):
38 |             raise OSError('Cannot find tagger model at {}\n'.format(modelpath))
39 | 
40 |     def tag_sentence(self, sentence, tokenize=True):
41 |         if not sentence.strip():
42 |             # return empty string if the input is empty
43 |             ans = ""
44 |         else:
45 |             zpar_compatible_sentence = sentence.strip() + "\n "
46 |             zpar_compatible_sentence = zpar_compatible_sentence.encode('utf-8')
47 |             tagged_sent = self._tag_sentence(self._zpar_session_obj, zpar_compatible_sentence, tokenize)
48 |             ans = tagged_sent.decode('utf-8')
49 | 
50 | 
51 |         return ans
52 | 
53 |     def tag_file(self, inputfile, outputfile, tokenize=True):
54 |         if os.path.exists(inputfile):
55 |             self._tag_file(self._zpar_session_obj, inputfile.encode('utf-8'),
outputfile.encode('utf-8'), tokenize) 56 | else: 57 | raise OSError('File {} does not exist.'.format(inputfile)) 58 | 59 | def cleanup(self): 60 | self._load_tagger = None 61 | self._tag_sentence = None 62 | self._tag_file = None 63 | self._zpar_session_obj = None 64 | 65 | -------------------------------------------------------------------------------- /zpar/__init__.py: -------------------------------------------------------------------------------- 1 | # License: MIT 2 | ''' 3 | :author: Nitin Madnani (nmadnani@ets.org) 4 | :organization: ETS 5 | ''' 6 | 7 | import _ctypes 8 | import ctypes as c 9 | import os 10 | 11 | from .Tagger import Tagger 12 | from .Parser import Parser 13 | from .DepParser import DepParser 14 | 15 | __all__ = ['Tagger', 'Parser', 'DepParser'] 16 | 17 | class ZPar(object): 18 | """The ZPar wrapper object""" 19 | 20 | def __init__(self, modelpath): 21 | super(ZPar, self).__init__() 22 | 23 | # get a pointer to the zpar shared library 24 | base_path = os.path.dirname(os.path.abspath(__file__)) 25 | zpar_path = os.path.join(base_path, 'dist', 'zpar.so') 26 | self.libptr = c.cdll.LoadLibrary(zpar_path) 27 | 28 | # call the library's initialize method to instantiate 29 | # the session object associated with this session 30 | self._initialize = self.libptr.initialize 31 | self._initialize.restype = c.c_void_p 32 | self._initialize.argtypes = None 33 | self._zpar_session_obj = self._initialize() 34 | 35 | self.modelpath = modelpath 36 | self.tagger = None 37 | self.parser = None 38 | self.depparser = None 39 | 40 | def close(self): 41 | 42 | # unload the models on the C++ side 43 | _unload_models = self.libptr.unload_models 44 | _unload_models.restype = None 45 | _unload_models.argtypes = [c.c_void_p] 46 | self.libptr.unload_models(self._zpar_session_obj) 47 | 48 | # clean up the data structures on the python side 49 | if self.tagger: 50 | self.tagger.cleanup() 51 | 52 | if self.parser: 53 | self.parser.cleanup() 54 | 55 | if self.depparser: 56 | self.depparser.cleanup() 57 | 58 | # set all the fields to none to enable clean reuse 59 | self.tagger = None 60 | self.parser = None 61 | self.depparser = None 62 | self.modelpath = None 63 | 64 | # clean up the CDLL object too so that upon reuse, we get a new one 65 | _ctypes.dlclose(self.libptr._handle) 66 | # pretty sure once the old object libptr was pointed to should 67 | # get garbage collected at some point after this 68 | self.libptr = None 69 | self._zpar_session_obj = None 70 | 71 | def __enter__(self): 72 | """Enable ZPar to be used as a ContextManager""" 73 | return self 74 | 75 | def __exit__(self, type, value, traceback): 76 | """Clean up when done""" 77 | self.close() 78 | 79 | def get_tagger(self): 80 | if not self.libptr: 81 | raise Exception('Cannot get tagger from uninitialized ZPar environment.') 82 | return None 83 | else: 84 | self.tagger = Tagger(self.modelpath, self.libptr, self._zpar_session_obj) 85 | return self.tagger 86 | 87 | def get_parser(self): 88 | if not self.libptr: 89 | raise Exception('Cannot get parser from uninitialized ZPar environment.') 90 | return None 91 | else: 92 | self.parser = Parser(self.modelpath, self.libptr, self._zpar_session_obj) 93 | return self.parser 94 | 95 | def get_depparser(self): 96 | if not self.libptr: 97 | raise Exception('Cannot get parser from uninitialized ZPar environment.') 98 | return None 99 | else: 100 | self.depparser = DepParser(self.modelpath, self.libptr, self._zpar_session_obj) 101 | return self.depparser 102 | 103 | 
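Putting the pieces above together, a minimal usage sketch for the wrapper, assuming the English models have been unzipped to a hypothetical /home/user/english-models directory:

    from zpar import ZPar

    # ZPar is a context manager, so close() runs automatically on exit
    with ZPar('/home/user/english-models') as z:
        tagger = z.get_tagger()
        print(tagger.tag_sentence("I am going to the market ."))

        parser = z.get_parser()
        print(parser.parse_tagged_sentence("I/PRP am/VBP going/VBG to/TO the/DT market/NN ./."))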
-------------------------------------------------------------------------------- /zpar/zpar_server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import logging 5 | import os 6 | import six 7 | import sys 8 | 9 | from zpar import ZPar 10 | 11 | if six.PY2: 12 | from SimpleXMLRPCServer import SimpleXMLRPCServer 13 | else: 14 | from xmlrpc.server import SimpleXMLRPCServer 15 | 16 | class ModelNotFoundError(Exception): 17 | 18 | def __init__(self, model_name, model_path): 19 | Exception.__init__(self) 20 | self.model_name = model_name 21 | self.model_path = model_path 22 | 23 | def __str__(self): 24 | if self.model_name != 'all': 25 | return "No {} model could be found at {}".format(self.model_name, 26 | self.model_path) 27 | else: 28 | return "No models could be found at {}".format(self.model_path) 29 | 30 | 31 | _baseclass = SimpleXMLRPCServer 32 | class StoppableServer(_baseclass): 33 | 34 | allow_reuse_address = True 35 | 36 | def __init__(self, addr, zpar_model_path, model_list, *args, **kwds): 37 | 38 | # store the hostname and port number 39 | self.myhost, self.myport = addr 40 | 41 | # store the link to the loaded zpar object 42 | self.z = ZPar(zpar_model_path) 43 | 44 | # initialize the parent class 45 | _baseclass.__init__(self, addr, *args, **kwds) 46 | 47 | # Call the individual loading functions 48 | # and only register the appropriate methods 49 | if 'tagger' in model_list: 50 | tagger = self.z.get_tagger() 51 | self.register_function(tagger.tag_sentence) 52 | self.register_function(tagger.tag_file) 53 | if 'parser' in model_list: 54 | parser = self.z.get_parser() 55 | self.register_function(parser.parse_sentence) 56 | self.register_function(parser.parse_file) 57 | self.register_function(parser.parse_tagged_sentence) 58 | self.register_function(parser.parse_tagged_file) 59 | if 'depparser' in model_list: 60 | parser = self.z.get_depparser() 61 | self.register_function(parser.dep_parse_sentence) 62 | self.register_function(parser.dep_parse_file) 63 | self.register_function(parser.dep_parse_tagged_sentence) 64 | self.register_function(parser.dep_parse_tagged_file) 65 | 66 | # register the function to remotely stop the server 67 | self.register_function(self.stop_server) 68 | 69 | self.quit = False 70 | 71 | def serve_forever(self): 72 | while not self.quit: 73 | try: 74 | self.handle_request() 75 | except KeyboardInterrupt: 76 | print("\nKeyboard interrupt received, exiting.") 77 | break 78 | self.z.close() 79 | self.server_close() 80 | 81 | def stop_server(self): 82 | self.quit = True 83 | return 0, "Server terminated on host %r, port %r" % (self.myhost, self.myport) 84 | 85 | 86 | def main(): 87 | # set up an argument parser 88 | parser = argparse.ArgumentParser(prog='zpar_server.py', \ 89 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 90 | parser.add_argument('--modeldir', dest='modeldir', 91 | help="Path to directory containing zpar English models", 92 | required=True) 93 | 94 | parser.add_argument('--models', dest='models', nargs='+', 95 | help="Load only these models", 96 | required=True) 97 | 98 | parser.add_argument('--host', dest='hostname', 99 | help="Hostname or IP address", 100 | default="localhost", 101 | required=False) 102 | 103 | parser.add_argument('--port', dest='port', type=int, 104 | help="Port number", 105 | default=8859, 106 | required=False) 107 | 108 | parser.add_argument('--log', dest='log', action="store_true", 109 | default=False, 110 | help="Log server 
requests") 111 | 112 | 113 | # parse given command line arguments 114 | args = parser.parse_args() 115 | 116 | # check to make sure that the specified models 117 | # are those we know about 118 | if set(args.models).difference(['tagger', 'parser', 'depparser']): 119 | sys.stderr.write('Error: invalid model(s) specified. Choices are: "tagger", "parser", and "depparser".\n') 120 | sys.exit(1) 121 | 122 | # set up the logging 123 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO) 124 | 125 | # Create a server that is built on top of this ZPAR data structure 126 | logging.info('Initializing server ...') 127 | server = StoppableServer((args.hostname, args.port), 128 | args.modeldir, args.models, 129 | logRequests=args.log, 130 | allow_none=True) 131 | 132 | # Register introspection functions with the server 133 | logging.info('Registering introspection ...') 134 | server.register_introspection_functions() 135 | 136 | # Start the server 137 | logging.info('Starting server on port {}...'.format(args.port)) 138 | server.serve_forever() 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | --------------------------------------------------------------------------------