├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── setup.py └── syntaxnet_wrapper ├── __init__.py ├── makefile ├── morpher_eval_forever.py ├── parser_eval_forever.py ├── tagger_eval_forever.py ├── test.sh ├── tests ├── __init__.py └── test_parser.py └── tokenizer_eval_forever.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | syntaxnet_wrapper/models/ 3 | build 4 | syntaxnet_wrapper.egg-info 5 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | from gliacloud/base_images:django 2 | 3 | run apt-get install python-software-properties software-properties-common python-software-properties -y 4 | 5 | #RUN echo "deb http://ppa.launchpad.net/webupd8team/java/ubuntu trusty main" | tee -a /etc/apt/sources.list 6 | #RUN echo "deb-src http://ppa.launchpad.net/webupd8team/java/ubuntu trusty main" | tee -a /etc/apt/sources.list 7 | #RUN echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | /usr/bin/debconf-set-selections 8 | #RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys EEA14886 && apt-get update && apt-get install -y curl dnsutils oracle-java8-installer ca-certificates 9 | 10 | RUN add-apt-repository -y ppa:openjdk-r/ppa 11 | RUN apt-get -y update 12 | RUN apt-get -y install openjdk-8-jdk 13 | RUN apt-get install -y swig unzip wget 14 | 15 | 16 | RUN apt-get update \ 17 | && apt-get install git zlib1g-dev file swig python2.7 python-dev python-pip python-mock -y \ 18 | && pip install --upgrade pip \ 19 | && pip install -U protobuf==3.0.0b2 \ 20 | && pip install asciitree \ 21 | && pip install numpy \ 22 | && wget https://github.com/bazelbuild/bazel/releases/download/0.4.3/bazel-0.4.3-installer-linux-x86_64.sh \ 23 | && chmod +x bazel-0.4.3-installer-linux-x86_64.sh \ 24 | && ./bazel-0.4.3-installer-linux-x86_64.sh \ 25 | && apt-get autoremove 26 | 27 | # install latest bazel 28 | #run echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list 29 | #run curl https://bazel.build/bazel-release.pub.gpg | apt-key add - 30 | #run apt-get update && apt-get install -y bazel 31 | 32 | run pip install virtualenv 33 | 34 | add . /work 35 | workdir /work 36 | run python setup.py install 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2016-2017 GliaCloud, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Python Wrapper for Google SyntaxNet 2 | 3 | ## Installation 4 | 5 | ### Prerequisites 6 | 7 | #### Install OpenJDK 8. 8 | 9 | ```shell-script 10 | add-apt-repository -y ppa:openjdk-r/ppa 11 | apt-get -y update 12 | apt-get -y install openjdk-8-jdk 13 | ``` 14 | 15 | #### Install `bazel` and include `bazel` in `$PATH`. 16 | 17 | **Note:** Only bazel 0.4.3 is known to work; bazel 0.4.4 may cause build errors. 18 | 19 | ```shell-script 20 | wget https://github.com/bazelbuild/bazel/releases/download/0.4.3/bazel-0.4.3-installer-linux-x86_64.sh 21 | chmod +x bazel-0.4.3-installer-linux-x86_64.sh 22 | ./bazel-0.4.3-installer-linux-x86_64.sh --user 23 | rm bazel-0.4.3-installer-linux-x86_64.sh 24 | export PATH="$PATH:$HOME/bin" 25 | ``` 26 | 27 | #### Install system package dependencies. 28 | 29 | ```shell-script 30 | apt-get -y install swig unzip 31 | ``` 32 | 33 | #### Install Python packages. 34 | 35 | **Note:** The current version of SyntaxNet must be used with TensorFlow r1.0. 36 | 37 | ```shell-script 38 | pip install tensorflow protobuf asciitree mock 39 | ``` 40 | 41 | 42 | #### Install the wrapper 43 | 44 | ```shell-script 45 | pip install git+ssh://git@github.com/livingbio/syntaxnet_wrapper.git#egg=syntaxnet_wrapper 46 | ``` 47 | 48 | #### If the installation fails... 49 | 50 | Execute [test.sh](https://github.com/livingbio/syntaxnet_wrapper/blob/master/syntaxnet_wrapper/test.sh); you should see the following output: 51 | 52 | ``` 53 | 1 Bob _ PROPN NNP Number=Sing|fPOS=PROPN++NNP 2 nsubj _ _ 54 | 2 brought _ VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin|fPOS=VERB++VBD 0 ROOT _ _ 55 | 3 the _ DET DT Definite=Def|PronType=Art|fPOS=DET++DT 4 det _ _ 56 | 4 pizza _ NOUN NN Number=Sing|fPOS=NOUN++NN 2 dobj _ _ 57 | 5 to _ ADP IN fPOS=ADP++IN 6 case _ _ 58 | 6 Alice. _ PROPN NNP Number=Sing|fPOS=PROPN++NNP 2 nmod _ _ 59 | 60 | 1 球 _ PROPN NNP fPOS=PROPN++NNP 4 nsubj _ _ 61 | 2 從 _ ADP IN fPOS=ADP++IN 3 case _ _ 62 | 3 天上 _ NOUN NN fPOS=NOUN++NN 4 nmod _ _ 63 | 4 掉 _ VERB VV fPOS=VERB++VV 0 ROOT _ _ 64 | 5 下來 _ VERB VV fPOS=VERB++VV 4 mark _ _ 65 | 66 | 球 從天 上 掉 下 來 67 | ``` 68 | 69 | If the output matches, the problem lies in the wrapper itself. If the output is wrong, the SyntaxNet build has likely failed.
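If `test.sh` looks correct but the wrapper still misbehaves, a quick way to probe the Python side is to push the same sentences through the wrapper API described in the Usage section below (a minimal sketch):

```python
from syntaxnet_wrapper import parser

# the same sentences test.sh feeds to the demo scripts
print parser['en'].query('Bob brought the pizza to Alice.', returnRaw=True)
print parser['zh'].query(u'球 從 天上 掉 下來', returnRaw=True)
```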
70 | 71 | ## Usage 72 | 73 | ```python 74 | from syntaxnet_wrapper import tagger, parser 75 | 76 | print tagger['en'].query('this is a good day', returnRaw=True) 77 | # 1 this _ DET DT _ 0 _ _ _ 78 | # 2 is _ VERB VBZ _ 0 _ _ _ 79 | # 3 a _ DET DT _ 0 _ _ _ 80 | # 4 good _ ADJ JJ _ 0 _ _ _ 81 | # 5 day _ NOUN NN _ 0 _ _ _ 82 | tagger['en'].query('this is a good day') # by default, returns a list of token rows split into fields 83 | 84 | print parser['en'].query('Alice drove down the street in her car', returnRaw=True) 85 | # 1 Alice _ NOUN NNP _ 2 nsubj _ _ 86 | # 2 drove _ VERB VBD _ 0 ROOT _ _ 87 | # 3 down _ ADP IN _ 2 prep _ _ 88 | # 4 the _ DET DT _ 5 det _ _ 89 | # 5 street _ NOUN NN _ 3 pobj _ _ 90 | # 6 in _ ADP IN _ 2 prep _ _ 91 | # 7 her _ PRON PRP$ _ 8 poss _ _ 92 | # 8 car _ NOUN NN _ 6 pobj _ _ 93 | 94 | # use the Chinese model 95 | print tagger['zh'].query(u'今天 天氣 很 好', returnRaw=True) 96 | # 1 今天 _ NOUN NN fPOS=NOUN++NN 0 _ _ _ 97 | # 2 天氣 _ NOUN NN fPOS=NOUN++NN 0 _ _ _ 98 | # 3 很 _ ADV RB fPOS=ADV++RB 0 _ _ _ 99 | # 4 好 _ ADJ JJ fPOS=ADJ++JJ 0 _ _ _ 100 | 101 | print parser['zh'].query(u'今天 天氣 很 好', returnRaw=True) 102 | # 1 今天 _ NOUN NN fPOS=NOUN++NN 4 nmod:tmod _ _ 103 | # 2 天氣 _ NOUN NN fPOS=NOUN++NN 4 nsubj _ _ 104 | # 3 很 _ ADV RB fPOS=ADV++RB 4 advmod _ _ 105 | # 4 好 _ ADJ JJ fPOS=ADJ++JJ 0 ROOT _ _ 106 | ``` 107 | 108 | ### Language Selection 109 | 110 | The default model is `'English-Parsey'`, the Parsey McParseface model 111 | [announced by Google](https://research.googleblog.com/2016/05/announcing-syntaxnet-worlds-most.html) 112 | in May 2016. 113 | Other models, including `'English'`, are trained on [Universal Dependencies](http://universaldependencies.org/) treebanks and were 114 | [announced by Google](https://research.googleblog.com/2016/08/meet-parseys-cousins-syntax-for-40.html) 115 | in August 2016.
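For example, to run one of the Universal Dependencies models, pick its language code from the mapping shown below (a minimal sketch; the German sentence is only an illustration):

```python
from syntaxnet_wrapper import parser, language_code_to_model_name

language_code_to_model_name['de']  # 'German'
print parser['de'].query(u'Das ist ein guter Tag', returnRaw=True)  # CoNLL rows from the German model
```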
116 | 117 | ```python 118 | from syntaxnet_wrapper import language_code_to_model_name 119 | language_code_to_model_name 120 | # {'ar': 'Arabic', 121 | # 'bg': 'Bulgarian', 122 | # 'ca': 'Catalan', 123 | # 'cs': 'Czech', 124 | # 'da': 'Danish', 125 | # 'de': 'German', 126 | # 'el': 'Greek', 127 | # 'en': 'English-Parsey', 128 | # 'en-uni': 'English', 129 | # 'es': 'Spanish', 130 | # 'et': 'Estonian', 131 | # 'eu': 'Basque', 132 | # 'fa': 'Persian', 133 | # 'fi': 'Finnish', 134 | # 'fr': 'French', 135 | # 'ga': 'Irish', 136 | # 'gl': 'Galician', 137 | # 'hi': 'Hindi', 138 | # 'hr': 'Croatian', 139 | # 'hu': 'Hungarian', 140 | # 'id': 'Indonesian', 141 | # 'it': 'Italian', 142 | # 'iw': 'Hebrew', 143 | # 'kk': 'Kazakh', 144 | # 'la': 'Latin', 145 | # 'lv': 'Latvian', 146 | # 'nl': 'Dutch', 147 | # 'no': 'Norwegian', 148 | # 'pl': 'Polish', 149 | # 'pt': 'Portuguese', 150 | # 'ro': 'Romanian', 151 | # 'ru': 'Russian', 152 | # 'sl': 'Slovenian', 153 | # 'sv': 'Swedish', 154 | # 'ta': 'Tamil', 155 | # 'tr': 'Turkish', 156 | # 'zh': 'Chinese', 157 | # 'zh-cn': 'Chinese', 158 | # 'zh-tw': 'Chinese'} 159 | ``` 160 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from setuptools import setup 3 | from setuptools.command.install import install 4 | import sys 5 | import subprocess 6 | 7 | 8 | class InstallClass(install): 9 | def run(self): 10 | install.run(self) 11 | subprocess.call(['pip', 'install', 'tensorflow', 'virtualenv', 'protobuf', 'asciitree', 'mock']) 12 | sys.path.reverse() 13 | import syntaxnet_wrapper 14 | syntaxnet_wrapper_dir = syntaxnet_wrapper.__path__[0] 15 | subprocess.call(['make'], cwd=syntaxnet_wrapper_dir) 16 | 17 | 18 | setup(name='syntaxnet_wrapper', 19 | version='0.4.1', 20 | description='A Python Wrapper for Google SyntaxNet', 21 | url='https://github.com/livingbio/syntaxnet_wrapper', 22 | author='Ping Chu Hung', 23 | author_email='banyhong@gliacloud.com', 24 | license='MIT', 25 | packages=['syntaxnet_wrapper'], 26 | zip_safe=False, 27 | install_requires=[ 28 | 'tensorflow', 29 | 'virtualenv', 30 | 'protobuf', 31 | 'asciitree', 32 | 'mock', 33 | ], 34 | cmdclass={ 35 | 'install': InstallClass, 36 | }, 37 | test_suite='nose.collector', 38 | tests_require=['nose'], 39 | include_package_data=True, 40 | package_data={'syntaxnet_wrapper': ['makefile']} 41 | ) 42 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/__init__.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import os 3 | import stat 4 | import six 5 | import signal 6 | 7 | from os.path import join, dirname, abspath 8 | from fcntl import fcntl, F_SETFL, F_GETFD 9 | import time 10 | from datetime import datetime, timedelta 11 | 12 | __all__ = ['parser', 'tagger'] 13 | pwd = dirname(abspath(__file__)) 14 | 15 | PIDFILE_PATH = os.path.join(pwd, 'pids') 16 | 17 | import logging 18 | import re 19 | logger = logging.getLogger() 20 | 21 | class TimeoutException(Exception): 22 | pass 23 | 24 | class SyntaxNetWrapper(object): 25 | 26 | def __del__(self): 27 | self.stop() 28 | 29 | def clean_zombie_process(self): 30 | for pidfile in os.listdir(PIDFILE_PATH): 31 | if not pidfile.endswith('.pid') or pidfile.count('_') != 2: 32 | continue 33 | pid, model, clsname = pidfile.split('_') 34 | try: 35 | os.kill(int(pid), 0) 36 | 
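# note: signal 0 does not terminate the process; os.kill(pid, 0) only raises OSError when the PID no longer exists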
os.unlink(os.path.join(PIDFILE_PATH, pidfile)) 37 | except: 38 | logger.info('kill zombie process {}'.format(pid)) 39 | self.kill_process(pidfile) 40 | 41 | def kill_process(self, pidfile): 42 | try: 43 | with open(os.path.join(PIDFILE_PATH, pidfile)) as f: 44 | pid = f.read().strip() 45 | try: 46 | os.kill(int(pid), 9) 47 | except Exception as e: 48 | logger.info(e) 49 | os.unlink(os.path.join(PIDFILE_PATH, pidfile)) 50 | except Exception as e: 51 | logger.info(e) 52 | 53 | def make_pidfile(self): 54 | if not os.path.isdir(PIDFILE_PATH): 55 | os.mkdir(PIDFILE_PATH) 56 | pidfilename = os.path.join(PIDFILE_PATH, "{}.pid".format(self.name)) 57 | for fn in os.listdir(PIDFILE_PATH): 58 | if not fn.endswith('.pid') or fn.count('_') != 2: 59 | continue 60 | pid, model, clsname = fn.split('_') 61 | if clsname == self.__class__.__name__ + '.pid' and model == self.model_name: 62 | self.kill_process(fn) 63 | with open(pidfilename, 'w+') as f: 64 | f.write(str(self.process.pid)) 65 | os.chmod(pidfilename, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH) 66 | 67 | @property 68 | def name(self): 69 | return u"{}_{}_{}".format(os.getpid(), self.model_name, self.__class__.__name__) 70 | 71 | 72 | def start(self): 73 | rundir = join(pwd, 'models/syntaxnet/bazel-bin/syntaxnet/parser_eval.runfiles/__main__') 74 | command = ['python', self.run_filename, self.model_path, self.context_path] 75 | 76 | env = os.environ.copy() 77 | env['PYTHONPATH'] = rundir 78 | subproc_args = {'stdin': subprocess.PIPE, 'stdout': subprocess.PIPE, 79 | 'stderr': subprocess.STDOUT, 'cwd': pwd, 80 | 'env': env, 'close_fds': True} 81 | self.process = subprocess.Popen(command, shell=False, **subproc_args) 82 | self.out = self.process.stdout 83 | self.din = self.process.stdin 84 | fcntl(self.out.fileno(), F_SETFL, fcntl(self.out.fileno(), F_GETFD) | os.O_NONBLOCK) 85 | self.make_pidfile() 86 | 87 | def stop(self): 88 | self.din.close() 89 | try: 90 | import signal # wordaround for AttributeError("'NoneType' object has no attribute 'SIGTERM'",) 91 | os.kill(self.process.pid, signal.SIGTERM) 92 | self.process.send_signal(signal.SIGTERM) 93 | self.process.kill() 94 | self.process.wait() 95 | except OSError: 96 | pass 97 | 98 | def __init__(self, run_filename, model_name): 99 | 100 | self.model_name = model_name 101 | self.run_filename = run_filename 102 | 103 | if model_name == 'English-Parsey': 104 | model_path = 'models/syntaxnet' 105 | context_path = 'models/syntaxnet/syntaxnet/models/parsey_mcparseface/context.pbtxt' 106 | elif model_name == 'ZHTokenizer': 107 | model_path = 'models/syntaxnet/syntaxnet/models/parsey_universal/Chinese' 108 | context_path = 'models/syntaxnet/syntaxnet/models/parsey_universal/context-tokenize-zh.pbtxt' 109 | else: 110 | model_path = 'models/syntaxnet/syntaxnet/models/parsey_universal/{!s}'.format(model_name) 111 | context_path = 'models/syntaxnet/syntaxnet/models/parsey_universal/context.pbtxt' 112 | 113 | context_path = join(pwd, context_path) 114 | model_path = join(pwd, model_path) 115 | 116 | self.model_path = model_path 117 | self.context_path = context_path 118 | 119 | self.start() 120 | 121 | def restart(self): 122 | self.stop() 123 | self.start() 124 | 125 | def wait_for(self, text, timeout=5): 126 | result = [] 127 | start_time = datetime.now() 128 | while True: 129 | try: 130 | line = self.out.readline().decode('utf-8').strip() 131 | if text == line: 132 | return result 133 | result.append(line) 134 | except: 135 | # read timeout 136 | time.sleep(0.1) 
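# nothing to read yet on the non-blocking stdout pipe; back off briefly and let the finally clause below enforce the overall timeout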
137 | finally: 138 | now = datetime.now() 139 | if(now - start_time) > timedelta(0, timeout): 140 | raise TimeoutException() 141 | 142 | def __query(self, text, returnRaw=False): 143 | self.wait_for('## input content:') 144 | 145 | # push data 146 | self.din.write(text.encode('utf8') + six.b('\n')) 147 | self.din.flush() 148 | self.process.send_signal(signal.SIGALRM) 149 | 150 | self.wait_for('## result start') 151 | results = self.wait_for('## result end') 152 | 153 | if returnRaw: 154 | return '\n'.join(results).strip() + "\n" 155 | return [r.split('\t') for r in results[:-2]] 156 | 157 | def query(self,text, returnRaw=False): 158 | for i in xrange(3): 159 | try: 160 | return self.__query(text, returnRaw) 161 | except Exception as e: 162 | # retart process 163 | self.restart() 164 | 165 | 166 | def list_models(self): 167 | pwd = dirname(abspath(__file__)) 168 | model_path = os.path.join(pwd, 'models/syntaxnet/syntaxnet/models/parsey_universal') 169 | files = os.listdir(model_path) 170 | models = [] 171 | for fn in files: 172 | if os.path.isdir(os.path.join(model_path, fn)): 173 | models.append(fn) 174 | models.append('English-Parsey') 175 | return sorted(models) 176 | 177 | 178 | class SyntaxNetTokenizer(SyntaxNetWrapper): 179 | 180 | def __init__(self, model_name='ZHTokenizer'): 181 | super(SyntaxNetTokenizer, self).__init__('tokenizer_eval_forever.py', model_name) 182 | 183 | def query(self, text): 184 | return super(SyntaxNetTokenizer, self).query(text, returnRaw=True) 185 | 186 | 187 | class SyntaxNetMorpher(SyntaxNetWrapper): 188 | 189 | def __init__(self, model_name='English'): 190 | if model_name == 'Chinese': 191 | self.tokenizer = SyntaxNetTokenizer() 192 | else: 193 | self.tokenizer = None 194 | super(SyntaxNetMorpher, self).__init__('morpher_eval_forever.py', model_name) 195 | 196 | def query(self, text, returnRaw=False): 197 | if self.tokenizer: 198 | tokenized_text = self.tokenizer.query(text) 199 | else: 200 | tokenized_text = text 201 | return super(SyntaxNetMorpher, self).query(tokenized_text, returnRaw) 202 | 203 | def query_raw(self, tokenized_text, returnRaw=False): 204 | return super(SyntaxNetMorpher, self).query(tokenized_text, returnRaw) 205 | 206 | 207 | class SyntaxNetTagger(SyntaxNetWrapper): 208 | 209 | def __init__(self, model_name='English-Parsey', **kwargs): 210 | if model_name == 'English-Parsey': 211 | self.morpher = None 212 | elif 'morpher' in kwargs: 213 | self.morpher = kwargs['morpher'] 214 | else: 215 | self.morpher = SyntaxNetMorpher(model_name) 216 | super(SyntaxNetTagger, self).__init__('tagger_eval_forever.py', model_name) 217 | 218 | def query(self, morphed_text, returnRaw=False): 219 | if self.morpher: 220 | conll_text = self.morpher.query(morphed_text, returnRaw=True) 221 | else: 222 | conll_text = morphed_text 223 | return super(SyntaxNetTagger, self).query(conll_text, returnRaw) 224 | 225 | def query_raw(self, conll_text, returnRaw=False): 226 | return super(SyntaxNetTagger, self).query(conll_text, returnRaw) 227 | 228 | 229 | class SyntaxNetParser(SyntaxNetWrapper): 230 | 231 | def __init__(self, model_name='English-Parsey', **kwargs): 232 | if 'tagger' in kwargs: 233 | self.tagger = kwargs['tagger'] 234 | self.morpher = self.tagger.morpher 235 | else: 236 | if model_name == 'English-Parsey': 237 | self.morpher = None 238 | elif 'morpher' in kwargs: 239 | self.morpher = kwargs['morpher'] 240 | else: 241 | self.morpher = SyntaxNetMorpher(model_name) 242 | self.tagger = SyntaxNetTagger(model_name, morpher=self.morpher) 243 | 
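# a parser chains raw text -> morphology -> POS tags -> dependencies, reusing (or building) a tagger and morpher, each running as its own long-lived subprocess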
super(SyntaxNetParser, self).__init__('parser_eval_forever.py', model_name) 244 | 245 | def query(self, text, returnRaw=False): 246 | conll_text = self.tagger.query(text, returnRaw=True) 247 | return super(SyntaxNetParser, self).query(conll_text, returnRaw) 248 | 249 | def query_raw(self, conll_text, returnRaw=False): 250 | return super(SyntaxNetParser, self).query(conll_text, returnRaw) 251 | 252 | 253 | language_code_to_model_name = { 254 | 'ar': 'Arabic', 255 | 'eu': 'Basque', 256 | 'bg': 'Bulgarian', 257 | 'ca': 'Catalan', 258 | 'zh': 'Chinese', 259 | 'zh-tw': 'Chinese', 260 | 'zh-cn': 'Chinese', 261 | 'hr': 'Croatian', 262 | 'cs': 'Czech', 263 | 'da': 'Danish', 264 | 'nl': 'Dutch', 265 | 'en': 'English-Parsey', 266 | 'et': 'Estonian', 267 | 'fi': 'Finnish', 268 | 'fr': 'French', 269 | 'gl': 'Galician', 270 | 'de': 'German', 271 | 'el': 'Greek', 272 | 'iw': 'Hebrew', 273 | 'hi': 'Hindi', 274 | 'hu': 'Hungarian', 275 | 'id': 'Indonesian', 276 | 'ga': 'Irish', 277 | 'it': 'Italian', 278 | 'kk': 'Kazakh', 279 | 'la': 'Latin', 280 | 'lv': 'Latvian', 281 | 'no': 'Norwegian', 282 | 'fa': 'Persian', 283 | 'pl': 'Polish', 284 | 'pt': 'Portuguese', 285 | 'ro': 'Romanian', 286 | 'ru': 'Russian', 287 | 'sl': 'Slovenian', 288 | 'es': 'Spanish', 289 | 'sv': 'Swedish', 290 | 'ta': 'Tamil', 291 | 'tr': 'Turkish', 292 | } 293 | 294 | 295 | class Tagger(object): 296 | cached = {} 297 | 298 | def __del__(self): 299 | for code in self.cached: 300 | tmp = self.cached[code] 301 | self.cached[code] = None 302 | del tmp 303 | 304 | def __getitem__(self, code): 305 | if code not in language_code_to_model_name: 306 | raise ValueError( 307 | 'Invalid language code for tagger: {}'.format(code)) 308 | lang = language_code_to_model_name[code] 309 | if code in self.cached: 310 | return self.cached[code] 311 | self.cached[code] = SyntaxNetTagger(lang) 312 | return self.cached[code] 313 | 314 | tagger = Tagger() 315 | 316 | 317 | class Parser(object): 318 | cached = {} 319 | 320 | def __del__(self): 321 | for code in self.cached: 322 | tmp = self.cached[code] 323 | self.cached[code] = None 324 | del tmp 325 | 326 | def __getitem__(self, code): 327 | if code not in language_code_to_model_name: 328 | raise ValueError( 329 | 'Invalid language code for parser: {}'.format(code)) 330 | lang = language_code_to_model_name[code] 331 | if code in self.cached: 332 | return self.cached[code] 333 | self.cached[code] = SyntaxNetParser(lang, tagger=tagger[code]) 334 | return self.cached[code] 335 | 336 | parser = Parser() 337 | 338 | 339 | def parse_text(text, lang='en', returnRaw=True): 340 | lang = language_code_to_model_name[lang] 341 | tagger, parser = None, None 342 | try: 343 | tagger = SyntaxNetTagger(lang) 344 | parser = SyntaxNetParser(lang, tagger=tagger) 345 | result = parser.query(text, returnRaw) 346 | return result 347 | finally: 348 | del tagger, parser 349 | 350 | 351 | def tag_text(text, lang='en', returnRaw=True): 352 | lang = language_code_to_model_name[lang] 353 | tagger = None 354 | try: 355 | tagger = SyntaxNetTagger(lang) 356 | result = tagger.query(text, returnRaw) 357 | return result 358 | finally: 359 | del tagger 360 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/makefile: -------------------------------------------------------------------------------- 1 | all: download_language_models 2 | 3 | configure_syntaxnet: 4 | @echo "************************************************************" 1>&2 5 | @echo " configure syntaxnet " 1>&2 6 | @echo 
"************************************************************" 1>&2 7 | rm -rf models 8 | git clone --recursive https://github.com/tensorflow/models.git && \ 9 | cd models/syntaxnet/tensorflow && \ 10 | virtualenv /tmp/venv && \ 11 | /tmp/venv/bin/pip install tensorflow && \ 12 | printf '/tmp/venv/bin/python\n\n\n\n\n\n\n\n\n\n\n\n\n\n' | ./configure 13 | 14 | build_syntaxnet: configure_syntaxnet 15 | @echo "************************************************************" 1>&2 16 | @echo " build syntaxnet " 1>&2 17 | @echo "************************************************************" 1>&2 18 | cd models/syntaxnet && \ 19 | bazel --output_user_root=bazel_root test syntaxnet/... util/utf8/... || \ 20 | rm -rf bazel_root 21 | cd models/syntaxnet && \ 22 | rm tensorflow/util/python/python_lib && \ 23 | ln -s `python -c "from distutils.sysconfig import get_python_lib; print(get_python_lib())"` tensorflow/util/python/python_lib 24 | @echo "************************************************************" 1>&2 25 | @echo " building syntaxnet finished " 1>&2 26 | @echo "************************************************************" 1>&2 27 | 28 | copy_demo_scripts: build_syntaxnet 29 | cp models/syntaxnet/syntaxnet/models/parsey_universal/parse.sh models/syntaxnet/parse.sh 30 | cp models/syntaxnet/syntaxnet/models/parsey_universal/tokenize.sh models/syntaxnet/tokenize.sh 31 | cp models/syntaxnet/syntaxnet/models/parsey_universal/tokenize_zh.sh models/syntaxnet/tokenize_zh.sh 32 | 33 | clear_tmp_venv: copy_demo_scripts 34 | rm -rf /tmp/venv 35 | mkdir pids 36 | chmod 777 pids 37 | @echo "************************************************************" 1>&2 38 | @echo " download language models " 1>&2 39 | @echo "************************************************************" 1>&2 40 | 41 | download_language_models: models/syntaxnet/syntaxnet/models/parsey_universal/Ancient_Greek-PROIEL \ 42 | models/syntaxnet/syntaxnet/models/parsey_universal/Ancient_Greek \ 43 | models/syntaxnet/syntaxnet/models/parsey_universal/Arabic \ 44 | models/syntaxnet/syntaxnet/models/parsey_universal/Basque \ 45 | models/syntaxnet/syntaxnet/models/parsey_universal/Bulgarian \ 46 | models/syntaxnet/syntaxnet/models/parsey_universal/Catalan \ 47 | models/syntaxnet/syntaxnet/models/parsey_universal/Chinese \ 48 | models/syntaxnet/syntaxnet/models/parsey_universal/Croatian \ 49 | models/syntaxnet/syntaxnet/models/parsey_universal/Czech-CAC \ 50 | models/syntaxnet/syntaxnet/models/parsey_universal/Czech-CLTT \ 51 | models/syntaxnet/syntaxnet/models/parsey_universal/Czech \ 52 | models/syntaxnet/syntaxnet/models/parsey_universal/Danish \ 53 | models/syntaxnet/syntaxnet/models/parsey_universal/Dutch-LassySmall \ 54 | models/syntaxnet/syntaxnet/models/parsey_universal/Dutch \ 55 | models/syntaxnet/syntaxnet/models/parsey_universal/English-LinES \ 56 | models/syntaxnet/syntaxnet/models/parsey_universal/English \ 57 | models/syntaxnet/syntaxnet/models/parsey_universal/Estonian \ 58 | models/syntaxnet/syntaxnet/models/parsey_universal/Finnish-FTB \ 59 | models/syntaxnet/syntaxnet/models/parsey_universal/Finnish \ 60 | models/syntaxnet/syntaxnet/models/parsey_universal/French \ 61 | models/syntaxnet/syntaxnet/models/parsey_universal/Galician \ 62 | models/syntaxnet/syntaxnet/models/parsey_universal/German \ 63 | models/syntaxnet/syntaxnet/models/parsey_universal/Gothic \ 64 | models/syntaxnet/syntaxnet/models/parsey_universal/Greek \ 65 | models/syntaxnet/syntaxnet/models/parsey_universal/Hebrew \ 66 | 
models/syntaxnet/syntaxnet/models/parsey_universal/Hindi \ 67 | models/syntaxnet/syntaxnet/models/parsey_universal/Hungarian \ 68 | models/syntaxnet/syntaxnet/models/parsey_universal/Indonesian \ 69 | models/syntaxnet/syntaxnet/models/parsey_universal/Irish \ 70 | models/syntaxnet/syntaxnet/models/parsey_universal/Italian \ 71 | models/syntaxnet/syntaxnet/models/parsey_universal/Kazakh \ 72 | models/syntaxnet/syntaxnet/models/parsey_universal/Latin-ITTB \ 73 | models/syntaxnet/syntaxnet/models/parsey_universal/Latin-PROIEL \ 74 | models/syntaxnet/syntaxnet/models/parsey_universal/Latin \ 75 | models/syntaxnet/syntaxnet/models/parsey_universal/Latvian \ 76 | models/syntaxnet/syntaxnet/models/parsey_universal/Norwegian \ 77 | models/syntaxnet/syntaxnet/models/parsey_universal/Old_Church_Slavonic \ 78 | models/syntaxnet/syntaxnet/models/parsey_universal/Persian \ 79 | models/syntaxnet/syntaxnet/models/parsey_universal/Polish \ 80 | models/syntaxnet/syntaxnet/models/parsey_universal/Portuguese-BR \ 81 | models/syntaxnet/syntaxnet/models/parsey_universal/Portuguese \ 82 | models/syntaxnet/syntaxnet/models/parsey_universal/Romanian \ 83 | models/syntaxnet/syntaxnet/models/parsey_universal/Russian-SynTagRus \ 84 | models/syntaxnet/syntaxnet/models/parsey_universal/Russian \ 85 | models/syntaxnet/syntaxnet/models/parsey_universal/Slovenian-SST \ 86 | models/syntaxnet/syntaxnet/models/parsey_universal/Slovenian \ 87 | models/syntaxnet/syntaxnet/models/parsey_universal/Spanish-AnCora \ 88 | models/syntaxnet/syntaxnet/models/parsey_universal/Spanish \ 89 | models/syntaxnet/syntaxnet/models/parsey_universal/Swedish-LinES \ 90 | models/syntaxnet/syntaxnet/models/parsey_universal/Swedish \ 91 | models/syntaxnet/syntaxnet/models/parsey_universal/Tamil \ 92 | models/syntaxnet/syntaxnet/models/parsey_universal/Turkish 93 | 94 | models/syntaxnet/syntaxnet/models/parsey_universal/%: clear_tmp_venv 95 | cd models/syntaxnet/syntaxnet/models/parsey_universal/ && \ 96 | wget http://download.tensorflow.org/models/parsey_universal/$*.zip && \ 97 | unzip $*.zip && \ 98 | cd $* && \ 99 | chmod 644 * && \ 100 | cd .. 
&& \ 101 | chmod 755 $* && \ 102 | rm $*.zip 103 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/morpher_eval_forever.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import sys 5 | import signal 6 | 7 | import tempfile 8 | import tensorflow as tf 9 | 10 | from tensorflow.python.platform import gfile 11 | 12 | from google.protobuf import text_format 13 | 14 | from syntaxnet import structured_graph_builder 15 | from syntaxnet.ops import gen_parser_ops 16 | from syntaxnet import task_spec_pb2 17 | 18 | morpher_hidden_layer_sizes = '64' 19 | morpher_arg_prefix = 'brain_morpher' 20 | # graph_builder = 'structured' 21 | slim_model = True 22 | batch_size = 1 23 | beam_size = 8 24 | max_steps = 1000 25 | resource_dir = sys.argv[1] 26 | context_path = sys.argv[2] 27 | morpher_model_path = os.path.join(resource_dir, 'morpher-params') 28 | 29 | 30 | def RewriteContext(task_context): 31 | context = task_spec_pb2.TaskSpec() 32 | with gfile.FastGFile(task_context) as fin: 33 | text_format.Merge(fin.read(), context) 34 | for resource in context.input: 35 | for part in resource.part: 36 | if part.file_pattern != '-': 37 | part.file_pattern = os.path.join(resource_dir, part.file_pattern) 38 | with tempfile.NamedTemporaryFile(delete=False) as fout: 39 | fout.write(str(context)) 40 | return fout.name 41 | 42 | 43 | sess = tf.Session() 44 | 45 | task_context = RewriteContext(context_path) 46 | feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run( 47 | gen_parser_ops.feature_size(task_context=task_context, arg_prefix=morpher_arg_prefix)) 48 | hidden_layer_sizes = map(int, morpher_hidden_layer_sizes.split(',')) 49 | morpher = structured_graph_builder.StructuredGraphBuilder( 50 | num_actions, feature_sizes, domain_sizes, embedding_dims, 51 | hidden_layer_sizes, gate_gradients=True, arg_prefix=morpher_arg_prefix, 52 | beam_size=beam_size, max_steps=max_steps) 53 | morpher.AddEvaluation(task_context, batch_size, corpus_name='stdin', 54 | evaluation_max_steps=max_steps) 55 | 56 | morpher.AddSaver(slim_model) 57 | sess.run(morpher.inits.values()) 58 | morpher.saver.restore(sess, morpher_model_path) 59 | 60 | sink_documents = tf.placeholder(tf.string) 61 | sink = gen_parser_ops.document_sink(sink_documents, task_context=task_context, 62 | corpus_name='stdout-conll') 63 | 64 | 65 | def stdin_handler(signum, frame): 66 | tf_eval_epochs, tf_eval_metrics, tf_documents = sess.run([ 67 | morpher.evaluation['epochs'], 68 | morpher.evaluation['eval_metrics'], 69 | morpher.evaluation['documents'], 70 | ]) 71 | 72 | sys.stdout.write('\n## result start\n') 73 | sys.stdout.flush() 74 | 75 | if len(tf_documents): 76 | sess.run(sink, feed_dict={sink_documents: tf_documents}) 77 | 78 | sys.stdout.write('\n## result end\n') 79 | sys.stdout.flush() 80 | 81 | 82 | def abort_handler(signum, frame): 83 | sess.close() 84 | sys.exit(0) 85 | 86 | 87 | signal.signal(signal.SIGALRM, stdin_handler) 88 | signal.signal(signal.SIGABRT, abort_handler) 89 | while True: 90 | sys.stdout.write('\n## input content:\n') 91 | sys.stdout.flush() 92 | signal.pause() 93 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/parser_eval_forever.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import sys 5 | import signal 6 | 7 | import tempfile 
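# like morpher_eval_forever.py above, this script loads its model once and then loops: it prints '## input content:', blocks in signal.pause(), and on SIGALRM parses the batch written to stdin, emitting rows between '## result start' and '## result end'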
8 | import tensorflow as tf 9 | 10 | from tensorflow.python.platform import gfile 11 | 12 | from google.protobuf import text_format 13 | 14 | from syntaxnet import structured_graph_builder 15 | from syntaxnet.ops import gen_parser_ops 16 | from syntaxnet import task_spec_pb2 17 | 18 | parser_hidden_layer_sizes = '512,512' 19 | parser_arg_prefix = 'brain_parser' 20 | # graph_builder = 'structured' 21 | slim_model = True 22 | batch_size = 1 23 | beam_size = 8 24 | max_steps = 1000 25 | resource_dir = sys.argv[1] 26 | context_path = sys.argv[2] 27 | if resource_dir.endswith('syntaxnet'): 28 | parser_model_path = os.path.join(resource_dir, 'syntaxnet/models/parsey_mcparseface') 29 | else: 30 | parser_model_path = resource_dir 31 | parser_model_path = os.path.join(parser_model_path, 'parser-params') 32 | 33 | 34 | def RewriteContext(task_context): 35 | context = task_spec_pb2.TaskSpec() 36 | with gfile.FastGFile(task_context) as fin: 37 | text_format.Merge(fin.read(), context) 38 | for resource in context.input: 39 | for part in resource.part: 40 | if part.file_pattern != '-': 41 | part.file_pattern = os.path.join(resource_dir, part.file_pattern) 42 | with tempfile.NamedTemporaryFile(delete=False) as fout: 43 | fout.write(str(context)) 44 | return fout.name 45 | 46 | 47 | sess = tf.Session() 48 | 49 | task_context = RewriteContext(context_path) 50 | feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run( 51 | gen_parser_ops.feature_size(task_context=task_context, arg_prefix=parser_arg_prefix)) 52 | hidden_layer_sizes = map(int, parser_hidden_layer_sizes.split(',')) 53 | parser = structured_graph_builder.StructuredGraphBuilder( 54 | num_actions, feature_sizes, domain_sizes, embedding_dims, 55 | hidden_layer_sizes, gate_gradients=True, arg_prefix=parser_arg_prefix, 56 | beam_size=beam_size, max_steps=max_steps) 57 | parser.AddEvaluation(task_context, batch_size, corpus_name='stdin-conll', 58 | evaluation_max_steps=max_steps) 59 | 60 | parser.AddSaver(slim_model) 61 | sess.run(parser.inits.values()) 62 | parser.saver.restore(sess, parser_model_path) 63 | 64 | sink_documents = tf.placeholder(tf.string) 65 | sink = gen_parser_ops.document_sink(sink_documents, task_context=task_context, 66 | corpus_name='stdout-conll') 67 | 68 | 69 | def stdin_handler(signum, frame): 70 | tf_eval_epochs, tf_eval_metrics, tf_documents = sess.run([ 71 | parser.evaluation['epochs'], 72 | parser.evaluation['eval_metrics'], 73 | parser.evaluation['documents'], 74 | ]) 75 | 76 | sys.stdout.write('\n## result start\n') 77 | sys.stdout.flush() 78 | 79 | if len(tf_documents): 80 | sess.run(sink, feed_dict={sink_documents: tf_documents}) 81 | 82 | sys.stdout.write('\n## result end\n') 83 | sys.stdout.flush() 84 | 85 | 86 | def abort_handler(signum, frame): 87 | sess.close() 88 | sys.exit(0) 89 | 90 | 91 | signal.signal(signal.SIGALRM, stdin_handler) 92 | signal.signal(signal.SIGABRT, abort_handler) 93 | while True: 94 | sys.stdout.write('\n## input content:\n') 95 | sys.stdout.flush() 96 | signal.alarm(1800) 97 | signal.pause() 98 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/tagger_eval_forever.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import sys 5 | import signal 6 | 7 | import tempfile 8 | import tensorflow as tf 9 | 10 | from tensorflow.python.platform import gfile 11 | 12 | from google.protobuf import text_format 13 | 14 | from syntaxnet import 
structured_graph_builder 15 | from syntaxnet.ops import gen_parser_ops 16 | from syntaxnet import task_spec_pb2 17 | 18 | tagger_hidden_layer_sizes = '64' 19 | tagger_arg_prefix = 'brain_tagger' 20 | # graph_builder = 'structured' 21 | slim_model = True 22 | batch_size = 1 23 | beam_size = 8 24 | max_steps = 1000 25 | resource_dir = sys.argv[1] 26 | context_path = sys.argv[2] 27 | if resource_dir.endswith('syntaxnet'): 28 | input_style = 'stdin' 29 | tagger_model_path = os.path.join(resource_dir, 'syntaxnet/models/parsey_mcparseface') 30 | else: 31 | input_style = 'stdin-conll' 32 | tagger_model_path = resource_dir 33 | tagger_model_path = os.path.join(tagger_model_path, 'tagger-params') 34 | 35 | 36 | def RewriteContext(task_context): 37 | context = task_spec_pb2.TaskSpec() 38 | with gfile.FastGFile(task_context) as fin: 39 | text_format.Merge(fin.read(), context) 40 | for resource in context.input: 41 | for part in resource.part: 42 | if part.file_pattern != '-': 43 | part.file_pattern = os.path.join(resource_dir, part.file_pattern) 44 | with tempfile.NamedTemporaryFile(delete=False) as fout: 45 | fout.write(str(context)) 46 | return fout.name 47 | 48 | 49 | sess = tf.Session() 50 | 51 | task_context = RewriteContext(context_path) 52 | feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run( 53 | gen_parser_ops.feature_size(task_context=task_context, arg_prefix=tagger_arg_prefix)) 54 | hidden_layer_sizes = map(int, tagger_hidden_layer_sizes.split(',')) 55 | tagger = structured_graph_builder.StructuredGraphBuilder( 56 | num_actions, feature_sizes, domain_sizes, embedding_dims, 57 | hidden_layer_sizes, gate_gradients=True, arg_prefix=tagger_arg_prefix, 58 | beam_size=beam_size, max_steps=max_steps) 59 | tagger.AddEvaluation(task_context, batch_size, corpus_name=input_style, 60 | evaluation_max_steps=max_steps) 61 | 62 | tagger.AddSaver(slim_model) 63 | sess.run(tagger.inits.values()) 64 | tagger.saver.restore(sess, tagger_model_path) 65 | 66 | sink_documents = tf.placeholder(tf.string) 67 | sink = gen_parser_ops.document_sink(sink_documents, task_context=task_context, 68 | corpus_name='stdout-conll') 69 | 70 | 71 | def stdin_handler(signum, frame): 72 | tf_eval_epochs, tf_eval_metrics, tf_documents = sess.run([ 73 | tagger.evaluation['epochs'], 74 | tagger.evaluation['eval_metrics'], 75 | tagger.evaluation['documents'], 76 | ]) 77 | 78 | sys.stdout.write('\n## result start\n') 79 | sys.stdout.flush() 80 | 81 | if len(tf_documents): 82 | sess.run(sink, feed_dict={sink_documents: tf_documents}) 83 | 84 | sys.stdout.write('\n## result end\n') 85 | sys.stdout.flush() 86 | 87 | 88 | def abort_handler(signum, frame): 89 | sess.close() 90 | sys.exit(0) 91 | 92 | 93 | signal.signal(signal.SIGALRM, stdin_handler) 94 | signal.signal(signal.SIGABRT, abort_handler) 95 | while True: 96 | sys.stdout.write('\n## input content:\n') 97 | sys.stdout.flush() 98 | signal.alarm(1800) 99 | signal.pause() 100 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd `pip -V | sed -e "s/.*from //" -e "s/ (.*//"` 4 | cd syntaxnet_wrapper/models/syntaxnet 5 | echo 'Bob brought the pizza to Alice.' 
| bash parse.sh syntaxnet/models/parsey_universal/English 2> /dev/null 6 | echo '球 從 天上 掉 下來' | bash parse.sh syntaxnet/models/parsey_universal/Chinese 2> /dev/null 7 | echo '球從天上掉下來' | bash tokenize_zh.sh syntaxnet/models/parsey_universal/Chinese 2> /dev/null 8 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/livingbio/syntaxnet_wrapper/5444875398b8c0e3ee800b12f29df70691af7120/syntaxnet_wrapper/tests/__init__.py -------------------------------------------------------------------------------- /syntaxnet_wrapper/tests/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | from __future__ import unicode_literals 3 | from unittest import TestCase 4 | from syntaxnet_wrapper import tagger, parser 5 | 6 | 7 | class TestParser(TestCase): 8 | def test_tagger_en(self): 9 | raw = tagger['en'].query('this is a good day', returnRaw=True) 10 | self.assertEqual( 11 | raw, 12 | '1\tthis\t_\tDET\tDT\t_\t0\t_\t_\t_\n' 13 | '2\tis\t_\tVERB\tVBZ\t_\t0\t_\t_\t_\n' 14 | '3\ta\t_\tDET\tDT\t_\t0\t_\t_\t_\n' 15 | '4\tgood\t_\tADJ\tJJ\t_\t0\t_\t_\t_\n' 16 | '5\tday\t_\tNOUN\tNN\t_\t0\t_\t_\t_\n') 17 | 18 | def test_parser_en(self): 19 | raw = parser['en'].query('Alice drove down the street in her car', returnRaw=True) 20 | self.assertEqual( 21 | raw, 22 | '1\tAlice\t_\tNOUN\tNNP\t_\t2\tnsubj\t_\t_\n' 23 | '2\tdrove\t_\tVERB\tVBD\t_\t0\tROOT\t_\t_\n' 24 | '3\tdown\t_\tADP\tIN\t_\t2\tprep\t_\t_\n' 25 | '4\tthe\t_\tDET\tDT\t_\t5\tdet\t_\t_\n' 26 | '5\tstreet\t_\tNOUN\tNN\t_\t3\tpobj\t_\t_\n' 27 | '6\tin\t_\tADP\tIN\t_\t2\tprep\t_\t_\n' 28 | '7\ther\t_\tPRON\tPRP$\t_\t8\tposs\t_\t_\n' 29 | '8\tcar\t_\tNOUN\tNN\t_\t6\tpobj\t_\t_\n') 30 | 31 | def test_tagger_zh(self): 32 | raw = tagger['zh'].query(u'今天 天氣 很 好', returnRaw=True) 33 | self.assertEqual( 34 | raw, 35 | '1\t\u4eca\u5929\t_\tNOUN\tNN\tfPOS=NOUN++NN\t0\t_\t_\t_\n' 36 | '2\t\u5929\u6c23\t_\tNOUN\tNN\tfPOS=NOUN++NN\t0\t_\t_\t_\n' 37 | '3\t\u5f88\t_\tADV\tRB\tfPOS=ADV++RB\t0\t_\t_\t_\n' 38 | '4\t\u597d\t_\tADJ\tJJ\tfPOS=ADJ++JJ\t0\t_\t_\t_\n') 39 | 40 | def test_parser_zh(self): 41 | raw = parser['zh'].query(u'今天 天氣 很 好', returnRaw=True) 42 | self.assertEqual( 43 | raw, 44 | '1\t\u4eca\u5929\t_\tNOUN\tNN\tfPOS=NOUN++NN\t4\tnmod:tmod\t_\t_\n' 45 | '2\t\u5929\u6c23\t_\tNOUN\tNN\tfPOS=NOUN++NN\t4\tnsubj\t_\t_\n' 46 | '3\t\u5f88\t_\tADV\tRB\tfPOS=ADV++RB\t4\tadvmod\t_\t_\n' 47 | '4\t\u597d\t_\tADJ\tJJ\tfPOS=ADJ++JJ\t0\tROOT\t_\t_\n') 48 | -------------------------------------------------------------------------------- /syntaxnet_wrapper/tokenizer_eval_forever.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import os.path 4 | import sys 5 | import signal 6 | 7 | import tempfile 8 | import tensorflow as tf 9 | 10 | from tensorflow.python.platform import gfile 11 | 12 | from google.protobuf import text_format 13 | 14 | from syntaxnet import structured_graph_builder 15 | from syntaxnet.ops import gen_parser_ops 16 | from syntaxnet import task_spec_pb2 17 | 18 | tokenizer_hidden_layer_sizes = '256,256' 19 | tokenizer_arg_prefix = 'brain_tokenizer_zh' 20 | # graph_builder = 'structured' 21 | slim_model = True 22 | batch_size = 1 23 | beam_size = 8 24 | max_steps = 1000 25 | resource_dir = sys.argv[1] 26 | context_path = sys.argv[2] 27 | 
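# resource_dir and context_path are passed on the command line by SyntaxNetWrapper.start(); the tokenizer checkpoint (tokenizer-params) is expected inside the Chinese model directory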
tokenizer_model_path = os.path.join(resource_dir, 'tokenizer-params') 28 | 29 | 30 | def RewriteContext(task_context): 31 | context = task_spec_pb2.TaskSpec() 32 | with gfile.FastGFile(task_context) as fin: 33 | text_format.Merge(fin.read(), context) 34 | for resource in context.input: 35 | for part in resource.part: 36 | if part.file_pattern != '-': 37 | part.file_pattern = os.path.join(resource_dir, part.file_pattern) 38 | with tempfile.NamedTemporaryFile(delete=False) as fout: 39 | fout.write(str(context)) 40 | return fout.name 41 | 42 | 43 | sess = tf.Session() 44 | 45 | task_context = RewriteContext(context_path) 46 | feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run( 47 | gen_parser_ops.feature_size(task_context=task_context, arg_prefix=tokenizer_arg_prefix)) 48 | hidden_layer_sizes = map(int, tokenizer_hidden_layer_sizes.split(',')) 49 | tokenizer = structured_graph_builder.StructuredGraphBuilder( 50 | num_actions, feature_sizes, domain_sizes, embedding_dims, 51 | hidden_layer_sizes, gate_gradients=True, arg_prefix=tokenizer_arg_prefix, 52 | beam_size=beam_size, max_steps=max_steps) 53 | tokenizer.AddEvaluation(task_context, batch_size, corpus_name='stdin-untoken', 54 | evaluation_max_steps=max_steps) 55 | 56 | tokenizer.AddSaver(slim_model) 57 | sess.run(tokenizer.inits.values()) 58 | tokenizer.saver.restore(sess, tokenizer_model_path) 59 | 60 | sink_documents = tf.placeholder(tf.string) 61 | sink = gen_parser_ops.document_sink(sink_documents, task_context=task_context, 62 | corpus_name='stdin-untoken') 63 | 64 | 65 | def stdin_handler(signum, frame): 66 | tf_eval_epochs, tf_eval_metrics, tf_documents = sess.run([ 67 | tokenizer.evaluation['epochs'], 68 | tokenizer.evaluation['eval_metrics'], 69 | tokenizer.evaluation['documents'], 70 | ]) 71 | 72 | sys.stdout.write('\n## result start\n') 73 | sys.stdout.flush() 74 | 75 | if len(tf_documents): 76 | sess.run(sink, feed_dict={sink_documents: tf_documents}) 77 | 78 | sys.stdout.write('\n## result end\n') 79 | sys.stdout.flush() 80 | 81 | 82 | def abort_handler(signum, frame): 83 | sess.close() 84 | sys.exit(0) 85 | 86 | 87 | signal.signal(signal.SIGALRM, stdin_handler) 88 | signal.signal(signal.SIGABRT, abort_handler) 89 | while True: 90 | sys.stdout.write('\n## input content:\n') 91 | sys.stdout.flush() 92 | signal.alarm(1800) 93 | signal.pause() 94 | --------------------------------------------------------------------------------