├── .gitignore ├── LICENSE.txt ├── README.rst ├── setup.py └── treetagger.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Based on: 2 | # https://github.com/github/gitignore/blob/master/Python.gitignore 3 | 4 | *.py[co] 5 | *.swp 6 | *.swo 7 | 8 | # Packages 9 | *.egg 10 | *.egg-info 11 | dist 12 | build 13 | eggs 14 | parts 15 | bin 16 | develop-eggs 17 | .installed.cfg 18 | 19 | # Installer logs 20 | pip-log.txt 21 | 22 | # Unit test / coverage reports 23 | .coverage 24 | .tox 25 | 26 | # Not in github's gitignore: 27 | tmp 28 | 29 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Copyright (C) 2013 Mirko Otto 3 | 4 | This program is free software; you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation; either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program; if not, write to the Free Software Foundation, 16 | Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | treetagger-python 2 | ================= 3 | 4 | A Python module for interfacing with the Treetagger by Helmut Schmid. 
5 | 6 | Copyright (C) 2018 Mirko Otto 7 | 8 | For license information, see LICENSE.txt 9 | 10 | Dependencies 11 | ------------ 12 | 13 | - `TreeTagger <http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/>`__ (The names of the parameter files of the TreeTagger program have changed; use a version released after 16 October 2018. Since October 2020, it seems that the input data must be passed to the TreeTagger program as files.) 14 | - Python 3 15 | - `NLTK <https://www.nltk.org/>`__ 16 | - treetagger.py is for Python 3 17 | 18 | Tested in June 2021 with TreeTagger 3.2.3 (versions after October 2020), Python 3.9.5 and NLTK 3.6.2 on Ubuntu 20.04, OSX 10.15 and Windows 10 19 | 20 | Preparation 21 | ------------ 22 | 23 | The 24 | `TreeTagger <http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/>`__ 25 | is copyrighted software by Helmut Schmid and 26 | `IMS <https://www.ims.uni-stuttgart.de/>`__; please read the license 27 | agreement before you download the TreeTagger package and language 28 | models. 29 | 30 | Before you can use the ``treetagger-python`` package please ensure you 31 | have downloaded and installed the 32 | `TreeTagger <http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/>`__ 33 | itself. 34 | 35 | After installing the ``TreeTagger`` program, please check if it works properly. 36 | 37 | The ``treetagger-python`` package detects the installed languages from the parameter files "<language>.par" in the ``lib`` directory. You can use the ``get_installed_lang`` function to show the languages. The corresponding executable scripts are looked up in the ``cmd`` directory under Linux and in the ``bin`` directory under Windows. 38 | 39 | The English tagging examples and the Python doctest show the result with the "English parameter file (PENN tagset)" file. 40 | 41 | Installation 42 | ------------ 43 | 44 | Make sure that you know the HOME directory of the TreeTagger program. 45 | 46 | To use the Python package ``treetagger-python``, you must either set the environment variable ``TREETAGGER_HOME`` or pass the path as ``path_to_treetagger`` when creating the tagger. The Usage section below shows the second option. 
47 | 48 | To set the environment variable ``TREETAGGER_HOME``, enter the path to the installation directory of ``TreeTagger``: 49 | 50 | :: 51 | 52 | export TREETAGGER_HOME='/path/to/your/TreeTagger/' 53 | 54 | 55 | Clone the repository and change to this directory. In this directory the Python package ``treetagger-python`` can be used without installation: 56 | 57 | :: 58 | 59 | git clone https://github.com/miotto/treetagger-python.git 60 | cd treetagger-python 61 | 62 | Usage 63 | ----- 64 | 65 | Initialize by specifying the path ``path_to_treetagger``: 66 | 67 | :: 68 | 69 | from treetagger import TreeTagger 70 | tt = TreeTagger(path_to_treetagger='/path/to/your/TreeTagger/') 71 | 72 | Usage TreeTagger 73 | ^^^^^^^^^^^^^^^^ 74 | 75 | Show the installed languages: 76 | 77 | :: 78 | 79 | from treetagger import TreeTagger 80 | tt = TreeTagger(path_to_treetagger='/path/to/your/TreeTagger/') 81 | tt.get_installed_lang() 82 | 83 | The output could look like this: 84 | 85 | :: 86 | 87 | ['english', 'german'] 88 | 89 | Tagging a sentence from Python: 90 | 91 | :: 92 | 93 | from treetagger import TreeTagger 94 | tt = TreeTagger(path_to_treetagger='/path/to/your/TreeTagger/') 95 | tt.tag('What is the airspeed of an unladen swallow?') 96 | 97 | 98 | The output is a list of [token, tag, lemma]: 99 | 100 | :: 101 | 102 | [['What', 'WP', 'what'], 103 | ['is', 'VBZ', 'be'], 104 | ['the', 'DT', 'the'], 105 | ['airspeed', 'NN', 'airspeed'], 106 | ['of', 'IN', 'of'], 107 | ['an', 'DT', 'an'], 108 | ['unladen', 'JJ', '<unknown>'], 109 | ['swallow', 'NN', 'swallow'], 110 | ['?', 'SENT', '?']] 111 | 112 | Tagging a German sentence from Python: 113 | 114 | :: 115 | 116 | from treetagger import TreeTagger 117 | tt = TreeTagger(path_to_treetagger='/path/to/your/TreeTagger/', language='german') 118 | tt.tag('Das Haus hat einen großen hübschen Garten.') 119 | 120 | The output is a list of [token, tag, lemma]: 121 | 122 | :: 123 | 124 | [['Das', 'ART', 'die'], 125 | ['Haus', 'NN', 'Haus'], 126 | 
['hat', 'VAFIN', 'haben'], 127 | ['einen', 'ART', 'eine'], 128 | ['großen', 'ADJA', 'groß'], 129 | ['hübschen', 'ADJA', 'hübsch'], 130 | ['Garten', 'NN', 'Garten'], 131 | ['.', '$.', '.']] 132 | 133 | Usage TreeTaggerChunker 134 | ^^^^^^^^^^^^^^^^^^^^^^^ 135 | 136 | Initialize by specifying the path ``path_to_treetagger``: 137 | 138 | :: 139 | 140 | from treetagger import TreeTaggerChunker 141 | ttc = TreeTaggerChunker(path_to_treetagger='/path/to/your/TreeTagger/') 142 | 143 | Show the installed languages: 144 | 145 | :: 146 | 147 | from treetagger import TreeTaggerChunker 148 | ttc = TreeTaggerChunker(path_to_treetagger='/path/to/your/TreeTagger/') 149 | ttc.get_installed_lang() 150 | 151 | The output could look like this: 152 | 153 | :: 154 | 155 | ['english', 'german'] 156 | 157 | Chunk a sentence from Python: 158 | 159 | :: 160 | 161 | from treetagger import TreeTaggerChunker 162 | ttc = TreeTaggerChunker(path_to_treetagger='/path/to/your/TreeTagger/') 163 | ttc.parse('What is the airspeed of an unladen swallow?') 164 | 165 | 166 | The output is a flat list in which chunk boundaries appear as single-element entries (``['<NC>']``, ``['</NC>']``) between the [token, tag, lemma] entries: 167 | 168 | :: 169 | 170 | [['<NC>'], ['What', 'WP', 'what'], ['</NC>'], ['<VC>'], ['is', 'VBZ', 'be'], ['</VC>'], ['<NC>'], ['the', 'DT', 'the'], ['airspeed', 'NN', 'airspeed'], ['</NC>'], ['<PC>'], ['of', 'IN', 'of'], ['<NC>'], ['an', 'DT', 'an'], ['unladen', 'JJ', '<unknown>'], ['swallow', 'NN', 'swallow'], ['</NC>'], ['</PC>'], ['?', 'SENT', '?']] 171 | 172 | Chunk a sentence in a tree from Python: 173 | 174 | :: 175 | 176 | from treetagger import TreeTaggerChunker 177 | ttc = TreeTaggerChunker(path_to_treetagger='/path/to/your/TreeTagger/') 178 | ttc.parse_to_tree('What is the airspeed of an unladen swallow?') 179 | 180 | 181 | The output is the chunk structure as an NLTK tree: 182 | 183 | :: 184 | 185 | Tree('S', [Tree('NC', [Tree('What', ['WP'])]), Tree('VC', [Tree('is', ['VBZ'])]), Tree('NC', [Tree('the', ['DT']), Tree('airspeed', ['NN'])]), Tree('PC', [Tree('of', ['IN']), Tree('NC', [Tree('an', ['DT']), 
Tree('unladen', ['JJ']), Tree('swallow', ['NN'])])]), Tree('?', ['SENT'])]) 186 | 187 | Chunk a sentence in a tree from Python: 188 | 189 | :: 190 | 191 | from nltk.tree import Tree 192 | from treetagger import TreeTaggerChunker 193 | ttc = TreeTaggerChunker(path_to_treetagger='/path/to/your/TreeTagger/') 194 | ttc_tree = ttc.parse_to_tree('What is the airspeed of an unladen swallow?') 195 | print(ttc_tree) 196 | 197 | 198 | The output is a chunk structure as a nltk tree: 199 | 200 | :: 201 | 202 | (S 203 | (NC (What WP)) 204 | (VC (is VBZ)) 205 | (NC (the DT) (airspeed NN)) 206 | (PC (of IN) (NC (an DT) (unladen JJ) (swallow NN))) 207 | (? SENT)) 208 | 209 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | try: 6 | from setuptools import setup 7 | except ImportError: 8 | from distutils.core import setup 9 | 10 | here = os.path.abspath(os.path.dirname(__file__)) 11 | README = open(os.path.join(here, 'README.rst')).read() 12 | 13 | setup(name='treetagger', 14 | version='1.1.1', 15 | description='A Python module for interfacing with the Treetagger by Helmut Schmid.', 16 | long_description=README, 17 | author='Mirko Otto', 18 | author_email='dropsy@gmail.com', 19 | url='https://github.com/miotto/treetagger-python', 20 | py_modules=['treetagger'], 21 | install_requires=['nltk'], 22 | license='GPL Version 3', 23 | ) 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /treetagger.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Natural Language Toolkit: Interface to the TreeTagger POS-tagger 3 | # 4 | # Copyright (C) Mirko Otto 5 | # Author: Mirko Otto 6 | 7 | """ 8 | A Python module for interfacing with the Treetagger by Helmut Schmid. 
"""  # (closes the module docstring that opens just above this block) """

import fnmatch
import os
import re
import sys
import tempfile
from subprocess import PIPE, Popen
from sys import platform as _platform

from nltk.chunk.api import ChunkParserI
from nltk.internals import find_binary, find_file
from nltk.tag.api import TaggerI
from nltk.tree import Tree

_treetagger_url = 'http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/'

# POS tags after which parse_to_tree() closes the current sentence.
_SENTENCE_END_TAGS = frozenset(('SENT', '$.', 'FS'))

# A single-column output row that closes a chunk, e.g. "</NC>".
_CLOSING_CHUNK_TAG = re.compile(r'</.*>')


def files(path, pattern):
    """Yield the names of the regular files in *path* that match *pattern*.

    :param path: Directory to list (not searched recursively).
    :param pattern: ``fnmatch``-style glob, e.g. ``"*.par"``.
    """
    for name in os.listdir(path):
        if os.path.isfile(os.path.join(path, name)) and fnmatch.fnmatch(name, pattern):
            yield name


def _treetagger_home(path_to_treetagger):
    """Return the TreeTagger installation directory, or None if it is unknown.

    The ``TREETAGGER_HOME`` environment variable wins over an explicitly
    passed path; this precedence is kept for backward compatibility.
    """
    if 'TREETAGGER_HOME' in os.environ:
        return os.environ['TREETAGGER_HOME']
    return path_to_treetagger or None


def _script_search_paths(path_to_treetagger):
    """Return the directories in which the TreeTagger wrapper scripts are searched.

    :raises LookupError: If neither ``TREETAGGER_HOME`` nor a path is given.
    """
    home = _treetagger_home(path_to_treetagger)
    if home is None:
        raise LookupError('Set \'TREETAGGER_HOME\' or use path_to_treetagger!')
    # The wrapper scripts live in "bin" on Windows and in "cmd" elsewhere.
    script_dir = 'bin' if _platform.startswith('win') else 'cmd'
    candidates = ['.', os.path.normpath(os.path.join(home, script_dir))]
    return [os.path.expanduser(candidate) for candidate in candidates]


def _parameter_files(path_to_treetagger, pattern):
    """Return the file names in the ``lib`` directory that match *pattern*.

    Returns an empty list when no installation directory is configured.
    """
    home = _treetagger_home(path_to_treetagger)
    if home is None:
        return []
    lib_dir = os.path.normpath(os.path.join(home, 'lib'))
    return list(files(lib_dir, pattern))


def _print_treetagger_path(path_to_treetagger):
    """Print the configured ``TREETAGGER_HOME`` and explicit path, if any."""
    if 'TREETAGGER_HOME' in os.environ:
        print('Environment variable \'TREETAGGER_HOME\' is ' + os.environ['TREETAGGER_HOME'])
    else:
        print('Environment variable \'TREETAGGER_HOME\' not set')

    if path_to_treetagger:
        print('Path to TreeTagger is ' + path_to_treetagger)
    else:
        print('Path to TreeTagger not set')


def _as_text(tokens):
    """Return the text that is written to the input file.

    A string is passed through unchanged; any other iterable of strings
    (list, tuple, generator, ...) is written one item per line.
    """
    if isinstance(tokens, str):
        return tokens
    return '\n'.join(tokens)


def _remove_quietly(path):
    """Delete *path* if it was created; cleanup must never mask the real error."""
    if path is None:
        return
    try:
        os.unlink(path)
    except OSError:
        pass


def _run_treetagger(script, text, abbreviation_list, failure_message):
    """Run a TreeTagger wrapper script on *text* and return its output rows.

    The text is handed over in a temporary UTF-8 file and the script's
    standard output is captured in a second temporary file. Both files are
    removed again, also when the command fails.

    :param script: Path of the wrapper script to execute.
    :param text: Input text, one token or sentence per line.
    :param abbreviation_list: Optional file passed to the script with ``-a``.
    :param failure_message: Message of the ``OSError`` raised on failure.
    :return: One list per output line, split at tab characters.
    :raises OSError: If the script exits with a non-zero return code.
    """
    command = [script]
    if abbreviation_list is not None:
        command += ['-a', abbreviation_list]

    in_path = out_path = None
    try:
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf8', delete=False) as infile:
            in_path = infile.name
            infile.write(text)
        # The input file is closed (and thereby flushed) before the script
        # opens it by name.
        command.append(in_path)

        with tempfile.NamedTemporaryFile(mode='w+', encoding='utf8', delete=False) as outfile:
            out_path = outfile.name
            # stdin is a pipe that communicate() closes immediately, so a
            # script that reads stdin sees EOF instead of blocking.
            process = Popen(command, shell=False, stdin=PIPE, stdout=outfile, stderr=PIPE)
            _, stderr = process.communicate()

            if process.returncode != 0:
                print(stderr.decode('utf8', errors='replace'), file=sys.stderr)
                raise OSError(failure_message)

            outfile.seek(0)
            return [line.rstrip('\n').split('\t') for line in outfile]
    finally:
        _remove_quietly(in_path)
        _remove_quietly(out_path)


def _bracket_sentences(rows):
    """Convert chunker output rows into one bracketed string per sentence.

    Rows with a single column are chunk markers: ``<NC>`` opens a chunk
    (`` (NC``) and ``</NC>`` closes it (``)``). Rows with three columns are
    ``[token, tag, lemma]`` and become `` (token tag)``. Rows of any other
    length are ignored. A sentence ends at a token whose tag is one of
    ``_SENTENCE_END_TAGS``; trailing material without such a tag forms a
    final sentence of its own.
    """
    sentences = []
    current = ''
    for row in rows:
        if len(row) == 1:
            marker = row[0]
            if _CLOSING_CHUNK_TAG.fullmatch(marker):
                current += ')'
            else:
                current += marker.replace('<', ' (').replace('>', '')
        elif len(row) == 3:
            current += ' (' + row[0] + ' ' + row[1] + ')'
            if row[1] in _SENTENCE_END_TAGS:
                sentences.append('(S ' + current + ')')
                current = ''

    if len(current) > 1 and not current.startswith('(S'):
        sentences.append('(S ' + current + ')')
    return sentences


class TreeTagger(TaggerI):
    r"""
    A class for POS tagging with TreeTagger.

    The tagger needs:
      - an installed language (a parameter file ``<language>.par`` in the
        ``lib`` directory of the TreeTagger installation)
      - the installation directory, given either through the environment
        variable ``TREETAGGER_HOME`` or through ``path_to_treetagger``

    Input and output are exchanged with the TreeTagger wrapper script as
    UTF-8 encoded temporary files.

    Example:

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTagger
        >>> tt = TreeTagger(language='english')
        >>> tt.tag('What is the airspeed of an unladen swallow?')
        [['What', 'WP', 'what'], ['is', 'VBZ', 'be'], ['the', 'DT', 'the'], ['airspeed', 'NN', 'airspeed'], ['of', 'IN', 'of'], ['an', 'DT', 'an'], ['unladen', 'JJ', '<unknown>'], ['swallow', 'NN', 'swallow'], ['?', 'SENT', '?']]

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTagger
        >>> tt = TreeTagger(language='german')
        >>> tt.tag('Das Haus hat einen großen hübschen Garten.')
        [['Das', 'ART', 'die'], ['Haus', 'NN', 'Haus'], ['hat', 'VAFIN', 'haben'], ['einen', 'ART', 'eine'], ['großen', 'ADJA', 'groß'], ['hübschen', 'ADJA', 'hübsch'], ['Garten', 'NN', 'Garten'], ['.', '$.', '.']]
    """

    def __init__(self, path_to_treetagger=None, language='english',
                 verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTagger.

        :param path_to_treetagger: Installation directory of TreeTagger.
            Only used when ``TREETAGGER_HOME`` is not set.
        :param language: Name of an installed language. Default is english.
        :param verbose: Passed on to NLTK's ``find_binary``.
        :param abbreviation_list: Optional abbreviation file that is passed
            to the wrapper script with ``-a``.
        :raises LookupError: If no installation directory is configured or
            the language is not installed.
        """
        self._path_to_treetagger = path_to_treetagger if path_to_treetagger else None

        treetagger_paths = _script_search_paths(self._path_to_treetagger)

        self._abbr_list = abbreviation_list

        if language not in self.get_installed_lang():
            raise LookupError('Language not installed!')
        if _platform.startswith('win'):
            treetagger_bin_name = 'tag-' + language + '.bat'
        else:
            treetagger_bin_name = 'tree-tagger-' + language

        try:
            self._treetagger_bin = find_binary(
                treetagger_bin_name,
                searchpath=treetagger_paths,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            # Construction stays best-effort, but the attribute is always
            # defined so that tag() can report the problem clearly.
            self._treetagger_bin = None
            print('NLTK was unable to find the TreeTagger bin!')

    def get_treetagger_path(self):
        """Print where the TreeTagger installation is looked up."""
        _print_treetagger_path(self._path_to_treetagger)

    def get_installed_lang(self):
        """Return the installed languages.

        A language is every ``<language>.par`` file in the ``lib`` directory
        whose name does not end in ``chunker.par``. The list is empty when
        no installation directory is configured.
        """
        return [name[:-4]
                for name in _parameter_files(self._path_to_treetagger, '*.par')
                if not name.endswith('chunker.par')]

    def tag(self, sentences):
        """Tag a text and return one ``[token, tag, lemma]`` list per token.

        :param sentences: Either a string, or an iterable of strings that
            is written one item per line. The items should not contain
            newline characters.
        :raises LookupError: If the TreeTagger script was not found when the
            tagger was created.
        :raises OSError: If the TreeTagger command fails.
        """
        if self._treetagger_bin is None:
            raise LookupError('NLTK was unable to find the TreeTagger bin!')
        return _run_treetagger(self._treetagger_bin, _as_text(sentences),
                               self._abbr_list, 'TreeTagger command failed!')


class TreeTaggerChunker(ChunkParserI):
    r"""
    A class for chunking with the TreeTagger chunker.

    The chunker needs:
      - an installed language for which both ``<language>.par`` and
        ``<language>-chunker.par`` exist in the ``lib`` directory
      - the installation directory, given either through the environment
        variable ``TREETAGGER_HOME`` or through ``path_to_treetagger``

    Input and output are exchanged with the chunker wrapper script as
    UTF-8 encoded temporary files.

    Example:

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTaggerChunker
        >>> tt = TreeTaggerChunker(language='english')
        >>> tt.parse('What is the airspeed of an unladen swallow?')
        [['<NC>'], ['What', 'WP', 'what'], ['</NC>'], ['<VC>'], ['is', 'VBZ', 'be'], ['</VC>'], ['<NC>'], ['the', 'DT', 'the'], ['airspeed', 'NN', 'airspeed'], ['</NC>'], ['<PC>'], ['of', 'IN', 'of'], ['<NC>'], ['an', 'DT', 'an'], ['unladen', 'JJ', '<unknown>'], ['swallow', 'NN', 'swallow'], ['</NC>'], ['</PC>'], ['?', 'SENT', '?']]

    .. doctest::
        :options: +SKIP

        >>> from treetagger import TreeTaggerChunker
        >>> tt = TreeTaggerChunker(language='english')
        >>> tt.parse_to_tree('What is the airspeed of an unladen swallow?')
        Tree('S', [Tree('NC', [Tree('What', ['WP'])]), Tree('VC', [Tree('is', ['VBZ'])]), Tree('NC', [Tree('the', ['DT']), Tree('airspeed', ['NN'])]), Tree('PC', [Tree('of', ['IN']), Tree('NC', [Tree('an', ['DT']), Tree('unladen', ['JJ']), Tree('swallow', ['NN'])])]), Tree('?', ['SENT'])])

    .. doctest::
        :options: +SKIP

        >>> from nltk.tree import Tree
        >>> from treetagger import TreeTaggerChunker
        >>> tt = TreeTaggerChunker(language='english')
        >>> res = tt.parse_to_tree('What is the airspeed of an unladen swallow?')
        >>> print(res)
        (S
          (NC (What WP))
          (VC (is VBZ))
          (NC (the DT) (airspeed NN))
          (PC (of IN) (NC (an DT) (unladen JJ) (swallow NN)))
          (? SENT))
    """

    def __init__(self, path_to_treetagger=None, language='english',
                 verbose=False, abbreviation_list=None):
        """
        Initialize the TreeTaggerChunker.

        :param path_to_treetagger: Installation directory of TreeTagger.
            Only used when ``TREETAGGER_HOME`` is not set.
        :param language: Name of an installed language. Default is english.
        :param verbose: Passed on to NLTK's ``find_binary``.
        :param abbreviation_list: Optional abbreviation file that is passed
            to the wrapper script with ``-a``.
        :raises LookupError: If no installation directory is configured or
            the language is not installed.
        """
        self._path_to_treetagger = path_to_treetagger if path_to_treetagger else None

        treetagger_paths = _script_search_paths(self._path_to_treetagger)

        self._abbr_list = abbreviation_list

        if language not in self.get_installed_lang():
            raise LookupError('Language not installed!')
        if _platform.startswith('win'):
            treetagger_chunker_bin_name = 'chunk-' + language + '.bat'
        else:
            treetagger_chunker_bin_name = 'tagger-chunker-' + language

        try:
            self._treetagger_chunker_bin = find_binary(
                treetagger_chunker_bin_name,
                searchpath=treetagger_paths,
                url=_treetagger_url,
                verbose=verbose)
        except LookupError:
            # Construction stays best-effort, but the attribute is always
            # defined so that parse() can report the problem clearly.
            self._treetagger_chunker_bin = None
            print('NLTK was unable to find the TreeTagger Chunker bin!')

    def get_treetagger_path(self):
        """Print where the TreeTagger installation is looked up."""
        _print_treetagger_path(self._path_to_treetagger)

    def get_installed_lang(self):
        """Return the languages for which a chunker is installed.

        A language qualifies when the ``lib`` directory contains both
        ``<language>.par`` and ``<language>-chunker.par``. The list is empty
        when no installation directory is configured.
        """
        tagger_langs = {name[:-4]
                        for name in _parameter_files(self._path_to_treetagger, '*.par')}
        chunker_langs = [name[:-12]
                         for name in _parameter_files(self._path_to_treetagger, '*-chunker.par')]
        return [lang for lang in chunker_langs if lang in tagger_langs]

    def parse(self, tokens):
        """Tag and chunk a text and return the raw output rows.

        Chunk boundaries are single-element rows such as ``['<NC>']`` and
        ``['</NC>']``; tokens are ``[token, tag, lemma]`` rows.

        :param tokens: Either a string, or an iterable of strings that is
            written one item per line. The items should not contain newline
            characters.
        :raises LookupError: If the chunker script was not found when the
            chunker was created.
        :raises OSError: If the chunker command fails.
        """
        if self._treetagger_chunker_bin is None:
            raise LookupError('NLTK was unable to find the TreeTagger Chunker bin!')
        return _run_treetagger(self._treetagger_chunker_bin, _as_text(tokens),
                               self._abbr_list, 'TreeTaggerChunker command failed!')

    def parse_to_tree(self, tokens):
        """Tag and chunk a text and return the result as an NLTK ``Tree``.

        A single sentence is returned as an ``S`` tree; several sentences
        are collected under a ``ROOT`` node. Empty chunker output yields an
        empty ``S`` tree.

        :return: The tree, or None if the bracketed representation could not
            be parsed (the raw data is printed in that case).
        """
        sentences = _bracket_sentences(self.parse(tokens))
        if not sentences:
            return Tree('S', [])

        if len(sentences) > 1:
            bracketed = '(ROOT ' + ' '.join(sentences) + ')'
        else:
            bracketed = sentences[0]

        try:
            return Tree.fromstring(bracketed)
        except ValueError:
            print('Something goes wrong. Please check the raw data:\n')
            print(bracketed)
            return None


if __name__ == "__main__":
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)