├── README ├── .gitignore ├── MANIFEST.in ├── private └── update-version ├── .travis.yml ├── LICENSE ├── doc ├── changelog └── README ├── setup.py ├── tests.py └── morfeusz.py /README: -------------------------------------------------------------------------------- 1 | doc/README -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | MANIFEST 3 | build 4 | dist 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | 3 | include LICENSE 4 | exclude README 5 | include doc/* 6 | include morfeusz.py 7 | include private/* 8 | include test*.py 9 | -------------------------------------------------------------------------------- /private/update-version: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | version=${1:?"no version number provided"} 3 | set -e 4 | set -x 5 | dch -m -v "$version" -u low -c doc/changelog 6 | sed -i -E -e "s/^(__version__) = '[0-9.]+'$/\1 = '$version'/" *.py 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: false 3 | language: python 4 | python: 5 | - "2.6" 6 | - "2.7" 7 | - "3.2" 8 | - "3.3" 9 | - "3.4" 10 | - "3.5" 11 | - "3.6" 12 | - "3.7-dev" 13 | env: 14 | - MORFEUSZ_VERSION=siat-20080205 15 | - MORFEUSZ_VERSION=sgjp-20130413 16 | script: 17 | - if [ "$TRAVIS_PYTHON_VERSION" = '2.6' ]; then pip install unittest2; fi 18 | - \[ -n "${MORFEUSZ_VERSION##siat-*}" ] || url="http://sgjp.pl/morfeusz/morfeusz-siat/morfeusz-linux64-${MORFEUSZ_VERSION#siat-}.tar.bz2" 19 | - \[ -n "${MORFEUSZ_VERSION##sgjp-*}" ] || url="http://sgjp.pl/morfeusz/download/morfeusz-SGJP-linux64-${MORFEUSZ_VERSION#sgjp-}.tar.bz2" 20 | - wget "$url" 21 | - tar -xvf morfeusz-*.tar.bz2 22 | - LD_LIBRARY_PATH=$PWD python tests.py 23 | - LC_ALL=C python setup.py --version 24 | 25 | # vim:ts=2 sts=2 sw=2 et 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright © 2007-2017 Jakub Wilk 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the “Software”), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /doc/changelog: -------------------------------------------------------------------------------- 1 | python-morfeusz (0.3400) unstable; urgency=low 2 | 3 | * Deprecate the package. 4 | Document that Morfeusz 2 is not supported. 5 | https://github.com/jwilk/python-morfeusz/issues/1 6 | * Open the correct library on Mac OS X systems. 7 | Thanks to Aleksandra Nabożny for the bug report. 8 | * Drop support for Python 2.5. 9 | * Improve the setup script: 10 | + Fix Unicode error with Python 3 and LC_ALL=C. 11 | * Put license into a separate file. 12 | * Update the package description. 13 | 14 | -- Jakub Wilk Mon, 03 Jul 2017 20:22:59 +0200 15 | 16 | python-morfeusz (0.3300) unstable; urgency=low 17 | 18 | * Update the tagset definition. 19 | * Add option to keep whitespace (keep_whitespace=True). 20 | * Open the DLL library on Windows systems. 21 | Thanks to Caroline Linde for the bug report. 22 | 23 | -- Jakub Wilk Thu, 19 Nov 2015 19:53:09 +0100 24 | 25 | python-morfeusz (0.3200) unstable; urgency=low 26 | 27 | * Update the tagset definition. 28 | * Add option (dag=True) to return DAGs. 29 | * Return lists instead of tuples. 30 | 31 | -- Jakub Wilk Mon, 18 Apr 2011 12:35:40 +0200 32 | 33 | python-morfeusz (0.3001) unstable; urgency=low 34 | 35 | * Make the interface compatible with the SGJP version of Morfeusz. 36 | 37 | -- Jakub Wilk Fri, 21 Jan 2011 01:50:39 +0100 38 | 39 | python-morfeusz (0.3000) unstable; urgency=low 40 | 41 | * Initial release. 42 | 43 | -- Jakub Wilk Sat, 25 Sep 2010 20:43:52 +0200 44 | -------------------------------------------------------------------------------- /doc/README: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | **python-morfeusz** is a Python interface to Morfeusz_, 5 | a Polish morphological analyser. 6 | 7 | .. _Morfeusz: 8 | http://sgjp.pl/morfeusz/ 9 | 10 | Example 11 | ------- 12 | 13 | .. code:: pycon 14 | 15 | >>> from morfeusz import * 16 | >>> for s in analyse('Mama ma.'): 17 | ... print(s) 18 | ... 19 | [('Mama', 'mama', 'subst:sg:nom:f'), ('ma', 'mieć', 'fin:sg:ter:imperf'), ('.', '.', 'interp')] 20 | [('Mama', 'mama', 'subst:sg:nom:f'), ('ma', 'mój', 'adj:sg:nom:f:pos'), ('.', '.', 'interp')] 21 | [('Mama', 'mama', 'subst:sg:nom:f'), ('ma', 'mój', 'adj:sg:voc:f:pos'), ('.', '.', 'interp')] 22 | 23 | Prerequisites 24 | ============= 25 | 26 | * Python ≥ 2.6 or ≥ 3.2. 27 | 28 | * The Morfeusz_ library. 29 | 30 | This Python interface works only with very old versions of Morfeusz. 31 | The last supported version appears to be the one released on 2013-04-13: 32 | 33 | * source: http://sgjp.pl/morfeusz/download/morfeusz-SGJP-src-20130413.tar.bz2 34 | * Linux, 32-bit: http://sgjp.pl/morfeusz/download/morfeusz-SGJP-linux32-20130413.tar.bz2 35 | * Linux, 64-bit: http://sgjp.pl/morfeusz/download/morfeusz-SGJP-linux64-20130413.tar.bz2 36 | * Mac OS X, 32-bit: http://sgjp.pl/morfeusz/download/morfeusz-SGJP-darwin32-20130413.tar.bz2 37 | * Windows, 32-bit: http://sgjp.pl/morfeusz/download/morfeusz-SGJP-win32-20130413.tar.bz2 38 | * Windows, 64-bit: http://sgjp.pl/morfeusz/download/morfeusz-SGJP-win64-20130413.tar.bz2 39 | 40 | Morfeusz 2 is *not* supported. 41 | (Morfeusz 2 has its own Python interface, with a different API. 42 | You should use it instead.) 43 | 44 | .. vim:ft=rst ts=3 sts=3 sw=3 et 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2010-2017 Jakub Wilk 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the “Software”), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | ''' 24 | *python-morfeusz* is a Python interface to Morfeusz_, 25 | a Polish morphological analyser. 26 | 27 | .. _Morfeusz: 28 | http://sgjp.pl/morfeusz/ 29 | ''' 30 | 31 | import io 32 | import os 33 | 34 | import distutils.core 35 | from distutils.command.sdist import sdist as distutils_sdist 36 | 37 | try: 38 | import distutils644 39 | except ImportError: 40 | pass 41 | else: 42 | distutils644.install() 43 | 44 | b'' # Python >= 2.6 is required 45 | 46 | def get_version(): 47 | with io.open('doc/changelog', encoding='UTF-8') as file: 48 | line = file.readline() 49 | return line.split()[1].strip('()') 50 | 51 | class cmd_sdist(distutils_sdist): 52 | 53 | def maybe_move_file(self, base_dir, src, dst): 54 | src = os.path.join(base_dir, src) 55 | dst = os.path.join(base_dir, dst) 56 | if os.path.exists(src): 57 | self.move_file(src, dst) 58 | 59 | def make_release_tree(self, base_dir, files): 60 | distutils_sdist.make_release_tree(self, base_dir, files) 61 | self.maybe_move_file(base_dir, 'LICENSE', 'doc/LICENSE') 62 | 63 | classifiers = ''' 64 | Development Status :: 7 - Inactive 65 | Intended Audience :: Developers 66 | License :: OSI Approved :: MIT License 67 | Natural Language :: Polish 68 | Operating System :: POSIX :: Linux 69 | Operating System :: MacOS :: MacOS X 70 | Operating System :: Microsoft :: Windows 71 | Programming Language :: Python 72 | Programming Language :: Python :: 2 73 | Programming Language :: Python :: 3 74 | Topic :: Text Processing :: Linguistic 75 | '''.strip().splitlines() 76 | 77 | distutils.core.setup( 78 | name='python-morfeusz', 79 | version=get_version(), 80 | license='MIT', 81 | description='interface to Morfeusz', 82 | long_description=__doc__.strip(), 83 | classifiers=classifiers, 84 | url='http://jwilk.net/software/python-morfeusz', 85 | author='Jakub Wilk', 86 | author_email='jwilk@jwilk.net', 87 | py_modules=['morfeusz'], 88 | cmdclass = dict( 89 | sdist=cmd_sdist, 90 | ) 91 | ) 92 | 93 | # vim:ts=4 sts=4 sw=4 et 94 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2007-2017 Jakub Wilk 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the “Software”), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import sys 24 | 25 | if sys.version_info >= (2, 7): 26 | import unittest 27 | else: 28 | import unittest2 as unittest 29 | 30 | import morfeusz 31 | 32 | if str is bytes: 33 | def u(s): 34 | return s.decode('UTF-8') 35 | else: 36 | def u(s): 37 | return s 38 | 39 | sgjp = 'SGJP' in morfeusz.about() 40 | 41 | class test_expand_tags(unittest.TestCase): 42 | 43 | def test1(self): 44 | tags = 'adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos' 45 | xtags = morfeusz.expand_tags(tags) 46 | self.assertEqual(list(xtags), [ 47 | 'adj:sg:nom:m1:pos', 48 | 'adj:sg:nom:m2:pos', 49 | 'adj:sg:nom:m3:pos', 50 | 'adj:sg:acc:m3:pos', 51 | ]) 52 | 53 | def test2(self): 54 | tags = 'adj:sg:nom:m1.m2.m3:pos|adj:sg:acc:m3:pos' 55 | xtags = morfeusz.expand_tags(tags, expand_dot=False) 56 | self.assertEqual(list(xtags), [ 57 | 'adj:sg:nom:m1.m2.m3:pos', 58 | 'adj:sg:acc:m3:pos' 59 | ]) 60 | 61 | def test3(self): 62 | tags = 'ppron3:sg:acc:f:ter:_:npraep' 63 | xtags = morfeusz.expand_tags(tags) 64 | self.assertEqual(list(xtags), [ 65 | 'ppron3:sg:acc:f:ter:akc:npraep', 66 | 'ppron3:sg:acc:f:ter:nakc:npraep' 67 | ]) 68 | 69 | def test4(self): 70 | tags = 'ppron3:sg:acc:f:ter:_:npraep' 71 | xtags = morfeusz.expand_tags(tags, expand_dot=False) 72 | self.assertEqual(list(xtags), [ 73 | 'ppron3:sg:acc:f:ter:akc.nakc:npraep' 74 | ]) 75 | 76 | def test5(self): 77 | tags = 'ppron3:sg:acc:f:ter:_:npraep' 78 | xtags = morfeusz.expand_tags(tags, expand_underscore=False) 79 | self.assertEqual(list(xtags), [ 80 | 'ppron3:sg:acc:f:ter:_:npraep' 81 | ]) 82 | 83 | class test_analyse(unittest.TestCase): 84 | 85 | def test1(self): 86 | text = 'Mama ma.' 87 | interps = morfeusz.analyse(text) 88 | if sgjp: 89 | self.assertEqual(interps.pop(), 90 | [(u('Mama'), u('mama'), 'subst:sg:nom:f'), (u('ma'), u('mój'), 'adj:sg:voc:f:pos'), (u('.'), u('.'), 'interp')] 91 | ) 92 | self.assertEqual(interps, [ 93 | [(u('Mama'), u('mama'), 'subst:sg:nom:f'), (u('ma'), u('mieć'), 'fin:sg:ter:imperf'), (u('.'), u('.'), 'interp')], 94 | [(u('Mama'), u('mama'), 'subst:sg:nom:f'), (u('ma'), u('mój'), 'adj:sg:nom:f:pos'), (u('.'), u('.'), 'interp')] 95 | ]) 96 | 97 | def test2(self): 98 | text = u('Miałem miał.') 99 | interps = morfeusz.analyse(text, dag=True) 100 | self.assertEqual(interps, [ 101 | (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m1:imperf'))), 102 | (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m2:imperf'))), 103 | (0, 1, (u('Miał'), u('mieć'), u('praet:sg:m3:imperf'))), 104 | (1, 2, (u('em'), u('być'), u('aglt:sg:pri:imperf:wok'))), 105 | (0, 2, (u('Miałem'), u('miał'), u('subst:sg:inst:m3'))), 106 | (2, 3, (u('miał'), u('miał'), u('subst:sg:nom:m3'))), 107 | (2, 3, (u('miał'), u('miał'), u('subst:sg:acc:m3'))), 108 | (2, 3, (u('miał'), u('mieć'), u('praet:sg:m1:imperf'))), 109 | (2, 3, (u('miał'), u('mieć'), u('praet:sg:m2:imperf'))), 110 | (2, 3, (u('miał'), u('mieć'), u('praet:sg:m3:imperf'))), 111 | (3, 4, (u('.'), u('.'), u('interp'))), 112 | ]) 113 | 114 | class test_about(unittest.TestCase): 115 | 116 | def test_type(self): 117 | self.assertEqual( 118 | type(morfeusz.about()), 119 | type(u('')) 120 | ) 121 | 122 | if __name__ == '__main__': 123 | unittest.main() 124 | 125 | # vim:ts=4 sts=4 sw=4 et 126 | -------------------------------------------------------------------------------- /morfeusz.py: -------------------------------------------------------------------------------- 1 | # encoding=UTF-8 2 | 3 | # Copyright © 2007-2017 Jakub Wilk 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the “Software”), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in 13 | # all copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | ''' 24 | Python interface to Morfeusz_, 25 | a Polish morphological analyser. 26 | 27 | .. _Morfeusz: 28 | http://sgjp.pl/morfeusz/ 29 | ''' 30 | 31 | from __future__ import with_statement 32 | 33 | import collections 34 | import ctypes 35 | import os 36 | import sys 37 | 38 | py3k = sys.version_info >= (3, 0) 39 | 40 | if py3k: 41 | import _thread as thread 42 | else: 43 | import thread 44 | if not py3k: 45 | from itertools import izip as zip 46 | 47 | if py3k: 48 | unicode = str 49 | 50 | __author__ = 'Jakub Wilk ' 51 | __version__ = '0.3400' 52 | __all__ = ['analyse', 'about', 'expand_tags', 'ATTRIBUTES', 'VALUES'] 53 | 54 | ATTRIBUTES = ''' 55 | subst=number case gender 56 | depr=number case gender 57 | adj=number case gender degree 58 | adja= 59 | adjc= 60 | adjp= 61 | adv=degree 62 | num=number case gender accommodability 63 | numcol=number case gender accommodability 64 | ppron12=number case gender person accentability 65 | ppron3=number case gender person accentability post_prepositionality 66 | siebie=case 67 | fin=number person aspect 68 | bedzie=number person aspect 69 | aglt=number person aspect vocalicity 70 | praet=number gender aspect agglutination 71 | impt=number person aspect 72 | imps=aspect 73 | inf=aspect 74 | pcon=aspect 75 | pant=aspect 76 | ger=number case gender aspect negation 77 | pact=number case gender aspect negation 78 | ppas=number case gender aspect negation 79 | winien=number gender aspect 80 | pred= 81 | prep=case vocalicity 82 | conj= 83 | comp= 84 | brev=fullstoppedness 85 | burk= 86 | interj= 87 | qub=vocalicity 88 | xxs=number case gender 89 | xxx= 90 | interp= 91 | ign= 92 | sp= 93 | ''' 94 | ATTRIBUTES = dict( 95 | (key, tuple(values.split())) 96 | for line in ATTRIBUTES.splitlines() if line 97 | for (key, values) in (line.split('=', 1),) 98 | ) 99 | 100 | VALUES = ''' 101 | number=sg pl 102 | case=nom gen dat acc inst loc voc 103 | gender=m1 m2 m3 f n1 n2 p1 p2 p3 104 | person=pri sec ter 105 | degree=pos comp sup 106 | aspect=imperf perf 107 | negation=aff neg 108 | accentability=akc nakc 109 | post_prepositionality=npraep praep 110 | accommodability=congr rec 111 | agglutination=agl nagl 112 | vocalicity=nwok wok 113 | fullstoppedness=pun npun 114 | ''' 115 | VALUES = dict( 116 | (key, tuple(values.split())) 117 | for line in VALUES.splitlines() if line 118 | for (key, values) in (line.split('=', 1),) 119 | ) 120 | 121 | if os.name == 'nt': 122 | libmorfeusz = ctypes.CDLL('morfeusz.dll') 123 | elif sys.platform == 'darwin': 124 | libmorfeusz = ctypes.CDLL('libmorfeusz.dylib') 125 | else: 126 | libmorfeusz = ctypes.CDLL('libmorfeusz.so.0') 127 | 128 | MORFOPT_ENCODING = 1 129 | MORFEUSZ_UTF_8 = 8 130 | 131 | MORFOPT_WHITESPACE = 2 132 | MORFEUSZ_SKIP_WHITESPACE = 0 133 | MORFEUSZ_KEEP_WHITESPACE = 2 134 | 135 | libmorfeusz.morfeusz_set_option(MORFOPT_ENCODING, MORFEUSZ_UTF_8) 136 | libmorfeusz_lock = thread.allocate_lock() 137 | 138 | class InterpEdge(ctypes.Structure): 139 | _fields_ = ( 140 | ('i', ctypes.c_int), 141 | ('j', ctypes.c_int), 142 | ('_orth', ctypes.c_char_p), 143 | ('_base', ctypes.c_char_p), 144 | ('_tags', ctypes.c_char_p) 145 | ) 146 | 147 | if py3k: 148 | @property 149 | def tags(self): 150 | if self._tags is not None: 151 | return self._tags.decode('UTF-8') 152 | else: 153 | @property 154 | def tags(self): 155 | return self._tags 156 | 157 | @property 158 | def orth(self): 159 | if self._orth is not None: 160 | return self._orth.decode('UTF-8') 161 | 162 | @property 163 | def base(self): 164 | if self._base is not None: 165 | return self._base.decode('UTF-8') 166 | 167 | libmorfeusz_analyse = libmorfeusz.morfeusz_analyse 168 | libmorfeusz_analyse.restype = ctypes.POINTER(InterpEdge) 169 | libmorfeusz_about = libmorfeusz.morfeusz_about 170 | libmorfeusz_about.restype = ctypes.c_char_p 171 | 172 | def expand_tags(tags, expand_dot=True, expand_underscore=True): 173 | 174 | if tags is None: 175 | yield 176 | return 177 | tags = str(tags) 178 | for tag in tags.split('|'): 179 | tag = tag.split(':') 180 | pos = tag.pop(0) 181 | chunks = [(pos,)] 182 | chunks += ( 183 | VALUES[attribute] if chunk == '_' and expand_underscore 184 | else chunk.split('.') 185 | for chunk, attribute in zip(tag, ATTRIBUTES[pos]) 186 | ) 187 | 188 | if not expand_dot: 189 | yield ':'.join('.'.join(values) for values in chunks) 190 | continue 191 | 192 | def expand_chunks(i): 193 | if i >= len(chunks): 194 | yield () 195 | else: 196 | tail = tuple(expand_chunks(i + 1)) 197 | for chunk_variant in chunks[i]: 198 | for tail_variant in tail: 199 | yield (chunk_variant,) + tail_variant 200 | 201 | for x in expand_chunks(0): 202 | yield ':'.join(x) 203 | 204 | _expand_tags = expand_tags 205 | 206 | def _dont_expand_tags(s, **kwargs): 207 | return [s] 208 | 209 | def analyse(text, expand_tags=True, expand_dot=True, expand_underscore=True, dag=False, keep_whitespace=False): 210 | ''' 211 | Analyse the text. 212 | ''' 213 | expand_tags = _expand_tags if expand_tags else _dont_expand_tags 214 | text = unicode(text) 215 | text = text.encode('UTF-8') 216 | analyse = _analyse_as_dag if dag else _analyse_as_list 217 | return analyse( 218 | text=text, 219 | expand_tags=expand_tags, 220 | expand_dot=expand_dot, 221 | expand_underscore=expand_underscore, 222 | keep_whitespace=keep_whitespace 223 | ) 224 | 225 | def _analyse_as_dag(text, expand_tags, expand_dot, expand_underscore, keep_whitespace): 226 | result = [] 227 | with libmorfeusz_lock: 228 | if keep_whitespace: 229 | if libmorfeusz.morfeusz_set_option(MORFOPT_WHITESPACE, MORFEUSZ_KEEP_WHITESPACE) != 1: 230 | raise NotImplementedError("This version of Morfeusz doesn't support keep_whitespace") 231 | for edge in libmorfeusz_analyse(text): 232 | if edge.i == -1: 233 | break 234 | for tag in expand_tags(edge.tags, expand_dot=expand_dot, expand_underscore=expand_underscore): 235 | result += [(edge.i, edge.j, (edge.orth, edge.base, tag))] 236 | if keep_whitespace: 237 | libmorfeusz.morfeusz_set_option(MORFOPT_WHITESPACE, MORFEUSZ_SKIP_WHITESPACE) 238 | return result 239 | 240 | def _analyse_as_list(text, expand_tags, expand_dot, expand_underscore, keep_whitespace): 241 | dag = collections.defaultdict(list) 242 | with libmorfeusz_lock: 243 | if keep_whitespace: 244 | if libmorfeusz.morfeusz_set_option(MORFOPT_WHITESPACE, MORFEUSZ_KEEP_WHITESPACE) != 1: 245 | raise NotImplementedError("This version of Morfeusz doesn't support keep_whitespace") 246 | for edge in libmorfeusz_analyse(text): 247 | if edge.i == -1: 248 | break 249 | for tag in expand_tags(edge.tags, expand_dot=expand_dot, expand_underscore=expand_underscore): 250 | dag[edge.i] += [((edge.orth, edge.base, tag), edge.j)] 251 | if keep_whitespace: 252 | libmorfeusz.morfeusz_set_option(MORFOPT_WHITESPACE, MORFEUSZ_SKIP_WHITESPACE) 253 | def expand_dag(i): 254 | nexts = dag[i] 255 | if not nexts: 256 | yield [] 257 | else: 258 | for head, j in nexts: 259 | for tail in expand_dag(j): 260 | yield [head] + tail 261 | return list(expand_dag(0)) 262 | 263 | def about(): 264 | ''' 265 | Return a string containing information on authors and version of the 266 | underlying library. 267 | ''' 268 | about = libmorfeusz_about() 269 | try: 270 | return about.decode('UTF-8') 271 | except UnicodeError: 272 | return about.decode('ISO-8859-2') 273 | 274 | # vim:ts=4 sts=4 sw=4 et 275 | --------------------------------------------------------------------------------