├── setup.cfg ├── .gitignore ├── LICENSE.md ├── setup.py ├── README.md └── obt.py /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | build/ 3 | dist/ 4 | __pycache__/ 5 | *.egg-info/ 6 | .*.sh 7 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Mats Byrkjeland 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
"""Packaging script for the ``obt`` module.

Builds ``long_description`` by converting README.md to reStructuredText with
pandoc (best effort) and then calls ``setuptools.setup``.
"""

# Always prefer setuptools over distutils
from setuptools import setup, find_packages
from os import path
import subprocess

# Directory containing this script; README.md is resolved against it so the
# conversion works no matter which directory setup.py is invoked from.
here = path.abspath(path.dirname(__file__))

# Try to create an rst long_description from README.md
try:
    args = ['pandoc', '--to', 'rst', path.join(here, 'README.md')]
    long_description = subprocess.check_output(args).decode()
except Exception as error:
    # Deliberately broad: a missing pandoc binary, a failed conversion or
    # undecodable output must never prevent installation.
    print('README.md conversion to reStructuredText failed. Error:')
    print(error)
    print('Setting long_description to None.')
    long_description = None

setup(
    name='obt',
    version='0.1.0',
    description='A Python library for The Oslo-Bergen Tagger',
    long_description=long_description,
    url='https://github.com/draperunner/obt',
    author='Mats Byrkjeland',
    author_email='matsbyr@gmail.com',
    license='MIT',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
    ],
    keywords='pos-tagging nlp pos',
    #packages=find_packages(exclude=['contrib', 'docs', 'tests']),
    py_modules=["obt"],
    install_requires=[],
    extras_require={
        'dev': [],
        'test': [],
    },
    package_data={},
    entry_points={
        'console_scripts': [],
    },
)
-------------------------------------------------------------------------------- 1 | # The Oslo-Bergen Tagger for Python 2 | 3 | This is a Python library for [The Oslo-Bergen Tagger](http://www.tekstlab.uio.no/obt-ny/), which parses the output of 4 | the tagger to a friendly format. Only Python 3 is supported at this time. 5 | 6 | The library is in beta. See [Roadmap](#roadmap) for things that need to get implemented before a v1.0.0 can be released. 7 | 8 | ## Installation 9 | 10 | You need to have The Oslo-Bergen Tagger installed, and the environment variable `OBT_PATH` set to the path of its 11 | installation directory. You can use the provided code snippet below, or install it using the instructions in 12 | [The-Oslo-Bergen-Tagger GitHub repository](https://github.com/noklesta/The-Oslo-Bergen-Tagger). The following code snippet installs it in your home directory. If you want to install it 13 | somewhere else, you can change the `INSTALL_DIR` variable on the first line to your preferred installation directory. 14 | 15 | ```bash 16 | INSTALL_DIR=$HOME 17 | THIS_DIR=$PWD 18 | cd $INSTALL_DIR 19 | git clone git@github.com:noklesta/The-Oslo-Bergen-Tagger.git 20 | cd The-Oslo-Bergen-Tagger 21 | ./bootstrap.sh 22 | export OBT_PATH=$INSTALL_DIR/The-Oslo-Bergen-Tagger 23 | echo "export OBT_PATH=$OBT_PATH" >> $HOME/.bashrc 24 | cd $THIS_DIR 25 | ``` 26 | 27 | You can then install this Python library with pip. To install for all users, do: 28 | ```bash 29 | sudo pip3 install obt 30 | ``` 31 | To just install for your user, do: 32 | ```bash 33 | pip3 install --user obt 34 | ``` 35 | 36 | And you are good to go! 37 | 38 | ## Usage 39 | First, import the library: 40 | ```python 41 | import obt 42 | ``` 43 | 44 | Then, you can tag a string by passing it to the `tag_bm` function: 45 | ```python 46 | my_string = "Jeg er streng." 
47 | tags = obt.tag_bm(my_string) 48 | ``` 49 | Or you can pass a file name using the `file` keyword argument: 50 | ```python 51 | tags = obt.tag_bm(file="my_document.txt") 52 | ``` 53 | 54 | The resulting `tags` will be an array of tag objects, like so: 55 | ```python 56 | [ 57 | { 58 | "tall": "ent", 59 | "type": "pers hum", 60 | "base": "jeg", 61 | "person": "1", 62 | "word_tag": "", 63 | "kasus": "nom", 64 | "raw_tags": "pron ent pers hum nom 1", 65 | "word": "Jeg", 66 | "ordklasse": "pron" 67 | }, 68 | { 69 | "word_tag": "", 70 | "base": "v\u00e6re", 71 | "tilleggstagger": [ 72 | "a5", 73 | "pr1", 74 | "pr2", 75 | "" 76 | ], 77 | "tid": "pres", 78 | "raw_tags": "verb pres a5 pr1 pr2 ", 79 | "word": "er", 80 | "ordklasse": "verb" 81 | }, 82 | { 83 | "type": "appell", 84 | "best": "ub", 85 | "base": "streng", 86 | "word_tag": "", 87 | "tall": "ent", 88 | "ordklasse": "subst", 89 | "raw_tags": "subst appell mask ub ent", 90 | "word": "streng", 91 | "kj\u00f8nn": "mask" 92 | }, 93 | { 94 | "word_tag": "<.>", 95 | "base": "$.", 96 | "tilleggstagger": [ 97 | "<<<", 98 | "", 99 | "<<<" 100 | ], 101 | "raw_tags": "clb <<< <<<", 102 | "word": ".", 103 | "ordklasse": "clb" 104 | } 105 | ] 106 | ``` 107 | 108 | You can easily save this to a JSON file with the `obt.save_json` function: 109 | ```python 110 | obt.save_json(tags, 'my_tags.json') 111 | ``` 112 | 113 | ## Format 114 | A documentation of the tag format will come here. 115 | 116 | ## Roadmap 117 | Before a v1.0.0 release, the following boxes should be checked: 118 | - [ ] Put "tilleggstagger" in proper items in tags object. 
"""Python wrapper around The Oslo-Bergen Tagger (OBT).

Runs the tagger's ``tag-bm.sh`` script from the installation directory named
by the ``OBT_PATH`` environment variable and converts its textual output into
a list of dictionaries, one per tagged word.

Importing this module raises ``EnvironmentError`` when ``OBT_PATH`` is unset
or empty.
"""
# TODO (README roadmap): add wrappers for ./tag-nostat-bm.sh and
# ./tag-nostat-nn.sh from https://github.com/noklesta/The-Oslo-Bergen-Tagger,
# and add Python 2 support.

import json
import tempfile
from os import path, getenv, remove, devnull
from subprocess import DEVNULL, check_output

# Sink for the tagger's stderr. Kept under its historical name for backward
# compatibility; subprocess.DEVNULL replaces the former open(devnull, 'w')
# handle, which was never closed.
FNULL = DEVNULL

OBT_PATH = getenv("OBT_PATH", "")
if OBT_PATH == "":
    raise EnvironmentError("Path to Oslo-Bergen-Tagger installation dir 'OBT_PATH' not set.")

# Known tag values, grouped first by part of speech ("ordklasse") and then by
# grammatical category. assign_tags() looks a (possibly multi-word) tag up in
# these lists to decide which key of the result it belongs under.
# NOTE(review): the empty-string entries and the entries with stray leading or
# trailing spaces look like angle-bracket tags that were lost at some point --
# confirm against the tagger's tag set.
TAGS = {
    'adj': {
        'kjønn': ['m/f', 'nøyt', 'fem'],
        'tall': ['ent', 'fl'],
        'type': ['', '', '', '', 'fork'],
        'best': ['ub', 'be'],
        'grad': ['pos', 'kom', 'sup']
    },
    'adv': {
        'type': ['fork']
    },
    'det': {
        'kjønn': ['fem', 'nøyt', 'mask'],
        'tall': ['ent', 'fl'],
        'type': ['dem', 'dem ', ' forst',
                 ' kvant', 'kvant', 'poss', 'poss res', 'poss høflig', 'sp', 'forst'],
        'best': ['ub', 'be'],
    },
    'konj': {
        'type': ['', 'clb'],
    },
    'prep': {
        'type': ['fork']
    },
    'pron': {
        'kjønn': ['fem', 'mask', 'mask fem', 'nøyt'],
        'tall': ['ent', 'fl'],
        'type': ['hum res', 'hum sp', 'pers', 'pers hum', 'pers høflig', 'poss hum sp', 'refl', 'sp', 'res'],
        'person': ["1", "2", "3"],
        'kasus': ['nom', 'akk'],
    },
    'sbu': {
        'type': [''],
    },
    'subst': {
        'kjønn': ['nøyt', 'fem', 'mask'],
        'tall': ['ent', 'fl'],
        'type': ['appell fork', 'appell', 'prop', 'fork'],
        'best': ['ub', 'be'],
        'kasus': ['gen'],
    },
    'verb': {
        'tid': ['pres inf pass', 'pres', 'inf', 'pret', 'perf-part', 'imp'],
    }
}


def write_file(data, filepath):
    """Write the string ``data`` to ``filepath``, replacing any existing file."""
    with open(filepath, "w") as f:
        f.write(data)


def assign_tags(word_tags):
    """Sort the tags of one word into named categories.

    ``word_tags`` is a non-empty list of strings whose first item is the part
    of speech and whose remaining items are the word's other tags.

    The remaining tags are consumed left to right. At each position the
    longest run of consecutive tags whose space-joined form appears in
    ``TAGS[<part of speech>]`` is stored under that category's name. A single
    tag that matches no category is appended to the ``"tilleggstagger"`` list.

    Returns a dict that always has ``"ordklasse"`` (the part of speech) and
    ``"raw_tags"`` (all of ``word_tags`` joined by spaces), plus one key per
    matched category and, when needed, ``"tilleggstagger"``.

    Raises IndexError if ``word_tags`` is empty.
    """
    pos_tag = word_tags[0]
    tag = {'ordklasse': pos_tag, 'raw_tags': ' '.join(word_tags)}

    # An unknown part of speech has no categories, so all of its tags end up
    # in "tilleggstagger".
    categories = TAGS.get(pos_tag, {})
    features = word_tags[1:]

    # features[start:end] is the candidate run currently being looked up.
    start, end = 0, len(features)
    while start < len(features):
        candidate = ' '.join(features[start:end])
        category = next(
            (name for name, values in categories.items() if candidate in values),
            None,
        )
        if category is not None:
            tag[category] = candidate
            start, end = end, len(features)
        elif end - start == 1:
            tag.setdefault("tilleggstagger", []).append(candidate)
            start, end = end, len(features)
        else:
            # No match for this run: retry with its last tag dropped.
            end -= 1

    return tag


def check_input(text, file):
    """Validate that exactly one of ``text`` and ``file`` is given.

    Raises ValueError when both or neither are given, and FileNotFoundError
    when ``file`` is given but is not an existing regular file.
    """
    if text is None and file is None:
        raise ValueError("No argument passed. Either pass a string or a filename using the file= kwarg")
    if text is not None and file is not None:
        raise ValueError("Both a string and file were passed as argument. Please only use one.")
    if file is not None and not path.isfile(file):
        raise FileNotFoundError("Could not find file called \"" + str(file) + "\"")


def save_json(tags, filename):
    """Serialise ``tags`` as JSON indented by two spaces and write it to ``filename``."""
    write_file(json.dumps(tags, indent=2), filename)


def _run_tagger(script, filepath, encoding):
    """Run ``script`` from OBT_PATH on ``filepath`` and return its decoded stdout."""
    output = check_output([path.join(OBT_PATH, script), filepath], stderr=DEVNULL)
    return output.decode(encoding)


def _parse_tagger_output(result):
    """Turn the tagger's textual output into a list of tag dicts."""
    lines = result.split("\n")
    tags = []

    # Walk the lines by position: looking a marker line up by value would
    # return the first occurrence again whenever a word is repeated.
    for index, line in enumerate(lines):
        if not line.startswith("<word>"):
            continue

        # Drop the leading "<word>" (6 characters) and trailing "</word>" (7).
        word = line.strip()[6:-7]
        # The next line holds the word tag wrapped in one delimiter character
        # on each side (presumably quotes -- confirm against tagger output).
        word_tag = lines[index + 1].strip()[1:-1]

        # The line after that holds the delimited base form followed by the
        # whitespace-separated tags.
        analysis = lines[index + 2].strip().split()
        base = analysis[0][1:-1]

        tag = assign_tags(analysis[1:])
        tag["word"] = word
        tag["word_tag"] = word_tag
        tag["base"] = base
        tags.append(tag)

    return tags


def tag_bm(text=None, file=None, encoding="UTF-8"):
    """Tag a string or a file with the tagger's ``tag-bm.sh`` script.

    Pass exactly one of ``text`` (the string to tag) and ``file`` (the name
    of an existing file to tag). ``encoding`` is used to decode the tagger's
    output and, when ``text`` is given, to write the temporary input file.

    Returns a list of dicts, one per ``<word>`` entry in the tagger output.
    Each is the result of assign_tags() extended with ``"word"``,
    ``"word_tag"`` and ``"base"``.

    Raises ValueError or FileNotFoundError for invalid arguments (see
    check_input), subprocess.CalledProcessError when the tagger exits with a
    non-zero status, and IndexError when a ``<word>`` entry in the output is
    not followed by its two analysis lines.
    """
    check_input(text, file)

    if text is None:
        result = _run_tagger("tag-bm.sh", file, encoding)
    else:
        # A uniquely named temporary file keeps concurrent calls from
        # overwriting each other's input. delete=False because the tagger
        # must be able to open the file by name after it has been closed.
        handle = tempfile.NamedTemporaryFile(
            mode="w", encoding=encoding, prefix="obt", suffix=".txt", delete=False
        )
        try:
            with handle:
                handle.write(text)
            result = _run_tagger("tag-bm.sh", handle.name, encoding)
        finally:
            # Clean up even when writing or the tagger fails.
            remove(handle.name)

    return _parse_tagger_output(result)