├── setup.cfg
├── .gitignore
├── LICENSE.md
├── setup.py
├── README.md
└── obt.py


/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | build/
3 | dist/
4 | __pycache__/
5 | *.egg-info/
6 | .*.sh
7 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | # The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2017 Mats Byrkjeland
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 6 | 
 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 8 | 
 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
10 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # Always prefer setuptools over distutils
 2 | from setuptools import setup, find_packages
 3 | # To use a consistent encoding
 4 | from os import path
 5 | import subprocess
 6 | 
 7 | here = path.abspath(path.dirname(__file__))
 8 | 
 9 | # Try to create an rst long_description from README.md
10 | try:
11 |     args = 'pandoc', '--to', 'rst', 'README.md'
12 |     long_description = subprocess.check_output(args)
13 |     long_description = long_description.decode()
14 | except Exception as error:
15 |     print('README.md conversion to reStructuredText failed. Error:')
16 |     print(error)
17 |     print('Setting long_description to None.')
18 |     long_description = None
19 | 
20 | setup(
21 |     name='obt',
22 |     version='0.1.0',
23 |     description='A Python library for The Oslo-Bergen Tagger',
24 |     long_description=long_description,
25 |     url='https://github.com/draperunner/obt',
26 |     author='Mats Byrkjeland',
27 |     author_email='matsbyr@gmail.com',
28 |     license='MIT',
29 |     classifiers=[
30 |         'Development Status :: 4 - Beta',
31 |         'Intended Audience :: Developers',
32 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
33 |         'License :: OSI Approved :: MIT License',
34 |         'Programming Language :: Python :: 3',
35 |         'Programming Language :: Python :: 3.3',
36 |         'Programming Language :: Python :: 3.4',
37 |         'Programming Language :: Python :: 3.5',
38 |     ],
39 |     keywords='pos-tagging nlp pos',
40 |     #packages=find_packages(exclude=['contrib', 'docs', 'tests']),
41 |     py_modules=["obt"],
42 |     install_requires=[],
43 |     extras_require={
44 |         'dev': [],
45 |         'test': [],
46 |     },
47 |     package_data={},
48 |     entry_points={
49 |         'console_scripts': [],
50 |     },
51 | )
52 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # The Oslo-Bergen Tagger for Python
  2 | 
  3 | This is a Python library for [The Oslo-Bergen Tagger](http://www.tekstlab.uio.no/obt-ny/), which parses the output of 
  4 | the tagger to a friendly format. Only Python 3 is supported at this time.
  5 | 
  6 | The library is in beta. See [Roadmap](#roadmap) for things that need to get implemented before a v1.0.0 can be released.
  7 | 
  8 | ## Installation
  9 | 
 10 | You need to have The Oslo-Bergen Tagger installed, and the environment variable `OBT_PATH` set to the path of its
 11 | installation directory. You can use the provided code snippet below, or install it using the instructions in 
 12 | [The-Oslo-Bergen-Tagger GitHub repository](https://github.com/noklesta/The-Oslo-Bergen-Tagger). The following code snippet installs it in your home directory. If you want to install it 
 13 | somewhere else, you can change the `INSTALL_DIR` variable on the first line to your preferred installation directory.
 14 | 
 15 | ```bash
 16 | INSTALL_DIR=$HOME
 17 | THIS_DIR=$PWD
 18 | cd $INSTALL_DIR
 19 | git clone git@github.com:noklesta/The-Oslo-Bergen-Tagger.git
 20 | cd The-Oslo-Bergen-Tagger
 21 | ./bootstrap.sh
 22 | export OBT_PATH=$INSTALL_DIR/The-Oslo-Bergen-Tagger
 23 | echo "export OBT_PATH=$OBT_PATH" >> $HOME/.bashrc
 24 | cd $THIS_DIR
 25 | ```
 26 | 
 27 | You can then install this Python library with pip. To install for all users, do:
 28 | ```bash
 29 | sudo pip3 install obt
 30 | ```
 31 | To just install for your user, do:
 32 | ```bash
 33 | pip3 install --user obt
 34 | ```
 35 | 
 36 | And you are good to go!
 37 | 
 38 | ## Usage
 39 | First, import the library
 40 | ```python
 41 | import obt
 42 | ```
 43 | 
 44 | Then, you can tag a string by passing it to the `tag_bm` function:
 45 | ```python
 46 | my_string = "Jeg er streng."
 47 | tags = obt.tag_bm(my_string)
 48 | ```
 49 | Or you can pass a file name using the `file` keyword argument:
 50 | ```python
 51 | tags = obt.tag_bm(file="my_document.txt")
 52 | ```
 53 | 
 54 | The resulting `tags` will be a list of tag dictionaries, like so:
 55 | ```python
 56 | [
 57 |   {
 58 |     "tall": "ent",
 59 |     "type": "pers hum",
 60 |     "base": "jeg",
 61 |     "person": "1",
 62 |     "word_tag": "<jeg>",
 63 |     "kasus": "nom",
 64 |     "raw_tags": "pron ent pers hum nom 1",
 65 |     "word": "Jeg",
 66 |     "ordklasse": "pron"
 67 |   },
 68 |   {
 69 |     "word_tag": "<er>",
 70 |     "base": "v\u00e6re",
 71 |     "tilleggstagger": [
 72 |       "a5",
 73 |       "pr1",
 74 |       "pr2",
 75 |       "<aux1/perf_part>"
 76 |     ],
 77 |     "tid": "pres",
 78 |     "raw_tags": "verb pres a5 pr1 pr2 <aux1/perf_part>",
 79 |     "word": "er",
 80 |     "ordklasse": "verb"
 81 |   },
 82 |   {
 83 |     "type": "appell",
 84 |     "best": "ub",
 85 |     "base": "streng",
 86 |     "word_tag": "<streng>",
 87 |     "tall": "ent",
 88 |     "ordklasse": "subst",
 89 |     "raw_tags": "subst appell mask ub ent",
 90 |     "word": "streng",
 91 |     "kj\u00f8nn": "mask"
 92 |   },
 93 |   {
 94 |     "word_tag": "<.>",
 95 |     "base": "$.",
 96 |     "tilleggstagger": [
 97 |       "<<<",
 98 |       "<punkt>",
 99 |       "<<<"
100 |     ],
101 |     "raw_tags": "clb <<< <punkt> <<<",
102 |     "word": ".",
103 |     "ordklasse": "clb"
104 |   }
105 | ]
106 | ```
107 | 
108 | You can easily save this to a JSON file with the `obt.save_json` function:
109 | ```python
110 | obt.save_json(tags, 'my_tags.json')
111 | ```
112 | 
113 | ## Format
114 | Documentation of the tag format will be added here.
115 | 
116 | ## Roadmap
117 | Before a v1.0.0 release, the following boxes should be checked:
118 | - [ ] Put "tilleggstagger" in proper items in tags object.
119 | - [ ] Implement function for `./tag-nostat-bm.sh` from https://github.com/noklesta/The-Oslo-Bergen-Tagger
120 | - [ ] Implement function for `./tag-nostat-nn.sh` from https://github.com/noklesta/The-Oslo-Bergen-Tagger
121 | - [ ] Python 2 support
122 | 


--------------------------------------------------------------------------------
/obt.py:
--------------------------------------------------------------------------------
  1 | from os import path, getenv, remove, devnull
  2 | from subprocess import check_output
  3 | 
  4 | FNULL = open(devnull, 'w')
  5 | 
  6 | OBT_PATH = getenv("OBT_PATH", "")
  7 | if OBT_PATH == "":
  8 |     raise EnvironmentError("Path to Oslo-Bergen-Tagger installation dir 'OBT_PATH' not set.")
  9 | 
 10 | TAGS = {
 11 |     'adj': {
 12 |         'kjønn': ['m/f', 'nøyt', 'fem'],
 13 |         'tall': ['ent', 'fl'],
 14 |         'type': ['<adv>', '<ordenstall>', '<perf-part>', '<pres-part>', 'fork'],
 15 |         'best': ['ub', 'be'],
 16 |         'grad': ['pos', 'kom', 'sup']
 17 |     },
 18 |     'adv': {
 19 |         'type': ['fork']
 20 |     },
 21 |     'det': {
 22 |         'kjønn': ['fem', 'nøyt', 'mask'],
 23 |         'tall': ['ent', 'fl'],
 24 |         'type': ['dem', 'dem <adj>', '<adj> forst',
 25 |                  '<adj> kvant', 'kvant', 'poss', 'poss res', 'poss høflig', 'sp', 'forst'],
 26 |         'best': ['ub', 'be'],
 27 |     },
 28 |     'konj': {
 29 |         'type': ['<adv>', 'clb'],
 30 |     },
 31 |     'prep': {
 32 |         'type': ['fork']
 33 |     },
 34 |     'pron': {
 35 |         'kjønn': ['fem', 'mask', 'mask fem', 'nøyt'],
 36 |         'tall': ['ent', 'fl'],
 37 |         'type': ['hum res', 'hum sp', 'pers', 'pers hum', 'pers høflig', 'poss hum sp', 'refl', 'sp', 'res'],
 38 |         'person': ["1", "2", "3"],
 39 |         'kasus': ['nom', 'akk'],
 40 |     },
 41 |     'sbu': {
 42 |         'type': ['<spørreartikkel>'],
 43 |     },
 44 |     'subst': {
 45 |         'kjønn': ['nøyt', 'fem', 'mask'],
 46 |         'tall': ['ent', 'fl'],
 47 |         'type': ['appell fork', 'appell', 'prop', 'fork'],
 48 |         'best': ['ub', 'be'],
 49 |         'kasus': ['gen'],
 50 |     },
 51 |     'verb': {
 52 |         'tid': ['pres inf pass', 'pres', 'inf', 'pret', 'perf-part', 'imp'],
 53 |     }
 54 | }
 55 | 
 56 | 
 57 | def write_file(data, filepath):
 58 |     with open(filepath, "w+") as f:
 59 |         f.write(data)
 60 | 
 61 | 
 62 | def assign_tags(word_tags):
 63 |     pos_tag = word_tags[0]
 64 |     tags = word_tags[1:]
 65 |     tag = {'ordklasse': pos_tag, 'raw_tags': ' '.join(word_tags)}
 66 | 
 67 |     num_tags = len(tags)
 68 |     num_tags_assigned = 0
 69 | 
 70 |     while len(tags) > 0 and num_tags_assigned < num_tags:
 71 |         proposed_tag = ' '.join(tags)
 72 | 
 73 |         found = False
 74 |         if pos_tag in TAGS:
 75 |             for t in TAGS[pos_tag]:
 76 |                 if proposed_tag in TAGS[pos_tag][t]:
 77 |                     tag[t] = proposed_tag
 78 |                     found = True
 79 |                     break
 80 | 
 81 |         length = len(tags)
 82 | 
 83 |         if found:
 84 |             num_tags_assigned += length
 85 |             tags = word_tags[1+num_tags_assigned:]
 86 |         elif length == 1:
 87 |             if "tilleggstagger" not in tag:
 88 |                 tag["tilleggstagger"] = []
 89 |             tag["tilleggstagger"].append(proposed_tag)
 90 |             num_tags_assigned += 1
 91 |             tags = word_tags[1+num_tags_assigned:]
 92 |         else:
 93 |             tags = tags[:-1]
 94 | 
 95 |     return tag
 96 | 
 97 | 
 98 | def check_input(text, file):
 99 |     if text is None and file is None:
100 |         raise ValueError("No argument passed. Either pass a string or a filename using the file= kwarg")
101 |     if text is not None and file is not None:
102 |         raise ValueError("Both a string and file were passed as argument. Please only use one.")
103 |     if file is not None and not path.isfile(file):
104 |         raise FileNotFoundError("Could not find file called \"" + str(file) + "\"")
105 | 
106 | 
def save_json(tags, filename):
    """Serialize *tags* as JSON indented by two spaces and write it to *filename*.

    :param tags: JSON-serializable data, e.g. the list returned by tag_bm.
    :param filename: path of the file to create or overwrite.
    """
    import json

    serialized = json.dumps(tags, indent=2)
    write_file(serialized, filename)
110 | 
111 | 
def tag_bm(text=None, file=None, encoding="UTF-8"):
    """Tag text with the tagger's ``tag-bm.sh`` script and parse its output.

    Exactly one of *text* and *file* must be given. A string is written to a
    uniquely named temporary file, which is removed again even if the tagger
    fails.

    :param text: the string to tag.
    :param file: path of an existing file to tag.
    :param encoding: encoding used to decode the tagger's output.
    :return: list of dicts, one per ``<word>`` entry in the output, as built
        by assign_tags plus the keys "word", "word_tag" and "base".
    :raises ValueError: if neither or both of *text* and *file* are given.
    :raises FileNotFoundError: if *file* does not name an existing file.
    :raises subprocess.CalledProcessError: if the tagger exits with an error.
    :raises IndexError: if a ``<word>`` entry lacks its two following lines
        or its reading has no tags.
    """
    import tempfile

    check_input(text, file)
    tagger = path.join(OBT_PATH, "tag-bm.sh")

    if text is not None:
        # A unique file avoids clashes between concurrent callers, which a
        # fixed path in /tmp would suffer from.
        with tempfile.NamedTemporaryFile(prefix="obt", suffix=".txt", delete=False) as handle:
            temp_file = handle.name
        try:
            write_file(text, temp_file)
            result = check_output([tagger, temp_file], stderr=FNULL).decode(encoding)
        finally:
            remove(temp_file)
    else:
        result = check_output([tagger, file], stderr=FNULL).decode(encoding)

    tags = []
    lines = result.split("\n")

    # Walk the lines by position. Searching for each "<word>" line by value
    # would always find its first occurrence, so a repeated word would be
    # given the reading of its first appearance.
    for index, line in enumerate(lines):
        if not line.startswith("<word>"):
            continue

        # "<word>X</word>" -> "X"
        word = line.strip()[6:-7]
        # The next line holds the word tag wrapped in one character per side.
        word_tag = lines[index + 1].strip()[1:-1]

        # The line after that is the reading: quoted base form, then tags.
        word_tags_split = lines[index + 2].strip().split()
        base = word_tags_split[0][1:-1]
        word_tags = word_tags_split[1:]

        tag = assign_tags(word_tags)
        tag["word"] = word
        tag["word_tag"] = word_tag
        tag["base"] = base
        tags.append(tag)

    return tags
147 | 


--------------------------------------------------------------------------------