├── unidic ├── dicdir │ ├── char.bin │ ├── dicrc │ ├── sys.dic │ ├── unk.dic │ └── matrix.bin ├── __init__.py ├── unidic.py ├── __main__.py └── download.py ├── MANIFEST.in ├── requirements.txt ├── extras ├── reiwa.33.csv ├── README.md └── clean-lex.sh ├── setup.py ├── setup.cfg ├── dicts.json ├── LICENSE ├── LICENSE.unidic ├── doc └── dataset.md └── README.md /unidic/dicdir/char.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /unidic/dicdir/dicrc: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /unidic/dicdir/sys.dic: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /unidic/dicdir/unk.dic: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /unidic/dicdir/matrix.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /unidic/__init__.py: -------------------------------------------------------------------------------- 1 | from .unidic import DICDIR, VERSION 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include dicdir/* 2 | include README.md 3 | include LICENSE 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests>=2.22.0,<3.0.0 2 | tqdm>=4.41.1,<5.0.0 3 | wasabi>=0.6.0,<1.0.0 4 | plac>=1.1.3,<2.0.0 5 | 
# --------------------------------------------------------------------------------
# /extras/reiwa.33.csv:
# --------------------------------------------------------------------------------
# 令和,14629,15402,8205,名詞,固有名詞,一般,*,*,*,レイワ,令和,令和,レーワ,令和,レーワ,固,*,*,*,*,*,*,*,レイワ,レイワ,レイワ,レイワ,"1,0",*,*,*,*
# ㋿,18255,20453,2588,補助記号,一般,*,*,*,*,,㋿,㋿,,㋿,,記号,*,*,*,*,*,*,*,,,,,*,*,*,*,999999
# ㋿,14629,15402,3992,名詞,固有名詞,一般,*,*,*,レイワ,令和,㋿,レーワ,㋿,レーワ,固,*,*,*,*,*,*,*,レイワ,レイワ,レイワ,レイワ,"1,0",*,*,*,*
#
# --------------------------------------------------------------------------------
# /unidic/unidic.py:
# --------------------------------------------------------------------------------
import os
import sys
from wasabi import msg


def get_version(dicdir):
    """Return the dictionary version recorded inside ``dicdir``.

    Reads the file named ``version`` in ``dicdir`` and returns its contents
    with surrounding whitespace stripped. If that file does not exist the
    string ``'0'`` is returned instead of raising.
    """
    try:
        vpath = os.path.join(dicdir, 'version')
        with open(vpath) as vfile:
            return vfile.read().strip()
    except FileNotFoundError:
        return '0'


_curdir = os.path.dirname(__file__)

# This will be used elsewhere to initialize the tagger
DICDIR = os.path.join(_curdir, 'dicdir')
VERSION = get_version(DICDIR)

# --------------------------------------------------------------------------------
# /setup.py:
# --------------------------------------------------------------------------------
import pathlib
import setuptools
# distutils is deprecated (PEP 632) and no longer ships with Python 3.12+;
# setuptools, already imported above, provides the equivalent setup().
from setuptools import setup
import os

setup(name='unidic',
      version='1.1.0',
      author="Paul O'Leary McCann",
      author_email="polm@dampfkraft.com",
      description="UniDic packaged for Python",
      long_description=pathlib.Path('README.md').read_text('utf8'),
      long_description_content_type="text/markdown",
      url="https://github.com/polm/unidic-py",
      packages=setuptools.find_packages(),
      package_data={'unidic': ['dicdir/*']},
      )
# --------------------------------------------------------------------------------
# /unidic/__main__.py:
# --------------------------------------------------------------------------------
# Command-line entry point: dispatches the first argument to a sub-command and
# hands the remaining arguments to it through plac.
if __name__ == '__main__':
    from .download import download_version
    import plac
    import sys

    commands = {
        "download": download_version,
    }
    available = ", ".join(commands)

    # No sub-command given: list what is available and fail.
    if len(sys.argv) == 1:
        print("Available commands:", available)
        sys.exit(1)

    # Remove the sub-command name so only its own arguments remain in argv.
    command = sys.argv.pop(1)

    if command not in commands:
        print("Unknown command:", command)
        print("Available commands:", available)
        sys.exit(1)

    plac.call(commands[command], sys.argv[1:])

# --------------------------------------------------------------------------------
# /setup.cfg:
# --------------------------------------------------------------------------------
# [metadata]
# description = Japanese UniDic packaged for Python
# url = https://github.com/polm/unidic-py
# author = "Paul O'Leary McCann"
# author_email = polm@dampfkraft.com
# license = MIT
# long_description = file: README.md
# long_description_content_type = text/markdown
# classifiers =
#     License :: OSI Approved :: MIT License
#     Natural Language :: Japanese
#
# [options]
# include_package_data = True
# python_requires = >=3.5
# install_requires =
#     requests>=2.22.0,<3.0.0
#     tqdm>=4.41.1,<5.0.0
#     wasabi>=0.6.0,<1.0.0
#     plac>=1.1.3,<2.0.0
#
# --------------------------------------------------------------------------------
# /extras/README.md:
# --------------------------------------------------------------------------------
# # Extras
#
# unidic-py distributes a slightly modified version of UniDic for ease of use. To
# build this dictionary yourself, perform the following steps:
#
# 1. Download the official latest UniDic from the [homepage](https://ccd.ninjal.ac.jp/unidic/)
# 2. Use `clean-lex.sh` to rewrite `lex.csv`
# 3. Copy the appropriate `reiwa.csv` to your dictionary directory (the number is the field count)
# 4.
# Run the normal mecab dictionary build command
#
# That's it.
#
# The normal MeCab dictionary building command is:
#
#     /usr/local/libexec/mecab/mecab-dict-index -d . -o . -f utf8 -t utf8
#
# Note that depending on your MeCab install the path may be different.
#
# --------------------------------------------------------------------------------
# /extras/clean-lex.sh:
# --------------------------------------------------------------------------------
#!/bin/bash
# Remove entries from lex.csv that can make weird tokenizations

# usage:
#     ./clean-lex.sh [lex.csv] > lex.fix.csv
# Make sure to delete the original lex.csv before building a dictionary.

# Types of entries to remove:
# 1. Single latin letters. These can cause short unknown words to break up,
#    like "fish" as "f i s h".
# 2. Number-only entries of any length. These can cause strange pronunciations,
#    like 10 as "ten".

# Note it is extremely important that the character ranges are broken up like
# this to avoid including punctuation.
# NOTE(review): each range appears twice in the pattern below; presumably the
# second copy was a full-width range that was normalized to ASCII at some
# point -- confirm against the upstream script before relying on it.
pattern='^([A-Za-zA-Za-z]|[0-90-9]*),'

# Print every line of the input file whose first CSV field does not match.
grep -Ev "$pattern" "$1"

# In Unidic 2.3.0 this removes 232 entries.
# In Unidic 3.1.0 this removes 233 entries.
20 | -------------------------------------------------------------------------------- /dicts.json: -------------------------------------------------------------------------------- 1 | { 2 | "latest": { 3 | "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic-3.1.0.zip", 4 | "version" : "3.1.0+2021-08-31" 5 | }, 6 | "3.1.0+2021-08-31": { 7 | "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic-3.1.0.zip", 8 | "version" : "3.1.0+2021-08-31" 9 | }, 10 | "2.3.0+2020-10-08": { 11 | "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic.zip", 12 | "version" : "2.3.0+2020-10-08" 13 | }, 14 | "3.1.0a1": { 15 | "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic-3.1.0a1.zip", 16 | "version" : "3.1.0+2021-08-31" 17 | }, 18 | "1.0.2": { 19 | "url" : "https://github.com/polm/unidic-py/releases/download/v1.0.2/unidic.zip", 20 | "version" : "2.3.0" 21 | }, 22 | "aws": { 23 | "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic.zip", 24 | "version" : "2.3.0+2020-10-08" 25 | }, 26 | "blank": { 27 | "url": "https://www.dampfkraft.com/unidic-blank.zip", 28 | "version": "blank" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Paul O'Leary McCann 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of 
the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LICENSE.unidic: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2017, The UniDic Consortium 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the 14 | distribution. 15 | 16 | * Neither the name of the UniDic Consortium nor the names of its 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# --------------------------------------------------------------------------------
# /unidic/download.py:
# --------------------------------------------------------------------------------
import requests
import shutil
import zipfile
import os
import sys
from wasabi import msg
from urllib.request import urlretrieve
from tqdm import tqdm


# This is used to show progress when downloading.
# see here: https://github.com/tqdm/tqdm#hooks-and-callbacks
class TqdmUpTo(tqdm):
    """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        b  : int, optional
            Number of blocks transferred so far [default: 1].
        bsize  : int, optional
            Size of each block (in tqdm units) [default: 1].
        tsize  : int, optional
            Total size (in tqdm units). If [default: None] remains unchanged.
        """
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)  # will also set self.n = b * bsize


def download_file(url, fname):
    """Stream ``url`` into the local file ``fname`` and return ``fname``.

    Raises ``requests.HTTPError`` when the server answers with an error
    status, so that an error page is never written to disk in place of the
    requested file.
    """
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(fname, 'wb') as f:
            shutil.copyfileobj(r.raw, f)

    return fname


def download_progress(url, fname):
    """Download a file and show a progress bar.

    The bar is labelled with the last path component of ``url``. Returns
    ``fname``.
    """
    with TqdmUpTo(unit='B', unit_scale=True, miniters=1,
                  desc=url.split('/')[-1]) as t:  # all optional kwargs
        urlretrieve(url, filename=fname, reporthook=t.update_to, data=None)
        # Make the bar's total match what was actually transferred.
        t.total = t.n
    return fname


def get_json(url, desc):
    """Fetch ``url`` and return the decoded JSON body.

    ``desc`` is a human-readable name for the resource, used only in the
    failure message. On any status other than 200 the failure is reported
    through ``msg.fail`` with ``exits=1``.
    """
    r = requests.get(url)
    if r.status_code != 200:
        msg.fail(
            "Server error ({})".format(r.status_code),
            "Couldn't fetch {}. If this error persists please open an issue."
            " http://github.com/polm/unidic-py/issues/".format(desc),
            exits=1,
        )
    return r.json()


def download_and_clean(version, url, dirname='unidic', delfiles=()):
    """Download unidic and prep the dicdir.

    This downloads the zip file from the source, extracts it, renames the
    resulting directory, and removes large files not used at runtime.

    version  : version string, written to the ``version`` file in the dicdir.
    url      : location of the zip archive to download.
    dirname  : name of the directory the archive extracts to.
    delfiles : names of files inside the dicdir to delete after extraction.
    """
    cdir = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.join(cdir, 'unidic.zip')
    print("Downloading UniDic v{}...".format(version), file=sys.stderr)
    download_progress(url, fname)
    print("Finished download.")

    with zipfile.ZipFile(fname, 'r') as zf:
        zf.extractall(cdir)
    os.remove(fname)

    # Replace any previously installed dictionary with the new one.
    dicdir = os.path.join(cdir, 'dicdir')
    if os.path.isdir(dicdir):
        shutil.rmtree(dicdir)

    outdir = os.path.join(cdir, dirname)
    shutil.move(outdir, dicdir)

    for dfile in delfiles:
        os.remove(os.path.join(dicdir, dfile))

    # save a version file so we can tell what it is
    vpath = os.path.join(dicdir, 'version')
    with open(vpath, 'w') as vfile:
        vfile.write('unidic-{}'.format(version))

    # Write a dummy mecabrc
    with open(os.path.join(dicdir, 'mecabrc'), 'w') as mecabrc:
        mecabrc.write('# This is a dummy file.')

    print("Downloaded UniDic v{} to {}".format(version, dicdir), file=sys.stderr)


DICT_INFO = "https://raw.githubusercontent.com/polm/unidic-py/master/dicts.json"


def download_version(ver="latest"):
    """Look up ``ver`` in the published dictionary list and install it.

    The list is fetched from ``DICT_INFO``. When ``ver`` is not one of its
    keys, the known versions are printed and the process exits with status 1
    instead of continuing with an undefined entry.
    """
    res = get_json(DICT_INFO, "dictionary info")
    try:
        dictinfo = res[ver]
    except KeyError:
        print('Unknown version "{}".'.format(ver))
        print("Known versions:")
        for key, val in res.items():
            print("\t", key, "({})".format(val['version']))
        # Nothing to download; stop here rather than using an unbound name.
        sys.exit(1)

    print("download url:", dictinfo['url'])
    print("Dictionary version:", dictinfo['version'])
    download_and_clean(dictinfo['version'], dictinfo['url'])

# --------------------------------------------------------------------------------
# /doc/dataset.md:
# --------------------------------------------------------------------------------
# This is a description of the various tokenizer dictionaries hosted on AWS as
# part of Amazon's Open Data Sponsorship Program.
This file is distributed with 3 | the main unidic-py package, but also describes other dictionaries. 4 | 5 | This repository contains dictionaries for use with MeCab for tokenization and 6 | morphological analysis of modern written Japanese. 7 | 8 | ## Background 9 | 10 | Two main dictionaries are included in this repository: UniDic and IPADic. 11 | 12 | UniDic is maintained by NINJAL, the National Institute for Japanese Language 13 | and Linguistics. It is the official dictionary of Japanese Universal 14 | Dependencies. Based on the "School Grammar" model of Japanese, it uses the 15 | "short unit word" as the base unit of tokenization, which results in highly 16 | reproducible tokenizations suitable for many downstream tasks. 17 | 18 | In its default distribution, UniDic includes dictionary entries for individual 19 | letters of the latin alphabet and some numbers. Because these can cause 20 | unexpected tokenization results with the default configuration, they have been 21 | removed. As a result strings of latin text or numbers will result in single 22 | tokens. 23 | 24 | This distribution of UniDic also adds entries for 令和 *Reiwa*, the name of the 25 | new Imperial Era that began in 2019. 26 | 27 | Additionally, a smaller version of UniDic called "unidic-lite" is provided. This 28 | is based on v2.1.2 of UniDic, which was much smaller in size than more recent 29 | versions, but should still be accurate for many applications. In particular, 30 | for first time use of MeCab it's more than enough to get started with Japanese 31 | text analysis. 32 | 33 | Besides UniDic, IPADic is also provided. IPADic is a dictionary based on an 34 | original definition of the "word" and was built at the Nara Institute of 35 | Science and Technology (NAIST), based on an older dictionary from the Institute 36 | for New Generation Computer Technology (ICOT). IPADic has not been updated 37 | since 2007, so using it is not recommended for new projects.
However, IPADic 38 | has been used for many old benchmarks and occasionally is used in newer 39 | projects, so it's provided here for purposes of compatibility. Unlike UniDic, 40 | IPADic has not been modified for convenience or to make it more current in the 41 | interest of providing maximum compatibility with old benchmarks. 42 | 43 | ## Data Access 44 | 45 | The various dictionaries are stored in the cloud on Amazon Web Services and can 46 | be installed through pip. 47 | 48 | pip install unidic 49 | pip install unidic-lite 50 | pip install ipadic 51 | 52 | In the case of unidic, due to the large file size - 1GB after extraction - it 53 | has an extra download step. Do this to download the dictionary data from S3: 54 | 55 | python -m unidic download aws 56 | 57 | ## Data Types and Structure 58 | 59 | The dictionaries are provided as binary files for use with MeCab and you should 60 | generally not need to access them directly. 61 | 62 | The raw files used to compile the dictionaries are in CSV format. The fields 63 | differ for each dictionary, except for the first four fields, which are defined 64 | by MeCab. 65 | 66 | - word: the literal word as it appears in text 67 | - left_id: an internal ID (integer) 68 | - right_id: an internal ID (integer). Ordinarily this is always the same as left_id. 69 | - cost: (integer) Used in calculating tokenizations, low cost entries take priority over high cost entries. 70 | 71 | ### UniDic 72 | 73 | UniDic fields are defined at [NINJAL's UniDic FAQ page][faq]. unidic-lite uses 74 | a subset of the fields in the most recent version.
75 | 76 | [faq]: https://clrd.ninjal.ac.jp/unidic/faq.html 77 | 78 | IPADic fields are as follows: 79 | 80 | - pos1: Broad part of speech 81 | - pos2: part of speech 82 | - pos3: part of speech 83 | - ctype: conjugation type 84 | - cform: the conjugation of the current token 85 | - lemma: normalized form 86 | - reading: standard kana representation 87 | - pronunciation: as reading, but long vowels are differentiated (講師 is こーし, 子牛 is こうし) 88 | 89 | ## References 90 | 91 | UniDic is developed by NINJAL. Here are some resources on UniDic: 92 | 93 | - [UniDic home page](https://clrd.ninjal.ac.jp/unidic/) 94 | - [Universal Dependencies for Japanese](https://www.semanticscholar.org/paper/Universal-Dependencies-for-Japanese-Tanaka-Miyao/064b601542d27471e397f8df811f0ddb54824113) 95 | 96 | The MeCab documentation covers dictionary formats and tokenizer options. It 97 | also has some discussion of IPADic, which does not have a home page. 98 | 99 | - [MeCab documentation](https://taku910.github.io/mecab/) 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # unidic-py 2 | 3 | This is a version of [UniDic](https://clrd.ninjal.ac.jp/unidic/) for 4 | Contemporary Written Japanese packaged for use with pip. 5 | 6 | Currently it supports 3.1.0, the latest version of UniDic. **Note this will 7 | take up 770MB on disk after install.** If you want a small package, try 8 | [unidic-lite](https://github.com/polm/unidic-lite). 9 | 10 | The data for this dictionary is hosted as part of the AWS Open Data 11 | Sponsorship Program. You can read the announcement 12 | [here](https://aws.amazon.com/jp/blogs/news/published-unidic-mecab-on-aws-open-data/).
13 | 14 | After installing via pip, you need to download the dictionary using the 15 | following command: 16 | 17 | python -m unidic download 18 | 19 | With [fugashi](https://github.com/polm/fugashi) or 20 | [mecab-python3](https://github.com/samurait/mecab-python3) unidic will be used 21 | automatically when installed, though if you want you can manually pass the 22 | MeCab arguments: 23 | 24 | import fugashi 25 | import unidic 26 | tagger = fugashi.Tagger('-d "{}"'.format(unidic.DICDIR)) 27 | # that's it! 28 | 29 | ## Differences from the Official UniDic Release 30 | 31 | This has a few changes from the official UniDic release to make it easier to use. 32 | 33 | - entries for 令和 have been added 34 | - single-character numeric and alphabetic words have been deleted 35 | - `unk.def` has been modified so unknown punctuation won't be marked as a noun 36 | 37 | See the `extras` directory for details on how to replicate the build process. 38 | 39 | ## Fields 40 | 41 | Here is a list of fields included in this edition of UniDic. For more information see the [UniDic FAQ](https://clrd.ninjal.ac.jp/unidic/faq.html#col_name), though not all fields are included. For fields in the UniDic FAQ the name given there is included. Also refer to the [description of the field hierarchy](https://clrd.ninjal.ac.jp/unidic/glossary.html#kaisouteki) for details. 42 | 43 | Fields which are not applicable are usually marked with an asterisk (`*`). 44 | 45 | - **pos1, pos2, pos3, pos4**: Part of speech fields. The earlier fields are more general, the later fields are more specific. 46 | - **cType:** 活用型, conjugation type. Will have a value like `五段-ラ行`. 47 | - **cForm:** 活用形, conjugation shape. Will have a value like `連用形-促音便`. 48 | - **lForm:** 語彙素読み, lemma reading. The reading of the lemma in katakana, this uses the same format as the `kana` field, not `pron`. 49 | - **lemma:** 語彙素(+語彙素細分類). The lemma is a non-inflected "dictionary form" of a word.
UniDic lemmas sometimes include extra info or have unusual forms, like using katakana for some place names. 50 | - **orth:** 書字形出現形, the word as it appears in text, this appears to be identical to the surface. 51 | - **pron:** 発音形出現形, pronunciation. This is similar to kana except that long vowels are indicated with a ー, so 講師 is こーし. 52 | - **orthBase:** 書字形基本形, the uninflected form of the word using its current written form. For example, for 彷徨った the lemma is さ迷う but the orthBase is 彷徨う. 53 | - **pronBase:** 発音形基本形, the pronunciation of the base form. Like `pron` for the `lemma` or `orthBase`. 54 | - **goshu:** 語種, word type. Etymological category. In order of frequency, 和, 固, 漢, 外, 混, 記号, 不明. Defined for all dictionary words, blank for unks. 55 | - **iType:** 語頭変化化型, "i" is for "initial". This is the type of initial transformation the word undergoes when combining, for example 兵 is へ半濁 because it can be read as べい in combination. This is available for <2% of entries. 56 | - **iForm:** 語頭変化形, this is the initial form of the word in context, such as 基本形 or 半濁音形. 57 | - **fType:** 語末変化化型, "f" is for "final", but otherwise as iType. For example 医学 is ク促 because it can change to いがっ (apparently). This is available for <0.1% of entries. 58 | - **fForm:** 語末変化形, as iForm but for final transformations. 59 | - **iConType:** 語頭変化結合型, initial change fusion type. Describes phonetic change at the start of the word in counting expressions. Only available for a few hundred entries, mostly numbers. Values are N followed by a letter or number; most entries with this value are numeric. 60 | - **fConType:** 語末変化結合型, final change fusion type. This is also used for counting expressions, and like iConType it is only available for a few hundred entries. Unlike iConType the values are very complicated, like `B1S6SjShS,B1S6S8SjShS`. 61 | - **type:** Appears to refer to the type of the lemma. See the details below for an overview. 62 | 63 |
64 | Type and POS fields in unidic-cwj-202302 65 |
 66 | type,pos1,pos2,pos3,pos4
 67 | 人名,名詞,固有名詞,人名,一般
 68 | 他,感動詞,フィラー,*,*
 69 | 他,感動詞,一般,*,*
 70 | 他,接続詞,*,*,*
 71 | 体,代名詞,*,*,*
 72 | 体,名詞,助動詞語幹,*,*
 73 | 体,名詞,普通名詞,サ変可能,*
 74 | 体,名詞,普通名詞,サ変形状詞可能,*
 75 | 体,名詞,普通名詞,一般,*
 76 | 体,名詞,普通名詞,副詞可能,*
 77 | 体,名詞,普通名詞,助数詞可能,*
 78 | 体,名詞,普通名詞,形状詞可能,*
 79 | 係助,助詞,係助詞,*,*
 80 | 副助,助詞,副助詞,*,*
 81 | 助動,助動詞,*,*,*
 82 | 助動,形状詞,助動詞語幹,*,*
 83 | 助数,接尾辞,名詞的,助数詞,*
 84 | 名,名詞,固有名詞,人名,名
 85 | 固有名,名詞,固有名詞,一般,*
 86 | 国,名詞,固有名詞,地名,国
 87 | 地名,名詞,固有名詞,地名,一般
 88 | 姓,名詞,固有名詞,人名,姓
 89 | 接助,助詞,接続助詞,*,*
 90 | 接尾体,接尾辞,名詞的,サ変可能,*
 91 | 接尾体,接尾辞,名詞的,一般,*
 92 | 接尾体,接尾辞,名詞的,副詞可能,*
 93 | 接尾用,接尾辞,動詞的,*,*
 94 | 接尾相,接尾辞,形容詞的,*,*
 95 | 接尾相,接尾辞,形状詞的,*,*
 96 | 接頭,接頭辞,*,*,*
 97 | 数,名詞,数詞,*,*
 98 | 格助,助詞,格助詞,*,*
 99 | 準助,助詞,準体助詞,*,*
100 | 用,動詞,一般,*,*
101 | 用,動詞,非自立可能,*,*
102 | 相,副詞,*,*,*
103 | 相,形容詞,一般,*,*
104 | 相,形容詞,非自立可能,*,*
105 | 相,形状詞,タリ,*,*
106 | 相,形状詞,一般,*,*
107 | 相,連体詞,*,*,*
108 | 終助,助詞,終助詞,*,*
109 | 補助,空白,*,*,*
110 | 補助,補助記号,一般,*,*
111 | 補助,補助記号,句点,*,*
112 | 補助,補助記号,括弧閉,*,*
113 | 補助,補助記号,括弧開,*,*
114 | 補助,補助記号,読点,*,*
115 | 補助,補助記号,AA,一般,*
116 | 補助,補助記号,AA,顔文字,*
117 | 記号,記号,一般,*,*
118 | 記号,記号,文字,*,*
119 |     
120 |
121 | 122 | - **kana:** 読みがな, this is the typical representation of a word in katakana, unlike pron. 講師 is コウシ. 123 | - **kanaBase:** 仮名形基本形, this is the typical katakana representation of the lemma. 124 | - **form:** 語形出現形, the form of the word as it appears. Form groups the same word with different written expressions together. 125 | - **formBase:** 語形基本形 the uninflected form of the word. For example, the formBase オオキイ groups its orthBase 書字形基本形 大きい and おおきい together. Also since its casual form of the orthBase おっきい has a different pronunciation, it is regarded as a distinct formBase オッキイ (see the UniDic hierarchy for details). 126 | - **aType:** Accent type. This is a (potentially) comma-separated field which has the number of the mora taking the accent in 標準語 (standard language). When there are multiple values, more common accent patterns come first. 127 | - **aConType:** This describes how the accent shifts when the word is used in a counter expression. It uses complicated notation. 128 | - **aModType:** Presumably accent related but unclear use. Available for <25% of entries and only has 6 non-default values. 129 | - **lid:** 語彙表ID. A long lemma ID. This seems to be a kind of GUID. There is usually one entry per line in the CSV, except that half-width and full-width variations can be combined. Example: 7821659499274752 130 | - **lemma_id:** 語彙素ID. A shorter lemma id, starting from 1. This seems to be as unique as the `lemma` field, so many CSV lines can share this value. Examples of values that share an ID are: クリエイティブ, クリエイティヴ, クリエーティブ and Creative. 131 | 132 | # License 133 | 134 | The modern Japanese UniDic is available under the GPL, LGPL, or BSD license, 135 | [see here](https://clrd.ninjal.ac.jp/unidic/commerce_use.html). UniDic is 136 | developed by [NINJAL](https://www.ninjal.ac.jp/), the National Institute for 137 | Japanese Language and Linguistics. 
UniDic is copyrighted by the UniDic 138 | Consortium and is distributed here under the terms of the [BSD 139 | License](./LICENSE.unidic). 140 | 141 | The code in this repository is not written or maintained by NINJAL. The code is 142 | available under the MIT or WTFPL License, as you prefer. 143 | --------------------------------------------------------------------------------