├── unidic
    ├── dicdir
    │   ├── char.bin
    │   ├── dicrc
    │   ├── sys.dic
    │   ├── unk.dic
    │   └── matrix.bin
    ├── __init__.py
    ├── unidic.py
    ├── __main__.py
    └── download.py
├── MANIFEST.in
├── requirements.txt
├── extras
    ├── reiwa.33.csv
    ├── README.md
    └── clean-lex.sh
├── setup.py
├── setup.cfg
├── dicts.json
├── LICENSE
├── LICENSE.unidic
├── doc
    └── dataset.md
└── README.md


/unidic/dicdir/char.bin:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/unidic/dicdir/dicrc:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/unidic/dicdir/sys.dic:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/unidic/dicdir/unk.dic:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/unidic/dicdir/matrix.bin:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/unidic/__init__.py:
--------------------------------------------------------------------------------
1 | from .unidic import DICDIR, VERSION
2 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include dicdir/*
2 | include README.md
3 | include LICENSE
4 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.22.0,<3.0.0
2 | tqdm>=4.41.1,<5.0.0
3 | wasabi>=0.6.0,<1.0.0
4 | plac>=1.1.3,<2.0.0
5 | 


--------------------------------------------------------------------------------
/extras/reiwa.33.csv:
--------------------------------------------------------------------------------
1 | 令和,14629,15402,8205,名詞,固有名詞,一般,*,*,*,レイワ,令和,令和,レーワ,令和,レーワ,固,*,*,*,*,*,*,*,レイワ,レイワ,レイワ,レイワ,"1,0",*,*,*,*
2 | ㋿,18255,20453,2588,補助記号,一般,*,*,*,*,,㋿,㋿,,㋿,,記号,*,*,*,*,*,*,*,,,,,*,*,*,*,999999
3 | ㋿,14629,15402,3992,名詞,固有名詞,一般,*,*,*,レイワ,令和,㋿,レーワ,㋿,レーワ,固,*,*,*,*,*,*,*,レイワ,レイワ,レイワ,レイワ,"1,0",*,*,*,*
4 | 


--------------------------------------------------------------------------------
/unidic/unidic.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | from wasabi import msg
 4 | 
 5 | def get_version(dicdir):
 6 |     try:
 7 |         vpath = os.path.join(dicdir, 'version')
 8 |         with open(vpath) as vfile:
 9 |             return vfile.read().strip()
10 |     except FileNotFoundError:
11 |         return '0'
12 | 
# Directory that contains this module.
_curdir = os.path.split(__file__)[0]

# Location of the dictionary directory next to this module; other code uses
# this path to initialize the tagger.
DICDIR = os.path.join(_curdir, 'dicdir')
# Version string read from DICDIR ('0' when no version file exists).
VERSION = get_version(DICDIR)
18 | 
19 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | import setuptools
 3 | from distutils.core import setup
 4 | import os
 5 | 
 6 | setup(name='unidic', 
 7 |       version='1.1.0',
 8 |       author="Paul O'Leary McCann",
 9 |       author_email="polm@dampfkraft.com",
10 |       description="UniDic packaged for Python",
11 |       long_description=pathlib.Path('README.md').read_text('utf8'),
12 |       long_description_content_type="text/markdown",
13 |       url="https://github.com/polm/unidic-py",
14 |       packages=setuptools.find_packages(),
15 |       package_data={'unidic': ['dicdir/*']},
16 |       )
17 | 


--------------------------------------------------------------------------------
/unidic/__main__.py:
--------------------------------------------------------------------------------
 1 | if __name__ == '__main__':
 2 |     from .download import download_version
 3 |     import plac
 4 |     import sys
 5 | 
 6 |     commands = {
 7 |         "download": download_version,
 8 |     }
 9 | 
10 |     if len(sys.argv) == 1:
11 |         print("Available commands:", ", ".join(commands))
12 |         sys.exit(1)
13 | 
14 |     command = sys.argv.pop(1)
15 | 
16 |     if command in commands:
17 |         plac.call(commands[command], sys.argv[1:])
18 |     else:
19 |         print("Unknown command:", command)
20 |         print("Available commands:", ", ".join(commands))
21 |         sys.exit(1)
22 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
 1 | [metadata]
 2 | description = Japanese UniDic packaged for Python
 3 | url = https://github.com/polm/unidic-py
 4 | author = "Paul O'Leary McCann"
 5 | author_email = polm@dampfkraft.com
 6 | license = MIT
 7 | long_description = file: README.md
 8 | long_description_content_type = text/markdown
 9 | classifiers =
10 |     License :: OSI Approved :: MIT License
11 |     Natural Language :: Japanese
12 | 
13 | [options]
14 | include_package_data = True
15 | python_requires = >=3.5
16 | install_requires =
17 |     requests>=2.22.0,<3.0.0
18 |     tqdm>=4.41.1,<5.0.0
19 |     wasabi>=0.6.0,<1.0.0
20 |     plac>=1.1.3,<2.0.0
21 | 


--------------------------------------------------------------------------------
/extras/README.md:
--------------------------------------------------------------------------------
 1 | # Extras
 2 | 
 3 | unidic-py distributes a slightly modified version of UniDic for ease of use. To
 4 | build this dictionary yourself, perform the following steps:
 5 | 
 6 | 1. Download the official latest UniDic from the [homepage](https://ccd.ninjal.ac.jp/unidic/)
 7 | 2. Use `clean-lex.sh` to rewrite `lex.csv`
 8 | 3. Copy the appropriate `reiwa.csv` to your dictionary directory (the number is the field count)
 9 | 4. Run the normal mecab dictionary build command
10 | 
11 | That's it.
12 | 
13 | The normal MeCab dictionary building command is:
14 | 
15 |     /usr/local/libexec/mecab/mecab-dict-index -d . -o . -f utf8 -t utf8
16 | 
17 | Note that depending on your MeCab install the path may be different.
18 | 


--------------------------------------------------------------------------------
/extras/clean-lex.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Remove entries from lex.csv that can make weird tokenizations
#
# Reads the CSV file given as the first argument and writes every line that
# is kept to standard output; the input file itself is not modified.

# usage:
#     ./clean-lex.sh [lex.csv] > lex.fix.csv
# Make sure to delete the original lex.csv before building a dictionary.

# Types of entries to remove:
# 1. Single latin letters. These can cause short unknown words to break up,
# like "fish" as "f i s h".
# 2. Number-only entries of any length. These can cause strange pronunciations,
# like 10 as "ten".

# Note it is extremely important that the character ranges are broken up like
# this to avoid including punctuation. 
#
# -E enables extended regular expressions and -v inverts the match, so lines
# matching the pattern are dropped and all others are printed. The pattern is
# anchored at the start of the line and ends at the first comma, so only the
# first CSV field is tested. Both ASCII and full-width letters/digits are
# covered.
# NOTE(review): the digit alternative uses `*`, so a line whose first field is
# empty would also match and be removed - confirm this is intended.
grep -Ev '^([A-Za-zＡ-Ｚａ-ｚ]|[0-9０-９]*),' "$1"

# In Unidic 2.3.0 this removes 232 entries.
# In Unidic 3.1.0 this removes 233 entries.
20 | 


--------------------------------------------------------------------------------
/dicts.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "latest": {
 3 |     "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic-3.1.0.zip",
 4 |     "version" : "3.1.0+2021-08-31"
 5 |   },
 6 |   "3.1.0+2021-08-31": {
 7 |     "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic-3.1.0.zip",
 8 |     "version" : "3.1.0+2021-08-31"
 9 |   },
10 |   "2.3.0+2020-10-08": {
11 |     "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic.zip",
12 |     "version" : "2.3.0+2020-10-08" 
13 |   },
14 |   "3.1.0a1": {
15 |     "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic-3.1.0a1.zip",
16 |     "version" : "3.1.0+2021-08-31" 
17 |   },
18 |   "1.0.2": {
19 |     "url" : "https://github.com/polm/unidic-py/releases/download/v1.0.2/unidic.zip",
20 |     "version" : "2.3.0" 
21 |   },
22 |   "aws": {
23 |     "url" : "https://cotonoha-dic.s3-ap-northeast-1.amazonaws.com/unidic.zip",
24 |     "version" : "2.3.0+2020-10-08" 
25 |   },
26 |   "blank": {
27 |     "url": "https://www.dampfkraft.com/unidic-blank.zip",
28 |     "version": "blank"
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Paul O'Leary McCann
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/LICENSE.unidic:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2011-2017, The UniDic Consortium
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are
 6 | met:
 7 | 
 8 |  * Redistributions of source code must retain the above copyright
 9 |    notice, this list of conditions and the following disclaimer.
10 | 
11 |  * Redistributions in binary form must reproduce the above copyright
12 |    notice, this list of conditions and the following disclaimer in the
13 |    documentation and/or other materials provided with the
14 |    distribution.
15 | 
16 |  * Neither the name of the UniDic Consortium nor the names of its
17 |    contributors may be used to endorse or promote products derived
18 |    from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 | 


--------------------------------------------------------------------------------
/unidic/download.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import shutil
  3 | import zipfile
  4 | import os
  5 | import sys
  6 | from wasabi import msg
  7 | from urllib.request import urlretrieve
  8 | from tqdm import tqdm
  9 | 
 10 | # This is used to show progress when downloading.
 11 | # see here: https://github.com/tqdm/tqdm#hooks-and-callbacks
 12 | class TqdmUpTo(tqdm):
 13 |     """Provides `update_to(n)` which uses `tqdm.update(delta_n)`."""
 14 |     def update_to(self, b=1, bsize=1, tsize=None):
 15 |         """
 16 |         b  : int, optional
 17 |             Number of blocks transferred so far [default: 1].
 18 |         bsize  : int, optional
 19 |             Size of each block (in tqdm units) [default: 1].
 20 |         tsize  : int, optional
 21 |             Total size (in tqdm units). If [default: None] remains unchanged.
 22 |         """
 23 |         if tsize is not None:
 24 |             self.total = tsize
 25 |         self.update(b * bsize - self.n)  # will also set self.n = b * bsize
 26 | 
 27 | def download_file(url, fname):
 28 |     with requests.get(url, stream=True) as r:
 29 |         with open(fname, 'wb') as f:
 30 |             shutil.copyfileobj(r.raw, f)
 31 | 
 32 |     return fname
 33 | 
 34 | def download_progress(url, fname):
 35 |     """Download a file and show a progress bar."""
 36 |     with TqdmUpTo(unit='B', unit_scale=True, miniters=1,
 37 |               desc=url.split('/')[-1]) as t:  # all optional kwargs
 38 |         urlretrieve(url, filename=fname, reporthook=t.update_to, data=None)
 39 |         t.total = t.n
 40 |     return fname
 41 | 
 42 | def get_json(url, desc):
 43 |     r = requests.get(url)
 44 |     if r.status_code != 200:
 45 |         msg.fail(
 46 |             "Server error ({})".format(r.status_code),
 47 |             "Couldn't fetch {}. If this error persists please open an issue."
 48 |             " http://github.com/polm/unidic-py/issues/".format(desc),
 49 |             exits=1,
 50 |         )
 51 |     return r.json()
 52 | 
 53 | def download_and_clean(version, url, dirname='unidic', delfiles=[]):
 54 |     """Download unidic and prep the dicdir.
 55 | 
 56 |     This downloads the zip file from the source, extracts it, renames the
 57 |     resulting directory, and removes large files not used at runtime.  
 58 |     """
 59 |     cdir = os.path.dirname(os.path.abspath(__file__))
 60 |     fname = os.path.join(cdir, 'unidic.zip')
 61 |     print("Downloading UniDic v{}...".format(version), file=sys.stderr)
 62 |     download_progress(url, fname)
 63 |     print("Finished download.")
 64 | 
 65 |     with zipfile.ZipFile(fname, 'r') as zf:
 66 |         zf.extractall(cdir)
 67 |     os.remove(fname)
 68 | 
 69 |     dicdir = os.path.join(cdir, 'dicdir')
 70 |     if os.path.isdir(dicdir):
 71 |         shutil.rmtree(dicdir)
 72 | 
 73 |     outdir = os.path.join(cdir, dirname)
 74 |     shutil.move(outdir, dicdir)
 75 | 
 76 |     for dfile in delfiles:
 77 |         os.remove(os.path.join(dicdir, dfile))
 78 | 
 79 |     # save a version file so we can tell what it is
 80 |     vpath = os.path.join(dicdir, 'version')
 81 |     with open(vpath, 'w') as vfile:
 82 |         vfile.write('unidic-{}'.format(version))
 83 | 
 84 |     # Write a dummy mecabrc
 85 |     with open(os.path.join(dicdir, 'mecabrc'), 'w') as mecabrc:
 86 |         mecabrc.write('# This is a dummy file.')
 87 | 
 88 |     print("Downloaded UniDic v{} to {}".format(version, dicdir), file=sys.stderr)
 89 | 
 90 | DICT_INFO = "https://raw.githubusercontent.com/polm/unidic-py/master/dicts.json"
 91 | 
 92 | def download_version(ver="latest"):
 93 |     res = get_json(DICT_INFO, "dictionary info")
 94 |     try:
 95 |         dictinfo = res[ver]
 96 |     except KeyError:
 97 |         print('Unknown version "{}".'.format(ver))
 98 |         print("Known versions:")
 99 |         for key, val in res.items():
100 |             print("\t", key, "({})".format(val['version']))
101 | 
102 |     print("download url:", dictinfo['url'])
103 |     print("Dictionary version:", dictinfo['version'])
104 |     download_and_clean(dictinfo['version'], dictinfo['url'])
105 | 
106 | 


--------------------------------------------------------------------------------
/doc/dataset.md:
--------------------------------------------------------------------------------
  1 | This is a description of the various tokenizer dictionaries hosted on AWS as
  2 | part of Amazon's Open Data Sponsorship Program. This file is distributed with
  3 | the main unidic-py package, but also describes other dictionaries.
  4 | 
  5 | This repository contains dictionaries for use with MeCab for tokenization and
  6 | morphological analysis of modern written Japanese.
  7 | 
  8 | ## Background
  9 | 
 10 | Two main dictionaries are included in this repository: UniDic and IPADic.
 11 | 
 12 | UniDic is maintained by NINJAL, the National Institute for Japanese Language
 13 | and Linguistics. It is the official dictionary of Japanese Universal
 14 | Dependencies. Based on the "School Grammar" model of Japanese, it uses the
 15 | "short unit word" as the base unit of tokenization, which results in highly
 16 | reproducible tokenizations suitable for many downstream tasks. 
 17 | 
 18 | In its default distribution, UniDic includes dictionary entries for individual
 19 | letters of the latin alphabet and some numbers. Because these can cause
 20 | unexpected tokenization results with the default configuration, they have been
 21 | removed. As a result strings of latin text or numbers will result in single
 22 | tokens.
 23 | 
 24 | This distribution of UniDic also adds entries for 令和 *Reiwa*, the name of the
 25 | new Imperial Era that began in 2019. 
 26 | 
 27 | Additionally, a smaller version of UniDic called "unidic-lite" is provided. This
 28 | is based on v2.1.2 of UniDic, which was much smaller in size than more recent
 29 | versions, but should still be accurate for many applications. In particular,
 30 | for first time use of MeCab it's more than enough to get started with Japanese
 31 | text analysis.
 32 | 
 33 | Besides UniDic, IPADic is also provided. IPADic is a dictionary based on an
 34 | original definition of the "word" and was built at the Nara Institute of
 35 | Science and Technology (NAIST), based on an older dictionary from the Institute
 36 | for New Generation Computer Technology (ICOT). IPADic has not been updated
 37 | since 2007, so using it is not recommended for new projects. However, IPADic
 38 | has been used for many old benchmarks and occasionally is used in newer
 39 | projects, so it's provided here for purposes of compatibility. Unlike UniDic,
 40 | IPADic has not been modified for convenience or to make it more current in the
 41 | interest of providing maximum compatibility with old benchmarks.
 42 | 
 43 | ## Data Access
 44 | 
 45 | The various dictionaries are stored in the cloud on Amazon Web Services and can
 46 | be installed through pip. 
 47 | 
 48 |     pip install unidic
 49 |     pip install unidic-lite
 50 |     pip install ipadic
 51 | 
 52 | In the case of unidic, due to the large file size - 1GB after extraction - it
 53 | has an extra download step. Do this to download the dictionary data from S3:
 54 | 
 55 |     python -m unidic download aws
 56 | 
 57 | ## Data Types and Structure
 58 | 
 59 | The dictionaries are provided as binary files for use with MeCab and you should
 60 | generally not need to access them directly.
 61 | 
 62 | The raw files used to compile the dictionaries are in CSV format. The fields
 63 | differ for each dictionary, except for the first four fields, which are defined
 64 | by MeCab.
 65 | 
 66 | - word: the literal word as it appears in text
 67 | - left_id: an internal ID (integer)
 68 | - right_id: an internal ID (integer). Ordinarily this is always the same as left_id.
 69 | - cost: (integer) Used in calculating tokenizations, low cost entries take priority over high cost entries.
 70 | 
 71 | ### UniDic
 72 | 
 73 | UniDic fields are defined at [NINJAL's UniDic FAQ page][faq]. unidic-lite uses
 74 | a subset of the fields in the most recent version.
 75 | 
 76 | [faq]: https://clrd.ninjal.ac.jp/unidic/faq.html
 77 | 
 78 | IPADic fields are as follows:
 79 | 
 80 | - pos1: Broad part of speech
 81 | - pos2: part of speech
 82 | - pos3: part of speech
 83 | - ctype: conjugation type
 84 | - cform: the conjugation of the current token
 85 | - lemma: normalized form
 86 | - reading: standard kana representation
 87 | - pronunciation: as reading, but long vowels are differentiated (講師 is こーし, 子牛 is こうし)
 88 | 
 89 | ## References
 90 | 
 91 | UniDic is developed by NINJAL. Here are some resources on UniDic:
 92 | 
 93 | - [UniDic home page](https://clrd.ninjal.ac.jp/unidic/)
 94 | - [Universal Dependencies for Japanese](https://www.semanticscholar.org/paper/Universal-Dependencies-for-Japanese-Tanaka-Miyao/064b601542d27471e397f8df811f0ddb54824113)
 95 | 
 96 | The MeCab documentation covers dictionary formats and tokenizer options. It
 97 | also has some discussion of IPADic, which does not have a home page.
 98 | 
 99 | - [MeCab documentation](https://taku910.github.io/mecab/)
100 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # unidic-py
  2 | 
  3 | This is a version of [UniDic](https://clrd.ninjal.ac.jp/unidic/) for
  4 | Contemporary Written Japanese packaged for use with pip.
  5 | 
  6 | Currently it supports 3.1.0, the latest version of UniDic. **Note this will
  7 | take up 770MB on disk after install.** If you want a small package, try
  8 | [unidic-lite](https://github.com/polm/unidic-lite).
  9 | 
 10 | The data for this dictionary is hosted as part of the AWS Open Data
 11 | Sponsorship Program. You can read the announcement
 12 | [here](https://aws.amazon.com/jp/blogs/news/published-unidic-mecab-on-aws-open-data/).
 13 | 
 14 | After installing via pip, you need to download the dictionary using the
 15 | following command:
 16 | 
 17 |     python -m unidic download
 18 | 
 19 | With [fugashi](https://github.com/polm/fugashi) or
 20 | [mecab-python3](https://github.com/samurait/mecab-python3) unidic will be used
 21 | automatically when installed, though if you want you can manually pass the
 22 | MeCab arguments:
 23 | 
 24 |     import fugashi
 25 |     import unidic
 26 |     tagger = fugashi.Tagger('-d "{}"'.format(unidic.DICDIR))
 27 |     # that's it!
 28 | 
 29 | ## Differences from the Official UniDic Release
 30 | 
 31 | This has a few changes from the official UniDic release to make it easier to use.
 32 | 
 33 | - entries for 令和 have been added
 34 | - single-character numeric and alphabetic words have been deleted
 35 | - `unk.def` has been modified so unknown punctuation won't be marked as a noun
 36 | 
 37 | See the `extras` directory for details on how to replicate the build process.
 38 | 
 39 | ## Fields
 40 | 
 41 | Here is a list of fields included in this edition of UniDic. For more information see the [UniDic FAQ](https://clrd.ninjal.ac.jp/unidic/faq.html#col_name), though not all fields are included. For fields in the UniDic FAQ the name given there is included. Also refer to the [description of the field hierarchy](https://clrd.ninjal.ac.jp/unidic/glossary.html#kaisouteki) for details.
 42 | 
 43 | Fields which are not applicable are usually marked with an asterisk (`*`).
 44 | 
 45 | - **pos1, pos2, pos3, pos4**: Part of speech fields. The earlier fields are more general, the later fields are more specific.
 46 | - **cType:** 活用型, conjugation type. Will have a value like `五段-ラ行`.
 47 | - **cForm:** 活用形, conjugation shape. Will have a value like `連用形-促音便`.
 48 | - **lForm:** 語彙素読み, lemma reading. The reading of the lemma in katakana, this uses the same format as the `kana` field, not `pron`.
 49 | - **lemma:** 語彙素（＋語彙素細分類）. The lemma is a non-inflected "dictionary form" of a word. UniDic lemmas sometimes include extra info or have unusual forms, like using katakana for some place names.
 50 | - **orth:** 書字形出現形, the word as it appears in text, this appears to be identical to the surface.
 51 | - **pron:** 発音形出現形, pronunciation. This is similar to kana except that long vowels are indicated with a ー, so 講師 is こーし.
 52 | - **orthBase:** 書字形基本形, the uninflected form of the word using its current written form. For example, for 彷徨った the lemma is さ迷う but the orthBase is 彷徨う.
 53 | - **pronBase:** 発音形基本形, the pronunciation of the base form. Like `pron` for the `lemma` or `orthBase`.
 54 | - **goshu:** 語種, word type. Etymological category. In order of frequency, 和, 固, 漢, 外, 混, 記号, 不明. Defined for all dictionary words, blank for unks.
 55 | - **iType:** 語頭変化化型, "i" is for "initial". This is the type of initial transformation the word undergoes when combining, for example 兵 is へ半濁 because it can be read as べい in combination. This is available for <2% of entries.
 56 | - **iForm:** 語頭変化形, this is the initial form of the word in context, such as 基本形 or 半濁音形.
 57 | - **fType:** 語末変化化型, "f" is for "final", but otherwise as iType. For example 医学 is ク促 because it can change to いがっ (apparently). This is available for <0.1% of entries.
 58 | - **fForm:** 語末変化形, as iForm but for final transformations.
 59 | - **iConType:** 語頭変化結合型, initial change fusion type. Describes phonetic change at the start of the word in counting expressions. Only available for a few hundred entries, mostly numbers. Values are N followed by a letter or number; most entries with this value are numeric.
 60 | - **fConType:** 語末変化結合型, final change fusion type. This is also used for counting expressions, and like iConType it is only available for a few hundred entries. Unlike iConType the values are very complicated, like `B1S6SjShS,B1S6S8SjShS`.
 61 | - **type:** Appears to refer to the type of the lemma. See the details below for an overview.
 62 | 
 63 | <details>
 64 |     <summary>Type and POS fields in unidic-cwj-202302</summary>
 65 |     <pre>
 66 | type,pos1,pos2,pos3,pos4
 67 | 人名,名詞,固有名詞,人名,一般
 68 | 他,感動詞,フィラー,*,*
 69 | 他,感動詞,一般,*,*
 70 | 他,接続詞,*,*,*
 71 | 体,代名詞,*,*,*
 72 | 体,名詞,助動詞語幹,*,*
 73 | 体,名詞,普通名詞,サ変可能,*
 74 | 体,名詞,普通名詞,サ変形状詞可能,*
 75 | 体,名詞,普通名詞,一般,*
 76 | 体,名詞,普通名詞,副詞可能,*
 77 | 体,名詞,普通名詞,助数詞可能,*
 78 | 体,名詞,普通名詞,形状詞可能,*
 79 | 係助,助詞,係助詞,*,*
 80 | 副助,助詞,副助詞,*,*
 81 | 助動,助動詞,*,*,*
 82 | 助動,形状詞,助動詞語幹,*,*
 83 | 助数,接尾辞,名詞的,助数詞,*
 84 | 名,名詞,固有名詞,人名,名
 85 | 固有名,名詞,固有名詞,一般,*
 86 | 国,名詞,固有名詞,地名,国
 87 | 地名,名詞,固有名詞,地名,一般
 88 | 姓,名詞,固有名詞,人名,姓
 89 | 接助,助詞,接続助詞,*,*
 90 | 接尾体,接尾辞,名詞的,サ変可能,*
 91 | 接尾体,接尾辞,名詞的,一般,*
 92 | 接尾体,接尾辞,名詞的,副詞可能,*
 93 | 接尾用,接尾辞,動詞的,*,*
 94 | 接尾相,接尾辞,形容詞的,*,*
 95 | 接尾相,接尾辞,形状詞的,*,*
 96 | 接頭,接頭辞,*,*,*
 97 | 数,名詞,数詞,*,*
 98 | 格助,助詞,格助詞,*,*
 99 | 準助,助詞,準体助詞,*,*
100 | 用,動詞,一般,*,*
101 | 用,動詞,非自立可能,*,*
102 | 相,副詞,*,*,*
103 | 相,形容詞,一般,*,*
104 | 相,形容詞,非自立可能,*,*
105 | 相,形状詞,タリ,*,*
106 | 相,形状詞,一般,*,*
107 | 相,連体詞,*,*,*
108 | 終助,助詞,終助詞,*,*
109 | 補助,空白,*,*,*
110 | 補助,補助記号,一般,*,*
111 | 補助,補助記号,句点,*,*
112 | 補助,補助記号,括弧閉,*,*
113 | 補助,補助記号,括弧開,*,*
114 | 補助,補助記号,読点,*,*
115 | 補助,補助記号,ＡＡ,一般,*
116 | 補助,補助記号,ＡＡ,顔文字,*
117 | 記号,記号,一般,*,*
118 | 記号,記号,文字,*,*
119 |     </pre>
120 | </details>
121 | 
122 | - **kana:** 読みがな, this is the typical representation of a word in katakana, unlike pron. 講師 is コウシ.
123 | - **kanaBase:** 仮名形基本形, this is the typical katakana representation of the lemma.
124 | - **form:** 語形出現形, the form of the word as it appears. Form groups the same word with different written expressions together.
125 | - **formBase:** 語形基本形 the uninflected form of the word. For example, the formBase オオキイ groups its orthBase 書字形基本形 大きい and おおきい together. Also since its casual form of the orthBase おっきい has a different pronunciation, it is regarded as a distinct formBase オッキイ (see the UniDic hierarchy for details).
126 | - **aType:** Accent type. This is a (potentially) comma-separated field which has the number of the mora taking the accent in 標準語 (standard language). When there are multiple values, more common accent patterns come first.
127 | - **aConType:** This describes how the accent shifts when the word is used in a counter expression. It uses complicated notation.
128 | - **aModType:** Presumably accent related but unclear use. Available for <25% of entries and only has 6 non-default values.
129 | - **lid:** 語彙表ID. A long lemma ID. This seems to be a kind of GUID. There is usually one entry per line in the CSV, except that half-width and full-width variations can be combined. Example: 7821659499274752
130 | - **lemma_id:** 語彙素ID. A shorter lemma id, starting from 1. This seems to be as unique as the `lemma` field, so many CSV lines can share this value. Examples of values that share an ID are: クリエイティブ, クリエイティヴ, クリエーティブ and Ｃｒｅａｔｉｖｅ.
131 | 
132 | # License
133 | 
134 | The modern Japanese UniDic is available under the GPL, LGPL, or BSD license,
135 | [see here](https://clrd.ninjal.ac.jp/unidic/commerce_use.html). UniDic is
136 | developed by [NINJAL](https://www.ninjal.ac.jp/), the National Institute for
137 | Japanese Language and Linguistics. UniDic is copyrighted by the UniDic
138 | Consortium and is distributed here under the terms of the [BSD
139 | License](./LICENSE.unidic).
140 | 
141 | The code in this repository is not written or maintained by NINJAL. The code is
142 | available under the MIT or WTFPL License, as you prefer.
143 | 


--------------------------------------------------------------------------------