├── tests
│   ├── __init__.py
│   ├── nn
│   │   ├── __init__.py
│   │   └── lang
│   │       ├── __init__.py
│   │       └── japanese
│   │           ├── __init__.py
│   │           └── test_kansuji.py
│   ├── util
│   │   ├── __init__.py
│   │   └── test_word_type.py
│   └── filters
│       ├── __init__.py
│       └── test_neologd_patch.py
├── tdmelodic
│   ├── nn
│   │   ├── __init__.py
│   │   ├── lang
│   │   │   ├── __init__.py
│   │   │   ├── category
│   │   │   │   ├── __init__.py
│   │   │   │   ├── list_of_symbols
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── goshu.py
│   │   │   │   │   ├── acc_concat.py
│   │   │   │   │   └── pos_short.py
│   │   │   │   └── symbol_map.py
│   │   │   ├── japanese
│   │   │   │   ├── __init__.py
│   │   │   │   ├── kana
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── kanamap
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── kanamap_normal.py
│   │   │   │   │   ├── hyphen2romaji.py
│   │   │   │   │   ├── kana2roman.py
│   │   │   │   │   └── mora_sep.py
│   │   │   │   ├── accent
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── accent_diff.py
│   │   │   │   │   └── accent_alignment.py
│   │   │   │   ├── text_normalize.py
│   │   │   │   └── kansuji.py
│   │   │   └── mecab
│   │   │       ├── __init__.py
│   │   │       ├── my_mecabrc
│   │   │       └── unidic.py
│   │   ├── loader
│   │   │   ├── __init__.py
│   │   │   ├── data_loader_base.py
│   │   │   └── data_loader.py
│   │   ├── model
│   │   │   ├── __init__.py
│   │   │   ├── modules
│   │   │   │   ├── __init__.py
│   │   │   │   ├── gatedconv1d.py
│   │   │   │   ├── stacked_conv.py
│   │   │   │   ├── cnn_attention.py
│   │   │   │   └── dilateconvcausal1d.py
│   │   │   ├── decode_accent.py
│   │   │   ├── encode_morae.py
│   │   │   └── encode_surface.py
│   │   ├── resource
│   │   │   └── net_it_2500000
│   │   ├── net.py
│   │   ├── inference.py
│   │   ├── convert_dic.py
│   │   └── convert.py
│   ├── util
│   │   ├── __init__.py
│   │   ├── util.py
│   │   ├── dic_index_map.py
│   │   └── word_type.py
│   ├── filters
│   │   ├── __init__.py
│   │   ├── yomi
│   │   │   ├── __init__.py
│   │   │   ├── basic.py
│   │   │   ├── particle_yomi.py
│   │   │   ├── yomieval.py
│   │   │   └── wrong_yomi_detection.py
│   │   ├── postprocess_modify_unigram_cost.py
│   │   ├── neologd_rmdups.py
│   │   ├── neologd_preprocess.py
│   │   └── neologd_patch.py
│   └── __init__.py
├── docs
│   ├── requirements.txt
│   ├── logo_tdmelodic.png
│   ├── imgs
│   │   ├── sent_example.png
│   │   ├── jpn_accent_types.png
│   │   ├── jpn_accent-en-page1.png
│   │   ├── jpn_accent-en-page2.png
│   │   ├── jpn_accent-en-page3.png
│   │   ├── jpn_accent-en-page4.png
│   │   ├── jpn_accent-en-page5.png
│   │   ├── Dockerfile
│   │   ├── makefile
│   │   ├── sent_example.ly
│   │   ├── jpn_accent_types.ly
│   │   ├── jpn_accent-ja.ly
│   │   └── jpn_accent-en.ly
│   ├── make.bat
│   ├── Makefile
│   ├── locale
│   │   └── ja
│   │       └── LC_MESSAGES
│   │           ├── pages
│   │           │   ├── ipadic-usage.po
│   │           │   ├── unidic-usage.po
│   │           │   ├── docker.po
│   │           │   ├── unidic-dicgen.po
│   │           │   ├── onebyone.po
│   │           │   └── ipadic-dicgen.po
│   │           └── index.po
│   ├── index.rst
│   ├── pages
│   │   ├── unidic-dicgen.md
│   │   ├── ipadic-usage.md
│   │   ├── docker.md
│   │   ├── unidic-usage.md
│   │   ├── ipadic-dicgen.md
│   │   ├── onebyone.rst
│   │   └── introduction.rst
│   └── conf.py
├── MANIFEST.in
├── requirements.txt
├── .readthedocs.yaml
├── .github
│   └── workflows
│       ├── docker-image.yml
│       ├── img.yml
│       └── test.yml
├── packege.sh
├── LICENSE
├── Dockerfile
├── README.md
├── setup.py
└── .gitignore
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/nn/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/util/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/filters/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/nn/lang/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/filters/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/loader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/filters/yomi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/category/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/mecab/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/model/modules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/nn/lang/japanese/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/kana/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/accent/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/kana/kanamap/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/category/list_of_symbols/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | Sphinx==4.0.2
2 | myst-parser==0.15.1
3 | sphinx_rtd_theme
--------------------------------------------------------------------------------
/docs/logo_tdmelodic.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/logo_tdmelodic.png
--------------------------------------------------------------------------------
/docs/imgs/sent_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/imgs/sent_example.png
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent_types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/imgs/jpn_accent_types.png
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent-en-page1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/imgs/jpn_accent-en-page1.png
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent-en-page2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/imgs/jpn_accent-en-page2.png
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent-en-page3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/imgs/jpn_accent-en-page3.png
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent-en-page4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/imgs/jpn_accent-en-page4.png
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent-en-page5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/docs/imgs/jpn_accent-en-page5.png
--------------------------------------------------------------------------------
/tdmelodic/nn/resource/net_it_2500000:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PKSHATechnology-Research/tdmelodic/HEAD/tdmelodic/nn/resource/net_it_2500000
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | include tdmelodic/nn/lang/mecab/my_mecabrc
4 | include tdmelodic/nn/resource/net_it_2500000
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.15.4
2 | chainer>=5.1.0
3 | mecab-python3>=0.996.1
4 | jaconv>=0.2.4
5 | python-Levenshtein>=0.12.0
6 | tqdm>=4.42.1
7 | regex>=2020.1.8
8 | matplotlib>=3.0.0
9 | romkan>=0.2.1
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | sphinx:
4 | configuration: docs/conf.py
5 |
6 | formats: all
7 |
8 | python:
9 | version: 3.8
10 | install:
11 | - requirements: docs/requirements.txt
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/category/list_of_symbols/goshu.py:
--------------------------------------------------------------------------------
1 | goshu_list = [
2 | "和",
3 | "漢",
4 | "外",
5 | "混",
6 | "固",
7 | "記号",
8 | "他"
9 | ]
10 |
11 | goshu_map = { p : i+1 for i, p in enumerate(goshu_list) }
12 | goshu_map[None] = 0
13 | goshu_invmap = {v:k for k, v in goshu_map.items()}
14 |
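15 | # The entries above are UniDic word-origin (goshu) classes:
16 | # 和 = native Japanese, 漢 = Sino-Japanese, 外 = loanword, 混 = mixed,
17 | # 固 = proper name, 記号 = symbol, 他 = other.
18 | # Minimal usage sketch (an illustrative addition, not in the original module):
19 | if __name__ == '__main__':
20 |     print(goshu_map["漢"])    # -> 2 (indices are 1-origin; 0 is reserved for None)
21 |     print(goshu_invmap[2])    # -> 漢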
--------------------------------------------------------------------------------
/docs/imgs/Dockerfile:
--------------------------------------------------------------------------------
1 | # docker build -t lilypond .
2 | # docker run -v `pwd`:/work -it lilypond
3 |
4 | FROM ubuntu:16.04
5 |
6 | RUN apt-get update
7 | RUN apt-get -y install lilypond netpbm make locales-all
8 | RUN apt install -y fonts-ipafont fonts-ipaexfont
9 |
10 | ENV LANG ja_JP.UTF-8
11 | ENV LANGUAGE ja_JP.UTF-8
12 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/category/list_of_symbols/acc_concat.py:
--------------------------------------------------------------------------------
 1 | # We consider only the "Cx" accent concatenation types.
2 | accent_list = [
3 | "C1",
4 | "C2",
5 | "C3",
6 | "C4",
7 | "C5",
8 | ]
9 |
10 | accent_map = { p : i+1 for i, p in enumerate(accent_list) }
11 | accent_map[None] = 0
12 | accent_map[''] = 0
13 | accent_invmap = {v:k for k, v in accent_map.items()}
14 |
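15 | # Minimal usage sketch (an illustrative addition, not in the original module):
16 | if __name__ == '__main__':
17 |     print(accent_map["C3"])    # -> 3 (indices are 1-origin; 0 means unknown)
18 |     print(accent_invmap[3])    # -> C3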
--------------------------------------------------------------------------------
/tdmelodic/__init__.py:
--------------------------------------------------------------------------------
1 | from .nn import *
2 | from .filters import *
3 |
4 | __copyright__ = 'Copyright (C) 2019 Hideyuki Tachibana, PKSHA Technology Inc.'
5 | __version__ = '1.0.0'
6 | __license__ = 'BSD-3-Clause'
7 | __author__ = 'Hideyuki Tachibana'
8 | __author_email__ = 'h_tachibana@pkshatech.com'
9 | __url__ = 'http://github.com/PKSHATechnology-Research/tdmelodic'
10 |
11 | __all__ = ['nn', 'filters']
12 |
--------------------------------------------------------------------------------
/docs/imgs/makefile:
--------------------------------------------------------------------------------
1 | LYFILES = $(shell ls *.ly)
2 | TARGET = $(LYFILES:%.ly=%.pdf)
3 |
4 | all: $(TARGET)
5 |
6 | %.pdf:%.ly
7 | echo "----------------------------------------------------"
8 | echo $@ $<
9 |
10 | # pdf
11 | lilypond $<
12 |
13 | # png
14 | $(eval RESOLUTION:=$(shell echo $< | awk '{ if ( $$0 ~ /jpn_accent/ ) print 140; else print 220} ' ) )
15 | echo "****" $< $(RESOLUTION)
16 | lilypond -dresolution=$(RESOLUTION) -dpixmap-format=pngalpha --png $<
17 | # lilypond -dresolution=$(RESOLUTION) -danti-alias-factor=10 --png $<
18 |
19 | .PHONY:clean
20 | clean:
21 | rm -f *.pdf *.png
22 |
--------------------------------------------------------------------------------
/tdmelodic/util/util.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 |
11 | def count_lines(fp):
12 | if fp is not sys.stdin:
13 | for i, l in enumerate(fp):
14 | pass
15 | fp.seek(0)
16 | return i + 1
17 | else:
18 | return None
19 |
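20 | if __name__ == '__main__':
21 |     # Minimal usage sketch (an illustrative addition): count_lines consumes
22 |     # the stream to count its lines, then rewinds it so it remains readable.
23 |     import io
24 |     f = io.StringIO("a\nb\nc\n")
25 |     print(count_lines(f))       # -> 3
26 |     print(f.read() != "")       # -> True (the stream was rewound)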
--------------------------------------------------------------------------------
/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
1 | name: Docker
2 |
3 | on:
4 | push:
5 | branches: [ master ]
6 | paths:
7 | - 'Dockerfile'
8 | - '.github/workflows/docker-image.yml'
9 | pull_request:
10 | branches: [ master ]
11 | paths:
12 | - 'Dockerfile'
13 | - '.github/workflows/docker-image.yml'
14 | schedule:
15 | - cron: '0 0 1 */2 *' # bimonthly check
16 |
17 | jobs:
18 | build:
19 | runs-on: ubuntu-latest
20 | steps:
21 | - name: Checkout Source
22 | uses: actions/checkout@v2
23 | - name: Build the Docker image
24 | run: |
25 | wget --no-check-certificate https://ccd.ninjal.ac.jp/unidic_archive/cwj/2.1.2/unidic-mecab_kana-accent-2.1.2_src.zip
26 | docker build -t tdmelodic:$(date +%s) . --no-cache
27 |
--------------------------------------------------------------------------------
/.github/workflows/img.yml:
--------------------------------------------------------------------------------
1 | name: Lilypond
2 | on:
3 | push:
4 | branches:
5 | - test-imgs
6 | pull_request:
7 | branches: [main, master]
8 | paths:
9 | - 'docs/imgs/*.ly'
10 | - '.github/workflows/img.yml'
11 | schedule:
12 | # Monthly check (12:10 PM UTC, 28th day of every month)
13 | - cron: '10 12 27 * *'
14 | env:
15 | IMG_DIR: docs/imgs
16 | jobs:
17 | build_sheets:
18 | runs-on: ubuntu-latest
19 | steps:
20 | - name: Checkout Source
21 | uses: actions/checkout@v2
22 | - name: apt
23 | run: |
24 | sudo apt-get update
25 | sudo apt-get -y install lilypond netpbm make locales-all
26 | sudo apt install -y fonts-ipafont fonts-ipaexfont
27 | export LANG=ja_JP.UTF-8
28 | export LANGUAGE=ja_JP.UTF-8
29 | - name: build
30 | run: |
31 | cd ${IMG_DIR}
32 | make
33 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/packege.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # remove binary and create dummy file
4 | echo -----------------------------------------------------------
5 | files_to_exclude="
6 | tdmelodic/nn/resource/net_it_2500000
7 | "
8 |
9 | for f in $files_to_exclude;
10 | do
11 | echo replace $f with an empty file
12 | if [ -e $f ]; then
13 | mv $f ${f}_bak
14 | fi
15 | touch $f
16 | done
17 |
18 | # remove old files
19 | echo -----------------------------------------------------------
20 | echo removing obsolete files
21 | rm -rf dist/*
22 | rm -rf tdmelodic.egg-info*
23 |
24 | # sdist
25 | echo -----------------------------------------------------------
26 | python3 setup.py sdist bdist_wheel
27 |
28 | # back
29 | echo -----------------------------------------------------------
30 | for f in $files_to_exclude;
31 | do
32 | echo moving ${f}_bak back to $f
33 | mv ${f}_bak ${f}
34 | done
35 |
36 | # done
37 | echo -----------------------------------------------------------
38 | echo Done.
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Python unittest
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 | strategy:
10 | matrix:
11 | python-version: [3.7, 3.8, 3.9]
12 |
13 | steps:
14 | - name: Checkout Source
15 | uses: actions/checkout@v2
16 | - name: Set up Python ${{ matrix.python-version }}
17 | uses: actions/setup-python@v2
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 | - name: Install mecab
21 | run: |
22 | sudo apt-get update
23 | sudo apt-get install -y --no-install-recommends \
24 | mecab libmecab-dev \
25 | python3-dev python3-pip python3-setuptools python3-tk
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install pytest
30 | pip install -r requirements.txt
31 | - name: Test
32 | run: |
33 | pytest
34 |
--------------------------------------------------------------------------------
/tests/filters/test_neologd_patch.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest import TestCase
3 |
4 | import argparse
5 | from tdmelodic.filters.neologd_preprocess import my_add_argument
6 |
7 | class Test(unittest.TestCase):
8 | def __init__(self, *args, **kwargs):
9 | super(Test, self).__init__(*args, **kwargs)
10 |
11 | def test_my_add_argument(self):
12 | parser = argparse.ArgumentParser()
13 | my_add_argument(parser, "a", True, "help")
14 | my_add_argument(parser, "b", True, "help")
15 | my_add_argument(parser, "c", True, "help")
16 | my_add_argument(parser, "d", False, "help")
17 | my_add_argument(parser, "e", False, "help")
18 | my_add_argument(parser, "f", False, "help")
19 | args = parser.parse_args(["--a", "--no-b", "--d", "--no-e"])
20 | assert(args.a is True)
21 | assert(args.b is False)
22 | assert(args.c is True)
23 | assert(args.d is True)
24 | assert(args.e is False)
25 | assert(args.f is False)
26 |
27 | if __name__ == '__main__':
28 | unittest.main()
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | #SPHINXBUILD ?= sphinx-build
8 | PYTHON_VERSION=$(shell python -c "import sys;v=str(sys.version_info[0]);sys.stdout.write(v)")
9 | SPHINXBUILD ?= $(if $(ifeq $(PYTHON_VERSION), 3),sphinx-build,python3 -msphinx)
10 | SOURCEDIR = .
11 | BUILDDIR = _build
12 |
13 |
14 | # Put it first so that "make" without argument is like "make help".
15 | help:
16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
17 |
18 | .PHONY: help Makefile
19 |
20 | # Catch-all target: route all unknown targets to Sphinx using the new
21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
22 | %: Makefile
23 | @echo Python version is \"$(PYTHON_VERSION)\"
24 | @echo Sphinx build command is \"$(SPHINXBUILD)\"
25 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
26 |
27 | jahtml:
28 | make -e SPHINXOPTS="-D language='ja'" html
29 |
30 | jalatex:
31 | make -e SPHINXOPTS="-D language='ja'" latexpdf
32 |
33 | translation:
34 | make gettext
35 | sphinx-intl update -d locale -l ja
36 |
--------------------------------------------------------------------------------
/tdmelodic/util/dic_index_map.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | unidic_index_map = {
10 | # see also mecabrc
11 | "SURFACE": 0,
12 | "COST": 3,
13 | "POS1": 4, # f[0]: pos1
14 | "POS2": 5, # f[1]: pos2
15 | "POS3": 6, # f[2]: pos3
16 | "POS4": 7, # f[3]: pos4
17 | "LEMMA" : 11, # f[7]: lemma
18 | "YOMI": 13, # f[9]: pron
19 | "GOSHU": 16, # f[12]: goshu
20 | "ACCENT": 27, # f[23]: aType
21 | }
22 |
23 | ipadic_index_map = {
24 | # see also mecabrc
25 | "SURFACE": 0,
26 | "COST": 3,
27 | "POS1": 4,
28 | "POS2": 5,
29 | "POS3": 6,
30 | "POS4": 7,
31 | "LEMMA": 10,
32 | "YOMI": 12,
33 | "GOSHU": 9, # We do not use this element. Dummy value.
34 | "ACCENT": 9, # We do not use this element. Dummy value.
35 | }
36 |
37 | def get_dictionary_index_map(mode):
38 | if mode == "unidic":
39 | IDX_MAP = unidic_index_map
40 | elif mode == "ipadic":
41 | IDX_MAP = ipadic_index_map
42 | else:
43 | IDX_MAP = unidic_index_map
44 | return IDX_MAP
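45 | 
46 | if __name__ == '__main__':
47 |     # Minimal usage sketch (an illustrative addition): look up columns of a
48 |     # dictionary CSV row by name. The row below is a shortened dummy, not a
49 |     # real UniDic entry.
50 |     IDX_MAP = get_dictionary_index_map("unidic")
51 |     row = ["例", "0", "0", "5000"] + ["*"] * 24
52 |     print(row[IDX_MAP["SURFACE"]], row[IDX_MAP["COST"]])    # -> 例 5000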
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/category/symbol_map.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import os, sys
10 | import re
11 |
12 | from .list_of_symbols.acc_concat import accent_map, accent_invmap
13 | from .list_of_symbols.goshu import goshu_map, goshu_invmap
14 | from .list_of_symbols.pos_short import pos_map, pos_invmap
15 |
16 | # If a key is not defined, return the same index as None (i.e., 0).
17 | def acccon_map_robust(x):
18 | return accent_map[x] if x in accent_map.keys() else 0
19 |
20 | def goshu_map_robust(x):
21 | return goshu_map[x] if x in goshu_map.keys() else 0
22 |
23 | def pos_map_robust(x):
24 | return pos_map[x] if x in pos_map.keys() else 0
25 |
26 | # numeric value -> character symbol
27 | import string
28 | numeric_to_char_symbol = {i + 1: c for
29 | i, c in enumerate(string.digits +
30 | string.ascii_letters +
31 | string.punctuation)}
32 | numeric_to_char_symbol[0] = " " # whitespace is identified with index 0
33 | char_symbol_to_numeric = {v: k for k, v in numeric_to_char_symbol.items()}
34 |
35 | if __name__ == '__main__':
36 | from pprint import pprint
37 | pprint(numeric_to_char_symbol)
38 | pprint(char_symbol_to_numeric)
39 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/accent/accent_diff.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | def simple_accent_diff(a):
10 | '''
11 |     Take L/H-style accent information as input and return up/down-style accent information.
12 | '''
13 | if len(a) == 1:
14 | return "."
15 | a_ = a.replace(".", "0").replace("?", "0").replace("H", "1").replace("L", "0")
16 | diff = [int(n) - int(p) for n, p in zip(a_[1:], a_[:-1])]
17 |
18 |     # Ignore an 'up' (+1) after the first mora, because it carries almost no
19 |     # information: every word except 'type-1 accent' words (which have a 'down'
20 |     # after the first mora) raises the pitch there. A 'down' (-1) is important, so keep it.
21 | diff[0] = 0 if diff[0] == 1 else diff[0]
22 |
23 | diff = ["-" if d == 0 else
24 | "U" if d == 1 else
25 | "D" if d == -1 else
26 | "?" for d in diff]
27 | diff="".join(diff) + "."
28 | return diff
29 |
30 | if __name__ == '__main__':
31 | def test(a):
32 | print(a)
33 | d = simple_accent_diff(a)
34 | print(d)
35 | print("")
36 |
37 | test("LHHHHLLLL")
38 | test("LHHLLLHHLLL")
39 | test("HLLLLLHHHHLLLLLLHHHLLHLHLHLHL")
40 | test("LHLHLHLHL")
41 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/text_normalize.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import os, sys
10 | import re
11 |
12 | from .kansuji import numeric2kanji
13 | import jaconv
14 |
15 |
16 | # This file does the following:
17 | #  - convert digit sequences into kanji numerals;
18 | #  - convert lowercase alphabetic characters into uppercase;
19 | #  - convert alphabetic characters into their full-width forms.
20 | 
21 | def suuji(text):
22 |     # Turn a decimal point between digits into 点.
23 |     text = re.sub(r"(?<=\d)\.(?=\d)", "点", text, count=0, flags=0)
24 |     text = re.sub(r"\.(\d+)", r"点\1", text, count=0, flags=0)
25 |     text = re.sub(r"(\d+)", r"<suji>\1</suji>", text, count=0, flags=0)
26 | 
27 |     # Convert the digit sequences into kanji numerals.
28 |     text = numeric2kanji(text)
29 | 
30 |     # Strip the temporary markup tags (the tag name here is a placeholder).
31 |     text = text.replace("<suji>", "")
32 |     text = text.replace("</suji>", "")
33 | 
34 |     return text
35 |
36 | def normalize_jpn(text):
37 | text = suuji(text)
38 | text = jaconv.h2z(text.upper(), ignore='', kana=True, ascii=True, digit=True)
39 |     text = text.replace("－", "ー")  # unify full-width hyphens (produced by h2z) into the long-vowel mark
40 | return text
41 |
42 | if __name__ == "__main__":
43 | text = "12345.67890あああ林檎蜜柑リンゴミカンABCDEFGabcdefg1234あ5あ0.3あ3あああ。"
44 | text = normalize_jpn(text)
45 | print(text)
46 |
--------------------------------------------------------------------------------
/tdmelodic/nn/loader/data_loader_base.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | # -*- coding: utf-8 -*-
10 | import sys
11 | import os
12 | import re
13 | import numpy as np
14 |
15 | from functools import reduce
16 |
17 | import chainer
18 | from chainer import dataset
19 | from chainer import datasets
20 | from chainer import iterators
21 |
22 |
23 | class DataLoaderBase(dataset.DatasetMixin):
24 | def __init__(self):
25 |         self.memo = {} # memoization
26 |
27 | def _load_word_list(self, text_file):
28 | raise NotImplementedError
29 |
30 | def __len__(self):
31 | raise NotImplementedError
32 |
33 | def _get_example_core(self, i):
34 | raise NotImplementedError
35 |
36 | if False:
37 |         # From the second epoch on, skip loading and other processing (memoized).
38 | def _get_example_memoized(self, i):
39 | if i not in self.memo.keys():
40 | self.memo[i] = self._get_example_core(i)
41 | return self.memo[i]
42 | else:
43 |         # Run loading and preprocessing on every access.
44 |         # We use this variant here because we want a fresh analysis each time (i.e., run MeCab on every access).
45 | def _get_example_memoized(self, i):
46 | return self._get_example_core(i)
47 |
48 | def get_example(self, i):
49 | dat_tuple = self._get_example_memoized(i)
50 | return dat_tuple
51 |
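52 | 
53 | if __name__ == '__main__':
54 |     # Illustrative subclassing sketch (not part of the original module):
55 |     # concrete loaders implement _load_word_list, __len__ and
56 |     # _get_example_core; get_example then serves the freshly computed tuple.
57 |     class ToyLoader(DataLoaderBase):
58 |         def __init__(self, data):
59 |             super(ToyLoader, self).__init__()
60 |             self.data = data
61 |         def __len__(self):
62 |             return len(self.data)
63 |         def _get_example_core(self, i):
64 |             return (self.data[i],)
65 |     loader = ToyLoader(["あ", "い"])
66 |     print(len(loader), loader.get_example(1))    # -> 2 ('い',)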
--------------------------------------------------------------------------------
/docs/imgs/sent_example.ly:
--------------------------------------------------------------------------------
1 | \version "2.18.2"
2 | \pointAndClickOff
3 |
4 | % Copyright (c) 2019-, Hideyuki Tachibana.
5 | % All rights reserved.
6 |
7 | "|" = {
8 | \once \override Staff.BarLine.bar-extent = #'(-1 . 1)
9 | \bar "|"
10 | }
11 |
12 | myColorNote = { \once \override NoteHead.color = #(x11-color "medium turquoise") }
13 |
14 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
15 | \new Staff \with
16 | {
17 | instrumentName = \markup{}
18 | }{\omit Staff.TimeSignature
19 |
20 | \stopStaff
21 | \override Staff.StaffSymbol.line-positions = #'(-3 0 3)
22 | \override Score.BarNumber #'transparent = ##t
23 | \startStaff
24 |
25 | \time 12/8
26 | b'8[ e'' \myColorNote e'' b' b'] b' b'[ \myColorNote b' f' f' f' ] f'
27 | \time 8/8
28 | f'8[ e'' e''] e'' \myColorNote e''[ b'] b' b'
29 | \time 9/8
30 | \myColorNote b'8[ f'] \myColorNote e''[ b' b'] b'[ b' b' b']
31 | }
32 | \addlyrics {
33 | \stopStaff
34 | \override Lyrics . LyricText #'font-name ="IPAex Mincho"
35 | \startStaff
36 | お や ゆ ず り の む て っ ぽ う で
37 | こ ど も の と き か ら
38 | そ ん ば か り し て い る
39 | }
40 |
41 | \layout {
42 | indent = 0\cm
43 | }
44 |
45 | \header {
46 | tagline = "" % removed
47 | }
48 |
49 | % page size
50 | #(set! paper-alist (cons '("my size" . (cons (* 8. in) (* 0.8 in))) paper-alist))
51 |
52 | \paper {
53 | print-page-number = ##f % erase page numbering
54 |
55 | #(set-paper-size "my size")
56 | ragged-last-bottom = ##f
57 | ragged-bottom = ##f
58 |
59 | left-margin = 5
60 | right-margin = 5
61 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019-, PKSHA Technology Inc.
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/kana/hyphen2romaji.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 | import os
11 | import re
12 |
13 | def replace_hyphen_by_romaji(text):
14 | """
15 |     Replace prolonged-sound marks ("ー" etc., romanized here as "-" / "~") by the corresponding vowel letters.
16 | """
17 |
18 | # error check
19 | if len(text) < 2:
20 | return ""
21 |
22 | while "-" in list(text) or "~" in list(text):
23 | text_ = text
24 |
25 | if (text[0] == "-" or text[0] == "~") and len(text) >= 2:
26 | text = text[2:]
27 | continue
28 |
29 | text = re.sub(r"(?P[aeiou])[-~][-~]", r"\gx\g", text) # "-" を 2文字
30 | text = re.sub(r"A[-~][-~]", r"Axa", text)
31 | text = re.sub(r"E[-~][-~]", r"Exe", text)
32 | text = re.sub(r"O[-~][-~]", r"Oxo", text)
33 | text = re.sub(r"U[-~][-~]", r"Uxu", text)
34 | if text_ == text:
35 |             break  # stop if nothing changed
36 |
37 | return text
38 |
39 | if __name__ == "__main__":
40 | print(replace_hyphen_by_romaji("xa--xi--xu--xe--xo--"))
41 | print(replace_hyphen_by_romaji("ka--ki--ku--ke--ko--"))
42 | print(replace_hyphen_by_romaji("haxnba--ga-~"))
43 | print(replace_hyphen_by_romaji("xA--xi--"))
44 | print(replace_hyphen_by_romaji("wa--------------xi"))
45 | print(replace_hyphen_by_romaji("~~~~hoge--"))
46 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:20.04
2 | # To enable GPU, use other base images such as
3 | # nvidia/cuda:10.0-devel-ubuntu16.04
4 |
5 | # apt
6 | ENV DEBIAN_FRONTEND=noninteractive
7 | RUN apt-get update -y && \
8 | apt-get install -y --no-install-recommends \
9 | build-essential \
10 | gcc g++ cmake \
11 | unzip xz-utils \
12 | libblas3 libblas-dev \
13 | mecab libmecab-dev swig \
14 | locales \
15 | nkf \
16 | fonts-ipafont fonts-ipaexfont fonts-takao-pgothic fonts-takao-mincho \
17 | python3-dev python3-pip python3-setuptools python3-tk && \
18 | rm -rf /var/lib/apt/lists/*
19 | # The fonts are used only for plotting images (optional); that line can be removed.
20 |
21 | # language=Japanese
22 | RUN locale-gen ja_JP.UTF-8 \
23 | && echo "export LANG=ja_JP.UTF-8" >> ~/.bashrc
24 |
25 | # Python
26 | ARG PYTHON_VERSION=3.7
27 | RUN echo "alias python='python3'" >> ~/.bash_aliases
28 |
29 | # Install UniDic
30 | # Download this file in advance. The downloaded file will be reused later.
31 | ARG UNIDIC='unidic-mecab_kana-accent-2.1.2_src'
32 | COPY ${UNIDIC}.zip /tmp
33 | WORKDIR /tmp
34 | RUN unzip ${UNIDIC}.zip && \
35 | cd /tmp/${UNIDIC} && \
36 | ./configure && make && make install && cd - && \
37 | rm ${UNIDIC}.zip && rm -rf ${UNIDIC}
38 |
39 | # pip
40 | ENV pip='python3 -m pip'
41 | RUN $pip install --upgrade pip && \
42 | $pip install --upgrade setuptools && \
43 | $pip install wheel
44 |
45 | # install tdmelodic
46 | COPY . /tmp
47 | WORKDIR /tmp
48 | RUN $pip install .
49 |
50 | # workspace
51 | ARG workspace=/root/workspace
52 | RUN mkdir -p $workspace
53 | WORKDIR $workspace
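54 | 
55 | # Illustrative build/run commands (assuming the UniDic zip has been
56 | # downloaded next to this Dockerfile, as in .github/workflows/docker-image.yml):
57 | #   wget https://ccd.ninjal.ac.jp/unidic_archive/cwj/2.1.2/unidic-mecab_kana-accent-2.1.2_src.zip
58 | #   docker build -t tdmelodic .
59 | #   docker run -it tdmelodic bash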
--------------------------------------------------------------------------------
/tdmelodic/nn/net.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 | import os
11 |
12 | import numpy as np
13 |
14 | import chainer
15 | import chainer.functions as F
16 | from chainer import cuda
17 |
18 | from .model.encode_morae import EncodeMorae
19 | from .model.encode_surface import EncodeSurface
20 | from .model.decode_accent import DecodeAccent
21 | from .model.modules.cnn_attention import ConvAttention
22 |
23 | class Net(chainer.Chain):
24 | def __init__(self, embed_dim):
25 | layers = {}
26 | layers["enc_surface"] = EncodeSurface(embed_dim)
27 | layers["enc_yomigana"] = EncodeMorae(embed_dim)
28 | layers["att"] = ConvAttention()
29 | layers["dec"] = DecodeAccent(embed_dim)
30 | super(Net,self).__init__(**layers)
31 |
32 | def __call__(self, *args, **kwargs):
33 | input_lst_s, input_lst_y, t_gt = args
34 |
35 | # forward propagation
36 | h_s = self.enc_surface (input_lst_s)
37 | h_y = self.enc_yomigana(input_lst_y)
38 | c, a, a_loss = self.att(h_y, h_s)
39 | h = self.dec(c)
40 | # y = F.softmax(h)
41 |
42 | # evaluate loss
43 | if chainer.config.train:
44 | loss = F.softmax_cross_entropy(h, t_gt)
45 | else:
46 | loss = None
47 |
48 | return h, [loss, a_loss]
49 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/mecab/my_mecabrc:
--------------------------------------------------------------------------------
1 | ; List of features
2 | ; f[0]: pos1
3 | ; f[1]: pos2
4 | ; f[2]: pos3
5 | ; f[3]: pos4
6 | ; f[4]: cType
7 | ; f[5]: cForm
8 | ; f[6]: lForm
9 | ; f[7]: lemma
10 | ; f[8]: orth
11 | ; f[9]: pron
12 | ; f[10]: orthBase
13 | ; f[11]: pronBase
14 | ; f[12]: goshu
15 | ; f[13]: iType
16 | ; f[14]: iForm
17 | ; f[15]: fType
18 | ; f[16]: fForm
19 | ; f[17]: kana
20 | ; f[18]: kanaBase
21 | ; f[19]: form
22 | ; f[20]: formBase
23 | ; f[21]: iConType
24 | ; f[22]: fConType
25 | ; f[23]: aType
26 | ; f[24]: aConType
27 | ; f[25]: aModType
28 |
29 | ;dictionary-charset = utf8
30 | ;config-charset = utf8
31 |
32 | ;cost-factor = 700
33 | ;max-grouping-size = 10
34 | ;eval-size = 10
35 | ;unk-eval-size = 4
36 |
37 | bos-feature = BOS/EOS,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*
38 |
39 | output-format-type = unidic
40 |
41 | ; Some columns of the accent information contain commas, so use tab-separated output.
42 | node-format-unidic = %m\t%f[9]\t%f[6]\t%F-[0,1,2,3]\t*\t*\t*\t%pw\t%pC\n
43 | unk-format-unidic = %m\t*\t*\t%F-[0,1,2,3]\t*\t*\t*\t0\t0\n
44 | bos-format-unidic =
45 | eos-format-unidic = EOS\n
46 |
47 | ; Some columns of the accent information contain commas, so use tab-separated output.
48 | ; Unknown words are output as-is.
49 | node-format-acc = %m\t%f[9]\t%f[6]\t%F-[0,1,2,3]\t%f[12]\t%f[23]\t%f[24]\n
50 | unk-format-acc = %m\t%m\t%m\t%F-[0,1,2,3]\t*\t*\t*\t*\n
51 | bos-format-acc =
52 | eos-format-acc = EOS\n
53 |
54 | node-format-simple = %m\t%f[9]\t%f[6]\t%F-[0,1,2,3]\t*\t*\n
55 | unk-format-simple = %m\t*\t*\t%F-[0,1,2,3]\t*\t*\t*\n
56 | bos-format-simple =
57 | eos-format-simple = EOS\n
58 |
59 | ; -Oyomi
60 | node-format-yomi = %f[9](%f[23])\s
61 | unk-format-yomi = %m\s
62 | bos-format-yomi =
63 | eos-format-yomi = \n
64 |
--------------------------------------------------------------------------------
/tdmelodic/filters/yomi/basic.py:
--------------------------------------------------------------------------------
1 | import regex as re
2 |
3 |
4 | def modify_longvowel_errors(line, idx_yomi=None):
5 | line[idx_yomi] = line[idx_yomi]\
6 | .replace("ーィ","ウィ")\
7 | .replace("ーェ","ウェ")\
8 | .replace("ーォ","ウォ")
9 | return line
10 |
11 | def modify_yomi_of_numerals(line, idx_surface=None, idx_yomi=None):
12 | """
13 | 数値の読みを簡易的に修正する(完全なものではない)
14 | """
15 |
16 | surface = line[idx_surface]
17 | # 1文字目が数字で2文字以上の長さがあるもの
18 | num=[str(i) for i in range(10)] + ['1','2','3','4','5','6','7','8','9','0']
19 | if (surface[0] in num) and len(line[1]) >= 2:
20 | pass
21 | else:
22 | # otherwise do nothing
23 | return line
24 |
25 |     filters=[
26 |         (r"ニ(テン\p{Katakana}+)", r"ニー\1" ),
27 |         (r"ゴ(テン\p{Katakana}+)", r"ゴー\1" ),
28 |         (r"ニ(イチ|ニ|サン|ヨン|ゴ|ロク|ナナ|ハチ|キュウ|キュー|レー|レイ|ゼロ)", r"ニー\1" ),
29 |         (r"ゴ(イチ|ゴ|サン|ヨン|ゴ|ロク|ナナ|ハチ|キュウ|キュー|レー|レイ|ゼロ)", r"ゴー\1" ),
30 |         (r"イチ(サ[^ン]|シ|ス|セ|ソ|タ|チ|ツ|テ|ト|カ|キ[^ュ]|ケ|コ|パ|ピ|プ|ペ|ポ)", r"イッ\1" ),
31 |         (r"ハチ(サ[^ン]|シ|ス|セ|ソ|タ|チ|ツ|テ|ト|カ|キ[^ュ]|ケ|コ|パ|ピ|プ|ペ|ポ)", r"ハッ\1" ),
32 |         (r"ジュウ(サ[^ン]|シ[^チ]|ス|セ|ソ|タ|チ|ツ|テ|ト|カ|キ[^ュ]|ケ|コ|パ|ピ|プ|ペ|ポ)", r"ジュッ\1" ),
33 |         (r"ンエ", r"ンイェ" ),   # e.g. 万円: en -> yen
34 |         (r"ヨンニチ", r"ヨッカ" ),
35 |         (r"ニーニチ", r"ニニチ" ),   # e.g. 12日
36 |         (r"ゴーニチ", r"ゴニチ" )    # e.g. 15日
37 |     ]
38 | yomi = line[idx_yomi]
39 |
40 | for regex1, regex2 in filters:
41 | prev_yomi = ''
42 |         while prev_yomi != yomi:  # loop until the yomi stops changing
43 | prev_yomi = yomi
44 | if re.search(regex1, yomi):
45 | yomi = re.sub(regex1, regex2, yomi)
46 |
47 | line[idx_yomi] = yomi
48 |
49 | return line
50 |
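51 | if __name__ == '__main__':
52 |     # Minimal usage sketch (an illustrative addition): a dummy dictionary
53 |     # row with the surface at index 0 and the yomi at index 1.
54 |     row = ["2.5", "ニテンゴ"]
55 |     row = modify_yomi_of_numerals(row, idx_surface=0, idx_yomi=1)
56 |     print(row[1])    # -> ニーテンゴ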
--------------------------------------------------------------------------------
/tdmelodic/nn/model/modules/gatedconv1d.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import chainer
10 | import chainer.functions as F
11 | import chainer.links as L
12 |
13 | from .dilateconvcausal1d import DilateConvCausal1D
14 |
15 | class GatedConv1D(chainer.Chain):
16 | def __init__(self,
17 | channel,
18 | ksize,
19 | dilate=1,
20 | dropout_rate = 0.3,
21 | causal=True):
22 | self.dropout_rate = dropout_rate
23 | self.half = channel
24 |
25 | ls = {}
26 | ls["c"] = DilateConvCausal1D(channel, channel * 2, ksize, dilate=dilate, causal=causal)
27 | ls["bn"] = L.BatchRenormalization(channel*2, decay=0.9, eps=2e-5)
28 | super(GatedConv1D, self).__init__(**ls)
29 |
30 | def __call__(self, x):
31 | h = x
32 | h = F.dropout(h, ratio=self.dropout_rate)
33 | h = self.c(h)
34 | h = self.bn(h)
35 |
36 | h1 = h[:,:self.half,:]
37 | h2 = h[:,self.half:,:]
38 | c = F.sigmoid(h2)
39 |
40 | h = F.relu(h1) * c + x * (1 - c)
41 |
42 | return h
43 |
44 | if __name__ == '__main__':
45 | import numpy as np
46 | x = np.ones((1, 3, 20)).astype(np.float32)
47 | m = GatedConv1D(3, 5, dilate=2, dropout_rate = 0.1, causal=True)
48 | y = m(x)
49 | print(y.shape)
50 | print(y.data)
51 |
--------------------------------------------------------------------------------
/tdmelodic/nn/model/decode_accent.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import chainer
10 | import chainer.functions as F
11 | import chainer.links as L
12 |
13 | from .modules.stacked_conv import StackedConv
14 |
15 |
16 | class DecodeAccent(chainer.Chain):
17 | def __init__(self,embed_dim = 100):
18 | n_class = 3
19 | layers = {}
20 | layers["conv"] = StackedConv(
21 | embed_dim,
22 | ksizes=[3,3,3,3], dilations=[1,3,1,3],
23 | causal=False,
24 | dropout_rate=0.5,
25 | conditional=False,
26 | self_attention=False
27 | )
28 | layers["conv2"] = StackedConv(
29 | embed_dim,
30 | ksizes=[1,1], dilations=[1,1],
31 | causal=False,
32 | dropout_rate=0.0,
33 | conditional=False,
34 | self_attention=False
35 | )
36 |
37 | layers["classifier"] = L.ConvolutionND(1, embed_dim, n_class, 1, stride=1, pad=0)
38 | super(DecodeAccent,self).__init__(**layers)
39 |
40 | def __call__(self, x):
41 | h = self.conv(x)
42 | h = self.conv2(h)
43 | pre_softmax = self.classifier(h)
44 | return pre_softmax
45 |
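46 | if __name__ == '__main__':
47 |     # Minimal smoke test (an illustrative addition): maps an embedded
48 |     # sequence of shape (batch, embed_dim, length) to per-position scores
49 |     # over the three accent classes.
50 |     import numpy as np
51 |     x = np.zeros((1, 16, 10), dtype=np.float32)
52 |     m = DecodeAccent(embed_dim=16)
53 |     print(m(x).shape)    # (batch, n_class, length)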
--------------------------------------------------------------------------------
/tdmelodic/nn/model/modules/stacked_conv.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import chainer
10 | import chainer.functions as F
11 | import chainer.links as L
12 |
13 | from .gatedconv1d import GatedConv1D
14 |
15 | class StackedConv(chainer.Chain):
16 | def __init__(self,
17 | channel,
18 | ksizes=[3,3,3,3],
19 | dilations=[1,1,1,1],
20 | causal=False,
21 | dropout_rate=0.3,
22 | **kwargs
23 | ):
24 | layers = {}
25 | self.n_layers = len(ksizes)
26 | self.causal = causal
27 |
28 | for i in range(self.n_layers):
29 | ksize = ksizes[i]
30 | dilation = dilations[i]
31 | layers["l_{}".format(i)] = GatedConv1D(
32 | channel,
33 | ksize,
34 | dilate=dilation,
35 | dropout_rate=dropout_rate,
36 | causal=causal
37 | )
38 |
39 | layers["last"] = L.ConvolutionND(1, channel, channel, 1, stride=1, pad=0)
40 |
41 | super(StackedConv,self).__init__(**layers)
42 |
43 | def __call__(self, x, cond=None):
44 | h = x
45 | for i in range(self.n_layers):
46 | l = self.__getattribute__("l_{}".format(i))
47 | h = l(h)
48 |
49 | h = self.last(h)
50 | return h
51 |
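52 | if __name__ == '__main__':
53 |     # Minimal smoke test (an illustrative addition), mirroring the one in
54 |     # gatedconv1d.py.
55 |     import numpy as np
56 |     x = np.ones((1, 3, 20)).astype(np.float32)
57 |     m = StackedConv(3, ksizes=[3, 3], dilations=[1, 2], causal=True)
58 |     y = m(x)
59 |     print(y.shape)    # (batch, channel, length)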
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/kana/kana2roman.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 | import os
11 |
12 | from .kanamap import kanamap_normal
13 | from .mora_sep import sep_katakana2mora
14 | from .hyphen2romaji import replace_hyphen_by_romaji
15 |
16 |
17 | # get dict
18 | k2r_dic = kanamap_normal.kana2roman_dictionary
19 | _unknown_ = k2r_dic["0"]
20 |
21 | # subsidiary functions
22 | def _mora2roman(mora, UNKNOWN=_unknown_):
23 | """ unknown char -> UNKNOWN Token (#) """
24 | return k2r_dic[mora] if mora in k2r_dic.keys() else UNKNOWN
25 |
26 | def _moralist2roman(moralist, UNKNOWN=_unknown_):
27 | return "".join([_mora2roman(m, UNKNOWN) for m in moralist])
28 |
29 | # main function
30 | def kana2roman(kana):
31 | mora_list = sep_katakana2mora(katakana_text=kana)
32 | roman = _moralist2roman(mora_list)
33 | roman = replace_hyphen_by_romaji(roman)
34 | return roman
35 |
36 | if __name__ == "__main__":
37 | katakana_texts=[
38 | "リンゴ",
39 | "アップル",
40 | "ミカン",
41 | "オレンジ",
42 | "パイナップル",
43 | "チョコレート",
44 | "マシュマロ",
45 | ]
46 | for t in katakana_texts:
47 | mora_list = sep_katakana2mora(katakana_text=t)
48 | roman = _moralist2roman(mora_list)
49 | roman_ = replace_hyphen_by_romaji(roman)
50 | print(mora_list)
51 | print(roman)
52 | print(roman_)
53 | print("{} -> {}".format(t, kana2roman(t)))
54 |
--------------------------------------------------------------------------------
/docs/locale/ja/LC_MESSAGES/pages/ipadic-usage.po:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) 2019-, Hideyuki Tachibana, PKSHA Technology Inc
3 | # This file is distributed under the same license as the tdmelodic package.
4 | # Hideyuki Tachibana, 2021.
5 | #
6 | #, fuzzy
7 | msgid ""
8 | msgstr ""
9 | "Project-Id-Version: tdmelodic \n"
10 | "Report-Msgid-Bugs-To: \n"
11 | "POT-Creation-Date: 2021-07-23 00:59+0900\n"
12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
13 | "Last-Translator: Hideyuki Tachibana \n"
14 | "Language-Team: \n"
15 | "MIME-Version: 1.0\n"
16 | "Content-Type: text/plain; charset=utf-8\n"
17 | "Content-Transfer-Encoding: 8bit\n"
18 | "Generated-By: Babel 2.9.1\n"
19 |
20 | #: ../../pages/ipadic-usage.md:1
21 | msgid "Usage of IPADIC-tdmelodic as a MeCab dictionary"
22 | msgstr "IPADIC-tdmelodic を MeCab 辞書として活用"
23 |
24 | #: ../../pages/ipadic-usage.md:3
25 | msgid "Install IPADIC-tdmelodic"
26 | msgstr "IPADIC-tdmelodic のインストール"
27 |
28 | #: ../../pages/ipadic-usage.md:4
29 | msgid ""
30 | "You can install `tdmelodic-ipadic` by replacing all the CSV files with "
31 | "the generated `*.csv.accent` files, and running the installation script "
32 | "with appropriate command line options."
33 | msgstr ""
34 | "前の章で作成した `*.csv.accent` ファイルを IPADIC のディレクトリ配下に配置し、"
35 | "適切なコマンドラインオプションを与えてインストール用スクリプトを実行することで、"
36 | " `tdmelodic-ipadic` をインストールできます。"
37 |
38 | #: ../../pages/ipadic-usage.md:42
39 | msgid "Use IPADIC-tdmelodic"
40 | msgstr "UniDic-tdmelodic の使用例"
41 |
42 | #: ../../pages/ipadic-usage.md:43
43 | msgid "Here are some examples."
44 | msgstr "いくつかの例をご覧ください。"
45 |
46 | #: ../../pages/ipadic-usage.md:45
47 | msgid "Example 1"
48 | msgstr "例1"
49 |
50 | #: ../../pages/ipadic-usage.md:70
51 | msgid "Example 2"
52 | msgstr "例2"
53 |
54 | #: ../../pages/ipadic-usage.md:80
55 | msgid "Example 3"
56 | msgstr "例3"
57 |
--------------------------------------------------------------------------------
/tdmelodic/nn/model/encode_morae.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import chainer
10 | import chainer.functions as F
11 | import chainer.links as L
12 |
13 | from .modules.stacked_conv import StackedConv
14 |
15 | class EmbedMorae(chainer.Chain):
16 | def __init__(self,
17 | embed_dim = 100):
18 | self.embed_dim = embed_dim
19 | layers = {}
20 | layers["emb_v" ] = L.EmbedID(50,embed_dim)
21 | layers["emb_c" ] = L.EmbedID(50,embed_dim)
22 | super(EmbedMorae,self).__init__(**layers)
23 |
24 | def __call_add_(self, input_lst):
25 | v, c = input_lst
26 | emb = self.emb_v(v)
27 | emb += self.emb_c(c)
28 |
29 | return emb
30 |
31 | def __call__(self, input_lst):
32 | r = self.__call_add_(input_lst)
33 | r = F.transpose(r, axes=(0, 2, 1))
34 |
35 | return r
36 |
37 | class EncodeMorae(chainer.Chain):
38 | def __init__(self,
39 | embed_dim = 100):
40 | layers = {}
41 | layers["emb"] = EmbedMorae(embed_dim = embed_dim)
42 | layers["conv"] = StackedConv(
43 | embed_dim,
44 | ksizes=[3,3,3,3], dilations=[1,3,1,3],
45 | causal=False,
46 | dropout_rate=0.5
47 | )
48 |
49 | super(EncodeMorae,self).__init__(**layers)
50 |
51 | def __call__(self, x):
52 | h = self.emb(x)
53 | y = self.conv(h)
54 | return y
55 |
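56 | if __name__ == '__main__':
57 |     # Minimal smoke test (an illustrative addition): the input is a pair of
58 |     # integer ID arrays (vowel IDs, consonant IDs), each of shape
59 |     # (batch, length), with IDs below the EmbedID vocabulary size of 50.
60 |     import numpy as np
61 |     v = np.random.randint(0, 50, (1, 12)).astype(np.int32)
62 |     c = np.random.randint(0, 50, (1, 12)).astype(np.int32)
63 |     m = EncodeMorae(embed_dim=16)
64 |     y = m((v, c))
65 |     print(y.shape)    # (batch, embed_dim, length)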
--------------------------------------------------------------------------------
/tdmelodic/nn/model/modules/cnn_attention.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import numpy as np
10 |
11 | import chainer
12 | import chainer.functions as F
13 | import chainer.links as L
14 | from chainer import cuda
15 |
16 |
17 | def attention_loss(a, bs, len_1, len_2):
18 | xp = cuda.get_array_module(*a.data)
19 |
20 | I, J, sd = len_1, len_2, 15
21 | def f(bs, i, j):
22 | return 1 - np.exp(- (i - j)**2/(2 * sd**2)) + 1e-5
23 |
24 | a_soft = np.fromfunction(f, (bs, len_1, len_2), dtype=np.float32)
25 | a_soft = xp.asarray(a_soft)
26 | a_loss_tmp = F.sum(a * a_soft)/bs / len_1 /len_2
27 | a_loss_tmp *= 10
28 | return a_loss_tmp
29 |
30 | class ConvAttention(chainer.Chain):
31 | def __init__(self):
32 | layers={}
33 | super(ConvAttention, self).__init__(**layers)
34 |
35 | def __call__(self, Q, KV):
36 | bs = KV.data.shape[0]
37 | vec_dim = KV.data.shape[1]
38 | len_KV = KV.data.shape[2] # key and value : processed surface
39 | len_Q = Q. data.shape[2] # query : yomi (morae)
40 |
41 | # key and value are same
42 | K = KV
43 | V = KV
44 |
45 | # forward
46 | KQ = F.batch_matmul(K, Q, transa=True, transb=False)
47 | KQ /= np.sqrt(vec_dim)
48 | Attention = F.softmax(KQ, axis=1)
49 | c = F.batch_matmul(V, Attention)
50 | c += Q
51 |
52 | if chainer.config.train:
53 |             a_loss = attention_loss(Attention, bs, len_KV, len_Q) # loss
54 | a = cuda.to_cpu(Attention.data) # log
55 | else:
56 | a, a_loss = None, None
57 |
58 | return c, a, a_loss
59 |
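60 | if __name__ == '__main__':
61 |     # Minimal smoke test (an illustrative addition). Q is the query (morae)
62 |     # and KV is the key/value (surface); both are (batch, dim, length).
63 |     Q  = chainer.Variable(np.random.randn(2, 8, 5).astype(np.float32))
64 |     KV = chainer.Variable(np.random.randn(2, 8, 7).astype(np.float32))
65 |     att = ConvAttention()
66 |     c, a, a_loss = att(Q, KV)
67 |     print(c.shape)    # (batch, dim, len_Q), here (2, 8, 5)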
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | tdmelodic Documentation
2 | =======================
3 |
4 | | **Tokyo Dialect MELOdic accent DICtionary**
5 |
 6 | This module generates a large-scale accent dictionary of
 7 | Japanese (Tokyo dialect) using a neural-network-based technique.
 8 | 
 9 | The objective of this module is to generate a large-vocabulary Japanese accent dictionary
10 | by exploiting two existing dictionaries: UniDic and NEologd.
11 | UniDic provides accurate accent information for words, but its vocabulary is not necessarily large.
12 | NEologd is a very large Japanese dictionary, but it does not provide accent information.
13 |
14 |
15 | .. toctree::
16 | :maxdepth: 3
17 | :caption: Background
18 |
19 |    pages/introduction
20 |
21 |
22 | .. toctree::
23 | :maxdepth: 3
24 |    :caption: Preliminary Settings
25 |
26 | pages/docker.md
27 |
28 | .. toctree::
29 | :maxdepth: 3
30 | :caption: Dictionary Generation
31 |
32 | pages/unidic-dicgen.md
33 | pages/ipadic-dicgen.md
34 |
35 |
36 | .. toctree::
37 | :maxdepth: 3
38 | :caption: Install tdmelodic on your system
39 |
40 | pages/unidic-usage.md
41 | pages/ipadic-usage.md
42 |
43 | .. toctree::
44 | :maxdepth: 3
45 | :caption: One-by-one Manual Inference Mode
46 |
47 |    pages/onebyone
48 |
49 | Citation
50 | --------
51 |
52 | For academic use, please cite the following paper.
53 |
54 | .. code-block:: bibtex
55 |
56 | @inproceedings{tachibana2020icassp,
57 | author = "H. Tachibana and Y. Katayama",
58 | title = "Accent Estimation of {Japanese} Words from
59 | Their Surfaces and Romanizations
60 | for Building Large Vocabulary Accent Dictionaries",
61 | booktitle = {2020 IEEE International Conference on Acoustics,
62 | Speech and Signal Processing (ICASSP)},
63 | pages = "8059--8063",
64 | year = "2020",
65 | doi = "10.1109/ICASSP40776.2020.9054081"
66 | }
67 |
68 | Paper Links: `[IEEE Xplore] <https://doi.org/10.1109/ICASSP40776.2020.9054081>`_ (an arXiv preprint is also available)
69 |
--------------------------------------------------------------------------------
/tests/nn/lang/japanese/test_kansuji.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest import TestCase
3 | from tdmelodic.nn.lang.japanese.kansuji import num2kansuji as N
4 | from tdmelodic.nn.lang.japanese.kansuji import numeric2kanji as NN
5 |
6 | class TestKansuji(unittest.TestCase):
7 | def test_basic(self):
8 | self.assertEqual(N("123"), "百二十三")
9 | self.assertEqual(N("100"), "百")
10 | self.assertEqual(N("103"), "百三")
11 | self.assertEqual(N("111"), "百十一")
12 | self.assertEqual(N("123456"), "十二万三千四百五十六")
13 |
14 | def test_float(self):
15 | self.assertEqual(N("12."), "十二")
16 | self.assertEqual(N("12.345"), "十二点三四五")
17 | self.assertEqual(N(".345"), "零点三四五")
18 | self.assertEqual(N("3.141592"), "三点一四一五九二")
19 | self.assertEqual(N("0.0001"), "零点零零零一")
20 |
21 | def test_issenman(self):
22 | self.assertEqual(N("100010001000"), "一千億一千万千")
23 | self.assertEqual(N("100011001000"), "一千億千百万千")
24 |
25 | def test_zeros(self):
26 | self.assertEqual(N("1020304050607080"), "千二十兆三千四十億五千六十万七千八十")
27 |
28 | def test_suji_in_text(self):
29 | self.assertEqual(NN("金額は1234567円です"),
30 | "金額は百二十三万四千五百六十七円です")
31 | self.assertEqual(NN("123456は34.5で、さらに0.12345になります。0.1.2.3.4."),
32 | "十二万三千四百五十六は三十四点五で、さらに零点一二三四五になります。零点一点二点三点四点")
33 | self.assertEqual(NN("1000000円、10000000円、100000000円、1000円、100000000000円"),
34 | "百万円、一千万円、一億円、千円、一千億円")
35 |
36 | if False:
37 | # tests
38 | print(N("123412341234"))
39 | print(N("123412341234123400001234"))
40 | print(N("12345678901234567890123456789012345678901234567890123456789012345678901"))
41 | print(N("123456789012345678901234567890123456789012345678901234567890123456789012"))
42 | print(N("1234567890123456789012345678901234567890123456789012345678901234567890123"))
43 |
44 | print( "----------" )
45 | print(N("100010001000", mode='replace'))
46 | print(N("100011001000", mode='replace'))
47 | print(N("1020304050607080", mode='replace'))
48 | print(N("12345678901234567890123456789012345678901234567890123456789012345678901", mode='replace'))
49 |
--------------------------------------------------------------------------------
/tdmelodic/filters/yomi/particle_yomi.py:
--------------------------------------------------------------------------------
1 |
2 | from dataclasses import dataclass
3 | #from .lang.japanese.kana.mora_sep import sep_katakana2mora
4 | from tdmelodic.nn.lang.mecab.unidic import UniDic
5 |
6 | @dataclass
7 | class Word(object):
8 | surf: str
9 | yomi: str
10 | pos: str
11 |
12 | class Alignment(object):
13 | # TODO
14 | def __init__(self):
15 | pass
16 |
17 | def __call__(self, x_yomi, lst_y_yomi, y_mask):
18 | y_idx = sum([[idx] * len(y_) for idx, y_ in enumerate(lst_y_yomi)], [])
19 | y_yomi = "".join(lst_y_yomi)
20 | print(y_idx, y_yomi)
21 |
22 | class DetectWrongParticle(object):
23 | def __init__(self):
24 | self.unidic = UniDic()
25 | self.special_particles = [
26 | Word("は", "ワ", "助詞"),
27 | Word("へ", "エ", "助詞"),
28 | # Word("を", "オ", "助詞")
29 | ]
30 |
31 | def parse(self, surf):
32 |         """ parse the text and return the words and flags indicating whether each word is a special particle """
33 | parsed = self.unidic._UniDic__parse(surf)[0]
34 | words = [Word(word["surface"], word["pron"], word["pos"].split("-")[0]) for word in parsed]
35 | masks = [w in self.special_particles for w in words]
36 | return words, masks
37 |
38 | def has_special_particles(self, surf):
39 |         """ check whether the text contains the special particles は or へ """
40 | words, masks = self.parse(surf)
41 | return any(masks)
42 |
43 |
44 | class ParticleYomi(object):
45 | """
46 |     In NEologd, the yomi of the particles「は」and「へ」is sometimes not handled properly,
47 |     as in ワガハイ【ハ】ネコデアル, so such entries are removed here. TODO: fix the yomi instead of removing the entry.
48 | """
49 | def __init__(self):
50 | self.detector = DetectWrongParticle()
51 |
52 | def __call__(self, line, IDX_MAP):
53 | if self.detector.has_special_particles(line[IDX_MAP["SURFACE"]]):
54 | return None
55 | else:
56 | return line
57 |
58 | if __name__ == '__main__':
59 | a = Alignment()
60 | a("あああああ", ["あああ", "いい", "うううううううう"], None)
61 |
62 | txt = "今日はいい天気ですね。犬を連れてどこへ行きましょうか。"
63 | d = DetectWrongParticle()
64 | a = d.parse(txt)
65 | print(a)
66 | a = d.has_special_particles(txt)
67 | print (a)
68 |
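69 |     # Sketch of ParticleYomi filtering (hypothetical index map: column 0 holds the
70 |     # surface form; real maps come from tdmelodic.util.dic_index_map).
71 |     p = ParticleYomi()
72 |     print(p(["吾輩は猫である"], {"SURFACE": 0}))  # None: dropped, contains the particle「は」
73 |     print(p(["人工知能"], {"SURFACE": 0}))        # the line itself: kept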
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/kana/mora_sep.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 | import os
11 |
12 | from .kanamap import kanamap_normal
13 |
14 | mora_with_subs = kanamap_normal.exceptions.keys()
15 | small_vowel = list("ァィゥェォャュョヮぁぃぅぇぉゃゅょゎ")
16 | large_vowel = {
17 | "ァ":"ア",
18 | "ィ":"イ",
19 | "ゥ":"ウ",
20 | "ェ":"エ",
21 | "ォ":"オ",
22 | "ャ":"ヤ",
23 | "ュ":"ユ",
24 | "ョ":"ヨ",
25 | "ヮ":"ワ",
26 | "ぁ":"ア",
27 | "ぃ":"イ",
28 | "ぅ":"ウ",
29 | "ぇ":"エ",
30 | "ぉ":"オ",
31 | "ゃ":"ヤ",
32 | "ゅ":"ユ",
33 | "ょ":"ヨ",
34 | "ゎ":"ワ",
35 | }
36 |
37 | def sep_katakana2mora(katakana_text=""):
38 | eos_char = "@"
39 | lst = list(katakana_text + eos_char)
40 | lst1, lst2 = lst[:-1], lst[1:]
41 |
42 |     concat_ = [ [i + j, ""] if i + j in mora_with_subs # i + j matches a pattern such as 「キャ」 or 「シュ」
43 |                 else [i, large_vowel[j]] if ((not i in small_vowel) and ( j in small_vowel)) # other patterns
44 |                 else ["", ""] if (( i in small_vowel) and (not j in small_vowel))
45 |                 else [large_vowel[i], ""] if (( i in small_vowel) and ( j in small_vowel))
46 |                 else [i, ""] # otherwise return the character as is
47 | for i, j in zip(lst1, lst2) ]
48 |
49 | concat_ = sum(concat_, [])
50 | concat = [i for i in [c for c in concat_ if c != ""] if i != ""]
51 |
52 | return concat
53 |
54 | if __name__ == "__main__":
55 | # test
56 | test_str = [
57 | "キューリョービ",
58 | "シュークリーム",
59 | "クィニーアマン",
60 | "クロワッサン",
61 | "シークヮーサー",
62 | "ベートーヴェン",
63 | "ドストエフスキィ",
64 | "ウワァ",
65 | "ウワァァ",
66 | "ウワァァァァァァァ",
67 | "ァアィアアゥアウアアウィアァアァアァアアウォオゥオァアァ"
68 | ]
69 | for s in test_str:
70 | print("{}\n{}\n{}".format(s, str(sep_katakana2mora(s)),'-'*80))
71 |
--------------------------------------------------------------------------------
/tdmelodic/nn/model/encode_surface.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import chainer
10 | import chainer.functions as F
11 | import chainer.links as L
12 |
13 | from .modules.stacked_conv import StackedConv
14 |
15 | class EmbedSurface(chainer.Chain):
16 | def __init__(self, embed_dim = 100):
17 | self.embed_dim = embed_dim
18 | layers = {}
19 | layers["emb_v" ] = L.EmbedID(50,embed_dim)
20 | layers["emb_c" ] = L.EmbedID(50,embed_dim)
21 | layers["emb_pos"] = L.EmbedID(50,embed_dim)
22 | layers["emb_acc"] = L.EmbedID(10,embed_dim)
23 | layers["emb_ac" ] = L.EmbedID(10,embed_dim)
24 | layers["emb_gos"] = L.EmbedID(10,embed_dim)
25 | super(EmbedSurface,self).__init__(**layers)
26 |
27 | def __call_add_(self, input_lst):
28 | v, c, pos, acc, ac, gos = input_lst
29 |
30 | emb = self.emb_v(v)
31 | emb += self.emb_c(c)
32 | emb += self.emb_pos(pos)
33 | emb += self.emb_acc(acc)
34 | emb += self.emb_ac(ac)
35 | emb += self.emb_gos(gos)
36 |
37 | return emb
38 |
39 | def __call__(self, input_lst):
40 | r = self.__call_add_(input_lst)
41 | r = F.transpose(r, axes=(0, 2, 1))
42 | return r
43 |
44 | class EncodeSurface(chainer.Chain):
45 | def __init__(self, embed_dim = 100):
46 | layers = {}
47 | layers["emb"] = EmbedSurface(embed_dim = embed_dim)
48 | layers["conv"] = StackedConv(
49 | embed_dim,
50 | ksizes=[3,3,3,3], dilations=[1,3,1,3],
51 | causal=False,
52 | dropout_rate=0.5
53 | )
54 |
55 | super(EncodeSurface,self).__init__(**layers)
56 |
57 | def __call__(self, input_lst):
58 | h = self.emb(input_lst)
59 | y = self.conv(h)
60 | return y
61 |
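62 | if __name__ == '__main__':
63 |     # Minimal smoke-test sketch (not part of the original file). The six inputs
64 |     # are integer ID arrays of shape (batch, length); judging from the layer
65 |     # names they encode vowel, consonant, POS, accent, accent-connection and
66 |     # goshu symbols respectively.
67 |     import numpy as np
68 |     ids = lambda n: np.random.randint(0, n, (2, 12)).astype(np.int32)
69 |     enc = EncodeSurface(embed_dim=100)
70 |     with chainer.using_config('train', False):  # disable dropout
71 |         y = enc([ids(50), ids(50), ids(50), ids(10), ids(10), ids(10)])
72 |     print(y.shape)  # expected: (batch, embed_dim, length)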
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/category/list_of_symbols/pos_short.py:
--------------------------------------------------------------------------------
1 | pos_short_interm = {
2 | None:'0',
3 | '名詞-普通名詞-一般':'n',
4 | '名詞-普通名詞-サ変可能':'n',
5 | '名詞-普通名詞-形状詞可能':'n',
6 | '名詞-普通名詞-サ変形状詞可能':'n',
7 | '名詞-普通名詞-副詞可能':'n',
8 | '名詞-固有名詞-一般':'K',
9 | '名詞-固有名詞-人名-一般':'K',
10 | '名詞-固有名詞-人名-姓':'S',
11 | '名詞-固有名詞-人名-名':'M',
12 | '名詞-固有名詞-地名-一般':'K',
13 | '名詞-固有名詞-地名-国':'K',
14 | '名詞-固有名詞-組織名':'K',
15 | '名詞-数詞':'#',
16 | '名詞-助動詞語幹':'n',
17 | '代名詞':'n',
18 | '形状詞-一般':'n',
19 | '形状詞-タリ':'n',
20 | '形状詞-助動詞語幹':'n',
21 | '連体詞':'n',
22 | '副詞':'d',
23 | '接続詞':'d',
24 | '感動詞-一般':'d',
25 | '感動詞-フィラー':'d',
26 | '動詞-一般':'d',
27 | '動詞-非自立可能':'d',
28 | '形容詞-一般':'d',
29 | '形容詞-非自立可能':'d',
30 | '助動詞':'+',
31 | '助詞-格助詞':'+',
32 | '助詞-副助詞':'+',
33 | '助詞-係助詞':'+',
34 | '助詞-接続助詞':'+',
35 | '助詞-終助詞':'+',
36 | '助詞-準体助詞':'+',
37 | '接頭辞':'n',
38 | '接尾辞-名詞的-一般':'n',
39 | '接尾辞-名詞的-サ変可能':'n',
40 | '接尾辞-名詞的-形状詞可能':'n',
41 | '接尾辞-名詞的-副詞可能':'n',
42 | '接尾辞-名詞的-助数詞':'n',
43 | '接尾辞-形状詞的':'n',
44 | '接尾辞-動詞的':'n',
45 | '接尾辞-形容詞的':'n',
46 | '記号-一般':'@',
47 | '記号-文字':'@',
48 | '補助記号-一般':'@',
49 | '補助記号-句点':'@',
50 | '補助記号-読点':'@',
51 | '補助記号-括弧開':'@',
52 | '補助記号-括弧閉':'@',
53 | '空白':'0'
54 | }
55 |
56 |
57 | def revdic(x):
58 | revdict = {}
59 | for k, v in x.items():
60 | revdict.setdefault(v, []).append(k)
61 | return revdict
62 |
63 | pos_short_interm_inv = revdic(pos_short_interm)
64 |
65 | pos_short_ids = list(sorted(set(pos_short_interm.values())))
66 | pos_short_ids = ['0'] + [p for p in pos_short_ids if p != '0']
67 | pos_short_ids = {v : i for i, v in enumerate(pos_short_ids)}
68 | pos_map = {k: pos_short_ids[v] for k, v in pos_short_interm.items()}
69 | pos_invmap = {v:pos_short_interm[k] for k, v in pos_map.items()}
70 |
71 | if __name__ == '__main__':
72 | from pprint import pprint
73 | pprint(pos_short_interm_inv)
74 | pprint(pos_short_ids)
75 | pprint(pos_map)
76 | pprint(pos_invmap)
77 |
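78 |     # example lookups (a sketch, not in the original file):
79 |     print(pos_map['名詞-固有名詞-人名-姓'])  # integer id of the short class 'S'
80 |     print(pos_invmap[pos_map['副詞']])        # 'd'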
--------------------------------------------------------------------------------
/tdmelodic/filters/yomi/yomieval.py:
--------------------------------------------------------------------------------
1 | import Levenshtein
2 | import romkan
3 | import jaconv
4 | from tdmelodic.nn.lang.mecab.unidic import UniDic
5 |
6 | class YomiEvaluator():
7 | def __init__(self, rank_weight = 0.1, romaji_priority=2, nbest=10):
8 | self.unidic = UniDic()
9 | self.romaji_priority=romaji_priority
10 | self.nbest = nbest
11 | self.rank_weight = rank_weight
12 |
13 | def eval(self, *args, **kwargs):
14 | distance1 = self.eval_normal(*args, **kwargs)
15 |         # Forcibly convert alphabet spans that look like romaji into kana, then re-analyze with UniDic to search for a better reading.
16 | distance2 = self.eval_force_romaji_to_kana_v1(*args, **kwargs) - self.romaji_priority
17 | distance3 = self.eval_force_romaji_to_kana_v2(*args, **kwargs) - self.romaji_priority
18 | return min(distance1, distance2, distance3)
19 |
20 | def eval_normal(self, text, kana_ref, nbest=20):
21 |         '''Evaluate and return the distance to the closest candidate reading; the candidate's rank is also taken into account.'''
22 | text = jaconv.h2z(text, digit=True, ascii=True, kana=True) # zenkaku
23 | p = self.unidic._UniDic__parse(text, nbest=self.nbest)
24 | kanas = ["".join([
25 | e["pron"] for e in p_
26 | ]) for p_ in p]
27 | dist = [self.rank_weight * rank +
28 | Levenshtein.distance(k, kana_ref)
29 | for rank, k in enumerate(kanas)]
30 | rank = [i for i, v in sorted(
31 | enumerate(dist),
32 | key=lambda v: v[1])]
33 | ld = dist[rank[0]]
34 | return ld
35 |
36 | def eval_force_romaji_to_kana_v1(self, text, kana_ref, nbest=20):
37 | p_ = jaconv.z2h(text, digit=True, ascii=True, kana=False) # hankaku
38 |         p = romkan.to_katakana(p_) # convert romaji to katakana where possible
39 |         if p_ == p: # nothing changed: skip the analysis below; any sufficiently large return value will do
40 |             return 12345
41 | return self.eval_normal(p, kana_ref, nbest)
42 |
43 | def eval_force_romaji_to_kana_v2(self, text, kana_ref, nbest=20):
44 | p_ = jaconv.z2h(text, digit=True, ascii=True, kana=False) # hankaku
45 | p_ = jaconv.normalize(p_, "NFKC")
46 |         p = jaconv.alphabet2kata(p_) # convert alphabet spellings to katakana where possible
47 | if p_ == p:
48 | return 12345
49 | return self.eval_normal(p, kana_ref, nbest)
50 |
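51 | if __name__ == '__main__':
52 |     # Usage sketch (not part of the original file; requires a working MeCab /
53 |     # UniDic setup). Smaller scores mean the candidate reading is more
54 |     # plausible for the given surface form.
55 |     ev = YomiEvaluator()
56 |     print(ev.eval("東京", "トーキョー"))  # small: plausible reading
57 |     print(ev.eval("東京", "オオサカ"))    # larger: implausible reading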
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | # Tokyo Dialect MELOdic accent DICtionary (tdmelodic) generator
7 |
8 | [](https://tdmelodic.readthedocs.io/en/latest)
9 | [](https://arxiv.org/abs/2009.09679)
10 | [](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/test.yml)
11 | [](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/docker-image.yml)
12 | [](https://github.com/PKSHATechnology-Research/tdmelodic/actions/workflows/img.yml)
13 | [](https://opensource.org/licenses/BSD-3-Clause)
14 |
15 |
16 | This module generates a large scale accent dictionary of
17 | Japanese (Tokyo dialect) using a neural network based technique.
18 |
19 | For academic use, please cite the following paper.
20 | [[IEEE Xplore]](https://ieeexplore.ieee.org/document/9054081)
21 | [[arXiv]](https://arxiv.org/abs/2009.09679)
22 |
23 | ```bibtex
24 | @inproceedings{tachibana2020icassp,
25 | author = "H. Tachibana and Y. Katayama",
26 | title = "Accent Estimation of {Japanese} Words from Their Surfaces and Romanizations
27 | for Building Large Vocabulary Accent Dictionaries",
28 | booktitle = {2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
29 | pages = "8059--8063",
30 | year = "2020",
31 | doi = "10.1109/ICASSP40776.2020.9054081"
32 | }
33 | ```
34 |
35 | ## Installation and Usage
36 |
37 | - English: [tdmelodic Documentation](https://tdmelodic.readthedocs.io/en/latest)
38 | - 日本語: [tdmelodic 利用マニュアル](https://tdmelodic.readthedocs.io/ja/latest)
39 |
40 | ## Acknowledgement
41 | Some part of this work is based on the results obtained from a project subsidized by the New Energy and Industrial Technology Development Organization (NEDO).
42 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/accent/accent_alignment.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | accent_map = {
10 | # Note!!
11 | # low/high code:
12 | '.':0,
13 | 'L':1,
14 | 'H':2,
15 | '?':3,
16 | # up/down code:
17 | # 0:down
18 | # 1:keep
19 | # 2:up
20 | # These codes below are also hard coded in accent_diff.py
21 | '-':0, # keep
22 | 'D':1, # down
23 | 'U':2, # up
24 | # See also data_loader.py, convert_dic.py.
25 | }
26 | accent_invmap = {v: i for i, v in accent_map.items()}
27 |
28 | def accent_align(roman, a_kernel):
29 | """
30 |     Takes romaji and an accent kernel (nucleus) position, and outputs accent information in the L/H format.
31 |     roman: assumed to have already been converted to two characters per mora by other modules.
32 |     a_kernel: accent kernel position
33 | """
34 | n_morae = len(roman) // 2
35 |     r = 2 # one mora is two characters, so double the length
36 | try:
37 |         a_kernel = int(a_kernel.split(",")[0]) # if multiple kernels are listed, prefer the first one
38 | if a_kernel == 0:
39 | accent = 'L' * r + 'H' * n_morae * r
40 | return accent
41 | elif a_kernel == 1:
42 | accent = 'H' * r + 'L' * n_morae * r
43 | return accent
44 | elif 2 <= a_kernel and a_kernel <= n_morae:
45 | accent = 'L' * r + 'H' * (a_kernel - 1) * r + 'L' * (n_morae + 1 - a_kernel) * r
46 | return accent
47 | elif a_kernel == n_morae + 1:
48 | accent = 'L' * r + 'H' * (a_kernel - 1) * r + 'L' * (n_morae + 1 - a_kernel) * r
49 | return accent
50 | else:
51 | accent = "." * (n_morae + 1) * r
52 | return accent
53 |
54 | except Exception as e:
55 |         if a_kernel == '*':
56 | accent = "?" * (n_morae + 1) * r
57 | return accent
58 | else:
59 | accent = "." * (n_morae + 1) * r
60 | return accent
61 |
62 | if __name__ == '__main__':
63 | txt = "rixngo"
64 |     a = accent_align(txt, "0")
65 | print(txt)
66 | print(a)
67 |
68 | txt = "mikaxn"
69 |     a = accent_align(txt, "1")
70 | print(txt)
71 | print(a)
72 |
73 | txt = "paxinaqqpuru"
74 |     a = accent_align(txt, "3")
75 | print(txt)
76 | print(a)
77 |
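78 |     # unknown accent kernel ('*') yields the '?' pattern (a sketch, not in the original file)
79 |     txt = "rixngo"
80 |     a = accent_align(txt, "*")
81 |     print(txt)
82 |     print(a)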
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup, find_packages
4 | from os import path
5 | import re, io
6 |
7 | def _readme():
8 | with open('README.md') as readme_file:
9 | return readme_file.read().replace(":copyright:", "(c)")
10 |
11 | def _requirements():
12 | root_dir = path.abspath(path.dirname(__file__))
13 | return [name.rstrip() for name in open(path.join(root_dir, 'requirements.txt')).readlines()]
14 |
15 | def _get_version():
16 | version = re.search(
17 |         r'__version__\s*=\s*[\'"]([^\'"]*)[\'"]', # the pattern also tolerates an inline comment
18 | io.open('tdmelodic/__init__.py', encoding='utf_8_sig').read()
19 | ).group(1)
20 | return version
21 |
22 | setup(
23 | name="tdmelodic",
24 | author="Hideyuki Tachibana",
25 | author_email='h_tachibana@pkshatech.com',
26 | python_requires='>=3.7',
27 | url="https://github.com/PKSHATechnology-Research/tdmelodic",
28 |
29 | description="tdmelodic: Tokyo Japanese Accent Estimator",
30 | long_description=_readme(),
31 | long_description_content_type="text/markdown",
32 |
33 | install_requires=_requirements(),
34 |     tests_require=_requirements(),
35 | setup_requires=[],
36 |
37 | include_package_data=True,
38 | packages=find_packages(include=['tdmelodic', 'tdmelodic.*']),
39 |
40 | version=_get_version(),
41 | zip_safe=False,
42 |
43 | entry_points={
44 | 'console_scripts':[
45 | 'tdmelodic-convert = tdmelodic.nn.convert_dic:main',
46 | 'tdmelodic-sy2a = tdmelodic.nn.convert:main_sy2a',
47 | 'tdmelodic-s2ya = tdmelodic.nn.convert:main_s2ya',
48 | 'tdmelodic-neologd-preprocess = tdmelodic.filters.neologd_preprocess:main',
49 | 'tdmelodic-modify-unigram-cost = tdmelodic.filters.postprocess_modify_unigram_cost:main',
50 | ]
51 | },
52 |
53 | classifiers=[
54 | 'Development Status :: 5 - Production/Stable',
55 | 'Environment :: Console',
56 | 'Intended Audience :: Science/Research',
57 | 'Intended Audience :: Developers',
58 | 'License :: OSI Approved :: BSD License',
59 | 'Operating System :: POSIX',
60 | 'Programming Language :: Python :: 3.7',
61 | 'Topic :: Text Processing :: Linguistic',
62 | 'Topic :: Multimedia :: Sound/Audio :: Speech',
63 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
64 | 'Natural Language :: Japanese',
65 | ]
66 | )
67 |
--------------------------------------------------------------------------------
/docs/locale/ja/LC_MESSAGES/pages/unidic-usage.po:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) 2019-, Hideyuki Tachibana, PKSHA Technology Inc
3 | # This file is distributed under the same license as the tdmelodic package.
4 | # Hideyuki Tachibana, 2021.
5 | #
6 | #, fuzzy
7 | msgid ""
8 | msgstr ""
9 | "Project-Id-Version: tdmelodic \n"
10 | "Report-Msgid-Bugs-To: \n"
11 | "POT-Creation-Date: 2021-10-22 18:34+0900\n"
12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
13 | "Last-Translator: Hideyuki Tachibana \n"
14 | "Language-Team: \n"
15 | "MIME-Version: 1.0\n"
16 | "Content-Type: text/plain; charset=utf-8\n"
17 | "Content-Transfer-Encoding: 8bit\n"
18 | "Generated-By: Babel 2.9.1\n"
19 |
20 | #: ../../pages/unidic-usage.md:1
21 | msgid "Usage of UniDic-tdmelodic as a MeCab dictionary"
22 | msgstr "UniDic-tdmelodic を MeCab 辞書として活用"
23 |
24 | #: ../../pages/unidic-usage.md:3
25 | msgid "Install UniDic-tdmelodic"
26 | msgstr "UniDic-tdmelodic のインストール"
27 |
28 | #: ../../pages/unidic-usage.md:5
29 | msgid ""
30 | "You can install `tdmelodic` (`tdmelodic-unidic`) by copying the content "
31 | "of `tdmelodic.csv` we have just created to the UniDic default dictionary "
32 | "(`lex.csv`), and running the installation script with appropriate command"
33 | " line options."
34 | msgstr ""
35 | "前の章で作成した `tdmelodic.csv` ファイルの中身を UniDic "
36 | "のデフォルトの辞書(`lex.csv`)にコピーし、適切なコマンドラインオプションを与えてインストール用スクリプトを実行することで、 `tdmelodic` "
37 | "(`tdmelodic-unidic`) をインストールできます。"
38 |
39 | #: ../../pages/unidic-usage.md:9
40 | msgid "Firstly, specify the file paths."
41 | msgstr "まず、ファイルのパスを指定します。"
42 |
43 | #: ../../pages/unidic-usage.md:16
44 | msgid "Then unzip the UniDic file."
45 | msgstr "次に、UniDicのzipファイルを展開します。"
46 |
47 | #: ../../pages/unidic-usage.md:23
48 | msgid "Concatenate the dictionaries."
49 | msgstr "次に、辞書ファイルを結合します。"
50 |
51 | #: ../../pages/unidic-usage.md:30
52 | msgid "Finally, install `tdmelodic`."
53 | msgstr "最後に、`tdmelodic`をインストールします。"
54 |
55 | #: ../../pages/unidic-usage.md:37
56 | msgid "Use UniDic-tdmelodic"
57 | msgstr "UniDic-tdmelodic の使用例"
58 |
59 | #: ../../pages/unidic-usage.md:38
60 | msgid "Here are some examples."
61 | msgstr "いくつかの例をご覧ください。"
62 |
63 | #: ../../pages/unidic-usage.md:40
64 | msgid "Example 1"
65 | msgstr "例1"
66 |
67 | #: ../../pages/unidic-usage.md:64
68 | msgid "Cf."
69 | msgstr ""
70 |
71 | #: ../../pages/unidic-usage.md:92
72 | msgid "Example 2"
73 | msgstr "例2"
74 |
75 | #: ../../pages/unidic-usage.md:114
76 | msgid "Example 3"
77 | msgstr "例3"
78 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/docs/pages/unidic-dicgen.md:
--------------------------------------------------------------------------------
1 | # Dictionary generation for UniDic users
2 | WARNING: _This section takes several hours or days._
3 |
4 | ## Prepare the base dictionary
5 | ### git clone NEologd
6 | First, download the NEologd dictionary as follows.
7 |
8 | ```sh
9 | WORKDIR=/path/to/your/work/dir
10 | cd $WORKDIR # move to the working directory
11 | git clone --depth 1 https://github.com/neologd/mecab-unidic-neologd/
12 | ```
13 |
14 | ### Extract the NEologd vocabulary file and apply a patch
15 |
16 | Then, extract the CSV file of the NEologd dictionary using the `unxz` command.
17 |
18 | ```sh
19 | # if your system has the unxz command
20 | unxz -k `ls mecab-unidic-neologd/seed/*.xz | tail -n 1`
21 | # otherwise
22 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
23 | unxz -k `ls mecab-unidic-neologd/seed/*.xz | tail -n 1`
24 | ```
25 |
26 | This will generate a CSV file named `mecab-unidic-user-dict-seed.yyyymmdd.csv`.
27 | Then, apply the patch to the NEologd dictionary which we have just extracted, as follows.
28 | This creates a dictionary file `neologd_modified.csv` in the working directory.
29 |
30 | ```sh
31 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
32 | tdmelodic-neologd-preprocess \
33 | --input `ls mecab-unidic-neologd/seed/mecab-unidic-user-dict-seed*.csv | tail -n 1` \
34 | --output neologd_modified.csv \
35 | --no-rmdups --no-rm_wrong_yomi
36 | ```
37 |
38 | `--no-rmdups` and `--no-rm_wrong_yomi` are options that control whether or not certain words are removed.
39 | These options can be found with the following command.
40 | ```
41 | docker run --rm tdmelodic:latest tdmelodic-neologd-preprocess -h
42 | ```
43 |
44 | ## Inference
45 |
46 | _WARNING! THIS TAKES A LONG TIME!_
47 | (FYI: it took about 2.5 hours on a MacBook Pro and 5 hours on our Linux server.)
48 |
49 | Now let us generate the accent dictionary.
50 | It estimates the accents of the words listed in the NEologd dictionary
51 | using a machine-learning-based technique.
52 |
53 | ```sh
54 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
55 | tdmelodic-convert \
56 | -m unidic \
57 | --input neologd_modified.csv \
58 | --output tdmelodic_original.csv
59 | cp ${WORKDIR}/tdmelodic_original.csv ${WORKDIR}/tdmelodic.csv # backup
60 | ```
61 |
62 | ## Postprocess
63 |
64 | Unigram costs can be fixed using the following script.
65 | ```sh
66 | cp ${WORKDIR}/tdmelodic.csv ${WORKDIR}/tdmelodic.csv.bak
67 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
68 | tdmelodic-modify-unigram-cost \
69 | -i tdmelodic.csv.bak \
70 | -o tdmelodic.csv
71 | ```
72 |
--------------------------------------------------------------------------------
/docs/pages/ipadic-usage.md:
--------------------------------------------------------------------------------
1 | # Usage of IPADIC-tdmelodic as a MeCab dictionary
2 |
3 | ## Install IPADIC-tdmelodic
4 | You can install `tdmelodic-ipadic` by replacing all the CSV files with
5 | the generated `*.csv.accent` files, and running the
6 | installation script with appropriate command line options.
7 |
8 |
9 | ```sh
10 | # paths
11 | WORKDIR=/path/to/your/work/dir
12 | NEOLOGD_DIC_DIR=${WORKDIR}/mecab-ipadic-neologd/seed
13 | IPADIC_DIR=${WORKDIR}/mecab-ipadic-2.7.0-XXXX
14 | ```
15 |
16 | ```sh
17 | # copy
18 | for f in `ls ${NEOLOGD_DIC_DIR}/*.csv.accent`
19 | do
20 | target=`basename $f`
21 | target=${target%.accent}
22 | cp $f $IPADIC_DIR/$target
23 | done
24 |
25 | for f in `ls ${IPADIC_DIR}/*.csv.accent`
26 | do
27 | target=`basename $f`
28 | target=${target%.accent}
29 | cp $f $IPADIC_DIR/$target
30 | done
31 | ```
32 |
33 | ```sh
34 | # install
35 | cd ${IPADIC_DIR}
36 | ./configure --with-dicdir=`mecab-config --dicdir`/tdmelodic-ipadic
37 | make
38 | make install
39 | ```
40 |
41 |
42 | ## Use IPADIC-tdmelodic
43 | Here are some examples.
44 |
45 | ### Example 1
46 |
47 | ```sh
48 | echo 一昔前は人工知能のプログラミング言語といえばCommon LispやPrologだった。 | \
49 | mecab -d `mecab-config --dicdir`/tdmelodic-ipadic
50 | ```
51 | ```
52 | 一昔 名詞,一般,*,*,*,*,一昔,ヒトムカシ,ヒ[ト]ムカシ
53 | 前 名詞,副詞可能,*,*,*,*,前,マエ,マ]エ
54 | は 助詞,係助詞,*,*,*,*,は,ハ,ワ
55 | 人工知能 名詞,固有名詞,一般,*,*,*,人工知能,ジンコウチノウ,ジ[ンコーチ]ノー
56 | の 助詞,連体化,*,*,*,*,の,ノ,ノ
57 | プログラミング言語 名詞,固有名詞,一般,*,*,*,プログラミング言語,プログラミングゲンゴ,プ[ログラミングゲ]ンゴ
58 | と 助詞,格助詞,引用,*,*,*,と,ト,ト]
59 | いえ 動詞,自立,*,*,五段・ワ行促音便,仮定形,いう,イエ,イ[エ]
60 | ば 助詞,接続助詞,*,*,*,*,ば,バ,バ
61 | Common Lisp 名詞,固有名詞,一般,*,*,*,Common Lisp,コモンリスプ,コ[モンリ]スプ
62 | や 助詞,並立助詞,*,*,*,*,や,ヤ,ヤ
63 | Prolog 名詞,固有名詞,一般,*,*,*,Prolog,プロログ,プ[ロログ
64 | だっ 助動詞,*,*,*,特殊・ダ,連用タ接続,だ,ダッ,ダ]ッ
65 | た 助動詞,*,*,*,特殊・タ,基本形,た,タ,タ
66 | 。 記号,句点,*,*,*,*,。,。,。
67 | EOS
68 | ```
69 |
70 | ### Example 2
71 |
72 | ```sh
73 | echo 横浜市中区日本大通 | mecab -d `mecab-config --dicdir`/tdmelodic-ipadic
74 | ```
75 | ```
76 | 横浜市中区日本大通 名詞,固有名詞,地域,一般,*,*,横浜市中区日本大通,ヨコハマシナカクニホンオオドオリ,ヨ[コハマ]シナ[カ]クニ[ホンオード]ーリ
77 | EOS
78 | ```
79 |
80 | ### Example 3
81 |
82 | ```sh
83 | echo 980hPa | mecab -d `mecab-config --dicdir`/tdmelodic-ipadic
84 | echo 15mm | mecab -d `mecab-config --dicdir`/tdmelodic-ipadic
85 | echo 4月10日 | mecab -d `mecab-config --dicdir`/tdmelodic-ipadic
86 | ```
87 | ```
88 | 980hPa 名詞,固有名詞,一般,*,*,*,980hPa,キュウヒャクハチジュウヘクトパスカル,キュ]ウヒャクハ[チジュウヘクトパ]スカル
89 | EOS
90 | 15mm 名詞,固有名詞,一般,*,*,*,15mm,ジュウゴミリメートル,ジュ[ウゴミリメ]ートル
91 | EOS
92 | 4月10日 名詞,固有名詞,一般,*,*,*,4月10日,シガツトオカ,シ[ガツトオカ
93 | EOS
94 | ```
95 |
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent_types.ly:
--------------------------------------------------------------------------------
1 | \version "2.18.2"
2 | \pointAndClickOff
3 |
4 | % Copyright (c) 2020-, Hideyuki Tachibana.
5 | % All rights reserved.
6 |
7 | "|" = {
8 | \once \override Staff.BarLine.bar-extent = #'(-1 . 1)
9 | \bar "|"
10 | }
11 |
12 | SP = {\hideNotes d''8 \unHideNotes} % little spacing
13 |
14 | % note: it converts a command as follows
15 | % \age xyz
16 | % ->
17 | %      \markup { xyz \with-color #red "[" }
18 | age=#(define-music-function
19 | (parser location argtext)
20 | (markup?)
21 | #{
22 | \lyricmode{
23 | \markup{ #argtext \with-color #red "[" }
24 | }
25 | #}
26 | )
27 |
28 | sage=#(define-music-function
29 | (parser location argtext)
30 | (markup?)
31 | #{
32 | \lyricmode{
33 | \markup{ #argtext \with-color #blue "]" }
34 | }
35 | #}
36 | )
37 |
38 |
39 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
40 | \header {
41 | subtitle = \markup{\fontsize #1 "Six accent patterns of five-mora words"}
42 | }
43 |
44 | \new Staff \with
45 | {
46 | instrumentName = \markup{}
47 | }{
48 | \omit Staff.TimeSignature
49 | \textLengthOff
50 | \stopStaff
51 | \override Staff.StaffSymbol.line-positions = #'(-2 2)
52 | \override Score.BarNumber #'transparent = ##t
53 | \startStaff
54 |
55 | \time 6/8
56 | g'8^[^\markup{type 0 }
57 | d'' d'' d'' d'' ] d'' \bar"||"
58 | d''8^[^\markup{type 1 }
59 | g' g' g' g' ] g' \bar"||"
60 | g'8^[^\markup{type 2 }
61 | d'' g' g' g' ] g' \bar"||"
62 | g'8^[^\markup{type 3 }
63 | d'' d'' g' g' ] g' \bar"||"
64 | g'8^[^\markup{type 4 }
65 | d'' d'' d'' g' ] g' \bar"||"
66 | g'8^[^\markup{type 5 }
67 | d'' d'' d'' d'' ] g' \bar"||"
68 |
69 | }
70 | \addlyrics {
71 | \stopStaff
72 | \override Lyrics . LyricText #'font-name ="Times"
73 | \startStaff
74 |
75 | \age "*" "*" "*" "*" "*" ga
76 | \sage "*" "*" "*" "*" "*" ga
77 | \age "*" \sage "*" "*" "*" "*" ga
78 | \age "*" "*" \sage"*" "*" "*" ga
79 | \age "*" "*" "*" \sage "*" "*" ga
80 | \age "*" "*" "*" "*" \sage "*" ga
81 | \age "*" "*" "*" "*" "*" \sage ga
82 | }
83 |
84 | \layout {
85 | indent = 0\cm
86 | }
87 |
88 | \header {
89 | tagline = "" % removed
90 | }
91 |
92 | % page size
93 | #(set! paper-alist (cons '("my size" . (cons (* 5. in) (* 2.2 in))) paper-alist))
94 |
95 | \paper {
96 | print-page-number = ##f % erase page numbering
97 |
98 | #(set-paper-size "my size")
99 | ragged-last-bottom = ##f
100 | ragged-bottom = ##f
101 |
102 | left-margin = 0
103 | right-margin = 0
104 | }
--------------------------------------------------------------------------------
/docs/locale/ja/LC_MESSAGES/index.po:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) 2019-, Hideyuki Tachibana, PKSHA Technology Inc
3 | # This file is distributed under the same license as the tdmelodic package.
4 | # Hideyuki Tachibana, 2021.
5 | #
6 | #, fuzzy
7 | msgid ""
8 | msgstr ""
9 | "Project-Id-Version: tdmelodic \n"
10 | "Report-Msgid-Bugs-To: \n"
11 | "POT-Creation-Date: 2021-08-24 16:00+0900\n"
12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
13 | "Last-Translator: Hideyuki Tachibana \n"
14 | "Language-Team: \n"
15 | "MIME-Version: 1.0\n"
16 | "Content-Type: text/plain; charset=utf-8\n"
17 | "Content-Transfer-Encoding: 8bit\n"
18 | "Generated-By: Babel 2.9.1\n"
19 |
20 | #: ../../index.rst:15
21 | msgid "Background"
22 | msgstr "予備知識"
23 |
24 | #: ../../index.rst:22
25 | msgid "Preliminary Setting"
26 | msgstr "事前準備"
27 |
28 | #: ../../index.rst:28
29 | msgid "Dictionary Generation"
30 | msgstr "辞書生成"
31 |
32 | #: ../../index.rst:36
33 | msgid "Install tdmelodic on your system"
34 | msgstr "tdmelodicのインストール"
35 |
36 | #: ../../index.rst:43
37 | msgid "One-by-one Manual Inference Mode"
38 | msgstr "一件ずつ推論するためのツール"
39 |
40 | #: ../../index.rst:2
41 | msgid "tdmelodic Documentation"
42 | msgstr "tdmelodic利用マニュアル"
43 |
44 | #: ../../index.rst:4
45 | msgid "**Tokyo Dialect MELOdic accent DICtionary**"
46 | msgstr "**Tokyo Dialect MELOdic accent DICtionary**"
47 |
48 | #: ../../index.rst:6
49 | msgid ""
50 | "This module generates a large scale accent dictionary of Japanese (Tokyo "
51 | "dialect) using a neural network based technique."
52 | msgstr "これは、ニューラルネットワークにより、日本語(東京方言)の大規模なアクセント辞書を自動生成するモジュールです。"
53 |
54 | #: ../../index.rst:9
55 | msgid ""
56 | "The objective of this module is to generate a large vocabulary Japanese "
57 | "accent dictionary, exploiting two existing dictionaries: UniDic and "
58 | "NEologd. UniDic provides accurate accent information of words, but its "
59 | "vocabulary size is not necessarily large. NEologd is a very large "
60 | "Japanese dictionary, but it does not provide accent information of words."
61 | msgstr ""
62 | "このモジュールの目的は、日本語の大規模アクセント辞書を自動生成することです。そのために UniDic と NEologd "
63 | "という既存の二つの辞書を利用します。UniDic では正確なアクセント情報が提供されていますが、扱える語彙がやや限定されています。一方 "
64 | "NEologd は非常に大規模な語彙を扱っている一方、アクセント情報を提供していません。"
65 |
66 | #: ../../index.rst:50
67 | msgid "Citation"
68 | msgstr "文献情報"
69 |
70 | #: ../../index.rst:52
71 | msgid "For academic use, please cite the following paper."
72 | msgstr "論文等で引用いただく場合は以下の bibtex をご利用ください。"
73 |
74 | #: ../../index.rst:68
75 | msgid ""
76 | "Paper Links: `[IEEE Xplore] "
77 | "<https://ieeexplore.ieee.org/document/9054081>`_, `[arXiv preprint] "
78 | "<https://arxiv.org/abs/2009.09679>`_"
79 | msgstr ""
80 | "論文リンク: `[IEEE Xplore] <https://ieeexplore.ieee.org/document/9054081>`_, "
81 | "`[arXiv プレプリント] <https://arxiv.org/abs/2009.09679>`_"
82 |
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent-ja.ly:
--------------------------------------------------------------------------------
1 | \version "2.18.2"
2 | \pointAndClickOff
3 |
4 | % Copyright (c) 2019-, Hideyuki Tachibana.
5 | % All rights reserved.
6 |
7 | "|" = {
8 | \once \override Staff.BarLine.bar-extent = #'(-1 . 1)
9 | \bar "|"
10 | }
11 |
12 | myColorNote = { \once \override NoteHead.color = #(x11-color "medium turquoise") }
13 | LS = { \once \override NoteColumn.X-offset = 5 } % little spacing
14 | SP = {\hideNotes r8 \unHideNotes} % little spacing
15 |
16 | % macro for accent marks
17 | % note: it converts a command as follows
18 | %   \age xyz
19 | %   ->
20 | %   \markup { xyz \with-color #red "[" }
21 | age=#(define-music-function
22 | (parser location argtext)
23 | (markup?)
24 | #{
25 | \lyricmode{
26 | \markup{ #argtext \with-color #red "[" }
27 | }
28 | #}
29 | )
30 |
31 | sage=#(define-music-function
32 | (parser location argtext)
33 | (markup?)
34 | #{
35 | \lyricmode{
36 | \markup{ #argtext \with-color #blue "]" }
37 | }
38 | #}
39 | )
40 |
41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
42 | \new Staff \with
43 | {
44 | instrumentName = \markup{}
45 | }{\omit Staff.TimeSignature
46 | \stopStaff
47 | \override Staff.StaffSymbol.line-positions = #'(-2 2)
48 | \override Score.BarNumber #'transparent = ##t
49 | \startStaff
50 |
51 | \time 4/8
52 | g'8^[ d'' d'' d'' ] \bar"||"
53 | \time 5/8
54 | g'8^[ d'' d'' g' g' ] \bar"||" \break
55 | \time 7/8
56 | g'8^[ d'' d'' d'' d'' d'' g' ] \bar"||"\break
57 | \time 10/8
58 | g'8^[ d'' d'' d'' d'' d'' d'' d'' g' g'] \bar"||"\break
59 | \time 3/8
60 | d''^[ g' g' ] \bar"||"
61 | \time 3/8
62 | g'^[ d'' d''] \bar"||"\\
63 | \time 6/8
64 | g'^[ d'' d'' d'' g' g' ] \bar"||"\break
65 | \time 3/8
66 | d''^[ g' g' ] \bar"||"
67 | \time 4/8
68 | g'^[ d'' d'' d''] \bar"||"\\
69 | \time 7/8
70 | g'^[ d'' d'' d'' g' g' g' ] \bar"||"
71 | }
72 | \addlyrics {
73 | \stopStaff
74 | \override Lyrics . LyricText #'font-name ="IPAex Mincho"
75 | \startStaff
76 | \age と う きょ う
77 | \age と う \sage きょ う と
78 | \age と う きょ う と \sage ち じ
79 | \age と う きょ う と ち じ \sage せ ん きょ
80 | \sage せ か い \age い さ ん
81 | \age せ か い \sage い さ ん
82 | \sage き か い \age が く しゅ う
83 | \age き か い \sage が く しゅ う
84 | }
85 |
86 | \layout {
87 | indent = 0\cm
88 | }
89 |
90 | \header {
91 | tagline = "" % removed
92 | }
93 |
94 | % page size
95 | #(set! paper-alist (cons '("my size" . (cons (* 4. in) (* 0.8 in))) paper-alist))
96 |
97 | \paper {
98 | print-page-number = ##f % erase page numbering
99 |
100 | #(set-paper-size "my size")
101 | ragged-last-bottom = ##f
102 | ragged-bottom = ##f
103 |
104 | left-margin = 0
105 | right-margin = 0
106 | }
--------------------------------------------------------------------------------
/docs/imgs/jpn_accent-en.ly:
--------------------------------------------------------------------------------
1 | \version "2.18.2"
2 | \pointAndClickOff
3 |
4 | % Copyright (c) 2019-, Hideyuki Tachibana.
5 | % All rights reserved.
6 |
7 | "|" = {
8 | \once \override Staff.BarLine.bar-extent = #'(-1 . 1)
9 | \bar "|"
10 | }
11 |
12 | myColorNote = { \once \override NoteHead.color = #(x11-color "medium turquoise") }
13 | LS = { \once \override NoteColumn.X-offset = 5 } % little spacing
14 | SP = {\hideNotes r8 \unHideNotes} % little spacing
15 |
16 | % macro for accent marks
17 | % note: it converts a command as follows
18 | %   \age xyz
19 | %   ->
20 | %   \markup { xyz \with-color #red "[" }
21 | age=#(define-music-function
22 | (parser location argtext)
23 | (markup?)
24 | #{
25 | \lyricmode{
26 | \markup{ #argtext \with-color #red "[" }
27 | }
28 | #}
29 | )
30 |
31 | sage=#(define-music-function
32 | (parser location argtext)
33 | (markup?)
34 | #{
35 | \lyricmode{
36 | \markup{ #argtext \with-color #blue "]" }
37 | }
38 | #}
39 | )
40 |
41 |
42 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
43 | \new Staff \with
44 | {
45 | instrumentName = \markup{}
46 | }{\omit Staff.TimeSignature
47 |
48 | \stopStaff
49 | \override Staff.StaffSymbol.line-positions = #'(-2 2)
50 | \override Score.BarNumber #'transparent = ##t
51 | \startStaff
52 |
53 | \time 4/8
54 | g'8^[ d'' d'' d'' ] \bar"||"
55 | \time 5/8
56 | g'8^[ d'' d'' g' g' ] \bar"||" \break
57 | \time 7/8
58 | g'8^[ d'' d'' d'' d'' d'' g' ] \bar"||"\break
59 | \time 10/8
60 | g'8^[ d'' d'' d'' d'' d'' d'' d'' g' g'] \bar"||"\break
61 | \time 3/8
62 | d''^[ g' g' ] \bar"||"
63 | \time 3/8
64 | g'^[ d'' d''] \bar"||"\\
65 | \time 6/8
66 | g'^[ d'' d'' d'' g' g' ] \bar"||"\break
67 | \time 3/8
68 | d''^[ g' g' ] \bar"||"
69 | \time 4/8
70 | g'^[ d'' d'' d''] \bar"||"\\
71 | \time 7/8
72 | g'^[ d'' d'' d'' g' g' g' ] \bar"||"
73 | }
74 | \addlyrics {
75 | \stopStaff
76 | \override Lyrics . LyricText #'font-name ="Times"
77 | \startStaff
78 |
79 | \age to o kyo o
80 | \age to o \sage kyo o to
81 | \age to o kyo o to \sage chi ji
82 | \age to o kyo o to chi ji \sage se n kyo
83 | \sage se ka i \age i sa n
84 | \age se ka i \sage i sa n
85 | \sage ki ka i \age ga ku shu u
86 | \age ki ka i \sage ga ku shu u
87 | }
88 |
89 | \layout {
90 | indent = 0\cm
91 | }
92 |
93 | \header {
94 | tagline = "" % removed
95 | }
96 |
97 | % page size
98 | #(set! paper-alist (cons '("my size" . (cons (* 4. in) (* 0.8 in))) paper-alist))
99 |
100 | \paper {
101 | print-page-number = ##f % erase page numbering
102 |
103 | #(set-paper-size "my size")
104 | ragged-last-bottom = ##f
105 | ragged-bottom = ##f
106 |
107 | left-margin = 0
108 | right-margin = 0
109 | }
--------------------------------------------------------------------------------
/docs/locale/ja/LC_MESSAGES/pages/docker.po:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) 2019-, Hideyuki Tachibana, PKSHA Technology Inc
3 | # This file is distributed under the same license as the tdmelodic package.
4 | # Hideyuki Tachibana, 2021.
5 | #
6 | msgid ""
7 | msgstr ""
8 | "Project-Id-Version: tdmelodic \n"
9 | "Report-Msgid-Bugs-To: \n"
10 | "POT-Creation-Date: 2021-11-05 18:26+0900\n"
11 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
12 | "Last-Translator: Hideyuki Tachibana \n"
13 | "Language-Team: \n"
14 | "MIME-Version: 1.0\n"
15 | "Content-Type: text/plain; charset=utf-8\n"
16 | "Content-Transfer-Encoding: 8bit\n"
17 | "Generated-By: Babel 2.9.1\n"
18 |
19 | #: ../../pages/docker.md:1
20 | msgid "Build the Docker image"
21 | msgstr "docker イメージをビルド"
22 |
23 | #: ../../pages/docker.md:3
24 | msgid "Download codes and data"
25 | msgstr "コードとデータのダウンロード"
26 |
27 | #: ../../pages/docker.md:5
28 | msgid "Requirements"
29 | msgstr "事前準備"
30 |
31 | #: ../../pages/docker.md:6
32 | msgid ""
33 | "Please set up `git`, `docker` and `mecab` (such as `libmecab-dev`) on "
34 | "your UNIX-like system such as Ubuntu or MacOS."
35 | msgstr ""
36 | "お手元の Unix 系処理系(UbuntuやMacOSなど)に、Git, Docker, MeCab (libmecab-"
37 | "devなど)をセットアップしてください。"
38 |
39 | #: ../../pages/docker.md:8
40 | msgid "git clone"
41 | msgstr "コードのダウンロード"
42 |
43 | #: ../../pages/docker.md:9
44 | msgid "Create a working directory and download the repositories."
45 | msgstr "作業ディレクトリを作成し、GitHub からコードをダウンロードしてください"
46 |
47 | #: ../../pages/docker.md:17
48 | msgid "Download the UniDic dictionary file"
49 | msgstr "UniDic の辞書ファイルをダウンロード"
50 |
51 | #: ../../pages/docker.md:19
52 | msgid ""
53 | "Download the UniDic file from [NINJAL](https://ccd.ninjal.ac.jp/unidic/)."
54 | " Several versions have been published, but the version we need is "
55 | "`unidic-mecab_kana-accent-2.1.2_src.zip`."
56 | msgstr ""
57 | "[国立国語研究所のサイト](https://ccd.ninjal.ac.jp/unidic/)から UniDic "
58 | "の辞書ファイルをダウンロードしてください。複数のバージョンが公開されていますが、"
59 | "このモジュールでは `unidic-mecab_kana-accent-2.1.2_src.zip` を使います。"
60 |
61 | #: ../../pages/docker.md:27
62 | msgid ""
63 | "Note: **This file will be reused later.** Please do not download the file"
64 | " more than once to avoid overloading the site you are downloading from. "
65 | "It is recommended that you keep this file somewhere in your local file "
66 | "system."
67 | msgstr ""
68 | "注意: **このファイルは後ほど再利用します。** "
69 | "ダウンロード先のサイトに負荷をかけすぎることのないよう、何度もダウンロードするのは避けてください。ダウンロードした zip "
70 | "ファイルはローカルのどこかに保管しておくことを推奨します。"
71 |
72 | #: ../../pages/docker.md:32
73 | msgid "Docker build"
74 | msgstr "Docker ビルド"
75 |
76 | #: ../../pages/docker.md:34
77 | msgid ""
78 | "Build the docker image using the following commands. It will take a few "
79 | "minutes."
80 | msgstr "以下のコマンドでdockerのイメージをビルドしてください。これには数分かかります。"
81 |
82 | #: ../../pages/docker.md:42
83 | msgid "Test some commands if needed"
84 | msgstr "動作確認"
85 |
86 | #: ../../pages/docker.md:43
87 | msgid "If needed, try the following commands and check the results."
88 | msgstr "もし興味があれば、以下のコマンドを試してみてください。このプロセスは飛ばしても構いません。"
89 |
--------------------------------------------------------------------------------
/tdmelodic/nn/model/modules/dilateconvcausal1d.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import chainer
10 | import chainer.functions as F
11 | import chainer.links as L
12 |
13 | class DilateConvCausal1D(chainer.Chain):
14 | ''' dilated convolution (causal) 1D '''
15 | def __init__(self,
16 | in_channel,
17 | out_channel,
18 | ksize,
19 | dilate=1,
20 | causal=True):
21 |
22 | self.in_ch = in_channel
23 | self.out_ch = out_channel
24 | self.ksize = ksize
25 | self.dilate = dilate
26 | self.causal = causal
27 | self.conv_size = (self.ksize - 1) * self.dilate + 1
28 |
29 | layers={}
30 | if self.dilate is None or self.dilate == 1:
31 | layers["conv"] = L.ConvolutionND(1,
32 | self.in_ch,
33 | self.out_ch,
34 | self.ksize,
35 | stride=1,
36 | pad=0)
37 | else:
38 | layers["conv"] = L.DilatedConvolution2D( \
39 | self.in_ch,
40 | self.out_ch,
41 | (self.ksize, 1),
42 | stride=1,
43 | pad=(0, 0),
44 | dilate = (self.dilate, 1)
45 | )
46 |
47 | super(DilateConvCausal1D, self).__init__(**layers)
48 |
49 | def padding(self, h):
50 | if self.causal:
51 | h = F.pad(h, (
52 | (0, 0), # batch
53 | (0, 0), # feature
54 | ( (self.ksize - 1 ) * self.dilate, 0) # temporal
55 | ), 'constant', constant_values=0)
56 | else:
57 | h = F.pad(h, (
58 | (0, 0), # batch
59 | (0, 0), # feature
60 | ( (self.ksize - 1) * self.dilate // 2, (self.ksize - 1) * self.dilate // 2) # temporal
61 | ), 'constant', constant_values=0)
62 | return h
63 |
64 | def __call__(self, x):
65 | h = x
66 | h = self.padding(h)
67 | return self.forward(h)
68 |
69 | def forward(self, x, **kwargs):
70 | h = x
71 |         if self.dilate is None or self.dilate == 1:
72 | h = self.conv(h)
73 | else:
74 | h = F.expand_dims(h, axis=3)
75 | h = self.conv(h)
76 | h = h[:,:,:,0]
77 |
78 | return h
79 |
80 | if __name__ == '__main__':
81 | import numpy as np
82 | ''' causal test '''
83 | x = np.ones((3, 3, 40)).astype(np.float32)
84 | m = DilateConvCausal1D(3, 2, ksize=5, dilate=3, causal=True)
85 | y = m(x)
86 | print(y.shape)
87 | print(y.data)
88 |
89 | ''' non causal test '''
90 | x = np.ones((3, 3, 40)).astype(np.float32)
91 | m = DilateConvCausal1D(3, 2, ksize=5, dilate=3, causal=False)
92 | y = m(x)
93 | print(y.shape)
94 | print(y.data)
95 |
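96 |     ''' causality sanity check (a sketch, not part of the original file):
97 |         with causal padding, perturbing the last time step must leave all
98 |         earlier outputs unchanged. '''
99 |     m = DilateConvCausal1D(3, 2, ksize=5, dilate=3, causal=True)
100 |     x2 = x.copy()
101 |     x2[:, :, -1] += 1.0
102 |     y1, y2 = m(x), m(x2)
103 |     print(np.allclose(y1.data[:, :, :-1], y2.data[:, :, :-1]))  # True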
--------------------------------------------------------------------------------
/docs/pages/docker.md:
--------------------------------------------------------------------------------
1 | # Build the Docker image
2 |
3 | ## Download codes and data
4 |
5 | ### Requirements
6 | Please set up `git`, `docker` and `mecab` (such as `libmecab-dev`) on your UNIX-like system such as Ubuntu or MacOS.
7 |
8 | ### git clone
9 | Create a working directory and download the repositories.
10 |
11 | ```sh
12 | WORKDIR=/path/to/your/work/dir
13 | cd $WORKDIR
14 | git clone --depth 1 https://github.com/PKSHATechnology-Research/tdmelodic
15 | ```
16 |
17 | ### Download the UniDic dictionary file
18 |
19 | Download the UniDic file from [NINJAL](https://ccd.ninjal.ac.jp/unidic/).
20 | Several versions have been published, but the version we need is `unidic-mecab_kana-accent-2.1.2_src.zip`.
21 |
22 | ```sh
23 | wget https://ccd.ninjal.ac.jp/unidic_archive/cwj/2.1.2/unidic-mecab_kana-accent-2.1.2_src.zip
24 | cp unidic-mecab_kana-accent-2.1.2_src.zip ${WORKDIR}/tdmelodic
25 | ```
26 |
27 | Note: **This file will be reused later.**
28 | Please do not download the file more than once to avoid overloading the site you are downloading from.
29 | It is recommended that you keep this file somewhere in your local file system.
30 |
31 |
32 | ## Docker build
33 |
34 | Build the docker image using the following commands.
35 | It will take a few minutes.
36 |
37 | ```sh
38 | cd ${WORKDIR}/tdmelodic
39 | docker build -t tdmelodic:latest . # --no-cache
40 | ```
41 |
42 | ## Test some commands if needed
43 | If needed, try the following commands and check the results.
44 |
45 | ```console
46 | you@machine:~$ docker run --rm tdmelodic:latest /bin/bash -c "echo 深層学習 | mecab -d \`mecab-config --dicdir\`/unidic"
47 | 深層 シンソー シンソウ 深層 名詞-普通名詞-一般 0
48 | 学習 ガクシュー ガクシュウ 学習 名詞-普通名詞-サ変可能 0
49 | EOS
50 | ```
51 |
52 | ```console
53 | you@machine:~$ docker run -it --rm tdmelodic:latest
54 | root@docker:~/workspace$ echo 深層学習 | mecab -d `mecab-config --dicdir`/unidic
55 | 深層 シンソー シンソウ 深層 名詞-普通名詞-一般 0
56 | 学習 ガクシュー ガクシュウ 学習 名詞-普通名詞-サ変可能 0
57 | EOS
58 |
59 | root@docker:~/workspace$ python3
60 |
61 | >>> from tdmelodic.nn.lang.mecab.unidic import UniDic
62 |
63 | >>> u = UniDic()
64 | [ MeCab setting ] unidic='/usr/lib/x86_64-linux-gnu/mecab/dic/unidic'
65 | [ MeCab setting ] mecabrc='/usr/local/lib/python3.8/dist-packages/tdmelodic/nn/lang/mecab/my_mecabrc'
66 |
67 | >>> u.get_n_best("深層学習", "しんそうがくしゅう", 3)
68 | ([[{'surface': '深層', 'pron': 'シンソー', 'kana': 'シンソウ', 'pos': '名詞-普通名詞-一般', 'goshu': '漢', 'acc': '0', 'concat': 'C2'}, {'surface': '学習', 'pron': 'ガクシュー', 'kana': 'ガクシュウ', 'pos': '名詞-普通名詞-サ変可能', 'goshu': '漢', 'acc': '0', 'concat': 'C2'}], [{'surface': '深', 'pron': 'シン', 'kana': 'シン', 'pos': '接頭辞', 'goshu': '漢', 'acc': '', 'concat': 'P2'}, {'surface': '層', 'pron': 'ソー', 'kana': 'ソウ', 'pos': '名詞-普通名詞-一般', 'goshu': '漢', 'acc': '1', 'concat': 'C3'}, {'surface': '学習', 'pron': 'ガクシュー', 'kana': 'ガクシュウ', 'pos': '名詞-普通名詞-サ変可能', 'goshu': '漢', 'acc': '0', 'concat': 'C2'}], [{'surface': '深', 'pron': 'フカ', 'kana': 'フカイ', 'pos': '形容詞-一般', 'goshu': '和', 'acc': '2', 'concat': 'C1'}, {'surface': '層', 'pron': 'ソー', 'kana': 'ソウ', 'pos': '名詞-普通名詞-一般', 'goshu': '漢', 'acc': '1', 'concat': 'C3'}, {'surface': '学習', 'pron': 'ガクシュー', 'kana': 'ガクシュウ', 'pos': '名詞-普通名詞-サ変可能', 'goshu': '漢', 'acc': '0', 'concat': 'C2'}]], [0, 1, 2], 9)
69 |
70 | >>> Ctrl-D
71 |
72 | root@docker:~/workspace$ exit
73 | you@machine:~$
74 | ```
75 |
--------------------------------------------------------------------------------
/tdmelodic/filters/postprocess_modify_unigram_cost.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | # -*- coding: utf-8 -*-
10 | import sys
11 | import os
12 | import argparse
13 | import csv
14 | import copy
15 | from tqdm import tqdm
16 | from tdmelodic.util.dic_index_map import get_dictionary_index_map
17 | from tdmelodic.util.util import count_lines
18 |
19 | # A script for fine-tuning unigram costs and similar values as a postprocessing step.
20 |
21 | IDX_MAP = get_dictionary_index_map("unidic")
22 |
23 | def avoid_overflow(line, cost, INT16_MIN = -32768, INT16_MAX = 32767):
24 | """avoid overflow (signed short int)"""
25 | cost = INT16_MAX if cost > INT16_MAX else INT16_MIN if cost < INT16_MIN else cost
26 | line[IDX_MAP["COST"]] = str(cost)
27 | return line, cost
28 |
29 | def modify_unigram_cost(line, verbose=True):
30 | cost = int(line[IDX_MAP["COST"]])
31 |
32 |     # adjust the cost of numerals if necessary
33 | if (line[IDX_MAP["SURFACE"]][0] in [str(i) for i in range(10)]) and len(line[1]) >= 2:
34 | cost = cost - 5000
35 |
36 |     # adjust the cost of person names if necessary
37 | elif line[IDX_MAP["POS1"]] == "名詞" and line[IDX_MAP["POS2"]] == "固有名詞" and line[IDX_MAP["POS3"]] == "人名":
38 | cost = cost + 5000
39 |
40 | else:
41 |         # if necessary, also raise the costs of the remaining words across the board
42 |         # (e.g. to prefer UniDic entries when the same word also exists in UniDic)
43 | pass
44 | #cost = cost + 10000
45 |
46 | line, cost = avoid_overflow(line, cost)
47 |
48 | return line
49 |
50 | # ------------------------------------------------------------------------------------
51 | def main_(fp_in, fp_out):
52 | L = count_lines(fp_in)
53 | for i, line in enumerate(tqdm(csv.reader(fp_in), total=L)):
54 |         # adjust the unigram cost
55 | line_modified = modify_unigram_cost(copy.deepcopy(line))
56 |
57 | if i % 100000 == 0:
58 | print(i)
59 | print("before", line, file=sys.stderr)
60 | print("after", line_modified, file=sys.stderr)
61 |
62 | # output
63 | line = ','.join(line_modified) + '\n'
64 | fp_out.write(line)
65 |
66 | print("Complete!", file=sys.stderr)
67 | return
68 |
69 | def main():
70 | parser = argparse.ArgumentParser()
71 | parser.add_argument(
72 | '-i',
73 | '--input',
74 | nargs='?',
75 | type=argparse.FileType("r"),
76 | default=sys.stdin,
77 |         help='input CSV file (NEologd dictionary file)')
78 | parser.add_argument(
79 | '-o',
80 | '--output',
81 | nargs='?',
82 | type=argparse.FileType("w"),
83 | default=sys.stdout,
84 | help='output CSV file ')
85 | args = parser.parse_args()
86 |
87 | if args.input == args.output:
88 |         print("[ Error ] input and output files should be different.")
89 | else:
90 | try:
91 | main_(args.input, args.output)
92 | except Exception as e:
93 | print(e)
94 |
95 | if __name__ == '__main__':
96 | main()
--------------------------------------------------------------------------------
/tdmelodic/filters/yomi/wrong_yomi_detection.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | # -*- coding: utf-8 -*-
10 | import sys
11 | import os
12 | import argparse
13 | import regex as re
14 | import csv
15 | from tqdm import tqdm
16 |
17 | import unicodedata
18 | import jaconv
19 | from dataclasses import dataclass
20 |
21 | from tdmelodic.nn.lang.japanese.kansuji import numeric2kanji
22 | from tdmelodic.util.dic_index_map import get_dictionary_index_map
23 | from tdmelodic.util.util import count_lines
24 | from tdmelodic.util.word_type import WordType
25 | from .yomieval import YomiEvaluator
26 |
27 |
28 | @dataclass
29 | class LineInfo(object):
30 | surf: str
31 | yomi: str
32 | pos: str
33 |
34 | class SimpleWrongYomiDetector(object):
35 | def __init__(self, distance_threshold=10, ratio_threshold=0.7, mode="unidic"):
36 | """
37 | If the Levenshtein distance between the provided yomi and the predicted yomi from the surface form
38 | is greater than the given thresholds, the entry will be removed.
39 | """
40 | self.distance_threshold = distance_threshold
41 | self.ratio_threshold = ratio_threshold
42 |
43 | self.yomieval = YomiEvaluator(rank_weight=0, romaji_priority=0, nbest=10)
44 | self.IDX_MAP = get_dictionary_index_map(mode)
45 | self.wt = WordType()
46 |
47 | def __call__(self, line):
48 | if line is None:
49 | return None
50 |
51 | elif not self.is_target(line):
52 | return line
53 |
54 | else:
55 | info = self.get_line_info(line, self.IDX_MAP)
56 | dist = self.yomieval.eval(info.surf, info.yomi)
57 | ratio = float(dist) / float(len(info.yomi))
58 |
59 | if dist > self.distance_threshold or ratio > self.ratio_threshold:
60 | return None
61 | else:
62 | return line
63 |
64 | def is_target(self, line):
65 | not_target = self.wt.is_person(line) or \
66 | self.wt.is_emoji(line) or \
67 | self.wt.is_symbol(line) or \
68 | self.wt.is_numeral(line)
69 | return not not_target
70 |
71 |
72 | def get_line_info(self, line, IDX_MAP):
73 | s = line[IDX_MAP["SURFACE"]]
74 | y = line[IDX_MAP["YOMI"]]
75 | pos = "-".join([line[i] for i in [IDX_MAP["POS1"], IDX_MAP["POS2"], IDX_MAP["POS3"]]])
76 | s = self.normalize_surface(s)
77 | y = y.replace("[","").replace("]","") # remove accent marks
78 |
79 | return LineInfo(s, y, pos)
80 |
81 | def normalize_surface(self, text):
82 |         # normalize everything to zenkaku (full-width) before processing
83 | text = unicodedata.normalize("NFKC",text)
84 | text = jaconv.h2z(text, digit=True, ascii=True, kana=True)
85 |
86 | # kansuji
87 | text = numeric2kanji(text)
88 |
89 |         # expand abbreviations such as (株) and (有) into their full company-name forms
90 | text = text.replace("(株)","株式会社")
91 | text = text.replace("(有)","有限会社")
92 | return text
93 |
--------------------------------------------------------------------------------
/tdmelodic/filters/neologd_rmdups.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 | # -*- coding: utf-8 -*-
9 | import sys
10 | import os
11 | import argparse
12 | import regex as re
13 | import csv
14 | from tqdm import tqdm
15 |
16 | import jaconv
17 | import unicodedata
18 | from dataclasses import dataclass
19 |
20 | from tdmelodic.nn.lang.japanese.kansuji import numeric2kanji
21 | from tdmelodic.util.dic_index_map import get_dictionary_index_map
22 | from tdmelodic.util.util import count_lines
23 | from tdmelodic.util.word_type import WordType
24 | from .yomi.yomieval import YomiEvaluator
25 |
26 | # ------------------------------------------------------------------------------------
27 | def normalize_surface(text):
28 |     # NFKC normalization, then convert digits/ASCII to zenkaku (kana left as-is)
29 | text = unicodedata.normalize("NFKC",text)
30 | text = jaconv.h2z(text, digit=True, ascii=True, kana=False)
31 |
32 | # kansuji
33 | text = numeric2kanji(text)
34 |
35 |     # replace company abbreviations such as (株) and (有) with their katakana readings
36 | text = text.replace("(株)","・カブシキガイシャ・")
37 | text = text.replace("(有)","・ユウゲンガイシャ・")
38 | text = text.replace("&","・アンド・")
39 | return text
40 |
41 | # ------------------------------------------------------------------------------------
42 | @dataclass
43 | class LineInfo(object):
44 | surf: str
45 | yomi: str
46 | pos: str
47 |
48 | def get_line_info(line, IDX_MAP):
49 | s = line[IDX_MAP["SURFACE"]]
50 | y = line[IDX_MAP["YOMI"]]
51 | pos = "-".join([line[i] for i in [IDX_MAP["POS1"], IDX_MAP["POS2"], IDX_MAP["POS3"]]])
52 | s = normalize_surface(s)
53 |
54 | return LineInfo(s, y, pos)
55 |
56 | def rmdups(fp_in, fp_out, dictionary_type="unidic"):
57 | """
58 | dictionary_type: unidic or ipadic
59 | """
60 | IDX_MAP = get_dictionary_index_map(dictionary_type)
61 |
62 | yomieval = YomiEvaluator()
63 |     prev_line = [""] * 100  # dummy "previous line" with enough empty fields for any dictionary format
64 | c = 0
65 | L = count_lines(fp_in)
66 | wt = WordType(dictionary_type)
67 |
68 | print("ℹ️ [ Removing duplicate entries ]", file=sys.stderr)
69 | for i, curr_line in enumerate(tqdm(csv.reader(fp_in), total=L)):
70 | prev = get_line_info(prev_line, IDX_MAP)
71 | curr = get_line_info(curr_line, IDX_MAP)
72 |
73 | if prev.surf == curr.surf and prev.pos == curr.pos and \
74 | not wt.is_person(prev_line) and not wt.is_placename(prev_line):
75 | # if the surface form and pos are the same
76 | distance_p = yomieval.eval(prev.surf, prev.yomi)
77 | distance_c = yomieval.eval(curr.surf, curr.yomi)
78 | else:
79 | distance_p = 0
80 | distance_c = 100
81 |
82 | if distance_p > distance_c:
83 | c += 1
84 | # if c % 100 == 0:
85 | # print(c, curr.surf, "| deleted: ", prev.yomi, distance_p, " | left: ", curr.yomi, distance_c, file=sys.stderr)
86 | else:
87 | if i != 0:
88 | fp_out.write(",".join(prev_line) + "\n")
89 |
90 | prev_line = curr_line
91 | continue
92 |
93 | fp_out.write(",".join(prev_line) + "\n")
94 | print("📊 Number of removed duplicate entries ", c, file=sys.stderr)
--------------------------------------------------------------------------------
/docs/locale/ja/LC_MESSAGES/pages/unidic-dicgen.po:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) 2019-, Hideyuki Tachibana, PKSHA Technology Inc
3 | # This file is distributed under the same license as the tdmelodic package.
4 | # Hideyuki Tachibana, 2021.
5 | #
6 | #, fuzzy
7 | msgid ""
8 | msgstr ""
9 | "Project-Id-Version: tdmelodic \n"
10 | "Report-Msgid-Bugs-To: \n"
11 | "POT-Creation-Date: 2021-10-22 18:34+0900\n"
12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
13 | "Last-Translator: Hideyuki Tachibana \n"
14 | "Language-Team: \n"
15 | "MIME-Version: 1.0\n"
16 | "Content-Type: text/plain; charset=utf-8\n"
17 | "Content-Transfer-Encoding: 8bit\n"
18 | "Generated-By: Babel 2.9.1\n"
19 |
20 | #: ../../pages/unidic-dicgen.md:1
21 | msgid "Dictionary generation for UniDic users"
22 | msgstr "UniDic ユーザー向け辞書生成"
23 |
24 | #: ../../pages/unidic-dicgen.md:2
25 | msgid "WARNING: _This section takes several hours or days._"
26 | msgstr "注意:_この作業は数時間から数日かかる可能性があります。_"
27 |
28 | #: ../../pages/unidic-dicgen.md:4
29 | msgid "Prepare the base dictionary"
30 | msgstr "ベースになる NEologd 辞書の準備"
31 |
32 | #: ../../pages/unidic-dicgen.md:5
33 | msgid "git clone NEologd"
34 | msgstr "NEologd 辞書ファイルをダウンロード"
35 |
36 | #: ../../pages/unidic-dicgen.md:6
37 | msgid "First, download the NEologd dictionary as follows."
38 | msgstr "まず、以下のようなコマンドで NEologd の辞書をダウンロードしてください。"
39 |
40 | #: ../../pages/unidic-dicgen.md:14
41 | msgid "Extract the NEologd vocabulary file and apply a patch"
42 | msgstr "NEologd 辞書ファイルの抽出と、パッチによる微修正"
43 |
44 | #: ../../pages/unidic-dicgen.md:16
45 | msgid "Then, extract the CSV files of the NEologd dictionary using the `unxz` command."
46 | msgstr "次に `unxz` コマンドで、NEologd の単語リスト(CSVファイル)を抽出します。"
47 |
48 | #: ../../pages/unidic-dicgen.md:26
49 | msgid ""
50 | "This will generate a CSV file named `mecab-unidic-user-dict-"
51 | "seed.yyyymmdd.csv`. Then, apply the patch to the NEologd dictionary which"
52 | " we have just extracted, as follows. This creates a dictionary file "
53 | "`neologd_modified.csv` in the working directory."
54 | msgstr ""
55 | "これにより、`mecab-unidic-user-dict-seed.yyyymmdd.csv` といったファイル名の CSV "
56 | "ファイルが生成されます。次に、ここで抽出した CSV ファイルにパッチを当てます。これにより作業ディレクトリ配下に "
57 | "`neologd_modified.csv` というファイルが作られます。"
58 |
59 | #: ../../pages/unidic-dicgen.md:38
60 | msgid ""
61 | "`--no-rmdups`, `--no-rm_wrong_yomi` are options that control whether or not to remove "
62 | "certain words. These options can be found with the following command."
63 | msgstr ""
64 | "`--no-rmdups`, `--no-rm_wrong_yomi` などのオプションは、あるカテゴリーに属する単語を辞書から除去するかどうかを指定するためのものです。"
65 | "これらのオプションは以下のコマンドにより確認できます。"
66 |
67 | #: ../../pages/unidic-dicgen.md:44
68 | msgid "Inference"
69 | msgstr "推論"
70 |
71 | #: ../../pages/unidic-dicgen.md:46
72 | msgid ""
73 | "_WARNING! THIS TAKES MUCH TIME!_ (FYI: It took about 2.5 hours on a "
74 | "MacBook Pro and 5 hours on our Linux server.)"
75 | msgstr ""
76 | "注意: _この処理はかなりの時間がかかります!_ (参考までに、筆者の MacBookPro では2時間半、Linux "
77 | "サーバーでは5時間かかりました。)"
78 |
79 | #: ../../pages/unidic-dicgen.md:49
80 | msgid ""
81 | "Now let us generate the accent dictionary. This estimates the accent of "
82 | "the words listed in the NEologd dictionary by a machine-learning-based "
83 | "technique."
84 | msgstr ""
85 | "では、アクセント辞書を生成しましょう。ここでは、NEologd "
86 | "の辞書ファイルに掲載されている全ての単語について、機械学習ベースの手法により、アクセント情報を推定します。以下のコマンドにより、各単語にアクセント情報を付与した新しい辞書が生成されます。"
87 |
88 | #: ../../pages/unidic-dicgen.md:62
89 | msgid "Postprocess"
90 | msgstr "後処理"
91 |
92 | #: ../../pages/unidic-dicgen.md:64
93 | msgid "Unigram costs can be fixed using the following script."
94 | msgstr "以下のスクリプトにより、単語生起確率(条件付確率場におけるユニグラム・コスト)の微修正などの後処理を行います。"
95 |
--------------------------------------------------------------------------------
/docs/pages/unidic-usage.md:
--------------------------------------------------------------------------------
1 | # Usage of UniDic-tdmelodic as a MeCab dictionary
2 |
3 | ## Install UniDic-tdmelodic
4 |
5 | You can install `tdmelodic` (`tdmelodic-unidic`) by appending the content of the `tdmelodic.csv` file
6 | we have just created to the UniDic default dictionary (`lex.csv`) and then running the
7 | installation script with the appropriate command line options.
8 |
9 | Firstly, specify the file paths.
10 | ```sh
11 | WORKDIR=/path/to/your/work/dir
12 | UNIDIC_ZIP_PATH=/path/to/your/unidic/file/unidic-mecab_kana-accent-2.1.2_src.zip
13 | TDMELODIC_CSV=${WORKDIR}/tdmelodic.csv
14 | ```
15 |
16 | Then unzip the UniDic file.
17 | ```sh
18 | cd ${WORKDIR}
19 | cp ${UNIDIC_ZIP_PATH} .
20 | unzip unidic-mecab_kana-accent-2.1.2_src.zip
21 | ```
22 |
23 | Concatenate the dictionaries.
24 | ```sh
25 | cd ${WORKDIR}/unidic-mecab_kana-accent-2.1.2_src
26 | cp lex.csv lex_bak.csv # backup
27 | cat ${TDMELODIC_CSV} >> lex.csv
28 | ```
29 |
30 | Finally, install `tdmelodic`.
31 | ```sh
32 | ./configure --with-dicdir=`mecab-config --dicdir`/tdmelodic
33 | make
34 | make install
35 | ```
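
To check that the installation succeeded, you can list the installed dictionary directory
(a quick sanity check, assuming `mecab-config` is on your `PATH`):
```sh
ls `mecab-config --dicdir`/tdmelodic
```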
36 |
37 | ## Use UniDic-tdmelodic
38 | Here are some examples.
39 |
40 | ### Example 1
41 |
42 | ```sh
43 | echo 一昔前は人工知能のプログラミング言語といえばCommon LispやPrologだった。 | \
44 | mecab -d `mecab-config --dicdir`/tdmelodic/
45 | ```
46 | ```
47 | 一昔 ヒトムカシ ヒトムカシ 一昔 名詞-普通名詞-一般 2,3
48 | 前 マエ マエ 前 名詞-普通名詞-副詞可能 1
49 | は ワ ハ は 助詞-係助詞
50 | 人工知能 ジ[ンコーチ]ノー ジンコウチノウ 人工知能 名詞-固有名詞-一般 @
51 | の ノ ノ の 助詞-格助詞
52 | プログラミング言語 プ[ログラミングゲ]ンゴ プログラミングゲンゴ プログラミング言語 名詞-固有名詞-一般 @
53 | と ト ト と 助詞-格助詞
54 | いえ イエ イウ 言う 動詞-一般 五段-ワア行 仮定形-一般 0
55 | ば バ バ ば 助詞-接続助詞
56 | Common Lisp コ[モンリ]スプ コモンリスプ Common Lisp 名詞-固有名詞-一般 @
57 | や ヤ ヤ や 助詞-副助詞
58 | Prolog プ[ロログ プロログ Prolog 名詞-固有名詞-一般 @
59 | だっ ダッ ダ だ 助動詞 助動詞-ダ 連用形-促音便
60 | た タ タ た 助動詞 助動詞-タ 終止形-一般
61 | 。 。 補助記号-句点
62 | EOS
63 | ```
64 | Cf.
65 |
66 | ```sh
67 | echo 一昔前は人工知能のプログラミング言語といえばCommon LispやPrologだった。 | \
68 | mecab -d `mecab-config --dicdir`/unidic/
69 | ```
70 | ```
71 | 一昔 ヒトムカシ ヒトムカシ 一昔 名詞-普通名詞-一般 2,3
72 | 前 マエ マエ 前 名詞-普通名詞-副詞可能 1
73 | は ワ ハ は 助詞-係助詞
74 | 人工 ジンコー ジンコウ 人工 名詞-普通名詞-一般 0
75 | 知能 チノー チノウ 知能 名詞-普通名詞-一般 1
76 | の ノ ノ の 助詞-格助詞
77 | プログラミング プログラミング プログラミング プログラミング-programming 名詞-普通名詞-サ変可能 4
78 | 言語 ゲンゴ ゲンゴ 言語 名詞-普通名詞-一般 1
79 | と ト ト と 助詞-格助詞
80 | いえ イエ イウ 言う 動詞-一般 五段-ワア行 仮定形-一般 0
81 | ば バ バ ば 助詞-接続助詞
82 | Common Common Common Common 名詞-普通名詞-一般 0
83 | Lisp Lisp Lisp Lisp 名詞-普通名詞-一般 0
84 | や ヤ ヤ や 助詞-副助詞
85 | Prolog Prolog Prolog Prolog 名詞-普通名詞-一般 0
86 | だっ ダッ ダ だ 助動詞 助動詞-ダ 連用形-促音便
87 | た タ タ た 助動詞 助動詞-タ 終止形-一般
88 | 。 。 補助記号-句点
89 | EOS
90 | ```
91 |
92 | ### Example 2
93 |
94 | ```sh
95 | echo 横浜市中区日本大通 | mecab -d `mecab-config --dicdir`/tdmelodic
96 | ```
97 | ```
98 | 横浜市中区日本大通 ヨ[コハマ]シナ[カ]クニ[ホンオオド]オリ ヨコハマシナカクニホンオオドオリ 横浜市中区日本大通 名詞-固有名詞-地名-一般 @
99 | EOS
100 | ```
101 |
102 | ```sh
103 | echo 横浜市中区日本大通 | mecab -d `mecab-config --dicdir`/unidic
104 | ```
105 | ```
106 | 横浜 ヨコハマ ヨコハマ ヨコハマ 名詞-固有名詞-地名-一般 0
107 | 市中 シチュー シチュウ 市中 名詞-普通名詞-一般 0,2
108 | 区 ク ク 区 名詞-普通名詞-一般 1
109 | 日本 ニッポン ニッポン 日本 名詞-固有名詞-地名-国 3
110 | 大通 ダイツー ダイツウ 大通 名詞-普通名詞-一般 3,0
111 | EOS
112 | ```
113 |
114 | ### Example 3
115 |
116 | ```sh
117 | echo 980hPa | mecab -d `mecab-config --dicdir`/tdmelodic/
118 | echo 15mm | mecab -d `mecab-config --dicdir`/tdmelodic/
119 | echo 4月10日 | mecab -d `mecab-config --dicdir`/tdmelodic/
120 | ```
121 | ```
122 | 980hPa キュ]ーヒャクハ[チジュウヘクトパ]スカル キュウヒャクハチジュウヘクトパスカル 980hPa 名詞-固有名詞-一般 @
123 | EOS
124 | 15mm ジュ[ウゴミリメ]ートル ジュウゴミリメートル 15mm 名詞-固有名詞-一般 @
125 | EOS
126 | 4月10日 シ[ガツトオカ シガツトオカ 4月10日 名詞-固有名詞-一般 @
127 | EOS
128 | ```
129 |
--------------------------------------------------------------------------------
/tdmelodic/nn/inference.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 | import os
11 |
12 | import numpy as np
13 | from tqdm import tqdm
14 | import urllib.request
15 |
16 | import chainer
17 | import chainer.functions as F
18 | from chainer.dataset import convert
19 |
20 | import re
21 |
22 | from .net import Net
23 | from .loader.data_loader import NeologdDictionaryLoader
24 | from .lang.japanese.kana.mora_sep import sep_katakana2mora
25 |
26 | # ------------------------------------------------------------------------------
27 | # hyper params
28 | gpu_id = -1 # cpu
29 | bs = 1
30 | embed_dim = 64
31 |
32 | _github_url = "https://github.com/PKSHATechnology-Research/tdmelodic"
33 | model_location={
34 | "path" : os.path.dirname(os.path.abspath(__file__)) + "/resource/net_it_2500000",
35 | "url" : _github_url + "/raw/master/tdmelodic/nn/resource/net_it_2500000"
36 | }
37 |
38 | # ------------------------------------------------------------------------------
39 | class model_downloader(object):
40 | def __init__(self, path, url=model_location["url"]):
41 | self.path = path
42 | self.url = url
43 | if self.__check_if_file_empty(self.path):
44 | self.__download()
45 | else:
46 | self.__already_downloaded()
47 |
48 | def __check_if_file_empty(self, path_):
49 | return not os.path.exists(path_) or os.path.getsize(path_) == 0
50 |
51 | def __download(self):
52 | print("🌐 [ tdmelodic Model Downloader ] Downloading the pretrained model.")
53 | print("🌐 [ tdmelodic Model Downloader ] From {}".format(self.url))
54 | print("🌐 [ tdmelodic Model Downloader ] To {}".format(self.path))
55 | urllib.request.urlretrieve(self.url, self.path)
56 | print("🌐 [ tdmelodic Model Downloader ] Done")
57 |
58 | def __already_downloaded(self, verbose=False):
59 | if verbose:
60 |             print("🌐 [ tdmelodic Model Downloader ] The tdmelodic pretrained model is already on your system.")
61 |
62 | # ------------------------------------------------------------------------------
63 | class InferAccent(object):
64 | def __init__(self,
65 | model_path=model_location["path"],
66 | model_dim=embed_dim):
67 | model_downloader(model_path)
68 | self.net = self.__load_model(model_path, model_dim)
69 |
70 | def __load_model(self, model_path, model_dim, verbose=False):
71 | if verbose:
72 | print("[ Loading model ] model_path='{}'".format(model_path))
73 | net = Net(embed_dim=model_dim)
74 | chainer.serializers.load_npz(model_path, net)
75 | return net
76 |
77 | def infer(self, s, y, _):
78 | with chainer.using_config("debug", False):
79 | with chainer.using_config("type_check", False):
80 | with chainer.using_config("train", False):
81 | a, _ = self.net(s, y, _)
82 | a = F.argmax(a, axis=1).data
83 | return a
84 |
85 | def infer_and_get_image(self, s, y, _):
86 | with chainer.using_config("debug", False):
87 | with chainer.using_config("type_check", False):
88 | with chainer.using_config("train", False):
89 | a, _ = self.net(s, y, _)
90 | return F.argmax(a, axis=1).data, F.softmax(a).data
91 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | #from recommonmark.parser import CommonMarkParser
17 |
18 | # -- Project information -----------------------------------------------------
19 | import sys
20 | project = 'tdmelodic'
21 | copyright = '2019-, Hideyuki Tachibana, PKSHA Technology Inc'
22 | author = 'Hideyuki Tachibana'
23 |
24 |
25 | # -- General configuration ---------------------------------------------------
26 |
27 | # Add any Sphinx extension module names here, as strings. They can be
28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
29 | # ones.
30 | #extensions = ['recommonmark']
31 | extensions = ['myst_parser',
32 | #'sphinx.ext.imgconverter'
33 | ]
34 |
35 | source_suffix = ['.rst', '.md']
36 | #source_parsers = {
37 | # '.md' : CommonMarkParser
38 | #}
39 |
40 |
41 | # Add any paths that contain templates here, relative to this directory.
42 | templates_path = ['_templates']
43 |
44 | # The language for content autogenerated by Sphinx. Refer to documentation
45 | # for a list of supported languages.
46 | #
47 | # This is also used if you do content translation via gettext catalogs.
48 | # Usually you set "language" from the command line for these cases.
49 | language = 'en'
50 |
51 | # List of patterns, relative to source directory, that match files and
52 | # directories to ignore when looking for source files.
53 | # This pattern also affects html_static_path and html_extra_path.
54 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
55 |
56 |
57 | # -- Options for HTML output -------------------------------------------------
58 |
59 | # The theme to use for HTML and HTML Help pages. See the documentation for
60 | # a list of builtin themes.
61 | #
62 | #html_theme = 'alabaster'
63 | html_theme = 'sphinx_rtd_theme'
64 |
65 | # Add any paths that contain custom static files (such as style sheets) here,
66 | # relative to this directory. They are copied after the builtin static files,
67 | # so a file named "default.css" will overwrite the builtin "default.css".
68 | html_static_path = ['_static']
69 | html_logo = "logo_tdmelodic.png"
70 |
71 | # latex
72 | __latex_lang = 'ja' if 'language=ja' in sys.argv else 'en'
73 | latex_engine = 'lualatex'
74 | latex_use_xindy = False
75 | latex_elements = {
76 | 'preamble' : r"""
77 | \usepackage{luatexja}
78 | \usepackage{luatexja-fontspec}
79 | \usepackage[ipa]{luatexja-preset}
80 | """,
81 | 'fncychap': '',
82 | 'tableofcontents': r"""
83 | \renewcommand{\contentsname}{""" \
84 | + ("目次" if __latex_lang == 'ja' else "Table of Contents") +
85 | r"""}
86 | \sphinxtableofcontents""",
87 | 'fvset' : r"""\fvset{tabsize=2,fontsize=\footnotesize}"""
88 | }
89 |
90 | latex_docclass = {
91 | 'howto' : 'article', # 'jsbook'
92 | 'manual' : 'ltjbook' if __latex_lang == 'ja' else 'report' # 'jreport'
93 | }
94 |
95 | latex_show_urls = 'footnote'
96 |
97 | # locale
98 | locale_dirs = ['locale/']
99 | gettext_compact = False
100 |
--------------------------------------------------------------------------------
/docs/pages/ipadic-dicgen.md:
--------------------------------------------------------------------------------
1 | # Dictionary Generation for IPADIC users
2 |
3 | WARNING: _This section takes several hours or days._
4 |
5 | ## Prepare the base dictionary
6 | ### Download IPADIC
7 |
8 | First, download IPADIC manually from [https://taku910.github.io/mecab](https://taku910.github.io/mecab)
9 | ```sh
10 | WORKDIR=/path/to/your/work/dir
11 | cd $WORKDIR # move to the working directory
12 | cp /path/to/your/download/dir/mecab-ipadic-2.7.0-XXXX.tar.gz $WORKDIR
13 | tar zxfv mecab-ipadic-2.7.0-XXXX.tar.gz
14 | ```
15 | By running `ls mecab-ipadic-2.7.0-XXXX`, you will find many CSV files and configuration files
16 | in the directory.
17 | We convert the encoding of these dictionary files from EUC-JP to UTF-8.
18 | If your system has the `nkf` command,
19 | ```sh
20 | find ./mecab-ipadic-2.7.0-* -type f -name "*.csv" | xargs -I{} nkf -w --overwrite {}
21 | ```
22 | Otherwise, you can use docker.
23 | ```sh
24 | find ./mecab-ipadic-2.7.0-* -type f -name "*.csv" | xargs -I{} \
25 |   docker run --rm -v $(pwd):/root/workspace tdmelodic:latest nkf -w --overwrite {}
26 | ```
27 |
28 | ### Download NEologd
29 | Also, download the NEologd dictionary as follows.
30 |
31 | ```sh
32 | cd $WORKDIR # move to the working directory
33 | git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd/
34 | ```
35 |
36 | Then, extract the CSV files of the NEologd dictionary using the `unxz` command.
37 | If your system has the unxz command,
38 | ```sh
39 | find ./mecab-ipadic-neologd/seed/ -type f -name "*.xz" | xargs -I{} unxz -k {}
40 | ```
41 | Otherwise,
42 | ```sh
43 | find ./mecab-ipadic-neologd/seed/ -type f -name "*.xz" | xargs -I{} \
44 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest unxz -k {}
45 | ```
46 | Thus many CSV files will be created at `./mecab-ipadic-neologd/seed/`.
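
As an optional sanity check, listing the seed directory should now show the extracted CSV files:
```sh
ls ./mecab-ipadic-neologd/seed/*.csv
```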
47 |
48 | ## Inference
49 |
50 | _WARNING! THIS TAKES MUCH TIME!_
51 |
52 | Now let us generate the accent dictionary.
53 | This estimates the accent of the words listed in the NEologd dictionary
54 | by a machine-learning-based technique.
55 |
56 | ### IPADIC
57 | ```sh
58 | find ./mecab-ipadic-2.7.0-*/ -type f -name "*.csv" | xargs -I{} \
59 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
60 | tdmelodic-convert -m ipadic --input {} --output {}.accent
61 | ```
62 | Or, the following commands will also work.
63 | ```sh
64 | cat ./mecab-ipadic-2.7.0-*/*.csv > ipadic_all.csv
65 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
66 | tdmelodic-convert -m ipadic \
67 | --input ipadic_all.csv \
68 | --output ipadic_all.csv.accent
69 | ```
70 |
71 | ### NEologd
72 | Use the preprocessor if necessary (try `-h` to show the preprocessing options).
73 | ```sh
74 | find ./mecab-ipadic-neologd/seed/ -type f -name "*.csv" | xargs -I{} \
75 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
76 | tdmelodic-neologd-preprocess -m ipadic --input {} --output {}.preprocessed
77 | ```
78 | Then,
79 | ```sh
80 | find ./mecab-ipadic-neologd/seed/ -type f -name "*.csv" | xargs -I{} \
81 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
82 | tdmelodic-convert -m ipadic --input {}.preprocessed --output {}.accent
83 | ```
84 |
85 | Thus we obtain dictionary files `*.csv.accent` with the accent information added.
86 |
87 | Alternatively, the following commands will also work.
88 | ```sh
89 | cat ./mecab-ipadic-neologd/seed/*.csv > neologd_all.csv
90 |
91 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
92 | tdmelodic-neologd-preprocess -m ipadic \
93 | --input neologd_all.csv \
94 | --output neologd_all.csv.preprocessed
95 |
96 | docker run --rm -v $(pwd):/root/workspace tdmelodic:latest \
97 | tdmelodic-convert -m ipadic \
98 | --input neologd_all.csv.preprocessed \
99 | --output neologd_all.csv.accent
100 | ```
101 |
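Either way, it may be worth spot-checking the result: the yomi field of each entry in the
generated `*.csv.accent` files should now carry the `[` and `]` accent marks. For example
(assuming the concatenated output of the commands above):
```sh
head -n 3 neologd_all.csv.accent
```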
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/kansuji.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import re
10 | import sys
11 | """
12 | Convert strings of Arabic numerals into kanji numerals.
13 | """
14 |
15 | digit2kanji = {
16 | '0':'零',
17 | '1':'一',
18 | '2':'二',
19 | '3':'三',
20 | '4':'四',
21 | '5':'五',
22 | '6':'六',
23 | '7':'七',
24 | '8':'八',
25 | '9':'九'
26 | }
27 | unit_1 = ['', '十', '百', '千']
28 | unit_2 = ['', '万', '億', '兆',
29 | '京', '垓', '𥝱', '穣',
30 | '溝', '澗', '正', '載',
31 | '極', '恒河沙', '阿僧祇',
32 | '那由他', '不可思議', '無量大数']
33 | unit = sum([[u1 + u2 if u1 == '' else u1 for u1 in unit_1] for u2 in unit_2], [])  # flatten to ['', '十', '百', '千', '万', '十', '百', '千', '億', ...]: one unit per digit position
34 |
35 | def split_4(lst):
36 |     return ["".join(lst[n:n+4]) for n in range(0, len(lst), 4)]  # group the digit list into chunks of four
37 |
38 | def _case_straightforward(num_str):
39 | """ simply replace """
40 | kanji = "".join([digit2kanji[n] for n in num_str])
41 | return kanji
42 |
43 | def _case_int(num_str):
44 | if num_str in ['0', '']:
45 | return digit2kanji['0']
46 | else:
47 | lst = split_4(list(reversed(num_str)))
48 | fourdigits = ["".join(reversed([
49 | '' if r == '0' else
50 | unit_1[i] if r == '1' and (i == 1 or i == 2) else
51 | digit2kanji[r] + unit_1[i]
52 | for i, r in enumerate(l)]
53 | )) for l in lst]
54 | tmp = "".join(reversed([
55 | '' if l == '' else
56 | l + unit_2[i]
57 | for i, l in enumerate(fourdigits)]
58 | ))
59 |
60 |         # rewrite "一千" as "千" unless it is followed by a large-number unit
61 | ret = re.sub('一千(?![{}])'.format("".join(unit_2)), '千', tmp)
62 | return ret
63 |
64 | def _case_float(num_str):
65 |     # decimal numbers
66 | sep = num_str.split(".")
67 |
68 |     # integer part
69 | i_str = sep[0]
70 | i_kanji = _case_int(i_str)
71 |
72 |     # fractional part
73 | if len(sep) >= 2 and sep[1] != '':
74 | d_str = sep[1:]
75 | d_kanji = "点".join([_case_straightforward(d) for d in d_str])
76 | num_kanji = i_kanji + "点" + d_kanji
77 | else:
78 | num_kanji = i_kanji
79 |
80 | return num_kanji
81 |
82 | def num2kansuji(num_str, mode='digit'):
83 | '''
84 |     Convert a numeric string to kanji numerals.
85 |
86 |     mode='digit': convert using positional units (e.g. '123' -> '百二十三').
87 |     mode='replace': simply replace each digit (e.g. '123' -> '一二三').
88 | '''
89 |
90 | if len(num_str) > 72:
91 |         print("[ error ] a number longer than 72 digits was given; returning it unchanged.", file=sys.stderr)
92 | return num_str
93 |
94 |     num_str = num_str.replace(",", '')  # strip thousands-separator commas
95 |
96 | if mode == 'digit':
97 | return _case_float(num_str)
98 | elif mode == 'replace':
99 | return _case_straightforward(num_str)
100 | else:
101 | # default
102 | return _case_float(num_str)
103 |
104 | def numeric2kanji(text_orig):
105 |     r = re.compile(r"[0-90-9.]+", flags=0)  # half/full-width digits and a (tentative) decimal point
106 | digits = r.findall(text_orig)
107 | split = r.split(text_orig)
108 |
109 | # convert
110 | digits_ = []
111 | for d in digits:
112 |         # convert full-width digits to half-width
113 | d = d.replace("1","1").\
114 | replace("2","2").\
115 | replace("3","3").\
116 | replace("4","4").\
117 | replace("5","5").\
118 | replace("6","6").\
119 | replace("7","7").\
120 | replace("8","8").\
121 | replace("9","9").\
122 | replace("0","0")
123 | d = num2kansuji(d)
124 | digits_.append(d)
125 | digits = digits_
126 |
127 | # join
128 | l = max(len(digits), len(split))
129 |     digits = (digits + [""] * 3)[:l]  # pad: re.split yields at most one more piece than findall
130 | split = (split + [""] * 3)[:l]
131 |
132 | converted = "".join([t + str(d) for t, d in zip(split, digits)])
133 | return converted
134 |
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/mecab/unidic.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import os, sys
10 | import subprocess
11 | import MeCab
12 | import Levenshtein
13 | import numpy as np
14 |
15 | class Singleton:
16 | """ Singleton pattern """
17 | _instance = None
18 | def __new__(cls):
19 | if cls._instance is None:
20 | cls._instance = super().__new__(cls)
21 | cls._instance.is_initialized = False
22 | return cls._instance
23 |
24 |     @property
25 |     def singleton_initialized(self):
26 |         return self.is_initialized
27 |
28 | @singleton_initialized.setter
29 | def singleton_initialized(self, boolean):
30 |         assert boolean is True
31 | self.is_initialized = boolean
32 |
33 |
34 | def get_mecab_default_path():
35 | out = subprocess.Popen(['mecab-config', '--dicdir'],
36 | stdout=subprocess.PIPE,
37 | stderr=subprocess.STDOUT)
38 | stdout_, stderr_ = out.communicate()
39 | mecab_default_dir = stdout_.decode('utf-8').strip()
40 | return mecab_default_dir
41 |
42 | mapping=["surface",
43 | "pron",
44 | "kana",
45 | "pos",
46 | "goshu",
47 | "acc",
48 | "concat",
49 | "cost_uni",
50 | "cost_bi"] + list(range(100))
51 |
52 | class UniDic(Singleton):
53 | def __init__(self,
54 | unidic_path = get_mecab_default_path() + "/unidic",
55 | mecabrc_path = os.path.dirname(os.path.abspath(__file__)) + "/my_mecabrc",
56 | verbose = False
57 | ):
58 | if self.singleton_initialized:
59 | return
60 | else:
61 | self.singleton_initialized = True
62 |
63 | self.unidic_path = unidic_path
64 | self.mecabrc_path = mecabrc_path
65 | if verbose:
66 | print("ℹ️ [ MeCab setting ] unidic=\'{}\'".format(self.unidic_path), file=sys.stderr)
67 | print("ℹ️ [ MeCab setting ] mecabrc=\'{}\'".format(self.mecabrc_path), file=sys.stderr)
68 |
69 | self.__init_mecab()
70 |
71 | def __init_mecab(self):
72 | self.unidic_acc = MeCab.Tagger(
73 | "-d {dic} -r {rc} -Oacc" .format(
74 | dic=self.unidic_path, rc=self.mecabrc_path))
75 |
76 | def __parse(self, text, nbest=1, sep1='\t', sep2='\n'):
77 | parsed = self.unidic_acc.parseNBest(nbest, text)
78 | nbest = parsed.split("EOS\n")[:-1] # remove the last entry
79 | ret = [
80 | [
81 | {
82 | mapping[i] : c
83 | for i, c in enumerate(list(l.split(sep1)))
84 | }
85 | for l in c.split(sep2)[:-1]
86 | ]
87 | for c in nbest
88 | ]
89 | return ret
90 |
91 | def get_n_best(self, text, kana_ref, nbest=20):
92 | '''
93 | during inference, only the top 1 result is used. see data_loader.py
94 | '''
95 | p = self.__parse(text, nbest=nbest)
96 | kanas = ["".join([e["pron"] for e in p_]) for p_ in p]
97 | dist = [Levenshtein.distance(k, kana_ref) for k in kanas]
98 |
99 | rank = [i for i, v in sorted(enumerate(dist), key=lambda v: v[1])]
100 |
101 |         # rank = rank[0:3] if len(rank) >= 3 else rank # keep the top 3 candidates
102 |         # rank = rank[0:5] if len(rank) >= 5 else rank # keep the top 5 candidates
103 |         rank = rank[0:10] if len(rank) >= 10 else rank # keep the top 10 candidates
104 |
105 | ld = dist[rank[0]]
106 | return p, rank, ld
107 |
108 | def get_yomi(self, surface):
109 | words = self.unidic_acc.parse(surface)
110 | parsed = [word.split("\t") for word in words.split("\n")]
111 | yomis = [entry[1] for entry in parsed if len(entry) > 1]
112 | return "".join(yomis)
113 |
--------------------------------------------------------------------------------
/tdmelodic/nn/convert_dic.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 | import os
11 | import csv
12 |
13 | import numpy as np
14 | import argparse
15 | from tqdm import tqdm
16 |
17 | import chainer
18 | from chainer.training import extensions
19 | from chainer.dataset import convert
20 |
21 | from .net import Net
22 | from .loader.data_loader import NeologdDictionaryLoader
23 | from .lang.japanese.kana.mora_sep import sep_katakana2mora
24 | from .inference import InferAccent
25 |
26 | from ..util.dic_index_map import get_dictionary_index_map
27 |
28 | # hyper params
29 | gpu_id = -1
30 | bs = 64
31 |
32 | def apply_all(
33 | test_csv, output_csv, up_symbol="[", down_symbol="]", mode="unidic"
34 | ):
35 | index_map = get_dictionary_index_map(mode)
36 |
37 | test_dat = NeologdDictionaryLoader(
38 | test_csv, infer_mode=True, index_map=index_map, store_entire_line=False
39 | )
40 |
41 | test_iter = chainer.iterators.SerialIterator(
42 | test_dat, bs, repeat=False, shuffle=False
43 | )
44 |
45 | model = InferAccent()
46 | with open(output_csv, "w") as ofs:
47 | csv_out = csv.writer(ofs)
48 | for batch_ in tqdm(test_iter, total=len(test_dat) // bs):
49 | batch = [a for a, b in batch_]
50 | orig_info = [b for a, b in batch_]
51 |
52 | batch = chainer.dataset.convert.concat_examples(
53 | batch, device=gpu_id, padding=0
54 | )
55 | X = batch[:-1]
56 | y_truth = batch[-1] # Ground Truth
57 |
58 | # X : (S_vow_np, S_con_np, S_pos_np, S_acc_np, S_acccon_np, S_gosh_np, Y_vow_np, Y_con_np)
59 | # X_s : (S_vow_np, S_con_np, S_pos_np, S_acc_np, S_acccon_np, S_gosh_np)
60 | # X_y : (Y_vow_np, Y_con_np)
61 | X_s = X[:-2]
62 | X_y = X[-2:]
63 | y_dummy_GT = X_y[0] * 0 # dummy data
64 |
65 | # infer
66 | a_est = model.infer(X_s, X_y, y_dummy_GT)
67 | a_est = a_est.tolist()
68 | a_est = np.asarray(a_est).astype(np.int32)
69 |
70 | # postprocessing
71 | def proc(b, orig_info, a_est):
72 | idx, kanji, yomi, orig_entry = orig_info[b]
73 | A = a_est[b].tolist()
74 | A = A[: len(yomi)]
75 | y_ = [
76 | y + (up_symbol if a_ == 2 else down_symbol if a_ == 0 else "")
77 | for y, a_ in zip(sep_katakana2mora(yomi), A)
78 | ]
79 | y_ = "".join(y_)
80 | orig_entry[index_map["YOMI"]] = y_
81 | if mode == "unidic":
82 | orig_entry[index_map["ACCENT"]] = "@"
83 | return orig_entry
84 |
85 | for i in range(len(batch_)):
86 | line = proc(i, orig_info, a_est)
87 | csv_out.writerow(line)
88 |
89 | # =============================================================================================
90 | def main():
91 | parser = argparse.ArgumentParser()
92 | parser.add_argument(
93 |         "-i", "--input", type=str, help="input csv (NEologd dictionary file)"
94 | )
95 | parser.add_argument("-o", "--output", type=str, help="output csv")
96 | parser.add_argument(
97 | "-m",
98 | "--mode",
99 | type=str,
100 | help="dictionary format type",
101 | choices=["unidic", "ipadic"],
102 | default="unidic",
103 | )
104 | args = parser.parse_args()
105 |
106 | if args.input == args.output:
107 |         print("[ Error ] input and output files should be different.", file=sys.stderr)
108 | else:
109 | try:
110 | apply_all(
111 | test_csv=args.input,
112 | output_csv=args.output,
113 | mode=args.mode,
114 | )
115 | except Exception as e:
116 |             print(e, file=sys.stderr)
117 |
118 | if __name__ == '__main__':
119 | main()
--------------------------------------------------------------------------------
/docs/locale/ja/LC_MESSAGES/pages/onebyone.po:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) 2019-, Hideyuki Tachibana, PKSHA Technology Inc
3 | # This file is distributed under the same license as the tdmelodic package.
4 | # Hideyuki Tachibana, 2021.
5 | #
6 | #, fuzzy
7 | msgid ""
8 | msgstr ""
9 | "Project-Id-Version: tdmelodic \n"
10 | "Report-Msgid-Bugs-To: \n"
11 | "POT-Creation-Date: 2022-06-20 16:12+0900\n"
12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
13 | "Last-Translator: Hideyuki Tachibana \n"
14 | "Language-Team: \n"
15 | "MIME-Version: 1.0\n"
16 | "Content-Type: text/plain; charset=utf-8\n"
17 | "Content-Transfer-Encoding: 8bit\n"
18 | "Generated-By: Babel 2.9.1\n"
19 |
20 | #: ../../pages/onebyone.rst:3
21 | msgid "One-by-one Manual Inference Mode"
22 | msgstr "一件ずつ推論するモード"
23 |
24 | #: ../../pages/onebyone.rst:5
25 | msgid ""
26 | "In some cases, you may want to estimate the accent for one word, rather "
27 | "than the entire dictionary at once. This page introduces the tools for "
28 | "this purpose."
29 | msgstr "辞書全体を一度に推論するのではなく、個々の単語について一つずつアクセント推定したいケースもあるでしょう。このページではそのためのツールを紹介します。"
30 |
31 | #: ../../pages/onebyone.rst:10
32 | msgid "s2ya: Surface -> Yomi & Accent"
33 | msgstr "s2ya: 表層形 → 読み&アクセント"
34 |
35 | #: ../../pages/onebyone.rst:12
36 | msgid ""
37 | "``s2ya`` estimates the reading (``yomi``) and accent of a word from its "
38 | "orthographic form (``surface``). For yomi, it uses the best estimates "
39 | "from MeCab and UniDic."
40 | msgstr ""
41 | "``s2ya`` は、単語の標準的な表記法(表層形)から、その読みとアクセントを推定します。読みの推定には、MeCab と UniDic "
42 | "による最適解を利用します。"
43 |
44 | #: ../../pages/onebyone.rst:15
45 | msgid "Input: Orthographic (surface) form, such as kanji"
46 | msgstr "入力:漢字などによる、標準的な表記法(表層形)"
47 |
48 | #: ../../pages/onebyone.rst:16
49 | msgid "Output: Reading (yomi) and Accent"
50 | msgstr "出力:読みとアクセント"
51 |
52 | #: ../../pages/onebyone.rst:22
53 | msgid "Then you will have the following result."
54 | msgstr "すると以下のような結果が得られます。"
55 |
56 | #: ../../pages/onebyone.rst:28
57 | msgid "It is convenient to define an alias command as follows."
58 | msgstr "以下のようなエイリアスを利用すると便利です。"
59 |
60 | #: ../../pages/onebyone.rst:34
61 | msgid "Using this, try other examples."
62 | msgstr "このコマンドを使って、他の単語も試してみてください。"
63 |
64 | #: ../../pages/onebyone.rst:53
65 | msgid "It also predicts the accents of sentences."
66 | msgstr "同様にして、文のアクセントも推定できます。"
67 |
68 | #: ../../pages/onebyone.rst:73
69 | msgid ""
70 | "Although tdmelodic is formally capable of predicting sentence accents as "
71 | "described above, and a small amount of sentences are used in the training"
72 | " data, it has not been trained to perform this task. Therefore, this "
73 | "accent estimation should be considered only as a reference."
74 | msgstr ""
75 | "上で見たように、 tdmelodic は形式的には文のアクセント予測が可能であり、"
76 | "また学習データにも少量の文章データを使用しています。"
77 | "しかし、tdmelodic はもともと文のアクセント推定を主眼として設計・学習したものではありません。"
78 | "このため、この方法によって推定された文のアクセントはあくまで参考程度のものです。"
79 |
80 | #: ../../pages/onebyone.rst:79
81 | msgid ""
82 | "The yomi prediction of ``s2ya`` is based on the UniDic lexicon. This is "
83 | "because the docker image contains only the UniDic dictionary. If you "
84 | "prefer using other yomi prediction modules such as Neologd, please use "
85 | "the ``sy2a`` module below."
86 | msgstr ""
87 | "``s2ya`` の読み予測にはUniDic辞書を使用します。これは、docker イメージの中に入っている辞書が UniDic "
88 | "のみであるためです。もし Neologd など他の辞書を使用して読み推定をしたい場合は、次の ``sy2a`` をご利用ください。"
89 |
90 | #: ../../pages/onebyone.rst:86
91 | msgid "sy2a: Surface & Yomi -> Accent"
92 | msgstr "sy2a: 表層形&読み → アクセント"
93 |
94 | #: ../../pages/onebyone.rst:88
95 | msgid ""
96 | "``sy2a`` estimates the accent of a word from its orthographic form "
97 | "(``surface``) and the reading (``yomi``)."
98 | msgstr "``sy2a`` は、単語の表層形と読みから、アクセントを推定するツールです。"
99 |
100 | #: ../../pages/onebyone.rst:90
101 | msgid "Input: Orthographic (surface) form, such as kanji, and reading (yomi)."
102 | msgstr "入力:表層形(漢字など)と読み"
103 |
104 | #: ../../pages/onebyone.rst:91
105 | msgid "Output: Accent"
106 | msgstr "出力:アクセント"
107 |
108 | #: ../../pages/onebyone.rst:93
109 | msgid "For example,"
110 | msgstr "例えば以下のように使用します。"
111 |
112 | #: ../../pages/onebyone.rst:101
113 | msgid "Try other examples."
114 | msgstr "他の例"
115 |
116 | #: ../../pages/onebyone.rst:114
117 | msgid "It can also predict the accents of sentences."
118 | msgstr "また、文のアクセントも同様に推定できます。"
119 |
120 | #: ../../pages/onebyone.rst:128
121 | msgid ""
122 | "If you want to predict the yomi of a given sentence using an advanced "
123 | "dictionary such as Neologd, the following command may be helpful."
124 | msgstr "Neologd などの先進的な辞書を使用して読み推定を行いたい場合は、一例として以下のようなコマンドで読みが推定できます。"
125 |
--------------------------------------------------------------------------------
/docs/locale/ja/LC_MESSAGES/pages/ipadic-dicgen.po:
--------------------------------------------------------------------------------
1 | # SOME DESCRIPTIVE TITLE.
2 | # Copyright (C) 2019-, Hideyuki Tachibana, PKSHA Technology Inc
3 | # This file is distributed under the same license as the tdmelodic package.
4 | # Hideyuki Tachibana, 2021.
5 | #
6 | #, fuzzy
7 | msgid ""
8 | msgstr ""
9 | "Project-Id-Version: tdmelodic \n"
10 | "Report-Msgid-Bugs-To: \n"
11 | "POT-Creation-Date: 2021-08-20 18:55+0900\n"
12 | "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
13 | "Last-Translator: Hideyuki Tachibana \n"
14 | "Language-Team: \n"
15 | "MIME-Version: 1.0\n"
16 | "Content-Type: text/plain; charset=utf-8\n"
17 | "Content-Transfer-Encoding: 8bit\n"
18 | "Generated-By: Babel 2.9.1\n"
19 |
20 | #: ../../pages/ipadic-dicgen.md:1
21 | msgid "Dictionary Generation for IPADIC users"
22 | msgstr "IPADIC ユーザー向け辞書生成"
23 |
24 | #: ../../pages/ipadic-dicgen.md:3
25 | msgid "WARNING: _This section takes several hours or days._"
26 | msgstr "注意:_この作業は数時間から数日かかる可能性があります。_"
27 |
28 | #: ../../pages/ipadic-dicgen.md:5
29 | msgid "Prepare the base dictionary"
30 | msgstr "ベースになる辞書の準備"
31 |
32 | #: ../../pages/ipadic-dicgen.md:6
33 | msgid "Download IPADIC"
34 | msgstr "IPADIC のダウンロード"
35 |
36 | #: ../../pages/ipadic-dicgen.md:8
37 | msgid ""
38 | "First, download IPADIC manually from "
39 | "[https://taku910.github.io/mecab](https://taku910.github.io/mecab)"
40 | msgstr ""
41 | "はじめに、[https://taku910.github.io/mecab](https://taku910.github.io/mecab) "
42 | "からIPADICをダウンロードしてください。"
43 |
44 | #: ../../pages/ipadic-dicgen.md:15
45 | msgid ""
46 | "By running `ls mecab-ipadic-2.7.0-XXXX`, you will find many CSV files and "
47 | "configuration files in the directory. We convert the encoding of these "
48 | "dictionary files from EUC-JP to UTF-8. If your system has the `nkf` command,"
49 | msgstr ""
50 | "ダウンロードした圧縮ファイルを展開して得られたディレクトリの中を `ls` などのコマンドで見てみると、多数の CSV "
51 | "ファイルと設定ファイルがあるのが確認できます。これらの辞書ファイルの文字コードは EUC-JP になっていますが、これを以下のコマンドにより "
52 | "UTF-8 に変換します。もし `nkf` コマンドがある場合は以下のコマンドを実行してください。"
53 |
54 | #: ../../pages/ipadic-dicgen.md:22
55 | msgid "Otherwise, you can use docker."
56 | msgstr "もし `nkf` がない場合は Docker を使って以下のようなコマンドで変換できます。"
57 |
58 | #: ../../pages/ipadic-dicgen.md:28
59 | msgid "Download NEologd"
60 | msgstr "NEologd のダウンロード"
61 |
62 | #: ../../pages/ipadic-dicgen.md:29
63 | msgid "Also, download the NEologd dictionary as follows."
64 | msgstr "同様に、NEologdの辞書ファイルをダウンロードしてください。"
65 |
66 | #: ../../pages/ipadic-dicgen.md:36
67 | msgid ""
68 | "Then, extract the CSV files of the NEologd dictionary using the `unxz` command. If"
69 | " your system has the unxz command,"
70 | msgstr ""
71 | "次に `unxz` コマンドで、NEologd の単語リスト(CSVファイル)を抽出します。もし `unxz` "
72 | "コマンドがインストールされている場合は以下のようなコマンドを実行してください。"
73 |
74 | #: ../../pages/ipadic-dicgen.md:41
75 | msgid "Otherwise,"
76 | msgstr "`unxz`がない場合は、dockerを使用して以下のコマンドで同様のことができます。"
77 |
78 | #: ../../pages/ipadic-dicgen.md:46
79 | msgid "Thus many CSV files will be created at `./mecab-ipadic-neologd/seed/`."
80 | msgstr "この処理により、多数のCSVファイルが `./mecab-ipadic-neologd/seed/` 配下に生成されます。"
81 |
82 | #: ../../pages/ipadic-dicgen.md:48
83 | msgid "Inference"
84 | msgstr "推論"
85 |
86 | #: ../../pages/ipadic-dicgen.md:50
87 | msgid "_WARNING! THIS TAKES MUCH TIME!_"
88 | msgstr "注意: _この処理はかなりの時間がかかります!_"
89 |
90 | #: ../../pages/ipadic-dicgen.md:52
91 | msgid ""
92 | "Now let us generate the accent dictionary. This estimates the accent of "
93 | "the words listed in the NEologd dictionary by a machine-learning-based "
94 | "technique."
95 | msgstr ""
96 | "では、アクセント辞書を生成しましょう。ここでは、NEologd "
97 | "の辞書ファイルに掲載されている全ての単語について、機械学習ベースの手法により、アクセント情報を推定します。以下のコマンドにより、各単語にアクセント情報を付与した新しい辞書が生成されます。"
98 |
99 | #: ../../pages/ipadic-dicgen.md:56
100 | msgid "IPADIC"
101 | msgstr "IPADIC"
102 |
103 | #: ../../pages/ipadic-dicgen.md:62
104 | msgid "Or, the following commands will also work."
105 | msgstr "もしくは、以下のコマンドでも同様に辞書を生成できます。"
106 |
107 | #: ../../pages/ipadic-dicgen.md:71
108 | msgid "NEologd"
109 | msgstr "NEologd"
110 |
111 | #: ../../pages/ipadic-dicgen.md:72
112 | msgid "Use the preprocessor if necessary (try `-h` to show the preprocessing options)."
113 | msgstr "まず、もし必要であれば以下のコマンドで前処理を行ってください。"
114 | "(`-h` により前処理のオプションを表示できます。)"
115 |
116 | #: ../../pages/ipadic-dicgen.md:78
117 | msgid "Then,"
118 | msgstr "次に、以下のコマンドによりアクセントを推定します。"
119 |
120 | #: ../../pages/ipadic-dicgen.md:85
121 | msgid ""
122 | "Thus we obtain dictionary files `*.csv.accent` with the accent "
123 | "information added."
124 | msgstr "以上の作業により、アクセント情報が付与された辞書ファイル `*.csv.accent` が得られます。"
125 |
126 | #: ../../pages/ipadic-dicgen.md:87
127 | msgid "Alternatively, the following commands will also work."
128 | msgstr "もしくは、以下のようなコマンドでも辞書ファイルが得られます。"
129 |
--------------------------------------------------------------------------------
/docs/pages/onebyone.rst:
--------------------------------------------------------------------------------
1 | ================================
2 | One-by-one Manual Inference Mode
3 | ================================
4 |
5 | In some cases, you may want to estimate the accent for one word,
6 | rather than the entire dictionary at once.
7 | This page introduces the tools for this purpose.
8 |
9 | s2ya: Surface -> Yomi & Accent
10 | ==============================
11 |
12 | ``s2ya`` estimates the reading (``yomi``) and accent of a word from its orthographic form (``surface``).
13 | For yomi, it uses the best estimates from MeCab and UniDic.
14 |
15 | - Input: Orthographic (surface) form, such as kanji
16 | - Output: Reading (yomi) and Accent
17 |
18 | .. code-block:: console
19 |
20 | $ echo 機械学習 | docker run tdmelodic:latest tdmelodic-s2ya
21 |
22 | Then you will have the following result.
23 |
24 | .. code-block:: console
25 |
26 | キ[カイガ]クシュー
27 |
28 | It is convenient to define an alias command as follows.
29 |
30 | .. code-block:: console
31 |
32 | $ alias tdmelodic-s2ya="docker run tdmelodic:latest tdmelodic-s2ya"
33 |
34 | Using this, try other examples.
35 |
36 | .. code-block:: console
37 |
38 | $ echo 深層学習 | tdmelodic-s2ya
39 | シ[ンソーガ]クシュー
40 |
41 | $ echo 確率微分方程式 | tdmelodic-s2ya
42 | カ[クリツビブンホーテ]ーシキ
43 |
44 | $ echo 電験一種 | tdmelodic-s2ya
45 | デ[ンケンイ]ッシュ
46 |
47 | $ echo マルクス・アウレリウス・アントニヌス | tdmelodic-s2ya
48 | マ[ルクスアウレリウスアントニ]ヌス
49 |
50 | $ echo IoT | tdmelodic-s2ya
51 | ア[イオーティ]ー
52 |
53 | It also predicts the accents of sentences.
54 |
55 | .. code-block:: console
56 |
57 | $ echo 今日の東京の天気は晴れ | tdmelodic-s2ya
58 | キョ]ーノト[ーキョーノテ]ンキワハレ
59 |
60 | $ echo 漢字の音読みには主に呉音と漢音があり、漢音の方が新しい。 | tdmelodic-s2ya
61 | カ[ンジノオンヨミニ]ワオ]モニゴ[オントカ]ンオンガアリ[カ]ンオンノホ]ーガアタラシ]ー
62 |
63 | $ echo 現在、西新宿ジャンクションから談合坂サービスエリアまで、およそ四十五分 | tdmelodic-s2ya
64 | ゲ]ンザイニ[シシンジュクジャ]ンクションカラダ[ンゴーサカサ[ービスエ]リアマ]デ]オ[ヨソヨ]ンジュー[ゴ]フン
65 |
66 | $ echo 完備なノルム空間をバナッハ空間といい、完備な内積空間をヒルベルト空間という。 | tdmelodic-s2ya
67 | カ]ンビナノ[ルムク]ーカンオバ[ナッハク]ーカントイーカ]ンビナナ[イセキク]ーカンオヒ[ルベルトク]ーカントイウ
68 |
69 | $ echo 権利の行使及び義務の履行は、信義に従い誠実に行わなければならない。 | tdmelodic-s2ya
70 | ケ]ンリノコ]ーシオヨビギ]ムノリコーワ[シ]ンギニシ[タガイセージツニオコナワナ]ケレ]バナラナイ
71 |
72 | .. warning::
73 | Although tdmelodic is formally capable of predicting sentence accents as described above,
74 |     and a small number of sentences were used in the training data,
75 | it has not been trained to perform this task.
76 | Therefore, this accent estimation should be considered only as a reference.
77 |
78 |
79 | .. note::
80 | The yomi prediction of ``s2ya`` is based on the UniDic lexicon.
81 | This is because the docker image contains only the UniDic dictionary.
82 | If you prefer using other yomi prediction modules such as Neologd,
83 | please use the ``sy2a`` module below.
84 |
85 | sy2a: Surface & Yomi -> Accent
86 | ==============================
87 |
88 | ``sy2a`` estimates the accent of a word from its orthographic form (``surface``) and the reading (``yomi``).
89 |
90 | - Input: Orthographic (surface) form, such as kanji, and reading (yomi).
91 | - Output: Accent
92 |
93 | For example,
94 |
95 | .. code-block:: console
96 |
97 |     $ alias tdmelodic-sy2a="docker run tdmelodic:latest tdmelodic-sy2a"
98 | $ echo 機械学習,きかいがくしゅー | tdmelodic-sy2a
99 | キ[カイガ]クシュー
100 |
101 | Try other examples.
102 |
103 | .. code-block:: console
104 |
105 | $ echo 日本語アクセント,にほんごあくせんと | tdmelodic-sy2a
106 | ニ[ホンゴア]クセント
107 |
108 | $ echo 御御御付け,おみおつけ | tdmelodic-sy2a
109 | オ[ミオ]ツケ
110 |
111 | $ echo 談合坂SA,だんごーざかさーびすえりあ | tdmelodic-sy2a
112 | ダ[ンゴーザカサービスエ]リア
113 |
114 | It can also predict the accents of sentences.
115 |
116 | .. code-block:: console
117 |
118 | $ echo Wifiに接続できません,わいふぁいにせつぞくできません | tdmelodic-sy2a
119 | ワ[イファイニセ[ツゾクデキマセ]ン
120 |
121 | $ echo 国立市の国立大学,くにたちしのこくりつだいがく | tdmelodic-sy2a
122 | ク[ニタチ]シノコ[クリツダ]イガク
123 |
124 | $ echo 漢音は、当時の唐の都、長安の音を持ち帰ったものである。,かんおんわとーじのとーのみやこちょーあんのおとおもちかえったものである | tdmelodic-sy2a
125 | カ]ンオンワ[ト]ージノト]ーノミ[ヤコ[チョ]ーアンノオ[ト]オモ[チカエッタモノ]デア]ル
126 |
127 | .. note::
128 | If you want to predict the yomi of a given sentence using an advanced
129 | dictionary such as Neologd, the following command may be helpful.
130 |
131 |
132 | .. code-block:: console
133 |
134 | $ TEXT=ラグビー日本代表の試合を見に飛田給に
135 |
136 | $ YOMI=`echo $TEXT \
137 | $ | mecab -d \`mecab-config --dicdir\`/mecab-unidic-neologd/ \
138 | $ | sed -e "/^EOS/d" | cut -f 2 | perl -pe 's/\s+//g'`
139 |
140 | $ # An alternative approach:
141 | $ YOMI=`echo $TEXT | mecab -Oyomi -d \`mecab-config --dicdir\`/mecab-ipadic-neologd/`
142 |
143 | $ # check the result.
144 | $ echo $YOMI
145 | ラグビーニホンダイヒョーノシアイオミニトビタキューニ
146 |
147 | $ # accent prediction.
148 | $ echo $TEXT,$YOMI | tdmelodic-sy2a
149 | ラ[グビーニホンダ]イヒョーノシ[アイオミ]ニトビタキュ]ーニ
--------------------------------------------------------------------------------
/tdmelodic/filters/neologd_preprocess.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | # -*- coding: utf-8 -*-
10 | import sys
11 | import os
12 | import argparse
13 | import tempfile
14 |
15 | from .neologd_patch import NeologdPatch
16 | from .neologd_rmdups import rmdups
17 |
18 | class Preprocess(object):
19 | def __init__(self, flag_rmdups, neologd_patch, dictionary_type="unidic"):
20 | self.flag_rmdups = flag_rmdups
21 | self.neologd_patch_module = neologd_patch
22 | self.dictionary_type = dictionary_type
23 |
24 | def do_rmdups(self, fp_in):
25 | fp_tmp = tempfile.NamedTemporaryFile("w+")
26 | print("📌 creating a temporary file", fp_tmp.name, file=sys.stderr)
27 | rmdups(fp_in, fp_tmp, self.dictionary_type)
28 | fp_tmp.seek(0)
29 |         fp_in.close()  # close explicitly (CPython's GC would eventually close it anyway)
30 | fp_in = fp_tmp
31 | return fp_in
32 |
33 | def do_neologd_patch(self, fp_in):
34 | fp_tmp = tempfile.NamedTemporaryFile("w+")
35 | print("📌 creating a temporary file", fp_tmp.name, file=sys.stderr)
36 | self.neologd_patch_module(fp_in, fp_tmp)
37 | fp_tmp.seek(0)
38 |         fp_in.close()  # close explicitly (CPython's GC would eventually close it anyway)
39 | fp_in = fp_tmp
40 | return fp_in
41 |
42 | def copy_temp_to_output(self, fp_in, fp_out):
43 | # output
44 | for l in fp_in:
45 | fp_out.write(l)
46 | fp_in.close()
47 | fp_out.close()
48 |
49 | def __call__(self, fp_in, fp_out):
50 | print("ℹ️ [ Info ]", file=sys.stderr)
51 |         NeologdPatch.message("| {} Duplicate entries will{}be removed.", self.flag_rmdups)
52 | if self.flag_rmdups:
53 | fp_in = self.do_rmdups(fp_in)
54 |
55 | fp_in = self.do_neologd_patch(fp_in)
56 |
57 | print("💾 [ Saving ]", file=sys.stderr)
58 | self.copy_temp_to_output(fp_in, fp_out)
59 | print("🍺 [ Done ]", file=sys.stderr)
60 |
61 | def my_add_argument(parser, option_name, default, help_):
62 |     help_ = help_ + " (default: {})".format(default)  # append the default value to the help text
63 |     if sys.version_info >= (3, 9):  # argparse.BooleanOptionalAction is available from Python 3.9
64 | parser.add_argument("--" + option_name,
65 | action=argparse.BooleanOptionalAction,
66 | default=default,
67 | help=help_)
68 | else:
69 | parser.add_argument("--" + option_name,
70 | action="store_true",
71 | default=default,
72 | help=help_)
73 | parser.add_argument("--no-" + option_name,
74 | action="store_false",
75 | dest=option_name,
76 | default=default)
77 |
78 | def main():
79 | parser = argparse.ArgumentParser()
80 | parser.add_argument(
81 | '-i', '--input',
82 | nargs='?',
83 | type=argparse.FileType("r"),
84 | default=sys.stdin,
85 |         help='input CSV file (NEologd dictionary file)')
86 | parser.add_argument(
87 | '-o', '--output',
88 | nargs='?',
89 | type=argparse.FileType("w"),
90 | default=sys.stdout,
91 | help='output CSV file ')
92 | parser.add_argument(
93 | "-m", "--mode",
94 | type=str,
95 | choices=["unidic", "ipadic"],
96 | default="unidic",
97 | help="dictionary format type ",
98 | )
99 | my_add_argument(parser, "rmdups", True, "remove duplicate entries or not")
100 | my_add_argument(parser, "rm_hashtag", True, "remove hash tags or not")
101 | my_add_argument(parser, "rm_noisy_katakana", True, "remove noisy katakana words or not")
102 | my_add_argument(parser, "rm_person", False, "remove person names or not")
103 | my_add_argument(parser, "rm_emoji", False, "remove emojis or not")
104 | my_add_argument(parser, "rm_symbol", False, "remove symbols or not")
105 | my_add_argument(parser, "rm_numeral", False, "remove numerals or not")
106 | my_add_argument(parser, "rm_wrong_yomi", True, "remove words with possibly wrong yomi or not")
107 | my_add_argument(parser, "rm_special_particle", True, "remove words with special particles \"は\" or \"へ\"")
108 | my_add_argument(parser, "cor_longvow", True, "correct long vowel errors or not")
109 | my_add_argument(parser, "cor_yomi_num", True, "correct the yomi of numerals or not")
110 | my_add_argument(parser, "normalize", False, "normalize the surface forms by applying "
111 | "NFKC Unicode normalization, "
112 | "capitalization of alphabets, "
113 | "and "
114 | "hankaku-to-zenkaku converter.")
115 |
116 | args = parser.parse_args()
117 | if args.input == args.output:
118 |         print("[ Error ] input and output files should be different.", file=sys.stderr)
119 |         sys.exit(1)  # exit with a nonzero status on error
120 | try:
121 | preprocess = Preprocess(args.rmdups, NeologdPatch(**vars(args)), dictionary_type=args.mode)
122 | preprocess(args.input, args.output)
123 | except Exception as e:
124 | print(e, file=sys.stderr)
125 |
126 | if __name__ == '__main__':
127 | main()
--------------------------------------------------------------------------------
/tdmelodic/nn/convert.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | import sys
10 | import os
11 | import csv
12 |
13 | import numpy as np
14 | # import argparse
15 | import jaconv
16 |
17 | import chainer
18 | from chainer.training import extensions
19 | from chainer.dataset import convert
20 |
21 | from tdmelodic.nn.net import Net
22 | from tdmelodic.nn.lang.mecab.unidic import UniDic
23 | from tdmelodic.nn.lang.japanese.kana.mora_sep import sep_katakana2mora
24 | from tdmelodic.nn.lang.japanese.kana.kanamap.kanamap_normal import roman_map
25 | from tdmelodic.nn.lang.japanese.accent.accent_alignment import accent_map
26 | from tdmelodic.nn.lang.category.symbol_map import char_symbol_to_numeric
27 | from tdmelodic.nn.loader.data_loader import NeologdDictionaryLoader
28 | from tdmelodic.nn.loader.data_loader import _convert_parsed_surface_to_codes
29 | from tdmelodic.nn.loader.data_loader import _convert_yomi_to_codes
30 | from tdmelodic.nn.inference import InferAccent
31 | from tdmelodic.util.dic_index_map import get_dictionary_index_map
32 |
33 | class Converter(object):
34 | # gpu_id = -1
35 | # bs = 1
36 | accent_symbol={0: "]", 1 : "", 2: "["}
37 | def __init__(self):
38 | self.model = InferAccent()
39 | self.unidic = UniDic()
40 |
41 | def encode_sy(self, surface, yomi):
42 | # analyze surface, and get the result of MeCab+UniDic
43 | lst_mecab_parsed, rank, ld = self.unidic.get_n_best(surface, yomi)
44 | mecab_parsed = lst_mecab_parsed[rank[0]]
45 |
46 | # convert to codes
47 | # codes : v_code, c_code, accent_code, pos_code, conc_code, gosh_code
48 | S_vow, S_con, S_acc, S_pos, S_acccon, S_gosh = _convert_parsed_surface_to_codes( mecab_parsed )
49 | Y_vow, Y_con = _convert_yomi_to_codes( yomi )
50 |
51 | # join
52 | S_vow = ''.join([s + ' ' for s in S_vow])
53 | S_con = ''.join([s + ' ' for s in S_con])
54 | S_acc = ''.join([s for s in S_acc])
55 | S_pos = ''.join([s + ' ' for s in S_pos])
56 | S_acccon = ''.join([s + ' ' for s in S_acccon])
57 | S_gosh = ''.join([s + ' ' for s in S_gosh])
58 | Y_vow = ''.join([s + ' ' for s in Y_vow])
59 | Y_con = ''.join([s + ' ' for s in Y_con])
60 |
61 | # adjust the length
62 | S_len = len(S_vow)
63 | Y_len = len(Y_vow)
64 | S_con = (S_con + " " * (S_len - len(S_con ))) [:S_len]
65 | S_acc = (S_acc + " " * (S_len - len(S_acc ))) [:S_len]
66 | S_pos = (S_pos + " " * (S_len - len(S_pos ))) [:S_len]
67 | S_acccon = (S_acccon + " " * (S_len - len(S_acccon))) [:S_len]
68 | S_gosh = (S_gosh + " " * (S_len - len(S_gosh ))) [:S_len]
69 | Y_vow = (Y_vow + " " * (Y_len - len(Y_vow ))) [:Y_len]
70 | Y_con = (Y_con + " " * (Y_len - len(Y_con ))) [:Y_len]
71 |
72 | # zeropad y
73 | pad = 8
74 | Y_vow = (Y_vow + "0" * pad) [:Y_len + pad]
75 | Y_con = (Y_con + "0" * pad) [:Y_len + pad]
76 |
77 | # convert to numpy array
78 | S_vow_np = np.array( [roman_map[c] for c in S_vow] , np.int32)
79 | S_con_np = np.array( [roman_map[c] for c in S_con] , np.int32)
80 | S_acc_np = np.array( [accent_map[c] for c in S_acc] , np.int32)
81 | S_pos_np = np.array( [char_symbol_to_numeric[c] for c in S_pos] , np.int32)
82 | S_acccon_np = np.array( [char_symbol_to_numeric[c] for c in S_acccon] , np.int32)
83 | S_gosh_np = np.array( [char_symbol_to_numeric[c] for c in S_gosh] , np.int32)
84 | Y_vow_np = np.array( [roman_map[c] for c in Y_vow] , np.int32)
85 | Y_con_np = np.array( [roman_map[c] for c in Y_con] , np.int32)
86 |
87 | # return encoded information
88 | S = S_vow_np, S_con_np, S_pos_np, S_acc_np, S_acccon_np, S_gosh_np
89 | Y = Y_vow_np, Y_con_np
90 | return S, Y
91 |
92 | def add_batch_dim(self, X_s, X_y):
93 | X_s = [np.expand_dims(xs, 0) for xs in X_s]
94 | X_y = [np.expand_dims(xy, 0) for xy in X_y]
95 | return X_s, X_y
96 |
97 | def infer(self, X_s, X_y):
98 | dummy = X_y[0] * 0 # dummy data
99 | accent = self.model.infer(X_s, X_y, dummy)
100 | accent = accent.tolist()
101 | accent = np.asarray(accent).astype(np.int32)
102 | return accent
103 |
104 | def zip_ya(self, yomi, accent):
105 |         # the list returned by zip is as long as the shorter of the two arguments
106 | return "".join([y + self.accent_symbol[a]
107 | for y, a in zip(sep_katakana2mora(yomi), accent)])
108 |
109 | def sy2a(self, s, y):
110 | # preprocess strings
111 | s = s.strip()
112 | y = jaconv.normalize(y, "NFKC")
113 | y = jaconv.hira2kata(y)
114 |
115 | # encode
116 | s_np, y_np = self.encode_sy(s, y)
117 | s_np, y_np = self.add_batch_dim(s_np, y_np)
118 |
119 | # inference
120 | accent = self.infer(s_np, y_np)[0]
121 | yomi_and_accent = self.zip_ya(y, accent)
122 | return yomi_and_accent
123 |
124 | def s2ya(self, s):
125 | s = jaconv.normalize(s, "NFKC").strip()
126 | y = self.unidic.get_yomi(s).strip()
127 | return self.sy2a(s, y)
128 |
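   |     # usage sketch (hypothetical input):
   |     #   conv = Converter()
   |     #   conv.s2ya("東京")  # -> the katakana yomi annotated with "[" and "]"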
129 | # =============================================================================================
130 | def main_s2ya():
131 | tdmelodic = Converter()
132 | for surface in sys.stdin:
133 | accent = tdmelodic.s2ya(surface)
134 | print(accent)
135 |
136 | def main_sy2a():
137 | tdmelodic = Converter()
138 | for line in sys.stdin:
139 | surface, yomi = line.strip().split(",")
140 | accent = tdmelodic.sy2a(surface, yomi)
141 | print(accent)
142 |
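   | # Both entry points stream from stdin: main_s2ya reads one surface form per line
   | # (its yomi is inferred with UniDic), while main_sy2a reads "surface,yomi" lines.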
--------------------------------------------------------------------------------
/docs/pages/introduction.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | Background
3 | ==========
4 |
5 | Japanese pitch accent
6 | =====================
7 |
8 | Japanese is a pitch-accent language.
9 | While English accent is based on the stress (strength) of each syllable,
10 | Japanese accent is based on the pitch (height) of each mora.
11 | Accent is as important in spoken Japanese as it is in other languages.
12 |
13 | There are large regional differences in Japanese accents.
14 | Of these, this module deals with the accents of the contemporary Tokyo dialect,
15 | which is one of the most influential Japanese dialects in the country.
16 |
17 | On the Tokyo Japanese accent
18 | ============================
19 |
20 | Raise and lower (accent nucleus)
21 | --------------------------------
22 |
23 | In the contemporary Tokyo dialect, there are two kinds of events that trigger a change of pitch.
24 |
25 | - ``[``: *raise the pitch here*
26 | - ``]``: *lower the pitch here* (a.k.a. accent nucleus)
27 |
28 | Not all accent patterns are possible. Usually, at most one ``]`` can appear in a word.
29 | Also, ``[`` can appear only at the beginning.
30 | Thus, an :math:`n`-mora word basically has one of the :math:`n+1` possible accent patterns as follows.
31 |
32 | - ``*[****``
33 | - ``*]****``
34 | - ``*[*]***``
35 | - ``*[**]**``
36 | - ``*[***]*``
37 | - ``*[****]``
38 |
39 | Representation of accent nuclei by digits
40 | -----------------------------------------
41 |
42 | Accent patterns (accent types) are often expressed by digits that indicate the positions of the accent nuclei.
43 | Some accent dictionaries including UniDic use this notation.
44 |
45 | - :math:`\text{Accent type} = 0` means that there are no accent nuclei (no fall ``]``).
46 | - :math:`\text{Accent type} = n~(n > 0)` means that the accent nucleus appears after the :math:`n`-th mora.
47 |
48 | .. image:: ../imgs/jpn_accent_types.png
49 | :width: 70%
50 | :align: center
51 |
52 | Heiban, Atamadaka, Nakadaka, and Odaka types
53 | --------------------------------------------
54 |
55 | - :math:`\text{Accent type} = 0` is also called the **Heiban** (平板; flat) type.
56 | - :math:`\text{Accent type} = 1` is also called the **Atamadaka** (頭高; head-high) type.
57 | - :math:`\text{Accent type} \in \{2, \cdots, n-1\}` is also called the **Nakadaka** (中高; middle-high) types.
58 | - :math:`\text{Accent type} = n` is also called the **Odaka** (尾高; tail-high) type.
59 |
60 | For example,
61 |
62 | - 0 (Heiban)
63 | - 野球 ``ya[kyuu``, パソコン ``pa[sokon``, 鉛筆 ``e[npitsu``, 緑茶 ``ryo[kucha``, りんご ``ri[ngo``, 渋谷 ``shi[buya``
64 | - 1 (Atamadaka)
65 | - サッカー ``sa]Qkaa``, ジュース ``ju]usu``, 猫 ``ne]ko``, メロン ``me]ron``, 金魚 ``ki]ngyo``, 新橋 ``shi]mbashi``
66 | - 2 ~ n-1 (Nakadaka)
67 | - バドミントン ``ba[domi]nton``, 折り紙 ``o[ri]gami``, カブトムシ ``ka[buto]mushi``, 冷蔵庫 ``re[ezo]oko``, 池袋 ``i[kebu]kuro``
68 | - n (Odaka)
69 | - 足 ``a[shi]``, 紙 ``ka[mi]``, 花 ``ha[na]``, 海苔 ``no[ri]``, 米 ``ko[me]``, 光 ``hi[kari]``, 犬 ``i[nu]``, 馬 ``u[ma]``
70 |
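   | The mapping between accent-type digits and the bracket notation is mechanical.
   | The following sketch illustrates it in Python; the helper name is ours and is
   | not part of tdmelodic.
   | 
   | .. code-block:: python
   | 
   |     def accent_type_to_brackets(n_morae, accent_type):
   |         """Render an accent-type digit as a '*' / '[' / ']' pattern."""
   |         out = []
   |         for i in range(1, n_morae + 1):
   |             out.append("*")
   |             if i == 1 and accent_type != 1 and n_morae > 1:
   |                 out.append("[")  # the pitch rises after the first mora
   |             if i == accent_type:
   |                 out.append("]")  # accent nucleus: the pitch falls here
   |         return "".join(out)
   | 
   |     accent_type_to_brackets(5, 0)  # -> '*[****'  (Heiban)
   |     accent_type_to_brackets(5, 1)  # -> '*]****'  (Atamadaka)
   |     accent_type_to_brackets(5, 3)  # -> '*[**]**' (Nakadaka)
   |     accent_type_to_brackets(5, 5)  # -> '*[****]' (Odaka)
   | 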
71 | Difference between Heiban and Odaka
72 | -----------------------------------
73 |
74 | Although there may seem to be no difference in the pitch pattern within the words themselves,
75 | the Heiban and Odaka types differ in the pitch of subsequent particles, e.g. "*ga* が",
76 | the nominative (NOM) case marker for subjects.
77 |
78 | - *hana ga* はなが
79 | - 鼻が ``ha[na-ga``: nose is
80 | - 花が ``ha[na]-ga``: flower is
81 | - *hikari ga* ひかりが
82 | - ひかりが ``hi[kari-ga``: Shinkansen Hikari is
83 | - 光が ``hi[kari]-ga``: light is
84 | - *hashi ga* はしが
85 | - 橋が ``ha[shi]-ga``: bridge is
86 | - 端が ``ha[shi-ga``: edge is
87 | - 箸が ``ha]shi-ga``: chopsticks are
88 | - *ha ga* はが
89 | - 葉が ``ha[-ga``: leaf is
90 | - 歯が ``ha]-ga``: tooth is
91 |
92 | Note that the accent nucleus of an Odaka word is often cancelled before the GEN case marker "*no* の".
93 | (This rule also has some exceptions.)
94 |
95 | - *hana no* はなの
96 | - 鼻の ``ha[na-no``: nose\'s
97 | - 花の ``ha[na-no``: flower\'s
98 | - *hikari no* ひかりの
99 | - ひかりの ``hi[kari-no``: Shinkansen Hikari\'s
100 | - 光の ``hi[kari-no``: light\'s
101 | - *hashi no* はしの
102 | - 橋の ``ha[shi-no``: bridge\'s
103 | - 端の ``ha[shi-no``: edge\'s
104 | - 箸の ``ha]shi-no``: chopsticks\'
105 | - *ha no* はの
106 | - 葉の ``ha[-no``: leaf\'s
107 | - 歯の ``ha]-no``: tooth\'s
108 |
109 | Compounds
110 | ---------
111 |
112 | The accents of compound words are a little complicated. For example,
113 |
114 | - 東京 ``to[okyoo``
115 | - 東京都 ``to[okyo]oto``
116 |
117 | .. image:: ../imgs/jpn_accent-en-page1.png
118 | :width: 60%
119 | :align: center
120 |
121 | - 東京都知事 ``to[okyootochi]ji``
122 |
123 | .. image:: ../imgs/jpn_accent-en-page2.png
124 | :width: 60%
125 | :align: center
126 |
127 | - 東京都知事選挙 ``to[okyootochijise]nkyo``
128 |
129 | .. image:: ../imgs/jpn_accent-en-page3.png
130 | :width: 60%
131 | :align: center
132 |
133 | - 世界 ``se]kai``
134 | - 遺産 ``i[san``
135 | - 世界遺産 ``se[kaii]san``
136 |
137 | .. image:: ../imgs/jpn_accent-en-page4.png
138 | :width: 60%
139 | :align: center
140 |
141 | - 機械 ``ki]kai``
142 | - 学習 ``ga[kushuu``
143 | - 機械学習 ``ki[kaiga]kushuu``
144 |
145 | .. image:: ../imgs/jpn_accent-en-page5.png
146 | :width: 60%
147 | :align: center
148 |
149 |
150 | Logo of tdmelodic
151 | -----------------
152 |
153 | .. figure:: ../logo_tdmelodic.png
154 | :figwidth: 30%
155 | :align: right
156 |
157 | There are many pairs of words that have the same sound pattern except for the accentuation.
158 | For example,
159 |
160 | - *fuji*
161 | - 富士 (Mt. Fuji) is pronounced as ``fu]ji``.
162 | - 藤 (wisteria) is pronounced as ``fu[ji``.
163 | - *sake*
164 | - 鮭 (salmon) is pronounced as ``sa]ke``.
165 |     - 酒 (alcoholic beverage) is pronounced as ``sa[ke``.
166 | - *hashi*
167 | - 端 (edge, corner) is pronounced as ``ha[shi``.
168 | - 橋 (bridge) is pronounced as ``ha[shi]``.
169 |     - 箸 (chopsticks) is pronounced as ``ha]shi``.
170 |
171 | As you can see, the logo is a combination of two "fuji"-s with different accent patterns.
172 |
173 | Further reading
174 | ===============
175 |
176 | - Wikipedia
177 | - `Wikipedia - Japanese pitch accent `_
178 | - Textbook
179 | - 松森, 新田, 木部, 中井, **日本語アクセント入門**, 三省堂, 2012
180 | - Dictionary
181 | - `OJAD (Online Japanese Accent Dictionary) `_
182 | - **NHK日本語発音アクセント新辞典**, NHK出版, 2016
183 | - 金田一, 秋永, **新明解日本語アクセント辞典 第2版**, 三省堂, 2014
184 |
--------------------------------------------------------------------------------
/tests/util/test_word_type.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest import TestCase
3 | from tdmelodic.util.word_type import WordType as W
4 |
5 | class TestWordType(unittest.TestCase):
6 | def __init__(self, *args, **kwargs):
7 | super(TestWordType, self).__init__(*args, **kwargs)
8 | self.w = W()
9 |
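   |     # Each test passes a minimal CSV-style row (split on ",") to a WordType
   |     # predicate; the empty fields stand in for columns the predicate ignores.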
10 | def test_is_symbol(self):
11 | f = self.w.is_symbol
12 | self.assertEqual(True, f(",,,,記号,一般,,,,".split(",")))
13 | self.assertEqual(False, f(",,,,記号,,,,,".split(",")))
14 |
15 | def test_is_hashtag(self):
16 | f = self.w.is_hashtag
17 | self.assertEqual(True, f("#hello,,,".split(",")))
18 | self.assertEqual(True, f("#hello world,,,".split(",")))
19 | self.assertEqual(True, f("#こんにちは12345,,,".split(",")))
20 | self.assertEqual(False, f("#hello world,,,".split(",")))
21 |
22 | def test_is_emoji(self):
23 | f = self.w.is_emoji
24 | self.assertEqual(True, f("😄,,,".split(",")))
25 | self.assertEqual(False, f("あ,,,".split(",")))
26 |
27 | def test_is_noisy_katakana(self):
28 | f = self.w.is_noisy_katakana
29 | self.assertEqual(True, f("カタカナ,,,,,,,,,,,片仮名,".split(",")))
30 | self.assertEqual(True, f("カタカナ,,,,,,,,,,,かたかな,".split(",")))
31 | self.assertEqual(False, f("カタカナ,,,,,,,,,,,カタカナ,".split(",")))
32 | self.assertEqual(True, f("トウキョウトチジセンキョ,,,,,,,,,,,東京都知事選挙,".split(",")))
33 |
34 | def test_is_katakana(self):
35 | f = self.w.is_katakana
36 | self.assertEqual(True, f("カタカナ,,,,,,,,,,,カタカナ,".split(",")))
37 | self.assertEqual(False, f("カタカナ,,,,,,,,,,,片仮名,".split(",")))
38 | self.assertEqual(False, f("ひらがな,,,,,,,,,,,平仮名,,,".split(",")))
39 | self.assertEqual(False, f("漢字,,,,,,,,,,,漢字,,".split(",")))
40 | self.assertEqual(False, f("漢字カタカナ,,,,,,,,,,,漢字片仮名,,,".split(",")))
41 |
42 | def test_is_hira_kata_kanji(self):
43 | f = self.w.is_hira_kata_kanji
44 | self.assertEqual(True, f("カタカナ,,,,,,,,,,,,".split(",")))
45 |         self.assertEqual(True, f("ｶﾀｶﾅ,,,,,,,,,,,,".split(",")))
46 | self.assertEqual(True, f("ひらがな,,,,,,,,,,,,,,".split(",")))
47 | self.assertEqual(True, f("漢字,,,,,,,,,,,,,".split(",")))
48 | self.assertEqual(True, f("漢字ひらがなカタカナ,,,,,,,,,,,,,,".split(",")))
49 | self.assertEqual(True, f("漢字=カタカナ&ひらがな,,,,,,,,,,,,,,".split(",")))
50 | self.assertEqual(False, f("カタカナabc,,,,,,,,,,,,".split(",")))
51 |
52 | def test_is_hira_kata_kanji_romaji(self):
53 | f = self.w.is_hira_kata_kanji_romaji
54 | self.assertEqual(True, f("カタカナ,,,,,,,,,,,,".split(",")))
55 |         self.assertEqual(True, f("ｶﾀｶﾅ,,,,,,,,,,,,".split(",")))
56 | self.assertEqual(True, f("ひらがな,,,,,,,,,,,,,,".split(",")))
57 | self.assertEqual(True, f("漢字,,,,,,,,,,,,,".split(",")))
58 | self.assertEqual(True, f("漢字ひらがなカタカナ,,,,,,,,,,,,,,".split(",")))
59 | self.assertEqual(True, f("漢字=カタカナ&ひらがな,,,,,,,,,,,,,,".split(",")))
60 | self.assertEqual(True, f("カタカナabc,,,,,,,,,,,,".split(",")))
61 |
62 | def test_is_romaji(self):
63 | f = self.w.is_romaji
64 | self.assertEqual(True, f("this is an apple,,,,,,,,,,,,".split(",")))
65 | self.assertEqual(True, f("A&B&C,,,,,,,,,,,,".split(",")))
66 | self.assertEqual(True, f("A-B-C,,,,,,,,,,,,,,".split(",")))
67 | self.assertEqual(False, f("カタカナabc,,,,,,,,,,,,".split(",")))
68 |
69 | def test_is_KK(self):
70 | f = self.w.is_KK
71 | self.assertEqual(True, f("株式会社あああああ,,,,,,,,,,,,,カブシキガイシャアアアアア,".split(",")))
72 | self.assertEqual(True, f("株式会社あああああ,,,,,,,,,,,,,カブシキカイシャアアアアア,".split(",")))
73 | self.assertEqual(True, f("あああああ株式会社,,,,,,,,,,,,,アアアアアカブシキガイシャ,".split(",")))
74 | self.assertEqual(True, f("あああああ株式会社,,,,,,,,,,,,,アアアアアカブシキカイシャ,".split(",")))
75 | self.assertEqual(False, f("株式会社あああああ,,,,,,,,,,,,,アアアアア,".split(",")))
76 |
77 | def test_is_YK(self):
78 | f = self.w.is_YK
79 | self.assertEqual(True, f("有限会社あああああ,,,,,,,,,,,,,ユーゲンガイシャアアアアア,".split(",")))
80 | self.assertEqual(True, f("有限会社あああああ,,,,,,,,,,,,,ユーゲンカイシャアアアアア,".split(",")))
81 | self.assertEqual(True, f("有限会社あああああ,,,,,,,,,,,,,ユウゲンガイシャアアアアア,".split(",")))
82 | self.assertEqual(False, f("有限会社あああああ,,,,,,,,,,,,,アアアアア,".split(",")))
83 |
84 | def test_is_station(self):
85 | f = self.w.is_station
86 | self.assertEqual(True, f("東京駅,,".split(",")))
87 | self.assertEqual(False, f("東京駅前,,".split(",")))
88 |
89 | def test_is_road(self):
90 | f = self.w.is_road
91 | self.assertEqual(True, f("東京都道1号あああああ線,,".split(",")))
92 | self.assertEqual(False, f("東京都道1号あああああ,,".split(",")))
93 |
94 | def test_is_school(self):
95 | f = self.w.is_school
96 | self.assertEqual(True, f("あああ小学校,,".split(",")))
97 | self.assertEqual(True, f("いいい中学校,,".split(",")))
98 | self.assertEqual(True, f("ううう高等学校,,".split(",")))
99 | self.assertEqual(True, f("えええ高校,,".split(",")))
100 | self.assertEqual(True, f("おおお大学,,".split(",")))
101 | self.assertEqual(True, f("かかか専門学校,,".split(",")))
102 |
103 | def test_is_address(self):
104 | f = self.w.is_address
105 | self.assertEqual(True, f("東京都文京区本郷,,".split(",")))
106 | self.assertEqual(True, f("埼玉県さいたま市浦和区,,".split(",")))
107 | self.assertEqual(True, f("神奈川県横浜市西区,,".split(",")))
108 | self.assertEqual(True, f("東京都八王子市,,".split(",")))
109 |
110 |
111 | def test_is_date(self):
112 | f = self.w.is_date
113 | self.assertEqual(True, f("10月10日,,,,,,,,,,,,,ジュウガツトオカ,".split(",")))
114 | self.assertEqual(False, f("十月十日,,,,,,,,,,,,,ジュウガツトオカ,".split(",")))
115 | self.assertEqual(False, f("10月10日,,,,,,,,,,,,,トツキトオカ,".split(",")))
116 | self.assertEqual(True, f("2020-10-10,,,,,,,,,,,,,,".split(",")))
117 |
118 | def test_is_numeral(self):
119 | f = self.w.is_numeral
120 | self.assertEqual(True, f("1億5000万円,,,".split(",")))
121 | self.assertEqual(True, f("$4,,,".split(",")))
122 | self.assertEqual(True, f("50ドル,,,".split(",")))
123 | self.assertEqual(True, f("80kg,,,".split(",")))
124 | self.assertEqual(True, f("80W,,,".split(",")))
125 | self.assertEqual(True, f("1階建て,,,".split(",")))
126 | self.assertEqual(True, f("10両編成,,,".split(",")))
127 | self.assertEqual(True, f("123456あああ,,,".split(",")))
128 |
129 | if __name__ == '__main__':
130 | unittest.main()
--------------------------------------------------------------------------------
/tdmelodic/nn/lang/japanese/kana/kanamap/kanamap_normal.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | # Dictionary for converting kana into romaji (a custom representation in which each mora becomes two characters).
10 | # These mappings are applied to each mora after mora segmentation.
11 |
12 | kana2roman_alias = {
13 | "ァ":"xa",
14 | "ィ":"xi",
15 | "ゥ":"xu",
16 | "ェ":"xe",
17 | "ォ":"xo",
18 | "ぁ":"xa",
19 | "ぃ":"xi",
20 | "ぅ":"xu",
21 | "ぇ":"xe",
22 | "ぉ":"xo",
23 |
24 | "ゐ":"wi",
25 | "ゑ":"we",
26 |
27 |     # gōyōon (combined yōon: kwa / gwa)
28 | "クヮ":"ka",
29 | "グヮ":"ga",
30 |
31 |     # voiced variants (dakuon)
32 | "ヂャ":"ja",
33 | "ヂュ":"ju",
34 | "ヂェ":"je",
35 | "ヂョ":"jo",
36 |
37 |     # sokuon (geminate consonant marker)
38 | "っ":"QQ",
39 | }
40 |
41 | kana2roman_standard = {
42 |     # plain unvoiced morae (chokuon / seion)
43 | "ア":"xa",
44 | "イ":"xi",
45 | "ウ":"xu",
46 | "エ":"xe",
47 | "オ":"xo",
48 |
49 | "カ":"ka",
50 | "キ":"ki",
51 | "ク":"ku",
52 | "ケ":"ke",
53 | "コ":"ko",
54 |
55 | "サ":"sa",
56 | "シ":"Si",
57 | "ス":"su",
58 | "セ":"se",
59 | "ソ":"so",
60 | "スィ":"si",
61 |
62 | "タ":"ta",
63 | "チ":"Ci",
64 | "ツ":"Zu",
65 | "テ":"te",
66 | "ト":"to",
67 | "ティ":"ti",
68 | "トゥ":"tu",
69 |
70 | "ナ":"na",
71 | "ニ":"ni",
72 | "ヌ":"nu",
73 | "ネ":"ne",
74 | "ノ":"no",
75 |
76 | "ハ":"ha",
77 | "ヒ":"hi",
78 | "フ":"fu",
79 | "ヘ":"he",
80 | "ホ":"ho",
81 |
82 | "マ":"ma",
83 | "ミ":"mi",
84 | "ム":"mu",
85 | "メ":"me",
86 | "モ":"mo",
87 |
88 | "ヤ":"xA",
89 | "ユ":"xU",
90 | "ヨ":"xO",
91 | "イェ":"xE",
92 |
93 | "ラ":"ra",
94 | "リ":"ri",
95 | "ル":"ru",
96 | "レ":"re",
97 | "ロ":"ro",
98 |
99 | "ワ":"wa",
100 | "ヰ":"wi",
101 | "ヱ":"we",
102 | "ヲ":"wo",
103 | "ウィ":"wi",
104 | "ウェ":"we",
105 | "ウォ":"wo",
106 |
107 |     # voiced and semi-voiced morae (dakuon / handakuon)
108 | "ガ":"ga",
109 | "ギ":"gi",
110 | "グ":"gu",
111 | "ゲ":"ge",
112 | "ゴ":"go",
113 |
114 | "ザ":"za",
115 | "ジ":"ji",
116 | "ズ":"zu",
117 | "ゼ":"ze",
118 | "ゾ":"zo",
119 | "ズィ":"zi",
120 |
121 | "ダ":"da",
122 | "ヂ":"ji",
123 | "ヅ":"zu",
124 | "デ":"de",
125 | "ド":"do",
126 | "ディ":"di",
127 | "ドゥ":"du",
128 |
129 | "バ":"ba",
130 | "ビ":"bi",
131 | "ブ":"bu",
132 | "ベ":"be",
133 | "ボ":"bo",
134 |
135 | "パ":"pa",
136 | "ピ":"pi",
137 | "プ":"pu",
138 | "ペ":"pe",
139 | "ポ":"po",
140 |
141 | "ヴァ":"va",
142 | "ヴィ":"vi",
143 | "ヴ": "vu",
144 | "ヴェ":"ve",
145 | "ヴォ":"vo",
146 |
147 |     # yōon (palatalized morae)
148 | "キャ":"kA",
149 | "キュ":"kU",
150 | "キェ":"kE",
151 | "キョ":"kO",
152 |
153 | "テャ":"tA",
154 | "テュ":"tU",
155 | "テェ":"tE",
156 | "テョ":"tO",
157 |
158 | "ニャ":"nA",
159 | "ニュ":"nU",
160 | "ニェ":"nE",
161 | "ニョ":"nO",
162 |
163 | "ヒャ":"hA",
164 | "ヒュ":"hU",
165 | "ヒェ":"hE",
166 | "ヒョ":"hO",
167 |
168 | "ミャ":"mA",
169 | "ミュ":"mU",
170 | "ミェ":"mE",
171 | "ミョ":"mO",
172 |
173 | "リャ":"rA",
174 | "リュ":"rU",
175 | "リェ":"rE",
176 | "リョ":"rO",
177 |
178 | "ギャ":"gA",
179 | "ギュ":"gU",
180 | "ギェ":"gE",
181 | "ギョ":"gO",
182 |
183 | "ズャ":"zA",
184 | "ズュ":"zU",
185 | "ズェ":"zE",
186 | "ズョ":"zO",
187 |
188 | "デャ":"dA",
189 | "デュ":"dU",
190 | "デェ":"dE",
191 | "デョ":"dO",
192 |
193 | "ビャ":"bA",
194 | "ビュ":"bU",
195 | "ビェ":"bE",
196 | "ビョ":"bO",
197 |
198 | "ピャ":"pA",
199 | "ピュ":"pU",
200 | "ピェ":"pE",
201 | "ピョ":"pO",
202 |
203 | "ツャ":"ZA",
204 | "ツュ":"ZU",
205 | "ツョ":"ZO",
206 |
207 | "フャ":"fA",
208 | "フュ":"fU",
209 | "フョ":"fO",
210 |
211 | "ヴャ":"vA",
212 | "ヴュ":"vU",
213 | "ヴョ":"vO",
214 |
215 |     # other yōon digraphs
216 | "シャ":"Sa",
217 | "シュ":"Su",
218 | "シェ":"Se",
219 | "ショ":"So",
220 |
221 | "チャ":"Ca",
222 | "チュ":"Cu",
223 | "チェ":"Ce",
224 | "チョ":"Co",
225 |
226 | "ツァ":"Za",
227 | "ツィ":"Zi",
228 | "ツェ":"Ze",
229 | "ツォ":"Zo",
230 |
231 | "ファ":"fa",
232 | "フィ":"fi",
233 | "フェ":"fe",
234 | "フォ":"fo",
235 |
236 | "ジャ":"ja",
237 | "ジュ":"ju",
238 | "ジェ":"je",
239 | "ジョ":"jo",
240 |
241 |     # moraic nasal and sokuon
242 | "ン":"xn",
243 | "ッ":"QQ",
244 |
245 |     # punctuation etc.
246 | "。":".",
247 | "。":".",
248 | "。":".",
249 | ".":".",
250 |
251 | ".":".",
252 | "、":",",
253 | ",":",",
254 | ",":",",
255 |
256 | "?":"?",
257 | "?":"?",
258 |
259 | "!":"!",
260 | "!":"!",
261 | "♪":"!",
262 |
263 |     # brackets
264 | "「":"(",
265 | "」":")",
266 |
267 | "【":"(",
268 | "】":")",
269 |
270 | "『":"(",
271 | "』":")",
272 |
273 | "(":"(",
274 | ")":")",
275 |
276 | "(":"(",
277 | ")":")",
278 |
279 | "<":"<",
280 | ">":">",
281 |
282 |     # whitespace
283 | " ":" ",
284 | " ":" ",
285 |
286 |     # long-vowel bar and dots
287 | "—":"-",
288 | "ー":"-",
289 |
290 | "~":"~",
291 | "〜":"~",
292 | "~":"~",
293 |
294 | "…":":",
295 | ":":":",
296 | "・":",",
297 |
298 |     # special placeholder symbol
299 | "0" : "##",
300 | }
301 |
302 | # ===========================================================================================
303 | # merge the dictionaries (alias entries override the standard ones)
304 | kana2roman_dictionary = {**kana2roman_standard, **kana2roman_alias}
305 |
306 | # make every code exactly two characters long (single-character codes are doubled).
307 | kana2roman_dictionary = {k: (v * 2)[:2] for k, v in kana2roman_dictionary.items()}
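   | # e.g. "." becomes "..": each symbol then fills both the consonant slot and the vowel slot of its mora.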
308 |
309 | # inverse map: strictly speaking this is not correct, but it is only used as reference information during training, so it is good enough.
310 | kana2roman_dictionary_inv = {v: k for k, v in kana2roman_standard.items()}
311 | kana2roman_dictionary_inv[" "] = "_"
312 |
313 | # list of two-character kana sequences that form a single mora
314 | exceptions = {k: v for k, v in kana2roman_dictionary.items() if len(k) == 2}
315 |
316 | # lists of consonant and vowel symbols
317 | consonants = list(sorted(set([code[0] for code in kana2roman_dictionary.values()])))
318 | vowels = list(sorted(set([code[1] for code in kana2roman_dictionary.values()])))
319 |
320 | # integer encoding of the consonant and vowel symbols
321 | roman_map = {v: i+1 for i, v in enumerate(list(sorted(set(consonants + vowels))))}
322 | roman_map[None] = 0
323 | roman_map["0"] = 0
324 | roman_map[""] = 0
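   | # index 0 is reserved as the padding / null code (the loaders pad sequences with "0")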
325 | roman_invmap = {v: k for k, v in roman_map.items()}
326 |
327 | if __name__ == '__main__':
328 | print(sorted([(k, v) for k, v in kana2roman_dictionary.items()], key=lambda _: _[0]))
329 |
330 | print("-"*80)
331 | print(sorted([(k, v) for k, v in exceptions.items()], key=lambda _: _[0]))
332 |
333 | print("-"*80)
334 | print("consonants",consonants)
335 |
336 | print("-"*80)
337 | print("vowels",vowels)
338 |
339 | print("-"*80)
340 | print(roman_map)
341 |
342 | print("-"*80)
343 | print(roman_invmap)
344 |
--------------------------------------------------------------------------------
/tdmelodic/filters/neologd_patch.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | # -*- coding: utf-8 -*-
10 | import sys
11 | import os
12 | import argparse
13 | import regex as re
14 | import csv
15 | from tqdm import tqdm
16 | import tempfile
17 | import copy
18 | import unicodedata
19 | import jaconv
20 |
21 | from tdmelodic.util.dic_index_map import get_dictionary_index_map
22 | from tdmelodic.util.util import count_lines
23 | from tdmelodic.util.word_type import WordType
24 |
25 | from .yomi.basic import modify_longvowel_errors
26 | from .yomi.basic import modify_yomi_of_numerals
27 | from .yomi.particle_yomi import ParticleYomi
28 | from .yomi.wrong_yomi_detection import SimpleWrongYomiDetector
29 |
30 | class NeologdPatch(object):
31 | def __init__(self, *args, **kwargs):
32 | for k, v in kwargs.items():
33 | if k != "input" and k != "output":
34 | self.__setattr__(k, v)
35 | self.IDX_MAP = get_dictionary_index_map(self.mode) # dictionary type
36 | self.wt = WordType(self.mode)
37 | self.wrong_yomi_detector = SimpleWrongYomiDetector(mode=self.mode)
38 | self.particle_yomi = ParticleYomi()
39 |
40 | def showinfo(self):
41 | print("ℹ️ [ Info ]", file=sys.stderr)
42 | self.message("| {} Hash tags will{}be removed.", self.rm_hashtag)
43 | self.message("| {} Noisy katakana words will{}be removed.", self.rm_noisy_katakana)
44 | self.message("| {} Person names will{}be removed.", self.rm_person)
45 | self.message("| {} Emojis will{}be removed.", self.rm_emoji)
46 | self.message("| {} Symbols will{}be removed.", self.rm_symbol)
47 | self.message("| {} Numerals will{}be removed.", self.rm_numeral)
48 | self.message("| {} Wrong yomi words will{}be removed.", self.rm_wrong_yomi)
49 |         self.message("| {} Words with special particles \"は\" and \"へ\" will{}be removed.", self.rm_special_particle)
50 | self.message("| {} Long vowel errors will{}be corrected.", self.cor_longvow)
51 | self.message("| {} Numeral yomi errors will{}be corrected.", self.cor_yomi_num)
52 | self.message("| {} Surface forms will{}be normalized.", self.normalize)
53 |
54 | @classmethod
55 | def message(cls, message, flag):
56 | if flag:
57 | message = message.format("✅", " ")
58 | else:
59 | message = message.format("‼️", " *NOT* ")
60 | print(message, file=sys.stderr)
61 |
62 | def add_accent_column(self, line, idx_accent=None):
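   |         # pad the row with 10 empty columns so that idx_accent is a valid index,
   |         # then mark the accent column with the placeholder '@'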
63 | line = line + ['' for i in range(10)]
64 | line[idx_accent] = '@'
65 | return line
66 |
67 | def normalize_surface(self, line, idx_surface=None):
68 | s = line[idx_surface]
69 | s = unicodedata.normalize("NFKC", s)
70 | s = s.upper()
71 | s = jaconv.normalize(s, "NFKC")
72 | s = jaconv.h2z(s, digit=True, ascii=True, kana=True)
73 | s = s.replace("\u00A5", "\uFFE5") # yen symbol
74 | line[idx_surface] = s
75 | return line
76 |
77 | def process_single_line(self, line):
78 | # ----------------------------------------------------------------------
79 | # remove words by word types
80 | if self.rm_hashtag:
81 | if self.wt.is_hashtag(line):
82 | return None
83 |
84 | if self.rm_noisy_katakana:
85 | if self.wt.is_noisy_katakana(line):
86 | return None
87 |
88 | if self.rm_person:
89 | if self.wt.is_person(line):
90 | return None
91 |
92 | if self.rm_emoji:
93 | if self.wt.is_emoji(line):
94 | return None
95 |
96 | if self.rm_symbol:
97 | if self.wt.is_symbol(line):
98 | return None
99 |
100 | if self.rm_numeral:
101 | if self.wt.is_numeral(line):
102 | return None
103 |
104 | line = copy.deepcopy(line)
105 |
106 | # ----------------------------------------------------------------------
107 | # correct yomi
108 | if self.cor_longvow:
109 | line = modify_longvowel_errors(line, idx_yomi=self.IDX_MAP["YOMI"])
110 |
111 | if self.cor_yomi_num:
112 | if self.wt.is_numeral(line):
113 | line = modify_yomi_of_numerals(line,
114 | idx_surface=self.IDX_MAP["SURFACE"], idx_yomi=self.IDX_MAP["YOMI"])
115 |
116 | # ----------------------------------------------------------------------
117 |         # fix the yomi of special particles (TODO)
118 | if self.rm_special_particle:
119 | line = self.particle_yomi(line, self.IDX_MAP)
120 | if line is None:
121 | return None
122 |
123 | # ----------------------------------------------------------------------
124 | # normalize surface
125 | if self.normalize:
126 | line = self.normalize_surface(line, idx_surface=self.IDX_MAP["SURFACE"])
127 |
128 | # ----------------------------------------------------------------------
129 | # remove words with their yomi
130 | if self.rm_wrong_yomi:
131 | line = self.wrong_yomi_detector(line)
132 | if line is None:
133 | return None
134 |
135 | # ----------------------------------------------------------------------
136 | # add additional columns for compatibility with unidic-kana-accent
137 | if self.mode == "unidic":
138 | line = self.add_accent_column(line, idx_accent=self.IDX_MAP["ACCENT"])
139 |
140 | # ----------------------------------------------------------------------
141 | return line
142 |
143 | def __call__(self, fp_in, fp_out):
144 | self.showinfo()
145 | L = count_lines(fp_in)
146 | n_removed = 0
147 |         n_corrected = 0
148 | for line in tqdm(csv.reader(fp_in), total=L):
149 | try:
150 | line_processed = self.process_single_line(line)
151 | except Exception as e:
152 | print(e)
153 | print(line)
154 | sys.exit(1)
155 | if line_processed is None:
156 | n_removed += 1
157 | continue
158 | if line_processed[:20] != line[:20]:
159 | n_corrected += 1
160 | fp_out.write(','.join(line_processed) + '\n')
161 |
162 | print("🍺 [ Complete! ]", file=sys.stderr)
163 | print("📊 Number of removed entries ", n_removed, file=sys.stderr)
164 | print("📊 Number of corrected entries ", n_corrected, file=sys.stderr)
165 | return
166 |
--------------------------------------------------------------------------------
/tdmelodic/util/word_type.py:
--------------------------------------------------------------------------------
1 |
2 | import sys
3 | import csv
4 | import shutil
5 | import regex as re
6 | from .dic_index_map import get_dictionary_index_map
7 |
8 | class WordType(object):
9 | def __init__(self, mode="unidic"):
10 | self.map = get_dictionary_index_map(mode)
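   |         # self.map resolves field names ("SURFACE", "YOMI", "POS1", ...) to the
   |         # column indices of a dictionary CSV row, passed as `line` below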
11 |
12 | def is_symbol(self, line):
13 | flag1 = re.search(r"^記号$", line[self.map["POS1"]], flags=0)
14 | flag2 = re.search(r"^一般$", line[self.map["POS2"]], flags=0)
15 | return all([flag1, flag2])
16 |
17 | def is_hashtag(self, line):
18 | """ extract hash tags"""
19 | flag1 = re.search(r"^\#.+$", line[self.map["SURFACE"]], flags=0)
20 | return all([flag1])
21 |
22 | def is_emoji(self, line):
23 | """ extract emojis """
24 | flag1 = re.search(u"[\U0001F1E6-\U0001F645]+", line[self.map["SURFACE"]], flags=0)
25 | return all([flag1])
26 |
27 | def is_noisy_katakana(self, line):
28 | """ extract word such that the surface form is katakana but the lemma form contains kanji or hiragana """
29 | flag1 = re.search(r"[\p{Han}\p{Hiragana}a-zA-Z0-9]+", line[self.map["LEMMA"]], flags=0)
30 |         flag2 = re.search(r"^[\p{Katakana}・&＆!！ー=\s ]+$", line[self.map["SURFACE"]], flags=0)
31 | return all([flag1, flag2])
32 |
33 | def is_katakana(self, line):
34 |         flag1 = re.search(r"^[\p{Katakana}・&＆!！ー=\s ]+$", line[self.map["LEMMA"]], flags=0)
35 |         flag2 = re.search(r"^[\p{Katakana}・&＆!！ー=\s ]+$", line[self.map["SURFACE"]], flags=0)
36 | return all([flag1, flag2])
37 |
38 | def is_hira_kata_kanji(self, line):
39 |         flag1 = re.search(r"^[\p{Han}\p{Hiragana}\p{Katakana}・&＆!！ー=\s ]+$", line[self.map["SURFACE"]], flags=0)
40 | return all([flag1])
41 |
42 | def is_romaji(self, line):
43 |         flag1 = re.search(r"^[a-zA-Zａ-ｚＡ-Ｚ',，.!！\-&＆\s ]+$", line[self.map["SURFACE"]], flags=0)
44 | return all([flag1])
45 |
46 | def is_hira_kata_kanji_romaji(self, line):
47 |         flag1 = re.search(r"^[a-zA-Zａ-ｚＡ-Ｚ',.!！\-&＆\s \p{Han}\p{Hiragana}\p{Katakana}ー=]+$", line[self.map["SURFACE"]], flags=0)
48 | return all([flag1])
49 |
50 | def is_KK(self, line):
51 | """ extract KK (kabushiki gaisha) """
52 | flag1 = re.search("カブシキ[ガ|カ]イシャ", line[self.map["YOMI"]], flags=0)
53 | return all([flag1])
54 |
55 | def is_YK(self, line):
56 | """ extract YK (yugen gaisha) """
57 | flag1 = re.search("ユ[ウ|ー]ゲン[ガ|カ]イシャ", line[self.map["YOMI"]], flags=0)
58 | return all([flag1])
59 |
60 | def is_station(self, line):
61 | """ extract station """
62 | flag1 = re.search(r".+駅$", line[self.map["SURFACE"]], flags=0)
63 | return all([flag1])
64 |
65 | def is_road(self, line):
66 | """ extract station """
67 | flag1 = re.search(r"^\p{Han}+道.*\d号.+線$", line[self.map["SURFACE"]], flags=0)
68 | return all([flag1])
69 |
70 | def is_school(self, line):
71 | """ extract schools """
72 |         flag1 = re.search(r"^[\p{Han}\p{Katakana}\p{Hiragana}ー・]+(小|中|高等)+学校$", line[self.map["SURFACE"]], flags=0)
73 |         flag2 = re.search(r"^[\p{Han}\p{Katakana}\p{Hiragana}ー・]+(大学|高校)$", line[self.map["SURFACE"]], flags=0)
74 | flag3 = re.search(r"^[\p{Han}\p{Katakana}\p{Hiragana}ー・]+専門学校$", line[self.map["SURFACE"]], flags=0)
75 | return any([flag1, flag2, flag3])
76 |
77 | def is_address(self, line):
78 | """ extract schools """
79 | flag1 = re.search(r"^.+[都道府県][^,]+[郡市区町村].*$", line[self.map["SURFACE"]], flags=0)
80 | return all([flag1])
81 |
82 | def is_placename(self, line):
83 | flag1 = re.search(r"^名詞$", line[self.map["POS1"]], flags=0)
84 | flag2 = re.search(r"^固有名詞$", line[self.map["POS2"]], flags=0)
85 | flag3 = re.search(r"^地名$", line[self.map["POS3"]], flags=0)
86 | return all([flag1, flag2, flag3])
87 |
88 | def is_person(self, line):
89 | flag1 = re.search(r"^名詞$", line[self.map["POS1"]], flags=0)
90 | flag2 = re.search(r"^固有名詞$", line[self.map["POS2"]], flags=0)
91 | flag3 = re.search(r"^人名$", line[self.map["POS3"]], flags=0)
92 | return all([flag1, flag2, flag3])
93 |
94 | def is_date(self, line):
95 | flag1 = re.search(r"^\d+月\d+日$", line[self.map["SURFACE"]], flags=0)
96 | flag2 = re.search(r"^.*ガツ.*$", line[self.map["YOMI"]], flags=0)
97 | flag3 = re.search(r"\d{4}-\d{2}-\d{2}$", line[self.map["SURFACE"]], flags=0)
98 | return any([all([flag1, flag2]), flag3])
99 |
100 | def is_JPY(self, line):
101 | flag1 = re.search(r"^\d+[万億兆京]*円$", line[self.map["SURFACE"]], flags=0)
102 | return all([flag1])
103 |
104 | def is_USD(self, line):
105 | flag1 = re.search(r"^\$\d+$", line[self.map["SURFACE"]], flags=0)
106 | flag2 = re.search(r"^\d+ドル$", line[self.map["SURFACE"]], flags=0)
107 | return any([flag1, flag2])
108 |
109 | def is_length(self, line):
110 | flag1 = re.search(r"^[.\d]+[kcm]*m$", line[self.map["SURFACE"]], flags=0)
111 | return all([flag1])
112 |
113 | def is_weight(self, line):
114 | flag1 = re.search(r"^[.\d]+[km]*g$", line[self.map["SURFACE"]], flags=0)
115 | flag2 = re.search(r"^[.\d]+t$", line[self.map["SURFACE"]], flags=0)
116 | return any([flag1, flag2])
117 |
118 | def is_electric_unit(self, line):
119 | flag1 = re.search(r"^[.\d]+m[aA]$", line[self.map["SURFACE"]], flags=0)
120 | flag2 = re.search(r"^[.\d]+[vV]$", line[self.map["SURFACE"]], flags=0)
121 | flag3 = re.search(r"^[.\d]+[wW]$", line[self.map["SURFACE"]], flags=0)
122 | return any([flag1, flag2, flag3])
123 |
124 | def is_mass(self, line):
125 | flag1 = re.search(r"^[.\d]+ml$", line[self.map["SURFACE"]], flags=0)
126 | return all([flag1])
127 |
128 | def is_temperature(self, line):
129 | flag1 = re.search(r"^[.\d]+度$", line[self.map["SURFACE"]], flags=0)
130 | return all([flag1])
131 |
132 | def is_pressure(self, line):
133 | flag1 = re.search(r"^[.\d]+hPa$", line[self.map["SURFACE"]], flags=0)
134 | return all([flag1])
135 |
136 | def is_ratio(self, line):
137 | flag1 = re.search(r"^[.\d]+\%$", line[self.map["SURFACE"]], flags=0)
138 | flag2 = re.search(r"^[.\d]+パーセント$", line[self.map["SURFACE"]], flags=0)
139 | flag3 = re.search(r"^[.\d]+倍$", line[self.map["SURFACE"]], flags=0)
140 | return any([flag1, flag2, flag3])
141 |
142 | def is_byte(self, line):
143 | flag1 = re.search(r"^[.\d]+[kKmMgGtT]B$", line[self.map["SURFACE"]], flags=0)
144 | return all([flag1])
145 |
146 | def is_number(self, line):
147 | flag1 = re.search(r"^-*[\d.]+$", line[self.map["SURFACE"]], flags=0)
148 | return all([flag1])
149 |
150 | def is_1char_unit_numeral(self,line):
151 | flag1 = re.search(r"^-*[\d.]+[階話色系種秒発番点歳枚杯本期曲日敗打才戦形巻基均回周勝分億傑件代人万部節時児月年条限位]+$", line[self.map["SURFACE"]], flags=0)
152 | flag2 = re.search(r"\d+[\p{Han}\p{Katakana}\p{Hiragana}]{1}$", line[self.map["SURFACE"]], flags=0)
153 | return any([flag1, flag2])
154 |
155 | def is_2char_unit_numeral(self,line):
156 | flag1 = re.search(r"^-*[\d.]+(部隊|連敗|連勝|試合|行目|秒間|杯目|期目|期生|時間|日間|打点|度目|年間|日目|週目|週間|年目|年後|年前|年代|学期|回目|周年|周目|列目|分間|円玉|円札|作品|代目|人月|人年|万人|キロ|か年|か月|世紀|丁目|連休|年度)+$", line[self.map["SURFACE"]], flags=0)
157 | flag2 = re.search(r"\d+[\p{Han}\p{Katakana}\p{Hiragana}]{2}$", line[self.map["SURFACE"]], flags=0)
158 | return any([flag1, flag2])
159 |
160 | def is_3char_unit_numeral(self,line):
161 | flag1 = re.search(r"^-*[\d.]+(か月目|か月間|インチ|カ月目|カ月間|セント|チャン|ユーロ|世紀間|両編成|年ぶり|年戦争|年連続|時間前|時間半|時間五|番人気|階建て|系電車)+$", line[self.map["SURFACE"]], flags=0)
162 | flag2 = re.search(r"\d+[\p{Han}\p{Katakana}\p{Hiragana}]{3}$", line[self.map["SURFACE"]], flags=0)
163 | return any([flag1, flag2])
164 |
165 | def is_numeral(self,line):
166 |         return self.is_JPY(line) or \
167 |                self.is_USD(line) or \
168 |                self.is_length(line) or \
169 |                self.is_weight(line) or \
170 |                self.is_electric_unit(line) or \
171 |                self.is_mass(line) or \
172 |                self.is_temperature(line) or \
173 |                self.is_pressure(line) or \
174 |                self.is_ratio(line) or \
175 |                self.is_byte(line) or \
176 |                self.is_number(line) or \
177 |                self.is_1char_unit_numeral(line) or \
178 |                self.is_2char_unit_numeral(line) or \
179 |                self.is_3char_unit_numeral(line)
180 |
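   | # usage sketch (mirrors tests/util/test_word_type.py):
   | #   WordType().is_numeral("50ドル,,,".split(","))  # -> True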
--------------------------------------------------------------------------------
/tdmelodic/nn/loader/data_loader.py:
--------------------------------------------------------------------------------
1 | # -----------------------------------------------------------------------------
2 | # Copyright (c) 2019-, PKSHA Technology Inc.
3 | # All rights reserved.
4 | #
5 | # This source code is licensed under the BSD-style license found in the
6 | # LICENSE file in the root directory of this source tree.
7 | # -----------------------------------------------------------------------------
8 |
9 | # -*- coding: utf-8 -*-
10 | import sys
11 | import os
12 |
13 | import re
14 | import numpy as np
15 | import csv
16 | from pprint import pprint
17 |
18 | import chainer
19 | from chainer import dataset
20 | from chainer import datasets
21 | from chainer import iterators
22 |
23 | # base
24 | from .data_loader_base import DataLoaderBase
25 |
26 | # unidic
27 | from ..lang.mecab.unidic import UniDic
28 |
29 | # textproc
30 | from ..lang.japanese.text_normalize import normalize_jpn
31 | from ..lang.japanese.kana.kana2roman import kana2roman
32 | from ..lang.japanese.accent.accent_alignment import accent_align
33 | from ..lang.japanese.accent.accent_diff import simple_accent_diff
34 |
35 | # map code to int
36 | from ..lang.category.symbol_map import acccon_map_robust
37 | from ..lang.category.symbol_map import goshu_map_robust
38 | from ..lang.category.symbol_map import pos_map_robust
39 | from ..lang.category.symbol_map import numeric_to_char_symbol
40 | from ..lang.category.symbol_map import char_symbol_to_numeric
41 | from ..lang.japanese.kana.kanamap.kanamap_normal import roman_map
42 | from ..lang.japanese.accent.accent_alignment import accent_map
43 |
44 |
45 | # ------------------------------------------------------------------------------------
46 | def split_codes_to_vowel_and_consonant(romancode):
47 |     # even positions are consonants, odd are vowels; returned in (consonant, vowel) order
48 |     c = romancode[0::2]
49 |     v = romancode[1::2]
50 |     return c, v
51 |
52 | # ------------------------------------------------------------------------------------
53 | def _convert_yomi_to_codes(kana, **kwargs):
54 | pron = [kana]
55 | pron_code = ["".join([c for c in kana2roman(p)]) for p in pron]
56 |
57 | # split into vowel and consonant
58 | c_code = [split_codes_to_vowel_and_consonant(pron_code_)[0] for pron_code_ in pron_code]
59 | v_code = [split_codes_to_vowel_and_consonant(pron_code_)[1] for pron_code_ in pron_code]
60 |
61 | return c_code, v_code
62 |
63 | # ------------------------------------------------------------------------------------
64 | def _convert_parsed_surface_to_codes(mecab_p, **kwargs):
65 | """
66 | mecabのエントリは以下の通り。
67 | ["surface", "pron", "kana", "pos", "acc", "concat"]
68 | """
69 | pron = [e["pron"] for e in mecab_p]
70 | pos = [e["pos"] for e in mecab_p]
71 | gosh = [e["goshu"] for e in mecab_p]
72 | acc = [e["acc"] for e in mecab_p]
73 | conc = [e["concat"] for e in mecab_p]
74 |
75 | # code
76 | pron_code = ["".join([c for c in kana2roman(p)]) for p in pron]
77 | accent_code = [accent_align(y, a) for y, a in zip(pron_code, acc)]
78 | pos_code = [numeric_to_char_symbol[pos_map_robust(s) ] * len(w) for s, w in zip(pos, pron_code)]
79 | conc_code = [numeric_to_char_symbol[acccon_map_robust(s)] * len(w) for s, w in zip(conc, pron_code)]
80 | gosh_code = [numeric_to_char_symbol[goshu_map_robust(s) ] * len(w) for s, w in zip(gosh, pron_code)]
81 |
82 |     # Split pron_code into vowel and consonant
83 | c_code = [split_codes_to_vowel_and_consonant(pron_code_)[0] for pron_code_ in pron_code]
84 | v_code = [split_codes_to_vowel_and_consonant(pron_code_)[1] for pron_code_ in pron_code]
85 |
86 | # LH binary -> up down
87 | accent_code = [split_codes_to_vowel_and_consonant(a)[0] for a in accent_code]
88 | accent_code = [simple_accent_diff(a) for a in accent_code]
89 |
90 | # halve the length
91 | pos_code = [split_codes_to_vowel_and_consonant(a)[0] for a in pos_code]
92 | conc_code = [split_codes_to_vowel_and_consonant(a)[0] for a in conc_code]
93 | gosh_code = [split_codes_to_vowel_and_consonant(a)[0] for a in gosh_code]
94 |
95 | # return
96 | codes = (c_code, v_code, accent_code, pos_code, conc_code, gosh_code)
97 | return codes
98 |
99 |
100 | # ------------------------------------------------------------------------------------
101 | class NeologdDictionaryLoader(DataLoaderBase):
102 | def __init__(self,
103 | csv_file='default_path.csv',
104 | verbose=False,
105 | valid_mode=False,
106 | infer_mode=False,
107 | index_map={
108 | # see also mecabrc
109 | 'SURFACE': 1 + 0,
110 | 'COST' : 1 + 3,
111 | 'POS1' : 1 + 4 + 0, # f[0]: pos1
112 | 'POS2' : 1 + 4 + 1, # f[1]: pos2
113 | 'POS3' : 1 + 4 + 2, # f[2]: pos3
114 | 'POS4' : 1 + 4 + 3, # f[3]: pos4
115 | 'YOMI' : 1 + 4 + 9, # f[9]: pron
116 | 'GOSHU' : 1 + 4 + 12, # f[12]: goshu
117 | #ACCENT = 1 + 4 + 23 # f[23]: aType
118 | 'ACCENT' : 23, # f[23]: aType
119 | },
120 | load_all_lines_first=False,
121 | store_entire_line=False # store whole of the neologd dictionary data in memory if this flag is True.
122 | ):
123 |
124 | # flags
125 | self.infer_mode = infer_mode
126 | self.valid_mode = valid_mode
127 |
128 | self.store_entire_line = store_entire_line
129 | self.load_all_lines_first = load_all_lines_first
130 |
131 | self.index = index_map
132 |
133 | # on the data
134 | self.csv_file = csv_file
135 | self.lines = self._count_lines(csv_file)
136 |
137 | # load first
138 | if self.load_all_lines_first:
139 | self.neologd_quadraple, self.neologd_lines = self._load_word_list(csv_file)
140 | else:
141 | self.line_generator = self._read_line(self.csv_file)
142 |
143 | # load unidic
144 | self.unidic = UniDic()
145 | super().__init__()
146 |
147 |     def _count_lines(self, text_file):
148 |         # count CSV rows; use a context manager so the file handle is closed
149 |         with open(text_file, 'r') as f:
150 |             return sum(1 for _ in csv.reader(f))
153 |
154 |     def _load_word_list(self, text_file, **kwargs):
155 |         data_lst = []
156 |         line_lst = []
157 |         with open(text_file, 'r') as f:
158 |             for entry in csv.reader(f):
159 |                 surface = entry[self.index['SURFACE']]
160 |                 yomi = entry[self.index['YOMI']]
161 |                 kana = None  # entry[11] is not used
162 |                 accent = entry[self.index['ACCENT']]  # annotated accent (used during training only)
163 |                 data_lst.append([surface, kana, yomi, accent])
164 |                 if self.store_entire_line:
165 |                     line_lst.append(entry)
166 |         return data_lst, line_lst
169 |
170 |     def _read_line(self, text_file, **kwargs):
171 |         with open(text_file, 'r') as f:
172 |             for entry in csv.reader(f):
173 |                 surface = entry[self.index['SURFACE']]
174 |                 yomi = entry[self.index['YOMI']]
175 |                 kana = None  # entry[11] is not used
176 |                 accent = entry[self.index['ACCENT']]  # annotated accent (used during training only)
177 |                 yield (surface, kana, yomi, accent, entry)
180 |
181 | def __len__(self):
182 | return self.lines
183 | # return len(self.neologd_quadraple)
184 |
185 | def _get_example_core(self, i):
186 | # i-th entry of neologd quadraples
187 | if self.load_all_lines_first:
188 | surface, kana, yomi, accent = self.neologd_quadraple[i]
189 | else:
190 | surface, kana, yomi, accent, line = next(self.line_generator)
191 | yomi_or_kana = yomi
192 |
193 | # analyze surface, and get the result of MeCab+UniDic
194 | surface_ = normalize_jpn(surface)
195 | tmp = self.unidic.get_n_best(surface_, yomi_or_kana)
196 | lst_mecab_parsed, rank, ld = tmp
197 |
198 | # Get pi^*(s)
199 | if self.valid_mode or self.infer_mode:
200 | # inference
201 | rank_ = rank[0]
202 | else:
203 | # training
204 | # randomly draw from N-best candidates
205 | rank_ = np.random.choice(rank)
206 | mecab_parsed = lst_mecab_parsed[rank_]
207 |
208 | # convert to codes
209 | # codes : v_code, c_code, accent_code, pos_code, conc_code, gosh_code
210 | S_vow, S_con, S_acc, S_pos, S_acccon, S_gosh = \
211 | _convert_parsed_surface_to_codes( mecab_parsed )
212 | Y_vow, Y_con = \
213 | _convert_yomi_to_codes( yomi_or_kana )
214 |
215 | # join
216 | S_vow = ''.join([s + ' ' for s in S_vow])
217 | S_con = ''.join([s + ' ' for s in S_con])
218 | S_acc = ''.join([s for s in S_acc])
219 | S_pos = ''.join([s + ' ' for s in S_pos])
220 | S_acccon = ''.join([s + ' ' for s in S_acccon])
221 | S_gosh = ''.join([s + ' ' for s in S_gosh])
222 | Y_vow = ''.join([s + ' ' for s in Y_vow])
223 | Y_con = ''.join([s + ' ' for s in Y_con])
224 |
225 | # adjust the length
226 | S_len = len(S_vow)
227 | Y_len = len(Y_vow) # len(accent)
228 | S_con = (S_con + " " * (S_len - len(S_con ))) [:S_len]
229 | S_acc = (S_acc + " " * (S_len - len(S_acc ))) [:S_len]
230 | S_pos = (S_pos + " " * (S_len - len(S_pos ))) [:S_len]
231 | S_acccon = (S_acccon + " " * (S_len - len(S_acccon))) [:S_len]
232 | S_gosh = (S_gosh + " " * (S_len - len(S_gosh ))) [:S_len]
233 | Y_vow = (Y_vow + " " * (Y_len - len(Y_vow ))) [:Y_len]
234 | Y_con = (Y_con + " " * (Y_len - len(Y_con ))) [:Y_len]
235 |
236 | # convert to numpy array
237 | S_vow_np = np.array( [roman_map[c] for c in S_vow] , np.int32)
238 | S_con_np = np.array( [roman_map[c] for c in S_con] , np.int32)
239 | S_acc_np = np.array( [accent_map[c] for c in S_acc] , np.int32)
240 | S_pos_np = np.array( [char_symbol_to_numeric[c] for c in S_pos] , np.int32)
241 | S_acccon_np = np.array( [char_symbol_to_numeric[c] for c in S_acccon] , np.int32)
242 | S_gosh_np = np.array( [char_symbol_to_numeric[c] for c in S_gosh] , np.int32)
243 | Y_vow_np = np.array( [roman_map[c] for c in Y_vow] , np.int32)
244 | Y_con_np = np.array( [roman_map[c] for c in Y_con] , np.int32)
245 |         accent_np = np.array( [0 if c == "0" else 2 if c == "2" else 1 for c in accent] , np.int32)
246 |
247 | # return X, y pairs
248 | X = S_vow_np, S_con_np, S_pos_np, S_acc_np, S_acccon_np, S_gosh_np, Y_vow_np, Y_con_np
249 | y = accent_np
250 | ret = X + (y,)
251 |
252 | if self.infer_mode:
253 |             # also return metadata (index, surface, yomi, raw line) for display purposes
254 | # if self.store_entire_line:
255 | if self.load_all_lines_first:
256 | ret = [ret, (i, surface, yomi_or_kana, self.neologd_lines[i])]
257 | else:
258 | ret = [ret, (i, surface, yomi_or_kana, line)]
259 | # ret = [ret, (i, surface, yomi_or_kana)]
260 | return ret
261 |
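   | # smoke test: iterate over the dataset twice, printing the first feature array and the label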
262 | if __name__ == "__main__":
263 | ds = NeologdDictionaryLoader()
264 | for n in range(2):
265 | for i in range(len(ds)):
266 | Xy = ds[i]
267 | X = Xy[:-1]
268 | y = Xy[-1]
269 | print("===", X[0], y)
270 |
--------------------------------------------------------------------------------