├── text_analysis
    ├── main
    │   ├── __init__.py
    │   ├── urls.py
    │   ├── forms.py
    │   ├── views.py
    │   ├── static
    │   │   └── demo.html
    │   ├── tests.py
    │   └── mecab_utils.py
    ├── text_analysis
    │   ├── __init__.py
    │   ├── settings
    │   │   ├── test.py
    │   │   ├── production.py
    │   │   └── __init__.py
    │   ├── urls.py
    │   └── wsgi.py
    └── manage.py
├── requirements.txt
├── typography-icon.png
├── .coveragerc
├── newrelic.ini
├── .github
    └── dependabot.yml
├── tox.ini
├── NOTICE
├── uwsgi.ini
├── .gitignore
├── LICENSE
├── Dockerfile
├── .circleci
    └── config.yml
└── README.md


/text_analysis/main/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/text_analysis/text_analysis/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Django==3.2.14
2 | django-cors-headers==3.13.0
3 | mecab-python3==0.7
4 | 


--------------------------------------------------------------------------------
/typography-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bungoume/mecab-web-api/HEAD/typography-icon.png


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | include = text_analysis/**
3 | omit =
4 |     text_analysis/*/tests/**
5 |     text_analysis/*/tests.py
6 | 


--------------------------------------------------------------------------------
/newrelic.ini:
--------------------------------------------------------------------------------
1 | [newrelic:development]
2 | app_name = Text Analysis API (development)
3 | 
4 | [newrelic:production]
5 | app_name = Text Analysis API
6 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 | - package-ecosystem: pip
 4 |   directory: "/"
 5 |   schedule:
 6 |     interval: weekly
 7 |     time: "11:00"
 8 |     timezone: Asia/Tokyo
 9 |   open-pull-requests-limit: 10
10 | 


--------------------------------------------------------------------------------
/text_analysis/main/urls.py:
--------------------------------------------------------------------------------
"""URL routes of the ``main`` app: the ``parse`` and ``reading`` endpoints."""
from django.urls import re_path

from main import views

# ``django.conf.urls.url`` is deprecated since Django 3.1 and removed in 4.0;
# ``re_path`` takes the same regular-expression patterns.
urlpatterns = [
    re_path(r'^parse$', views.parse, name='parse'),
    re_path(r'^reading$', views.reading, name='reading'),
]
9 | 


--------------------------------------------------------------------------------
/text_analysis/text_analysis/settings/test.py:
--------------------------------------------------------------------------------
 1 | from text_analysis.settings import *  # NOQA
 2 | 
 3 | 
 4 | DATABASES = {
 5 |     'default': {
 6 |         'ENGINE': 'django.db.backends.sqlite3',
 7 |         'NAME': ':memory:',
 8 |     }
 9 | }
10 | 


--------------------------------------------------------------------------------
/text_analysis/manage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | import os
 3 | import sys
 4 | 
 5 | if __name__ == "__main__":
 6 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
 7 | 
 8 |     from django.core.management import execute_from_command_line
 9 | 
10 |     execute_from_command_line(sys.argv)
11 | 


--------------------------------------------------------------------------------
/text_analysis/text_analysis/urls.py:
--------------------------------------------------------------------------------
 1 | from django.conf.urls import include, url
 2 | 
 3 | urlpatterns = [
 4 |     url(r'^text-analysis/v1/', include('main.urls')),
 5 |     url(r'^v1/', include('main.urls')),
 6 | ]
 7 | 
 8 | handler400 = 'main.views.handler400'
 9 | handler403 = 'main.views.handler403'
10 | handler404 = 'main.views.handler404'
11 | handler500 = 'main.views.handler500'
12 | 


--------------------------------------------------------------------------------
/text_analysis/text_analysis/settings/production.py:
--------------------------------------------------------------------------------
 1 | from text_analysis.settings import *  # NOQA
 2 | 
 3 | 
 4 | ALLOWED_HOSTS = ['*']
 5 | 
 6 | DEBUG = False
 7 | 
 8 | SESSION_COOKIE_SECURE = True
 9 | CSRF_COOKIE_SECURE = True
10 | 
11 | 
12 | #######################
13 | # SECURITY MIDDLEWARE #
14 | #######################
15 | SECURE_BROWSER_XSS_FILTER = True
16 | SECURE_CONTENT_TYPE_NOSNIFF = True
17 | SECURE_HSTS_SECONDS = 31536000
18 | SECURE_SSL_REDIRECT = True
19 | 


--------------------------------------------------------------------------------
/text_analysis/text_analysis/wsgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | WSGI config for text_analysis project.
 3 | 
 4 | It exposes the WSGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/
 8 | """
 9 | 
10 | import os
11 | 
12 | from django.core.wsgi import get_wsgi_application
13 | 
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings")
15 | 
16 | application = get_wsgi_application()
17 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | envlist = py37, flake8
 3 | skipsdist = True
 4 | setupdir = ./text_analysis/
 5 | [testenv:py37]
 6 | deps = coverage
 7 |        testfixtures
 8 |        -rrequirements.txt
 9 | setenv = DJANGO_SETTINGS_MODULE = text_analysis.settings.test
10 | commands =
11 |     pip install -r requirements.txt
12 |     coverage erase
13 |     coverage run text_analysis/manage.py test text_analysis
14 |     coverage report
15 | 
16 | [testenv:flake8]
17 | basepython = python3.7
18 | deps = flake8
19 | commands = flake8 text_analysis
20 | 
21 | [flake8]
22 | max-line-length = 99
23 | exclude = text_analysis/*/migrations/*
24 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | List of open-source software used/depended on this project:
 2 | 
 3 | MeCab
 4 | https://taku910.github.io/mecab/
 5 | Copyright (c) 2001-2008, Taku Kudo
 6 | Copyright (c) 2004-2008, Nippon Telegraph and Telephone Corporation
 7 | License: BSD (https://github.com/taku910/mecab/blob/master/mecab/BSD)
 8 | 
 9 | Django
10 | https://www.djangoproject.com/
11 | Copyright (c) Django Software Foundation and individual contributors.
12 | License: BSD (https://github.com/django/django/blob/master/LICENSE)
13 | 
14 | mecab-python3
15 | https://github.com/SamuraiT/mecab-python3
16 | License: BSD (https://github.com/SamuraiT/mecab-python3/blob/master/BSD)
17 | 


--------------------------------------------------------------------------------
/uwsgi.ini:
--------------------------------------------------------------------------------
 1 | [uwsgi]
 2 | strict=true
 3 | chdir=/usr/src/app/text_analysis/
 4 | env=DJANGO_SETTINGS_MODULE=text_analysis.settings
 5 | module=text_analysis.wsgi
 6 | enable-threads=true
 7 | single-interpreter=true
 8 | master=true
 9 | vacuum=true
10 | harakiri=60
11 | processes=3
12 | threads=3
13 | die-on-term=true
14 | threads-stacksize=2048
15 | reload-on-rss=320
16 | evil-reload-on-rss=384
17 | post-buffering=8192
18 | buffer-size=32768
19 | reuse-port=true
20 | thunder-lock=true
21 | 
22 | py-tracebacker=/tmp/tbsocket.
23 | req-logger=file:/log/uwsgi-access.log
24 | logger=file:/log/uwsgi.log
25 | 
26 | [development]
27 | ini=:uwsgi
28 | socket=0.0.0.0:8000
29 | env=DJANGO_SETTINGS_MODULE=text_analysis.settings.development
30 | 
31 | [production]
32 | ini=:uwsgi
33 | socket=0.0.0.0:8000
34 | env=DJANGO_SETTINGS_MODULE=text_analysis.settings.production
35 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | lib/
17 | lib64/
18 | parts/
19 | sdist/
20 | var/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | 
25 | # PyInstaller
26 | #  Usually these files are written by a python script from a template
27 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
28 | *.manifest
29 | *.spec
30 | 
31 | # Installer logs
32 | pip-log.txt
33 | pip-delete-this-directory.txt
34 | 
35 | # Unit test / coverage reports
36 | htmlcov/
37 | .tox/
38 | .coverage
39 | .cache
40 | nosetests.xml
41 | coverage.xml
42 | 
43 | # Translations
44 | *.mo
45 | *.pot
46 | 
47 | # Django stuff:
48 | *.log
49 | 
50 | # Sphinx documentation
51 | docs/_build/
52 | 
53 | # PyBuilder
54 | target/
55 | 


--------------------------------------------------------------------------------
/text_analysis/main/forms.py:
--------------------------------------------------------------------------------
 1 | from django import forms
 2 | from django.core import validators
 3 | 
 4 | 
 5 | class ReadingForm(forms.Form):
 6 |     sentence = forms.CharField(required=False)
 7 |     nbest_num = forms.IntegerField(validators=[
 8 |         validators.MinValueValidator(1), validators.MaxValueValidator(50)], required=False)
 9 | 
10 |     def clean_sentence(self):
11 |         return self.cleaned_data.get('sentence', '')
12 | 
13 |     def clean_nbest_num(self):
14 |         nbest_num = self.cleaned_data.get('nbest_num')
15 |         if nbest_num is None:
16 |             return 10
17 |         return nbest_num
18 | 
19 | 
class ParseForm(forms.Form):
    """Request parameters for the parse endpoint.

    Both fields are optional. ``nbest_num`` is validated against the range
    1..50 and falls back to 3 when it is not supplied.
    """

    sentence = forms.CharField(required=False)
    nbest_num = forms.IntegerField(
        validators=[
            validators.MinValueValidator(1),
            validators.MaxValueValidator(50),
        ],
        required=False,
    )

    def clean_sentence(self):
        """Return the cleaned sentence, or ``''`` when the key is absent."""
        return self.cleaned_data.get('sentence', '')

    def clean_nbest_num(self):
        """Return the requested n-best count, using 3 when none was given."""
        requested = self.cleaned_data.get('nbest_num')
        return 3 if requested is None else requested
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Yuri UMEZAKI
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.7.0-alpine
 2 | 
 3 | RUN mkdir -p /usr/src/app && mkdir /log && \
 4 |     apk --no-cache --update add \
 5 |                             build-base \
 6 |                             linux-headers \
 7 |                             openssl \
 8 |                             libstdc++ \
 9 |                             bash \
10 |                             curl \
11 |                             file \
12 |                             git \
13 |                             ca-certificates && \
14 |     cd /tmp && \
15 |     wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE" && \
16 |     tar xvzf mecab-0.996.tar.gz && \
17 |     cd mecab-0.996 && \
18 |     ./configure --enable-utf8-only && \
19 |     make && make install && \
20 |     mkdir -p /usr/local/lib/mecab/dic && \
21 |     chmod 777 /usr/local/lib/mecab/dic && \
22 |     cd /tmp && \
23 |     git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git && \
24 |     cd mecab-ipadic-neologd && \
25 |     ./bin/install-mecab-ipadic-neologd -n -y && \
26 |     sed -i "s/ipadic$/mecab-ipadic-neologd/g" /usr/local/etc/mecabrc && \
27 |     pip install uWSGI mecab-python3==0.7 && \
28 |     apk del build-base linux-headers && \
29 |     rm -rf /tmp/* /var/tmp/* /var/cache/apk/* /root/.cache/pip/*
30 | 
31 | WORKDIR /usr/src/app
32 | 
33 | COPY requirements.txt /usr/src/app/
34 | RUN pip install --no-cache-dir -r requirements.txt && \
35 |     rm -rf /tmp/* /var/tmp/* /root/.cache/pip/*
36 | 
37 | COPY . /usr/src/app
38 | 
39 | ENV DJANGO_SETTINGS_MODULE=text_analysis.settings.production
40 | 
41 | RUN python text_analysis/manage.py collectstatic --noinput
42 | 
43 | EXPOSE 8000
44 | 
ENV UWSGI_ENV=production

# Exec-form CMD is not processed by a shell, so ${UWSGI_ENV} would be passed to
# uwsgi unexpanded. Run it through `sh -c` so the variable is substituted, and
# `exec` so uwsgi replaces the shell and receives container signals itself.
CMD ["sh", "-c", "exec uwsgi --ini uwsgi.ini:${UWSGI_ENV}"]
48 | 
49 | # RUN pip install newrelic
50 | # ENV NEW_RELIC_ENVIRONMENT ${UWSGI_ENV}
51 | # ENV NEW_RELIC_LICENSE_KEY {{ YOUR_LICENSE_KEY }}
52 | # ENV NEW_RELIC_APP_NAME {{ THIS_APP_NAME }}
53 | # CMD ["newrelic-admin", "run-program", "uwsgi", "--ini", "uwsgi.ini:${UWSGI_ENV}"]
54 | 


--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | jobs:
 3 |   test:
 4 |     working_directory: ~/myapp
 5 |     docker:
 6 |       - image: circleci/python:3.7.0
 7 |     steps:
 8 |       - checkout
 9 |       - run:
10 |           command: python -m venv env
11 |       - restore_cache:
12 |           keys:
13 |             - v1-myapp-{{ checksum "requirements.txt" }}
14 |       - restore_cache:
15 |           keys:
16 |             - v1-myapp-mecab-0.996
17 |       - run:
18 |           name: Install Mecab
19 |           command: |
20 |             cd ~
21 |             if [[ ! -e mecab-0.996/src/mecab ]]; then
22 |               wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE"
23 |               tar xvzf mecab-0.996.tar.gz
24 |               rm -f mecab-0.996.tar.gz
25 |               cd mecab-0.996
26 |               ./configure --enable-utf8-only
27 |               make
28 |             else
29 |               cd mecab-0.996
30 |             fi
31 |             sudo make install
32 |             sudo ldconfig
33 |             sudo mkdir -p /usr/local/lib/mecab/dic
34 |             sudo chmod 777 /usr/local/lib/mecab/dic
35 |       - restore_cache:
36 |           keys:
37 |             - v1-myapp-mecab-ipadic-neologd-201809  # neologd更新時はこことsave_cacheの日付を変更してください。
38 |       - run:
39 |           name: Install neologd
40 |           command: |
41 |             if [[ ! -e /usr/local/lib/mecab/dic/mecab-ipadic-neologd/sys.dic ]]; then
42 |               cd ~
43 |               git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
44 |               cd mecab-ipadic-neologd
45 |               ./bin/install-mecab-ipadic-neologd -n -y
46 |             fi
47 |             sudo sed -i "s/ipadic$/mecab-ipadic-neologd/g" /usr/local/etc/mecabrc
48 |       - run:
49 |           name: Run tox
50 |           command: |
51 |             . env/bin/activate
52 |             pip install -U tox
53 |             tox
54 |       - run:
55 |           name: Run coveralls
56 |           command: |
57 |             . env/bin/activate
58 |             pip install -U coveralls
59 |             coveralls
60 |       - save_cache:
61 |           key: v1-myapp-{{ checksum "requirements.txt" }}
62 |           paths:
63 |             - ~/myapp/.tox
64 |             - ~/myapp/env
65 |       - save_cache:
66 |           key: v1-myapp-mecab-0.996
67 |           paths:
68 |             - ~/mecab-0.996
69 |       - save_cache:
70 |           key: v1-myapp-mecab-ipadic-neologd-201804
71 |           paths:
72 |             - /usr/local/lib/mecab/dic/mecab-ipadic-neologd
73 |       - store_artifacts:
74 |           path: .circle_artifacts
75 |       - store_test_results:
76 |           path: .circle_test_reposts/django
77 | 
78 | workflows:
79 |   version: 2
80 |   test:
81 |     jobs:
82 |       - test
83 | 


--------------------------------------------------------------------------------
/text_analysis/main/views.py:
--------------------------------------------------------------------------------
 1 | from django.http import HttpResponse, JsonResponse
 2 | 
 3 | from django.views.decorators.cache import cache_control
 4 | from django.views.decorators.http import require_http_methods
 5 | 
 6 | from main import mecab_utils
 7 | from main.forms import ReadingForm, ParseForm
 8 | 
 9 | 
@cache_control(max_age=86400)
@require_http_methods(["GET", "POST", "OPTIONS"])
def reading(request):
    """Return reading candidates for the submitted sentence as JSON.

    Parameters are read from the query string (GET) or the form body (POST)
    and validated with ``ReadingForm``.

    Returns:
        * 204 with an empty body for ``OPTIONS``.
        * 400 with ``{"error": {"code": "form_invalid", ...}}`` when the
          form does not validate.
        * Otherwise a JSON object with ``input_sentence`` and ``items``.
    """
    if request.method == "OPTIONS":
        # Empty string body, matching the ``parse`` view (previously a dict
        # was passed as the response content).
        return HttpResponse("", status=204)

    # ``require_http_methods`` leaves only GET and POST at this point.
    params = request.POST if request.method == "POST" else request.GET
    form = ReadingForm(params)
    if not form.is_valid():
        return JsonResponse(
            {"error": {"code": "form_invalid", "errors": form.errors}}, status=400)

    sentence = form.cleaned_data.get('sentence')
    nbest_num = form.cleaned_data.get('nbest_num')

    ret = {
        'input_sentence': sentence,
        'items': mecab_utils.reading_sentence(sentence, nbest_num),
    }

    # Compact separators and raw (non-escaped) Unicode keep the payload small.
    return JsonResponse(ret, json_dumps_params={'ensure_ascii': False, 'separators': (',', ':')})
32 | 
33 | 
@cache_control(max_age=86400)
@require_http_methods(["GET", "POST", "OPTIONS"])
def parse(request):
    """Return the parse result for the submitted sentence as JSON.

    ``OPTIONS`` is answered with an empty 204. For GET and POST the
    parameters are validated with ``ParseForm``; an invalid form yields a
    400 JSON error, a valid one a JSON object holding ``input_sentence``
    and ``items``.
    """
    method = request.method
    if method == "OPTIONS":
        return HttpResponse("", status=204)

    # ``require_http_methods`` leaves only GET and POST at this point.
    form = ParseForm(request.GET if method == "GET" else request.POST)
    if not form.is_valid():
        error_body = {"error": {"code": "form_invalid", "errors": form.errors}}
        return JsonResponse(error_body, status=400)

    cleaned = form.cleaned_data
    sentence = cleaned.get('sentence')
    nbest_num = cleaned.get('nbest_num')

    payload = {
        'input_sentence': sentence,
        'items': mecab_utils.parse_sentence(sentence, nbest_num),
    }
    dump_options = {'ensure_ascii': False, 'separators': (',', ':')}
    return JsonResponse(payload, json_dumps_params=dump_options)
56 | 
57 | 
def handler400(request, exception):
    """Return the JSON error body for HTTP 400 responses."""
    payload = {
        'error': {
            'code': 'bad_request',
            'message': "400 Bad Request",
        },
    }
    return JsonResponse(payload, status=400)
62 | 
63 | 
def handler403(request, exception):
    """Return the JSON error body for HTTP 403 responses."""
    payload = {
        'error': {
            'code': 'permission_denied',
            'message': "403 Permission Denied",
        },
    }
    return JsonResponse(payload, status=403)
68 | 
69 | 
def handler404(request, exception):
    """Return the JSON error body for HTTP 404 responses."""
    payload = {
        'error': {
            'code': 'not_found',
            'message': "404 Not Found",
        },
    }
    return JsonResponse(payload, status=404)
74 | 
75 | 
def handler500(request):
    """Return the JSON error body for HTTP 500 responses."""
    payload = {
        'error': {
            'code': 'server_error',
            'message': "500 Internal Server Error",
        },
    }
    return JsonResponse(payload, status=500)
80 | 


--------------------------------------------------------------------------------
/text_analysis/main/static/demo.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>mecab-web-api demo</title>
  6 |     <meta content='width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no' name='viewport'>
  7 |     <link href="//cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.4/css/bootstrap.min.css" rel="stylesheet">
  8 |     <style type="text/css">
  9 |     body {
 10 |       padding-top: 50px;
 11 |       padding-bottom: 20px;
 12 |     }
 13 |     </style>
 14 | </head>
 15 | <body>
 16 | <div class="container">
 17 | 
 18 | <form>
 19 |   <div class="form-group">
 20 |     <label for="inputText">解析文字列</label>
 21 |     <input type="text" class="form-control" id="inputText" placeholder="Enter text" value="文章を入力してください">
 22 |   </div>
 23 | </form>
 24 | 
 25 | <div id='table-box'>
 26 | </div>
 27 | 
 28 | </div>
 29 | 
 30 | <script src="//cdnjs.cloudflare.com/ajax/libs/jquery/2.1.3/jquery.min.js"></script>
 31 | <script src="//cdnjs.cloudflare.com/ajax/libs/moment.js/2.9.0/moment-with-locales.min.js"></script>
 32 | <script src="//cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.4/js/bootstrap.min.js"></script>
 33 | <script src="//cdnjs.cloudflare.com/ajax/libs/lodash.js/2.5.0/lodash.min.js"></script>
 34 | <script src="//cdnjs.cloudflare.com/ajax/libs/hogan.js/3.0.2/hogan.js"></script>
 35 | <script src="//cdnjs.cloudflare.com/ajax/libs/vue/0.11.5/vue.min.js"></script>
 36 | <script type="text/template" id="table-template">
 37 | <p>input: {{input_sentence}}</p>
 38 | <div v-repeat="item: items">
 39 | <p>nbest: {{$index+1}}件目</p>
 40 | <table class="table table-striped">
 41 |   <thead>
 42 |     <tr>
 43 |       <th>形態素種類</th>
 44 |       <th>表層形</th>
 45 |       <th>品詞</th>
 46 |       <th>品詞細分類1</th>
 47 |       <th>品詞細分類2</th>
 48 |       <th>品詞細分類3</th>
 49 |       <th>活用型</th>
 50 |       <th>活用形</th>
 51 |       <th>基本形</th>
 52 |       <th>読み</th>
 53 |       <th>発音</th>
 54 |       <th>IMEローマ字</th>
 55 |       <th>コスト</th>
 56 |     </tr>
 57 |   </thead>
 58 |   <tbody>
 59 |     <tr v-repeat="word: item.words">
 60 |       <td>{{word.morpheme}}</td>
 61 |       <td>{{word.surface}}</td>
 62 |       <td>{{word.pos}}</td>
 63 |       <td>{{word.pos_detail1}}</td>
 64 |       <td>{{word.pos_detail2}}</td>
 65 |       <td>{{word.pos_detail3}}</td>
 66 |       <td>{{word.conjugated_type}}</td>
 67 |       <td>{{word.conjugated_form}}</td>
 68 |       <td>{{word.baseform}}</td>
 69 |       <td>{{word.reading}}</td>
 70 |       <td>{{word.pronunciation}}</td>
 71 |       <td>{{word.ime_romaji}}</td>
 72 |       <td>{{word.cost}}</td>
 73 |     </tr>
 74 |   </tbody>
 75 | </table>
 76 | <hr>
 77 | </div>
 78 | </script>
 79 | 
 80 | <script>
// Vue instance that renders the result table; created on the first response.
var vue_table = null;
$(function(){
  'use strict';

  // Re-run the analysis on every edit of the input box, and once on load.
  $('#inputText').on('input', update);
  update();
});

/**
 * Send the current input text to the parse API and render the response.
 *
 * @param evt input event from the text box; not used by the function body.
 */
function update(evt){
  var api_url = '/text-analysis/v1/parse';
  var params = {sentence: $('#inputText').val()};
  $.getJSON(api_url, params).then(function(data){
    if(!vue_table){
      // First response: build the Vue instance from the inline template.
      var template = $('#table-template').html();
      vue_table = new Vue({
        el: '#table-box', data:data,
        template: template
      });
    }
    // Assign the latest response as the view's data (also runs right after creation).
    vue_table.$data = data;
  });
}
103 | </script>
104 | 
105 | </body>
106 | </html>
107 | 


--------------------------------------------------------------------------------
/text_analysis/text_analysis/settings/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Django settings for text_analysis project.
  3 | 
  4 | For more information on this file, see
  5 | https://docs.djangoproject.com/en/1.8/topics/settings/
  6 | 
  7 | For the full list of settings and their values, see
  8 | https://docs.djangoproject.com/en/1.8/ref/settings/
  9 | """
 10 | 
 11 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
 12 | import os
 13 | from os.path import dirname
 14 | BASE_DIR = dirname(dirname(dirname(os.path.abspath(__file__))))
 15 | 
 16 | # Quick-start development settings - unsuitable for production
 17 | # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/
 18 | 
 19 | # SECURITY WARNING: keep the secret key used in production secret!
 20 | SECRET_KEY = '9k7451dd#&6qdyha$iq=ikii7w8u=f@*orsb%f6ghrqlb!8%_p'
 21 | 
 22 | # SECURITY WARNING: don't run with debug turned on in production!
 23 | DEBUG = True
 24 | 
 25 | ALLOWED_HOSTS = []
 26 | 
 27 | 
 28 | # Application definition
 29 | 
 30 | INSTALLED_APPS = (
 31 |     # 'django.contrib.admin',
 32 |     # 'django.contrib.auth',
 33 |     # 'django.contrib.contenttypes',
 34 |     # 'django.contrib.sessions',
 35 |     # 'django.contrib.messages',
 36 |     'django.contrib.staticfiles',
 37 | 
 38 |     # Third-party applications
 39 |     'corsheaders',
 40 | 
 41 |     # Project applications
 42 |     'main',
 43 | )
 44 | 
 45 | MIDDLEWARE_CLASSES = (
 46 |     # 'django.contrib.sessions.middleware.SessionMiddleware',
 47 |     'corsheaders.middleware.CorsMiddleware',
 48 |     # 'django.middleware.common.CommonMiddleware',
 49 |     # 'django.middleware.csrf.CsrfViewMiddleware',
 50 |     # 'django.contrib.auth.middleware.AuthenticationMiddleware',
 51 |     # 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
 52 |     # 'django.contrib.messages.middleware.MessageMiddleware',
 53 |     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 54 |     'django.middleware.security.SecurityMiddleware',
 55 | )
 56 | 
 57 | ROOT_URLCONF = 'text_analysis.urls'
 58 | 
 59 | # TEMPLATES = [
 60 | #     {
 61 | #         'BACKEND': 'django.template.backends.django.DjangoTemplates',
 62 | #         'DIRS': [],
 63 | #         'APP_DIRS': True,
 64 | #         'OPTIONS': {
 65 | #             'context_processors': [
 66 | #                 'django.template.context_processors.debug',
 67 | #                 'django.template.context_processors.request',
 68 | #                 'django.contrib.auth.context_processors.auth',
 69 | #                 'django.contrib.messages.context_processors.messages',
 70 | #             ],
 71 | #         },
 72 | #     },
 73 | # ]
 74 | 
 75 | WSGI_APPLICATION = 'text_analysis.wsgi.application'
 76 | 
 77 | 
 78 | # Database
 79 | # https://docs.djangoproject.com/en/1.8/ref/settings/#databases
 80 | 
 81 | DATABASES = {
 82 |     'default': {
 83 |         'ENGINE': 'django.db.backends.sqlite3',
 84 |         'NAME': ':memory:',
 85 |     }
 86 | }
 87 | 
 88 | 
 89 | # Internationalization
 90 | # https://docs.djangoproject.com/en/1.8/topics/i18n/
 91 | 
 92 | LANGUAGE_CODE = 'ja'
 93 | 
 94 | TIME_ZONE = 'Asia/Tokyo'
 95 | 
 96 | USE_I18N = True
 97 | 
 98 | USE_L10N = True
 99 | 
100 | USE_TZ = True
101 | 
102 | 
103 | # Static files (CSS, JavaScript, Images)
104 | # https://docs.djangoproject.com/en/1.8/howto/static-files/
105 | 
106 | STATIC_URL = '/static/'
107 | STATIC_ROOT = os.path.join(BASE_DIR, 'staticfiles')
108 | 
109 | CORS_ORIGIN_ALLOW_ALL = True
110 | CORS_ALLOW_METHODS = (
111 |     'GET',
112 |     'POST',
113 |     'OPTIONS'
114 | )
115 | 


--------------------------------------------------------------------------------
/text_analysis/main/tests.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from django.urls import reverse
 3 | from django.test import TestCase
 4 | 
 5 | 
 6 | class TestReadingApi(TestCase):
 7 |     def _getTargetURL(self, *args, **kwargs):
 8 |         return reverse('reading', args=args, kwargs=kwargs)
 9 | 
10 |     def test_it(self):
11 |         res = self.client.get(self._getTargetURL(), {'sentence': '今日は良い天気ですね。'})
12 |         self.assertEqual(res.status_code, 200)
13 |         res_data = json.loads(res.content.decode())
14 |         self.assertEqual(res_data['items'][0]['reading'], 'キョウハヨイテンキデスネ。')
15 | 
16 |     def test_control_characters(self):
17 |         res = self.client.get(self._getTargetURL(), {'sentence': '今日は\r\nNLNL良い天気\vですね。'})
18 |         self.assertEqual(res.status_code, 200)
19 |         res_data = json.loads(res.content.decode())
20 |         self.assertEqual(res_data['items'][0]['reading'], 'キョウハnlnlヨイテンキデスネ。')
21 | 
22 | 
class TestParseApi(TestCase):
    """Request-level tests for the ``parse`` endpoint."""

    def _getTargetURL(self, *args, **kwargs):
        """Resolve the URL of the ``parse`` route."""
        return reverse('parse', args=args, kwargs=kwargs)

    def test_it(self):
        query = {'sentence': '今日は良い天気ですね。'}
        response = self.client.get(self._getTargetURL(), query)
        self.assertEqual(response.status_code, 200)
        payload = json.loads(response.content.decode())
        self.assertEqual(payload['items'][0]['all']['reading'], 'キョウハヨイテンキデスネ。')

    def test_control_characters(self):
        query = {'sentence': '今日は\r\nNLNL良い天気\vですね。'}
        response = self.client.get(self._getTargetURL(), query)
        self.assertEqual(response.status_code, 200)
        payload = json.loads(response.content.decode())
        best = payload['items'][0]['all']
        self.assertEqual(best['reading'], 'キョウハNLNLヨイテンキデスネ。')
        self.assertEqual(best['normalized'], '今日は\nNLNL良い天気ですね。')
39 | 
40 | 
class TestHandler400(TestCase):
    """Checks the JSON body and status produced by ``handler400``."""

    def _callFUT(self, request, exception):
        """Call the function under test, importing it at call time."""
        from main import views
        return views.handler400(request, exception)

    def test__it(self):
        response = self._callFUT('dummy request', "exception")
        payload = json.loads(response.content.decode())
        self.assertEqual(response.status_code, 400)
        self.assertEqual(payload['error']['code'], 'bad_request')
52 | 
53 | 
class TestHandler403(TestCase):
    """Checks the JSON body and status produced by ``handler403``."""

    def _callFUT(self, request, exception):
        """Call the function under test, importing it at call time."""
        from main import views
        return views.handler403(request, exception)

    def test__it(self):
        response = self._callFUT('dummy request', "exception")
        payload = json.loads(response.content.decode())
        self.assertEqual(response.status_code, 403)
        self.assertEqual(payload['error']['code'], 'permission_denied')
65 | 
66 | 
class TestHandler404(TestCase):
    """Checks the JSON body and status produced by ``handler404``."""

    def _callFUT(self, request, exception):
        """Call the function under test, importing it at call time."""
        from main import views
        return views.handler404(request, exception)

    def test__it(self):
        response = self._callFUT('dummy request', "exception")
        payload = json.loads(response.content.decode())
        self.assertEqual(response.status_code, 404)
        self.assertEqual(payload['error']['code'], 'not_found')
78 | 
79 | 
class TestHandler500(TestCase):
    """Checks the JSON body and status produced by ``handler500``."""

    def _callFUT(self, request):
        """Call the function under test, importing it at call time."""
        from main import views
        return views.handler500(request)

    def test__it(self):
        response = self._callFUT('dummy request')
        payload = json.loads(response.content.decode())
        self.assertEqual(response.status_code, 500)
        self.assertEqual(payload['error']['code'], 'server_error')
91 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # mecab-web-api
  2 | [![Circle CI](https://circleci.com/gh/bungoume/mecab-web-api.svg?style=shield)](https://circleci.com/gh/bungoume/mecab-web-api)
  3 | [![Coverage Status](https://img.shields.io/coveralls/bungoume/mecab-web-api.svg)](https://coveralls.io/r/bungoume/mecab-web-api)
  4 | [![Requirements Status](https://requires.io/github/bungoume/mecab-web-api/requirements.svg?branch=master)](https://requires.io/github/bungoume/mecab-web-api/requirements/?branch=master)
  5 | [![License](http://img.shields.io/:license-MIT-blue.svg)](http://doge.mit-license.org)
  6 | 
  7 | MeCabを利用した日本語形態素解析WebAPI
  8 | 
  9 | ![typography-icon](typography-icon.png)
 10 | 
 11 | 
 12 | ## Description
 13 | 
 14 | 文章を形態素に分割し、品詞や読みなどを取得できるJSON WebAPIを提供します。
 15 | 
 16 | 形態素解析にはMeCabを利用しており、MeCabの分かち書きコストの計算結果も取得できます。
 17 | 
 18 | 
 19 | ## HTTP API
 20 | ### Endpoints
 21 | ```
 22 | method: 
 23 |   GET or POST
 24 | path:
 25 |   /text-analysis/v1/parse
 26 |   /text-analysis/v1/reading
 27 |   or 
 28 |   /v1/parse
 29 |   /v1/reading
 30 | params:
 31 |   sentence=<string>
 32 |   nbest_num=<number> (not required)
 33 | ```
 34 | 
 35 | 
 36 | ## Demo
 37 | * https://mecab-web-api.herokuapp.com/v1/parse?sentence=すもももももももものうち&nbest_num=2
 38 | * https://mecab-web-api.herokuapp.com/text-analysis/v1/reading?sentence=今日は良い天気だ
 39 | 
 40 | 
 41 | ## Sample Response
 42 | ### parse API
 43 | ```
 44 | GET /text-analysis/v1/parse?sentence=エビフライ
 45 | or
 46 | GET /v1/parse?sentence=エビフライ
 47 | ```
 48 | 
 49 | Returns a JSON object like this:
 50 | 
 51 | ```json
 52 | {
 53 |     "items": [
 54 |         {
 55 |             "words": [
 56 |                 {
 57 |                     "word_cost": 4235,
 58 |                     "surface": "エビ",
 59 |                     "pos_detail1": "一般",
 60 |                     "pos": "名詞",
 61 |                     "conjugated_type": "",
 62 |                     "ime_romaji": "ebi",
 63 |                     "morpheme": "通常",
 64 |                     "conjugated_form": "",
 65 |                     "pos_detail3": "",
 66 |                     "c_cost": -283,
 67 |                     "pronunciation": "エビ",
 68 |                     "baseform": "エビ",
 69 |                     "reading": "エビ",
 70 |                     "pos_detail2": "",
 71 |                     "with_whitespace": false,
 72 |                     "cost": 3952
 73 |                 },
 74 |                 {
 75 |                     "word_cost": 3742,
 76 |                     "surface": "フライ",
 77 |                     "pos_detail1": "一般",
 78 |                     "pos": "名詞",
 79 |                     "conjugated_type": "",
 80 |                     "ime_romaji": "hurai",
 81 |                     "morpheme": "通常",
 82 |                     "conjugated_form": "",
 83 |                     "pos_detail3": "",
 84 |                     "c_cost": 62,
 85 |                     "pronunciation": "フライ",
 86 |                     "baseform": "フライ",
 87 |                     "reading": "フライ",
 88 |                     "pos_detail2": "",
 89 |                     "with_whitespace": false,
 90 |                     "cost": 3804
 91 |                 }
 92 |             ],
 93 |             "all": {
 94 |                 "cost": 7756,
 95 |                 "wakati": "エビ フライ",
 96 |                 "length": 5,
 97 |                 "wakati_reading": "エビ フライ",
 98 |                 "normalized": "エビフライ",
 99 |                 "ime_romaji": "ebihurai",
100 |                 "reading": "エビフライ"
101 |             }
102 |         },
103 |         {
104 |             "second cost analysis result"
105 |         },
106 |         "..."
107 |     ],
108 |     "input_sentence": "エビフライ"
109 | }
110 | ```
111 | 
112 | ### reading API
113 | 
114 | ```
115 | GET /text-analysis/v1/reading?sentence=今日は良い天気だ
116 | or
117 | GET /v1/reading?sentence=今日は良い天気だ
118 | ```
119 | 
120 | Returns a JSON object like this:
121 | 
122 | ```json
123 | {
124 |     "items": [
125 |         {
126 |             "ignore_all_romaji": "kiyouhayoitennkita",
127 |             "romaji": "kixyouhayoitennkida",
128 |             "ignore_kogaki_romaji": "kiyouhayoitennkida",
129 |             "ignore_soundmark_romaji": "kixyouhayoitennkita",
130 |             "qwerty_romaji": "kixyouhayoitennkida",
131 |             "reading": "キョウハヨイテンキダ"
132 |         },
133 |         {
134 |             "ignore_all_romaji": "konnnitihayoitennkita",
135 |             "romaji": "konnnitihayoitennkida",
136 |             "ignore_kogaki_romaji": "konnnitihayoitennkida",
137 |             "ignore_soundmark_romaji": "konnnitihayoitennkita",
138 |             "qwerty_romaji": "konnnitihayoitennkida",
139 |             "reading": "コンニチハヨイテンキダ"
140 |         },
141 |         "..."
142 |     ],
143 |     "input_sentence": "今日は良い天気だ"
144 | }
145 | ```
146 | 
147 | 
148 | ## Quick Start
149 | using Docker Hub
150 | 
151 | ```sh
152 | $ sudo docker run -d -p 8000:8000 bungoume/mecab-web-api
153 | ```
154 | 
155 | or build the container yourself
156 | 
157 | ```sh
158 | $ git clone https://github.com/bungoume/mecab-web-api.git
159 | $ sudo docker build -t mecab-web-api mecab-web-api
160 | $ sudo docker run -d -p 8000:8000 mecab-web-api
161 | ```
162 | 
163 | then, access http://localhost:8000/static/demo.html
164 | 
165 | 
166 | ## Licence
167 | 
168 | [MIT](http://doge.mit-license.org)
169 | 
170 | 
171 | ## Author
172 | 
173 | [bungoume](https://github.com/bungoume)
174 | 


--------------------------------------------------------------------------------
/text_analysis/main/mecab_utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import MeCab
  3 | import unicodedata
  4 | 
  5 | 
# RE_HIRAGANA = re.compile(r'[\u3040-\u309F]')
# Any character that is neither a word character (\w) nor a hyphen.
RE_NOWORD = re.compile(r'[^\w-]')
# The letters 'l' and 'x'; reading_sentence strips them from romaji to build
# its "ignore small kana" variants (small kana are spelled with an 'x' prefix
# in ROMAJI_DICT).
RE_LX = re.compile(r'[lx]')
# Any single character except a newline.
RE_ALL = re.compile(r'.')


# MeCab output-format options:
#   node: format of one morpheme (default: empty string)
#   unk:  format of one unknown-word morpheme (default: same format as node)
#   bos:  printed before the analysis result, header-like (default: empty string)
#   eos:  printed after the analysis result, footer-like (default: "EOS\n")
#   eon:  printed when N-best output has finished (default: empty string)
# Placeholders:
#   %s: morpheme type, %m: surface string of the morpheme,
#   %pS: whether the morpheme is preceded by whitespace,
#   %f[n]: n-th feature field --
#     0: part of speech, 1-3: part-of-speech subcategories 1-3,
#     4: conjugation type, 5: conjugation form, 6: base form,
#     7: reading, 8: pronunciation
#   %pw: word occurrence cost, %pC: connection cost with the previous morpheme
# Fields are joined with '\v' and each morpheme record ends with '\r\n';
# parse_sentence splits on exactly these separators. The unknown-word format
# reuses the surface (%m) in place of the reading and pronunciation fields.
M_PARSE = MeCab.Tagger('--node-format={0} --unk-format={1} --eos-format=EOS'.format(
    r'%s\v%m\v%pS\v%f[0]\v%f[1]\v%f[2]\v%f[3]\v%f[4]\v%f[5]'
    r'\v%f[6]\v%f[7]\v%f[8]\v%pw\v%pC\r\n',
    r'%s\v%m\v%pS\v%f[0]\v%f[1]\v%f[2]\v%f[3]\v%f[4]\v%f[5]'
    r'\v%f[6]\v%m\v%m\v%pw\v%pC\r\n',
))
# Tagger using MeCab's "yomi" output (-Oyomi); reading_sentence treats each
# output line as one candidate reading.
M_READING = MeCab.Tagger('-Oyomi')
 28 | 
 29 | 
 30 | def remove_mark(w):
 31 |     """英語・ハイフン以外のもの(句読点など)を除去
 32 |     """
 33 |     w = RE_NOWORD.sub('', w)
 34 |     return w
 35 | 
 36 | 
 37 | def remove_soundmark(w):
 38 |     """濁点・半濁点を削除
 39 |     """
 40 |     w = w.replace('g', 'k')
 41 |     w = w.replace('z', 's')
 42 |     w = w.replace('d', 't')
 43 |     w = w.replace('b', 'h')
 44 |     w = w.replace('p', 'h')
 45 |     return w
 46 | 
 47 | 
 48 | def qwerty_kana(w):
 49 |     """QWERTYキーボードのIME向けの予測処理をする
 50 |     """
 51 |     w = re.sub(r'([qrtypsdfghjlzxcvbm])\1', r'っ\1', w)
 52 |     w = re.sub(r'ji?', 'じ', w)
 53 |     w = re.sub(r'fu?', 'ふ', w)
 54 |     w = re.sub(r'ch?i?', 'ち', w)
 55 |     w = re.sub(r'qu?', 'く', w)
 56 |     w = re.sub(r'shi?', 'し', w)
 57 |     w = re.sub(r'tsu?', 'つ', w)
 58 |     w = w.replace('ky', 'きxy')
 59 |     w = w.replace('gy', 'ぎxy')
 60 |     w = w.replace('sy', 'しxy')
 61 |     w = w.replace('zy', 'じxy')
 62 |     w = w.replace('ty', 'ちxy')
 63 |     w = w.replace('dy', 'ぢxy')
 64 |     w = w.replace('ny', 'にxy')
 65 |     w = w.replace('hy', 'ひxy')
 66 |     w = w.replace('my', 'みxy')
 67 |     w = w.replace('ry', 'りxy')
 68 |     w = w.replace('nn', 'ん')
 69 |     w = re.sub(r'n([^aiueo])', r'ん\1', w)
 70 |     return w
 71 | 
 72 | 
# Per-character katakana -> romaji table used by to_romaji.
# Small kana are spelled with an 'x' prefix (e.g. 'ャ' -> 'xya', 'ッ' -> 'xtu')
# and 'ン' as 'nn'. A few punctuation marks and long-vowel / hyphen characters
# are mapped to ASCII as well.
ROMAJI_DICT = {
    'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o',
    'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke', 'コ': 'ko',
    'サ': 'sa', 'シ': 'si', 'ス': 'su', 'セ': 'se', 'ソ': 'so',
    'タ': 'ta', 'チ': 'ti', 'ツ': 'tu', 'テ': 'te', 'ト': 'to',
    'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no',
    'ハ': 'ha', 'ヒ': 'hi', 'フ': 'hu', 'ヘ': 'he', 'ホ': 'ho',
    'マ': 'ma', 'ミ': 'mi', 'ム': 'mu', 'メ': 'me', 'モ': 'mo',
    'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo',
    'ラ': 'ra', 'リ': 'ri', 'ル': 'ru', 'レ': 're', 'ロ': 'ro',
    'ワ': 'wa', 'ヰ': 'wi', 'ヱ': 'we', 'ヲ': 'wo',
    'ガ': 'ga', 'ギ': 'gi', 'グ': 'gu', 'ゲ': 'ge', 'ゴ': 'go',
    'ザ': 'za', 'ジ': 'zi', 'ズ': 'zu', 'ゼ': 'ze', 'ゾ': 'zo',
    'ダ': 'da', 'ヂ': 'di', 'ヅ': 'du', 'デ': 'de', 'ド': 'do',
    'バ': 'ba', 'ビ': 'bi', 'ブ': 'bu', 'ベ': 'be', 'ボ': 'bo',
    'パ': 'pa', 'ピ': 'pi', 'プ': 'pu', 'ペ': 'pe', 'ポ': 'po',
    'ヴ': 'vu',
    'ァ': 'xa', 'ィ': 'xi', 'ゥ': 'xu', 'ェ': 'xe', 'ォ': 'xo',
    'ッ': 'xtu',
    'ャ': 'xya', 'ュ': 'xyu', 'ョ': 'xyo',
    'ヮ': 'xwa',
    'ヶ': 'xke', 'ヵ': 'xka',
    'ン': 'nn',
    '、': ',', '。': '.', '・': ';',
    'ー': '-', '－': '-', '‐': '-',
}
 99 | 
100 | 
def to_romaji(w):
    """Convert the katakana in *w* to romaji, one character at a time.

    This is not conventional romanisation: every character is spelled the
    way it would be typed on its own in an IME, which makes prediction
    candidates easier to match.
    Characters without an entry in ``ROMAJI_DICT`` (hiragana included) are
    passed through unchanged.
    ref: http://developers.linecorp.com/blog/?p=367
    """
    return ''.join(ROMAJI_DICT.get(ch, ch) for ch in w)
117 | 
118 | 
def reading_sentence(sentence, nbest_num=10):
    """Return the distinct N-best readings of *sentence* with romaji variants.

    The sentence is NFKC-normalised and stripped of vertical tabs, carriage
    returns and newlines before it is handed to ``M_READING``. Each line of
    the N-best output is lower-cased and treated as one candidate reading;
    duplicates are skipped, keeping the first occurrence.

    Returns a list of dicts with the keys ``reading``, ``romaji``,
    ``qwerty_romaji``, ``ignore_soundmark_romaji``, ``ignore_kogaki_romaji``
    and ``ignore_all_romaji``.
    """
    cleaned = unicodedata.normalize('NFKC', sentence)
    for control_char in ('\v', '\r', '\n'):
        cleaned = cleaned.replace(control_char, '')

    raw_output = M_READING.parseNBest(nbest_num, cleaned)

    results = []
    seen_readings = set()
    for candidate in raw_output.strip().split('\n'):
        reading = candidate.lower()
        if reading in seen_readings:
            continue
        seen_readings.add(reading)

        romaji = remove_mark(to_romaji(reading))
        unvoiced = remove_soundmark(romaji)
        results.append({
            'reading': reading,
            'romaji': romaji,
            # Turn not-yet-converted input into hiragana
            # (side effect: English names can no longer be searched).
            'qwerty_romaji': to_romaji(qwerty_kana(reading)),
            # Voiced / semi-voiced marks removed.
            'ignore_soundmark_romaji': unvoiced,
            # Small kana treated the same as regular kana.
            'ignore_kogaki_romaji': RE_LX.sub('', romaji),
            'ignore_all_romaji': RE_LX.sub('', unvoiced),
        })

    return results
149 | 
150 | 
def parse_sentence(sentence, nbest_num=3):
    """Return the N-best morphological analyses of *sentence*.

    The sentence is NFKC-normalised and stripped of vertical tabs and
    carriage returns (the field / record separators of ``M_PARSE``) before
    parsing. The tagger output is split into one chunk per analysis and one
    '\\v'-separated record per morpheme.

    Returns a list with one dict per analysis: ``words`` holds the
    per-morpheme dicts and ``all`` holds sentence-level values (normalised
    text, length, total cost, reading, romaji and space-joined forms).
    """
    morpheme_types = {'0': '通常', '1': '未知語', '2': '文頭', '3': '文末'}

    def to_word(record):
        # One record = the 14 '\v'-separated fields emitted by M_PARSE.
        fields = record.split('\v')
        word_cost = int(fields[12])
        conn_cost = int(fields[13])
        return {
            'morpheme': morpheme_types[fields[0]],  # morpheme type
            'surface': fields[1],  # surface string of the morpheme
            'with_whitespace': bool(len(fields[2])),  # preceded by whitespace?
            'pos': fields[3],           # part of speech
            'pos_detail1': fields[4],   # part-of-speech subcategory 1
            'pos_detail2': fields[5],   # part-of-speech subcategory 2
            'pos_detail3': fields[6],   # part-of-speech subcategory 3
            'conjugated_type': fields[7],  # conjugation type
            'conjugated_form': fields[8],  # conjugation form
            'baseform': fields[9],         # base form
            'reading': fields[10],         # reading
            'pronunciation': fields[11],   # pronunciation
            'word_cost': word_cost,        # word occurrence cost
            'c_cost': conn_cost,           # connection cost with the previous morpheme
            'cost': word_cost+conn_cost,   # cost of this morpheme alone
            'ime_romaji': to_romaji(fields[10]).lower(),  # romaji
        }

    text = unicodedata.normalize('NFKC', sentence)
    text = text.replace('\v', '').replace('\r', '')

    tagger_output = M_PARSE.parseNBest(nbest_num, text)
    # Every analysis ends with "\r\nEOS"; the piece after the last one is dropped.
    analyses = tagger_output.strip().split('\r\nEOS')[:-1]

    results = []
    for analysis in analyses:
        words = [to_word(record) for record in analysis.strip().split('\r\n')]
        readings = [word['reading'] for word in words]
        romaji = to_romaji(''.join(readings)).lower()

        results.append({
            'all': {
                'normalized': text,
                'length': len(text),
                'cost': sum(word['cost'] for word in words),
                'reading': ''.join(readings),
                'ime_romaji': remove_mark(romaji),
                'wakati': ' '.join(word['surface'] for word in words),
                'wakati_reading': ' '.join(readings),
            },
            'words': words
        })
    return results
204 | 


--------------------------------------------------------------------------------