├── text_analysis ├── main │ ├── __init__.py │ ├── urls.py │ ├── forms.py │ ├── views.py │ ├── static │ │ └── demo.html │ ├── tests.py │ └── mecab_utils.py ├── text_analysis │ ├── __init__.py │ ├── settings │ │ ├── test.py │ │ ├── production.py │ │ └── __init__.py │ ├── urls.py │ └── wsgi.py └── manage.py ├── requirements.txt ├── typography-icon.png ├── .coveragerc ├── newrelic.ini ├── .github └── dependabot.yml ├── tox.ini ├── NOTICE ├── uwsgi.ini ├── .gitignore ├── LICENSE ├── Dockerfile ├── .circleci └── config.yml └── README.md /text_analysis/main/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /text_analysis/text_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Django==3.2.14 2 | django-cors-headers==3.13.0 3 | mecab-python3==0.7 4 | -------------------------------------------------------------------------------- /typography-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bungoume/mecab-web-api/HEAD/typography-icon.png -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | include = text_analysis/** 3 | omit = 4 | text_analysis/*/tests/** 5 | text_analysis/*/tests.py 6 | -------------------------------------------------------------------------------- /newrelic.ini: -------------------------------------------------------------------------------- 1 | [newrelic:development] 2 | app_name = Text Analysis API (development) 3 | 4 | [newrelic:production] 5 | app_name = Text Analysis API 6 | 
-------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: weekly 7 | time: "11:00" 8 | timezone: Asia/Tokyo 9 | open-pull-requests-limit: 10 10 | -------------------------------------------------------------------------------- /text_analysis/main/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import url 2 | 3 | from main import views 4 | 5 | urlpatterns = [ 6 | url(r'^parse$', views.parse, name='parse'), 7 | url(r'^reading$', views.reading, name='reading'), 8 | ] 9 | -------------------------------------------------------------------------------- /text_analysis/text_analysis/settings/test.py: -------------------------------------------------------------------------------- 1 | from text_analysis.settings import * # NOQA 2 | 3 | 4 | DATABASES = { 5 | 'default': { 6 | 'ENGINE': 'django.db.backends.sqlite3', 7 | 'NAME': ':memory:', 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /text_analysis/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /text_analysis/text_analysis/urls.py: -------------------------------------------------------------------------------- 1 | from django.conf.urls import include, url 2 | 3 | urlpatterns = [ 4 | url(r'^text-analysis/v1/', include('main.urls')), 5 | url(r'^v1/', include('main.urls')), 6 
| ] 7 | 8 | handler400 = 'main.views.handler400' 9 | handler403 = 'main.views.handler403' 10 | handler404 = 'main.views.handler404' 11 | handler500 = 'main.views.handler500' 12 | -------------------------------------------------------------------------------- /text_analysis/text_analysis/settings/production.py: -------------------------------------------------------------------------------- 1 | from text_analysis.settings import * # NOQA 2 | 3 | 4 | ALLOWED_HOSTS = ['*'] 5 | 6 | DEBUG = False 7 | 8 | SESSION_COOKIE_SECURE = True 9 | CSRF_COOKIE_SECURE = True 10 | 11 | 12 | ####################### 13 | # SECURITY MIDDLEWARE # 14 | ####################### 15 | SECURE_BROWSER_XSS_FILTER = True 16 | SECURE_CONTENT_TYPE_NOSNIFF = True 17 | SECURE_HSTS_SECONDS = 31536000 18 | SECURE_SSL_REDIRECT = True 19 | -------------------------------------------------------------------------------- /text_analysis/text_analysis/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for text_analysis project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.8/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "text_analysis.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37, flake8 3 | skipsdist = True 4 | setupdir = ./text_analysis/ 5 | [testenv:py37] 6 | deps = coverage 7 | testfixtures 8 | -rrequirements.txt 9 | setenv = DJANGO_SETTINGS_MODULE = text_analysis.settings.test 10 | commands = 11 | pip install -r requirements.txt 12 | coverage erase 13 | coverage run text_analysis/manage.py test text_analysis 14 | coverage report 15 | 16 | [testenv:flake8] 17 | basepython = python3.7 18 | deps = flake8 19 | commands = flake8 text_analysis 20 | 21 | [flake8] 22 | max-line-length = 99 23 | exclude = text_analysis/*/migrations/* 24 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | List of open-source software used/depended on this project: 2 | 3 | MeCab 4 | https://taku910.github.io/mecab/ 5 | Copyright (c) 2001-2008, Taku Kudo 6 | Copyright (c) 2004-2008, Nippon Telegraph and Telephone Corporation 7 | License: BSD (https://github.com/taku910/mecab/blob/master/mecab/BSD) 8 | 9 | Django 10 | https://www.djangoproject.com/ 11 | Copyright (c) Django Software Foundation and individual contributors. 
12 | License: BSD (https://github.com/django/django/blob/master/LICENSE) 13 | 14 | mecab-python3 15 | https://github.com/SamuraiT/mecab-python3 16 | License: BSD (https://github.com/SamuraiT/mecab-python3/blob/master/BSD) 17 | -------------------------------------------------------------------------------- /uwsgi.ini: -------------------------------------------------------------------------------- 1 | [uwsgi] 2 | strict=true 3 | chdir=/usr/src/app/text_analysis/ 4 | env=DJANGO_SETTINGS_MODULE=text_analysis.settings 5 | module=text_analysis.wsgi 6 | enable-threads=true 7 | single-interpreter=true 8 | master=true 9 | vacuum=true 10 | harakiri=60 11 | processes=3 12 | threads=3 13 | die-on-term=true 14 | threads-stacksize=2048 15 | reload-on-rss=320 16 | evil-reload-on-rss=384 17 | post-buffering=8192 18 | buffer-size=32768 19 | reuse-port=true 20 | thunder-lock=true 21 | 22 | py-tracebacker=/tmp/tbsocket. 23 | req-logger=file:/log/uwsgi-access.log 24 | logger=file:/log/uwsgi.log 25 | 26 | [development] 27 | ini=:uwsgi 28 | socket=0.0.0.0:8000 29 | env=DJANGO_SETTINGS_MODULE=text_analysis.settings.development 30 | 31 | [production] 32 | ini=:uwsgi 33 | socket=0.0.0.0:8000 34 | env=DJANGO_SETTINGS_MODULE=text_analysis.settings.production 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .cache 40 | nosetests.xml 41 | coverage.xml 42 | 43 | # Translations 44 | *.mo 45 | *.pot 46 | 47 | # Django stuff: 48 | *.log 49 | 50 | # Sphinx documentation 51 | docs/_build/ 52 | 53 | # PyBuilder 54 | target/ 55 | -------------------------------------------------------------------------------- /text_analysis/main/forms.py: -------------------------------------------------------------------------------- 1 | from django import forms 2 | from django.core import validators 3 | 4 | 5 | class ReadingForm(forms.Form): 6 | sentence = forms.CharField(required=False) 7 | nbest_num = forms.IntegerField(validators=[ 8 | validators.MinValueValidator(1), validators.MaxValueValidator(50)], required=False) 9 | 10 | def clean_sentence(self): 11 | return self.cleaned_data.get('sentence', '') 12 | 13 | def clean_nbest_num(self): 14 | nbest_num = self.cleaned_data.get('nbest_num') 15 | if nbest_num is None: 16 | return 10 17 | return nbest_num 18 | 19 | 20 | class ParseForm(forms.Form): 21 | sentence = forms.CharField(required=False) 22 | nbest_num = forms.IntegerField(validators=[ 23 | validators.MinValueValidator(1), validators.MaxValueValidator(50)], required=False) 24 | 25 | def clean_sentence(self): 26 | return self.cleaned_data.get('sentence', '') 27 | 28 | def clean_nbest_num(self): 29 | nbest_num = self.cleaned_data.get('nbest_num') 30 | if nbest_num is None: 31 | return 3 32 | return nbest_num 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Yuri UMEZAKI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the 
"Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7.0-alpine 2 | 3 | RUN mkdir -p /usr/src/app && mkdir /log && \ 4 | apk --no-cache --update add \ 5 | build-base \ 6 | linux-headers \ 7 | openssl \ 8 | libstdc++ \ 9 | bash \ 10 | curl \ 11 | file \ 12 | git \ 13 | ca-certificates && \ 14 | cd /tmp && \ 15 | wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE" && \ 16 | tar xvzf mecab-0.996.tar.gz && \ 17 | cd mecab-0.996 && \ 18 | ./configure --enable-utf8-only && \ 19 | make && make install && \ 20 | mkdir -p /usr/local/lib/mecab/dic && \ 21 | chmod 777 /usr/local/lib/mecab/dic && \ 22 | cd /tmp && \ 23 | git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git && \ 24 | cd mecab-ipadic-neologd && \ 25 | ./bin/install-mecab-ipadic-neologd -n -y && \ 26 | sed -i "s/ipadic$/mecab-ipadic-neologd/g" 
/usr/local/etc/mecabrc && \ 27 | pip install uWSGI mecab-python3==0.7 && \ 28 | apk del build-base linux-headers && \ 29 | rm -rf /tmp/* /var/tmp/* /var/cache/apk/* /root/.cache/pip/* 30 | 31 | WORKDIR /usr/src/app 32 | 33 | COPY requirements.txt /usr/src/app/ 34 | RUN pip install --no-cache-dir -r requirements.txt && \ 35 | rm -rf /tmp/* /var/tmp/* /root/.cache/pip/* 36 | 37 | COPY . /usr/src/app 38 | 39 | ENV DJANGO_SETTINGS_MODULE=text_analysis.settings.production 40 | 41 | RUN python text_analysis/manage.py collectstatic --noinput 42 | 43 | EXPOSE 8000 44 | 45 | ENV UWSGI_ENV production 46 | 47 | CMD ["uwsgi", "--ini", "uwsgi.ini:${UWSGI_ENV}"] 48 | 49 | # RUN pip install newrelic 50 | # ENV NEW_RELIC_ENVIRONMENT ${UWSGI_ENV} 51 | # ENV NEW_RELIC_LICENSE_KEY {{ YOUR_LICENSE_KEY }} 52 | # ENV NEW_RELIC_APP_NAME {{ THIS_APP_NAME }} 53 | # CMD ["newrelic-admin", "run-program", "uwsgi", "--ini", "uwsgi.ini:${UWSGI_ENV}"] 54 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | test: 4 | working_directory: ~/myapp 5 | docker: 6 | - image: circleci/python:3.7.0 7 | steps: 8 | - checkout 9 | - run: 10 | command: python -m venv env 11 | - restore_cache: 12 | keys: 13 | - v1-myapp-{{ checksum "requirements.txt" }} 14 | - restore_cache: 15 | keys: 16 | - v1-myapp-mecab-0.996 17 | - run: 18 | name: Install Mecab 19 | command: | 20 | cd ~ 21 | if [[ ! 
-e mecab-0.996/src/mecab ]]; then 22 | wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE" 23 | tar xvzf mecab-0.996.tar.gz 24 | rm -f mecab-0.996.tar.gz 25 | cd mecab-0.996 26 | ./configure --enable-utf8-only 27 | make 28 | else 29 | cd mecab-0.996 30 | fi 31 | sudo make install 32 | sudo ldconfig 33 | sudo mkdir -p /usr/local/lib/mecab/dic 34 | sudo chmod 777 /usr/local/lib/mecab/dic 35 | - restore_cache: 36 | keys: 37 | - v1-myapp-mecab-ipadic-neologd-201809 # neologd更新時はこことsave_cacheの日付を変更してください。 38 | - run: 39 | name: Install neologd 40 | command: | 41 | if [[ ! -e /usr/local/lib/mecab/dic/mecab-ipadic-neologd/sys.dic ]]; then 42 | cd ~ 43 | git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git 44 | cd mecab-ipadic-neologd 45 | ./bin/install-mecab-ipadic-neologd -n -y 46 | fi 47 | sudo sed -i "s/ipadic$/mecab-ipadic-neologd/g" /usr/local/etc/mecabrc 48 | - run: 49 | name: Run tox 50 | command: | 51 | . env/bin/activate 52 | pip install -U tox 53 | tox 54 | - run: 55 | name: Run coveralls 56 | command: | 57 | . 
env/bin/activate 58 | pip install -U coveralls 59 | coveralls 60 | - save_cache: 61 | key: v1-myapp-{{ checksum "requirements.txt" }} 62 | paths: 63 | - ~/myapp/.tox 64 | - ~/myapp/env 65 | - save_cache: 66 | key: v1-myapp-mecab-0.996 67 | paths: 68 | - ~/mecab-0.996 69 | - save_cache: 70 | key: v1-myapp-mecab-ipadic-neologd-201804 71 | paths: 72 | - /usr/local/lib/mecab/dic/mecab-ipadic-neologd 73 | - store_artifacts: 74 | path: .circle_artifacts 75 | - store_test_results: 76 | path: .circle_test_reposts/django 77 | 78 | workflows: 79 | version: 2 80 | test: 81 | jobs: 82 | - test 83 | -------------------------------------------------------------------------------- /text_analysis/main/views.py: -------------------------------------------------------------------------------- 1 | from django.http import HttpResponse, JsonResponse 2 | 3 | from django.views.decorators.cache import cache_control 4 | from django.views.decorators.http import require_http_methods 5 | 6 | from main import mecab_utils 7 | from main.forms import ReadingForm, ParseForm 8 | 9 | 10 | @cache_control(max_age=86400) 11 | @require_http_methods(["GET", "POST", "OPTIONS"]) 12 | def reading(request): 13 | if request.method == "GET": 14 | form = ReadingForm(request.GET) 15 | elif request.method == "POST": 16 | form = ReadingForm(request.POST) 17 | elif request.method == "OPTIONS": 18 | return HttpResponse({}, status=204) 19 | if not form.is_valid(): 20 | return JsonResponse( 21 | {"error": {"code": "form_invalid", "errors": form.errors}}, status=400) 22 | 23 | sentence = form.cleaned_data.get('sentence') 24 | nbest_num = form.cleaned_data.get('nbest_num') 25 | 26 | ret = { 27 | 'input_sentence': sentence, 28 | 'items': mecab_utils.reading_sentence(sentence, nbest_num), 29 | } 30 | 31 | return JsonResponse(ret, json_dumps_params={'ensure_ascii': False, 'separators': (',', ':')}) 32 | 33 | 34 | @cache_control(max_age=86400) 35 | @require_http_methods(["GET", "POST", "OPTIONS"]) 36 | def parse(request): 
37 | if request.method == "GET": 38 | form = ParseForm(request.GET) 39 | elif request.method == "POST": 40 | form = ParseForm(request.POST) 41 | elif request.method == "OPTIONS": 42 | return HttpResponse("", status=204) 43 | if not form.is_valid(): 44 | return JsonResponse( 45 | {"error": {"code": "form_invalid", "errors": form.errors}}, status=400) 46 | 47 | sentence = form.cleaned_data.get('sentence') 48 | nbest_num = form.cleaned_data.get('nbest_num') 49 | 50 | ret = { 51 | 'input_sentence': sentence, 52 | 'items': mecab_utils.parse_sentence(sentence, nbest_num), 53 | } 54 | 55 | return JsonResponse(ret, json_dumps_params={'ensure_ascii': False, 'separators': (',', ':')}) 56 | 57 | 58 | def handler400(request, exception): 59 | res = JsonResponse({'error': {'code': 'bad_request', 60 | 'message': "400 Bad Request"}}, status=400) 61 | return res 62 | 63 | 64 | def handler403(request, exception): 65 | res = JsonResponse({'error': {'code': 'permission_denied', 66 | 'message': "403 Permission Denied"}}, status=403) 67 | return res 68 | 69 | 70 | def handler404(request, exception): 71 | res = JsonResponse({'error': {'code': 'not_found', 72 | 'message': "404 Not Found"}}, status=404) 73 | return res 74 | 75 | 76 | def handler500(request): 77 | res = JsonResponse({'error': {'code': 'server_error', 78 | 'message': "500 Internal Server Error"}}, status=500) 79 | return res 80 | -------------------------------------------------------------------------------- /text_analysis/main/static/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mecab-web-api demo 6 | 7 | 8 | 14 | 15 | 16 |
17 | 18 |
19 |
20 | 21 | 22 |
23 |
24 | 25 |
26 |
27 | 28 |
29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 79 | 80 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /text_analysis/text_analysis/settings/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for text_analysis project. 3 | 4 | For more information on this file, see 5 | https://docs.djangoproject.com/en/1.8/topics/settings/ 6 | 7 | For the full list of settings and their values, see 8 | https://docs.djangoproject.com/en/1.8/ref/settings/ 9 | """ 10 | 11 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 12 | import os 13 | from os.path import dirname 14 | BASE_DIR = dirname(dirname(dirname(os.path.abspath(__file__)))) 15 | 16 | # Quick-start development settings - unsuitable for production 17 | # See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/ 18 | 19 | # SECURITY WARNING: keep the secret key used in production secret! 20 | SECRET_KEY = '9k7451dd#&6qdyha$iq=ikii7w8u=f@*orsb%f6ghrqlb!8%_p' 21 | 22 | # SECURITY WARNING: don't run with debug turned on in production! 
23 | DEBUG = True 24 | 25 | ALLOWED_HOSTS = [] 26 | 27 | 28 | # Application definition 29 | 30 | INSTALLED_APPS = ( 31 | # 'django.contrib.admin', 32 | # 'django.contrib.auth', 33 | # 'django.contrib.contenttypes', 34 | # 'django.contrib.sessions', 35 | # 'django.contrib.messages', 36 | 'django.contrib.staticfiles', 37 | 38 | # Third-party applications 39 | 'corsheaders', 40 | 41 | # Project applications 42 | 'main', 43 | ) 44 | 45 | MIDDLEWARE_CLASSES = ( 46 | # 'django.contrib.sessions.middleware.SessionMiddleware', 47 | 'corsheaders.middleware.CorsMiddleware', 48 | # 'django.middleware.common.CommonMiddleware', 49 | # 'django.middleware.csrf.CsrfViewMiddleware', 50 | # 'django.contrib.auth.middleware.AuthenticationMiddleware', 51 | # 'django.contrib.auth.middleware.SessionAuthenticationMiddleware', 52 | # 'django.contrib.messages.middleware.MessageMiddleware', 53 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 54 | 'django.middleware.security.SecurityMiddleware', 55 | ) 56 | 57 | ROOT_URLCONF = 'text_analysis.urls' 58 | 59 | # TEMPLATES = [ 60 | # { 61 | # 'BACKEND': 'django.template.backends.django.DjangoTemplates', 62 | # 'DIRS': [], 63 | # 'APP_DIRS': True, 64 | # 'OPTIONS': { 65 | # 'context_processors': [ 66 | # 'django.template.context_processors.debug', 67 | # 'django.template.context_processors.request', 68 | # 'django.contrib.auth.context_processors.auth', 69 | # 'django.contrib.messages.context_processors.messages', 70 | # ], 71 | # }, 72 | # }, 73 | # ] 74 | 75 | WSGI_APPLICATION = 'text_analysis.wsgi.application' 76 | 77 | 78 | # Database 79 | # https://docs.djangoproject.com/en/1.8/ref/settings/#databases 80 | 81 | DATABASES = { 82 | 'default': { 83 | 'ENGINE': 'django.db.backends.sqlite3', 84 | 'NAME': ':memory:', 85 | } 86 | } 87 | 88 | 89 | # Internationalization 90 | # https://docs.djangoproject.com/en/1.8/topics/i18n/ 91 | 92 | LANGUAGE_CODE = 'ja' 93 | 94 | TIME_ZONE = 'Asia/Tokyo' 95 | 96 | USE_I18N = True 97 | 98 | USE_L10N = 
True 99 | 100 | USE_TZ = True 101 | 102 | 103 | # Static files (CSS, JavaScript, Images) 104 | # https://docs.djangoproject.com/en/1.8/howto/static-files/ 105 | 106 | STATIC_URL = '/static/' 107 | STATIC_ROOT = os.path.join(BASE_DIR, 'staticfiles') 108 | 109 | CORS_ORIGIN_ALLOW_ALL = True 110 | CORS_ALLOW_METHODS = ( 111 | 'GET', 112 | 'POST', 113 | 'OPTIONS' 114 | ) 115 | -------------------------------------------------------------------------------- /text_analysis/main/tests.py: -------------------------------------------------------------------------------- 1 | import json 2 | from django.urls import reverse 3 | from django.test import TestCase 4 | 5 | 6 | class TestReadingApi(TestCase): 7 | def _getTargetURL(self, *args, **kwargs): 8 | return reverse('reading', args=args, kwargs=kwargs) 9 | 10 | def test_it(self): 11 | res = self.client.get(self._getTargetURL(), {'sentence': '今日は良い天気ですね。'}) 12 | self.assertEqual(res.status_code, 200) 13 | res_data = json.loads(res.content.decode()) 14 | self.assertEqual(res_data['items'][0]['reading'], 'キョウハヨイテンキデスネ。') 15 | 16 | def test_control_characters(self): 17 | res = self.client.get(self._getTargetURL(), {'sentence': '今日は\r\nNLNL良い天気\vですね。'}) 18 | self.assertEqual(res.status_code, 200) 19 | res_data = json.loads(res.content.decode()) 20 | self.assertEqual(res_data['items'][0]['reading'], 'キョウハnlnlヨイテンキデスネ。') 21 | 22 | 23 | class TestParseApi(TestCase): 24 | def _getTargetURL(self, *args, **kwargs): 25 | return reverse('parse', args=args, kwargs=kwargs) 26 | 27 | def test_it(self): 28 | res = self.client.get(self._getTargetURL(), {'sentence': '今日は良い天気ですね。'}) 29 | self.assertEqual(res.status_code, 200) 30 | res_data = json.loads(res.content.decode()) 31 | self.assertEqual(res_data['items'][0]['all']['reading'], 'キョウハヨイテンキデスネ。') 32 | 33 | def test_control_characters(self): 34 | res = self.client.get(self._getTargetURL(), {'sentence': '今日は\r\nNLNL良い天気\vですね。'}) 35 | self.assertEqual(res.status_code, 200) 36 | res_data = 
json.loads(res.content.decode()) 37 | self.assertEqual(res_data['items'][0]['all']['reading'], 'キョウハNLNLヨイテンキデスネ。') 38 | self.assertEqual(res_data['items'][0]['all']['normalized'], '今日は\nNLNL良い天気ですね。') 39 | 40 | 41 | class TestHandler400(TestCase): 42 | def _callFUT(self, request, exception): 43 | from main.views import handler400 44 | return handler400(request, exception) 45 | 46 | def test__it(self): 47 | import json 48 | res = self._callFUT('dummy request', "exception") 49 | body = json.loads(res.content.decode()) 50 | self.assertEqual(res.status_code, 400) 51 | self.assertEqual(body['error']['code'], 'bad_request') 52 | 53 | 54 | class TestHandler403(TestCase): 55 | def _callFUT(self, request, exception): 56 | from main.views import handler403 57 | return handler403(request, exception) 58 | 59 | def test__it(self): 60 | import json 61 | res = self._callFUT('dummy request', "exception") 62 | body = json.loads(res.content.decode()) 63 | self.assertEqual(res.status_code, 403) 64 | self.assertEqual(body['error']['code'], 'permission_denied') 65 | 66 | 67 | class TestHandler404(TestCase): 68 | def _callFUT(self, request, exception): 69 | from main.views import handler404 70 | return handler404(request, exception) 71 | 72 | def test__it(self): 73 | import json 74 | res = self._callFUT('dummy request', "exception") 75 | body = json.loads(res.content.decode()) 76 | self.assertEqual(res.status_code, 404) 77 | self.assertEqual(body['error']['code'], 'not_found') 78 | 79 | 80 | class TestHandler500(TestCase): 81 | def _callFUT(self, request): 82 | from main.views import handler500 83 | return handler500(request) 84 | 85 | def test__it(self): 86 | import json 87 | res = self._callFUT('dummy request') 88 | body = json.loads(res.content.decode()) 89 | self.assertEqual(res.status_code, 500) 90 | self.assertEqual(body['error']['code'], 'server_error') 91 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # mecab-web-api 2 | [![Circle CI](https://circleci.com/gh/bungoume/mecab-web-api.svg?style=shield)](https://circleci.com/gh/bungoume/mecab-web-api) 3 | [![Coverage Status](https://img.shields.io/coveralls/bungoume/mecab-web-api.svg)](https://coveralls.io/r/bungoume/mecab-web-api) 4 | [![Requirements Status](https://requires.io/github/bungoume/mecab-web-api/requirements.svg?branch=master)](https://requires.io/github/bungoume/mecab-web-api/requirements/?branch=master) 5 | [![License](http://img.shields.io/:license-MIT-blue.svg)](http://doge.mit-license.org) 6 | 7 | MeCabを利用した日本語形態素解析WebAPI 8 | 9 | ![typography-icon](typography-icon.png) 10 | 11 | 12 | ## Description 13 | 14 | 文章を形態素に分割し、品詞や読みなどを取得できるJSON WebAPIを提供します。 15 | 16 | 形態素解析にはMeCabを利用しており、MeCabの分かち書きコストの計算結果も取得できます。 17 | 18 | 19 | ## HTTP API 20 | ### Endpoints 21 | ``` 22 | method: 23 | GET or POST 24 | path: 25 | /text-analysis/v1/parse 26 | /text-analysis/v1/reading 27 | or 28 | /v1/parse 29 | /v1/reading 30 | params: 31 | sentence= 32 | nbest_num= (not required) 33 | ``` 34 | 35 | 36 | ## Demo 37 | * https://mecab-web-api.herokuapp.com/v1/parse?sentence=すもももももももものうち&nbest_num=2 38 | * https://mecab-web-api.herokuapp.com/text-analysis/v1/reading?sentence=今日は良い天気だ 39 | 40 | 41 | ## Sample Response 42 | ### parse API 43 | ``` 44 | GET /text-analysis/v1/parse?sentence=エビフライ 45 | or 46 | GET /v1/parse?sentence=エビフライ 47 | ``` 48 | 49 | Returns a JSON object like this: 50 | 51 | ```json 52 | { 53 | "items": [ 54 | { 55 | "words": [ 56 | { 57 | "word_cost": 4235, 58 | "surface": "エビ", 59 | "pos_detail1": "一般", 60 | "pos": "名詞", 61 | "conjugated_type": "", 62 | "ime_romaji": "ebi", 63 | "morpheme": "通常", 64 | "conjugated_form": "", 65 | "pos_detail3": "", 66 | "c_cost": -283, 67 | "pronunciation": "エビ", 68 | "baseform": "エビ", 69 | "reading": "エビ", 70 | "pos_detail2": "", 71 | "with_whitespace": false, 72 | "cost": 3952 73 | }, 74 | 
{ 75 | "word_cost": 3742, 76 | "surface": "フライ", 77 | "pos_detail1": "一般", 78 | "pos": "名詞", 79 | "conjugated_type": "", 80 | "ime_romaji": "hurai", 81 | "morpheme": "通常", 82 | "conjugated_form": "", 83 | "pos_detail3": "", 84 | "c_cost": 62, 85 | "pronunciation": "フライ", 86 | "baseform": "フライ", 87 | "reading": "フライ", 88 | "pos_detail2": "", 89 | "with_whitespace": false, 90 | "cost": 3804 91 | } 92 | ], 93 | "all": { 94 | "cost": 7756, 95 | "wakati": "エビ フライ", 96 | "length": 5, 97 | "wakati_reading": "エビ フライ", 98 | "normalized": "エビフライ", 99 | "ime_romaji": "ebihurai", 100 | "reading": "エビフライ" 101 | } 102 | }, 103 | { 104 | "second cost analysis result" 105 | }, 106 | "..." 107 | ], 108 | "input_sentence": "エビフライ" 109 | } 110 | ``` 111 | 112 | ### reading API 113 | 114 | ``` 115 | GET /text-analysis/v1/reading?sentence=今日は良い天気だ 116 | or 117 | GET /v1/reading?sentence=今日は良い天気だ 118 | ``` 119 | 120 | Returns a JSON object like this: 121 | 122 | ```json 123 | { 124 | "items": [ 125 | { 126 | "ignore_all_romaji": "kiyouhayoitennkita", 127 | "romaji": "kixyouhayoitennkida", 128 | "ignore_kogaki_romaji": "kiyouhayoitennkida", 129 | "ignore_soundmark_romaji": "kixyouhayoitennkita", 130 | "qwerty_romaji": "kixyouhayoitennkida", 131 | "reading": "キョウハヨイテンキダ" 132 | }, 133 | { 134 | "ignore_all_romaji": "konnnitihayoitennkita", 135 | "romaji": "konnnitihayoitennkida", 136 | "ignore_kogaki_romaji": "konnnitihayoitennkida", 137 | "ignore_soundmark_romaji": "konnnitihayoitennkita", 138 | "qwerty_romaji": "konnnitihayoitennkida", 139 | "reading": "コンニチハヨイテンキダ" 140 | }, 141 | "..." 
"""MeCab helpers: morphological parsing and IME-oriented romaji readings."""
import re
import unicodedata

import MeCab


# Anything that is neither a (Unicode) word character nor a hyphen.
RE_NOWORD = re.compile(r'[^\w-]')
# 'x' is the prefix ROMAJI_DICT uses for small kana ('xtu', 'xya', ...);
# 'l' is stripped as well.
RE_LX = re.compile(r'[lx]')
# Kept as a public module-level name; no longer used inside this module.
RE_ALL = re.compile(r'.')


# MeCab output-format options:
#   node: format for one morpheme (default: empty string)
#   unk:  format for one unknown-word morpheme (default: same as node)
#   bos:  emitted before the analysis result, header-like (default: empty)
#   eos:  emitted after the analysis result, footer-like (default: "EOS\n")
#   eon:  emitted when N-best output has finished (default: empty string)
# Placeholders:
#   %s: morpheme type, %m: surface string, %pS: leading whitespace (if any)
#   %f[n]: 0-3: part of speech and its sub-categories, 4: conjugation type,
#          5: conjugation form, 6: base form, 7: reading, 8: pronunciation
#   %pw: word cost, %pC: connection cost with the previous morpheme
# Fields are separated by \v and morphemes by \r\n so they can be split
# unambiguously (parse_sentence strips \v and \r from its input first).
# The unknown-word format substitutes the surface (%m) for reading and
# pronunciation.
M_PARSE = MeCab.Tagger('--node-format={0} --unk-format={1} --eos-format=EOS'.format(
    r'%s\v%m\v%pS\v%f[0]\v%f[1]\v%f[2]\v%f[3]\v%f[4]\v%f[5]'
    r'\v%f[6]\v%f[7]\v%f[8]\v%pw\v%pC\r\n',
    r'%s\v%m\v%pS\v%f[0]\v%f[1]\v%f[2]\v%f[3]\v%f[4]\v%f[5]'
    r'\v%f[6]\v%m\v%m\v%pw\v%pC\r\n',
))
M_READING = MeCab.Tagger('-Oyomi')

# Voiced / semi-voiced romaji consonants -> their unvoiced counterparts.
_SOUNDMARK_TABLE = str.maketrans('gzdbp', 'ksthh')

# Ordered regex rewrites applied by qwerty_kana(); order is significant.
# The doubled-consonant class does not contain 'k', 'n' or 'w', so e.g. 'kk'
# is left untouched.
# NOTE(review): the missing 'k' may be unintentional - confirm before changing.
_QWERTY_REGEX_RULES = tuple(
    (re.compile(pattern), repl) for pattern, repl in (
        (r'([qrtypsdfghjlzxcvbm])\1', r'っ\1'),
        (r'ji?', 'じ'),
        (r'fu?', 'ふ'),
        (r'ch?i?', 'ち'),
        (r'qu?', 'く'),
        (r'shi?', 'し'),
        (r'tsu?', 'つ'),
    )
)

# Consonant + 'y' digraphs -> i-row kana followed by the small-kana prefix.
_QWERTY_YOON_RULES = (
    ('ky', 'きxy'),
    ('gy', 'ぎxy'),
    ('sy', 'しxy'),
    ('zy', 'じxy'),
    ('ty', 'ちxy'),
    ('dy', 'ぢxy'),
    ('ny', 'にxy'),
    ('hy', 'ひxy'),
    ('my', 'みxy'),
    ('ry', 'りxy'),
)

# An 'n' followed by anything other than a/i/u/e/o becomes 'ん'.
_RE_N_BEFORE_NON_VOWEL = re.compile(r'n([^aiueo])')

# Labels for MeCab's %s (morpheme type) field.
_MORPHEME_TYPE = {'0': '通常', '1': '未知語', '2': '文頭', '3': '文末'}


def remove_mark(w):
    """Strip every character that is not a word character or a hyphen.

    Removes punctuation and similar marks; ``\\w`` characters and ``-`` are
    kept.
    """
    return RE_NOWORD.sub('', w)


def remove_soundmark(w):
    """Drop dakuten / handakuten from a romaji string.

    Maps g->k, z->s, d->t, b->h and p->h; all other characters are unchanged.
    """
    return w.translate(_SOUNDMARK_TABLE)


def qwerty_kana(w):
    """Rewrite partially typed QWERTY romaji for IME-style prediction.

    Applies, in order: doubled consonant -> 'っ' + consonant; a handful of
    Hepburn-style spellings (ji, fu, chi, qu, shi, tsu) -> hiragana;
    consonant + 'y' -> i-row hiragana + 'xy'; 'nn' -> 'ん'; and 'n' before
    anything other than a, i, u, e, o -> 'ん'.

    :param w: lower-case input string.
    :return: the rewritten string (a mix of hiragana and latin letters).
    """
    for pattern, repl in _QWERTY_REGEX_RULES:
        w = pattern.sub(repl, w)
    for old, new in _QWERTY_YOON_RULES:
        w = w.replace(old, new)
    w = w.replace('nn', 'ん')
    return _RE_N_BEFORE_NON_VOWEL.sub(r'ん\1', w)


ROMAJI_DICT = {
    'ア': 'a', 'イ': 'i', 'ウ': 'u', 'エ': 'e', 'オ': 'o',
    'カ': 'ka', 'キ': 'ki', 'ク': 'ku', 'ケ': 'ke', 'コ': 'ko',
    'サ': 'sa', 'シ': 'si', 'ス': 'su', 'セ': 'se', 'ソ': 'so',
    'タ': 'ta', 'チ': 'ti', 'ツ': 'tu', 'テ': 'te', 'ト': 'to',
    'ナ': 'na', 'ニ': 'ni', 'ヌ': 'nu', 'ネ': 'ne', 'ノ': 'no',
    'ハ': 'ha', 'ヒ': 'hi', 'フ': 'hu', 'ヘ': 'he', 'ホ': 'ho',
    'マ': 'ma', 'ミ': 'mi', 'ム': 'mu', 'メ': 'me', 'モ': 'mo',
    'ヤ': 'ya', 'ユ': 'yu', 'ヨ': 'yo',
    'ラ': 'ra', 'リ': 'ri', 'ル': 'ru', 'レ': 're', 'ロ': 'ro',
    'ワ': 'wa', 'ヰ': 'wi', 'ヱ': 'we', 'ヲ': 'wo',
    'ガ': 'ga', 'ギ': 'gi', 'グ': 'gu', 'ゲ': 'ge', 'ゴ': 'go',
    'ザ': 'za', 'ジ': 'zi', 'ズ': 'zu', 'ゼ': 'ze', 'ゾ': 'zo',
    'ダ': 'da', 'ヂ': 'di', 'ヅ': 'du', 'デ': 'de', 'ド': 'do',
    'バ': 'ba', 'ビ': 'bi', 'ブ': 'bu', 'ベ': 'be', 'ボ': 'bo',
    'パ': 'pa', 'ピ': 'pi', 'プ': 'pu', 'ペ': 'pe', 'ポ': 'po',
    'ヴ': 'vu',
    'ァ': 'xa', 'ィ': 'xi', 'ゥ': 'xu', 'ェ': 'xe', 'ォ': 'xo',
    'ッ': 'xtu',
    'ャ': 'xya', 'ュ': 'xyu', 'ョ': 'xyo',
    'ヮ': 'xwa',
    'ヶ': 'xke', 'ヵ': 'xka',
    'ン': 'nn',
    '、': ',', '。': '.', '・': ';',
    'ー': '-', '-': '-', '‐': '-',
}


def to_romaji(w):
    """Convert katakana to romaji, one character at a time.

    This is not conventional romanisation: each kana maps to the key
    sequence that types that single character in an IME (e.g. small kana
    get an 'x' prefix), which makes prefix-based prediction easier.
    ref: http://developers.linecorp.com/blog/?p=367

    Only characters present in ROMAJI_DICT are converted; every other
    character - including hiragana - is returned unchanged.

    :param w: input string.
    :return: string with each mapped character replaced by its romaji.
    """
    # NOTE(review): the original carried a commented-out hiragana branch
    # (RE_HIRAGANA = [\u3040-\u309F], c = chr(ord(c) + 96)) that would shift
    # hiragana to katakana before the lookup. It is disabled, so hiragana
    # produced by qwerty_kana() passes through as-is - confirm this is
    # intended.
    return ''.join(ROMAJI_DICT.get(c, c) for c in w)


def reading_sentence(sentence, nbest_num=10):
    """Return de-duplicated N-best readings of *sentence* with romaji variants.

    The sentence is NFKC-normalised and stripped of \\v, \\r and \\n before
    being handed to MeCab's ``-Oyomi`` tagger.

    :param sentence: text to read.
    :param nbest_num: number of N-best candidates requested from MeCab.
    :return: list of dicts (first occurrence of each distinct lower-cased
        reading, in MeCab's order) with keys ``reading``, ``romaji``,
        ``qwerty_romaji``, ``ignore_soundmark_romaji``,
        ``ignore_kogaki_romaji`` and ``ignore_all_romaji``.
    """
    sentence = unicodedata.normalize('NFKC', sentence)
    for ch in ('\v', '\r', '\n'):
        sentence = sentence.replace(ch, '')

    parsed_text = M_READING.parseNBest(nbest_num, sentence)

    ans_list = []
    seen = set()  # readings already emitted; O(1) duplicate check
    for reading in parsed_text.strip().split('\n'):
        reading = reading.lower()
        if reading in seen:
            continue
        seen.add(reading)

        roma = remove_mark(to_romaji(reading))
        no_soundmark = remove_soundmark(roma)
        ans_list.append({
            'reading': reading,
            'romaji': roma,
            # Turn text typed before IME conversion into kana
            # (side effect: English names can no longer be searched).
            'qwerty_romaji': to_romaji(qwerty_kana(reading)),
            # Dakuten / handakuten removed.
            'ignore_soundmark_romaji': no_soundmark,
            # Small kana treated the same as regular kana.
            'ignore_kogaki_romaji': RE_LX.sub('', roma),
            'ignore_all_romaji': RE_LX.sub('', no_soundmark),
        })

    return ans_list


def _parse_line(line):
    """Turn one \\v-separated M_PARSE output line into a morpheme dict."""
    x = line.split('\v')
    w_cost = int(x[12])
    c_cost = int(x[13])
    return {
        'morpheme': _MORPHEME_TYPE[x[0]],  # morpheme type
        'surface': x[1],  # surface string
        'with_whitespace': bool(x[2]),  # preceded by whitespace?
        'pos': x[3],  # part of speech
        'pos_detail1': x[4],  # part-of-speech sub-category 1
        'pos_detail2': x[5],  # part-of-speech sub-category 2
        'pos_detail3': x[6],  # part-of-speech sub-category 3
        'conjugated_type': x[7],  # conjugation type
        'conjugated_form': x[8],  # conjugation form
        'baseform': x[9],  # base form
        'reading': x[10],  # reading
        'pronunciation': x[11],  # pronunciation
        'word_cost': w_cost,  # word cost
        'c_cost': c_cost,  # connection cost with the previous morpheme
        'cost': w_cost + c_cost,  # cost of this morpheme alone
        'ime_romaji': to_romaji(x[10]).lower(),  # romaji
    }


def parse_sentence(sentence, nbest_num=3):
    """Morphologically analyse *sentence* and return its N-best parses.

    The sentence is NFKC-normalised and stripped of \\v and \\r (the field
    and record separators used by M_PARSE) before parsing.

    :param sentence: text to analyse.
    :param nbest_num: number of N-best candidates requested from MeCab.
    :return: list with one dict per candidate, each holding ``all``
        (sentence-level summary: ``normalized``, ``length``, ``cost``,
        ``reading``, ``ime_romaji``, ``wakati``, ``wakati_reading``) and
        ``words`` (list of per-morpheme dicts).
    """
    sentence = unicodedata.normalize('NFKC', sentence)
    sentence = sentence.replace('\v', '')
    sentence = sentence.replace('\r', '')

    parsed_text = M_PARSE.parseNBest(nbest_num, sentence)
    # Every candidate ends with '\r\nEOS'; whatever follows the final
    # terminator is discarded.
    nbests = parsed_text.strip().split('\r\nEOS')[:-1]

    ans_list = []
    for nbest in nbests:
        words = [_parse_line(line) for line in nbest.strip().split('\r\n')]
        readings = [word['reading'] for word in words]
        joined_reading = ''.join(readings)
        roma = to_romaji(joined_reading).lower()

        ans_list.append({
            'all': {
                'normalized': sentence,
                'length': len(sentence),
                'cost': sum(word['cost'] for word in words),
                'reading': joined_reading,
                'ime_romaji': remove_mark(roma),
                'wakati': ' '.join(word['surface'] for word in words),
                'wakati_reading': ' '.join(readings),
            },
            'words': words
        })
    return ans_list