├── .github ├── CONTRIBUTING.md ├── FUNDING.yml └── workflows │ └── python-package.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmark ├── benchmark.ipynb └── normalize_neologd.py ├── cythonize.sh ├── neologdn.cpp ├── neologdn.pyx ├── pyproject.toml ├── setup.py └── test_neologdn.py /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions are welcome! 4 | 5 | Issues and pull requests in Japanese are also welcome! 6 | 7 | ## Types of Contributions 8 | 9 | ### Report Bugs 10 | 11 | Report bugs at https://github.com/ikegami-yukino/neologdn/issues. 12 | 13 | If you are reporting a bug, please include: 14 | 15 | - Your operating system name and version. 16 | - Your Python version and whether it is 32-bit or 64-bit 17 | - Using Anaconda/Miniconda: yes or no 18 | - Error log 19 | - Any details about your local setup that might be helpful in troubleshooting. 20 | - Detailed steps to reproduce the bug. 21 | 22 | ### Fix Bugs 23 | 24 | Look through the GitHub issues for bugs. Anything tagged with "bug" and "help wanted" is open to whoever wants to implement it. 25 | 26 | ### Implement Features 27 | 28 | Look through the GitHub issues for features. Anything tagged with "enhancement" and "help wanted" is open to whoever wants to implement it. 29 | 30 | ### Typo 31 | 32 | If you find a typo, I would appreciate it if you could submit a pull request. 33 | 34 | ### Revise Document 35 | 36 | If you find a mistake or omission in `README.md` or docstrings, I would appreciate it if you could submit a pull request. 37 | 38 | ### Submit Feedback 39 | 40 | The best way to send feedback is to file an issue at https://github.com/ikegami-yukino/neologdn/issues. 41 | 42 | If you are proposing a new feature: 43 | 44 | - Explain in detail how it would work. 45 | - Explain your motivation/hypothesis for the new feature. 46 | - Keep the scope as narrow as possible, to make it easier to implement. 
47 | 48 | ## Get Started 49 | 50 | Here's how to set up neologdn for local development. 51 | 52 | 1. Fork the neologdn repo on GitHub. 53 | 2. Clone your fork locally: 54 | 55 | ```sh 56 | git clone git@github.com:<your-username>/neologdn.git 57 | ``` 58 | 59 | 3. Create a branch for local development: 60 | 61 | ```sh 62 | git checkout -b name-of-your-bugfix-or-feature 63 | ``` 64 | Now you can make your changes locally. 65 | 66 | 4. When you're done making changes, check that your changes pass the tests: 67 | 68 | ```sh 69 | python test_neologdn.py 70 | ``` 71 | 72 | 5. Commit your changes and push your branch to GitHub: 73 | 74 | ```sh 75 | git add . 76 | git commit -m "Your detailed description of your changes." 77 | git push origin name-of-your-bugfix-or-feature 78 | ``` 79 | 80 | 6. Submit a pull request through the GitHub website. 81 | 82 | ## Pull Request Guidelines 83 | 84 | Before you submit a pull request, check that it meets these guidelines: 85 | 86 | - The pull request should include tests. 87 | - If the pull request adds functionality, the docs should be updated. Put your new functionality into a function with a docstring, and add an example of the feature to `README.md`. 88 | - The title of the pull request should be written in English. 89 | - Messages and comments about the pull request should be written in English or Japanese (日本語). 
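The setup and test steps above can be combined into one local check before opening a pull request. A minimal sketch, assuming Cython and a C++11 compiler are available; the editable `pip install -e .` step is an assumption for convenience, not part of the documented workflow:

```sh
# Hypothetical end-to-end local check (not an official project script):
cython --cplus -3 neologdn.pyx   # regenerate neologdn.cpp, as cythonize.sh does
pip install -e .                 # rebuild and install the extension locally (assumed step)
python test_neologdn.py          # run the test suite
```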
90 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: ikegami-yukino 4 | custom: ['https://www.amazon.co.jp/hz/wishlist/ls/15HRUQ0OYGJYQ?type=wishlist&filter=unpurchased&sort=priority', 'https://www.soundhouse.co.jp/customers/wishlist/index/?list_id=85773'] 5 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest 31 | python -m pip install . 32 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[co] 2 | .ipynb_checkpoints 3 | .DS_Store 4 | __pycache__ 5 | build/* 6 | dist/* 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2015 Yukino Ikegami 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include neologdn.cpp 3 | include README.md 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # neologdn 2 | 3 | [![PyPI Downloads](https://static.pepy.tech/badge/neologdn)](https://pepy.tech/projects/neologdn)![PyPI - Version](https://img.shields.io/pypi/v/neologdn)![PyPI - Python Version](https://img.shields.io/pypi/pyversions/neologdn)![PyPI - License](https://img.shields.io/pypi/l/neologdn) 4 | 5 | neologdn is a Japanese text normalizer for [mecab-neologd](https://github.com/neologd/mecab-ipadic-neologd). 6 | 7 | The normalization is based on neologd's rules: 8 | https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja 9 | 10 | Some optional features have also been added. 11 | 12 | Contributions are welcome! 13 | 14 | NOTE: Installing this module requires a C++11 compiler. 
15 | 16 | ## Installation 17 | 18 | ```sh 19 | pip install neologdn 20 | ``` 21 | 22 | If setuptools is not installed, install it first: 23 | 24 | ```sh 25 | pip install setuptools 26 | ``` 27 | 28 | If you encounter the following error: 29 | 30 | ```sh 31 | ERROR: Could not find a version that satisfies the requirement setuptools (from versions: none) 32 | ``` 33 | 34 | then the following commands may solve it: 35 | 36 | ```sh 37 | pip install wheel 38 | pip install --no-build-isolation neologdn 39 | ``` 40 | 41 | ## Usage 42 | 43 | ```python 44 | import neologdn 45 | neologdn.normalize("ハンカクカナ") 46 | # => 'ハンカクカナ' 47 | neologdn.normalize("全角記号!?@#") 48 | # => '全角記号!?@#' 49 | neologdn.normalize("全角記号例外「・」") 50 | # => '全角記号例外「・」' 51 | neologdn.normalize("長音短縮ウェーーーーイ") 52 | # => '長音短縮ウェーイ' 53 | neologdn.normalize("チルダ削除ウェ~∼∾〜〰~イ") 54 | # => 'チルダ削除ウェイ' 55 | neologdn.normalize("いろんなハイフン˗֊‐‑‒–⁃⁻₋−") 56 | # => 'いろんなハイフン-' 57 | neologdn.normalize("   PRML  副 読 本   ") 58 | # => 'PRML副読本' 59 | neologdn.normalize(" Natural Language Processing ") 60 | # => 'Natural Language Processing' 61 | neologdn.normalize("かわいいいいいいいいい", repeat=6) 62 | # => 'かわいいいいいい' 63 | neologdn.normalize("無駄無駄無駄無駄ァ", repeat=1) 64 | # => '無駄ァ' 65 | neologdn.normalize("1995〜2001年", tilde="normalize") 66 | # => '1995~2001年' 67 | neologdn.normalize("1995~2001年", tilde="normalize_zenkaku") 68 | # => '1995〜2001年' 69 | neologdn.normalize("1995〜2001年", tilde="ignore") # Don't convert tilde 70 | # => '1995〜2001年' 71 | neologdn.normalize("1995〜2001年", tilde="remove") 72 | # => '19952001年' 73 | neologdn.normalize("1995〜2001年") # Default parameter 74 | # => '19952001年' 75 | ``` 76 | 77 | 78 | ## Benchmark 79 | 80 | ```python 81 | 82 | # Sample code from 83 | # https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast 84 | import normalize_neologd 85 | 86 | %timeit normalize(normalize_neologd.normalize_neologd) 87 | # => 9.55 s ± 29.4 ms per loop 
(mean ± std. dev. of 7 runs, 1 loop each) 88 | 89 | import neologdn 90 | %timeit normalize(neologdn.normalize) 91 | # => 6.66 s ± 35.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) 92 | ``` 93 | 94 | neologdn is about 1.43x faster than the sample code. 95 | 96 | Details are described in the notebook below: 97 | https://github.com/ikegami-yukino/neologdn/blob/master/benchmark/benchmark.ipynb 98 | 99 | ## License 100 | 101 | Apache Software License. 102 | 103 | ## CHANGES 104 | 105 | ### 0.5.4 (2025-03-15) 106 | 107 | - Support Python 3.13 108 | - Fix tilde loss after Latin characters and whitespace (Many thanks @a-lucky) 109 | 110 | ### 0.5.3 (2024-05-03) 111 | 112 | - Support Python 3.12 113 | 114 | ### 0.5.2 (2023-08-03) 115 | 116 | - Support Python 3.10 and 3.11 (Many thanks @polm) 117 | 118 | ### 0.5.1 (2021-05-02) 119 | 120 | - Improve performance of the shorten_repeat function (Many thanks @yskn67) 121 | - Add tilde option to normalize function 122 | 123 | ### 0.4 (2018-12-06) 124 | 125 | - Add shorten_repeat function, which shortens contiguous repeated substrings. For example: neologdn.normalize("無駄無駄無駄無駄ァ", repeat=1) -> 無駄ァ 126 | 127 | ### 0.3.2 (2018-05-17) 128 | 129 | - Add an option to suppress removal of spaces between Japanese characters 130 | 131 | ### 0.2.2 (2018-03-10) 132 | 133 | - Fix bug (daku-ten & handaku-ten) 134 | - Support Mac OS X 10.13 (Many thanks @r9y9) 135 | 136 | ### 0.2.1 (2017-01-23) 137 | 138 | - Fix bug (Check if the character preceding a daku-ten character is in maps) (Many thanks @unnonouno) 139 | 140 | ### 0.2 (2016-04-12) 141 | 142 | - Add lengthened expression (repeating character) threshold 143 | 144 | ### 0.1.2 (2016-03-29) 145 | 146 | - Fix installation bug 147 | 148 | ### 0.1.1.1 (2016-03-19) 149 | 150 | - Support Windows 151 | - Explicitly specify -std=c++11 in build (Many thanks @id774) 152 | 153 | ### 0.1.1 (2015-10-10) 154 | 155 | Initial release. 156 | 157 | ## Contribution 158 | 159 | Contributions are welcome! 
See: https://github.com/ikegami-yukino/neologdn/blob/master/.github/CONTRIBUTING.md 160 | 161 | ## Cited by 162 | 163 | ### Book 164 | 165 | - 山本 和英. テキスト処理の要素技術. 近代科学者. P.41. 2021. 166 | 167 | ### Blog 168 | 169 | - 【ライブラリ紹介】テキスト正規化ライブラリ neologdn: https://diatonic.codes/blog/neologdn/ 170 | - 日本語テキストの前処理:neologdn、大文字小文字、Unicode正規化 - tuttieee’s blog: https://tuttieee.hatenablog.com/entry/ja-nlp-preprocess 171 | - ▲本日の関数==neologdn.normalize()== - TPTブログ: https://ds-blog.tbtech.co.jp/entry/2020/05/11/%E2%96%B2%E6%9C%AC%E6%97%A5%E3%81%AE%E9%96%A2%E6%95%B0%3D%3Dneologdn_normalize%28%29%3D%3D 172 | - NLPについて学ぶ: https://zenn.dev/panyoriokome/scraps/d67f68ab50c0c1 173 | - テキスト正規化用PythonライブラリをMATLABからコール #Python - Qiita: https://qiita.com/aoimidori/items/ab5a4383b5a7bb307bad 174 | - 自然言語処理の前処理手順をPythonコード付きでご紹介 | AI活用・AI導入事例の紹介 | AI活用・AI導入事例の紹介: https://www.matrixflow.net/case-study/75/ 175 | - pythonによる日本語前処理備忘録 | DATUM STUDIO株式会社: https://datumstudio.jp/blog/python%E3%81%AB%E3%82%88%E3%82%8B%E6%97%A5%E6%9C%AC%E8%AA%9E%E5%89%8D%E5%87%A6%E7%90%86%E5%82%99%E5%BF%98%E9%8C%B2/ 176 | - 前処理、前処理、そして、前処理 (自然言語処理:日本語編)|narudesu: https://note.com/narudesu/n/na35de30a583a 177 | - ショートカットキーでneologd.normalize: https://scrapbox.io/nishio/%E3%82%B7%E3%83%A7%E3%83%BC%E3%83%88%E3%82%AB%E3%83%83%E3%83%88%E3%82%AD%E3%83%BC%E3%81%A7neologd.normalize 178 | - Pythonで自然言語処理を行うための環境構築 #Python - Qiita: https://qiita.com/lawyer_alpaca/items/86b0deda984170203467 179 | - Python normalize Examples: https://python.hotexamples.com/examples/neologdn/-/normalize/python-normalize-function-examples.html 180 | - 株式会社ししまろ (ch-4) 潜在的ディリクレ配分(LDA)によるchABSAデータセットの分析: https://shishimaro.co.jp/blog/ai/538 181 | - 形態素解析前の日本語文書の前処理 (Python) - け日記: https://ohke.hateblo.jp/entry/2019/02/09/141500 182 | - 人工知能に言語を理解させる!?自然言語処理に重要なデータの前処理をPythonで徹底解説 | AI研究所: https://ai-kenkyujo.com/programming/make-ai-understand-the-language/ 183 | - 最新wikipediaを反映したMeCabユーザー辞書を作る - NEologd拡張 | ぷらこめ: https://purakome.net/mecab/addwiki/ 
184 | - 【自然言語処理入門】文に対してストップワードと正規化から処理を施す | マイナビエンジニアブログ: https://engineerblog.mynavi.jp/technology/nlp_stopword/ 185 | - 表記統一 [自然言語処理の餅屋]: https://www.jnlp.org/nlp/%E6%A0%A1%E6%AD%A3/%E8%A1%A8%E8%A8%98%E7%B5%B1%E4%B8%80 186 | - Pytorchを使ってテキスト生成モデルのT5を構築 〜Transformersでの転移学習による手軽な実践〜 - 見習いデータサイエンティストの隠れ家: https://www.dskomei.com/entry/2021/09/28/110016 187 | - 象と散歩: Goolge Colabでお手軽テキストマイニング(日本語前処理): https://walking-elephant.blogspot.com/2023/07/text-mining-normalized.html 188 | - 【Pythonで自然言語処理(NLP)を実装してみよう!】学ぶべき知識についても徹底解説! - ベトナムオフショア開発の最前線 by Mattock inc.: https://mattock.jp/blog/artificial-intelligence/nlp/lets-implement-nlp-in-python/ 189 | - tools [Digital Humanities Japan: Resource Wiki]: https://dhjapan.org/wiki/doku.php?id=tools 190 | - Pythonで現代の季語を調べてみた | Aidemy | 10秒で始めるAIプログラミング学習サービスAidemy[アイデミー]: https://aidemy.net/magazine/703/ 191 | -------------------------------------------------------------------------------- /benchmark/benchmark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "jupyter": { 9 | "outputs_hidden": false 10 | } 11 | }, 12 | "outputs": [ 13 | { 14 | "name": "stdout", 15 | "output_type": "stream", 16 | "text": [ 17 | "3.11.4 (main, Jul 25 2023, 17:36:13) [Clang 14.0.3 (clang-1403.0.22.14.1)]\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "import sys\n", 23 | "print(sys.version)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": false, 31 | "jupyter": { 32 | "outputs_hidden": false 33 | } 34 | }, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "Apple M2\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "%%bash\n", 46 | "sysctl -n machdep.cpu.brand_string" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "collapsed": 
false, 54 | "jupyter": { 55 | "outputs_hidden": false 56 | } 57 | }, 58 | "outputs": [ 59 | { 60 | "name": "stderr", 61 | "output_type": "stream", 62 | "text": [ 63 | " % Total % Received % Xferd Average Speed Time Time Time Current\n", 64 | " Dload Upload Total Spent Left Speed\n", 65 | "100 8647k 100 8647k 0 0 4922k 0 0:00:01 0:00:01 --:--:-- 4927k\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "%%bash\n", 71 | "curl -O https://www.rondhuit.com/download/ldcc-20140209.tar.gz\n", 72 | "tar -x -f ldcc-20140209.tar.gz\n", 73 | "find text/ -name \"*.txt\" -print0 | xargs -0 -I % cat % >> /tmp/ld.txt" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "def normalize(func):\n", 83 | " with open('/tmp/ld.txt') as fd:\n", 84 | " for line in fd:\n", 85 | " func(line)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": { 92 | "collapsed": false, 93 | "jupyter": { 94 | "outputs_hidden": false 95 | } 96 | }, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "183 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 103 | ] 104 | } 105 | ], 106 | "source": [ 107 | "def compute_overhead(x):\n", 108 | " pass\n", 109 | "\n", 110 | "%timeit normalize(compute_overhead)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 6, 116 | "metadata": { 117 | "collapsed": false, 118 | "jupyter": { 119 | "outputs_hidden": false 120 | } 121 | }, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "9.55 s ± 29.4 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "# Sample code from\n", 133 | "# https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja#python-written-by-hideaki-t--overlast\n", 134 | "import normalize_neologd\n", 135 | "\n", 136 | "%timeit normalize(normalize_neologd.normalize_neologd)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "metadata": { 143 | "collapsed": false, 144 | "jupyter": { 145 | "outputs_hidden": false 146 | } 147 | }, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "6.66 s ± 35.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "import neologdn\n", 159 | "%timeit normalize(neologdn.normalize)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 8, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "%%bash\n", 169 | "rm -rf ldcc-20140209.tar.gz text ld.txt" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.11.4" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /benchmark/normalize_neologd.py: -------------------------------------------------------------------------------- 1 | # encoding: utf8 2 | """ 3 | from: https://github.com/neologd/mecab-ipadic-neologd/wiki/Regexp.ja 4 
| """ 5 | from __future__ import unicode_literals 6 | import re 7 | import unicodedata 8 | 9 | def unicode_normalize(cls, s): 10 | pt = re.compile('([{}]+)'.format(cls)) 11 | 12 | def norm(c): 13 | return unicodedata.normalize('NFKC', c) if pt.match(c) else c 14 | 15 | s = ''.join(norm(x) for x in re.split(pt, s)) 16 | return s 17 | 18 | def remove_extra_spaces(s): 19 | s = re.sub('[  ]+', ' ', s) 20 | blocks = ''.join(('\u4E00-\u9FFF', # CJK UNIFIED IDEOGRAPHS 21 | '\u3040-\u309F', # HIRAGANA 22 | '\u30A0-\u30FF', # KATAKANA 23 | '\u3000-\u303F', # CJK SYMBOLS AND PUNCTUATION 24 | '\uFF00-\uFFEF' # HALFWIDTH AND FULLWIDTH FORMS 25 | )) 26 | basic_latin = '\u0000-\u007F' 27 | 28 | def remove_space_between(cls1, cls2, s): 29 | p = re.compile('([{}]) ([{}])'.format(cls1, cls2)) 30 | while p.search(s): 31 | s = p.sub(r'\1\2', s) 32 | return s 33 | 34 | s = remove_space_between(blocks, blocks, s) 35 | s = remove_space_between(blocks, basic_latin, s) 36 | s = remove_space_between(basic_latin, blocks, s) 37 | return s 38 | 39 | def normalize_neologd(s): 40 | s = s.strip() 41 | s = unicode_normalize('0-9A-Za-z。-゚', s) 42 | 43 | def maketrans(f, t): 44 | return {ord(x): ord(y) for x, y in zip(f, t)} 45 | 46 | s = s.translate( 47 | maketrans('!"#$%&\'()*+,-./:;<=>?@[\]^_`{|}~。、・「」', 48 | '!”#$%&’()*+,−./:;<=>?@[¥]^_`{|}〜。、・「」')) 49 | s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s) # normalize hyphens 50 | s = re.sub('[﹣-ー—―─━ー]+', 'ー', s) # normalize choonpus 51 | s = re.sub('[~∼∾〜〰~]', '', s) # remove tildes 52 | s = remove_extra_spaces(s) 53 | s = unicode_normalize('!”#$%&’()*+,−./:;<>?@[¥]^_`{|}〜', s) # keep =,・,「,」 54 | return s 55 | -------------------------------------------------------------------------------- /cythonize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if !
`type cython &> /dev/null`; then 4 | pip install cython 5 | fi 6 | cython --cplus -3 neologdn.pyx 7 | -------------------------------------------------------------------------------- /neologdn.pyx: -------------------------------------------------------------------------------- 1 | # distutils: language=c++ 2 | # cython: language_level=3 3 | # -*- coding: utf-8 -*- 4 | 5 | import itertools 6 | from sys import version_info 7 | from libc.stdlib cimport malloc, free 8 | from libcpp.unordered_map cimport unordered_map 9 | from libcpp.unordered_set cimport unordered_set 10 | 11 | VERSION = (0, 5, 4) 12 | __version__ = '0.5.4' 13 | 14 | cdef extern from "Python.h": 15 | object PyUnicode_DecodeUTF32(const char *s, Py_ssize_t size, const char *errors, int *byteorder) 16 | 17 | 18 | cdef py_ucs4_to_unicode(Py_UCS4 *ucs4_ptr, Py_ssize_t length): 19 | return PyUnicode_DecodeUTF32(ucs4_ptr, sizeof(Py_UCS4)*length, NULL, NULL) 20 | 21 | 22 | ASCII = ( 23 | ('a', 'a'), ('b', 'b'), ('c', 'c'), ('d', 'd'), ('e', 'e'), 24 | ('f', 'f'), ('g', 'g'), ('h', 'h'), ('i', 'i'), ('j', 'j'), 25 | ('k', 'k'), ('l', 'l'), ('m', 'm'), ('n', 'n'), ('o', 'o'), 26 | ('p', 'p'), ('q', 'q'), ('r', 'r'), ('s', 's'), ('t', 't'), 27 | ('u', 'u'), ('v', 'v'), ('w', 'w'), ('x', 'x'), ('y', 'y'), 28 | ('z', 'z'), 29 | ('A', 'A'), ('B', 'B'), ('C', 'C'), ('D', 'D'), ('E', 'E'), 30 | ('F', 'F'), ('G', 'G'), ('H', 'H'), ('I', 'I'), ('J', 'J'), 31 | ('K', 'K'), ('L', 'L'), ('M', 'M'), ('N', 'N'), ('O', 'O'), 32 | ('P', 'P'), ('Q', 'Q'), ('R', 'R'), ('S', 'S'), ('T', 'T'), 33 | ('U', 'U'), ('V', 'V'), ('W', 'W'), ('X', 'X'), ('Y', 'Y'), 34 | ('Z', 'Z'), 35 | ('!', '!'), ('”', '"'), ('#', '#'), ('$', '$'), ('%', '%'), 36 | ('&', '&'), ('’', '\''), ('(', '('), (')', ')'), ('*', '*'), 37 | ('+', '+'), (',', ','), ('−', '-'), ('.', '.'), ('/', '/'), 38 | (':', ':'), (';', ';'), ('<', '<'), ('=', '='), ('>', '>'), 39 | ('?', '?'), ('@', '@'), ('[', '['), ('¥', '\\'), (']', ']'), 40 | ('^', '^'), ('_', '_'), ('‘', 
'`'), ('｛', '{'), ('｜', '|'),
41 |     ('｝', '}')
42 | )
43 | KANA = (
44 |     ('ｱ', 'ア'), ('ｲ', 'イ'), ('ｳ', 'ウ'), ('ｴ', 'エ'), ('ｵ', 'オ'),
45 |     ('ｶ', 'カ'), ('ｷ', 'キ'), ('ｸ', 'ク'), ('ｹ', 'ケ'), ('ｺ', 'コ'),
46 |     ('ｻ', 'サ'), ('ｼ', 'シ'), ('ｽ', 'ス'), ('ｾ', 'セ'), ('ｿ', 'ソ'),
47 |     ('ﾀ', 'タ'), ('ﾁ', 'チ'), ('ﾂ', 'ツ'), ('ﾃ', 'テ'), ('ﾄ', 'ト'),
48 |     ('ﾅ', 'ナ'), ('ﾆ', 'ニ'), ('ﾇ', 'ヌ'), ('ﾈ', 'ネ'), ('ﾉ', 'ノ'),
49 |     ('ﾊ', 'ハ'), ('ﾋ', 'ヒ'), ('ﾌ', 'フ'), ('ﾍ', 'ヘ'), ('ﾎ', 'ホ'),
50 |     ('ﾏ', 'マ'), ('ﾐ', 'ミ'), ('ﾑ', 'ム'), ('ﾒ', 'メ'), ('ﾓ', 'モ'),
51 |     ('ﾔ', 'ヤ'), ('ﾕ', 'ユ'), ('ﾖ', 'ヨ'),
52 |     ('ﾗ', 'ラ'), ('ﾘ', 'リ'), ('ﾙ', 'ル'), ('ﾚ', 'レ'), ('ﾛ', 'ロ'),
53 |     ('ﾜ', 'ワ'), ('ｦ', 'ヲ'), ('ﾝ', 'ン'),
54 |     ('ｧ', 'ァ'), ('ｨ', 'ィ'), ('ｩ', 'ゥ'), ('ｪ', 'ェ'), ('ｫ', 'ォ'),
55 |     ('ｯ', 'ッ'), ('ｬ', 'ャ'), ('ｭ', 'ュ'), ('ｮ', 'ョ'),
56 |     ('｡', '。'), ('､', '、'), ('･', '・'), ('ﾞ', '゙'), ('ﾟ', '゚'),
57 |     ('｢', '「'), ('｣', '」'), ('ｰ', 'ー')
58 | )
59 | DIGIT = (
60 |     ('０', '0'), ('１', '1'), ('２', '2'), ('３', '3'), ('４', '4'),
61 |     ('５', '5'), ('６', '6'), ('７', '7'), ('８', '8'), ('９', '9')
62 | )
63 | KANA_TEN = (
64 |     ('カ', 'ガ'), ('キ', 'ギ'), ('ク', 'グ'), ('ケ', 'ゲ'), ('コ', 'ゴ'),
65 |     ('サ', 'ザ'), ('シ', 'ジ'), ('ス', 'ズ'), ('セ', 'ゼ'), ('ソ', 'ゾ'),
66 |     ('タ', 'ダ'), ('チ', 'ヂ'), ('ツ', 'ヅ'), ('テ', 'デ'), ('ト', 'ド'),
67 |     ('ハ', 'バ'), ('ヒ', 'ビ'), ('フ', 'ブ'), ('ヘ', 'ベ'), ('ホ', 'ボ'),
68 |     ('ウ', 'ヴ'), ('う', 'ゔ')
69 | )
70 | KANA_MARU = (
71 |     ('ハ', 'パ'), ('ヒ', 'ピ'), ('フ', 'プ'), ('ヘ', 'ペ'), ('ホ', 'ポ'),
72 |     ('は', 'ぱ'), ('ひ', 'ぴ'), ('ふ', 'ぷ'), ('へ', 'ぺ'), ('ほ', 'ぽ')
73 | )
74 | 
75 | HIPHENS = ('˗', '֊', '‐', '‑', '‒', '–', '⁃', '⁻', '₋', '−')
76 | CHOONPUS = ('﹣', '－', 'ー', '—', '―', '─', '━', 'ｰ')
77 | TILDES = ('~', '∼', '∾', '〜', '〰', '～')
78 | 
79 | SPACE = (' ', '　')
80 | 
81 | cdef unordered_map[Py_UCS4, Py_UCS4] conversion_map, kana_ten_map, kana_maru_map
82 | cdef unordered_set[Py_UCS4] blocks, basic_latin
83 | 
84 | for (before, after) in (ASCII + DIGIT + KANA):
85 |     conversion_map[before] = after
86 | 
87 | for (before, after) in KANA_TEN:
88 |     kana_ten_map[before] = after
89 | 
90 | for (before, after) in KANA_MARU:
91 |     kana_maru_map[before] = after
92 | 
93 | char_codes = itertools.chain(
94 |     range(19968, 40960),  # CJK UNIFIED IDEOGRAPHS
95 |     range(12352, 12448),  # HIRAGANA
96 |     range(12448, 12544),  # KATAKANA
97 |     range(12289, 12352),  # CJK SYMBOLS AND PUNCTUATION
98 |     range(65280, 65520)   # HALFWIDTH AND FULLWIDTH FORMS
99 | )
100 | for c in map(chr, char_codes):
101 |     blocks.insert(c)
102 | 
103 | 
104 | for c in map(chr, range(128)):
105 |     basic_latin.insert(c)
106 | 
107 | del ASCII, KANA, DIGIT, KANA_TEN, KANA_MARU, char_codes, version_info
108 | 
109 | 
110 | cpdef unicode shorten_repeat(unicode text, int repeat_threshold, int max_repeat_substr_length=8):
111 |     cdef int text_length, i, repeat_length, right_start, right_end, num_repeat_substrs
112 |     cdef int upper_repeat_substr_length
113 |     cdef unicode substr, right_substr
114 | 
115 |     i = 0
116 |     while i < len(text):
117 |         text_length = len(text)
118 | 
119 |         upper_repeat_substr_length = (text_length - i) // 2
120 |         if max_repeat_substr_length and max_repeat_substr_length < upper_repeat_substr_length:
121 |             upper_repeat_substr_length = max_repeat_substr_length + 1
122 | 
123 |         for repeat_length in range(1, upper_repeat_substr_length):
124 |             substr = text[i:i+repeat_length]
125 |             right_start = i + repeat_length
126 |             right_end = right_start + repeat_length
127 |             right_substr = text[right_start:right_end]
128 |             num_repeat_substrs = 1
129 |             while substr == right_substr and right_end <= text_length:
130 |                 num_repeat_substrs += 1
131 |                 right_start += repeat_length
132 |                 right_end += repeat_length
133 |                 right_substr = text[right_start:right_end]
134 |             if num_repeat_substrs > repeat_threshold:
135 |                 text = text[:i+repeat_length*repeat_threshold] + text[i+repeat_length*num_repeat_substrs:]
136 |         i += 1
137 |     return text
138 | 
139 | 
140 | cpdef unicode normalize(unicode text, int repeat=0, bint remove_space=True,
141 |                         int max_repeat_substr_length=8,
unicode tilde='remove'):
142 |     cdef Py_UCS4 *buf = <Py_UCS4 *>malloc(sizeof(Py_UCS4) * (len(text) + 1))
143 | 
144 |     cdef Py_UCS4 c, prev = '\0'
145 |     cdef int pos = 0
146 |     cdef bint latin_space = False
147 | 
148 |     for c in text:
149 |         if c in SPACE:
150 |             c = ' '
151 |             if (prev == ' ' or blocks.count(prev)) and remove_space:
152 |                 continue
153 |             elif prev != '*' and pos > 0 and basic_latin.count(prev):
154 |                 latin_space = True
155 |                 buf[pos] = c
156 |             elif remove_space:
157 |                 pos -= 1
158 |             else:
159 |                 buf[pos] = c
160 |         else:
161 |             if c in HIPHENS:
162 |                 if prev == '-':
163 |                     continue
164 |                 else:
165 |                     buf[pos] = c = '-'
166 |                     latin_space = False
167 |             elif c in CHOONPUS:
168 |                 if prev == 'ー':
169 |                     continue
170 |                 else:
171 |                     buf[pos] = c = 'ー'
172 |                     latin_space = False
173 |             elif c in TILDES:
174 |                 if tilde == 'ignore':
175 |                     buf[pos] = c
176 |                 elif tilde == 'normalize':
177 |                     buf[pos] = c = '~'
178 |                 elif tilde == 'normalize_zenkaku':
179 |                     buf[pos] = c = '〜'
180 |                 else:
181 |                     continue
182 |                 latin_space = False
183 |             else:
184 |                 if conversion_map.count(c):
185 |                     c = conversion_map[c]
186 |                 if c == '゙' and kana_ten_map.count(prev):
187 |                     pos -= 1
188 |                     c = kana_ten_map[prev]
189 |                 elif c == '゚' and kana_maru_map.count(prev):
190 |                     pos -= 1
191 |                     c = kana_maru_map[prev]
192 |                 if latin_space and blocks.count(c) and remove_space:
193 |                     pos -= 1
194 |                 latin_space = False
195 |             buf[pos] = c
196 |         prev = c
197 |         pos += 1
198 | 
199 |     if pos > 0 and buf[pos-1] == ' ':
200 |         pos -= 1
201 |     buf[pos] = '\0'
202 | 
203 |     cdef unicode ret = py_ucs4_to_unicode(buf, pos)
204 |     free(buf)
205 | 
206 |     if repeat:
207 |         return shorten_repeat(ret, repeat, max_repeat_substr_length)
208 |     return ret
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools"]
3 | build-backend = "setuptools.build_meta"
4 | 
5 | [project]
6 | name =
"neologdn" 7 | dynamic = ["version"] 8 | authors = [ 9 | {name = "Yukino Ikegami", email = "yknikgm@gmail.com"} 10 | ] 11 | maintainers = [ 12 | {name = "Yukino Ikegami", email = "yknikgm@gmail.com"} 13 | ] 14 | description = "Japanese text normalizer for mecab-neologd" 15 | readme = "README.md" 16 | license = {file = "LICENSE"} 17 | keywords = ["MeCab", "NEologd", "japanese", "textpreprocessing", "JapaneseText"] 18 | classifiers = [ 19 | "Development Status :: 4 - Beta", 20 | "Intended Audience :: Science/Research", 21 | "Intended Audience :: Developers", 22 | "Natural Language :: Japanese", 23 | "License :: OSI Approved :: Apache Software License", 24 | "Programming Language :: Cython", 25 | "Programming Language :: Python", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3.8", 28 | "Programming Language :: Python :: 3.9", 29 | "Programming Language :: Python :: 3.10", 30 | "Programming Language :: Python :: 3.11", 31 | "Programming Language :: Python :: 3.12", 32 | "Programming Language :: Python :: 3.13", 33 | "Topic :: Text Processing :: Linguistic", 34 | "Topic :: Text Processing" 35 | ] 36 | 37 | [project.urls] 38 | Homepage = "https://github.com/ikegami-yukino/neologdn" 39 | Repository = "https://github.com/ikegami-yukino/neologdn.git" 40 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from codecs import open 3 | import re 4 | from setuptools import setup, Extension 5 | import platform 6 | 7 | with open('neologdn.cpp', 'r', encoding='utf8') as f: 8 | version = re.compile(r".*__version__ = '(.*?)'", 9 | re.S).match(f.read()).group(1) 10 | 11 | extra_compile_args = ["-std=c++11"] 12 | if platform.system() == "Darwin": 13 | extra_compile_args.append("-mmacosx-version-min=10.7") 14 | extra_compile_args.append("-stdlib=libc++") 15 | 16 | setup(name='neologdn', 17 | 
version=version,
18 |       ext_modules=[
19 |           Extension('neologdn', ['neologdn.cpp'],
20 |                     language='c++',
21 |                     extra_compile_args=extra_compile_args)
22 |       ]
23 |       )
--------------------------------------------------------------------------------
/test_neologdn.py:
--------------------------------------------------------------------------------
1 | # encoding: utf8
2 | from __future__ import unicode_literals
3 | import unittest
4 | from neologdn import normalize, shorten_repeat
5 | 
6 | 
7 | class TestNeologdn(unittest.TestCase):
8 | 
9 |     def test_normalize(self):
10 |         self.assertEqual(normalize('０'), '0')
11 |         self.assertEqual(normalize('ﾊﾝｶｸ'), 'ハンカク')
12 |         self.assertEqual(normalize('o₋o'), 'o-o')
13 |         self.assertEqual(normalize('majika━'), 'majikaー')
14 |         self.assertEqual(normalize('わ〰い'), 'わい')
15 |         self.assertEqual(normalize('スーパーーーー'), 'スーパー')
16 |         self.assertEqual(normalize('！＃'), '!#')
17 |         self.assertEqual(normalize('ゼンカク　スペース'), 'ゼンカクスペース')
18 |         self.assertEqual(normalize('お お'), 'おお')
19 |         self.assertEqual(normalize(' おお'), 'おお')
20 |         self.assertEqual(normalize('おお '), 'おお')
21 |         self.assertEqual(normalize('検索 エンジン 自作 入門 を 買い ました！！！'),\
22 |                          '検索エンジン自作入門を買いました!!!')
23 |         self.assertEqual(normalize('アルゴリズム C'), 'アルゴリズムC')
24 |         self.assertEqual(normalize('　　　ＰＲＭＬ　　副　読　本　　　'), 'PRML副読本')
25 |         self.assertEqual(normalize('Ｃｏｄｉｎｇ　ｔｈｅ　Ｍａｔｒｉｘ'), 'Coding the Matrix')
26 |         self.assertEqual(normalize('南アルプスの　天然水　Ｓｐａｒｋｉｎｇ　Ｌｅｍｏｎ　レモン一絞り'),\
27 |                          '南アルプスの天然水Sparking Lemonレモン一絞り')
28 |         self.assertEqual(normalize('南アルプスの　天然水-　Ｓｐａｒｋｉｎｇ*　Ｌｅｍｏｎ+　レモン一絞り'),\
29 |                          '南アルプスの天然水- Sparking*Lemon+レモン一絞り')
30 |         self.assertEqual(normalize('ﾊﾟﾊﾟ'), 'パパ')
31 |         self.assertEqual(normalize('a˗֊‐‑‒–⁃⁻₋−'), 'a-')
32 |         self.assertEqual(normalize('あ﹣－ー—―─━ｰ'), 'あー')
33 |         self.assertEqual(normalize('チルダ~∼∾〜〰～'), 'チルダ')
34 |         self.assertEqual(normalize('う゛ぽ'), 'ゔぽ')
35 | 
36 |     def test_shorten_repeat(self):
37 |         self.assertEqual(shorten_repeat('うまああああああああああああい', 7, 0), 'うまあああああああい')
38 | 
self.assertEqual(shorten_repeat('かわいいいいいるい', 6, 0), 'かわいいいいいるい')
39 |         self.assertEqual(shorten_repeat('オラオラオラオラーッ', 2, 0), 'オラオラーッ')
40 |         self.assertEqual(shorten_repeat('無駄無駄無駄無駄ァ', 1, 0), '無駄ァ')
41 |         self.assertEqual(shorten_repeat('隣の客はよく柿食う客だ、隣の客はよく柿食う客だ、隣の客はよく柿食う客だ、言えた!', 1, 0),
42 |                          '隣の客はよく柿食う客だ、言えた!')
43 |         self.assertEqual(shorten_repeat('隣の客はよく柿食う客だ、隣の客はよく柿食う客だ、隣の客はよく柿食う客だ、言えた!', 1, 11),
44 |                          '隣の客はよく柿食う客だ、隣の客はよく柿食う客だ、隣の客はよく柿食う客だ、言えた!')
45 | 
46 |     def test_suppress_removal_of_spaces_between_Japanese(self):
47 |         self.assertEqual(normalize('巴 マミ', remove_space=False), '巴 マミ')
48 | 
49 |     def test_handling_tilde(self):
50 |         self.assertEqual(normalize('1467〜1487年', tilde='normalize'), '1467~1487年')
51 |         self.assertEqual(normalize('1467~1487年', tilde='normalize_zenkaku'), '1467〜1487年')
52 |         self.assertEqual(normalize('1467〜1487年', tilde='ignore'), '1467〜1487年')
53 |         self.assertEqual(normalize('1467〜1487年', tilde='remove'), '14671487年')
54 |         self.assertEqual(normalize('1467〜1487年'), '14671487年')
55 | 
56 |     def test_tilde_boundary_handling(self):
57 |         self.assertEqual(normalize('A ˗あ'), 'A -あ')
58 |         self.assertEqual(normalize('A ーあ'), 'A ーあ')
59 |         self.assertEqual(normalize('A ~あ', tilde='normalize'), 'A ~あ')
60 |         self.assertEqual(normalize('A ~あ', tilde='normalize_zenkaku'), 'A 〜あ')
61 |         self.assertEqual(normalize('A ~あ'), 'Aあ')
62 | 
63 | if __name__ == '__main__':
64 |     unittest.main()
65 | 
--------------------------------------------------------------------------------
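The repeat-shortening loop in `neologdn.pyx` above can be traced without compiling the Cython extension. The following is an illustrative pure-Python sketch of the same scan-and-collapse algorithm (a re-implementation for study, not part of the package; the spelling `repeat_threshold` is used here for readability):

```python
def shorten_repeat(text, repeat_threshold, max_repeat_substr_length=8):
    """Pure-Python sketch of neologdn's shorten_repeat.

    Collapses any substring repeated more than `repeat_threshold` times
    back to back into exactly `repeat_threshold` repetitions.
    `max_repeat_substr_length` bounds the length of the repeating unit
    searched for (0 means no bound).
    """
    i = 0
    while i < len(text):
        text_length = len(text)
        # Longest unit that could still fit twice starting at position i.
        upper = (text_length - i) // 2
        if max_repeat_substr_length and max_repeat_substr_length < upper:
            upper = max_repeat_substr_length + 1
        for repeat_length in range(1, upper):
            substr = text[i:i + repeat_length]
            right_start = i + repeat_length
            right_end = right_start + repeat_length
            num_repeats = 1
            # Count how many times the unit repeats back to back.
            while substr == text[right_start:right_end] and right_end <= text_length:
                num_repeats += 1
                right_start += repeat_length
                right_end += repeat_length
            if num_repeats > repeat_threshold:
                # Keep the first `repeat_threshold` copies, drop the rest.
                text = (text[:i + repeat_length * repeat_threshold]
                        + text[i + repeat_length * num_repeats:])
        i += 1
    return text


# e.g. shorten_repeat('わーーーーい', 1) -> 'わーい'
```

Like the Cython version, the sketch deliberately leaves `text_length` stale after a collapse inside the inner `for` loop; Python's forgiving slicing makes the subsequent comparisons fail harmlessly, so the results match the test cases above.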