├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── code
    └── dimsim
    │   ├── build
    │       └── lib
    │       │   └── dimsim
    │       │       └── dimsim.py
    │   └── dimsim
    │       └── dimsim.py
├── dimsim
    ├── __init__.py
    ├── core
    │   ├── __init__.py
    │   └── model.py
    ├── data
    │   ├── __init__.py
    │   ├── pinyin_to_simplified.pickle
    │   └── pinyin_to_traditional.pickle
    ├── tests
    │   ├── __init__.py
    │   └── test_dimsim.py
    └── utils
    │   ├── __init__.py
    │   ├── maps.py
    │   ├── pinyin.py
    │   └── utils.py
├── setup.cfg
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # local test files
107 | test_image_utils/
108 | .idea/
109 | 
110 | # other
111 | maxfw/.DS_Store
112 | .vscode/settings.json
113 | .DS_Store
114 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include dimsim/data/*.*


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # DimSim - A Chinese Soundex Library (Python version) 
  2 | 
  3 | DimSim is a library developed by the Scalable Knowledge Intelligence team at IBM Almaden Research Center as part of the [SystemT](https://researcher.watson.ibm.com/researcher/view_group.php?id=1264) project. 
  4 | 
  5 | The PyPi project page can be found [here](https://pypi.org/project/dimsim/). It was created in collaboration with IBM Center for Open-Source Data and AI Technologies ([CODAIT](http://codait.org)).
  6 | 
  7 | ## Overview
  8 | We provide a phonetic algorithm for indexing Chinese characters by sound. The technical details can be found in the following [paper](http://aclweb.org/anthology/K18-1043):
  9 | 
 10 | Min Li, Marina Danilevsky, Sara Noeman and Yunyao Li. *DIMSIM: An Accurate Chinese Phonetic Similarity Algorithm based on Learned High Dimensional Encoding*. CoNLL 2018.
 11 | 
 12 | In this library, we provide a pre-trained model that can perform the following functions, in compliance with the phonetic principles of Mandarin Chinese as guided by the Romanization defined in [ISO 7098:2015](https://www.iso.org/standard/61420.html):
 13 | - Given two Chinese phrases (of the same length), return the phonetic distance of the input phrases. Optionally you can feed in pinyin strings of Chinese phrases too.
 14 | - Given a Chinese phrase, return its top-k phonetically similar Chinese phrases.
 15 | 
 16 | 
 17 | 
 18 | ## How to install
 19 | 
 20 | **Dependencies**:
 21 | - [pypinyin](https://github.com/mozillazg/python-pinyin): used for translating Chinese characters into their corresponding pinyin.
 22 | 
 23 | There are two ways to install this library:
 24 | - Install from PyPi
 25 | 
 26 | ```shell
 27 | pip install dimsim
 28 | ```
 29 | - Download the source code by cloning this repo and compile it yourself.
 30 | 
 31 | ```shell
 32 | git clone git@github.com:System-T/DimSim.git
 33 | 
 34 | cd DimSim/
 35 | 
 36 | pip install -e .
 37 | ```
 38 | 
 39 | ## How to use
 40 | Once you have the package installed you can use it for the two functions as shown below.
 41 | 
 42 | - Computing phonetic distance of two Chinese phrases. The optional argument `pinyin` (False by default) can be used to provide a pinyin string list directly. See example usage below.
 43 | 
 44 | ```python
 45 | import dimsim
 46 | 
 47 | dist = dimsim.get_distance("大侠","大虾")
 48 | 0.0002380952380952381
 49 | 
 50 | dist = dimsim.get_distance("大侠","大人")
 51 | 25.001417183349876
 52 | 
 53 | dist = dimsim.get_distance(['da4','xia2'],['da4','xia1'], pinyin=True)
 54 | 0.0002380952380952381
 55 | 
 56 | dist = dimsim.get_distance(['da4','xia2'],['da4','ren2'], pinyin=True)
 57 | 25.001417183349876
 58 | 
 59 | ```
 60 | ***
 61 | - Return top-k phonetically similar phrases of a given Chinese phrase. Two parameters:
 62 | - **mode** controls the character type of the returned Chinese phrases, where 'simplified' represents simplified Chinese and 'traditional' represents traditional Chinese.
 63 | - **theta** controls the size of search space for the candidate phrases.
 64 | ```python
 65 | import dimsim
 66 | 
 67 | candidates = dimsim.get_candidates("大侠", mode="simplified", theta=1)
 68 | 
 69 | ['打下', '大虾', '大侠']
 70 | 
 71 | candidates = dimsim.get_candidates("粉丝", mode="traditional", theta=1)
 72 | ['門市', '分時', '焚屍', '粉飾', '粉絲']
 73 | ```
 74 | 
 75 | ***
 76 | Return Pinyin RCode 
 77 | ```python
 78 | rcode = getRCode("海底小纵队")
 79 | "hai'di'xiao'zong'dui"
 80 | 
 81 | rcode = getRCode("海底小中队")
 82 | "hai'di'xiao'zong'dui"
 83 | ```
 84 | 
 85 | ## Citation
 86 | 
 87 | Please cite the library by referencing the published paper:
 88 | ```
 89 | @InProceedings{K18-1043,
 90 |   author = 	{Li, Min and Danilevsky, Marina and Noeman, Sara and Li, Yunyao},
 91 |   title = 	{{DIMSIM:} An Accurate Chinese Phonetic Similarity Algorithm Based on Learned High Dimensional Encoding},
 92 |   booktitle = 	{Proceedings of the 22nd Conference on Computational Natural Language Learning},
 93 |   year = 	{2018},
 94 |   publisher = 	{Association for Computational Linguistics},
 95 |   pages = 	{444-453},
 96 |   location = 	{Brussels, Belgium},
 97 |   url = 	{http://aclweb.org/anthology/K18-1043}
 98 | }
 99 | ```
100 | 


--------------------------------------------------------------------------------
/code/dimsim/build/lib/dimsim/dimsim.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # In[656]:
  5 | 
  6 | 
  7 | from pypinyin import pinyin, lazy_pinyin, Style
  8 | import sys
  9 | import math
 10 | import itertools
 11 | import pickle
 12 | import os
 13 | 
 14 | '''
 15 |     @liushaohua
 16 |     本更改仅限个人使用，仅仅使用编码方式，对拼音进行一个统一编码
 17 | '''
 18 | 
 19 | 
 20 | # In[657]:
 21 | 
 22 | 
 23 | work_dir = os.path.abspath(__file__)
 24 | indexOfLastSlash = work_dir.rfind("/")
 25 | work_dir = work_dir[0:indexOfLastSlash]+"/"
 26 | 
 27 | 
 28 | # In[658]:
 29 | 
 30 | 
 31 | sfile = open(work_dir+'pinyin_to_simplified.pickle', 'rb')
 32 | 
 33 | # dump information to that file
 34 | pinyin_to_simplified = pickle.load(sfile)
 35 | sfile.close()
 36 | 
 37 | tfile = open(work_dir+'pinyin_to_traditional.pickle', 'rb')
 38 | pinyin_to_traditional = pickle.load(tfile)
 39 | tfile.close()
 40 | 
 41 | 
 42 | # In[659]:
 43 | 
 44 | 
 45 | def to_pinyin(utterance):
 46 |     length = len(utterance)
 47 |     translated = []
 48 |     pinyin_encodings = pinyin(utterance, style=Style.TONE2)
 49 |     for i in range(length):
 50 |         currPinyin = pinyin_encodings[i][0]
 51 | #         print("{} translates to {}".format(currPinyin, putToneToEnd(currPinyin)))
 52 |         translated.append(putToneToEnd(currPinyin))
 53 |     return translated
 54 | 
 55 | def putToneToEnd(input_pinyin):
 56 |     if len(input_pinyin) is 1:
 57 |         return input_pinyin + '1'
 58 |     tone_index = 0
 59 |     tone = '1'
 60 |     for index, character in enumerate(input_pinyin):
 61 |         if character in ("1","2","3","4"):
 62 |             tone_index = index
 63 |             tone = input_pinyin[index]
 64 |             break;
 65 |     if tone_index is 0:
 66 |         return input_pinyin + "5"
 67 |     return input_pinyin[0:index] + input_pinyin[index+1:] + tone
 68 | 
 69 | def get_distance(utterance1, utterance2):
 70 |     if(len(utterance1) is not len(utterance2)):
 71 |         print("the two inputs do not have the same length")
 72 |         return sys.float_info.max
 73 |     else:
 74 |         u1 = to_pinyin(utterance1)
 75 |         u2 = to_pinyin(utterance2)
 76 |         
 77 |         la = []
 78 |         lb = []
 79 |         for py in u1:
 80 |             la.append(Pinyin(py))
 81 |         for py in u2:
 82 |             lb.append(Pinyin(py))
 83 |     
 84 | 
 85 |         res = 0.0
 86 |         numDiff = 0        
 87 |         tot = len(utterance1)*2.1
 88 |         for i in range (len(utterance1)):
 89 |             apy = la[i]
 90 |             bpy = lb[i]
 91 | 
 92 |             if (apy is None) or (bpy is None):
 93 |                 print("!Error {},{}".format(la, lb))
 94 |             
 95 |             res += getEditDistanceClose_TwoDCode(apy, bpy)
 96 |             
 97 |             if apy.consonant is not bpy.consonant:
 98 |                 numDiff+=1
 99 |             
100 |             if not(str(apy.vowel) == str(bpy.vowel)):
101 |                 numDiff+=1
102 |             
103 |             if apy.tone is not bpy.tone:
104 |                 numDiff+=0.01;
105 |                 
106 |         diffRatio = (numDiff)/tot;
107 |         a = 0
108 |         if diffRatio is 0:
109 |             a=1
110 |         return res*diffRatio;
111 |             
def getEditDistanceClose_TwoDCode(a, b):
    """Return the distance between two ``Pinyin`` syllables.

    The consonants and the vowels are each looked up in their 2-D code table
    and compared with ``getDistance_TwoDCode``. The smaller of (consonant
    distance + vowel distance) and the value from ``getSimDisFromHardcodMap``
    is taken, and one tenth of the absolute tone difference is added.

    Args:
        a: First ``Pinyin`` object (or None).
        b: Second ``Pinyin`` object (or None).

    Returns:
        The distance as a number; ``0`` (after printing a message) when either
        argument is None.

    Raises:
        Exception: Any error raised while computing the distance (for example
            ``KeyError`` for a consonant or vowel missing from the tables) is
            re-raised after the offending syllables have been printed.
    """
    if a is None or b is None:
        # Format defensively: calling toString() on None used to raise inside
        # this guard, so the function crashed instead of returning 0.
        print("Error:pinyin({},{})".format(
            None if a is None else a.toString(),
            None if b is None else b.toString()))
        return 0

    try:
        twoDcode_consonant_a = consonantMap_TwoDCode[a.consonant]
        twoDcode_consonant_b = consonantMap_TwoDCode[b.consonant]
        cDis = abs(getDistance_TwoDCode(twoDcode_consonant_a, twoDcode_consonant_b))

        twoDcode_vowel_a = vowelMap_TwoDCode[a.vowel]
        twoDcode_vowel_b = vowelMap_TwoDCode[b.vowel]
        vDis = abs(getDistance_TwoDCode(twoDcode_vowel_a, twoDcode_vowel_b))

        hcDis = getSimDisFromHardcodMap(a, b)

        return min((cDis + vDis), hcDis) + 1.0 * abs(a.tone - b.tone) / 10
    except Exception:
        # Report which syllables failed, then let the caller see the error.
        print("Error pinyin {}{}".format(a.toString(), b.toString()))
        raise
137 | 
def getSimDisFromHardcodMap(a, b):
    """Return 2.0 when the two syllables are paired in ``hardcodeMap``.

    The toneless spelling of ``a`` is looked up first; if it has an entry,
    that entry must equal the toneless spelling of ``b``. Only when ``a`` has
    no entry is the reverse direction (``b`` mapping to ``a``) checked.

    Args:
        a: First ``Pinyin`` object.
        b: Second ``Pinyin`` object.

    Returns:
        ``2.0`` for a hard-coded similar pair, otherwise
        ``sys.float_info.max``. The lookup is best-effort: if the inputs or
        the table cannot be used, ``sys.float_info.max`` is returned as well.
    """
    no_match = sys.float_info.max
    try:
        key_a = a.toStringNoTone()
        key_b = b.toStringNoTone()
        # NOTE(review): hardcodeMap is not defined in the visible part of this
        # file; it is assumed to be a dict of toneless pinyin -> toneless
        # pinyin. NameError is caught below so a missing table stays harmless.
        simPy = hardcodeMap.get(key_a)
        if simPy is not None:
            # Compare by value: `is` on freshly formatted strings was never
            # true, so the hard-coded pairs could not match.
            if simPy == key_b:
                return 2.0
        elif hardcodeMap.get(key_b) == key_a:
            # .get() lets the reverse lookup run; indexing with [] raised
            # KeyError on a missing first key and skipped this branch.
            return 2.0
    except (AttributeError, NameError, TypeError):
        return no_match
    return no_match
151 |     
152 |     
def getDistance_TwoDCode(X, Y):
    """Return the Euclidean distance between two 2-D code points.

    Args:
        X: A pair ``(x1, x2)``.
        Y: A pair ``(y1, y2)``.

    Returns:
        ``sqrt((x1 - y1) ** 2 + (x2 - y2) ** 2)`` as a float.
    """
    (first_a, second_a), (first_b, second_b) = X, Y
    delta_first = abs(first_a - first_b)
    delta_second = abs(second_a - second_b)
    squared_sum = delta_first ** 2 + delta_second ** 2
    return math.sqrt(squared_sum)
161 | 
162 | 
# Consonant -> 2-D code point. getEditDistanceClose_TwoDCode looks both
# consonants up here and measures them with getDistance_TwoDCode, so entries
# that share a first coordinate and have close second coordinates end up
# close to each other. The empty string stands for "no consonant".
consonantMap_TwoDCode = {
    "b": (1.0, 0.5),
    "p": (1.0, 1.5),

    "g": (7.0, 0.5),
    "k": (7.0, 1.5),
    "h": (7.0, 3.0),
    "f": (7.0, 4.0),

    "d": (12.0, 0.5),
    "t": (12.0, 1.5),

    "n": (22.5, 0.5),
    "l": (22.5, 1.5),
    "r": (22.5, 2.5),

    "zh": (30, 1.7),
    "z": (30, 1.5),
    "j": (30.0, 0.5),

    "ch": (31, 1.7),
    "c": (31, 1.5),
    "q": (31.0, 0.5),

    "sh": (33, 3.7),
    "s": (33, 3.5),
    "x": (33, 2.5),

    "m": (50.0, 3.5),

    "y": (40.0, 0.0),
    "w": (40, 5.0),

    "": (99999.0, 99999.0),
}
200 | 
201 | 
202 | # In[662]:
203 | 
204 | 
# Vowel (final) -> 2-D code point. getEditDistanceClose_TwoDCode looks both
# vowels up here and measures them with getDistance_TwoDCode. Alternative
# spellings of one final ("iou"/"iu", "uei"/"ui", "u:"/"v", ...) carry the
# same point. The empty string stands for "no vowel".
vowelMap_TwoDCode = {
    "a": (1.0, 0.0),
    "an": (1.0, 1.0),
    "ang": (1.0, 1.5),

    "ia": (0.0, 0.0),
    "ian": (0.0, 1.0),
    "iang": (0.0, 1.5),

    "ua": (2.0, 0.0),
    "uan": (2.0, 1.0),
    "uang": (2.0, 1.5),
    "u:an": (2.0, 1.0),

    "ao": (5.0, 0.0),
    "iao": (5.0, 1.5),

    "ai": (8.0, 0.0),
    "uai": (8.0, 1.5),

    "o": (20, 0.0),
    "io": (20, 2.5),
    "iou": (20, 4),
    "iu": (20, 4),
    "ou": (20, 5.5),
    "uo": (20, 6.0),

    "ong": (20, 8.0),
    "iong": (20, 9.5),

    "er": (41, 1),
    "e": (41, 0.0),

    "u:e": (40, 5.0),
    "ve": (40, 5.0),
    "ue": (40, 5.0),
    "ie": (40, 4.5),
    "ei": (40, 4.0),
    "uei": (40, 3.0),
    "ui": (40, 3.0),

    "en": (42, 0.5),
    "eng": (42, 1.0),

    "uen": (43, 0.5),
    "un": (43, 0.5),
    "ueng": (43, 1.0),

    "i": (60, 1.0),
    "in": (60, 2.5),
    "ing": (60, 3.0),

    "u:": (61, 1.0),
    "v": (61, 1.0),
    "u:n": (61, 2.5),
    "vn": (61, 2.5),

    "u": (80, 0.0),

    "": (99999.0, 99999.0),
}
272 | 
# Following the pinyin similarity rules, vowels and consonants are given a
# unified code; the grouping is kept strict for now.
# For 2-D codes (a, b), entries whose b values differ by less than 0.5 are
# coded as one; under this similarity rule 0.5 also counts as similar.
# Grouping that loosely may be too broad and hurt results in production.
consonantMap_RCode = {
    "b": "b",
    "p": "p",

    "g": "g",
    "k": "k",
    "h": "h",
    "f": "f",

    "d": "d",
    "t": "t",

    "n": "l",
    "l": "l",
    "r": "r",

    "zh": "z",
    "z": "z",
    "j": "j",

    "ch": "c",
    "c": "c",
    "q": "q",

    "sh": "s",
    "s": "s",
    "x": "x",

    "m": "m",

    "y": "y",
    "w": "w",

    "": "C",
}
313 | 
# For vowels, 2-D codes (a, b) whose b values differ by at most 0.5 are
# encoded as one code.
vowelMap_RCode = {
    "a": "a",
    "an": "an",
    "ang": "an",

    "ia": "ia",
    "ian": "ian",
    "iang": "ian",

    "ua": "ua",
    "uan": "uan",
    "uang": "uan",
    "u:an": "u:an",

    "ao": "ao",
    "iao": "iao",

    "ai": "ai",
    "uai": "uai",

    "o": "o",
    "io": "io",
    "iou": "iou",
    "iu": "iu",
    "ou": "ou",
    "uo": "ou",

    "ong": "ong",
    "iong": "iong",

    "er": "er",
    "e": "e",

    "u:e": "ve",
    "ve": "ve",
    "ue": "ve",
    "ie": "ie",
    "ei": "ie",
    "uei": "uei",
    "ui": "ui",

    "en": "en",
    "eng": "en",

    "uen": "un",
    "un": "un",
    "ueng": "un",

    "i": "i",
    "in": "in",
    "ing": "in",

    "u:": "v",
    "v": "v",
    "u:n": "vn",
    "vn": "vn",

    "u": "u",

    "": "V",
}
382 | 
# Characters that are skipped instead of being converted to pinyin: the
# ASCII digits, the ASCII letters and the Roman numeral sign "Ⅰ".
rcode_ignore_set = set(
    "0123456789"
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "Ⅰ"
)
385 | 
386 | # In[663]:
387 | 
388 | 
# Consonants tried in this order by Pinyin.parseConsonant; the two-letter
# "zh", "ch" and "sh" come before "z", "c" and "s" so they are matched first.
consonantList = [
    "b", "p", "m", "f",
    "d", "t", "n", "l",
    "g", "k", "h",
    "j", "q", "x",
    "zh", "ch", "sh", "r",
    "z", "c", "s",
    "y", "w",
]
390 | 
391 | 
392 | # In[664]:
393 | 
394 | 
# Vowel spellings accepted by Pinyin.parseConsonant for syllables that have
# no leading consonant.
vowelList = [
    "a", "o", "e", "i", "u", "v", "u:", "er",
    "ao", "ai", "ou", "ei",
    "ia", "iao", "iu", "iou", "ie",
    "ui", "uei", "ua", "uo", "uai",
    "u:e", "ve",
    "an", "en", "in", "un", "uen",
    "vn", "u:n", "ian", "uan",
    "u:an", "van",
    "ang", "eng", "ing", "ong",
    "iang", "iong", "uang", "ueng",
]
396 | 
397 | 
398 | # In[665]:
399 | 
400 | 
class Pinyin:
    """One pinyin syllable split into consonant (initial), vowel (final) and tone.

    Built from a tone-suffixed spelling such as ``"zhong1"``: the last
    character must be the tone digit and everything before it is the
    syllable. After construction ``consonant`` and ``vowel`` hold the parts
    as normalised by ``pinyinRewrite``; ``consonant`` is ``""`` when the
    syllable has no initial.
    """

    # Initials in the order parseConsonant tries them; "zh", "ch" and "sh"
    # precede "z", "c" and "s" so the two-letter initial is matched first.
    consonantList = [
        "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x",
        "zh", "ch", "sh", "r", "z", "c", "s", "y", "w",
    ]
    # Finals accepted for a syllable that has no initial.
    vowelList = [
        "a", "o", "e", "i", "u", "v", "u:", "er", "ao", "ai", "ou", "ei",
        "ia", "iao", "iu", "iou", "ie", "ui", "uei", "ua", "uo", "uai",
        "u:e", "ve", "an", "en", "in", "un", "uen", "vn", "u:n", "ian",
        "uan", "u:an", "van", "ang", "eng", "ing", "ong", "iang", "iong",
        "uang", "ueng",
    ]

    def __init__(self, pinyinstr):
        """Parse *pinyinstr*, a syllable followed by a single tone digit.

        Raises:
            ValueError: if the last character is not a digit.
            IndexError: if *pinyinstr* is empty.
        """
        self.tone = int(pinyinstr[-1])
        self.locp = pinyinstr[0:-1].lower()
        self.consonant, self.vowel = self.parseConsonant(self.locp)
        self.pinyinRewrite()

    def parseConsonant(self, pinyin):
        """Split *pinyin* (no tone) into ``(consonant, vowel)``.

        Returns ``(None, vowel)`` for a bare final listed in ``vowelList`` and
        ``(None, None)``, after printing a warning, when nothing matches.
        """
        # Use the class's own lists so the class does not depend on
        # same-named module globals being defined.
        for consonant in self.consonantList:
            if pinyin.startswith(consonant):
                return (consonant, pinyin[len(consonant):])
        # it's a vowel without consonant
        if pinyin in self.vowelList:
            return None, pinyin.lower()

        print("Invalid Pinyin, please check!")
        return None, None

    def toStringNoTone(self):
        """Return consonant and vowel concatenated, without the tone."""
        return "{}{}".format(self.consonant, self.vowel)

    def toStringWithTone(self):
        """Return consonant, vowel and tone digit concatenated."""
        return "{}{}{}".format(self.consonant, self.vowel, self.tone)

    def toString(self):
        """Same text as ``toStringWithTone``."""
        return self.toStringWithTone()

    def pinyinRewrite(self):
        """Normalise ``consonant`` and ``vowel`` in place.

        * "v" in the final becomes "u:".
        * A missing initial becomes "".
        * "y" and "w" initials are folded into the final ("you" -> "iu",
          "wei" -> "ui", "yu" -> "u:") and the initial becomes "".
        * "iou", "uei" and "uen" are shortened to "iu", "ui" and "un".
        """
        if self.vowel is None:
            # parseConsonant rejected the syllable. Fall back to the empty
            # initial/final instead of failing on `'v' in None` below.
            self.consonant = ""
            self.vowel = ""
            return

        yVowels = {"u", "ue", "uan", "un", "u:", "u:e", "u:an", "u:n"}
        # NOTE(review): "g" here looks like a typo for "q" (the usual j/q/x
        # rule). Left unchanged because the packaged lookup tables may have
        # been generated with this exact rule - confirm before changing.
        tconsonant = {"j", "g", "x"}

        if "v" in self.vowel:
            self.vowel = self.vowel.replace("v", "u:")

        if not self.consonant:
            self.consonant = ""
            return

        # Compare by value: identity (`is`) on strings only matches by
        # accident of interning and never matches sliced or built strings.
        if self.consonant == "y":
            if self.vowel in yVowels:
                if "u:" not in self.vowel:
                    self.vowel = self.vowel.replace("u", "u:")
            else:
                self.vowel = "i" + self.vowel
                self.vowel = self.vowel.replace("iii", "i")
                self.vowel = self.vowel.replace("ii", "i")
            self.consonant = ""
        elif self.consonant == "w":
            self.vowel = "u" + self.vowel
            self.vowel = self.vowel.replace("uuu", "u")
            self.vowel = self.vowel.replace("uu", "u")
            self.consonant = ""

        if self.consonant in tconsonant and self.vowel == "u":
            self.vowel = "u:"

        if self.vowel == "iou":
            self.vowel = "iu"

        if self.vowel == "uei":
            self.vowel = "ui"

        if self.vowel == "uen":
            self.vowel = "un"
478 |                 
479 |         
480 |         
481 | 
482 | 
483 | # In[666]:
484 | 
485 | 
# Toneless syllable -> the one syllable hand-listed as sounding close to it.
# Looked up by getSimDisFromHardcodMap. Only the hua/fa and huan/fan pairs
# are listed in both directions.
hardcodeMap = {
    "hua": "fa", "fa": "hua",
    "huan": "fan", "fan": "huan",
    "hui": "fei",
    "jie": "zhe",
    "kou": "ke",
    "gou": "ge",
    "zhong": "zen",
    "san": "shang",
}
498 | 
499 | 
500 | # In[667]:
501 | 
502 | 
# Scalar code per initial consonant. getConsonantCandids treats initials
# whose codes fall in the same integer window as close to each other.
# "" and "__v" (no initial) sit far away at 99999.
consonantMap = {
    "b": 1.0, "p": 2.0,
    "m": 11.0, "f": 12.0,
    "d": 21.0, "t": 22.0,
    "n": 31.0, "l": 31.0, "r": 32.0,
    "g": 41.0, "k": 42.0, "h": 43.0,
    "j": 46.0, "q": 47.0, "x": 48.0,
    "z": 61.0, "c": 62.0,
    "zh": 71.0, "ch": 72.0,
    "sh": 81.0, "s": 82.0,
    "y": 90.0, "w": 100.0,
    "": 99999.0, "__v": 99999.0,
}
540 | 
541 | 
542 | # In[668]:
543 | 
544 | 
# Scalar code per final (vowel part). getVowelCandids treats finals whose
# codes fall in the same integer window as close to each other.
# Key order is preserved because populateDoubleVowelsMap iterates it.
# NOTE(review): "v:an" looks like a typo for "van" (vowelList has "van" and
# no "v:an") - confirm before changing.
vowelMap = {
    "ia": 0.0, "a": 2.0, "ai": 3.0, "uai": 4.0, "iao": 6.0, "ao": 7.0,
    "uan": 10.0, "an": 11.0, "ang": 12.0, "ian": 14.0, "iang": 15.0,
    "uang": 17.0, "ua": 18.0,
    "o": 21.0, "io": 22.0, "ou": 23.0, "uo": 24.0, "ong": 26.0, "iong": 27.0,
    "e": 31.0, "ei": 33.0, "ie": 34.0, "er": 37.0,
    "ve": 40.0, "ue": 40.0, "u:e": 40.0,
    "en": 43.0, "eng": 44.0,
    "uen": 45.0, "ueng": 45.0,
    "u:en": 42.0, "ven": 42.0,
    "i": 50.0, "u:": 51.0, "v": 51.0, "u:n": 53.0, "vn": 53.0,
    "u:an": 55.0, "v:an": 55.0,
    "in": 53.0, "ing": 55.0,
    "u": 60.0, "ui": 63.0, "uei": 63.0, "iu": 64.0, "iou": 64.0, "un": 66.0,
    "": 99999.0, "__v": 99999.0,
}
607 | 
608 | 
609 | # ### Get Pinyin Candidates that are close to an input pinyin
610 | 
611 | # In[644]:
612 | 
613 | 
# Reverse indexes (code -> list of spellings sharing that code). They start
# empty and are filled on first use from consonantMap / vowelMap by
# populateDoubleConsonantsMap and populateDoubleVowelsMap.
doubleConsonantsMap = {}
doubleVowelsMap = {}
616 | 
def getClosePinyinCandids(word, theta=2):
    """Return Pinyin objects that sound close to the first syllable of *word*.

    Two families of variants are produced, each in tones 1-4: the original
    final paired with every nearby initial (skipping the original initial
    and the "__v" placeholder), then the original initial paired with every
    nearby final. *theta* is forwarded to getConsonantCandids and
    getVowelCandids as the code-window width.
    """
    source = Pinyin(to_pinyin(word)[0])
    tones = range(1, 5)
    nearby = []

    for initial in getConsonantCandids(theta, source):
        if initial == source.consonant or initial == '__v':
            continue
        for tone in tones:
            nearby.append(Pinyin(initial + source.vowel + str(tone)))

    for final in getVowelCandids(theta, source):
        for tone in tones:
            if source.consonant is None:
                spelling = final + str(tone)
            else:
                spelling = source.consonant + final + str(tone)
            nearby.append(Pinyin(spelling))
    return nearby
640 |             
641 |     
642 |     
def getConsonantCandids(theta, word_py):
    """List the initials whose consonantMap code is near *word_py*'s initial.

    Scans the integer codes in ``[code - theta, code + theta)`` (the upper
    bound is excluded) and collects every initial stored under them in
    doubleConsonantsMap. Returns an empty list when ``word_py.consonant``
    is None.
    """
    populateDoubleConsonantsMap()
    matches = []
    if word_py.consonant is None:
        return matches

    centre = consonantMap[word_py.consonant]
    for code in range(int(centre - theta), int(centre + theta)):
        bucket = doubleConsonantsMap.get(float(code))
        if bucket is not None:
            matches += bucket
    return matches
657 |     
658 | 
def getVowelCandids(theta, word_py):
    """List the finals whose vowelMap code is near *word_py*'s final.

    Scans the integer codes in ``[code - theta, code + theta)`` (the upper
    bound is excluded) and collects every final stored under them in
    doubleVowelsMap. Raises KeyError if ``word_py.vowel`` is not a key of
    vowelMap.
    """
    populateDoubleVowelsMap()
    centre = vowelMap[word_py.vowel]
    matches = []
    for code in range(int(centre - theta), int(centre + theta)):
        bucket = doubleVowelsMap.get(float(code))
        if bucket is not None:
            matches += bucket
    return matches
670 | 
def populateDoubleConsonantsMap():
    """Fill doubleConsonantsMap (code -> list of initials) from consonantMap.

    Does nothing when the index has already been built.
    """
    # Truthiness instead of `len(...) is not 0`, which tested object identity
    # against an int literal.
    if doubleConsonantsMap:
        return
    for consonant, code in consonantMap.items():
        doubleConsonantsMap.setdefault(code, []).append(consonant)
680 |         
def populateDoubleVowelsMap():
    """Fill doubleVowelsMap (code -> list of finals) from vowelMap.

    Does nothing when the index has already been built.
    """
    # Truthiness instead of `len(...) is not 0`, which tested object identity
    # against an int literal.
    if doubleVowelsMap:
        return
    for vowel, code in vowelMap.items():
        doubleVowelsMap.setdefault(code, []).append(vowel)
690 |         
691 | 
def getCandidates(sentence, mode="simplified", theta=1):
    """Return entries from the pinyin lookup tables that sound like *sentence*.

    For each item of *sentence*, getClosePinyinCandids supplies similar
    syllables. Every combination (one syllable per position) is joined with
    spaces into a key and looked up in pinyin_to_simplified when *mode*
    equals "simplified", otherwise in pinyin_to_traditional. The number of
    combinations is the product of the per-position candidate counts.

    Args:
        sentence: iterable of words/characters to find sound-alikes for.
        mode: "simplified" selects the simplified table; anything else
            selects the traditional one.
        theta: code-window width forwarded to getClosePinyinCandids.

    Returns:
        A list accumulating the table values of every key that was found.
    """
    # Compare by value: `mode is "simplified"` only worked for interned
    # literals and silently chose the traditional table otherwise.
    table = pinyin_to_simplified if mode == "simplified" else pinyin_to_traditional

    words_candidates = [getClosePinyinCandids(word, theta) for word in sentence]

    candidates = []
    for combination in itertools.product(*words_candidates):
        # "None" is removed because a Pinyin with no consonant may format it
        # into its text.
        searchKey = " ".join(
            py.toStringWithTone().replace("None", "") for py in combination
        ).strip()
        if searchKey in table:
            candidates += table[searchKey]
    return candidates
712 | 
713 | 
# Encode as RCode; the amount of generalisation can be increased or reduced
# as needed (by editing consonantMap_RCode / vowelMap_RCode).
def getRCode(sentence):
    """Return the RCode of *sentence*: one code per item, joined by "'".

    Items found in rcode_ignore_set are kept as they are. Every other item
    is converted with to_pinyin, parsed by Pinyin, and encoded as
    consonantMap_RCode[consonant] + vowelMap_RCode[vowel].
    """
    spellings = [
        wd if wd in rcode_ignore_set else to_pinyin(wd)[0]
        for wd in sentence
    ]

    codes = []
    for py in spellings:
        if py in rcode_ignore_set:
            codes.append(py)
            continue
        syllable = Pinyin(py)
        codes.append(
            consonantMap_RCode[syllable.consonant] + vowelMap_RCode[syllable.vowel]
        )

    return "'".join(codes)
732 | 
733 | 
734 | 
735 | 
736 | 
737 | 


--------------------------------------------------------------------------------
/code/dimsim/dimsim/dimsim.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # coding: utf-8
  3 | 
  4 | # In[656]:
  5 | 
  6 | 
  7 | from pypinyin import pinyin, lazy_pinyin, Style
  8 | import sys
  9 | import math
 10 | import itertools
 11 | import pickle
 12 | import os
 13 | 
 14 | '''
 15 |     @liushaohua
 16 |     本更改仅限个人使用，仅仅使用编码方式，对拼音进行一个统一编码
 17 | '''
 18 | 
 19 | 
 20 | # In[657]:
 21 | 
 22 | 
 23 | work_dir = os.path.abspath(__file__)
 24 | indexOfLastSlash = work_dir.rfind("/")
 25 | work_dir = work_dir[0:indexOfLastSlash]+"/"
 26 | 
 27 | 
 28 | # In[658]:
 29 | 
 30 | 
 31 | sfile = open(work_dir+'pinyin_to_simplified.pickle', 'rb')
 32 | 
 33 | # dump information to that file
 34 | pinyin_to_simplified = pickle.load(sfile)
 35 | sfile.close()
 36 | 
 37 | tfile = open(work_dir+'pinyin_to_traditional.pickle', 'rb')
 38 | pinyin_to_traditional = pickle.load(tfile)
 39 | tfile.close()
 40 | 
 41 | 
 42 | # In[659]:
 43 | 
 44 | 
 45 | def to_pinyin(utterance):
 46 |     length = len(utterance)
 47 |     translated = []
 48 |     pinyin_encodings = pinyin(utterance, style=Style.TONE2)
 49 |     for i in range(length):
 50 |         currPinyin = pinyin_encodings[i][0]
 51 | #         print("{} translates to {}".format(currPinyin, putToneToEnd(currPinyin)))
 52 |         translated.append(putToneToEnd(currPinyin))
 53 |     return translated
 54 | 
 55 | def putToneToEnd(input_pinyin):
 56 |     if len(input_pinyin) is 1:
 57 |         return input_pinyin + '1'
 58 |     tone_index = 0
 59 |     tone = '1'
 60 |     for index, character in enumerate(input_pinyin):
 61 |         if character in ("1","2","3","4"):
 62 |             tone_index = index
 63 |             tone = input_pinyin[index]
 64 |             break;
 65 |     if tone_index is 0:
 66 |         return input_pinyin + "5"
 67 |     return input_pinyin[0:index] + input_pinyin[index+1:] + tone
 68 | 
 69 | def get_distance(utterance1, utterance2):
 70 |     if(len(utterance1) is not len(utterance2)):
 71 |         print("the two inputs do not have the same length")
 72 |         return sys.float_info.max
 73 |     else:
 74 |         u1 = to_pinyin(utterance1)
 75 |         u2 = to_pinyin(utterance2)
 76 |         
 77 |         la = []
 78 |         lb = []
 79 |         for py in u1:
 80 |             la.append(Pinyin(py))
 81 |         for py in u2:
 82 |             lb.append(Pinyin(py))
 83 |     
 84 | 
 85 |         res = 0.0
 86 |         numDiff = 0        
 87 |         tot = len(utterance1)*2.1
 88 |         for i in range (len(utterance1)):
 89 |             apy = la[i]
 90 |             bpy = lb[i]
 91 | 
 92 |             if (apy is None) or (bpy is None):
 93 |                 print("!Error {},{}".format(la, lb))
 94 |             
 95 |             res += getEditDistanceClose_TwoDCode(apy, bpy)
 96 |             
 97 |             if apy.consonant is not bpy.consonant:
 98 |                 numDiff+=1
 99 |             
100 |             if not(str(apy.vowel) == str(bpy.vowel)):
101 |                 numDiff+=1
102 |             
103 |             if apy.tone is not bpy.tone:
104 |                 numDiff+=0.01;
105 |                 
106 |         diffRatio = (numDiff)/tot;
107 |         a = 0
108 |         if diffRatio is 0:
109 |             a=1
110 |         return res*diffRatio;
111 |             
def getEditDistanceClose_TwoDCode(a, b):
    """Return the distance between two Pinyin objects.

    The result is ``min(consonant distance + vowel distance, hard-coded
    distance) + |tone difference| / 10``. Consonant and vowel distances are
    Euclidean distances between the coordinates in consonantMap_TwoDCode and
    vowelMap_TwoDCode.

    Returns 0 (after printing a message) when either argument is None.
    Any exception raised during the lookup is re-raised after the offending
    pair is printed.
    """
    if a is None or b is None:
        # Format the objects themselves: calling .toString() on None, as the
        # guard did before, raised instead of reporting the problem.
        print("Error:pinyin({},{})".format(a, b))
        return 0

    try:
        cDis = abs(getDistance_TwoDCode(consonantMap_TwoDCode[a.consonant],
                                        consonantMap_TwoDCode[b.consonant]))
        vDis = abs(getDistance_TwoDCode(vowelMap_TwoDCode[a.vowel],
                                        vowelMap_TwoDCode[b.vowel]))
        hcDis = getSimDisFromHardcodMap(a, b)
        return min((cDis + vDis), hcDis) + 1.0 * abs(a.tone - b.tone) / 10
    except Exception:
        print("Error pinyin {}{}".format(a.toString(), b.toString()))
        raise
137 | 
def getSimDisFromHardcodMap(a, b):
    """Return 2.0 if hardcodeMap lists *a* and *b* as a similar pair.

    The toneless spellings of both Pinyin objects are checked in both
    directions (a -> b and b -> a). Returns ``sys.float_info.max`` when the
    pair is not listed or either argument is None.
    """
    if a is None or b is None:
        return sys.float_info.max

    key_a = a.toStringNoTone()
    key_b = b.toStringNoTone()
    # Compare by value. The previous `is` test compared a dict value with a
    # freshly formatted string, so no listed pair ever matched.
    if hardcodeMap.get(key_a) == key_b or hardcodeMap.get(key_b) == key_a:
        return 2.0
    return sys.float_info.max
151 |     
152 |     
def getDistance_TwoDCode(X, Y):
    """Return the Euclidean distance between two 2-D points *X* and *Y*.

    Each argument must unpack into exactly two numbers.
    """
    (ax, ay), (bx, by) = X, Y
    dx = abs(ax - bx)
    dy = abs(ay - by)
    return math.sqrt(dx ** 2 + dy ** 2)
161 | 
162 | 
# 2-D coordinate per initial consonant. getEditDistanceClose_TwoDCode takes
# the Euclidean distance between two of these as the consonant distance.
# "" (no initial) is placed far away from every real initial.
consonantMap_TwoDCode = {
    "b": (1.0, 0.5), "p": (1.0, 1.5),
    "g": (7.0, 0.5), "k": (7.0, 1.5), "h": (7.0, 3.0), "f": (7.0, 4.0),
    "d": (12.0, 0.5), "t": (12.0, 1.5),
    "n": (22.5, 0.5), "l": (22.5, 1.5), "r": (22.5, 2.5),
    "zh": (30, 1.7), "z": (30, 1.5), "j": (30.0, 0.5),
    "ch": (31, 1.7), "c": (31, 1.5), "q": (31.0, 0.5),
    "sh": (33, 3.7), "s": (33, 3.5), "x": (33, 2.5),
    "m": (50.0, 3.5),
    "y": (40.0, 0.0), "w": (40, 5.0),
    "": (99999.0, 99999.0),
}
200 | 
201 | 
202 | # In[662]:
203 | 
204 | 
# 2-D coordinate per final (vowel part). getEditDistanceClose_TwoDCode takes
# the Euclidean distance between two of these as the vowel distance.
# "" (no final) is placed far away from every real final.
vowelMap_TwoDCode = {
    "a": (1.0, 0.0), "an": (1.0, 1.0), "ang": (1.0, 1.5),
    "ia": (0.0, 0.0), "ian": (0.0, 1.0), "iang": (0.0, 1.5),
    "ua": (2.0, 0.0), "uan": (2.0, 1.0), "uang": (2.0, 1.5), "u:an": (2.0, 1.0),
    "ao": (5.0, 0.0), "iao": (5.0, 1.5),
    "ai": (8.0, 0.0), "uai": (8.0, 1.5),
    "o": (20, 0.0), "io": (20, 2.5), "iou": (20, 4), "iu": (20, 4),
    "ou": (20, 5.5), "uo": (20, 6.0),
    "ong": (20, 8.0), "iong": (20, 9.5),
    "er": (41, 1), "e": (41, 0.0),
    "u:e": (40, 5.0), "ve": (40, 5.0), "ue": (40, 5.0), "ie": (40, 4.5),
    "ei": (40, 4.0), "uei": (40, 3.0), "ui": (40, 3.0),
    "en": (42, 0.5), "eng": (42, 1.0),
    "uen": (43, 0.5), "un": (43, 0.5), "ueng": (43, 1.0),
    "i": (60, 1.0), "in": (60, 2.5), "ing": (60, 3.0),
    "u:": (61, 1.0), "v": (61, 1.0), "u:n": (61, 2.5), "vn": (61, 2.5),
    "u": (80, 0.0),
    "": (99999.0, 99999.0),
}
272 | 
# Following the pinyin-similarity rules above, vowels and consonants are
# given a unified code. Kept strict for now: entries whose (a, b) coordinates
# differ in b by less than 0.5 are merged into one code (under these rules
# 0.5 would also count as similar). Merging that far may generalise too much
# and hurt results in production.
# Used by getRCode. Here n/l, zh/z, ch/c and sh/s share a code; "" maps to "C".
consonantMap_RCode = {
    "b": "b", "p": "p",
    "g": "g", "k": "k", "h": "h", "f": "f",
    "d": "d", "t": "t",
    "n": "l", "l": "l", "r": "r",
    "zh": "z", "z": "z", "j": "j",
    "ch": "c", "c": "c", "q": "q",
    "sh": "s", "s": "s", "x": "x",
    "m": "m",
    "y": "y", "w": "w",
    "": "C",
}
313 | 
# For vowels, finals whose (a, b) coordinates differ in b by 0.5 or less are
# encoded as one code. Used by getRCode; "" maps to "V".
vowelMap_RCode = {
    "a": "a", "an": "an", "ang": "an",
    "ia": "ia", "ian": "ian", "iang": "ian",
    "ua": "ua", "uan": "uan", "uang": "uan", "u:an": "u:an",
    "ao": "ao", "iao": "iao",
    "ai": "ai", "uai": "uai",
    "o": "o", "io": "io", "iou": "iou", "iu": "iu", "ou": "ou", "uo": "ou",
    "ong": "ong", "iong": "iong",
    "er": "er", "e": "e",
    "u:e": "ve", "ve": "ve", "ue": "ve", "ie": "ie", "ei": "ie",
    "uei": "uei", "ui": "ui",
    "en": "en", "eng": "en",
    "uen": "un", "un": "un", "ueng": "un",
    "i": "i", "in": "in", "ing": "in",
    "u:": "v", "v": "v", "u:n": "vn", "vn": "vn",
    "u": "u",
    "": "V",
}
382 | 
# Characters that are not converted to pinyin: getRCode copies any item found
# in this set (digits, ASCII letters and "Ⅰ") straight into its output.
rcode_ignore_set = set("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZⅠ")
385 | 
386 | # In[663]:
387 | 
388 | 
# Initial consonants in the order Pinyin.parseConsonant tries them; "zh", "ch"
# and "sh" precede "z", "c" and "s" so the two-letter initial is matched first.
consonantList = ["b", "p", "m", "f", "d", "t", "n", "l", "g", "k","h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s","y", "w"]


# In[664]:


# Finals that Pinyin.parseConsonant accepts for a syllable with no initial.
vowelList = ["a", "o", "e", "i", "u", "v","u:","er", "ao","ai", "ou","ei", "ia", "iao", "iu", "iou","ie", "ui","uei","ua","uo","uai", "u:e","ve",  "an", "en", "in", "un","uen", "vn","u:n","ian","uan", "u:an","van", "ang", "eng", "ing", "ong","iang","iong","uang","ueng"]
396 | 
397 | 
398 | # In[665]:
399 | 
400 | 
class Pinyin:
    """One pinyin syllable split into consonant (initial), vowel (final) and tone.

    Built from a tone-suffixed spelling such as ``"zhong1"``: the last
    character must be the tone digit and everything before it is the
    syllable. After construction ``consonant`` and ``vowel`` hold the parts
    as normalised by ``pinyinRewrite``; ``consonant`` is ``""`` when the
    syllable has no initial.
    """

    # Initials in the order parseConsonant tries them; "zh", "ch" and "sh"
    # precede "z", "c" and "s" so the two-letter initial is matched first.
    consonantList = [
        "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q", "x",
        "zh", "ch", "sh", "r", "z", "c", "s", "y", "w",
    ]
    # Finals accepted for a syllable that has no initial.
    vowelList = [
        "a", "o", "e", "i", "u", "v", "u:", "er", "ao", "ai", "ou", "ei",
        "ia", "iao", "iu", "iou", "ie", "ui", "uei", "ua", "uo", "uai",
        "u:e", "ve", "an", "en", "in", "un", "uen", "vn", "u:n", "ian",
        "uan", "u:an", "van", "ang", "eng", "ing", "ong", "iang", "iong",
        "uang", "ueng",
    ]

    def __init__(self, pinyinstr):
        """Parse *pinyinstr*, a syllable followed by a single tone digit.

        Raises:
            ValueError: if the last character is not a digit.
            IndexError: if *pinyinstr* is empty.
        """
        self.tone = int(pinyinstr[-1])
        self.locp = pinyinstr[0:-1].lower()
        self.consonant, self.vowel = self.parseConsonant(self.locp)
        self.pinyinRewrite()

    def parseConsonant(self, pinyin):
        """Split *pinyin* (no tone) into ``(consonant, vowel)``.

        Returns ``(None, vowel)`` for a bare final listed in ``vowelList`` and
        ``(None, None)``, after printing a warning, when nothing matches.
        """
        # Use the class's own lists so the class does not depend on
        # same-named module globals being defined.
        for consonant in self.consonantList:
            if pinyin.startswith(consonant):
                return (consonant, pinyin[len(consonant):])
        # it's a vowel without consonant
        if pinyin in self.vowelList:
            return None, pinyin.lower()

        print("Invalid Pinyin, please check!")
        return None, None

    def toStringNoTone(self):
        """Return consonant and vowel concatenated, without the tone."""
        return "{}{}".format(self.consonant, self.vowel)

    def toStringWithTone(self):
        """Return consonant, vowel and tone digit concatenated."""
        return "{}{}{}".format(self.consonant, self.vowel, self.tone)

    def toString(self):
        """Same text as ``toStringWithTone``."""
        return self.toStringWithTone()

    def pinyinRewrite(self):
        """Normalise ``consonant`` and ``vowel`` in place.

        * "v" in the final becomes "u:".
        * A missing initial becomes "".
        * "y" and "w" initials are folded into the final ("you" -> "iu",
          "wei" -> "ui", "yu" -> "u:") and the initial becomes "".
        * "iou", "uei" and "uen" are shortened to "iu", "ui" and "un".
        """
        if self.vowel is None:
            # parseConsonant rejected the syllable. Fall back to the empty
            # initial/final instead of failing on `'v' in None` below.
            self.consonant = ""
            self.vowel = ""
            return

        yVowels = {"u", "ue", "uan", "un", "u:", "u:e", "u:an", "u:n"}
        # NOTE(review): "g" here looks like a typo for "q" (the usual j/q/x
        # rule). Left unchanged because the packaged lookup tables may have
        # been generated with this exact rule - confirm before changing.
        tconsonant = {"j", "g", "x"}

        if "v" in self.vowel:
            self.vowel = self.vowel.replace("v", "u:")

        if not self.consonant:
            self.consonant = ""
            return

        # Compare by value: identity (`is`) on strings only matches by
        # accident of interning.
        if self.consonant == "y":
            if self.vowel in yVowels:
                if "u:" not in self.vowel:
                    self.vowel = self.vowel.replace("u", "u:")
            else:
                self.vowel = "i" + self.vowel
                self.vowel = self.vowel.replace("iii", "i")
                self.vowel = self.vowel.replace("ii", "i")
            self.consonant = ""
        elif self.consonant == "w":
            self.vowel = "u" + self.vowel
            self.vowel = self.vowel.replace("uuu", "u")
            self.vowel = self.vowel.replace("uu", "u")
            self.consonant = ""

        if self.consonant in tconsonant and self.vowel == "u":
            self.vowel = "u:"

        if self.vowel == "iou":
            self.vowel = "iu"

        if self.vowel == "uei":
            self.vowel = "ui"

        if self.vowel == "uen":
            self.vowel = "un"
478 |                 
479 |         
480 |         
481 | 
482 | 
483 | # In[666]:
484 | 
485 | 
# Toneless syllable -> the one syllable hand-listed as sounding close to it.
# Looked up by getSimDisFromHardcodMap. Only the hua/fa and huan/fan pairs
# are listed in both directions.
hardcodeMap = {
    "hua": "fa", "fa": "hua",
    "huan": "fan", "fan": "huan",
    "hui": "fei",
    "jie": "zhe",
    "kou": "ke",
    "gou": "ge",
    "zhong": "zen",
    "san": "shang",
}
498 | 
499 | 
500 | # In[667]:
501 | 
502 | 
# Scalar code per initial consonant. getConsonantCandids treats initials
# whose codes fall in the same integer window as close to each other.
# "" and "__v" (no initial) sit far away at 99999.
consonantMap = {
    "b": 1.0, "p": 2.0,
    "m": 11.0, "f": 12.0,
    "d": 21.0, "t": 22.0,
    "n": 31.0, "l": 31.0, "r": 32.0,
    "g": 41.0, "k": 42.0, "h": 43.0,
    "j": 46.0, "q": 47.0, "x": 48.0,
    "z": 61.0, "c": 62.0,
    "zh": 71.0, "ch": 72.0,
    "sh": 81.0, "s": 82.0,
    "y": 90.0, "w": 100.0,
    "": 99999.0, "__v": 99999.0,
}
540 | 
541 | 
542 | # In[668]:
543 | 
544 | 
# Scalar code per final (vowel part). getVowelCandids treats finals whose
# codes fall in the same integer window as close to each other.
# Key order is preserved because populateDoubleVowelsMap iterates it.
# NOTE(review): "v:an" looks like a typo for "van" (vowelList has "van" and
# no "v:an") - confirm before changing.
vowelMap = {
    "ia": 0.0, "a": 2.0, "ai": 3.0, "uai": 4.0, "iao": 6.0, "ao": 7.0,
    "uan": 10.0, "an": 11.0, "ang": 12.0, "ian": 14.0, "iang": 15.0,
    "uang": 17.0, "ua": 18.0,
    "o": 21.0, "io": 22.0, "ou": 23.0, "uo": 24.0, "ong": 26.0, "iong": 27.0,
    "e": 31.0, "ei": 33.0, "ie": 34.0, "er": 37.0,
    "ve": 40.0, "ue": 40.0, "u:e": 40.0,
    "en": 43.0, "eng": 44.0,
    "uen": 45.0, "ueng": 45.0,
    "u:en": 42.0, "ven": 42.0,
    "i": 50.0, "u:": 51.0, "v": 51.0, "u:n": 53.0, "vn": 53.0,
    "u:an": 55.0, "v:an": 55.0,
    "in": 53.0, "ing": 55.0,
    "u": 60.0, "ui": 63.0, "uei": 63.0, "iu": 64.0, "iou": 64.0, "un": 66.0,
    "": 99999.0, "__v": 99999.0,
}
607 | 
608 | 
609 | # ### Get Pinyin Candidates that are close to an input pinyin
610 | 
611 | # In[644]:
612 | 
613 | 
# Reverse indexes (code -> list of spellings sharing that code). They start
# empty and are filled on first use from consonantMap / vowelMap by
# populateDoubleConsonantsMap and populateDoubleVowelsMap.
doubleConsonantsMap = {}
doubleVowelsMap = {}
616 | 
def getClosePinyinCandids(word, theta=2):
    """Return Pinyin objects that sound close to the first syllable of *word*.

    Two families of variants are produced, each in tones 1-4: the original
    final paired with every nearby initial (skipping the original initial
    and the "__v" placeholder), then the original initial paired with every
    nearby final. *theta* is forwarded to getConsonantCandids and
    getVowelCandids as the code-window width.
    """
    source = Pinyin(to_pinyin(word)[0])
    tones = range(1, 5)
    nearby = []

    for initial in getConsonantCandids(theta, source):
        if initial == source.consonant or initial == '__v':
            continue
        for tone in tones:
            nearby.append(Pinyin(initial + source.vowel + str(tone)))

    for final in getVowelCandids(theta, source):
        for tone in tones:
            if source.consonant is None:
                spelling = final + str(tone)
            else:
                spelling = source.consonant + final + str(tone)
            nearby.append(Pinyin(spelling))
    return nearby
640 |             
641 |     
642 |     
def getConsonantCandids(theta, word_py):
    """List the initials whose consonantMap code is near *word_py*'s initial.

    Scans the integer codes in ``[code - theta, code + theta)`` (the upper
    bound is excluded) and collects every initial stored under them in
    doubleConsonantsMap. Returns an empty list when ``word_py.consonant``
    is None.
    """
    populateDoubleConsonantsMap()
    matches = []
    if word_py.consonant is None:
        return matches

    centre = consonantMap[word_py.consonant]
    for code in range(int(centre - theta), int(centre + theta)):
        bucket = doubleConsonantsMap.get(float(code))
        if bucket is not None:
            matches += bucket
    return matches
657 |     
658 | 
def getVowelCandids(theta, word_py):
    """List the finals whose vowelMap code is near *word_py*'s final.

    Scans the integer codes in ``[code - theta, code + theta)`` (the upper
    bound is excluded) and collects every final stored under them in
    doubleVowelsMap. Raises KeyError if ``word_py.vowel`` is not a key of
    vowelMap.
    """
    populateDoubleVowelsMap()
    centre = vowelMap[word_py.vowel]
    matches = []
    for code in range(int(centre - theta), int(centre + theta)):
        bucket = doubleVowelsMap.get(float(code))
        if bucket is not None:
            matches += bucket
    return matches
670 | 
def populateDoubleConsonantsMap():
    """Fill doubleConsonantsMap (code -> list of initials) from consonantMap.

    Does nothing when the index has already been built.
    """
    # Truthiness instead of `len(...) is not 0`, which tested object identity
    # against an int literal.
    if doubleConsonantsMap:
        return
    for consonant, code in consonantMap.items():
        doubleConsonantsMap.setdefault(code, []).append(consonant)
680 |         
def populateDoubleVowelsMap():
    """Fill doubleVowelsMap (code -> list of finals) from vowelMap.

    Does nothing when the index has already been built.
    """
    # Truthiness instead of `len(...) is not 0`, which tested object identity
    # against an int literal.
    if doubleVowelsMap:
        return
    for vowel, code in vowelMap.items():
        doubleVowelsMap.setdefault(code, []).append(vowel)
690 |         
691 | 
def getCandidates(sentence, mode="simplified", theta=1):
    """Return entries from the pinyin lookup tables that sound like *sentence*.

    For each item of *sentence*, getClosePinyinCandids supplies similar
    syllables. Every combination (one syllable per position) is joined with
    spaces into a key and looked up in pinyin_to_simplified when *mode*
    equals "simplified", otherwise in pinyin_to_traditional. The number of
    combinations is the product of the per-position candidate counts.

    Args:
        sentence: iterable of words/characters to find sound-alikes for.
        mode: "simplified" selects the simplified table; anything else
            selects the traditional one.
        theta: code-window width forwarded to getClosePinyinCandids.

    Returns:
        A list accumulating the table values of every key that was found.
    """
    # Compare by value: `mode is "simplified"` only worked for interned
    # literals and silently chose the traditional table otherwise.
    table = pinyin_to_simplified if mode == "simplified" else pinyin_to_traditional

    words_candidates = [getClosePinyinCandids(word, theta) for word in sentence]

    candidates = []
    for combination in itertools.product(*words_candidates):
        # "None" is removed because a Pinyin with no consonant may format it
        # into its text.
        searchKey = " ".join(
            py.toStringWithTone().replace("None", "") for py in combination
        ).strip()
        if searchKey in table:
            candidates += table[searchKey]
    return candidates
712 | 
713 | 
# Encode as RCode; the amount of generalisation can be increased or reduced
# as needed (by editing consonantMap_RCode / vowelMap_RCode).
def getRCode(sentence):
    """Return the RCode of *sentence*: one code per item, joined by "'".

    Items found in rcode_ignore_set are kept as they are. Every other item
    is converted with to_pinyin, parsed by Pinyin, and encoded as
    consonantMap_RCode[consonant] + vowelMap_RCode[vowel].
    """
    spellings = [
        wd if wd in rcode_ignore_set else to_pinyin(wd)[0]
        for wd in sentence
    ]

    codes = []
    for py in spellings:
        if py in rcode_ignore_set:
            codes.append(py)
            continue
        syllable = Pinyin(py)
        codes.append(
            consonantMap_RCode[syllable.consonant] + vowelMap_RCode[syllable.vowel]
        )

    return "'".join(codes)
732 | 
733 | 
734 | 
735 | 
736 | 
737 | 


--------------------------------------------------------------------------------
/dimsim/__init__.py:
--------------------------------------------------------------------------------
1 | from .core.model import get_distance
2 | from .core.model import get_candidates


--------------------------------------------------------------------------------
/dimsim/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/System-T/DimSim/77261afc51e5d0995b5905b9ed1c45317082d613/dimsim/core/__init__.py


--------------------------------------------------------------------------------
/dimsim/core/model.py:
--------------------------------------------------------------------------------
  1 | from pypinyin import pinyin, lazy_pinyin, Style
  2 | import sys
  3 | import math
  4 | import itertools
  5 | import os
  6 | 
  7 | from dimsim.utils.pinyin import Pinyin, load_pinyin_to_simplified, load_pinyin_to_traditional
  8 | from dimsim.utils.utils import get_edit_distance_close_2d_code, to_pinyin
  9 | from dimsim.utils.maps import vowelMap, consonantMap
 10 | 
 11 | doubleConsonantsMap = {}
 12 | doubleVowelsMap = {}
 13 | 
 14 | pinyin_to_simplified = load_pinyin_to_simplified()
 15 | pinyin_to_traditional = load_pinyin_to_traditional()
 16 | 
 17 | def get_distance(utterance1, utterance2, pinyin=False):
 18 |     '''
 19 |     Calculates the distances between embeddings of two Chinese words.
 20 |     input: 
 21 |         utterance1, utterance2: utf-8 strings for Chinese words or 
 22 |                                 pinyin strings.
 23 |         pinyin : Boolean - indicates if words are in Chinese or Pinyin.
 24 |     output:
 25 |         distance - float.
 26 |     '''
 27 |     assert (len(utterance1) == len(utterance2)),"The two inputs do not have the same length"
 28 | 
 29 |     if not pinyin:
 30 |         u1 = to_pinyin(utterance1)
 31 |         u2 = to_pinyin(utterance2)
 32 |     
 33 |     else:
 34 |         u1 = utterance1
 35 |         u2 = utterance2
 36 |     
 37 |     la = []
 38 |     lb = []
 39 |     for py in u1:
 40 |         la.append(Pinyin(py))
 41 |     for py in u2:
 42 |         lb.append(Pinyin(py))
 43 | 
 44 | 
 45 |     res = 0.0
 46 |     numDiff = 0        
 47 |     tot = len(utterance1)*2.1
 48 |     for i in range (len(utterance1)):
 49 |         apy = la[i]
 50 |         bpy = lb[i]
 51 | 
 52 |         if (apy is None) or (bpy is None):
 53 |             raise Exception("!Empty Pinyin {},{}".format(la, lb))
 54 |         res += get_edit_distance_close_2d_code(apy, bpy)
 55 |         
 56 |         if apy.consonant is not bpy.consonant:
 57 |             numDiff+=1
 58 |         
 59 |         if not(str(apy.vowel) == str(bpy.vowel)):
 60 |             numDiff+=1
 61 |         
 62 |         if apy.tone is not bpy.tone:
 63 |             numDiff+=0.01
 64 |             
 65 |     diffRatio = (numDiff)/tot
 66 |     return res*diffRatio      
 67 |         
 68 | 
 69 | def get_candidates(sentence, mode="simplified", theta=1):
 70 |     '''
 71 |     Gets similar sounding words / candidates based on embeddings.
 72 |     inputs:
 73 |         sentence - utf-8 string with the Chinese words.
 74 |     outputs:
 75 |         candidates - a list containing utf-8 string Chinese words.
 76 |     '''    
 77 |     candidates = []
 78 |     words_candidates = []
 79 |     for word in sentence:
 80 |         candid = _get_close_pinyin_candids(word, theta)
 81 |         words_candidates.append(candid)
 82 |     all_combinations = itertools.product(*words_candidates)
 83 |     counter = 0
 84 |     for combination in all_combinations:
 85 |         counter+=1
 86 |         searchKey = ""
 87 |         for i in combination:
 88 |             searchKey = searchKey + i.toStringWithTone().replace("None","") + " "
 89 |         if mode is "simplified":
 90 |             if searchKey.strip() in pinyin_to_simplified:
 91 |                 candidates+=pinyin_to_simplified[searchKey.strip()]
 92 |         else:
 93 |             if searchKey.strip() in pinyin_to_traditional:
 94 |                 candidates+=pinyin_to_traditional[searchKey.strip()]
 95 |     return candidates
 96 | 
 97 | 
 98 | def _get_close_pinyin_candids(word, theta=2):
 99 |     res = []
100 |     word_pinyin = to_pinyin(word)
101 |     word_py = Pinyin(word_pinyin[0])
102 |     
103 |     cCandids = _get_consonant_candids(theta, word_py)
104 |     for i in range(len(cCandids)):
105 |         if cCandids[i] == word_py.consonant:
106 |             continue
107 |         for j in range(1,5,1):
108 |             newPy = cCandids[i]+word_py.vowel+str(j)
109 |             res.append(Pinyin(newPy))
110 |     
111 |     vCandids = _get_vowel_candids(theta, word_py)
112 |     for i in range(len(vCandids)):
113 |         for j in range(1,5,1):
114 |             if word_py.consonant is None:
115 |                 newPy = vCandids[i]+str(j)
116 |             else:
117 |                 newPy = word_py.consonant+vCandids[i]+str(j)
118 |             res.append(Pinyin(newPy))
119 |     return res
120 |     
def _get_consonant_candids(theta, word_py):
    '''
    Returns the initials whose code in consonantMap lies in the half-open
    window [code - theta, code + theta) around the code of word_py's initial.

    inputs:
        theta - window half-width on the consonantMap code scale.
        word_py - Pinyin object whose .consonant is looked up.
    outputs:
        list of initial strings (may contain word_py's own initial). Empty
        when word_py has no initial (consonant is None).
    '''
    _populate_double_consonants_map()
    res = []
    if word_py.consonant is None:
        return res

    orgCode = consonantMap[word_py.consonant]
    for i in range(int(orgCode-theta), int(orgCode+theta), 1):
        cand = doubleConsonantsMap.get(float(i))
        if cand:
            # "__v" is a placeholder key that shares the sentinel code with
            # the empty initial. It is not a spellable initial, and handing it
            # to Pinyin() makes parsing fail, so it is never a candidate.
            res += [c for c in cand if c != "__v"]
    return res
134 |     
135 | 
def _get_vowel_candids(theta, word_py):
    '''
    Returns the finals whose code in vowelMap lies in the half-open window
    [code - theta, code + theta) around the code of word_py's final.
    The result may include word_py's own final.
    '''
    _populate_double_vowels_map()
    center = vowelMap[word_py.vowel]
    neighbours = []
    for code in range(int(center - theta), int(center + theta)):
        bucket = doubleVowelsMap.get(float(code))
        if bucket is not None:
            neighbours += bucket
    return neighbours
146 | 
def _populate_double_consonants_map():
    '''
    Fills doubleConsonantsMap (code -> list of initials) from consonantMap.
    Does nothing when the map has already been filled.
    '''
    # Truthiness instead of "len(...) is not 0": identity on an int literal is
    # implementation dependent.
    if doubleConsonantsMap:
        return
    for consonant, code in consonantMap.items():
        doubleConsonantsMap.setdefault(code, []).append(consonant)
156 |         
def _populate_double_vowels_map():
    '''
    Fills doubleVowelsMap (code -> list of finals) from vowelMap.
    Does nothing when the map has already been filled.
    '''
    # Truthiness instead of "len(...) is not 0": identity on an int literal is
    # implementation dependent.
    if doubleVowelsMap:
        return
    for vowel, code in vowelMap.items():
        doubleVowelsMap.setdefault(code, []).append(vowel)
166 | 


--------------------------------------------------------------------------------
/dimsim/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/System-T/DimSim/77261afc51e5d0995b5905b9ed1c45317082d613/dimsim/data/__init__.py


--------------------------------------------------------------------------------
/dimsim/data/pinyin_to_simplified.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/System-T/DimSim/77261afc51e5d0995b5905b9ed1c45317082d613/dimsim/data/pinyin_to_simplified.pickle


--------------------------------------------------------------------------------
/dimsim/data/pinyin_to_traditional.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/System-T/DimSim/77261afc51e5d0995b5905b9ed1c45317082d613/dimsim/data/pinyin_to_traditional.pickle


--------------------------------------------------------------------------------
/dimsim/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/System-T/DimSim/77261afc51e5d0995b5905b9ed1c45317082d613/dimsim/tests/__init__.py


--------------------------------------------------------------------------------
/dimsim/tests/test_dimsim.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Standard libs
 4 | import io
 5 | import os
 6 | # Dependencies
 7 | import pytest
 8 | 
 9 | # The module to test
10 | from dimsim.core.model import get_distance, get_candidates
11 | 
def test_distance_near():
    """Words that differ only by tone have a very small distance."""
    dist = get_distance(u'大侠', u'大虾')
    # Tolerant comparison: the value is the product of float operations.
    assert dist == pytest.approx(0.0002380952380952381)
15 | 
def test_distance_far():
    """Words whose second syllable differs in initial and final are far apart."""
    dist = get_distance(u'大侠', u'大人')
    # Tolerant comparison: the value is the product of float operations.
    assert dist == pytest.approx(25.001417183349876)
19 | 
def test_distance_pinyin():
    """Pinyin input (pinyin=True) gives the same distance as the characters."""
    dist = get_distance(['da4', 'xia2'], ['da4', 'xia1'], pinyin=True)
    # Tolerant comparison: the value is the product of float operations.
    assert dist == pytest.approx(0.0002380952380952381)
23 | 
def test_invalid_input():
    """Inputs of different length are rejected with AssertionError."""
    with pytest.raises(AssertionError):
        get_distance(u'大侠', u'大')
26 | 
def test_get_candidates_simplified():
    """Every simplified candidate for the word is one of the known neighbours."""
    allowed = [u'打下', u'大虾', u'大侠']
    found = get_candidates(u'大侠', mode='simplified', theta=1)
    unexpected = [word for word in found if word not in allowed]
    assert not unexpected
31 | 
def test_get_candidates_traditional():
    """Every traditional candidate for the word is one of the known neighbours."""
    allowed = [u'門市', u'分時', u'焚屍', u'粉飾', u'粉絲']
    found = get_candidates(u'粉丝', mode='traditional', theta=1)
    unexpected = [word for word in found if word not in allowed]
    assert not unexpected
36 | 
37 | if __name__ == '__main__':
38 |         pytest.main([__file__])
39 | 


--------------------------------------------------------------------------------
/dimsim/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/System-T/DimSim/77261afc51e5d0995b5905b9ed1c45317082d613/dimsim/utils/__init__.py


--------------------------------------------------------------------------------
/dimsim/utils/maps.py:
--------------------------------------------------------------------------------
# Two-dimensional code for every pinyin initial (consonant).
# get_edit_distance_close_2d_code() in dimsim/utils/utils.py looks up the
# initials of two syllables here and passes the two tuples to
# get_distance_2d_code(), which returns their Euclidean distance.
# The "" key is the code used for a syllable without an initial; its value is
# far away from every real initial.
consonantMap_TwoDCode ={
    "b":(1.0,0.5),
    "p":(1.0,1.5), 

    "g":(7.0,0.5), 
    "k":(7.0,1.5), 
    "h":(7.0,3.0), 
    "f":(7.0,4.0), 

    "d":(12.0,0.5), 
    "t":(12.0,1.5), 

    "n":(22.5,0.5), 
    "l":(22.5,1.5), 
    "r":( 22.5,2.5), 

    
    "zh":(30,1.7), 
    "z":(30,1.5), 
    "j":(30.0,0.5), 

    "ch":(31,1.7), 
    "c":(31,1.5), 
    "q":(31.0,0.5), 

    "sh":(33,3.7),
    "s":(33,3.5),
    "x":(33,2.5),

    
    "m":(50.0,3.5), 

    "y":(40.0,0.0), 
    "w":(40,5.0),
    
    "":(99999.0,99999.0)
}
 38 | 
# Two-dimensional code for every pinyin final (vowel part).
# get_edit_distance_close_2d_code() in dimsim/utils/utils.py looks up the
# finals of two syllables here and passes the two tuples to
# get_distance_2d_code(), which returns their Euclidean distance.
# Several spellings share one code (e.g. "iou"/"iu", "uei"/"ui", "u:"/"v").
# The "" key is the code for an empty final; its value is far away from every
# real final.
vowelMap_TwoDCode = {
    "a":(1.0,0.0),
    "an":(1.0,1.0),
    "ang":(1.0,1.5),

    
    "ia":(0.0,0.0),
    "ian":(0.0,1.0),
    "iang":(0.0,1.5),

    "ua":(2.0,0.0),
    "uan":(2.0,1.0),
    "uang":(2.0,1.5),
    "u:an":(2.0,1.0),

    
    "ao":(5.0,0.0),
    "iao":(5.0,1.5),

    "ai":(8.0,0.0),
    "uai":(8.0,1.5),

    

    "o":(20,0.0),
    "io":(20,2.5),
    "iou":(20,4),
    "iu":(20,4),
    "ou":(20,5.5),
    "uo":(20,6.0),

    "ong":(20,8.0),
    "iong":(20,9.5),

    
    "er":(41,1),
    "e":(41,0.0),

    "u:e":(40,5.0),
    "ve":(40,5.0),
    "ue":(40,5.0),
    "ie":(40,4.5),
    "ei":(40,4.0),
    "uei":(40,3.0),
    "ui":(40,3.0),

    "en":(42,0.5),
    "eng":(42,1.0),

    "uen":(43,0.5),
    "un":(43,0.5),
    "ueng":(43,1.0),

    
    "i":(60,1.0),
    "in":(60,2.5),
    "ing":(60,3.0),

    "u:":(61,1.0),
    "v":(61,1.0),
    "u:n":(61,2.5),
    "vn":(61,2.5),

    "u":(80,0.0),

    "":(99999.0,99999.0)
}
106 | 
# Initials tried in this order by Pinyin.parseConsonant (dimsim/utils/pinyin.py);
# the first prefix match wins, so "zh", "ch", "sh" must stay ahead of "z", "c", "s".
consonantList = ["b", "p", "m", "f", "d", "t", "n", "l", "g", "k","h", "j", "q", "x", "zh", "ch", "sh", "r", "z", "c", "s","y", "w"]

# Finals accepted by Pinyin.parseConsonant for a syllable that has no initial.
vowelList = ["a", "o", "e", "i", "u", "v","u:","er", "ao","ai", "ou","ei", "ia", "iao", "iu", "iou","ie", "ui","uei","ua","uo","uai", "u:e","ve",  "an", "en", "in", "un","uen", "vn","u:n","ian","uan", "u:an","van", "ang", "eng", "ing", "ong","iang","iong","uang","ueng"]
110 | 
# Hand-picked pairs of toneless syllables, consulted by
# get_sim_dis_from_hardcod_map() in dimsim/utils/utils.py.
# NOTE(review): only "hua"/"fa" and "huan"/"fan" are listed in both
# directions; the remaining pairs appear one way only -- confirm this is intended.
hardcodeMap = {
    "hua":"fa",
    "fa":"hua",
    "huan":"fan",
    "fan":"huan",
    "hui":"fei",
    "jie":"zhe",
    "kou":"ke",
    "gou":"ge",
    "zhong":"zen",
    "san":"shang"
}
123 | 
# One-dimensional code for every pinyin initial.
# _get_consonant_candids() in dimsim/core/model.py collects the initials whose
# code falls inside a window (sized by theta) around the code of a given initial.
# "n" and "l" share a code. "" (no initial) and the placeholder "__v" share
# the sentinel code 99999.0, far away from every real initial.
consonantMap = {
    "b":1.0,
    "p":2.0,
    
    "m":11.0,
    "f":12.0,
    
    "d":21.0,
    "t":22.0,
    
    "n":31.0,
    "l":31.0,
    "r":32.0,
    
    "g":41.0,
    "k":42.0,
    "h":43.0,
    
    "j":46.0,
    "q":47.0,
    "x":48.0,
    
    "z":61.0,
    "c":62.0,
    
    "zh":71.0,
    "ch":72.0,
    
    "sh":81.0,
    "s":82.0,
    
    "y":90.0,
    "w":100.0,
    
    "":99999.0,
    "__v":99999.0
}
161 | 
# One-dimensional code for every pinyin final.
# _get_vowel_candids() in dimsim/core/model.py collects the finals whose code
# falls inside a window (sized by theta) around the code of a given final.
# Alternative spellings of one final share a code (e.g. "ve"/"ue"/"u:e").
# "" and the placeholder "__v" share the sentinel code 99999.0.
# NOTE(review): "v:an" looks like a typo for "van" (vowelList contains "van",
# and the other entries pair "u:X" with "vX") -- confirm before changing.
vowelMap = {
    "ia":0.0,
    "a":2.0,
    "ai":3.0,
    "uai":4.0,
    "iao":6.0,
    "ao":7.0,
    
    "uan":10.0,
    "an":11.0,
    "ang":12.0,
    "ian":14.0,
    "iang":15.0,
    "uang":17.0,
    "ua":18.0,
    
    "o":21.0,
    "io":22.0,
    "ou":23.0,
    "uo":24.0,
    "ong":26.0,
    "iong":27.0,
    
    "e":31.0,
    "ei":33.0,
    "ie":34.0,
    "er":37.0,
    
    "ve":40.0,
    "ue":40.0,
    "u:e":40.0,
    
    "en":43.0,
    "eng":44.0,
    
    "uen":45.0,
    "ueng":45.0,
    
    "u:en":42.0,
    "ven":42.0,
    
    "i":50.0,
    "u:":51.0,
    "v":51.0,
    "u:n":53.0,
    "vn":53.0,
    "u:an":55.0,
    "v:an":55.0,
    
    "in":53.0,
    "ing":55.0,
    
    "u":60.0,
    "ui":63.0,
    "uei":63.0,
    "iu":64.0,
    "iou":64.0,
    "un":66.0,
    
    "":99999.0,
    "__v":99999.0
}
224 | 


--------------------------------------------------------------------------------
/dimsim/utils/pinyin.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | 
 4 | from pypinyin import pinyin, lazy_pinyin, Style
 5 | from dimsim.utils import maps
 6 | 
 7 | class Pinyin:
 8 |     consonantList = maps.consonantList
 9 |     vowelList = maps.vowelList
10 |     
11 |     def __init__(self, pinyinstr):
12 |         self.tone = int(pinyinstr[-1])
13 |         self.locp = pinyinstr[0:-1].lower()
14 |         self.consonant, self.vowel = self.parseConsonant(self.locp)
15 |         self.pinyinRewrite()
16 |     
17 |     def parseConsonant(self, pinyin):
18 |         for consonant in self.consonantList:
19 |             if pinyin.startswith(consonant):
20 |                 return (consonant, pinyin[len(consonant):])
21 |         # it's a vowel without consonant
22 |         if pinyin in self.vowelList:
23 |             return None, pinyin.lower()
24 |         
25 |         print("Invalid Pinyin, please check!")
26 |         return None, None
27 |         
28 |     def toStringNoTone(self):
29 |         return "{}{}".format(self.consonant, self.vowel)
30 |     
31 |     def toStringWithTone(self):
32 |         return "{}{}{}".format(self.consonant, self.vowel, self.tone)
33 |     
34 |     def toString(self):
35 |         return "{}{}{}".format(self.consonant, self.vowel, self.tone)
36 |         
37 |     def pinyinRewrite(self):
38 |         import re
39 |         yVowels = {"u","ue","uan","un","u:","u:e","u:an","u:n"}
40 |         tconsonant = {"j","g","x"}
41 |         if 'v' in self.vowel:
42 |             self.vowel = self.vowel.replace("v", "u:")
43 |             
44 |         if self.consonant is None or self.consonant is "":
45 |             self.consonant = ""
46 |             return
47 |         if self.consonant is "y":
48 |             if self.vowel in yVowels:
49 |                 if "u:" not in self.vowel:
50 |                     self.vowel = self.vowel.replace("u","u:")
51 |             else:
52 |                 self.vowel="i"+self.vowel
53 |                 regex = re.compile("i+")
54 |                 self.vowel = self.vowel.replace("iii","i")
55 |                 self.vowel = self.vowel.replace("ii","i")
56 |             self.consonant=""
57 |         
58 |         if self.consonant is "w":
59 |             self.vowel="u"+self.vowel
60 |             self.vowel=self.vowel.replace("uuu","u")
61 |             self.vowel=self.vowel.replace("uu","u")
62 |             self.consonant = ""
63 |         
64 |         if (self.consonant in tconsonant) and (self.vowel is "u") or (self.vowel is "v"):
65 |             self.vowel="u:"
66 |         
67 |         if self.vowel is "iou":
68 |             self.vowel = "iu"
69 |         
70 |         if self.vowel is "uei":
71 |             self.vowel = "ui"
72 |         
73 |         if self.vowel is "uen":
74 |             self.vowel = "un"
75 | 
def load_pinyin_to_simplified():
    """Load and return the object pickled in data/pinyin_to_simplified.pickle.

    The path is resolved relative to this module: one directory up, then
    "data". The pickle is package data; never point this at untrusted files.
    """
    curr_dir, _ = os.path.split(__file__)
    root_dir, _ = os.path.split(curr_dir)
    DATA_PATH = os.path.join(root_dir, "data", "pinyin_to_simplified.pickle")
    # "with" closes the file even when unpickling raises.
    with open(DATA_PATH, 'rb') as sfile:
        return pickle.load(sfile)
85 | 
86 | 
def load_pinyin_to_traditional():
    """Load and return the object pickled in data/pinyin_to_traditional.pickle.

    The path is resolved relative to this module: one directory up, then
    "data". The pickle is package data; never point this at untrusted files.
    """
    curr_dir, _ = os.path.split(__file__)
    root_dir, _ = os.path.split(curr_dir)
    DATA_PATH = os.path.join(root_dir, "data", "pinyin_to_traditional.pickle")
    # "with" closes the file even when unpickling raises.
    with open(DATA_PATH, 'rb') as tfile:
        return pickle.load(tfile)
96 |     


--------------------------------------------------------------------------------
/dimsim/utils/utils.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import math
 3 | from pypinyin import pinyin, lazy_pinyin, Style
 4 | 
 5 | from dimsim.utils.maps import consonantMap_TwoDCode, vowelMap_TwoDCode, hardcodeMap
 6 | 
 7 | def to_pinyin(utterance):
 8 |     length = len(utterance)
 9 |     translated = []
10 |     pinyin_encodings = pinyin(utterance, style=Style.TONE2)
11 |     for i in range(length):
12 |         currPinyin = pinyin_encodings[i][0]
13 |         translated.append(put_tone_to_end(currPinyin))
14 |     return translated
15 | 
def put_tone_to_end(input_pinyin):
    """Move the tone digit of a pinyin syllable to the end.

    "zho1ng" -> "zhong1". A one-character input gets tone "1" appended; an
    input without a tone digit 1-4 after its first character gets "5".
    """
    # Integer comparisons use ==; "is" on int literals is implementation
    # dependent and triggers a SyntaxWarning on recent Python versions.
    if len(input_pinyin) == 1:
        return input_pinyin + '1'

    tone_index = 0
    for index, character in enumerate(input_pinyin):
        if character in ("1", "2", "3", "4"):
            tone_index = index
            break

    # Index 0 doubles as "not found": a digit in first position is treated
    # like a missing tone.
    if tone_index == 0:
        return input_pinyin + "5"

    tone = input_pinyin[tone_index]
    return input_pinyin[:tone_index] + input_pinyin[tone_index + 1:] + tone
29 | 
30 | def get_edit_distance_close_2d_code(a, b):
31 |     res = 0
32 |     try:
33 |         if (a is None) or (b is None):
34 |             print("Error:pinyin({},{})".format(a.toString(),b.toString()))
35 |             return res
36 |         
37 |         twoDcode_consonant_a = consonantMap_TwoDCode[a.consonant]
38 |         twoDcode_consonant_b = consonantMap_TwoDCode[b.consonant]
39 |         
40 |         cDis = abs(get_distance_2d_code(twoDcode_consonant_a, twoDcode_consonant_b))
41 |         
42 |         twoDcode_vowel_a = vowelMap_TwoDCode[a.vowel]
43 |         twoDcode_vowel_b = vowelMap_TwoDCode[b.vowel]
44 |         
45 |         vDis = abs(get_distance_2d_code(twoDcode_vowel_a, twoDcode_vowel_b))
46 | 
47 |         hcDis = get_sim_dis_from_hardcod_map(a,b)
48 |         
49 |         res = min((cDis+vDis),hcDis) + 1.0*abs(a.tone-b.tone)/10
50 |         
51 |     except:
52 |         raise Exception("Error pinyin {}{}".format(a.toString(), b.toString()))
53 |     return res
54 | 
def get_sim_dis_from_hardcod_map(a, b):
    """Return 2.0 when a and b form a hard-coded similar pair, else float max.

    The toneless spellings of a and b are looked up in hardcodeMap in both
    directions. sys.float_info.max means "no hard-coded distance" and is also
    returned when either argument cannot provide toStringNoTone().
    """
    try:
        a_key = a.toStringNoTone()
        b_key = b.toStringNoTone()
    except AttributeError:
        return sys.float_info.max

    # Compare by value: toStringNoTone() builds a new string on every call,
    # so an identity test ("is") could never match. .get() keeps a missing
    # key for a from skipping the reverse lookup.
    if hardcodeMap.get(a_key) == b_key or hardcodeMap.get(b_key) == a_key:
        return 2.0
    return sys.float_info.max
68 | 
def get_distance_2d_code(X, Y):
    """Return the Euclidean distance between the 2-tuples X and Y."""
    (x_first, x_second), (y_first, y_second) = X, Y
    gap_first = abs(x_first - y_first)
    gap_second = abs(x_second - y_second)
    return math.sqrt(gap_first**2 + gap_second**2)


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setup(name='dimsim',
 7 | 	version='0.2.2',
 8 | 	description='Python implementation of the Chinese soundex project DimSim',
 9 | 	long_description=long_description,
10 |     long_description_content_type="text/markdown",
11 | 	author='IBM SystemT, IBM CODAIT',
12 | 	author_email='qian.kun@ibm.com, karthik.muthuraman@ibm.com, ihjhuo@ibm.com, frreiss@us.ibm.com',
13 | 	url='https://github.com/System-T/DimSim',
14 | 	packages=['dimsim', 'dimsim.core', 'dimsim.utils', 'dimsim.data'],
15 | 	package_data={'':['dimsim/data/pinyin_to_simplified.pickle','dimsim/data/pinyin_to_traditional.pickle']},
16 | 	include_package_data=True,
17 | 	classifiers=['License :: OSI Approved :: Apache Software License'],
18 | 	install_requires=[
19 |         'pypinyin',
20 |         ],
21 | 	test_suite='nose.collector',
22 |     tests_require=['nose']
23 | )
24 | 


--------------------------------------------------------------------------------