├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── Requirements.txt ├── VALUATION.md ├── assets ├── 1.png ├── 2.png ├── 3.gif ├── 4.png ├── 5.png ├── 6.png ├── 64531083-3199aa80-d341-11e9-86cd-3a3ed860b14b.png ├── pr_curve_eulidean0.8+unigram0.2.png ├── pr_hresholds_eulidean0.8+unigram0.2+POS.png ├── roc_curve_eulidean0.8+unigram0.2+POS.png ├── screenshot_20231124180125.png ├── sentence_precision.jpg └── syn_order_post.jpg ├── benchmark.py ├── demo.py ├── scripts ├── package.sh ├── pypi.sh └── test.sh ├── setup.cfg ├── setup.py └── synonyms ├── __init__.py ├── data ├── stopwords.txt └── vocab.txt ├── synonyms.py ├── utils.py └── word2vec.py /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # description 2 | 3 | ## current 4 | 5 | ## expected 6 | 7 | # solution 8 | 9 | # environment 10 | 11 | * version: 12 | The commit hash (`git rev-parse HEAD`) 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | ## Motivation and Context 7 | 8 | 9 | 10 | ## How Has This Been Tested? 11 | 12 | 13 | 14 | 15 | ## Screenshots (if appropriate): 16 | 17 | ## Types of changes 18 | 19 | - [ ] Bug fix (non-breaking change which fixes an issue) 20 | - [ ] New feature (non-breaking change which adds functionality) 21 | - [ ] Breaking change (fix or feature that would cause existing functionality to change) 22 | 23 | ## Checklist: 24 | 25 | 26 | - [ ] My code follows the code style of this project. 27 | - [ ] My change requires a change to the documentation. 28 | - [ ] I have updated the documentation accordingly. 29 | - [ ] I have added tests to cover my changes. 30 | - [ ] All new and existing tests passed. 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.swo 3 | *.sublime-* 4 | *.pyc 5 | __pycache__ 6 | tmp/ 7 | node_modules/ 8 | sftp-config.json 9 | .DS_Store 10 | dist/ 11 | synonyms.egg-info 12 | .vscode/ 13 | build/ 14 | .env 15 | synonyms/data/words.vector* 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | python: 4 | #- 2.7 5 | - 3.6 6 | #- nightly 7 | #- pypy 8 | #- pypy3 9 | matrix: 10 | allow_failures: 11 | - python: nightly 12 | - python: pypy 13 | - python: pypy3 14 | install: 15 | #- pip install -r requirements.txt 16 | - pip install flake8 # pytest # add another testing frameworks later 17 | before_script: 18 | # stop the build if there are Python syntax errors or undefined names 19 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 20 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 21 | - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 22 | script: 23 | - true # pytest --capture=sys # add other tests here 24 | notifications: 25 | on_success: change 26 | on_failure: change # `always` will be the setting once code changes slow down 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 3.23 2 | 3 | - Use chatoperastore to download model file 4 | 5 | # 3.16 6 | 7 | - Use github vector pkg download link 8 | 9 | # 3.15 10 | 11 | - Fix jieba exports 冲突,改为只暴露 keywords, seg 接口 12 | - 修正 vocab.txt 里的错误 13 | 14 | # 3.13 15 | 16 | - 减少依赖 17 | - export jieba as synonyms.jieba 18 | 19 | # 3.12 20 | 21 | - 使用更大词向量,42W+ 词汇表 22 | - 优化下载速度 23 | 24 | # 3.11 25 | 26 | - 支持定义查询词汇数量,默认 10 个词 27 | 28 | # 3.10 29 | 30 | - 计算编辑距离时去停用词 31 | 32 | # 3.9 33 | 34 | - fix bug 35 | 36 | # 3.8 37 | 38 | - 获得一个分词后句子的向量,向量以 BoW 方式组成 39 | 40 | ``` 41 | sentence: 句子是分词后通过空格联合起来 42 | ignore: 是否忽略OOV,False时,随机生成一个向量 43 | ``` 44 | 45 | # 3.7 46 | 47 | - change import path of utils in word2vec.py to local path 48 | - expose vector fn 49 | 50 | # 3.6 51 | 52 | - Fix Bug: compare 保证交换两个句子后分数一致 [#60](https://github.com/huyingxi/Synonyms/issues/60) 53 | 54 | # 3.5 55 | 56 | - 根据实际情况,降低向量距离对近似度分数的影响 57 | 58 | # 3.3 59 | 60 | - 增加分词接口 61 | - 优化分词器初始化加载字典 62 | - 使用 jieba 分词源码 63 | - 使用 glog 作为日志输出模块 64 | 65 | # 3.2 66 | 67 | - 将发布证书改为 MIT 68 | 69 | # 3.1 70 | 71 | - 对空间临近词的邻居进行缓存,提高返回速度 72 | - nearby 中处理 OOV,返回 ([], []) 73 | 74 | # 3.0 - 更简单的定制和配置,增加了额外的开销 75 | 76 | - 去掉 nearby words, 使用 kdtree 检索空间词汇的最近临 77 | - 增加了对 sk-learn 的依赖,但是减少了对词向量的预处理 78 | - 优化了分词所使用的字典,也可以使用环境变量声明主字典 79 | - 支持自定义 word2vec 模型,使用环境变量声明 80 | 81 | # 2.5 82 | 83 | - 使用空间距离近的词汇优化编辑距离计算 84 | 85 | # 2.3 86 | 87 | - 计算相似度时增加平滑策略 88 | 89 | # v1.6 90 | 91 | - use `jieba` instead of `thulac` as tokeninzer. 92 | - refine console log for Jupyter notebook. 93 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at hain_wang@foxmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2023 Beijing Huaxia Chunsong Technology Co., Ltd. 2 | 3 | Licensed under the Chunsong Public License, Version 1.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | https://docs.cskefu.com/licenses/v1.html 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI](https://img.shields.io/pypi/v/synonyms.svg)](https://pypi.python.org/pypi/synonyms) [![PyPI download month](https://img.shields.io/pypi/dm/synonyms.svg)](https://pypi.python.org/pypi/synonyms/) [![](https://img.shields.io/pypi/pyversions/synonyms.svg)](https://pypi.org/pypi/synonyms/) [![License](https://cdndownload2.chatopera.com/cskefu/licenses/chunsong1.0.svg)](https://www.cskefu.com/licenses/v1.html "开源许可协议") [![](https://img.shields.io/pypi/format/synonyms.svg)](https://pypi.org/pypi/synonyms/) 2 | 3 | # Synonyms 4 | 5 | Chinese Synonyms for Natural Language Processing and Understanding. 6 | 7 | 更好的中文近义词:聊天机器人、智能问答工具包。 8 | 9 | `synonyms`可以用于自然语言理解的很多任务:[RAG (Retrieval-Augmented Generation)](https://github.com/chatopera/Synonyms/issues/146),推荐算法,相似度计算,语义偏移,关键字提取,智能问答,自动摘要,搜索引擎等。 10 | 11 | 为提供稳定、可靠、长期优化的服务,Synonyms 改为使用 [春松许可证, v1.0](https://www.cskefu.com/licenses/v1.html) 并针对机器学习模型的下载进行收费,详见[证书商店](https://store.chatopera.com/product/syns001)。之前的贡献者(突出贡献的代码贡献者),可与我们联系,讨论收费问题。-- [Chatopera Inc.](https://www.chatopera.com) @ Oct. 2023 12 | 13 | # Table of Content: 14 | 15 | - [Install](https://github.com/chatopera/Synonyms#welcome) 16 | - [Usage](https://github.com/chatopera/Synonyms#usage) 17 | - [Quick Get Start](https://github.com/chatopera/Synonyms#quick-get-start) 18 | - [Valuation](https://github.com/chatopera/Synonyms#valuation) 19 | - [Benchmark](https://github.com/chatopera/Synonyms#benchmark) 20 | - [Statement](https://github.com/chatopera/Synonyms#statement) 21 | - [References](https://github.com/chatopera/Synonyms#references) 22 | - [Frequently Asked Questions](https://github.com/chatopera/Synonyms#frequently-asked-questions-faq) 23 | - [License](https://github.com/chatopera/Synonyms#license) 24 | 25 | # Welcome 26 | 27 | Follow steps below to install and activate packages. 28 | 29 | ## 1/3 Install Sourcecodes Package 30 | 31 | ```bash 32 | pip install -U synonyms 33 | ``` 34 | 35 | 当前稳定版本 v3.x。 36 | 37 | ## 2/3 Config license id 38 | 39 | Synonyms's machine learning model package(s) requires a License from [Chatopera License Store](https://store.chatopera.com/product/syns001), first purchase a License and get the `license id` from **Licenses** page on Chatopera License Store(`license id`:在证书商店,证书详情页,点击【复制证书标识】). 40 | 41 | ![image](./assets/syn_order_post.jpg) 42 | 43 | Secondly, set environment variable in your terminal or shell scripts as below. 44 | 45 | * For Shell Users 46 | 47 | e.g. Shell, CMD Scripts on Linux, Windows, macOS. 48 | 49 | ```bash 50 | # Linux / macOS 51 | export SYNONYMS_DL_LICENSE=YOUR_LICENSE 52 | ## e.g. if your license id is `FOOBAR`, run `export SYNONYMS_DL_LICENSE=FOOBAR` 53 | 54 | # Windows 55 | ## 1/2 Command Prompt 56 | set SYNONYMS_DL_LICENSE=YOUR_LICENSE 57 | ## 2/2 PowerShell 58 | $env:SYNONYMS_DL_LICENSE='YOUR_LICENSE' 59 | ``` 60 | 61 | * For Python Code Users 62 | 63 | Jupyter Notebook, etc. 
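Set the variable before `import synonyms` runs — the loader reads `SYNONYMS_DL_LICENSE` from the environment and triggers the one-time model download the first time the package is imported (see the loader in `synonyms/synonyms.py`). A minimal first-run sketch, which combines this step with the model download of step 3/3 below (assuming `YOUR_LICENSE` is replaced with a real license id from the License Store):

```python
import os

# The license id must be in the environment before synonyms is imported:
# the loader reads SYNONYMS_DL_LICENSE and downloads the word-vector
# package on first import if it is not present yet.
os.environ["SYNONYMS_DL_LICENSE"] = "YOUR_LICENSE"  # placeholder

import synonyms  # the one-time model download happens here

# sanity check once the word vectors are loaded
print(synonyms.nearby("人脸"))
```

The snippet below sets the variable from within Python and prints it back as a quick verification: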
64 | 65 | ```python 66 | import os 67 | os.environ["SYNONYMS_DL_LICENSE"] = "YOUR_LICENSE" 68 | _licenseid = os.environ.get("SYNONYMS_DL_LICENSE", None) 69 | print("SYNONYMS_DL_LICENSE=", _licenseid) 70 | ``` 71 | 72 | ![](./assets/screenshot_20231124180125.png) 73 | 74 | **提示:安装后初次使用会下载词向量文件,下载速度取决于网络情况。** 75 | 76 | ## 3/3 Download Model Package 77 | 78 | Last, download the model package by command or script - 79 | 80 | ```bash 81 | python -c "import synonyms; synonyms.display('能量')" # download word vectors file 82 | ``` 83 | 84 | ![](./assets/3.gif) 85 | 86 | ## Usage 87 | 88 | 支持使用环境变量配置分词词表和 word2vec 词向量文件。 89 | 90 | | 环境变量 | 描述 | 91 | | ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 92 | | _SYNONYMS_WORD2VEC_BIN_MODEL_ZH_CN_ | 使用 word2vec 训练的词向量文件,二进制格式。 | 93 | | _SYNONYMS_WORDSEG_DICT_ | 中文分词[**主字典**](https://github.com/fxsjy/jieba#%E5%BB%B6%E8%BF%9F%E5%8A%A0%E8%BD%BD%E6%9C%BA%E5%88%B6),格式和使用[参考](https://github.com/fxsjy/jieba#%E8%BD%BD%E5%85%A5%E8%AF%8D%E5%85%B8) | 94 | | _SYNONYMS_DEBUG_ | ["TRUE"\|"FALSE"], 是否输出调试日志,设置为 “TRUE” 输出,默认为 “FALSE” | 95 | 96 | ### 熟悉接口 97 | 98 | ```bash 99 | $ pip install -r Requirements.txt 100 | $ python demo.py 101 | ``` 102 | 103 | ### 实现 RAG (Retrieval-Augmented Generation)服务 104 | 105 | 查看示例程序, [hailiang-wang/llm-get-started](https://github.com/hailiang-wang/llm-get-started/tree/master/003_rag_langchain) 106 | 107 | 108 | ## APIs 109 | 110 | ### synonyms.nearby(word [, size = 10]) 111 | 112 | ```python 113 | import synonyms 114 | print("人脸: ", synonyms.nearby("人脸")) 115 | print("识别: ", synonyms.nearby("识别")) 116 | print("NOT_EXIST: ", synonyms.nearby("NOT_EXIST")) 117 | ``` 118 | 119 | `synonyms.nearby(WORD [,SIZE])`返回一个元组,元组中包含两项:`([nearby_words], [nearby_words_score])`,`nearby_words`是 WORD 的近义词们,也以 list 的方式存储,并且按照距离的长度由近及远排列,`nearby_words_score`是`nearby_words`中**对应位置**的词的距离的分数,分数在(0-1)区间内,越接近于 1,代表越相近;`SIZE` 是返回词汇数量,默认 10。比如: 120 | 121 | ```python 122 | synonyms.nearby(人脸, 10) = ( 123 | ["图片", "图像", "通过观察", "数字图像", "几何图形", "脸部", "图象", "放大镜", "面孔", "Mii"], 124 | [0.597284, 0.580373, 0.568486, 0.535674, 0.531835, 0.530 125 | 095, 0.525344, 0.524009, 0.523101, 0.516046]) 126 | ``` 127 | 128 | 在 OOV 的情况下,返回 `([], [])`,目前的字典大小: 435,729。 129 | 130 | ### synonyms.compare(sen1, sen2 [, seg=True]) 131 | 132 | 两个句子的相似度比较 133 | 134 | ```python 135 | sen1 = "发生历史性变革" 136 | sen2 = "发生历史性变革" 137 | r = synonyms.compare(sen1, sen2, seg=True) 138 | ``` 139 | 140 | 其中,参数 seg 表示 synonyms.compare 是否对 sen1 和 sen2 进行分词,默认为 True。返回值:[0-1],并且越接近于 1 代表两个句子越相似。 141 | 142 | ```python 143 | 旗帜引领方向 vs 道路决定命运: 0.429 144 | 旗帜引领方向 vs 旗帜指引道路: 0.93 145 | 发生历史性变革 vs 发生历史性变革: 1.0 146 | ``` 147 | 148 | ### synonyms.display(word [, size = 10]) 149 | 150 | 以友好的方式打印近义词,方便调试,`display(WORD [, SIZE])`调用了 `synonyms#nearby` 方法。 151 | 152 | ```python 153 | >>> synonyms.display("飞机") 154 | '飞机'近义词: 155 | 1. 飞机:1.0 156 | 2. 直升机:0.8423391 157 | 3. 客机:0.8393003 158 | 4. 滑翔机:0.7872388 159 | 5. 军用飞机:0.7832081 160 | 6. 水上飞机:0.77857226 161 | 7. 运输机:0.7724742 162 | 8. 航机:0.7664748 163 | 9. 航空器:0.76592904 164 | 10. 
民航机:0.74209654 165 | ``` 166 | 167 | `SIZE` 是打印词汇表的数量,默认 10。 168 | 169 | ### synonyms.describe() 170 | 171 | 打印当前包的描述信息: 172 | 173 | ``` 174 | >>> synonyms.describe() 175 | Vocab size in vector model: 435729 176 | model_path: /Users/hain/chatopera/Synonyms/synonyms/data/words.vector.gz 177 | version: 3.18.0 178 | {'vocab_size': 435729, 'version': '3.18.0', 'model_path': '/chatopera/Synonyms/synonyms/data/words.vector.gz'} 179 | ``` 180 | 181 | ### synonyms.v(word) 182 | 183 | 获得一个词语的向量,该向量为 numpy 的 array,当该词语是未登录词时,抛出 KeyError 异常。 184 | 185 | ```python 186 | >>> synonyms.v("飞机") 187 | array([-2.412167 , 2.2628384 , -7.0214124 , 3.9381874 , 0.8219283 , 188 | -3.2809453 , 3.8747153 , -5.217062 , -2.2786229 , -1.2572327 ], 189 | dtype=float32) 190 | ``` 191 | 192 | ### synonyms.sv(sentence, ignore=False) 193 | 194 | 获得一个分词后句子的向量,向量以 array[array[]] 方式组成,即获取 sentence 中每个词的向量 array[] 放在一个 array 中 195 | 196 | ```python 197 | sentence: 句子是分词后通过空格联合起来 198 | ignore: 是否忽略OOV,False时,随机生成一个向量 199 | ``` 200 | 201 | ### synonyms.bow(sentence, ignore=False) 202 | 203 | 获得一个分词后句子的向量,向量以 BoW 方式组成 204 | 205 | ```python 206 | sentence: 句子是分词后通过空格联合起来 207 | ignore: 是否忽略OOV,False时,随机生成一个向量 208 | ``` 209 | 210 | ### synonyms.seg(sentence) 211 | 212 | 中文分词 213 | 214 | ```python 215 | synonyms.seg("中文近义词工具包") 216 | ``` 217 | 218 | 分词结果,由两个 list 组成的元组,分别是单词和对应的词性。 219 | 220 | ```python 221 | (['中文', '近义词', '工具包'], ['nz', 'n', 'n']) 222 | ``` 223 | 224 | **该分词不去停用词和标点。** 225 | 226 | ### synonyms.keywords(sentence [, topK=5, withWeight=False]) 227 | 228 | 提取关键词,默认按照重要程度提取关键词。 229 | 230 | ``` 231 | keywords = synonyms.keywords("9月15日以来,台积电、高通、三星等华为的重要合作伙伴,只要没有美国的相关许可证,都无法供应芯片给华为,而中芯国际等国产芯片企业,也因采用美国技术,而无法供货给华为。目前华为部分型号的手机产品出现货少的现象,若该形势持续下去,华为手机业务将遭受重创。") 232 | ``` 233 | 234 | ## Contribution 235 | 236 | Get more logs for debugging, set environment variable. 237 | 238 | ``` 239 | SYNONYMS_DEBUG=TRUE 240 | ``` 241 | 242 | ## PCA 243 | 244 | 以“人脸”为例主要成分分析: 245 | 246 | ![](assets/1.png) 247 | 248 | 249 | ## Change logs 250 | 251 | 更新情况[说明](./CHANGELOG.md)。 252 | 253 | ## Voice of Users 254 | 255 | 用户怎么说: 256 | 257 | 258 | 259 | ## Data 260 | 261 | data is built based on [wikidata-corpus](https://github.com/Samurais/wikidata-corpus). 262 | 263 | ## Valuation 264 | 265 | ### 同义词词林 266 | 267 | 《同义词词林》是梅家驹等人于 1983 年编纂而成,现在使用广泛的是哈工大社会计算与信息检索研究中心维护的《同义词词林扩展版》,它精细的将中文词汇划分成大类和小类,梳理了词汇间的关系,同义词词林扩展版包含词语 7 万余条,其中 3 万余条被以开放数据形式共享。 268 | 269 | ### 知网, HowNet 270 | 271 | HowNet,也被称为知网,它并不只是一个语义字典,而是一个知识系统,词汇之间的关系是其一个基本使用场景。知网包含词语 8 余条。 272 | 273 | 国际上对词语相似度算法的评价标准普遍采用 Miller&Charles 发布的英语词对集的人工判定值。该词对集由十对高度相关、十对中度相关、十对低度相关共 30 个英语词对组成,然后让 38 个受试者对这 30 对进行语义相关度判断,最后取他们的平均值作为人工判定标准。然后不同近义词工具也对这些词汇进行相似度评分,与人工判定标准做比较,比如使用皮尔森相关系数。在中文领域,使用这个词表的翻译版进行中文近义词比较也是常用的办法。 274 | 275 | ### 对比 276 | 277 | Synonyms 的词表容量是 435,729,下面选择一些在同义词词林、知网和 Synonyms 都存在的几个词,给出其近似度的对比: 278 | 279 | ![](./assets/5.png) 280 | 281 | 注:同义词林及知网数据、分数[来源](https://github.com/yaleimeng/Final_word_Similarity)。Synonyms 也在不断优化中,新的分数可能和上图不一致。 282 | 283 | 更多[比对结果](./VALUATION.md)。 284 | 285 | ## Used by 286 | 287 | [Github 关联用户列表](https://github.com/chatopera/Synonyms/network/dependents?package_id=UGFja2FnZS01MjY2NDc1Nw%3D%3D) 288 | 289 | ![](./assets/6.png) 290 | 291 | ## Benchmark 292 | 293 | Test with py3, MacBook Pro. 
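`benchmark.py`, included in this repository, prints basic system information and then times `synonyms.nearby` with `timeit`; run it with the command below. Note that the script still calls `platform.linux_distribution()`, which was removed in Python 3.8, so on newer interpreters that line may need to be adjusted or removed.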
294 | 295 | ``` 296 | python benchmark.py 297 | ``` 298 | 299 | ++++++++++ OS Name and version ++++++++++ 300 | 301 | Platform: Darwin 302 | 303 | Kernel: 16.7.0 304 | 305 | Architecture: ('64bit', '') 306 | 307 | ++++++++++ CPU Cores ++++++++++ 308 | 309 | Cores: 4 310 | 311 | CPU Load: 60 312 | 313 | ++++++++++ System Memory ++++++++++ 314 | 315 | meminfo 8GB 316 | 317 | `synonyms#nearby: 100000 loops, best of 3 epochs: 0.209 usec per loop` 318 | 319 | ## Live Sharing 320 | 321 | [52nlp.cn](http://www.52nlp.cn/synonyms-%E4%B8%AD%E6%96%87%E8%BF%91%E4%B9%89%E8%AF%8D%E5%B7%A5%E5%85%B7%E5%8C%85) 322 | 323 | [机器之心](https://www.jiqizhixin.com/articles/2018-01-14-3) 324 | 325 | [线上分享实录: Synonyms 中文近义词工具包 @ 2018-02-07](http://gitbook.cn/gitchat/activity/5a563545a8b23d387720ccd5) 326 | 327 | ## Statement 328 | 329 | [Synonyms](https://github.com/chatopera/Synonyms)发布证书 MIT。数据和程序可用于研究和商业产品,必须注明引用和地址,比如发布的任何媒体、期刊、杂志或博客等内容。 330 | 331 | ``` 332 | @online{Synonyms:hain2017, 333 | author = {Hai Liang Wang, Hu Ying Xi}, 334 | title = {中文近义词工具包Synonyms}, 335 | year = 2017, 336 | url = {https://github.com/chatopera/Synonyms}, 337 | urldate = {2017-09-27} 338 | } 339 | ``` 340 | 341 | # References 342 | 343 | [wikidata-corpus](https://github.com/Samurais/wikidata-corpus) 344 | 345 | [word2vec 原理推导与代码分析](http://www.hankcs.com/nlp/word2vec.html) 346 | 347 | # Frequently Asked Questions (FAQ) 348 | 349 | 1. 是否支持添加单词到词表中? 350 | 351 | 不支持,欲了解更多请看 [#5](https://github.com/chatopera/Synonyms/issues/5) 352 | 353 | 2. 词向量的训练是用哪个工具? 354 | 355 | Google 发布的[word2vec](https://code.google.com/archive/p/word2vec/),该库由 C 语言编写,内存使用效率高,训练速度快。gensim 可以加载 word2vec 输出的模型文件。 356 | 357 | 3. 相似度计算的方法是什么? 358 | 359 | [详见 #64](https://github.com/chatopera/Synonyms/issues/64) 360 | 361 | 4. [#118 词向量文件一直下载不下来?](https://github.com/chatopera/Synonyms/issues/118) 362 | 363 | 5. [#146 Synonyms 和 Langchain 实现 RAG 检索服务](https://github.com/chatopera/Synonyms/issues/146) 364 | 365 | # Authors 366 | 367 | [Hai Liang Wang](https://pre-angel.com/peoples/hailiang-wang/) 368 | 369 | [Hu Ying Xi](https://github.com/huyingxi) 370 | 371 | # 自然语言处理推荐入门&工具书 372 | 373 | 本书由 [Synonyms](https://github.com/chatopera/Synonyms) 作者参与著作。 374 | 375 |

376 | 快速购书[链接](https://item.jd.com/12479014.html)
377 | 378 | 379 | 380 |

381 | 382 | [《智能问答与深度学习》](https://item.jd.com/12479014.html) 这本书是服务于准备入门机器学习和自然语言处理的学生和软件工程师的,在理论上介绍了很多原理、算法,同时也提供很多示例程序增加实践性,这些程序被汇总到示例程序代码库,这些程序主要是帮助大家理解原理和算法的,欢迎大家下载和执行。代码库的地址是: 383 | 384 | [https://github.com/l11x0m7/book-of-qna-code](https://github.com/l11x0m7/book-of-qna-code) 385 | 386 | # Give credits to 387 | 388 | [Word2vec by Google](https://code.google.com/archive/p/word2vec/) 389 | 390 | [Wikimedia: 训练语料来源](https://dumps.wikimedia.org/) 391 | 392 | [gensim: word2vec.py](https://github.com/RaRe-Technologies/gensim) 393 | 394 | [SentenceSim: 相似度评测语料](https://github.com/fssqawj/SentenceSim/) 395 | 396 | [jieba: 中文分词](https://github.com/fxsjy/jieba) 397 | 398 | # License 399 | 400 | [Chunsong Public License, version 1.0](./LICENSE) 401 | 402 | # Project Sponsor 403 | 404 | ## Chatopera 云服务 405 | 406 | [https://bot.chatopera.com/](https://bot.chatopera.com/) 407 | 408 | [Chatopera 云服务](https://bot.chatopera.com)是一站式实现聊天机器人的云服务,按接口调用次数计费。Chatopera 云服务是 [Chatopera 机器人平台](https://docs.chatopera.com/products/chatbot-platform/index.html)的软件即服务实例。在云计算基础上,Chatopera 云服务属于**聊天机器人即服务**的云服务。 409 | 410 | Chatopera 机器人平台包括知识库、多轮对话、意图识别和语音识别等组件,标准化聊天机器人开发,支持企业 OA 智能问答、HR 智能问答、智能客服和网络营销等场景。企业 IT 部门、业务部门借助 Chatopera 云服务快速让聊天机器人上线! -------------------------------------------------------------------------------- /Requirements.txt: -------------------------------------------------------------------------------- 1 | synonyms>=3.23 -------------------------------------------------------------------------------- /VALUATION.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/VALUATION.md -------------------------------------------------------------------------------- /assets/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/1.png -------------------------------------------------------------------------------- /assets/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/2.png -------------------------------------------------------------------------------- /assets/3.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/3.gif -------------------------------------------------------------------------------- /assets/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/4.png -------------------------------------------------------------------------------- /assets/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/5.png -------------------------------------------------------------------------------- /assets/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/6.png -------------------------------------------------------------------------------- 
/assets/64531083-3199aa80-d341-11e9-86cd-3a3ed860b14b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/64531083-3199aa80-d341-11e9-86cd-3a3ed860b14b.png -------------------------------------------------------------------------------- /assets/pr_curve_eulidean0.8+unigram0.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/pr_curve_eulidean0.8+unigram0.2.png -------------------------------------------------------------------------------- /assets/pr_hresholds_eulidean0.8+unigram0.2+POS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/pr_hresholds_eulidean0.8+unigram0.2+POS.png -------------------------------------------------------------------------------- /assets/roc_curve_eulidean0.8+unigram0.2+POS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/roc_curve_eulidean0.8+unigram0.2+POS.png -------------------------------------------------------------------------------- /assets/screenshot_20231124180125.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/screenshot_20231124180125.png -------------------------------------------------------------------------------- /assets/sentence_precision.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/sentence_precision.jpg -------------------------------------------------------------------------------- /assets/syn_order_post.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chatopera/Synonyms/22d82ba3c62b6c8d142f40f43e3137b7360d05b2/assets/syn_order_post.jpg -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | #========================================================================= 4 | # 5 | # Copyright (c) 2017 <> All Rights Reserved 6 | # 7 | # 8 | # File: /Users/hain/ai/Synonyms/benchmark.py 9 | # Author: Hai Liang Wang 10 | # Date: 2017-10-21:11:26:53 11 | # 12 | #========================================================================= 13 | 14 | """ 15 | 16 | """ 17 | from __future__ import print_function 18 | from __future__ import division 19 | 20 | __copyright__ = "Copyright (c) 2017-2023 Chatopera Inc. 
All Rights Reserved" 21 | __author__ = "Hai Liang Wang" 22 | __date__ = "2017-10-21:11:26:53" 23 | 24 | 25 | import os 26 | import sys 27 | import platform 28 | import multiprocessing 29 | curdir = os.path.dirname(os.path.abspath(__file__)) 30 | sys.path.append(curdir) 31 | 32 | if sys.version_info[0] < 3: 33 | reload(sys) 34 | sys.setdefaultencoding("utf-8") 35 | # raise "Must be using Python 3" 36 | 37 | import timeit 38 | 39 | print("\nEnumerating Available System Resources...") 40 | 41 | print("\n++++++++++ OS Name and version ++++++++++") 42 | 43 | print("Platform:", platform.system()) 44 | print("Kernel:", platform.release()) 45 | print("Distro:", platform.linux_distribution()) 46 | print("Architecture:", platform.architecture()) 47 | 48 | print("\n++++++++++ CPU Cores ++++++++++") 49 | p = os.popen("ps aux|awk 'NR > 0{s +=$3};END{print s}'").read() 50 | print("Cores:", multiprocessing.cpu_count(), '\nCPU Load:', p) 51 | 52 | print("\n++++++++++ System Memory ++++++++++\n") 53 | 54 | 55 | def meminfo(): 56 | meminfo = dict() 57 | 58 | with os.popen('cat /proc/meminfo') as f: 59 | for line in f: 60 | meminfo[line.split(':')[0]] = line.split(':')[1].strip() 61 | return meminfo 62 | 63 | 64 | try: 65 | meminfo = meminfo() 66 | print('Total Memory: {0}'.format(meminfo['MemTotal'])) 67 | print('Free Memory: {0}'.format(meminfo['MemFree'])) 68 | except BaseException: 69 | print("meminfo unavailable") 70 | 71 | 72 | def main(): 73 | repeat = 3 74 | number = 100000 75 | unit = "usec" # 微秒 76 | unittosec = {"usec": 1e6, "msec": 1000, "sec": 1} 77 | result = timeit.repeat( 78 | "synonyms.nearby('人脸')", 79 | "import synonyms", 80 | number=number, 81 | repeat=repeat) 82 | print("%s: %d loops, best of %d epochs: %.3g %s per loop" % 83 | ("synonyms#nearby", number, repeat, 84 | min(result) / number * unittosec[unit], unit)) 85 | 86 | 87 | if __name__ == '__main__': 88 | main() 89 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | #========================================================================= 4 | # 5 | # Copyright (c) 2017 <> All Rights Reserved 6 | # 7 | # 8 | # File: /Users/hain/ai/Synonyms/demo.py 9 | # Author: Hai Liang Wang 10 | # Date: 2017-09-28:22:23:34 11 | # 12 | #========================================================================= 13 | 14 | """ 15 | 16 | """ 17 | from __future__ import print_function 18 | from __future__ import division 19 | 20 | __copyright__ = "Copyright (c) (2017-2022) Chatopera Inc. 
All Rights Reserved" 21 | __author__ = "Hai Liang Wang" 22 | __date__ = "2017-09-28:22:23:34" 23 | 24 | 25 | import os 26 | import sys 27 | curdir = os.path.dirname(os.path.abspath(__file__)) 28 | sys.path.insert(0, curdir) 29 | 30 | if sys.version_info[0] < 3: 31 | reload(sys) 32 | sys.setdefaultencoding("utf-8") 33 | # raise "Must be using Python 3" 34 | # 35 | 36 | import synonyms # https://github.com/chatopera/Synonyms 37 | import numpy 38 | import unittest 39 | 40 | compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z)) + "\n" +"*"* 30 + "\n" 41 | 42 | # run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample 43 | class Test(unittest.TestCase): 44 | ''' 45 | 46 | ''' 47 | 48 | def setUp(self): 49 | pass 50 | 51 | def tearDown(self): 52 | pass 53 | 54 | def test_wordseg(self): 55 | print("test_wordseg") 56 | print(synonyms.seg("中文近义词工具包")) 57 | 58 | 59 | def test_word_vector(self): 60 | print("test_word_vector") 61 | word = "三国" 62 | print(word, "向量", synonyms.v(word)) 63 | 64 | def test_diff(self): 65 | print("test_diff") 66 | result = [] 67 | # 30个 评测词对中的左侧词 68 | left = ['轿车', '宝石', '旅游', '男孩子', '海岸', '庇护所', '魔术师', '中午', '火炉', '食物', '鸟', '鸟', '工具', '兄弟', '起重机', '小伙子', 69 | '旅行', '和尚', '墓地', '食物', '海岸', '森林', '岸边', '和尚', '海岸', '小伙子', '琴弦', '玻璃', '中午', '公鸡'] 70 | # 30个 评测词对中的右侧词 71 | right = ['汽车', '宝物', '游历', '小伙子', '海滨', '精神病院', '巫师', '正午', '炉灶', '水果', '公鸡', '鹤', '器械', '和尚', '器械', '兄弟', 72 | '轿车', '圣贤', '林地', '公鸡', '丘陵', '墓地', '林地', '奴隶', '森林', '巫师', '微笑', '魔术师', '绳子', '航行'] 73 | # 人工评定的相似度列表。 74 | human = [0.98, 0.96, 0.96, 0.94, 0.925, 0.9025, 0.875, 0.855, 0.7775, 0.77, 0.7625, 0.7425, 0.7375, 0.705, 0.42, 0.415, 75 | 0.29, 0.275, 0.2375, 76 | 0.2225, 0.2175, 0.21, 0.1575, 0.1375, 0.105, 0.105, 0.0325, 0.0275, 0.02, 0.02] 77 | result.append("# synonyms 分数评测 [(v%s)](https://pypi.python.org/pypi/synonyms/%s)" % (synonyms.__version__, synonyms.__version__)) 78 | result.append("| %s | %s | %s | %s |" % ("词1", "词2", "synonyms", "人工评定")) 79 | result.append("| --- | --- | --- | --- |") 80 | for x,y,z in zip(left, right, human): 81 | result.append("| %s | %s | %s | %s |" % (x, y, synonyms.compare(x, y), z)) 82 | for x in result: print(x) 83 | with open(os.path.join(curdir, "VALUATION.md"), "w") as fout: 84 | for x in result: fout.write(x + "\n") 85 | 86 | def test_similarity(self): 87 | ''' 88 | Generate sentence similarity 89 | ''' 90 | sen1 = "旗帜引领方向" 91 | sen2 = "道路决定命运" 92 | r = synonyms.compare(sen1, sen2, seg=True) 93 | print("旗帜引领方向 vs 道路决定命运:", r) 94 | # assert r == 0.0, "the similarity should be zero" 95 | 96 | sen1 = "旗帜引领方向" 97 | sen2 = "旗帜指引道路" 98 | r = synonyms.compare(sen1, sen2, seg=True) 99 | print("旗帜引领方向 vs 旗帜指引道路:", r) 100 | # assert r > 0, "the similarity should be bigger then zero" 101 | 102 | sen1 = "发生历史性变革" 103 | sen2 = "发生历史性变革" 104 | r = synonyms.compare(sen1, sen2, seg=True) 105 | print("发生历史性变革 vs 发生历史性变革:", r) 106 | # assert r > 0, "the similarity should be bigger then zero" 107 | 108 | sen1 = "骨折" 109 | sen2 = "巴赫" 110 | r = synonyms.compare(sen1, sen2, seg=True) 111 | print("%s vs %s" % (sen1, sen2), r) 112 | 113 | 114 | sen1 = "你们好呀" 115 | sen2 = "大家好" 116 | r = synonyms.compare(sen1, sen2, seg=False) 117 | print("%s vs %s" % (sen1, sen2), r) 118 | 119 | 120 | def test_swap_sent(self): 121 | print("test_swap_sent") 122 | s1 = synonyms.compare("教学", "老师") 123 | s2 = synonyms.compare("老师", "教学") 124 | print('"教学", "老师": %s ' % s1) 125 | print('"老师", "教学": %s ' % s2) 126 | assert s1 == s2, "Scores should be the same after 
swap sents" 127 | 128 | def test_nearby(self): 129 | synonyms.display("奥运") # synonyms.display calls synonyms.nearby 130 | synonyms.display("北新桥") # synonyms.display calls synonyms.nearby 131 | 132 | 133 | def test_badcase_1(self): 134 | synonyms.display("人脸") # synonyms.display calls synonyms.nearby 135 | 136 | 137 | def test_basecase_2(self): 138 | print("test_basecase_2") 139 | sen1 = "今天天气" 140 | sen2 = "今天天气怎么样" 141 | r = synonyms.compare(sen1, sen2, seg=True) 142 | 143 | 144 | def test_analyse_extract_tags(self): 145 | ''' 146 | 使用 Tag 方式获得关键词 147 | https://github.com/fxsjy/jieba/tree/v0.39 148 | ''' 149 | sentence = "华为芯片被断供,源于美国关于华为的修订版禁令生效——9月15日以来,台积电、高通、三星等华为的重要合作伙伴,只要没有美国的相关许可证,都无法供应芯片给华为,而中芯国际等国产芯片企业,也因采用美国技术,而无法供货给华为。目前华为部分型号的手机产品出现货少的现象,若该形势持续下去,华为手机业务将遭受重创。" 150 | keywords = synonyms.keywords(sentence, topK=5, withWeight=False, allowPOS=()) 151 | print("[test_analyse_extract_tags] keywords %s" % keywords) 152 | 153 | def test(): 154 | unittest.main() 155 | 156 | 157 | if __name__ == '__main__': 158 | test() 159 | -------------------------------------------------------------------------------- /scripts/package.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ########################################### 3 | # 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | export PYTHONUNBUFFERED=1 9 | export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH 10 | 11 | # functions 12 | 13 | # main 14 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 15 | cd $baseDir/.. 16 | 17 | if [ ! -d tmp ]; then 18 | mkdir tmp 19 | fi 20 | 21 | if [ -f synonyms/data/words.vector.gz ]; then 22 | mv synonyms/data/words.vector.gz tmp 23 | fi 24 | 25 | rm -rf ./dist/* 26 | python setup.py sdist 27 | # python setup.py sdist upload -r pypi 28 | mv tmp/words.vector.gz synonyms/data/words.vector.gz 29 | 30 | echo "For internal package downloading, now upload ./dist/synonyms-xxx.tar.gz to corsair:/static/ml/synonyms download from http://192.168.2.217:30080/ml/synonyms/" -------------------------------------------------------------------------------- /scripts/pypi.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ########################################### 3 | # 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | export PYTHONUNBUFFERED=1 9 | export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH 10 | 11 | # functions 12 | 13 | # main 14 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 15 | cd $baseDir/.. 16 | 17 | if [ ! -d tmp ]; then 18 | mkdir tmp 19 | fi 20 | 21 | if [ -f synonyms/data/words.vector.gz ]; then 22 | echo "Move pkg to tmp" 23 | mv synonyms/data/words.vector.gz tmp 24 | fi 25 | 26 | rm -rf ./dist/* 27 | python setup.py sdist 28 | twine upload --skip-existing dist/* 29 | 30 | if [ -f tmp/words.vector.gz ]; then 31 | mv tmp/words.vector.gz synonyms/data/words.vector.gz 32 | fi 33 | 34 | -------------------------------------------------------------------------------- /scripts/test.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | ########################################### 3 | # 4 | ########################################### 5 | 6 | # constants 7 | baseDir=$(cd `dirname "$0"`;pwd) 8 | export PYTHONUNBUFFERED=1 9 | export PATH=/opt/miniconda3/envs/venv-py3/bin:$PATH 10 | 11 | # functions 12 | 13 | # main 14 | [ -z "${BASH_SOURCE[0]}" -o "${BASH_SOURCE[0]}" = "$0" ] || return 15 | cd $baseDir/.. 16 | if [ -f .env ]; then 17 | echo "load env with" `pwd`"/.env" 18 | source .env 19 | fi 20 | 21 | python demo.py 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from setuptools import setup, find_packages 3 | LONGDOC = """ 4 | Synonyms 5 | ===================== 6 | 7 | 中文近义词 8 | 9 | https://github.com/chatopera/Synonyms 10 | 11 | """ 12 | 13 | setup( 14 | name='synonyms', 15 | version='3.23.6', 16 | description='中文近义词:聊天机器人,智能问答工具包;Chinese Synonyms for Natural Language Processing and Understanding', 17 | long_description=LONGDOC, 18 | author='Hai Liang Wang, Hu Ying Xi', 19 | author_email='hain@chatopera.com', 20 | url='https://github.com/chatopera/Synonyms', 21 | license="Chunsong Public License, version 1.0", 22 | classifiers=[ 23 | 'Intended Audience :: Developers', 24 | 'Operating System :: OS Independent', 25 | 'Natural Language :: Chinese (Simplified)', 26 | 'Natural Language :: Chinese (Traditional)', 27 | 'Programming Language :: Python', 28 | 'Programming Language :: Python :: 3', 29 | 'Programming Language :: Python :: 3.5', 30 | 'Programming Language :: Python :: 3.6', 31 | 'Programming Language :: Python :: 3.7', 32 | 'Topic :: Text Processing', 33 | 'Topic :: Text Processing :: Indexing', 34 | 'Topic :: Text Processing :: Linguistic'], 35 | keywords='corpus,machine-learning,NLU,NLP,Synonyms,Similarity,chatbot', 36 | packages=find_packages(), 37 | install_requires=[ 38 | 'six>=1.11.0', 39 | 'numpy>=1.13.1', 40 | 'scipy>=1.0.0', 41 | 'scikit-learn>=0.19.1', 42 | 'jieba>=0.40', 43 | 'chatoperastore>=1.2.0' 44 | ], 45 | package_data={ 46 | 'synonyms': [ 47 | '**/**/idf.txt', 48 | '**/**/*.p', 49 | '**/*.gz', 50 | '**/*.txt', 51 | 'LICENSE']}) 52 | -------------------------------------------------------------------------------- /synonyms/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["seg", 2 | "nearby", 3 | "compare", 4 | "display", 5 | "keywords", 6 | "KeyedVectors", 7 | "any2utf8", 8 | "sigmoid", 9 | "cosine", 10 | "any2unicode", 11 | "__version__"] 12 | 13 | from .word2vec import * 14 | from .synonyms import * 15 | from .synonyms import __version__ -------------------------------------------------------------------------------- /synonyms/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | 啊 2 | 阿 3 | 哎 4 | 哎呀 5 | 哎哟 6 | 唉 7 | 俺 8 | 俺们 9 | 按 10 | 按照 11 | 吧 12 | 吧哒 13 | 把 14 | 罢了 15 | 被 16 | 本 17 | 本着 18 | 比 19 | 比方 20 | 比如 21 | 鄙人 22 | 彼 23 | 彼此 24 | 边 25 | 别 26 | 别的 27 | 别说 28 | 并 29 | 并且 30 | 不比 31 | 不成 32 | 不单 33 | 不但 34 | 不独 35 | 不管 36 | 不光 37 | 不过 38 | 不仅 39 | 不拘 40 | 不论 41 | 不怕 42 | 不然 43 | 不如 44 | 不特 45 | 不惟 46 | 不问 47 | 不只 48 | 朝 49 | 朝着 50 | 趁 51 | 趁着 52 | 乘 53 | 冲 54 | 除 55 | 除此之外 56 | 除非 57 | 除了 58 
| 此 59 | 此间 60 | 此外 61 | 从 62 | 从而 63 | 打 64 | 待 65 | 但 66 | 但是 67 | 当 68 | 当着 69 | 到 70 | 得 71 | 的 72 | 的话 73 | 等 74 | 等等 75 | 地 76 | 第 77 | 叮咚 78 | 对 79 | 对于 80 | 多 81 | 多少 82 | 而 83 | 而况 84 | 而且 85 | 而是 86 | 而外 87 | 而言 88 | 而已 89 | 尔后 90 | 反过来 91 | 反过来说 92 | 反之 93 | 非但 94 | 非徒 95 | 否则 96 | 嘎 97 | 嘎登 98 | 该 99 | 赶 100 | 个 101 | 各 102 | 各个 103 | 各位 104 | 各种 105 | 各自 106 | 给 107 | 根据 108 | 跟 109 | 故 110 | 故此 111 | 固然 112 | 关于 113 | 管 114 | 归 115 | 果然 116 | 果真 117 | 过 118 | 哈 119 | 哈哈 120 | 呵 121 | 和 122 | 何 123 | 何处 124 | 何况 125 | 何时 126 | 嘿 127 | 哼 128 | 哼唷 129 | 呼哧 130 | 乎 131 | 哗 132 | 还是 133 | 还有 134 | 换句话说 135 | 换言之 136 | 或 137 | 或是 138 | 或者 139 | 极了 140 | 及 141 | 及其 142 | 及至 143 | 即 144 | 即便 145 | 即或 146 | 即令 147 | 即若 148 | 即使 149 | 几 150 | 几时 151 | 己 152 | 既 153 | 既然 154 | 既是 155 | 继而 156 | 加之 157 | 假如 158 | 假若 159 | 假使 160 | 鉴于 161 | 将 162 | 较 163 | 较之 164 | 叫 165 | 接着 166 | 结果 167 | 借 168 | 紧接着 169 | 进而 170 | 尽 171 | 尽管 172 | 经 173 | 经过 174 | 就 175 | 就是 176 | 就是说 177 | 据 178 | 具体地说 179 | 具体说来 180 | 开始 181 | 开外 182 | 靠 183 | 咳 184 | 可 185 | 可见 186 | 可是 187 | 可以 188 | 况且 189 | 啦 190 | 来 191 | 来着 192 | 离 193 | 例如 194 | 哩 195 | 连 196 | 连同 197 | 两者 198 | 了 199 | 临 200 | 另 201 | 另外 202 | 另一方面 203 | 论 204 | 嘛 205 | 吗 206 | 慢说 207 | 漫说 208 | 冒 209 | 么 210 | 每 211 | 每当 212 | 们 213 | 莫若 214 | 某 215 | 某个 216 | 某些 217 | 拿 218 | 哪 219 | 哪边 220 | 哪儿 221 | 哪个 222 | 哪里 223 | 哪年 224 | 哪怕 225 | 哪天 226 | 哪些 227 | 哪样 228 | 那 229 | 那边 230 | 那儿 231 | 那个 232 | 那会儿 233 | 那里 234 | 那么 235 | 那么些 236 | 那么样 237 | 那时 238 | 那些 239 | 那样 240 | 乃 241 | 乃至 242 | 呢 243 | 能 244 | 你 245 | 你们 246 | 您 247 | 宁 248 | 宁可 249 | 宁肯 250 | 宁愿 251 | 哦 252 | 呕 253 | 啪达 254 | 旁人 255 | 呸 256 | 凭 257 | 凭借 258 | 其 259 | 其次 260 | 其二 261 | 其他 262 | 其它 263 | 其一 264 | 其余 265 | 其中 266 | 起 267 | 起见 268 | 岂但 269 | 恰恰相反 270 | 前后 271 | 前者 272 | 且 273 | 然而 274 | 然后 275 | 然则 276 | 让 277 | 人家 278 | 任 279 | 任何 280 | 任凭 281 | 如 282 | 如此 283 | 如果 284 | 如何 285 | 如其 286 | 如若 287 | 如上所述 288 | 若 289 | 若非 290 | 若是 291 | 啥 292 | 上下 293 | 尚且 294 | 设若 295 | 设使 296 | 甚而 297 | 甚么 298 | 甚至 299 | 省得 300 | 时候 301 | 什么 302 | 什么样 303 | 使得 304 | 是 305 | 是的 306 | 首先 307 | 谁 308 | 谁知 309 | 顺 310 | 顺着 311 | 似的 312 | 虽 313 | 虽然 314 | 虽说 315 | 虽则 316 | 随 317 | 随着 318 | 所 319 | 所以 320 | 他 321 | 他们 322 | 他人 323 | 它 324 | 它们 325 | 她 326 | 她们 327 | 倘 328 | 倘或 329 | 倘然 330 | 倘若 331 | 倘使 332 | 腾 333 | 替 334 | 通过 335 | 同 336 | 同时 337 | 哇 338 | 万一 339 | 往 340 | 望 341 | 为 342 | 为何 343 | 为了 344 | 为什么 345 | 为着 346 | 喂 347 | 嗡嗡 348 | 我 349 | 我们 350 | 呜 351 | 呜呼 352 | 乌乎 353 | 无论 354 | 无宁 355 | 毋宁 356 | 嘻 357 | 吓 358 | 相对而言 359 | 像 360 | 向 361 | 向着 362 | 嘘 363 | 呀 364 | 焉 365 | 沿 366 | 沿着 367 | 要 368 | 要不 369 | 要不然 370 | 要不是 371 | 要么 372 | 要是 373 | 也 374 | 也罢 375 | 也好 376 | 一 377 | 一般 378 | 一旦 379 | 一方面 380 | 一来 381 | 一切 382 | 一样 383 | 一则 384 | 依 385 | 依照 386 | 矣 387 | 以 388 | 以便 389 | 以及 390 | 以免 391 | 以至 392 | 以至于 393 | 以致 394 | 抑或 395 | 因 396 | 因此 397 | 因而 398 | 因为 399 | 哟 400 | 用 401 | 由 402 | 由此可见 403 | 由于 404 | 有 405 | 有的 406 | 有关 407 | 有些 408 | 又 409 | 于 410 | 于是 411 | 于是乎 412 | 与 413 | 与此同时 414 | 与否 415 | 与其 416 | 越是 417 | 云云 418 | 哉 419 | 再说 420 | 再者 421 | 在 422 | 在下 423 | 咱 424 | 咱们 425 | 则 426 | 怎 427 | 怎么 428 | 怎么办 429 | 怎么样 430 | 怎样 431 | 咋 432 | 照 433 | 照着 434 | 者 435 | 这 436 | 这边 437 | 这儿 438 | 这个 439 | 这会儿 440 | 这就是说 441 | 这里 442 | 这么 443 | 这么点儿 444 | 这么些 445 | 这么样 446 | 这时 447 | 这些 448 | 这样 449 | 正如 450 | 吱 451 | 之 452 | 之类 453 | 之所以 454 | 之一 455 | 只是 456 | 只限 457 | 只要 458 | 只有 459 | 至 460 | 至于 461 | 诸位 462 | 着 463 | 着呢 464 | 自 465 | 自从 466 | 自个儿 467 | 自各儿 468 | 
自己 469 | 自家 470 | 自身 471 | 综上所述 472 | 总的来看 473 | 总的来说 474 | 总的说来 475 | 总而言之 476 | 总之 477 | 纵 478 | 纵令 479 | 纵然 480 | 纵使 481 | 遵照 482 | 作为 483 | 兮 484 | 呃 485 | 呗 486 | 咚 487 | 咦 488 | 喏 489 | 啐 490 | 喔唷 491 | 嗬 492 | 嗯 493 | 嗳 494 | 啊哈 495 | 啊呀 496 | 啊哟 497 | 挨次 498 | 挨个 499 | 挨家挨户 500 | 挨门挨户 501 | 挨门逐户 502 | 挨着 503 | 按理 504 | 按期 505 | 按时 506 | 按说 507 | 暗地里 508 | 暗中 509 | 暗自 510 | 昂然 511 | 八成 512 | 白白 513 | 半 514 | 梆 515 | 保管 516 | 保险 517 | 饱 518 | 背地里 519 | 背靠背 520 | 倍感 521 | 倍加 522 | 本人 523 | 本身 524 | 甭 525 | 比起 526 | 比如说 527 | 比照 528 | 毕竟 529 | 必 530 | 必定 531 | 必将 532 | 必须 533 | 便 534 | 别人 535 | 并非 536 | 并肩 537 | 并没 538 | 并没有 539 | 并排 540 | 并无 541 | 勃然 542 | 不 543 | 不必 544 | 不常 545 | 不大 546 | 不得 547 | 不得不 548 | 不得了 549 | 不得已 550 | 不迭 551 | 不定 552 | 不对 553 | 不妨 554 | 不管怎样 555 | 不会 556 | 不仅仅 557 | 不仅仅是 558 | 不经意 559 | 不可开交 560 | 不可抗拒 561 | 不力 562 | 不了 563 | 不料 564 | 不满 565 | 不免 566 | 不能不 567 | 不起 568 | 不巧 569 | 不然的话 570 | 不日 571 | 不少 572 | 不胜 573 | 不时 574 | 不是 575 | 不同 576 | 不能 577 | 不要 578 | 不外 579 | 不外乎 580 | 不下 581 | 不限 582 | 不消 583 | 不已 584 | 不亦乐乎 585 | 不由得 586 | 不再 587 | 不择手段 588 | 不怎么 589 | 不曾 590 | 不知不觉 591 | 不止 592 | 不止一次 593 | 不至于 594 | 才 595 | 才能 596 | 策略地 597 | 差不多 598 | 差一点 599 | 常 600 | 常常 601 | 常言道 602 | 常言说 603 | 常言说得好 604 | 长此下去 605 | 长话短说 606 | 长期以来 607 | 长线 608 | 敞开儿 609 | 彻夜 610 | 陈年 611 | 趁便 612 | 趁机 613 | 趁热 614 | 趁势 615 | 趁早 616 | 成年 617 | 成年累月 618 | 成心 619 | 乘机 620 | 乘胜 621 | 乘势 622 | 乘隙 623 | 乘虚 624 | 诚然 625 | 迟早 626 | 充分 627 | 充其极 628 | 充其量 629 | 抽冷子 630 | 臭 631 | 初 632 | 出 633 | 出来 634 | 出去 635 | 除此 636 | 除此而外 637 | 除此以外 638 | 除开 639 | 除去 640 | 除却 641 | 除外 642 | 处处 643 | 川流不息 644 | 传 645 | 传说 646 | 传闻 647 | 串行 648 | 纯 649 | 纯粹 650 | 此后 651 | 此中 652 | 次第 653 | 匆匆 654 | 从不 655 | 从此 656 | 从此以后 657 | 从古到今 658 | 从古至今 659 | 从今以后 660 | 从宽 661 | 从来 662 | 从轻 663 | 从速 664 | 从头 665 | 从未 666 | 从无到有 667 | 从小 668 | 从新 669 | 从严 670 | 从优 671 | 从早到晚 672 | 从中 673 | 从重 674 | 凑巧 675 | 粗 676 | 存心 677 | 达旦 678 | 打从 679 | 打开天窗说亮话 680 | 大 681 | 大不了 682 | 大大 683 | 大抵 684 | 大都 685 | 大多 686 | 大凡 687 | 大概 688 | 大家 689 | 大举 690 | 大略 691 | 大面儿上 692 | 大事 693 | 大体 694 | 大体上 695 | 大约 696 | 大张旗鼓 697 | 大致 698 | 呆呆地 699 | 带 700 | 殆 701 | 待到 702 | 单 703 | 单纯 704 | 单单 705 | 但愿 706 | 弹指之间 707 | 当场 708 | 当儿 709 | 当即 710 | 当口儿 711 | 当然 712 | 当庭 713 | 当头 714 | 当下 715 | 当真 716 | 当中 717 | 倒不如 718 | 倒不如说 719 | 倒是 720 | 到处 721 | 到底 722 | 到了儿 723 | 到目前为止 724 | 到头 725 | 到头来 726 | 得起 727 | 得天独厚 728 | 的确 729 | 等到 730 | 叮当 731 | 顶多 732 | 定 733 | 动不动 734 | 动辄 735 | 陡然 736 | 都 737 | 独 738 | 独自 739 | 断然 740 | 顿时 741 | 多次 742 | 多多 743 | 多多少少 744 | 多多益善 745 | 多亏 746 | 多年来 747 | 多年前 748 | 而后 749 | 而论 750 | 而又 751 | 尔等 752 | 二话不说 753 | 二话没说 754 | 反倒 755 | 反倒是 756 | 反而 757 | 反手 758 | 反之亦然 759 | 反之则 760 | 方 761 | 方才 762 | 方能 763 | 放量 764 | 非常 765 | 非得 766 | 分期 767 | 分期分批 768 | 分头 769 | 奋勇 770 | 愤然 771 | 风雨无阻 772 | 逢 773 | 弗 774 | 甫 775 | 嘎嘎 776 | 该当 777 | 概 778 | 赶快 779 | 赶早不赶晚 780 | 敢 781 | 敢情 782 | 敢于 783 | 刚 784 | 刚才 785 | 刚好 786 | 刚巧 787 | 高低 788 | 格外 789 | 隔日 790 | 隔夜 791 | 个人 792 | 各式 793 | 更 794 | 更加 795 | 更进一步 796 | 更为 797 | 公然 798 | 共 799 | 共总 800 | 够瞧的 801 | 姑且 802 | 古来 803 | 故而 804 | 故意 805 | 固 806 | 怪 807 | 怪不得 808 | 惯常 809 | 光 810 | 光是 811 | 归根到底 812 | 归根结底 813 | 过于 814 | 毫不 815 | 毫无 816 | 毫无保留地 817 | 毫无例外 818 | 好在 819 | 何必 820 | 何尝 821 | 何妨 822 | 何苦 823 | 何乐而不为 824 | 何须 825 | 何止 826 | 很 827 | 很多 828 | 很少 829 | 轰然 830 | 后来 831 | 呼啦 832 | 忽地 833 | 忽然 834 | 互 835 | 互相 836 | 哗啦 837 | 话说 838 | 还 839 | 恍然 840 | 会 841 | 豁然 842 | 活 843 | 伙同 844 | 或多或少 845 | 或许 846 | 基本 847 | 基本上 848 | 基于 849 | 极 850 | 极大 851 | 极度 
852 | 极端 853 | 极力 854 | 极其 855 | 极为 856 | 急匆匆 857 | 即将 858 | 即刻 859 | 即是说 860 | 几度 861 | 几番 862 | 几乎 863 | 几经 864 | 既...又 865 | 继之 866 | 加上 867 | 加以 868 | 间或 869 | 简而言之 870 | 简言之 871 | 简直 872 | 见 873 | 将才 874 | 将近 875 | 将要 876 | 交口 877 | 较比 878 | 较为 879 | 接连不断 880 | 接下来 881 | 皆可 882 | 截然 883 | 截至 884 | 藉以 885 | 借此 886 | 借以 887 | 届时 888 | 仅 889 | 仅仅 890 | 谨 891 | 进来 892 | 进去 893 | 近 894 | 近几年来 895 | 近来 896 | 近年来 897 | 尽管如此 898 | 尽可能 899 | 尽快 900 | 尽量 901 | 尽然 902 | 尽如人意 903 | 尽心竭力 904 | 尽心尽力 905 | 尽早 906 | 精光 907 | 经常 908 | 竟 909 | 竟然 910 | 究竟 911 | 就此 912 | 就地 913 | 就算 914 | 居然 915 | 局外 916 | 举凡 917 | 据称 918 | 据此 919 | 据实 920 | 据说 921 | 据我所知 922 | 据悉 923 | 具体来说 924 | 决不 925 | 决非 926 | 绝 927 | 绝不 928 | 绝顶 929 | 绝对 930 | 绝非 931 | 均 932 | 喀 933 | 看 934 | 看来 935 | 看起来 936 | 看上去 937 | 看样子 938 | 可好 939 | 可能 940 | 恐怕 941 | 快 942 | 快要 943 | 来不及 944 | 来得及 945 | 来讲 946 | 来看 947 | 拦腰 948 | 牢牢 949 | 老 950 | 老大 951 | 老老实实 952 | 老是 953 | 累次 954 | 累年 955 | 理当 956 | 理该 957 | 理应 958 | 历 959 | 立 960 | 立地 961 | 立刻 962 | 立马 963 | 立时 964 | 联袂 965 | 连连 966 | 连日 967 | 连日来 968 | 连声 969 | 连袂 970 | 临到 971 | 另方面 972 | 另行 973 | 另一个 974 | 路经 975 | 屡 976 | 屡次 977 | 屡次三番 978 | 屡屡 979 | 缕缕 980 | 率尔 981 | 率然 982 | 略 983 | 略加 984 | 略微 985 | 略为 986 | 论说 987 | 马上 988 | 蛮 989 | 满 990 | 没 991 | 没有 992 | 每逢 993 | 每每 994 | 每时每刻 995 | 猛然 996 | 猛然间 997 | 莫 998 | 莫不 999 | 莫非 1000 | 莫如 1001 | 默默地 1002 | 默然 1003 | 呐 1004 | 那末 1005 | 奈 1006 | 难道 1007 | 难得 1008 | 难怪 1009 | 难说 1010 | 内 1011 | 年复一年 1012 | 凝神 1013 | 偶而 1014 | 偶尔 1015 | 怕 1016 | 砰 1017 | 碰巧 1018 | 譬如 1019 | 偏偏 1020 | 乒 1021 | 平素 1022 | 颇 1023 | 迫于 1024 | 扑通 1025 | 其后 1026 | 其实 1027 | 奇 1028 | 齐 1029 | 起初 1030 | 起来 1031 | 起首 1032 | 起头 1033 | 起先 1034 | 岂 1035 | 岂非 1036 | 岂止 1037 | 迄 1038 | 恰逢 1039 | 恰好 1040 | 恰恰 1041 | 恰巧 1042 | 恰如 1043 | 恰似 1044 | 千 1045 | 万 1046 | 千万 1047 | 千万千万 1048 | 切 1049 | 切不可 1050 | 切莫 1051 | 切切 1052 | 切勿 1053 | 窃 1054 | 亲口 1055 | 亲身 1056 | 亲手 1057 | 亲眼 1058 | 亲自 1059 | 顷 1060 | 顷刻 1061 | 顷刻间 1062 | 顷刻之间 1063 | 请勿 1064 | 穷年累月 1065 | 取道 1066 | 去 1067 | 权时 1068 | 全都 1069 | 全力 1070 | 全年 1071 | 全然 1072 | 全身心 1073 | 然 1074 | 人人 1075 | 仍 1076 | 仍旧 1077 | 仍然 1078 | 日复一日 1079 | 日见 1080 | 日渐 1081 | 日益 1082 | 日臻 1083 | 如常 1084 | 如此等等 1085 | 如次 1086 | 如今 1087 | 如期 1088 | 如前所述 1089 | 如上 1090 | 如下 1091 | 汝 1092 | 三番两次 1093 | 三番五次 1094 | 三天两头 1095 | 瑟瑟 1096 | 沙沙 1097 | 上 1098 | 上来 1099 | 上去 1100 | 一. 
1101 | 一一 1102 | 一下 1103 | 一个 1104 | 一些 1105 | 一何 1106 | 一则通过 1107 | 一天 1108 | 一定 1109 | 一时 1110 | 一次 1111 | 一片 1112 | 一番 1113 | 一直 1114 | 一致 1115 | 一起 1116 | 一转眼 1117 | 一边 1118 | 一面 1119 | 上升 1120 | 上述 1121 | 上面 1122 | 下 1123 | 下列 1124 | 下去 1125 | 下来 1126 | 下面 1127 | 不一 1128 | 不久 1129 | 不变 1130 | 不可 1131 | 不够 1132 | 不尽 1133 | 不尽然 1134 | 不敢 1135 | 不断 1136 | 不若 1137 | 不足 1138 | 与其说 1139 | 专门 1140 | 且不说 1141 | 且说 1142 | 严格 1143 | 严重 1144 | 个别 1145 | 中小 1146 | 中间 1147 | 丰富 1148 | 为主 1149 | 为什麽 1150 | 为止 1151 | 为此 1152 | 主张 1153 | 主要 1154 | 举行 1155 | 乃至于 1156 | 之前 1157 | 之后 1158 | 之後 1159 | 也就是说 1160 | 也是 1161 | 了解 1162 | 争取 1163 | 二来 1164 | 云尔 1165 | 些 1166 | 亦 1167 | 产生 1168 | 人 1169 | 人们 1170 | 什麽 1171 | 今 1172 | 今后 1173 | 今天 1174 | 今年 1175 | 今後 1176 | 介于 1177 | 从事 1178 | 他是 1179 | 他的 1180 | 代替 1181 | 以上 1182 | 以下 1183 | 以为 1184 | 以前 1185 | 以后 1186 | 以外 1187 | 以後 1188 | 以故 1189 | 以期 1190 | 以来 1191 | 任务 1192 | 企图 1193 | 伟大 1194 | 似乎 1195 | 但凡 1196 | 何以 1197 | 余外 1198 | 你是 1199 | 你的 1200 | 使 1201 | 使用 1202 | 依据 1203 | 依靠 1204 | 便于 1205 | 促进 1206 | 保持 1207 | 做到 1208 | 傥然 1209 | 儿 1210 | 允许 1211 | 元/吨 1212 | 先不先 1213 | 先后 1214 | 先後 1215 | 先生 1216 | 全体 1217 | 全部 1218 | 全面 1219 | 共同 1220 | 具体 1221 | 具有 1222 | 兼之 1223 | 再 1224 | 再其次 1225 | 再则 1226 | 再有 1227 | 再次 1228 | 再者说 1229 | 决定 1230 | 准备 1231 | 凡 1232 | 凡是 1233 | 出于 1234 | 出现 1235 | 分别 1236 | 则甚 1237 | 别处 1238 | 别是 1239 | 别管 1240 | 前此 1241 | 前进 1242 | 前面 1243 | 加入 1244 | 加强 1245 | 十分 1246 | 即如 1247 | 却 1248 | 却不 1249 | 原来 1250 | 又及 1251 | 及时 1252 | 双方 1253 | 反应 1254 | 反映 1255 | 取得 1256 | 受到 1257 | 变成 1258 | 另悉 1259 | 只 1260 | 只当 1261 | 只怕 1262 | 只消 1263 | 叫做 1264 | 召开 1265 | 各人 1266 | 各地 1267 | 各级 1268 | 合理 1269 | 同一 1270 | 同样 1271 | 后 1272 | 后者 1273 | 后面 1274 | 向使 1275 | 周围 1276 | 呵呵 1277 | 咧 1278 | 唯有 1279 | 啷当 1280 | 喽 1281 | 嗡 1282 | 嘿嘿 1283 | 因了 1284 | 因着 1285 | 在于 1286 | 坚决 1287 | 坚持 1288 | 处在 1289 | 处理 1290 | 复杂 1291 | 多么 1292 | 多数 1293 | 大力 1294 | 大多数 1295 | 大批 1296 | 大量 1297 | 失去 1298 | 她是 1299 | 她的 1300 | 好 1301 | 好的 1302 | 好象 1303 | 如同 1304 | 如是 1305 | 始而 1306 | 存在 1307 | 孰料 1308 | 孰知 1309 | 它们的 1310 | 它是 1311 | 它的 1312 | 安全 1313 | 完全 1314 | 完成 1315 | 实现 1316 | 实际 1317 | 宣布 1318 | 容易 1319 | 密切 1320 | 对应 1321 | 对待 1322 | 对方 1323 | 对比 1324 | 小 1325 | 少数 1326 | 尔 1327 | 尔尔 1328 | 尤其 1329 | 就是了 1330 | 就要 1331 | 属于 1332 | 左右 1333 | 巨大 1334 | 巩固 1335 | 已 1336 | 已矣 1337 | 已经 1338 | 巴 1339 | 巴巴 1340 | 帮助 1341 | 并不 1342 | 并不是 1343 | 广大 1344 | 广泛 1345 | 应当 1346 | 应用 1347 | 应该 1348 | 庶乎 1349 | 庶几 1350 | 开展 1351 | 引起 1352 | 强烈 1353 | 强调 1354 | 归齐 1355 | 当前 1356 | 当地 1357 | 当时 1358 | 形成 1359 | 彻底 1360 | 彼时 1361 | 往往 1362 | 後来 1363 | 後面 1364 | 得了 1365 | 得出 1366 | 得到 1367 | 心里 1368 | 必然 1369 | 必要 1370 | 怎奈 1371 | 怎麽 1372 | 总是 1373 | 总结 1374 | 您们 1375 | 您是 1376 | 惟其 1377 | 意思 1378 | 愿意 1379 | 成为 1380 | 我是 1381 | 我的 1382 | 或则 1383 | 或曰 1384 | 战斗 1385 | 所在 1386 | 所幸 1387 | 所有 1388 | 所谓 1389 | 扩大 1390 | 掌握 1391 | 接著 1392 | 数/ 1393 | 整个 1394 | 方便 1395 | 方面 1396 | 无 1397 | 无法 1398 | 既往 1399 | 明显 1400 | 明确 1401 | 是不是 1402 | 是以 1403 | 是否 1404 | 显然 1405 | 显著 1406 | 普通 1407 | 普遍 1408 | 曾 1409 | 曾经 1410 | 替代 1411 | 最 1412 | 最后 1413 | 最大 1414 | 最好 1415 | 最後 1416 | 最近 1417 | 最高 1418 | 有利 1419 | 有力 1420 | 有及 1421 | 有所 1422 | 有效 1423 | 有时 1424 | 有点 1425 | 有的是 1426 | 有着 1427 | 有著 1428 | 末##末 1429 | 本地 1430 | 来自 1431 | 来说 1432 | 构成 1433 | 某某 1434 | 根本 1435 | 欢迎 1436 | 欤 1437 | 正值 1438 | 正在 1439 | 正巧 1440 | 正常 1441 | 正是 1442 | 此地 1443 | 此处 1444 | 此时 1445 | 此次 1446 | 每个 1447 | 每天 1448 | 每年 1449 | 比及 1450 | 比较 1451 | 没奈何 1452 | 注意 1453 | 深入 1454 | 清楚 1455 | 满足 1456 
| 然後 1457 | 特别是 1458 | 特殊 1459 | 特点 1460 | 犹且 1461 | 犹自 1462 | 现代 1463 | 现在 1464 | 甚且 1465 | 甚或 1466 | 甚至于 1467 | 用来 1468 | 由是 1469 | 由此 1470 | 目前 1471 | 直到 1472 | 直接 1473 | 相似 1474 | 相信 1475 | 相反 1476 | 相同 1477 | 相对 1478 | 相应 1479 | 相当 1480 | 相等 1481 | 看出 1482 | 看到 1483 | 看看 1484 | 看见 1485 | 真是 1486 | 真正 1487 | 眨眼 1488 | 矣乎 1489 | 矣哉 1490 | 知道 1491 | 确定 1492 | 种 1493 | 积极 1494 | 移动 1495 | 突出 1496 | 突然 1497 | 立即 1498 | 竟而 1499 | 第二 1500 | 类如 1501 | 练习 1502 | 组成 1503 | 结合 1504 | 继后 1505 | 继续 1506 | 维持 1507 | 考虑 1508 | 联系 1509 | 能否 1510 | 能够 1511 | 自后 1512 | 自打 1513 | 至今 1514 | 至若 1515 | 致 1516 | 般的 1517 | 良好 1518 | 若夫 1519 | 若果 1520 | 范围 1521 | 莫不然 1522 | 获得 1523 | 行为 1524 | 行动 1525 | 表明 1526 | 表示 1527 | 要求 1528 | 规定 1529 | 觉得 1530 | 譬喻 1531 | 认为 1532 | 认真 1533 | 认识 1534 | 许多 1535 | 设或 1536 | 诚如 1537 | 说明 1538 | 说来 1539 | 说说 1540 | 诸 1541 | 诸如 1542 | 谁人 1543 | 谁料 1544 | 贼死 1545 | 赖以 1546 | 距 1547 | 转动 1548 | 转变 1549 | 转贴 1550 | 达到 1551 | 迅速 1552 | 过去 1553 | 过来 1554 | 运用 1555 | 还要 1556 | 这一来 1557 | 这次 1558 | 这点 1559 | 这种 1560 | 这般 1561 | 这麽 1562 | 进入 1563 | 进步 1564 | 进行 1565 | 适应 1566 | 适当 1567 | 适用 1568 | 逐步 1569 | 逐渐 1570 | 通常 1571 | 造成 1572 | 遇到 1573 | 遭到 1574 | 遵循 1575 | 避免 1576 | 那般 1577 | 那麽 1578 | 部分 1579 | 采取 1580 | 里面 1581 | 重大 1582 | 重新 1583 | 重要 1584 | 针对 1585 | 问题 1586 | 防止 1587 | 附近 1588 | 限制 1589 | 随后 1590 | 随时 1591 | 随著 1592 | 难道说 1593 | 集中 1594 | 需要 1595 | 非特 1596 | 非独 1597 | 高兴 1598 | 若果 1599 | · 1600 | ~ 1601 | - 1602 | —— 1603 | = 1604 | + 1605 | 【 1606 | { 1607 | } 1608 | 】 1609 | 、 1610 | | 1611 | ; 1612 | : 1613 | ‘ 1614 | ’ 1615 | “ 1616 | ” 1617 | , 1618 | 《 1619 | 。 1620 | 》 1621 | / 1622 | ? 1623 | * 1624 | ! 1625 | @ 1626 | # 1627 | ¥ 1628 | % 1629 | …… 1630 | & 1631 | ( 1632 | ) 1633 | ` 1634 | ~ 1635 | ! 1636 | @ 1637 | # 1638 | $ 1639 | % 1640 | ^ 1641 | & 1642 | ( 1643 | ) 1644 | [ 1645 | ] 1646 | | 1647 | \ 1648 | ; 1649 | : 1650 | ' 1651 | " 1652 | , 1653 | < 1654 | . 1655 | > 1656 | / 1657 | ? 1658 | 0 1659 | 1 1660 | 2 1661 | 3 1662 | 4 1663 | 5 1664 | 6 1665 | 7 1666 | 8 1667 | 9 1668 | -------------------------------------------------------------------------------- /synonyms/synonyms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | #========================================================================= 4 | # 5 | # File: /Users/hain/ai/Synonyms/synonyms/__init__.py 6 | # Author: Hai Liang Wang 7 | # Date: 2017-09-27 8 | # 9 | #========================================================================= 10 | 11 | """ 12 | Chinese Synonyms for Natural Language Processing and Understanding. 13 | """ 14 | from __future__ import print_function 15 | from __future__ import division 16 | 17 | __copyright__ = "Copyright (c) (2017-2023) Chatopera Inc. 
All Rights Reserved" 18 | __author__ = "Hu Ying Xi<>, Hai Liang Wang" 19 | __date__ = "2020-09-24" 20 | __version__ = "3.23.6" 21 | 22 | import os 23 | import sys 24 | import numpy as np 25 | curdir = os.path.dirname(os.path.abspath(__file__)) 26 | sys.path.insert(0, curdir) 27 | 28 | PLT = 2 29 | 30 | if sys.version_info[0] < 3: 31 | default_stdout = sys.stdout 32 | default_stderr = sys.stderr 33 | reload(sys) 34 | sys.stdout = default_stdout 35 | sys.stderr = default_stderr 36 | sys.setdefaultencoding("utf-8") 37 | # raise "Must be using Python 3" 38 | else: 39 | PLT = 3 40 | 41 | # Get Environment variables 42 | ENVIRON = os.environ.copy() 43 | 44 | import json 45 | import gzip 46 | import shutil 47 | from .word2vec import KeyedVectors 48 | from .utils import any2utf8 49 | from .utils import any2unicode 50 | from .utils import sigmoid 51 | from .utils import cosine 52 | from .utils import is_digit 53 | from jieba import posseg, analyse 54 | from chatoperastore import download_licensedfile, LicensedfileDownloadException 55 | 56 | ''' 57 | globals 58 | ''' 59 | _vocab = dict() 60 | _size = 0 61 | _vectors = None 62 | _stopwords = set() 63 | _cache_nearby = dict() 64 | _debug = False 65 | 66 | if "SYNONYMS_DEBUG" in ENVIRON: 67 | if ENVIRON["SYNONYMS_DEBUG"].lower() == "true": _debug = True 68 | 69 | ''' 70 | lambda fns 71 | ''' 72 | # combine similarity scores 73 | _similarity_smooth = lambda x, y, z, u: (x * y) + z - u 74 | _flat_sum_array = lambda x: np.sum(x, axis=0) # 分子 75 | _logging_debug = lambda x: print(">> Synonyms DEBUG %s" % x) if _debug else None 76 | 77 | ''' 78 | Sponsorship 79 | ''' 80 | print("\n Synonyms: v%s, Project home: %s" % (__version__, "https://github.com/chatopera/Synonyms/")) 81 | print("\n Project Sponsored by Chatopera") 82 | print("\n deliver your chatbots with Chatopera Cloud Services --> https://bot.chatopera.com\n") 83 | print("\n Module file path: %s" % __file__) 84 | print("\n ************ NOTICE ************") 85 | print(" Require license to download model package, purchase from https://store.chatopera.com/product/syns001") 86 | print(" ********************************\n") 87 | 88 | ''' 89 | tokenizer settings 90 | ''' 91 | tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt') 92 | if "SYNONYMS_WORDSEG_DICT" in ENVIRON: 93 | if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]): 94 | print("info: set wordseg dict with %s" % tokenizer_dict) 95 | tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"] 96 | else: print("warning: can not find dict at [%s]" % tokenizer_dict) 97 | 98 | print(">> Synonyms load wordseg dict [%s] ... " % tokenizer_dict) 99 | posseg.initialize(tokenizer_dict) 100 | 101 | # stopwords 102 | _fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt') 103 | def _load_stopwords(file_path): 104 | ''' 105 | load stop words 106 | ''' 107 | global _stopwords 108 | if sys.version_info[0] < 3: 109 | words = open(file_path, 'r') 110 | else: 111 | words = open(file_path, 'r', encoding='utf-8') 112 | stopwords = words.readlines() 113 | for w in stopwords: 114 | _stopwords.add(any2unicode(w).strip()) 115 | 116 | print(">> Synonyms on loading stopwords [%s] ..." 
% _fin_stopwords_path) 117 | _load_stopwords(_fin_stopwords_path) 118 | 119 | def _segment_words(sen, HMM=True): 120 | ''' 121 | segment words 122 | ''' 123 | words, tags = [], [] 124 | m = posseg.cut(sen, HMM=HMM) # HMM更好的识别新词 125 | for x in m: 126 | words.append(x.word) 127 | tags.append(x.flag) 128 | return words, tags 129 | 130 | def keywords(sentence, topK=5, withWeight=False, allowPOS=()): 131 | ''' 132 | extract keywords with Jieba Tokenizer 133 | ''' 134 | return analyse.extract_tags(sentence, topK=topK, withWeight=withWeight, allowPOS=allowPOS) 135 | 136 | ''' 137 | word embedding 138 | ''' 139 | # vectors 140 | _licenseid = os.environ.get("SYNONYMS_DL_LICENSE", None) 141 | _f_model = os.path.join(curdir, 'data', 'words.vector.gz') 142 | _download_model = not os.path.exists(_f_model) 143 | 144 | if "SYNONYMS_WORD2VEC_BIN_MODEL_ZH_CN" in ENVIRON: 145 | _f_model = ENVIRON["SYNONYMS_WORD2VEC_BIN_MODEL_ZH_CN"] 146 | _download_model = False 147 | 148 | def _load_w2v(model_file=_f_model, binary=True): 149 | ''' 150 | load word2vec model 151 | ''' 152 | if not os.path.exists(model_file) and _download_model: 153 | if not _licenseid: 154 | raise Exception("SYNONYMS_DL_LICENSE is not in Environment variables, check out Installation Guide on https://github.com/chatopera/Synonyms") 155 | 156 | print("\n>> Synonyms downloading data with licenseId %s, save to %s ... \n this only happens if Synonyms initialization for the first time. \n It would take minutes that depends on network." % (_licenseid, model_file)) 157 | download_licensedfile(_licenseid, model_file) 158 | dl_file_size = os.path.getsize(model_file) 159 | min_file_size = 40900000 # ~ 40MB 160 | 161 | if dl_file_size < min_file_size: 162 | os.remove(model_file) 163 | raise Exception("Download File Error, please read the installation guide on https://github.com/chatopera/Synonyms, reach out for help with info@chatopera.com by describing the problem and procedures.") 164 | 165 | print("\n>> Synonyms downloaded\n") 166 | 167 | elif not os.path.exists(model_file): 168 | print(">> Synonyms os.path : ", os.path) 169 | raise Exception("Model file [%s] does not exist." % model_file) 170 | 171 | return KeyedVectors.load_word2vec_format( 172 | model_file, binary=binary, unicode_errors='ignore') 173 | print(">> Synonyms on loading vectors [%s] ..." % _f_model) 174 | _vectors = _load_w2v(model_file=_f_model) 175 | 176 | def _get_wv(sentence, ignore=False): 177 | ''' 178 | get word2vec data by sentence 179 | sentence is segmented string. 
180 | ''' 181 | global _vectors 182 | vectors = [] 183 | for y in sentence: 184 | y_ = any2unicode(y).strip() 185 | if y_ not in _stopwords: 186 | syns = nearby(y_)[0] 187 | _logging_debug("sentence %s word: %s" %(sentence, y_)) 188 | _logging_debug("sentence %s word nearby: %s" %(sentence, " ".join(syns))) 189 | c = [] 190 | try: 191 | c.append(_vectors.word_vec(y_)) 192 | except KeyError as error: 193 | if ignore: 194 | continue 195 | else: 196 | _logging_debug("not exist in w2v model: %s" % y_) 197 | # c.append(np.zeros((100,), dtype=float)) 198 | random_state = np.random.RandomState(seed=(hash(y_) % (2**32 - 1))) 199 | c.append(random_state.uniform(low=-10.0, high=10.0, size=(100,))) 200 | for n in syns: 201 | if n is None: continue 202 | try: 203 | v = _vectors.word_vec(any2unicode(n)) 204 | except KeyError as error: 205 | # v = np.zeros((100,), dtype=float) 206 | random_state = np.random.RandomState(seed=(hash(n) % (2 ** 32 - 1))) 207 | v = random_state.uniform(low=-10.0, high=10.0, size=(100,)) 208 | c.append(v) 209 | r = np.average(c, axis=0) 210 | vectors.append(r) 211 | return vectors 212 | 213 | ''' 214 | Distance 215 | ''' 216 | # Levenshtein Distance 217 | def _levenshtein_distance(sentence1, sentence2): 218 | ''' 219 | Return the Levenshtein distance between two strings. 220 | Based on: 221 | http://rosettacode.org/wiki/Levenshtein_distance#Python 222 | ''' 223 | first = any2utf8(sentence1).decode('utf-8', 'ignore') 224 | second = any2utf8(sentence2).decode('utf-8', 'ignore') 225 | sentence1_len, sentence2_len = len(first), len(second) 226 | maxlen = max(sentence1_len, sentence2_len) 227 | if sentence1_len > sentence2_len: 228 | first, second = second, first 229 | 230 | distances = range(len(first) + 1) 231 | for index2, char2 in enumerate(second): 232 | new_distances = [index2 + 1] 233 | for index1, char1 in enumerate(first): 234 | if char1 == char2: 235 | new_distances.append(distances[index1]) 236 | else: 237 | new_distances.append(1 + min((distances[index1], 238 | distances[index1 + 1], 239 | new_distances[-1]))) 240 | distances = new_distances 241 | levenshtein = distances[-1] 242 | d = float((maxlen - levenshtein)/maxlen) 243 | # smoothing 244 | s = (sigmoid(d * 6) - 0.5) * 2 245 | # print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, d, s)) 246 | return s 247 | 248 | def sv(sentence, ignore=False): 249 | ''' 250 | Get the vectors of a segmented sentence, organized as array[array[]]: the vector array[] of each word in sentence is collected into one array 251 | 252 | sentence: a sentence that has been segmented, with words joined by spaces 253 | ignore: whether to ignore OOV words; when False, a random vector is generated for them 254 | ''' 255 | return _get_wv(sentence, ignore = ignore) 256 | 257 | def bow(sentence, ignore=False): 258 | ''' 259 | Get the vector of a segmented sentence, composed in a BoW (bag-of-words) fashion 260 | 261 | sentence: a sentence that has been segmented, with words joined by spaces 262 | ignore: whether to ignore OOV words; when False, a random vector is generated for them 263 | ''' 264 | return _flat_sum_array(_get_wv(sentence, ignore)) 265 | 266 | 267 | def v(word): 268 | ''' 269 | Get the vector of a single word; raises KeyError for OOV words 270 | ''' 271 | y_ = any2unicode(word).strip() 272 | return _vectors.word_vec(y_) 273 | 274 | def _nearby_levenshtein_distance(s1, s2): 275 | ''' 276 | Optimize the edit-distance computation with words that are close in the vector space 277 | ''' 278 | s1_len, s2_len = len(s1), len(s2) 279 | maxlen = s1_len 280 | if s1_len == s2_len: 281 | first, second = sorted([s1, s2]) 282 | elif s1_len < s2_len: 283 | first = s1 284 | second = s2 285 | maxlen = s2_len 286 | else: 287 | first = s2 288 | second = s1 289 | 290 | ft = set() # all related words with first sentence 291 | for x in first: 292 | ft.add(x) 293 | n, _ = nearby(x) 294 | for o in n[:10]: 295 | ft.add(o) 296 | 297 | scores = [] 298 | for x in second: 299 
| choices = [_levenshtein_distance(x, y) for y in ft] 300 | if len(choices) > 0: scores.append(max(choices)) 301 | 302 | s = np.sum(scores) / maxlen if len(scores) > 0 else 0 303 | return s 304 | 305 | def _similarity_distance(s1, s2, ignore): 306 | ''' 307 | compute similarity with distance measurement 308 | ''' 309 | g = 0.0 310 | try: 311 | g_ = cosine(_flat_sum_array(_get_wv(s1, ignore)), _flat_sum_array(_get_wv(s2, ignore))) 312 | if is_digit(g_): g = g_ 313 | except: pass 314 | 315 | u = _nearby_levenshtein_distance(s1, s2) 316 | if u >= 0.99: 317 | r = 1.0 318 | elif u > 0.9: 319 | r = _similarity_smooth(g, 0.05, u, 0.05) 320 | elif u > 0.8: 321 | r = _similarity_smooth(g, 0.1, u, 0.2) 322 | elif u > 0.4: 323 | r = _similarity_smooth(g, 0.2, u, 0.15) 324 | elif u > 0.2: 325 | r = _similarity_smooth(g, 0.3, u, 0.1) 326 | else: 327 | r = _similarity_smooth(g, 0.4, u, 0) 328 | 329 | if r < 0: r = abs(r) 330 | r = min(r, 1.0) 331 | return float("%.3f" % r) 332 | 333 | ''' 334 | Public Methods 335 | ''' 336 | seg = _segment_words # word segmenter 337 | 338 | def nearby(word, size = 10): 339 | ''' 340 | Nearby word 341 | ''' 342 | w = any2unicode(word) 343 | wk = w + '-' + str(size) 344 | # read from cache 345 | if wk in _cache_nearby: return _cache_nearby[wk] 346 | 347 | words, scores = [], [] 348 | try: 349 | for x in _vectors.neighbours(w, size): 350 | words.append(x[0]) 351 | scores.append(x[1]) 352 | except: pass # ignore key error, OOV 353 | # put into cache 354 | _cache_nearby[wk] = (words, scores) 355 | return words, scores 356 | 357 | def compare(s1, s2, seg=True, ignore=False, stopwords=False): 358 | ''' 359 | compare similarity 360 | s1 : sentence1 361 | s2 : sentence2 362 | seg : True : The original sentences need be cut 363 | False : The original sentences have been cut 364 | ignore: True: ignore OOV words 365 | False: get vector randomly for OOV words 366 | ''' 367 | if s1 == s2: return 1.0 368 | 369 | s1_words = [] 370 | s2_words = [] 371 | 372 | if seg: 373 | s1, _ = _segment_words(s1) 374 | s2, _ = _segment_words(s2) 375 | else: 376 | s1 = s1.split() 377 | s2 = s2.split() 378 | 379 | # check stopwords 380 | if not stopwords: 381 | global _stopwords 382 | for x in s1: 383 | if not x in _stopwords: 384 | s1_words.append(x) 385 | for x in s2: 386 | if not x in _stopwords: 387 | s2_words.append(x) 388 | else: 389 | s1_words = s1 390 | s2_words = s2 391 | 392 | assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0." 393 | return _similarity_distance(s1_words, s2_words, ignore) 394 | 395 | def describe(): 396 | ''' 397 | summary info of vectors 398 | ''' 399 | vocab_size = len(_vectors.vocab.keys()) 400 | print("Vocab size in vector model: %d" % vocab_size) 401 | print("model_path: %s" % _f_model) 402 | print("version: %s" % __version__) 403 | return dict({ 404 | "vocab_size": vocab_size, 405 | "version": __version__, 406 | "model_path": _f_model 407 | }) 408 | 409 | def display(word, size = 10): 410 | print("'%s'近义词:" % word) 411 | o = nearby(word, size) 412 | assert len(o) == 2, "should contain 2 list" 413 | if len(o[0]) == 0: 414 | print(" out of vocabulary") 415 | for k, v in enumerate(o[0]): 416 | print(" %d. 
%s:%s" % (k + 1, v, o[1][k])) 417 | 418 | def main(): 419 | display("人脸") 420 | display("NOT_EXIST") 421 | 422 | if __name__ == '__main__': 423 | main() 424 | -------------------------------------------------------------------------------- /synonyms/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2016 Radim Rehurek 5 | # Modifications (C) 2017 Hai Liang Wang 6 | # Licensed under the GNU LGPL v3.0 - http://www.gnu.org/licenses/lgpl.html 7 | # Author: Hai Liang Wang 8 | # Date: 2017-10-16:14:13:24 9 | # 10 | #========================================================================= 11 | 12 | from __future__ import print_function 13 | from __future__ import division 14 | 15 | __copyright__ = "Copyright (c) (2017-2023) Chatopera Inc. All Rights Reserved" 16 | __author__ = "Hai Liang Wang" 17 | __date__ = "2017-10-16:14:13:24" 18 | 19 | import os 20 | import sys 21 | curdir = os.path.dirname(os.path.abspath(__file__)) 22 | sys.path.append(curdir) 23 | 24 | import re 25 | import unicodedata 26 | import os 27 | import random 28 | import shutil 29 | import sys 30 | import subprocess 31 | from contextlib import contextmanager 32 | import numpy as np 33 | import numbers 34 | from six import string_types, u 35 | 36 | if sys.version_info[0] < 3: 37 | reload(sys) 38 | sys.setdefaultencoding("utf-8") 39 | # raise "Must be using Python 3" 40 | else: 41 | unicode = str 42 | 43 | import collections 44 | import warnings 45 | 46 | try: 47 | from html.entities import name2codepoint as n2cp 48 | except ImportError: 49 | from htmlentitydefs import name2codepoint as n2cp 50 | try: 51 | import cPickle as _pickle 52 | except ImportError: 53 | import pickle as _pickle 54 | 55 | 56 | try: 57 | from smart_open import smart_open 58 | except ImportError: 59 | print("smart_open library not found; falling back to local-filesystem-only") 60 | 61 | def make_closing(base, **attrs): 62 | """ 63 | Add support for `with Base(attrs) as fout:` to the base class if it's missing. 64 | The base class' `close()` method will be called on context exit, to always close the file properly. 65 | 66 | This is needed for gzip.GzipFile, bz2.BZ2File etc in older Pythons (<=2.6), which otherwise 67 | raise "AttributeError: GzipFile instance has no attribute '__exit__'". 68 | 69 | """ 70 | if not hasattr(base, '__enter__'): 71 | attrs['__enter__'] = lambda self: self 72 | if not hasattr(base, '__exit__'): 73 | attrs['__exit__'] = lambda self, type, value, traceback: self.close() 74 | return type('Closing' + base.__name__, (base, object), attrs) 75 | 76 | def smart_open(fname, mode='rb'): 77 | _, ext = os.path.splitext(fname) 78 | if ext == '.bz2': 79 | from bz2 import BZ2File 80 | return make_closing(BZ2File)(fname, mode) 81 | if ext == '.gz': 82 | from gzip import GzipFile 83 | return make_closing(GzipFile)(fname, mode) 84 | return open(fname, mode) 85 | 86 | 87 | PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) 88 | RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) 89 | 90 | 91 | def get_random_state(seed): 92 | """ 93 | Turn seed into a np.random.RandomState instance. 94 | Method originally from maciejkula/glove-python, and written by @joshloyal. 
95 | """ 96 | if seed is None or seed is np.random: 97 | return np.random.mtrand._rand 98 | if isinstance(seed, (numbers.Integral, np.integer)): 99 | return np.random.RandomState(seed) 100 | if isinstance(seed, np.random.RandomState): 101 | return seed 102 | raise ValueError( 103 | '%r cannot be used to seed a np.random.RandomState instance' % 104 | seed) 105 | 106 | 107 | class NoCM(object): 108 | def acquire(self): 109 | pass 110 | 111 | def release(self): 112 | pass 113 | 114 | def __enter__(self): 115 | pass 116 | 117 | def __exit__(self, type, value, traceback): 118 | pass 119 | 120 | 121 | nocm = NoCM() 122 | 123 | 124 | @contextmanager 125 | def file_or_filename(input): 126 | """ 127 | Return a file-like object ready to be read from the beginning. `input` is either 128 | a filename (gz/bz2 also supported) or a file-like object supporting seek. 129 | 130 | """ 131 | if isinstance(input, string_types): 132 | # input was a filename: open as file 133 | yield smart_open(input) 134 | else: 135 | # input already a file-like object; just reset to the beginning 136 | input.seek(0) 137 | yield input 138 | 139 | 140 | def deaccent(text): 141 | """ 142 | Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring. 143 | 144 | Return input string with accents removed, as unicode. 145 | 146 | >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek") 147 | u'Sef chomutovskych komunistu dostal postou bily prasek' 148 | 149 | """ 150 | if not isinstance(text, unicode): 151 | # assume utf8 for byte strings, use default (strict) error handling 152 | text = text.decode('utf8') 153 | norm = unicodedata.normalize("NFD", text) 154 | result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn') 155 | return unicodedata.normalize("NFC", result) 156 | 157 | 158 | def copytree_hardlink(source, dest): 159 | """ 160 | Recursively copy a directory ala shutils.copytree, but hardlink files 161 | instead of copying. Available on UNIX systems only. 162 | """ 163 | copy2 = shutil.copy2 164 | try: 165 | shutil.copy2 = os.link 166 | shutil.copytree(source, dest) 167 | finally: 168 | shutil.copy2 = copy2 169 | 170 | 171 | def tokenize( 172 | text, 173 | lowercase=False, 174 | deacc=False, 175 | encoding='utf8', 176 | errors="strict", 177 | to_lower=False, 178 | lower=False): 179 | """ 180 | Iteratively yield tokens as unicode strings, removing accent marks 181 | and optionally lowercasing the unidoce string by assigning True 182 | to one of the parameters, lowercase, to_lower, or lower. 183 | 184 | Input text may be either unicode or utf8-encoded byte string. 185 | 186 | The tokens on output are maximal contiguous sequences of alphabetic 187 | characters (no digits!). 188 | 189 | >>> list(tokenize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!', deacc = True)) 190 | [u'Nic', u'nemuze', u'letet', u'rychlosti', u'vyssi', u'nez', u'tisic', u'kilometru', u'za', u'sekundu'] 191 | 192 | """ 193 | lowercase = lowercase or to_lower or lower 194 | text = to_unicode(text, encoding, errors=errors) 195 | if lowercase: 196 | text = text.lower() 197 | if deacc: 198 | text = deaccent(text) 199 | return simple_tokenize(text) 200 | 201 | 202 | def simple_tokenize(text): 203 | for match in PAT_ALPHABETIC.finditer(text): 204 | yield match.group() 205 | 206 | 207 | def simple_preprocess(doc, deacc=False, min_len=2, max_len=15): 208 | """ 209 | Convert a document into a list of tokens. 
210 | 211 | This lowercases, tokenizes, de-accents (optional). -- the output are final 212 | tokens = unicode strings, that won't be processed any further. 213 | 214 | """ 215 | tokens = [ 216 | token for token in tokenize(doc, lower=True, deacc=deacc, errors='ignore') 217 | if min_len <= len(token) <= max_len and not token.startswith('_') 218 | ] 219 | return tokens 220 | 221 | 222 | def any2utf8(text, errors='strict', encoding='utf8'): 223 | """Convert a string (unicode or bytestring in `encoding`), to bytestring in utf8.""" 224 | if isinstance(text, unicode): 225 | return text.encode('utf8') 226 | # do bytestring -> unicode -> utf8 full circle, to ensure valid utf8 227 | return unicode(text, encoding, errors=errors).encode('utf8') 228 | 229 | 230 | to_utf8 = any2utf8 231 | 232 | 233 | def any2unicode(text, encoding='utf8', errors='strict'): 234 | """Convert a string (bytestring in `encoding` or unicode), to unicode.""" 235 | if isinstance(text, unicode): 236 | return text 237 | return unicode(text, encoding, errors=errors) 238 | 239 | 240 | to_unicode = any2unicode 241 | 242 | # cosine distance 243 | # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html 244 | from numpy import dot 245 | from numpy.linalg import norm 246 | cosine = lambda a, b: dot(a, b)/(norm(a)*norm(b)) 247 | 248 | def sigmoid(x): 249 | return 1.0 / (1.0 + np.exp(-x)) 250 | 251 | def call_on_class_only(*args, **kwargs): 252 | """Raise exception when load methods are called on instance""" 253 | raise AttributeError('This method should be called on a class object.') 254 | 255 | def is_digit(obj): 256 | ''' 257 | Check if an object is Number 258 | ''' 259 | return isinstance(obj, (numbers.Integral, numbers.Complex, numbers.Real)) 260 | 261 | def is_zhs(str): 262 | ''' 263 | Check if str is Chinese Word 264 | ''' 265 | for i in str: 266 | if not is_zh(i): 267 | return False 268 | return True 269 | 270 | def is_zh(ch): 271 | """return True if ch is Chinese character. 272 | full-width puncts/latins are not counted in. 273 | """ 274 | x = ord(ch) 275 | # CJK Radicals Supplement and Kangxi radicals 276 | if 0x2e80 <= x <= 0x2fef: 277 | return True 278 | # CJK Unified Ideographs Extension A 279 | elif 0x3400 <= x <= 0x4dbf: 280 | return True 281 | # CJK Unified Ideographs 282 | elif 0x4e00 <= x <= 0x9fbb: 283 | return True 284 | # CJK Compatibility Ideographs 285 | elif 0xf900 <= x <= 0xfad9: 286 | return True 287 | # CJK Unified Ideographs Extension B 288 | elif 0x20000 <= x <= 0x2a6df: 289 | return True 290 | else: 291 | return False 292 | 293 | def is_punct(ch): 294 | x = ord(ch) 295 | # in no-formal literals, space is used as punctuation sometimes. 
296 | if x < 127 and ascii.ispunct(x): 297 | return True 298 | # General Punctuation 299 | elif 0x2000 <= x <= 0x206f: 300 | return True 301 | # CJK Symbols and Punctuation 302 | elif 0x3000 <= x <= 0x303f: 303 | return True 304 | # Halfwidth and Fullwidth Forms 305 | elif 0xff00 <= x <= 0xffef: 306 | return True 307 | # CJK Compatibility Forms 308 | elif 0xfe30 <= x <= 0xfe4f: 309 | return True 310 | else: 311 | return False -------------------------------------------------------------------------------- /synonyms/word2vec.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Copyright (C) 2016 Radim Rehurek 5 | # Modifications (C) 2017 Hai Liang Wang 6 | # Licensed under the GNU LGPL v3.0 - http://www.gnu.org/licenses/lgpl.html 7 | # Author: Hai Liang Wang 8 | # Date: 2017-10-16:14:13:24 9 | # 10 | #========================================================================= 11 | 12 | from __future__ import print_function 13 | from __future__ import division 14 | 15 | __copyright__ = "Copyright (c) (2017-2023) Chatopera Inc. All Rights Reserved" 16 | __author__ = "Hai Liang Wang" 17 | __date__ = "2017-10-16:14:13:24" 18 | 19 | import os 20 | import sys 21 | curdir = os.path.dirname(os.path.abspath(__file__)) 22 | sys.path.append(curdir) 23 | 24 | if sys.version_info[0] < 3: 25 | reload(sys) 26 | sys.setdefaultencoding("utf-8") 27 | # raise "Must be using Python 3" 28 | else: 29 | xrange = range 30 | 31 | from .utils import smart_open, to_unicode, cosine 32 | from numpy import dot, zeros, dtype, float32 as REAL,\ 33 | double, array, vstack, fromstring, sqrt, newaxis,\ 34 | ndarray, sum as np_sum, prod, ascontiguousarray,\ 35 | argmax 36 | from sklearn.neighbors import KDTree 37 | 38 | class Vocab(object): 39 | """ 40 | A single vocabulary item, used internally for collecting per-word frequency/sampling info, 41 | and for constructing binary trees (incl. both word leaves and inner nodes). 42 | """ 43 | 44 | def __init__(self, **kwargs): 45 | self.count = 0 46 | self.__dict__.update(kwargs) 47 | 48 | def __lt__(self, other): # used for sorting in a priority queue 49 | return self.count < other.count 50 | 51 | def __str__(self): 52 | vals = [ 53 | '%s:%r' % 54 | (key, 55 | self.__dict__[key]) for key in sorted( 56 | self.__dict__) if not key.startswith('_')] 57 | return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) 58 | 59 | 60 | class KeyedVectors(): 61 | """ 62 | Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly 63 | involved in training such as most_similar() 64 | """ 65 | 66 | def __init__(self): 67 | self.syn0 = [] 68 | self.syn0norm = None 69 | self.vocab = {} 70 | self.index2word = [] 71 | self.vector_size = None 72 | self.kdt = None 73 | 74 | @property 75 | def wv(self): 76 | return self 77 | 78 | def save(self, *args, **kwargs): 79 | # don't bother storing the cached normalized vectors 80 | kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) 81 | super(KeyedVectors, self).save(*args, **kwargs) 82 | 83 | @classmethod 84 | def load_word2vec_format( 85 | cls, 86 | fname, 87 | fvocab=None, 88 | binary=False, 89 | encoding='utf8', 90 | unicode_errors='strict', 91 | limit=None, 92 | datatype=REAL): 93 | """ 94 | Load the input-hidden weight matrix from the original C word2vec-tool format. 
95 | Note that the information stored in the file is incomplete (the binary tree is missing), 96 | so while you can query for word similarity etc., you cannot continue training 97 | with a model loaded this way. 98 | `binary` is a boolean indicating whether the data is in binary word2vec format. 99 | `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory. 100 | Word counts are read from `fvocab` filename, if set (this is the file generated 101 | by `-save-vocab` flag of the original C tool). 102 | If you trained the C model using non-utf8 encoding for words, specify that 103 | encoding in `encoding`. 104 | `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors` 105 | argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source 106 | file may include word tokens truncated in the middle of a multibyte unicode character 107 | (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help. 108 | `limit` sets a maximum number of word-vectors to read from the file. The default, 109 | None, means read all. 110 | `datatype` (experimental) can coerce dimensions to a non-default float type (such 111 | as np.float16) to save memory. (Such types may result in much slower bulk operations 112 | or incompatibility with optimized routines.) 113 | """ 114 | counts = None 115 | if fvocab is not None: 116 | # print("loading word counts from %s" % fvocab) 117 | counts = {} 118 | with smart_open(fvocab) as fin: 119 | for line in fin: 120 | word, count = to_unicode(line).strip().split() 121 | counts[word] = int(count) 122 | 123 | # print("loading projection weights from %s" % fname) 124 | with smart_open(fname) as fin: 125 | header = to_unicode(fin.readline(), encoding=encoding) 126 | # throws for invalid file format 127 | vocab_size, vector_size = (int(x) for x in header.split()) 128 | if limit: 129 | vocab_size = min(vocab_size, limit) 130 | result = cls() 131 | result.vector_size = vector_size 132 | result.syn0 = zeros((vocab_size, vector_size), dtype=datatype) 133 | 134 | def add_word(word, weights): 135 | word_id = len(result.vocab) 136 | # print("word id: %d, word: %s, weights: %s" % (word_id, word, weights)) 137 | if word in result.vocab: 138 | # print( "duplicate word '%s' in %s, ignoring all but first" % (word, fname)) 139 | return 140 | if counts is None: 141 | # most common scenario: no vocab file given. just make up 142 | # some bogus counts, in descending order 143 | result.vocab[word] = Vocab( 144 | index=word_id, count=vocab_size - word_id) 145 | elif word in counts: 146 | # use count from the vocab file 147 | result.vocab[word] = Vocab( 148 | index=word_id, count=counts[word]) 149 | else: 150 | # vocab file given, but word is missing -- set count to 151 | # None (TODO: or raise?) 
152 | # print( "vocabulary file is incomplete: '%s' is missing" % word) 153 | result.vocab[word] = Vocab(index=word_id, count=None) 154 | result.syn0[word_id] = weights 155 | result.index2word.append(word) 156 | 157 | if binary: 158 | binary_len = dtype(REAL).itemsize * vector_size 159 | for _ in xrange(vocab_size): 160 | # mixed text and binary: read text first, then binary 161 | word = [] 162 | while True: 163 | ch = fin.read(1) 164 | if ch == b' ': 165 | break 166 | if ch == b'': 167 | raise EOFError( 168 | "unexpected end of input; is count incorrect or file otherwise damaged?") 169 | # ignore newlines in front of words (some binary files 170 | # have) 171 | if ch != b'\n': 172 | word.append(ch) 173 | word = to_unicode( 174 | b''.join(word), encoding=encoding, errors=unicode_errors) 175 | weights = fromstring(fin.read(binary_len), dtype=REAL) 176 | add_word(word, weights) 177 | else: 178 | for line_no in xrange(vocab_size): 179 | line = fin.readline() 180 | if line == b'': 181 | raise EOFError( 182 | "unexpected end of input; is count incorrect or file otherwise damaged?") 183 | parts = to_unicode( 184 | line.rstrip(), 185 | encoding=encoding, 186 | errors=unicode_errors).split(" ") 187 | if len(parts) != vector_size + 1: 188 | raise ValueError( 189 | "invalid vector on line %s (is this really the text format?)" % 190 | line_no) 191 | word, weights = parts[0], [REAL(x) for x in parts[1:]] 192 | add_word(word, weights) 193 | if result.syn0.shape[0] != len(result.vocab): 194 | # print( "duplicate words detected, shrinking matrix size from %i to %i" % (result.syn0.shape[0], len(result.vocab))) 195 | result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)]) 196 | assert (len(result.vocab), vector_size) == result.syn0.shape 197 | ''' 198 | KDTree 199 | Build KDTree with vectors. 200 | http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KDTree.html#sklearn.neighbors.KDTree 201 | ''' 202 | result.kdt = KDTree(result.syn0, leaf_size=10, metric = "euclidean") 203 | # print("loaded %s matrix from %s" % (result.syn0.shape, fname)) 204 | return result 205 | 206 | def word_vec(self, word, use_norm=False): 207 | """ 208 | Accept a single word as input. 209 | Returns the word's representations in vector space, as a 1D numpy array. 210 | If `use_norm` is True, returns the normalized word vector. 211 | Example:: 212 | >>> trained_model['office'] 213 | array([ -1.40128313e-02, ...]) 214 | """ 215 | if word in self.vocab: 216 | if use_norm: 217 | result = self.syn0norm[self.vocab[word].index] 218 | else: 219 | result = self.syn0[self.vocab[word].index] 220 | 221 | result.setflags(write=False) 222 | return result 223 | else: 224 | raise KeyError("word '%s' not in vocabulary" % word) 225 | 226 | def neighbours(self, word, size = 10): 227 | """ 228 | Get nearest words with KDTree, ranking by cosine distance 229 | """ 230 | word = word.strip() 231 | v = self.word_vec(word) 232 | [distances], [points] = self.kdt.query(array([v]), k = size, return_distance = True) 233 | assert len(distances) == len(points), "distances and points should be in same shape." 
234 | words, scores = [], {} 235 | for (x,y) in zip(points, distances): 236 | w = self.index2word[x] 237 | if w == word: s = 1.0 238 | else: s = cosine(v, self.syn0[x]) 239 | if s < 0: s = abs(s) 240 | words.append(w) 241 | scores[w] = min(s, 1.0) 242 | for x in sorted(words, key=scores.get, reverse=True): 243 | yield x, scores[x] 244 | 245 | import unittest 246 | 247 | # run testcase: python /Users/hain/tmp/ss Test.testExample 248 | 249 | 250 | class Test(unittest.TestCase): 251 | ''' 252 | 253 | ''' 254 | 255 | def setUp(self): 256 | pass 257 | 258 | def tearDown(self): 259 | pass 260 | 261 | def test_load_w2v_data(self): 262 | _fin_wv_path = os.path.join(curdir, 'data', 'words.vector') 263 | _fin_stopwords_path = os.path.join(curdir, 'data', 'stopwords.txt') 264 | kv = KeyedVectors() 265 | binary = True 266 | kv.load_word2vec_format( 267 | _fin_wv_path, 268 | binary=binary, 269 | unicode_errors='ignore') 270 | 271 | 272 | def test(): 273 | unittest.main() 274 | 275 | 276 | if __name__ == '__main__': 277 | test() 278 | --------------------------------------------------------------------------------
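Taken together, `synonyms/synonyms.py` exposes a small public API: `nearby`, `compare`, `seg`, `keywords`, `sv`, `bow`, `v`, `describe` and `display`. The snippet below is a minimal usage sketch of how these entry points are typically combined; it assumes that `synonyms/__init__.py` (whose contents are not shown above) re-exports these functions, that the `SYNONYMS_DL_LICENSE` environment variable is set so the word-vector model can be downloaded on first import, and that the example sentences are arbitrary placeholders.

```python
# Minimal usage sketch for the public API defined in synonyms/synonyms.py.
# Assumptions: synonyms/__init__.py re-exports these functions, and the
# SYNONYMS_DL_LICENSE environment variable is set so that _load_w2v can
# download the word-vector model on first import.
import synonyms

# nearby(word, size=10) -> ([words], [scores]); OOV input returns ([], [])
words, scores = synonyms.nearby("人脸", size=10)
print(words, scores)

# compare(s1, s2, seg=True, ignore=False, stopwords=False) -> score in [0.0, 1.0];
# seg=True segments the raw sentences with the bundled jieba tokenizer first
print(synonyms.compare("识别图像中的人脸", "检测照片里的人脸", seg=True))

# seg(sentence) -> (words, POS flags); keywords(sentence, topK=5) -> top keywords
print(synonyms.seg("中文近义词工具包"))
print(synonyms.keywords("中文近义词工具包,支持自然语言处理任务"))

# bow(words, ignore=False) -> a single BoW-style vector for an already-segmented sentence
sentence_vector = synonyms.bow(synonyms.seg("中文近义词工具包")[0])

# describe() prints and returns the vocab size, model path and package version
synonyms.describe()
```

As the implementation above shows, `compare` blends the cosine similarity of the two bag-of-words sentence vectors with a nearby-word-aware Levenshtein score through `_similarity_smooth`, short-circuits identical inputs to 1.0, and clamps the final score into [0, 1].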