├── xiangsi ├── __init__.py ├── stoptext_en.txt ├── func.py ├── main.py └── stoptext_zh.txt ├── setup.py ├── REFERENCE ├── LICENSE ├── README.md └── README_en.md /xiangsi/__init__.py: -------------------------------------------------------------------------------- 1 | from .func import Functions 2 | from .main import Calculator 3 | 4 | fun = Functions() 5 | cal = Calculator() 6 | 7 | lang = fun.lang 8 | weight = fun.weight 9 | construct = fun.construct 10 | update_stopwords = fun.update_stopwords 11 | cossim = cal.cossim 12 | minhash = cal.minhash 13 | simhash = cal.simhash 14 | jaccard = cal.jaccard 15 | 16 | __all__ = [ 17 | 'Calculator', 18 | ] 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="xiangsi", 8 | version="4.2.3", 9 | author="kiwirafe", 10 | author_email="kiwirafe@gmail.com", 11 | description="中文文本相似度计算器", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/kiwirafe/xiangsi", 15 | project_urls={"Github": "https://github.com/kiwirafe/xiangsi"}, 16 | packages=["xiangsi"], 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | ], 22 | install_requires=[ 23 | "jieba", 24 | ], 25 | package_data = { 26 | # If any package contains *.txt files, include them: 27 | "xiangsi": ["*.txt", "*.md", "xiangsi/*",], 28 | }, 29 | python_requires=">=3.4", 30 | ) 31 | -------------------------------------------------------------------------------- /REFERENCE: -------------------------------------------------------------------------------- 1 | Cossim: 2 | - https://zhuanlan.zhihu.com/p/43396514 3 | Simhash: 4 | - Charikar, Moses 
S. "Similarity estimation techniques from rounding algorithms." *Proceedings of the thirty-fourth annual ACM symposium on Theory of computing*. 2002. 5 | - Manku, Gurmeet Singh, Arvind Jain, and Anish Das Sarma. "Detecting near-duplicates for web crawling." *Proceedings of the 16th international conference on World Wide Web*. 2007. 6 | - Henzinger, Monika. "Finding near-duplicate web pages: a large-scale evaluation of algorithms." *Proceedings of the 29th annual international ACM SIGIR conference on Research and development in information retrieval*. 2006. 7 | - https://www.pinecone.io/learn/series/faiss/locality-sensitive-hashing-random-projection/ 8 | - https://www.youtube.com/watch?v=lRWINdZFAo0 9 | Minhash: 10 | - Broder, Andrei Z. "On the resemblance and containment of documents." Proceedings. Compression and Complexity of SEQUENCES 1997 (Cat. No. 97TB100171). IEEE, 1997. 11 | - https://www.youtube.com/watch?v=96WOGPUgMfw 12 | - https://github.com/duhaime/minhash 13 | Others: 14 | - https://github.com/fxsjy/jieba -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2020] [xiangshi] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /xiangsi/stoptext_en.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | a 55 | an 56 | the 57 | and 58 | but 59 | if 60 | or 61 | because 62 | as 63 | until 64 | while 65 | of 66 | at 67 | by 68 | for 69 | with 70 | about 71 | against 72 | between 73 | into 74 | through 75 | during 76 | before 77 | after 78 | above 79 | below 80 | to 81 | from 82 | up 83 | down 84 | in 85 | out 86 | on 87 | off 88 | over 89 | under 90 | again 91 | further 92 | then 93 | once 94 | here 95 | there 96 | when 97 | where 98 | why 99 | how 100 | all 101 | any 102 | both 103 | each 104 | few 105 | more 106 | most 107 | other 108 | some 109 | such 110 | no 111 | nor 112 | not 113 | only 114 | own 115 | same 116 | so 117 | than 118 | too 119 | very 120 | s 121 | t 122 | can 123 | will 124 | just 125 | don 126 | should 127 | now 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Xiangsi 2 | 3 | ### 中文文本相似度计算器 4 | 5 | ![Pypi Version](https://img.shields.io/pypi/v/xiangsi?label=version) 6 | ![Downloads](https://static.pepy.tech/badge/xiangshi) 7 | 8 | 简体中文 | **[English](README_en.md)** 9 | 10 | Xiangsi是一个计算文本相似度的Python包,并支持中文文本(即中文分词、停用词过滤等) 11 | Xiangsi提供4个传统相似度算法,分别是:余弦相似度,Simhash,Minhash以及Jaccard 12 | 13 | [在线计算文本相似度](https://kiwirafe.pythonanywhere.com/app/xiangsi) 14 | 15 | ## 下载与安装 16 | Pip安装: 17 | ```sh 18 | pip3 install xiangsi 19 | ``` 20 | 国内较慢的话可以使用清华镜像: 21 | ```sh 22 | pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple xiangsi 23 | ``` 24 | 25 | 26 | ## 使用方法 27 | ### 计算文本相似度 28 | Xiangsi会自动对文本进行中文分词处理,并过滤掉停用词。之后,Xiangsi会根据所选算法计算文本相似度。 29 | 30 | #### 余弦相似度 31 | ```python 32 | import xiangsi as xs 33 | xs.cossim("如何更换花呗绑定银行卡", "花呗更改绑定银行卡") 34 | ``` 35 | 36 | #### Simhash & Minhash & Jaccard相似度 37 | ```python 38 | import xiangsi as xs 39 | # Simhash 40 | xs.simhash("如何更换花呗绑定银行卡", "花呗更改绑定银行卡") 41 | # Minhash 42 | xs.minhash("如何更换花呗绑定银行卡", "花呗更改绑定银行卡") 43 | # Jaccard 44 | xs.jaccard("如何更换花呗绑定银行卡", "花呗更改绑定银行卡") 45 | ``` 46 | 47 | ### 其它加权方法 48 | 默认的加权方法是计算每个单词在文本中出现的频率。此外,还有另外两种加权方法可供选择。 49 | 50 | #### TFIDF 51 | ```python 52 | arg = [ 53 | "西班牙失业率创新高", 54 | "澳大利亚失业率高达5.1%", 55 | "花呗更改绑定银行卡", 56 | "我什么时候开通了花呗", 57 | "从这个角度来看, 我们一般认为,抓住了问题的关键,其他一切则会迎刃而解。" 58 | "从这个角度来看, 每个人都不得不面对这些问题。" 59 | 60 | ] 61 | xs.weight = "TFIDF" # 将加权方式设置为TFIDF 62 | xs.construct(arg) # 输入TFIDF文本,相同的文本只需调用这个函数一次 63 | 64 | xs.cossim("如何更换花呗绑定银行卡", "花呗更改绑定银行卡") 65 | xs.simhash("如何更换花呗绑定银行卡", "花呗更改绑定银行卡") 66 | ``` 67 | 68 | #### 没有加权 69 | ```python 70 | xs.weight = "None" # 将加权方式设置为None 71 | xs.cossim("如何更换花呗绑定银行卡", "花呗更改绑定银行卡") 72 | ``` 73 | 74 | ### 修改参数 75 | #### 修改默认值 76 | ```python 77 | import xiangsi as xs 78 | #计算Simhash时取前多少的TFIDF值。默认值为64 79 | xs.feature = 64 80 | 
#计算Minhash时算出多少个哈希值。默认值为16 81 | xs.HashNums = 16 82 | #计算Minhash时的最大哈希。默认值为4294967311 83 | xs.prime = 4294967311 84 | ``` 85 | 86 | #### 修改停用词 87 | 在v4.2.1之后,Textsim支持更改停用词: 88 | ```python 89 | import xiangsi as xs 90 | stopwords = ["你好", "世界"] 91 | xs.update_stopwords(stopwords) 92 | ``` 93 | 94 | 95 | ## 新版本 96 | v4.2.3: 从xiangshi改名为xiangsi 97 | v4.2.2: 解决路径问题 98 | v4.2.1: 支持英文文本相似度 99 | 100 | #### 注意: 101 | - v4.2.0+文本相似度的计算结果可能和v4.1.0不一样,因为v4.1.0加权方式不同。 102 | - v4.2.0+文本相似度的输入均为两个`string`,且**不与**v4.1.0反向兼容。 103 | - v4.2.0+不再支持文本聚类(如果还有人需要的话请联系我,我会另开一个包) 104 | 105 | 106 | ## 其他链接 107 | - 在线计算文本计算器: 108 | https://kiwirafe.com/xiangsi 109 | - PyPI: 110 | https://pypi.org/project/xiangsi/ 111 | - Github: 112 | https://github.com/kiwirafe/xiangsi 113 | - 下载数量: 114 | https://pepy.tech/project/xiangsi 115 | - Gitee(中国开源): 116 | https://gitee.com/kiwirafe/xiangsi 117 | - 关于算法的其他链接: 118 | https://github.com/kiwirafe/xiangsi/blob/master/REFERENCE 119 | -------------------------------------------------------------------------------- /README_en.md: -------------------------------------------------------------------------------- 1 | ## Xiangsi 2 | 3 | ### Text Similarity Calculator 4 | 5 | ![Pypi Version](https://img.shields.io/pypi/v/xiangsi?label=version) 6 | ![Downloads](https://static.pepy.tech/badge/xiangshi) 7 | 8 | **[简体中文](README.md)** | English 9 | 10 | Xiangsi is a Python package for calculating text similarity. It provides 4 algorithms: Cosine Similarity, Jaccard Similarity, Simhash and Minhash. 
11 | 12 | [Online Text Similarity Calculator](https://kiwirafe.pythonanywhere.com/app/xiangsi) 13 | 14 | ## Installation 15 | Pip install: 16 | ```sh 17 | pip3 install xiangsi 18 | ``` 19 | 20 | ## Usage 21 | ### Calculate Text Similarity 22 | #### Cosine 23 | ```python 24 | import xiangsi as xs 25 | xs.cossim("A mathematician found a solution to the problem.", "The problem was solved by a young mathematician.") 26 | ``` 27 | #### Simhash & Minhash & Jaccard 28 | ```python 29 | import xiangsi as xs 30 | # Simhash 31 | xs.simhash("A mathematician found a solution to the problem.", "The problem was solved by a young mathematician.") 32 | # Minhash 33 | xs.minhash("A mathematician found a solution to the problem.", "The problem was solved by a young mathematician.") 34 | # Jaccard 35 | xs.jaccard("A mathematician found a solution to the problem.", "The problem was solved by a young mathematician.") 36 | ``` 37 | 38 | ### Modify Weights 39 | #### Default weight (frequency of the words). 40 | ```python 41 | import xiangsi as xs 42 | xs.simhash("A mathematician found a solution to the problem.", "The problem was solved by a young mathematician.") 43 | ``` 44 | 45 | #### TFIDF 46 | For TF-IDF, first construct the IDF corpus. This calculates the IDF for all the strings inside the corpus. You only need to do this once for multiple calculations, given that you are using the same IDF corpus. 47 | 48 | ```python 49 | import xiangsi as xs 50 | 51 | arg = [ 52 | "There was a time in his life when her rudeness would have set him over the edge.", 53 | "He would have raised his voice and demanded to speak to the manager.", 54 | "That was no longer the case. He barely reacted at all, letting the rudeness melt away without saying a word back to her. ", 55 | "A mathematician found a solution to the problem." 56 | "The problem was solved by a young mathematician." 57 | ] # IDF 58 | xs.weight = "TFIDF" # Set weight method as TFIDF 59 | xs.construct(arg) # Constructs the IDF corpus. 
We only need to do this once. 60 | xs.cossim("A mathematician found a solution to the problem.", "The problem was solved by a young mathematician.") 61 | ``` 62 | 63 | #### No weight (all words have weight 1) 64 | ```python 65 | import xiangsi as xs 66 | 67 | xs.weight = "None" 68 | xs.cossim("A mathematician found a solution to the problem.", "The problem was solved by a young mathematician.") 69 | ``` 70 | 71 | ### Modify Default Variables 72 | ```python 73 | import xiangsi as xs 74 | xs.feature = 64 75 | # The first TFIDF values used when calculating Simhash. The default value is 64 76 | xs.HashNums = 16 77 | # Calculate the number of hash values ​​when calculating Minhash. The default value is 16 78 | xs.prime = 4294967311 79 | # Calculate the maximum hash when calculating Minhash. The default value is 4294967311 80 | ``` 81 | 82 | ## Other Links: 83 | - PyPI: 84 | https://pypi.org/project/xiangsi/ 85 | - Github: 86 | https://github.com/kiwirafe/xiangsi 87 | - PyPI Downloads: 88 | https://pepy.tech/project/xiangsi 89 | -------------------------------------------------------------------------------- /xiangsi/func.py: -------------------------------------------------------------------------------- 1 | import jieba 2 | import math 3 | import random 4 | import pickle 5 | import string 6 | import os 7 | import re 8 | 9 | class Functions(object): 10 | def __init__(self): 11 | self.abs_path = os.path.dirname(os.path.abspath(__file__)) 12 | self.lang = None 13 | self.weight = "Default" 14 | 15 | # Algorithm: h(x) = (a*x + b) % c 16 | def HashAlg(self, k): 17 | MaxHash = 2**32 - 1 18 | # Create a list of 'k' random values. 19 | RandomList = [] 20 | 21 | while k > 0: 22 | random.seed(k) 23 | # Get a random shingle ID. 
24 | RandIndex = random.randint(0, MaxHash) 25 | 26 | # Make sure the hash is unique 27 | while RandIndex in RandomList: 28 | RandIndex = random.randint(0, MaxHash) 29 | 30 | # Append the value 31 | RandomList.append(RandIndex) 32 | k = k - 1 33 | 34 | return RandomList 35 | 36 | 37 | def update_stopwords(self, stopwords, lang="zh"): 38 | with open(f"xiangsi/stopword_{lang}.txt", 'w') as f: 39 | for word in stopwords: 40 | f.write(f"{word}\n") 41 | 42 | 43 | def segment_zh(self, corpus): 44 | path = os.path.join(self.abs_path, "stoptext_zh.txt") 45 | StopWords = [line.strip() for line in open(path, encoding='utf-8').readlines()] 46 | 47 | WordCut = jieba.lcut(corpus) 48 | 49 | output = [] 50 | for word in WordCut: 51 | word = word.lower() 52 | if word not in StopWords and word != '\n' and word != ' ': 53 | output.append(word) 54 | 55 | return output 56 | 57 | def segment_en(self, corpus): 58 | path = os.path.join(self.abs_path, "stoptext_en.txt") 59 | StopWords = [line.strip() for line in open(path, encoding='utf-8').readlines()] 60 | 61 | WordCut = [word.strip(string.punctuation) for word in corpus.split()] 62 | 63 | output = [] 64 | for word in WordCut: 65 | word = word.lower() 66 | if word not in StopWords and word != '\n' and word != ' ': 67 | output.append(word) 68 | 69 | return output 70 | 71 | 72 | def segment(self, corpus): 73 | if re.search(r'[\u4e00-\u9fff]', corpus) or self.lang == "zh": 74 | return self.segment_zh(corpus) 75 | else: 76 | return self.segment_en(corpus) 77 | 78 | 79 | def construct(self, data): 80 | segmented = [] 81 | for d in data: 82 | WordCut = self.segment(d) 83 | segmented.append(WordCut) 84 | 85 | path = os.path.join(self.abs_path, "cache.pickle") 86 | with open(path, "wb") as handle: 87 | pickle.dump(segmented, handle, protocol=pickle.HIGHEST_PROTOCOL) 88 | 89 | 90 | def GetTF(self, corpus): 91 | tf = {} 92 | total = len(corpus) 93 | for word in corpus: 94 | tf[word] = corpus.count(word) / total 95 | 96 | return tf 97 | 98 | 99 | 
def GetIDF(self, corpus, data): 100 | freq = dict.fromkeys(corpus, 0) 101 | 102 | idf = {} 103 | total = len(data) 104 | 105 | for word in corpus: 106 | for d in data: 107 | if word in d: 108 | freq[word] += 1 109 | 110 | for word in freq: 111 | idf[word] = math.log10(total / (freq[word] + 1)) 112 | return idf 113 | 114 | 115 | def GetTFIDF(self, input): 116 | path = os.path.join(self.abs_path, "cache.pickle") 117 | 118 | tf = self.GetTF(input) 119 | idf = self.GetIDF(input, pickle.load(open(path, 'rb'))) 120 | 121 | result = {} 122 | for key, value in tf.items(): 123 | result[key] = value * idf[key] 124 | 125 | return result 126 | 127 | 128 | def GetWeight(self, input): 129 | input = self.segment(input) 130 | if self.weight == "TFIDF": 131 | return self.GetTFIDF(input) 132 | elif self.weight == "None": 133 | weight = {} 134 | for word in input: 135 | weight[word] = 1 136 | return weight 137 | else: 138 | weight = {} 139 | for word in input: 140 | weight[word] = input.count(word) 141 | return weight 142 | -------------------------------------------------------------------------------- /xiangsi/main.py: -------------------------------------------------------------------------------- 1 | import math 2 | import binascii 3 | import hashlib 4 | from . 
import func 5 | 6 | funcs = func.Functions() 7 | 8 | class Calculator(object): 9 | def __init__(self): 10 | self.feature = 64 11 | self.HashNums = 16 12 | self.prime = 4294967311 13 | 14 | 15 | def cossim(self, input1, input2): 16 | result1 = funcs.GetWeight(input1) 17 | result2 = funcs.GetWeight(input2) 18 | 19 | WordSet = list(set(result1.keys()).union(set(result2.keys()))) 20 | DotProduct = 0 21 | sq1 = 0 22 | sq2 = 0 23 | 24 | for word in WordSet: 25 | # Get vector value of both documents 26 | vector1 = result1[word] if word in result1 else 0 27 | vector2 = result2[word] if word in result2 else 0 28 | 29 | # Calculate Cosine Similarity for this dimension 30 | DotProduct += vector1 * vector2 31 | sq1 += pow(vector1, 2) 32 | sq2 += pow(vector2, 2) 33 | 34 | try: 35 | FinalResult = DotProduct / (math.sqrt(sq1) * math.sqrt(sq2)) 36 | except ZeroDivisionError: 37 | FinalResult = 0.0 38 | 39 | return FinalResult 40 | 41 | 42 | def jaccard(self, input1, input2): 43 | result1 = funcs.GetWeight(input1) 44 | result2 = funcs.GetWeight(input2) 45 | 46 | WordSet = list(set(result1.keys()).union(set(result2.keys()))) 47 | TopSum = 0 48 | BottomSum = 0 49 | 50 | for word in WordSet: 51 | vector1 = result1[word] if word in result1 else 0 52 | vector2 = result2[word] if word in result2 else 0 53 | 54 | TopSum += min(vector1, vector2) 55 | BottomSum += max(vector1, vector2) 56 | 57 | try: 58 | FinalResult = TopSum / BottomSum 59 | except ZeroDivisionError: 60 | FinalResult = 0.0 61 | 62 | return FinalResult 63 | 64 | 65 | def simhash(self, input1, input2): 66 | result1 = funcs.GetWeight(input1) 67 | result2 = funcs.GetWeight(input2) 68 | 69 | tfidf1 = {k: result1[k] for k in sorted(result1, key=result1.get, reverse=True)[:self.feature]} 70 | tfidf2 = {k: result2[k] for k in sorted(result2, key=result2.get, reverse=True)[:self.feature]} 71 | 72 | product1 = [0] * self.feature 73 | product2 = [0] * self.feature 74 | 75 | for key, value in tfidf1.items(): 76 | hashed = 
hashlib.md5(key.encode('utf-8')).hexdigest() 77 | vector = bin(int(hashed, 16))[-self.feature:] 78 | 79 | for i, x in enumerate(vector): 80 | product1[i] += value if (x == '1') else (value * -1) 81 | 82 | for key, value in tfidf2.items(): 83 | hashed = hashlib.md5(key.encode('utf-8')).hexdigest() 84 | vector = bin(int(hashed, 16))[-self.feature:] 85 | 86 | for i, x in enumerate(vector): 87 | product2[i] += value if (x == '1') else (value * -1) 88 | 89 | 90 | fprint1 = "" 91 | fprint2 = "" 92 | for i in range(self.feature): 93 | fprint1 += '1' if product1[i] >= 0 else '0' 94 | fprint2 += '1' if product2[i] >= 0 else '0' 95 | 96 | FinalResult = sum(pos1 == pos2 for pos1, pos2 in zip(fprint1, fprint2)) / self.feature 97 | return FinalResult 98 | 99 | 100 | def minhash(self, input1, input2): 101 | result = funcs.GetWeight(input1) 102 | result2 = funcs.GetWeight(input2) 103 | 104 | coeff1 = funcs.HashAlg(self.HashNums) 105 | coeff2 = funcs.HashAlg(self.HashNums) 106 | 107 | signature = {} 108 | signature2 = {} 109 | 110 | MinhashNum = self.prime 111 | MinhashNum2 = self.prime 112 | for i in range(0, self.HashNums): 113 | for x in result.keys(): 114 | crc = binascii.crc32(x.encode('utf-8')) & 0xffffffff 115 | # Generating Hash 116 | HashCode = (coeff1[i] * crc + coeff2[i]) % self.prime 117 | # Track the lowest hash code seen. 
118 | if HashCode < MinhashNum: 119 | MinhashNum = HashCode 120 | 121 | if MinhashNum not in signature: 122 | signature[MinhashNum] = result[x] 123 | 124 | for y in result2.keys(): 125 | crc = binascii.crc32(y.encode('utf-8')) & 0xffffffff 126 | HashCode2 = (coeff1[i] * crc + coeff2[i]) % self.prime 127 | if HashCode2 < MinhashNum2: 128 | MinhashNum2 = HashCode2 129 | 130 | if MinhashNum2 not in signature2: 131 | signature2[MinhashNum2] = result2[y] 132 | 133 | intersect = 0 134 | total = 0 135 | for x, y in signature.items(): 136 | if x in signature2: 137 | intersect += y 138 | total += y 139 | 140 | try: 141 | return intersect / total 142 | except ZeroDivisionError: 143 | return 0 144 | 145 | -------------------------------------------------------------------------------- /xiangsi/stoptext_zh.txt: -------------------------------------------------------------------------------- 1 | $ 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | ? 13 | _ 14 | “ 15 | ” 16 | 、 17 | 。 18 | 《 19 | 》 20 | 一 21 | 一些 22 | 一何 23 | 一切 24 | 一则 25 | 一方面 26 | 一旦 27 | 一来 28 | 一样 29 | 一般 30 | 一转眼 31 | 万一 32 | 上 33 | 上下 34 | 下 35 | 不 36 | 不仅 37 | 不但 38 | 不光 39 | 不单 40 | 不只 41 | 不外乎 42 | 不如 43 | 不妨 44 | 不尽 45 | 不尽然 46 | 不得 47 | 不怕 48 | 不惟 49 | 不成 50 | 不拘 51 | 不料 52 | 不是 53 | 不比 54 | 不然 55 | 不特 56 | 不独 57 | 不管 58 | 不至于 59 | 不若 60 | 不论 61 | 不过 62 | 不问 63 | 与 64 | 与其 65 | 与其说 66 | 与否 67 | 与此同时 68 | 且 69 | 且不说 70 | 且说 71 | 两者 72 | 个 73 | 个别 74 | 临 75 | 为 76 | 为了 77 | 为什么 78 | 为何 79 | 为止 80 | 为此 81 | 为着 82 | 乃 83 | 乃至 84 | 乃至于 85 | 么 86 | 之 87 | 之一 88 | 之所以 89 | 之类 90 | 乌乎 91 | 乎 92 | 乘 93 | 也 94 | 也好 95 | 也罢 96 | 了 97 | 二来 98 | 于 99 | 于是 100 | 于是乎 101 | 云云 102 | 云尔 103 | 些 104 | 亦 105 | 人 106 | 人们 107 | 人家 108 | 什么 109 | 什么样 110 | 今 111 | 介于 112 | 仍 113 | 仍旧 114 | 从 115 | 从此 116 | 从而 117 | 他 118 | 他人 119 | 他们 120 | 以 121 | 以上 122 | 以为 123 | 以便 124 | 以免 125 | 以及 126 | 以故 127 | 以期 128 | 以来 129 | 以至 130 | 以至于 131 | 以致 132 | 们 133 | 任 134 | 任何 135 | 任凭 136 | 似的 137 | 但 138 | 但凡 139 | 
但是 140 | 何 141 | 何以 142 | 何况 143 | 何处 144 | 何时 145 | 余外 146 | 作为 147 | 你 148 | 你们 149 | 使 150 | 使得 151 | 例如 152 | 依 153 | 依据 154 | 依照 155 | 便于 156 | 俺 157 | 俺们 158 | 倘 159 | 倘使 160 | 倘或 161 | 倘然 162 | 倘若 163 | 借 164 | 假使 165 | 假如 166 | 假若 167 | 傥然 168 | 像 169 | 儿 170 | 先不先 171 | 光是 172 | 全体 173 | 全部 174 | 兮 175 | 关于 176 | 其 177 | 其一 178 | 其中 179 | 其二 180 | 其他 181 | 其余 182 | 其它 183 | 其次 184 | 具体地说 185 | 具体说来 186 | 兼之 187 | 内 188 | 再 189 | 再其次 190 | 再则 191 | 再有 192 | 再者 193 | 再者说 194 | 再说 195 | 冒 196 | 冲 197 | 况且 198 | 几 199 | 几时 200 | 凡 201 | 凡是 202 | 凭 203 | 凭借 204 | 出于 205 | 出来 206 | 分别 207 | 则 208 | 则甚 209 | 别 210 | 别人 211 | 别处 212 | 别是 213 | 别的 214 | 别管 215 | 别说 216 | 到 217 | 前后 218 | 前此 219 | 前者 220 | 加之 221 | 加以 222 | 即 223 | 即令 224 | 即使 225 | 即便 226 | 即如 227 | 即或 228 | 即若 229 | 却 230 | 去 231 | 又 232 | 又及 233 | 及 234 | 及其 235 | 及至 236 | 反之 237 | 反而 238 | 反过来 239 | 反过来说 240 | 受到 241 | 另 242 | 另一方面 243 | 另外 244 | 另悉 245 | 只 246 | 只当 247 | 只怕 248 | 只是 249 | 只有 250 | 只消 251 | 只要 252 | 只限 253 | 叫 254 | 叮咚 255 | 可 256 | 可以 257 | 可是 258 | 可见 259 | 各 260 | 各个 261 | 各位 262 | 各种 263 | 各自 264 | 同 265 | 同时 266 | 后 267 | 后者 268 | 向 269 | 向使 270 | 向着 271 | 吓 272 | 吗 273 | 否则 274 | 吧 275 | 吧哒 276 | 吱 277 | 呀 278 | 呃 279 | 呕 280 | 呗 281 | 呜 282 | 呜呼 283 | 呢 284 | 呵 285 | 呵呵 286 | 呸 287 | 呼哧 288 | 咋 289 | 和 290 | 咚 291 | 咦 292 | 咧 293 | 咱 294 | 咱们 295 | 咳 296 | 哇 297 | 哈 298 | 哈哈 299 | 哉 300 | 哎 301 | 哎呀 302 | 哎哟 303 | 哗 304 | 哟 305 | 哦 306 | 哩 307 | 哪 308 | 哪个 309 | 哪些 310 | 哪儿 311 | 哪天 312 | 哪年 313 | 哪怕 314 | 哪样 315 | 哪边 316 | 哪里 317 | 哼 318 | 哼唷 319 | 唉 320 | 唯有 321 | 啊 322 | 啐 323 | 啥 324 | 啦 325 | 啪达 326 | 啷当 327 | 喂 328 | 喏 329 | 喔唷 330 | 喽 331 | 嗡 332 | 嗡嗡 333 | 嗬 334 | 嗯 335 | 嗳 336 | 嘎 337 | 嘎登 338 | 嘘 339 | 嘛 340 | 嘻 341 | 嘿 342 | 嘿嘿 343 | 因 344 | 因为 345 | 因了 346 | 因此 347 | 因着 348 | 因而 349 | 固然 350 | 在 351 | 在下 352 | 在于 353 | 地 354 | 基于 355 | 处在 356 | 多 357 | 多么 358 | 多少 359 | 大 360 | 大家 361 | 她 362 | 她们 363 | 好 364 | 如 365 | 如上 366 | 如上所述 367 | 如下 368 | 如何 369 | 如其 
370 | 如同 371 | 如是 372 | 如果 373 | 如此 374 | 如若 375 | 始而 376 | 孰料 377 | 孰知 378 | 宁 379 | 宁可 380 | 宁愿 381 | 宁肯 382 | 它 383 | 它们 384 | 对 385 | 对于 386 | 对待 387 | 对方 388 | 对比 389 | 将 390 | 小 391 | 尔 392 | 尔后 393 | 尔尔 394 | 尚且 395 | 就 396 | 就是 397 | 就是了 398 | 就是说 399 | 就算 400 | 就要 401 | 尽 402 | 尽管 403 | 尽管如此 404 | 岂但 405 | 己 406 | 已 407 | 已矣 408 | 巴 409 | 巴巴 410 | 并 411 | 并且 412 | 并非 413 | 庶乎 414 | 庶几 415 | 开外 416 | 开始 417 | 归 418 | 归齐 419 | 当 420 | 当地 421 | 当然 422 | 当着 423 | 彼 424 | 彼时 425 | 彼此 426 | 往 427 | 待 428 | 很 429 | 得 430 | 得了 431 | 怎 432 | 怎么 433 | 怎么办 434 | 怎么样 435 | 怎奈 436 | 怎样 437 | 总之 438 | 总的来看 439 | 总的来说 440 | 总的说来 441 | 总而言之 442 | 恰恰相反 443 | 您 444 | 惟其 445 | 慢说 446 | 我 447 | 我们 448 | 或 449 | 或则 450 | 或是 451 | 或曰 452 | 或者 453 | 截至 454 | 所 455 | 所以 456 | 所在 457 | 所幸 458 | 所有 459 | 才 460 | 才能 461 | 打 462 | 打从 463 | 把 464 | 抑或 465 | 拿 466 | 按 467 | 按照 468 | 换句话说 469 | 换言之 470 | 据 471 | 据此 472 | 接着 473 | 故 474 | 故此 475 | 故而 476 | 旁人 477 | 无 478 | 无宁 479 | 无论 480 | 既 481 | 既往 482 | 既是 483 | 既然 484 | 时候 485 | 是 486 | 是以 487 | 是的 488 | 曾 489 | 替 490 | 替代 491 | 最 492 | 有 493 | 有些 494 | 有关 495 | 有及 496 | 有时 497 | 有的 498 | 望 499 | 朝 500 | 朝着 501 | 本 502 | 本人 503 | 本地 504 | 本着 505 | 本身 506 | 来 507 | 来着 508 | 来自 509 | 来说 510 | 极了 511 | 果然 512 | 果真 513 | 某 514 | 某个 515 | 某些 516 | 某某 517 | 根据 518 | 欤 519 | 正值 520 | 正如 521 | 正巧 522 | 正是 523 | 此 524 | 此地 525 | 此处 526 | 此外 527 | 此时 528 | 此次 529 | 此间 530 | 毋宁 531 | 每 532 | 每当 533 | 比 534 | 比及 535 | 比如 536 | 比方 537 | 没奈何 538 | 沿 539 | 沿着 540 | 漫说 541 | 焉 542 | 然则 543 | 然后 544 | 然而 545 | 照 546 | 照着 547 | 犹且 548 | 犹自 549 | 甚且 550 | 甚么 551 | 甚或 552 | 甚而 553 | 甚至 554 | 甚至于 555 | 用 556 | 用来 557 | 由 558 | 由于 559 | 由是 560 | 由此 561 | 由此可见 562 | 的 563 | 的确 564 | 的话 565 | 直到 566 | 相对而言 567 | 省得 568 | 看 569 | 眨眼 570 | 着 571 | 着呢 572 | 矣 573 | 矣乎 574 | 矣哉 575 | 离 576 | 竟而 577 | 第 578 | 等 579 | 等到 580 | 等等 581 | 简言之 582 | 管 583 | 类如 584 | 紧接着 585 | 纵 586 | 纵令 587 | 纵使 588 | 纵然 589 | 经 590 | 经过 591 | 结果 592 | 给 593 | 继之 594 | 继后 595 | 继而 
596 | 综上所述 597 | 罢了 598 | 者 599 | 而 600 | 而且 601 | 而况 602 | 而后 603 | 而外 604 | 而已 605 | 而是 606 | 而言 607 | 能 608 | 能否 609 | 腾 610 | 自 611 | 自个儿 612 | 自从 613 | 自各儿 614 | 自后 615 | 自家 616 | 自己 617 | 自打 618 | 自身 619 | 至 620 | 至于 621 | 至今 622 | 至若 623 | 致 624 | 般的 625 | 若 626 | 若夫 627 | 若是 628 | 若果 629 | 若非 630 | 莫不然 631 | 莫如 632 | 莫若 633 | 虽 634 | 虽则 635 | 虽然 636 | 虽说 637 | 被 638 | 要 639 | 要不 640 | 要不是 641 | 要不然 642 | 要么 643 | 要是 644 | 譬喻 645 | 譬如 646 | 让 647 | 许多 648 | 论 649 | 设使 650 | 设或 651 | 设若 652 | 诚如 653 | 诚然 654 | 该 655 | 说来 656 | 诸 657 | 诸位 658 | 诸如 659 | 谁 660 | 谁人 661 | 谁料 662 | 谁知 663 | 贼死 664 | 赖以 665 | 赶 666 | 起 667 | 起见 668 | 趁 669 | 趁着 670 | 越是 671 | 距 672 | 跟 673 | 较 674 | 较之 675 | 边 676 | 过 677 | 还 678 | 还是 679 | 还有 680 | 还要 681 | 这 682 | 这一来 683 | 这个 684 | 这么 685 | 这么些 686 | 这么样 687 | 这么点儿 688 | 这些 689 | 这会儿 690 | 这儿 691 | 这就是说 692 | 这时 693 | 这样 694 | 这次 695 | 这般 696 | 这边 697 | 这里 698 | 进而 699 | 连 700 | 连同 701 | 逐步 702 | 通过 703 | 遵循 704 | 遵照 705 | 那 706 | 那个 707 | 那么 708 | 那么些 709 | 那么样 710 | 那些 711 | 那会儿 712 | 那儿 713 | 那时 714 | 那样 715 | 那般 716 | 那边 717 | 那里 718 | 都 719 | 鄙人 720 | 鉴于 721 | 针对 722 | 阿 723 | 除 724 | 除了 725 | 除外 726 | 除开 727 | 除此之外 728 | 除非 729 | 随 730 | 随后 731 | 随时 732 | 随着 733 | 难道说 734 | 非但 735 | 非徒 736 | 非特 737 | 非独 738 | 靠 739 | 顺 740 | 顺着 741 | 首先 742 | ! 743 | , 744 | : 745 | ; 746 | ? --------------------------------------------------------------------------------