├── .gitignore
├── README.md
├── src
├── test
│ └── java
│ │ └── com
│ │ └── lou
│ │ └── simhasher
│ │ └── SimhashTest.java
└── main
│ ├── java
│ └── com
│ │ └── lou
│ │ └── simhasher
│ │ ├── util
│ │ ├── DicReader.java
│ │ └── FNVHash.java
│ │ ├── seg
│ │ └── WordsSegment.java
│ │ ├── SimHasher.java
│ │ └── KeywordExtractor.java
│ └── resources
│ └── dict
│ └── stop_words.utf8
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | /target/
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # simhash
2 | 高效的文本相似度去重算法实现
3 |
4 | ## simhash是什么
5 | Google发明的文本去重算法,适合于大批量文档的相似度计算
6 | * [博客介绍](http://grunt1223.iteye.com/blog/964564)
7 |
8 | ## 主要步骤
9 | * 对文本分词,得到N维特征向量(默认为64维)
10 | * 为分词设置权重(tf-idf)
11 | * 为特征向量计算哈希
12 | * 对所有特征向量加权,累加(权重为词频tf乘以idf,idf词典中没有的词使用平均idf)
13 | * 对累加结果,大于零置一,小于零置零
14 | * 得到文本指纹(fingerprint)
15 |
--------------------------------------------------------------------------------
/src/test/java/com/lou/simhasher/SimhashTest.java:
--------------------------------------------------------------------------------
1 | package com.lou.simhasher;
2 |
3 | import java.io.FileInputStream;
4 | import java.io.IOException;
5 |
6 | import org.apache.commons.io.IOUtils;
7 | import org.junit.Test;
8 |
9 | /**
10 | * @author louxuezheng@hotmail.com
11 | */
12 | public class SimhashTest {
13 |
14 | @Test
15 | public void testDistance(){
16 | String str1 = readAllFile("D:/test/testin2.txt");
17 | SimHasher hash1 = new SimHasher(str1);
18 | System.out.println(hash1.getSignature());
19 | System.out.println("============================");
20 |
21 | String str2 = readAllFile("D:/test/testin.txt");
22 | SimHasher hash2 = new SimHasher(str2);
23 | System.out.println(hash2.getSignature());
24 | System.out.println("============================");
25 |
26 | System.out.println(hash1.getHammingDistance(hash2.getSignature()));
27 |
28 | }
29 |
30 | /**
31 | * 测试用
32 | * @param filename 名字
33 | * @return
34 | */
35 | public static String readAllFile(String filename) {
36 | String everything = "";
37 | try {
38 | FileInputStream inputStream = new FileInputStream(filename);
39 | everything = IOUtils.toString(inputStream);
40 | inputStream.close();
41 | } catch (IOException e) {
42 | }
43 |
44 | return everything;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/java/com/lou/simhasher/util/DicReader.java:
--------------------------------------------------------------------------------
1 | package com.lou.simhasher.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.InputStream;
5 | import java.io.InputStreamReader;
6 | import java.io.UnsupportedEncodingException;
7 |
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 | import com.lou.simhasher.KeywordExtractor;
12 |
13 | /**
14 | * 文档读入工具
15 | *
16 | * @author louxuezheng@hotmail.com
17 | */
18 | public final class DicReader {
19 |
20 | private static final Logger logger = LoggerFactory.getLogger(KeywordExtractor.class);
21 |
22 | private DicReader() {
23 | }
24 |
25 | /**
26 | * 返回BufferedReader
27 | *
28 | * @param name 文件名
29 | * @return
30 | */
31 | public static BufferedReader getReader(String name) {
32 | InputStream in = DicReader.class.getResourceAsStream("/" + name);
33 | try {
34 | return new BufferedReader(new InputStreamReader(in, "UTF-8"));
35 | } catch (UnsupportedEncodingException e) {
36 | logger.error("编码格式不支持:" + e.getMessage());
37 | }
38 | return null;
39 | }
40 |
41 | /**
42 | * 返回InputStream
43 | *
44 | * @param name 文件名
45 | * @return
46 | */
47 | public static InputStream getInputStream(String name) {
48 | InputStream in = DicReader.class.getResourceAsStream("/" + name);
49 | return in;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.lou
6 | simhasher
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | simhasher
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 |
16 |
17 |
18 |
19 | org.apache.lucene
20 | lucene-core
21 | 3.6.1
22 |
23 |
24 |
25 | com.janeluo
26 | ikanalyzer
27 | 2012_u6
28 |
29 |
30 | commons-io
31 | commons-io
32 | 2.4
33 |
34 |
35 | org.slf4j
36 | jcl-over-slf4j
37 | 1.6.4
38 |
39 |
40 | org.slf4j
41 | log4j-over-slf4j
42 | 1.6.3
43 |
44 |
45 | junit
46 | junit
47 | 4.11
48 | test
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/src/main/java/com/lou/simhasher/seg/WordsSegment.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2011-2020 Panguso, Inc.
3 | * All rights reserved.
4 | *
5 | * This software is the confidential and proprietary information of Panguso,
6 | * Inc. ("Confidential Information"). You shall not
7 | * disclose such Confidential Information and shall use it only in
8 | * accordance with the terms of the license agreement you entered into with Panguso.
9 | */
10 | package com.lou.simhasher.seg;
11 |
12 | import java.io.IOException;
13 | import java.io.Reader;
14 | import java.io.StringReader;
15 | import java.util.ArrayList;
16 | import java.util.List;
17 |
18 | import org.apache.lucene.analysis.Analyzer;
19 | import org.apache.lucene.analysis.TokenStream;
20 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21 | import org.slf4j.Logger;
22 | import org.slf4j.LoggerFactory;
23 | import org.wltea.analyzer.lucene.IKAnalyzer;
24 |
25 | /**
26 | * 文档分词
27 | *
28 | * @author louxuezheng@hotmail.com
29 | */
30 | public final class WordsSegment {
31 | private static final Logger logger = LoggerFactory.getLogger(WordsSegment.class);
32 |
33 | private WordsSegment() {
34 | }
35 |
36 | /**
37 | * 分词
38 | *
39 | * @param str 字符串
40 | * @return
41 | */
42 | public static List getCutWords(String str) {
43 | Analyzer analyzer = new IKAnalyzer();
44 | Reader r = new StringReader(str);
45 | TokenStream ts = analyzer.tokenStream("searchValue", r);
46 | ts.addAttribute(CharTermAttribute.class);
47 |
48 | List list = new ArrayList();
49 | try {
50 | while (ts.incrementToken()) {
51 | CharTermAttribute ta = ts.getAttribute(CharTermAttribute.class);
52 | String word = ta.toString();
53 | list.add(word);
54 | }
55 | } catch (IOException e) {
56 | logger.error("分词IO错误:" + e.getMessage());
57 | }
58 | return list;
59 | }
60 | }
--------------------------------------------------------------------------------
/src/main/java/com/lou/simhasher/util/FNVHash.java:
--------------------------------------------------------------------------------
1 | package com.lou.simhasher.util;
2 |
3 | import java.math.BigInteger;
4 |
/**
 * 64-bit FNV-1 and FNV-1a hashing over the UTF-16 code units of a string,
 * plus a positional distance helper for equal-length strings.
 *
 * <p>Note that the hash is fed one {@code char} at a time rather than one
 * byte at a time, so results match the byte-oriented FNV definition only for
 * input whose characters are all below 256.
 *
 * @author louxuezheng@hotmail.com
 */
public final class FNVHash {

    /** Width of the produced hash in bits. */
    public static final int HASH_BITS = 64;
    /** FNV 64-bit offset basis. */
    public static final BigInteger FNV_64_INIT = new BigInteger("14695981039346656037");
    /** FNV 64-bit prime. */
    public static final BigInteger FNV_64_PRIME = new BigInteger("1099511628211");
    /** Mask with the low {@link #HASH_BITS} bits set (2^64 - 1). */
    public static final BigInteger MASK_64 = BigInteger.ONE.shiftLeft(HASH_BITS).subtract(BigInteger.ONE);

    // Same constants as raw 64-bit patterns. Java long arithmetic wraps
    // modulo 2^64, which is exactly the reduction the hash needs, so the loop
    // can run on primitives instead of ever-growing BigInteger values.
    private static final long INIT_BITS = FNV_64_INIT.longValue();
    private static final long PRIME_BITS = FNV_64_PRIME.longValue();

    private FNVHash() {
    }

    /**
     * Computes the 64-bit FNV-1 hash (multiply, then xor) of a string.
     *
     * @param str input string; must not be {@code null}
     * @return the hash as a non-negative value below 2^64
     */
    public static BigInteger fnv1Hash64(String str) {
        long hash = INIT_BITS;
        for (int i = 0, len = str.length(); i < len; i++) {
            hash *= PRIME_BITS;
            hash ^= str.charAt(i);
        }
        return toUnsigned(hash);
    }

    /**
     * Computes the 64-bit FNV-1a hash (xor, then multiply) of a string.
     *
     * @param str input string; must not be {@code null}
     * @return the hash as a non-negative value below 2^64
     */
    public static BigInteger fnv1aHash64(String str) {
        long hash = INIT_BITS;
        for (int i = 0, len = str.length(); i < len; i++) {
            hash ^= str.charAt(i);
            hash *= PRIME_BITS;
        }
        return toUnsigned(hash);
    }

    /**
     * Counts the positions at which two strings differ.
     *
     * @param str1 first string; must not be {@code null}
     * @param str2 second string; must not be {@code null}
     * @return the number of differing positions, or {@code -1} when the
     *         strings have different lengths
     */
    public static int getDistance(String str1, String str2) {
        int length = str1.length();
        if (length != str2.length()) {
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < length; i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }

    /** Reinterprets a 64-bit pattern as an unsigned, non-negative BigInteger. */
    private static BigInteger toUnsigned(long bits) {
        // and() works on the two's-complement form, so masking a negative
        // value keeps exactly its low 64 bits.
        return BigInteger.valueOf(bits).and(MASK_64);
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/lou/simhasher/SimHasher.java:
--------------------------------------------------------------------------------
1 | package com.lou.simhasher;
2 |
3 | import java.math.BigInteger;
4 | import java.util.Map;
5 | import java.util.Set;
6 |
7 | import com.lou.simhasher.util.FNVHash;
8 |
9 | /**
10 | * 文本去重算法的simhash类
11 | * 步骤如下:
12 | * 1,对文本分词,得到N维特征向量(默认为64维)
13 | * 2,为分词设置权重(tf-idf)
14 | * 3,为特征向量计算哈希
15 | * 4,对所有特征向量加权,累加(目前仅进行非加权累加)
16 | * 5,对累加结果,大于零置一,小于零置零
17 | * 6,得到文本指纹(fingerprint)
18 | *
19 | * @author louxuezheng@hotmail.com
20 | */
21 | public class SimHasher {
22 | private String hash;
23 | private BigInteger signature;
24 | private KeywordExtractor wordExtractor = KeywordExtractor.getInstance();
25 |
26 | /**
27 | * 构造函数
28 | *
29 | * @param content 字符串
30 | */
31 | public SimHasher(String content) {
32 | this.analysis(content);
33 | }
34 |
35 | private void analysis(String content) {
36 | Map wordInfos = wordExtractor.extract(content);
37 | double[] featureVector = new double[FNVHash.HASH_BITS];
38 | Set words = wordInfos.keySet();
39 | // System.out.println(words);
40 | for (String word : words) {
41 | BigInteger wordhash = FNVHash.fnv1aHash64(word);
42 | for (int i = 0; i < FNVHash.HASH_BITS; i++) {
43 | BigInteger bitmask = BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1);
44 | if (wordhash.and(bitmask).signum() != 0) {
45 | featureVector[i] += wordInfos.get(word);
46 | } else {
47 | featureVector[i] -= wordInfos.get(word);
48 | }
49 | }
50 | }
51 |
52 | BigInteger signature = BigInteger.ZERO;
53 | StringBuffer hashBuffer = new StringBuffer();
54 | for (int i = 0; i < FNVHash.HASH_BITS; i++) {
55 | if (featureVector[i] >= 0) {
56 | signature = signature.add(BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1));
57 | hashBuffer.append("1");
58 | } else {
59 | hashBuffer.append("0");
60 | }
61 | }
62 | this.hash = hashBuffer.toString();
63 | this.signature = signature;
64 | }
65 |
66 | /**
67 | * 汉明距离
68 | *
69 | * @param targetSignature 比较签名
70 | * @return
71 | */
72 | public int getHammingDistance(BigInteger targetSignature) {
73 | BigInteger x = this.getSignature().xor(targetSignature);
74 | int tot = 0;
75 |
76 | // 统计x中二进制位数为1的个数
77 | // 我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了,
78 | // 对吧,然后,n&(n-1)就相当于把后面的数字清0,
79 | // 我们看n能做多少次这样的操作就OK了。
80 |
81 | while (x.signum() != 0) {
82 | tot += 1;
83 | x = x.and(x.subtract(new BigInteger("1")));
84 | }
85 |
86 | return tot;
87 | }
88 |
89 | /**
90 | * hash距离。二进制比较
91 | *
92 | * @param targetHash 比较目标
93 | * @return
94 | */
95 | public int getHashDistance(String targetHash) {
96 | int distance;
97 | if (this.getHash().length() != targetHash.length()) {
98 | distance = -1;
99 | } else {
100 | distance = 0;
101 | for (int i = 0; i < this.getHash().length(); i++) {
102 | if (this.getHash().charAt(i) != targetHash.charAt(i)) {
103 | distance++;
104 | }
105 | }
106 | }
107 | return distance;
108 | }
109 |
110 | public String getHash() {
111 | return this.hash;
112 | }
113 |
114 | public BigInteger getSignature() {
115 | return this.signature;
116 | }
117 |
118 | }
119 |
--------------------------------------------------------------------------------
/src/main/java/com/lou/simhasher/KeywordExtractor.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2011-2020 Panguso, Inc.
3 | * All rights reserved.
4 | *
5 | * This software is the confidential and proprietary information of Panguso,
6 | * Inc. ("Confidential Information"). You shall not
7 | * disclose such Confidential Information and shall use it only in
8 | * accordance with the terms of the license agreement you entered into with Panguso.
9 | */
10 | package com.lou.simhasher;
11 |
12 | import java.io.BufferedReader;
13 | import java.io.IOException;
14 | import java.util.HashMap;
15 | import java.util.HashSet;
16 | import java.util.Iterator;
17 | import java.util.List;
18 | import java.util.Map;
19 | import java.util.Map.Entry;
20 | import java.util.Set;
21 |
22 | import org.slf4j.Logger;
23 | import org.slf4j.LoggerFactory;
24 |
25 | import com.lou.simhasher.seg.WordsSegment;
26 | import com.lou.simhasher.util.DicReader;
27 |
28 | /**
29 | * 关键词抽取类。消除停用词,并对词语进行tfidf权重计算
30 | *
31 | * @author louxuezheng@hotmail.com
32 | */
33 | public final class KeywordExtractor {
34 | private static final Logger logger = LoggerFactory.getLogger(KeywordExtractor.class);
35 | private Map idfMap = new HashMap();
36 | private Set stopWords = new HashSet();
37 | private double idfAverage;
38 | private static KeywordExtractor instance = new KeywordExtractor();
39 |
40 | public static KeywordExtractor getInstance() {
41 | return instance;
42 | }
43 |
44 | /**
45 | * 构造函数
46 | *
47 | */
48 | private KeywordExtractor() {
49 | String stopwordPath = "dict/stop_words.utf8";
50 | String idfPath = "dict/idf.utf8";
51 | loadIdfDict(idfPath);
52 | loadStopWordDict(stopwordPath);
53 | }
54 |
55 | /**
56 | * 抽取词,消除停用词,并对词语进行tfidf权重计算
57 | *
58 | * @param str str
59 | * @return
60 | */
61 | public Map extract(String str) {
62 | List words = WordsSegment.getCutWords(str);
63 |
64 | // 计算词频tf
65 | Map wordmap = new HashMap();
66 | for (String word : words) {
67 | if (!wordmap.containsKey(word)) {
68 | wordmap.put(word, 1.0);
69 | }else{
70 | wordmap.put(word, wordmap.get(word) + 1);
71 | }
72 | }
73 |
74 |
75 | // 删除停用词并计算权重
76 | Iterator> it = wordmap.entrySet().iterator();
77 | while (it.hasNext()) {
78 | Entry item = (Entry) it.next();
79 | String word = item.getKey();
80 | if (stopWords.contains(word)) {
81 | it.remove();
82 | continue;
83 | }
84 |
85 | // 计算权重tdf
86 | if (idfMap.containsKey(word)) {
87 | double idf = wordmap.get(word) * idfMap.get(word);
88 | wordmap.put(word, idf);
89 | } else {
90 | double idf = wordmap.get(word) * idfAverage;
91 | wordmap.put(word, idf);
92 | }
93 | }
94 |
95 | for(String key:wordmap.keySet()){
96 | System.out.println(key+" : "+wordmap.get(key));
97 | }
98 |
99 | return wordmap;
100 | }
101 |
102 | /**
103 | * 加载idf语料词典
104 | *
105 | * @param idfPath
106 | */
107 | private void loadIdfDict(String idfPath) {
108 | BufferedReader bf = DicReader.getReader(idfPath);
109 | double idf = 0.0;
110 | double idfSum = 0.0;
111 | int lineno = 0;
112 | String[] arrStrings = null;
113 | String line = null;
114 | try {
115 | while ((line = bf.readLine()) != null) {
116 | if (line.isEmpty()) {
117 | continue;
118 | }
119 | arrStrings = line.split(" ");
120 | if (arrStrings.length != 2) {
121 | continue;
122 | }
123 | idf = Double.valueOf(arrStrings[1]);
124 | idfMap.put(arrStrings[0], idf);
125 | idfSum += idf;
126 | lineno++;
127 | }
128 | } catch (NumberFormatException e) {
129 | logger.error("数据格式错误:" + e.getMessage());
130 | } catch (IOException e) {
131 | logger.error("IO错误:" + e.getMessage());
132 | }
133 |
134 | // assert (lineno > 0);
135 | idfAverage = idfSum / lineno;
136 | // assert (idfAverage > 0.0);
137 | }
138 |
139 | /**
140 | * 加载停用词
141 | *
142 | * @param filePath
143 | */
144 | private void loadStopWordDict(String filePath) {
145 | BufferedReader bf = DicReader.getReader(filePath);
146 | String line = null;
147 | try {
148 | while ((line = bf.readLine()) != null) {
149 | stopWords.add(line);
150 | }
151 | } catch (IOException e) {
152 | logger.error("IO错误:" + e.getMessage());
153 | }
154 | }
155 |
156 | }
157 |
--------------------------------------------------------------------------------
/src/main/resources/dict/stop_words.utf8:
--------------------------------------------------------------------------------
1 | "
2 | .
3 | 。
4 | ,
5 | 、
6 | !
7 | ?
8 | :
9 | ;
10 | `
11 | ﹑
12 | •
13 | "
14 | ^
15 | …
16 | ‘
17 | ’
18 | “
19 | ”
20 | 〝
21 | 〞
22 | ~
23 | \
24 | ∕
25 | |
26 | ¦
27 | ‖
28 | —
29 | (
30 | )
31 | 〈
32 | 〉
33 | ﹞
34 | ﹝
35 | 「
36 | 」
37 | ‹
38 | ›
39 | 〖
40 | 〗
41 | 】
42 | 【
43 | »
44 | «
45 | 』
46 | 『
47 | 〕
48 | 〔
49 | 》
50 | 《
51 | }
52 | {
53 | ]
54 | [
55 | ﹐
56 | ¸
57 | ﹕
58 | ︰
59 | ﹔
60 | ;
61 | !
62 | ¡
63 | ?
64 | ¿
65 | ﹖
66 | ﹌
67 | ﹏
68 | ﹋
69 | '
70 | ´
71 | ˊ
72 | ˋ
73 | -
74 | ―
75 | ﹫
76 | @
77 | ︳
78 | ︴
79 | _
80 | ¯
81 | _
82 |  ̄
83 | ﹢
84 | +
85 | ﹦
86 | =
87 | ﹤
88 | ‐
89 | <
90 |
91 | ˜
92 | ~
93 | ﹟
94 | #
95 | ﹩
96 | $
97 | ﹠
98 | &
99 | ﹪
100 | %
101 | ﹡
102 | *
103 | ﹨
104 | \
105 | ﹍
106 | ﹉
107 | ﹎
108 | ﹊
109 | ˇ
110 | ︵
111 | ︶
112 | ︷
113 | ︸
114 | ︹
115 | ︿
116 | ﹀
117 | ︺
118 | ︽
119 | ︾
120 | _
121 | ˉ
122 | ﹁
123 | ﹂
124 | ﹃
125 | ﹄
126 | ︻
127 | ︼
128 | 的
129 | 了
130 | the
131 | a
132 | an
133 | that
134 | those
135 | this
136 | that
137 | $
138 | 0
139 | 1
140 | 2
141 | 3
142 | 4
143 | 5
144 | 6
145 | 7
146 | 8
147 | 9
148 | ?
149 | _
150 | “
151 | ”
152 | 、
153 | 。
154 | 《
155 | 》
156 | 一
157 | 一些
158 | 一何
159 | 一切
160 | 一则
161 | 一方面
162 | 一旦
163 | 一来
164 | 一样
165 | 一般
166 | 一转眼
167 | 万一
168 | 上
169 | 上下
170 | 下
171 | 不
172 | 不仅
173 | 不但
174 | 不光
175 | 不单
176 | 不只
177 | 不外乎
178 | 不如
179 | 不妨
180 | 不尽
181 | 不尽然
182 | 不得
183 | 不怕
184 | 不惟
185 | 不成
186 | 不拘
187 | 不料
188 | 不是
189 | 不比
190 | 不然
191 | 不特
192 | 不独
193 | 不管
194 | 不至于
195 | 不若
196 | 不论
197 | 不过
198 | 不问
199 | 与
200 | 与其
201 | 与其说
202 | 与否
203 | 与此同时
204 | 且
205 | 且不说
206 | 且说
207 | 两者
208 | 个
209 | 个别
210 | 临
211 | 为
212 | 为了
213 | 为什么
214 | 为何
215 | 为止
216 | 为此
217 | 为着
218 | 乃
219 | 乃至
220 | 乃至于
221 | 么
222 | 之
223 | 之一
224 | 之所以
225 | 之类
226 | 乌乎
227 | 乎
228 | 乘
229 | 也
230 | 也好
231 | 也罢
232 | 了
233 | 二来
234 | 于
235 | 于是
236 | 于是乎
237 | 云云
238 | 云尔
239 | 些
240 | 亦
241 | 人
242 | 人们
243 | 人家
244 | 什么
245 | 什么样
246 | 今
247 | 介于
248 | 仍
249 | 仍旧
250 | 从
251 | 从此
252 | 从而
253 | 他
254 | 他人
255 | 他们
256 | 以
257 | 以上
258 | 以为
259 | 以便
260 | 以免
261 | 以及
262 | 以故
263 | 以期
264 | 以来
265 | 以至
266 | 以至于
267 | 以致
268 | 们
269 | 任
270 | 任何
271 | 任凭
272 | 似的
273 | 但
274 | 但凡
275 | 但是
276 | 何
277 | 何以
278 | 何况
279 | 何处
280 | 何时
281 | 余外
282 | 作为
283 | 你
284 | 你们
285 | 使
286 | 使得
287 | 例如
288 | 依
289 | 依据
290 | 依照
291 | 便于
292 | 俺
293 | 俺们
294 | 倘
295 | 倘使
296 | 倘或
297 | 倘然
298 | 倘若
299 | 借
300 | 假使
301 | 假如
302 | 假若
303 | 傥然
304 | 像
305 | 儿
306 | 先不先
307 | 光是
308 | 全体
309 | 全部
310 | 兮
311 | 关于
312 | 其
313 | 其一
314 | 其中
315 | 其二
316 | 其他
317 | 其余
318 | 其它
319 | 其次
320 | 具体地说
321 | 具体说来
322 | 兼之
323 | 内
324 | 再
325 | 再其次
326 | 再则
327 | 再有
328 | 再者
329 | 再者说
330 | 再说
331 | 冒
332 | 冲
333 | 况且
334 | 几
335 | 几时
336 | 凡
337 | 凡是
338 | 凭
339 | 凭借
340 | 出于
341 | 出来
342 | 分别
343 | 则
344 | 则甚
345 | 别
346 | 别人
347 | 别处
348 | 别是
349 | 别的
350 | 别管
351 | 别说
352 | 到
353 | 前后
354 | 前此
355 | 前者
356 | 加之
357 | 加以
358 | 即
359 | 即令
360 | 即使
361 | 即便
362 | 即如
363 | 即或
364 | 即若
365 | 却
366 | 去
367 | 又
368 | 又及
369 | 及
370 | 及其
371 | 及至
372 | 反之
373 | 反而
374 | 反过来
375 | 反过来说
376 | 受到
377 | 另
378 | 另一方面
379 | 另外
380 | 另悉
381 | 只
382 | 只当
383 | 只怕
384 | 只是
385 | 只有
386 | 只消
387 | 只要
388 | 只限
389 | 叫
390 | 叮咚
391 | 可
392 | 可以
393 | 可是
394 | 可见
395 | 各
396 | 各个
397 | 各位
398 | 各种
399 | 各自
400 | 同
401 | 同时
402 | 后
403 | 后者
404 | 向
405 | 向使
406 | 向着
407 | 吓
408 | 吗
409 | 否则
410 | 吧
411 | 吧哒
412 | 吱
413 | 呀
414 | 呃
415 | 呕
416 | 呗
417 | 呜
418 | 呜呼
419 | 呢
420 | 呵
421 | 呵呵
422 | 呸
423 | 呼哧
424 | 咋
425 | 和
426 | 咚
427 | 咦
428 | 咧
429 | 咱
430 | 咱们
431 | 咳
432 | 哇
433 | 哈
434 | 哈哈
435 | 哉
436 | 哎
437 | 哎呀
438 | 哎哟
439 | 哗
440 | 哟
441 | 哦
442 | 哩
443 | 哪
444 | 哪个
445 | 哪些
446 | 哪儿
447 | 哪天
448 | 哪年
449 | 哪怕
450 | 哪样
451 | 哪边
452 | 哪里
453 | 哼
454 | 哼唷
455 | 唉
456 | 唯有
457 | 啊
458 | 啐
459 | 啥
460 | 啦
461 | 啪达
462 | 啷当
463 | 喂
464 | 喏
465 | 喔唷
466 | 喽
467 | 嗡
468 | 嗡嗡
469 | 嗬
470 | 嗯
471 | 嗳
472 | 嘎
473 | 嘎登
474 | 嘘
475 | 嘛
476 | 嘻
477 | 嘿
478 | 嘿嘿
479 | 因
480 | 因为
481 | 因了
482 | 因此
483 | 因着
484 | 因而
485 | 固然
486 | 在
487 | 在下
488 | 在于
489 | 地
490 | 基于
491 | 处在
492 | 多
493 | 多么
494 | 多少
495 | 大
496 | 大家
497 | 她
498 | 她们
499 | 好
500 | 如
501 | 如上
502 | 如上所述
503 | 如下
504 | 如何
505 | 如其
506 | 如同
507 | 如是
508 | 如果
509 | 如此
510 | 如若
511 | 始而
512 | 孰料
513 | 孰知
514 | 宁
515 | 宁可
516 | 宁愿
517 | 宁肯
518 | 它
519 | 它们
520 | 对
521 | 对于
522 | 对待
523 | 对方
524 | 对比
525 | 将
526 | 小
527 | 尔
528 | 尔后
529 | 尔尔
530 | 尚且
531 | 就
532 | 就是
533 | 就是了
534 | 就是说
535 | 就算
536 | 就要
537 | 尽
538 | 尽管
539 | 尽管如此
540 | 岂但
541 | 己
542 | 已
543 | 已矣
544 | 巴
545 | 巴巴
546 | 并
547 | 并且
548 | 并非
549 | 庶乎
550 | 庶几
551 | 开外
552 | 开始
553 | 归
554 | 归齐
555 | 当
556 | 当地
557 | 当然
558 | 当着
559 | 彼
560 | 彼时
561 | 彼此
562 | 往
563 | 待
564 | 很
565 | 得
566 | 得了
567 | 怎
568 | 怎么
569 | 怎么办
570 | 怎么样
571 | 怎奈
572 | 怎样
573 | 总之
574 | 总的来看
575 | 总的来说
576 | 总的说来
577 | 总而言之
578 | 恰恰相反
579 | 您
580 | 惟其
581 | 慢说
582 | 我
583 | 我们
584 | 或
585 | 或则
586 | 或是
587 | 或曰
588 | 或者
589 | 截至
590 | 所
591 | 所以
592 | 所在
593 | 所幸
594 | 所有
595 | 才
596 | 才能
597 | 打
598 | 打从
599 | 把
600 | 抑或
601 | 拿
602 | 按
603 | 按照
604 | 换句话说
605 | 换言之
606 | 据
607 | 据此
608 | 接着
609 | 故
610 | 故此
611 | 故而
612 | 旁人
613 | 无
614 | 无宁
615 | 无论
616 | 既
617 | 既往
618 | 既是
619 | 既然
620 | 时候
621 | 是
622 | 是以
623 | 是的
624 | 曾
625 | 替
626 | 替代
627 | 最
628 | 有
629 | 有些
630 | 有关
631 | 有及
632 | 有时
633 | 有的
634 | 望
635 | 朝
636 | 朝着
637 | 本
638 | 本人
639 | 本地
640 | 本着
641 | 本身
642 | 来
643 | 来着
644 | 来自
645 | 来说
646 | 极了
647 | 果然
648 | 果真
649 | 某
650 | 某个
651 | 某些
652 | 某某
653 | 根据
654 | 欤
655 | 正值
656 | 正如
657 | 正巧
658 | 正是
659 | 此
660 | 此地
661 | 此处
662 | 此外
663 | 此时
664 | 此次
665 | 此间
666 | 毋宁
667 | 每
668 | 每当
669 | 比
670 | 比及
671 | 比如
672 | 比方
673 | 没奈何
674 | 沿
675 | 沿着
676 | 漫说
677 | 焉
678 | 然则
679 | 然后
680 | 然而
681 | 照
682 | 照着
683 | 犹且
684 | 犹自
685 | 甚且
686 | 甚么
687 | 甚或
688 | 甚而
689 | 甚至
690 | 甚至于
691 | 用
692 | 用来
693 | 由
694 | 由于
695 | 由是
696 | 由此
697 | 由此可见
698 | 的
699 | 的确
700 | 的话
701 | 直到
702 | 相对而言
703 | 省得
704 | 看
705 | 眨眼
706 | 着
707 | 着呢
708 | 矣
709 | 矣乎
710 | 矣哉
711 | 离
712 | 竟而
713 | 第
714 | 等
715 | 等到
716 | 等等
717 | 简言之
718 | 管
719 | 类如
720 | 紧接着
721 | 纵
722 | 纵令
723 | 纵使
724 | 纵然
725 | 经
726 | 经过
727 | 结果
728 | 给
729 | 继之
730 | 继后
731 | 继而
732 | 综上所述
733 | 罢了
734 | 者
735 | 而
736 | 而且
737 | 而况
738 | 而后
739 | 而外
740 | 而已
741 | 而是
742 | 而言
743 | 能
744 | 能否
745 | 腾
746 | 自
747 | 自个儿
748 | 自从
749 | 自各儿
750 | 自后
751 | 自家
752 | 自己
753 | 自打
754 | 自身
755 | 至
756 | 至于
757 | 至今
758 | 至若
759 | 致
760 | 般的
761 | 若
762 | 若夫
763 | 若是
764 | 若果
765 | 若非
766 | 莫不然
767 | 莫如
768 | 莫若
769 | 虽
770 | 虽则
771 | 虽然
772 | 虽说
773 | 被
774 | 要
775 | 要不
776 | 要不是
777 | 要不然
778 | 要么
779 | 要是
780 | 譬喻
781 | 譬如
782 | 让
783 | 许多
784 | 论
785 | 设使
786 | 设或
787 | 设若
788 | 诚如
789 | 诚然
790 | 该
791 | 说来
792 | 诸
793 | 诸位
794 | 诸如
795 | 谁
796 | 谁人
797 | 谁料
798 | 谁知
799 | 贼死
800 | 赖以
801 | 赶
802 | 起
803 | 起见
804 | 趁
805 | 趁着
806 | 越是
807 | 距
808 | 跟
809 | 较
810 | 较之
811 | 边
812 | 过
813 | 还
814 | 还是
815 | 还有
816 | 还要
817 | 这
818 | 这一来
819 | 这个
820 | 这么
821 | 这么些
822 | 这么样
823 | 这么点儿
824 | 这些
825 | 这会儿
826 | 这儿
827 | 这就是说
828 | 这时
829 | 这样
830 | 这次
831 | 这般
832 | 这边
833 | 这里
834 | 进而
835 | 连
836 | 连同
837 | 逐步
838 | 通过
839 | 遵循
840 | 遵照
841 | 那
842 | 那个
843 | 那么
844 | 那么些
845 | 那么样
846 | 那些
847 | 那会儿
848 | 那儿
849 | 那时
850 | 那样
851 | 那般
852 | 那边
853 | 那里
854 | 都
855 | 鄙人
856 | 鉴于
857 | 针对
858 | 阿
859 | 除
860 | 除了
861 | 除外
862 | 除开
863 | 除此之外
864 | 除非
865 | 随
866 | 随后
867 | 随时
868 | 随着
869 | 难道说
870 | 非但
871 | 非徒
872 | 非特
873 | 非独
874 | 靠
875 | 顺
876 | 顺着
877 | 首先
878 | !
879 | ,
880 | :
881 | ;
882 | ?
883 | to
884 | can
885 | could
886 | dare
887 | do
888 | did
889 | does
890 | may
891 | might
892 | would
893 | should
894 | must
895 | will
896 | ought
897 | shall
898 | need
899 | is
900 | a
901 | am
902 | are
903 | about
904 | according
905 | after
906 | against
907 | all
908 | almost
909 | also
910 | although
911 | among
912 | an
913 | and
914 | another
915 | any
916 | anything
917 | approximately
918 | as
919 | asked
920 | at
921 | back
922 | because
923 | before
924 | besides
925 | between
926 | both
927 | but
928 | by
929 | call
930 | called
931 | currently
932 | despite
933 | did
934 | do
935 | dr
936 | during
937 | each
938 | earlier
939 | eight
940 | even
941 | eventually
942 | every
943 | everything
944 | five
945 | for
946 | four
947 | from
948 | he
949 | her
950 | here
951 | his
952 | how
953 | however
954 | i
955 | if
956 | in
957 | indeed
958 | instead
959 | it
960 | its
961 | just
962 | last
963 | like
964 | major
965 | many
966 | may
967 | maybe
968 | meanwhile
969 | more
970 | moreover
971 | most
972 | mr
973 | mrs
974 | ms
975 | much
976 | my
977 | neither
978 | net
979 | never
980 | nevertheless
981 | nine
982 | no
983 | none
984 | not
985 | nothing
986 | now
987 | of
988 | on
989 | once
990 | one
991 | only
992 | or
993 | other
994 | our
995 | over
996 | partly
997 | perhaps
998 | prior
999 | regarding
1000 | separately
1001 | seven
1002 | several
1003 | she
1004 | should
1005 | similarly
1006 | since
1007 | six
1008 | so
1009 | some
1010 | somehow
1011 | still
1012 | such
1013 | ten
1014 | that
1015 | the
1016 | their
1017 | then
1018 | there
1019 | therefore
1020 | these
1021 | they
1022 | this
1023 | those
1024 | though
1025 | three
1026 | to
1027 | two
1028 | under
1029 | unless
1030 | unlike
1031 | until
1032 | volume
1033 | we
1034 | what
1035 | whatever
1036 | whats
1037 | when
1038 | where
1039 | which
1040 | while
1041 | why
1042 | with
1043 | without
1044 | yesterday
1045 | yet
1046 | you
1047 | your
1048 | aboard
1049 | about
1050 | above
1051 | according to
1052 | across
1053 | afore
1054 | after
1055 | against
1056 | agin
1057 | along
1058 | alongside
1059 | amid
1060 | amidst
1061 | among
1062 | amongst
1063 | anent
1064 | around
1065 | as
1066 | aslant
1067 | astride
1068 | at
1069 | athwart
1070 | bar
1071 | because of
1072 | before
1073 | behind
1074 | below
1075 | beneath
1076 | beside
1077 | besides
1078 | between
1079 | betwixt
1080 | beyond
1081 | but
1082 | by
1083 | circa
1084 | despite
1085 | down
1086 | during
1087 | due to
1088 | ere
1089 | except
1090 | for
1091 | from
1092 | in
1093 | inside
1094 | into
1095 | less
1096 | like
1097 | mid
1098 | midst
1099 | minus
1100 | near
1101 | next
1102 | nigh
1103 | nigher
1104 | nighest
1105 | notwithstanding
1106 | of
1107 | off
1108 | on
1109 | on to
1110 | onto
1111 | out
1112 | out of
1113 | outside
1114 | over
1115 | past
1116 | pending
1117 | per
1118 | plus
1119 | qua
1120 | re
1121 | round
1122 | sans
1123 | save
1124 | since
1125 | through
1126 | throughout
1127 | thru
1128 | till
1129 | to
1130 | toward
1131 | towards
1132 | under
1133 | underneath
1134 | unlike
1135 | until
1136 | unto
1137 | up
1138 | upon
1139 | versus
1140 | via
1141 | vice
1142 | with
1143 | within
1144 | without
1145 | he
1146 | her
1147 | herself
1148 | hers
1149 | him
1150 | himself
1151 | his
1152 | I
1153 | it
1154 | its
1155 | itself
1156 | me
1157 | mine
1158 | my
1159 | myself
1160 | ours
1161 | she
1162 | their
1163 | theirs
1164 | them
1165 | themselves
1166 | they
1167 | us
1168 | we
1169 | our
1170 | ourselves
1171 | you
1172 | your
1173 | yours
1174 | yourselves
1175 | yourself
1176 | this
1177 | that
1178 | these
1179 | those
1180 | "
1181 | '
1182 | ''
1183 | (
1184 | )
1185 | *LRB*
1186 | *RRB*
1187 |
1188 |
1189 |
1190 |
1191 |
1192 | @
1193 | &
1194 | [
1195 | ]
1196 | `
1197 | ``
1198 | e.g.,
1199 | {
1200 | }
1201 | "
1202 | “
1203 | ”
1204 | -RRB-
1205 | -LRB-
1206 | --
1207 | a
1208 | about
1209 | above
1210 | across
1211 | after
1212 | afterwards
1213 | again
1214 | against
1215 | all
1216 | almost
1217 | alone
1218 | along
1219 | already
1220 | also
1221 | although
1222 | always
1223 | am
1224 | among
1225 | amongst
1226 | amoungst
1227 | amount
1228 | an
1229 | and
1230 | another
1231 | any
1232 | anyhow
1233 | anyone
1234 | anything
1235 | anyway
1236 | anywhere
1237 | are
1238 | around
1239 | as
1240 | at
1241 | back
1242 | be
1243 | became
1244 | because
1245 | become
1246 | becomes
1247 | becoming
1248 | been
1249 | before
1250 | beforehand
1251 | behind
1252 | being
1253 | below
1254 | beside
1255 | besides
1256 | between
1257 | beyond
1258 | bill
1259 | both
1260 | bottom
1261 | but
1262 | by
1263 | call
1264 | can
1265 | cannot
1266 | cant
1267 | co
1268 | computer
1269 | con
1270 | could
1271 | couldnt
1272 | cry
1273 | de
1274 | describe
1275 | detail
1276 | do
1277 | done
1278 | down
1279 | due
1280 | during
1281 | each
1282 | eg
1283 | eight
1284 | either
1285 | eleven
1286 | else
1287 | elsewhere
1288 | empty
1289 | enough
1290 | etc
1291 | even
1292 | ever
1293 | every
1294 | everyone
1295 | everything
1296 | everywhere
1297 | except
1298 | few
1299 | fifteen
1300 | fify
1301 | fill
1302 | find
1303 | fire
1304 | first
1305 | five
1306 | for
1307 | former
1308 | formerly
1309 | forty
1310 | found
1311 | four
1312 | from
1313 | front
1314 | full
1315 | further
1316 | get
1317 | give
1318 | go
1319 | had
1320 | has
1321 | hasnt
1322 | have
1323 | he
1324 | hence
1325 | her
1326 | here
1327 | hereafter
1328 | hereby
1329 | herein
1330 | hereupon
1331 | hers
1332 | herself
1333 | him
1334 | himself
1335 | his
1336 | how
1337 | however
1338 | hundred
1339 | i
1340 | ie
1341 | if
1342 | in
1343 | inc
1344 | indeed
1345 | interest
1346 | into
1347 | is
1348 | it
1349 | its
1350 | itself
1351 | keep
1352 | last
1353 | latter
1354 | latterly
1355 | least
1356 | less
1357 | ltd
1358 | made
1359 | many
1360 | may
1361 | me
1362 | meanwhile
1363 | might
1364 | mill
1365 | mine
1366 | more
1367 | moreover
1368 | most
1369 | mostly
1370 | move
1371 | much
1372 | must
1373 | my
1374 | myself
1375 | name
1376 | namely
1377 | neither
1378 | never
1379 | nevertheless
1380 | next
1381 | nine
1382 | no
1383 | nobody
1384 | none
1385 | noone
1386 | nor
1387 | not
1388 | nothing
1389 | now
1390 | nowhere
1391 | of
1392 | off
1393 | often
1394 | on
1395 | once
1396 | one
1397 | only
1398 | onto
1399 | or
1400 | other
1401 | others
1402 | otherwise
1403 | our
1404 | ours
1405 | ourselves
1406 | out
1407 | over
1408 | own
1409 | p
1410 | part
1411 | per
1412 | perhaps
1413 | please
1414 | put
1415 | rather
1416 | re
1417 | same
1418 | see
1419 | seem
1420 | seemed
1421 | seeming
1422 | seems
1423 | serious
1424 | several
1425 | she
1426 | should
1427 | show
1428 | side
1429 | since
1430 | sincere
1431 | six
1432 | sixty
1433 | so
1434 | some
1435 | somehow
1436 | someone
1437 | something
1438 | sometime
1439 | sometimes
1440 | somewhere
1441 | still
1442 | such
1443 | system
1444 | take
1445 | ten
1446 | than
1447 | that
1448 | the
1449 | their
1450 | them
1451 | themselves
1452 | then
1453 | thence
1454 | there
1455 | thereafter
1456 | thereby
1457 | therefore
1458 | therein
1459 | thereupon
1460 | these
1461 | they
1462 | thick
1463 | thin
1464 | third
1465 | this
1466 | those
1467 | though
1468 | three
1469 | through
1470 | throughout
1471 | thru
1472 | thus
1473 | to
1474 | together
1475 | too
1476 | top
1477 | toward
1478 | towards
1479 | twelve
1480 | twenty
1481 | two
1482 | un
1483 | under
1484 | until
1485 | up
1486 | upon
1487 | us
1488 | very
1489 | via
1490 | was
1491 | we
1492 | well
1493 | were
1494 | what
1495 | whatever
1496 | when
1497 | whence
1498 | whenever
1499 | where
1500 | whereafter
1501 | whereas
1502 | whereby
1503 | wherein
1504 | whereupon
1505 | wherever
1506 | whether
1507 | which
1508 | while
1509 | whither
1510 | who
1511 | whoever
1512 | whole
1513 | whom
1514 | whose
1515 | why
1516 | will
1517 | with
1518 | within
1519 | without
1520 | would
1521 | yet
1522 | you
1523 | your
1524 | yours
1525 | yourself
1526 | yourselves
1527 |
1528 |
1529 | :
1530 | /
1531 | (
1532 | >
1533 | )
1534 | <
1535 | !
1536 |
--------------------------------------------------------------------------------