├── .gitignore ├── README.md ├── src ├── test │ └── java │ │ └── com │ │ └── lou │ │ └── simhasher │ │ └── SimhashTest.java └── main │ ├── java │ └── com │ │ └── lou │ │ └── simhasher │ │ ├── util │ │ ├── DicReader.java │ │ └── FNVHash.java │ │ ├── seg │ │ └── WordsSegment.java │ │ ├── SimHasher.java │ │ └── KeywordExtractor.java │ └── resources │ └── dict │ └── stop_words.utf8 └── pom.xml /.gitignore: -------------------------------------------------------------------------------- 1 | /target/ 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # simhash 2 | 高效的文本相似度去重算法实现 3 | 4 | ## simhash是什么 5 | Google发明的文本去重算法,适合于大批量文档的相似度计算 6 | * [博客介绍](http://grunt1223.iteye.com/blog/964564) 7 | 8 | ## 主要步骤 9 | * 对文本分词,得到N维特征向量(默认为64维) 10 | * 为分词设置权重(tf-idf) 11 | * 为特征向量计算哈希 12 | * 对所有特征向量加权,累加(目前仅进行非加权累加) 13 | * 对累加结果,大于零置一,小于零置零 14 | * 得到文本指纹(fingerprint) 15 | -------------------------------------------------------------------------------- /src/test/java/com/lou/simhasher/SimhashTest.java: -------------------------------------------------------------------------------- 1 | package com.lou.simhasher; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | 6 | import org.apache.commons.io.IOUtils; 7 | import org.junit.Test; 8 | 9 | /** 10 | * @author louxuezheng@hotmail.com 11 | */ 12 | public class SimhashTest { 13 | 14 | @Test 15 | public void testDistance(){ 16 | String str1 = readAllFile("D:/test/testin2.txt"); 17 | SimHasher hash1 = new SimHasher(str1); 18 | System.out.println(hash1.getSignature()); 19 | System.out.println("============================"); 20 | 21 | String str2 = readAllFile("D:/test/testin.txt"); 22 | SimHasher hash2 = new SimHasher(str2); 23 | System.out.println(hash2.getSignature()); 24 | System.out.println("============================"); 25 | 26 | 
System.out.println(hash1.getHammingDistance(hash2.getSignature())); 27 | 28 | } 29 | 30 | /** 31 | * 测试用 32 | * @param filename 名字 33 | * @return 34 | */ 35 | public static String readAllFile(String filename) { 36 | String everything = ""; 37 | try { 38 | FileInputStream inputStream = new FileInputStream(filename); 39 | everything = IOUtils.toString(inputStream); 40 | inputStream.close(); 41 | } catch (IOException e) { 42 | } 43 | 44 | return everything; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/lou/simhasher/util/DicReader.java: -------------------------------------------------------------------------------- 1 | package com.lou.simhasher.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.io.UnsupportedEncodingException; 7 | 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | import com.lou.simhasher.KeywordExtractor; 12 | 13 | /** 14 | * 文档读入工具 15 | * 16 | * @author louxuezheng@hotmail.com 17 | */ 18 | public final class DicReader { 19 | 20 | private static final Logger logger = LoggerFactory.getLogger(KeywordExtractor.class); 21 | 22 | private DicReader() { 23 | } 24 | 25 | /** 26 | * 返回BufferedReader 27 | * 28 | * @param name 文件名 29 | * @return 30 | */ 31 | public static BufferedReader getReader(String name) { 32 | InputStream in = DicReader.class.getResourceAsStream("/" + name); 33 | try { 34 | return new BufferedReader(new InputStreamReader(in, "UTF-8")); 35 | } catch (UnsupportedEncodingException e) { 36 | logger.error("编码格式不支持:" + e.getMessage()); 37 | } 38 | return null; 39 | } 40 | 41 | /** 42 | * 返回InputStream 43 | * 44 | * @param name 文件名 45 | * @return 46 | */ 47 | public static InputStream getInputStream(String name) { 48 | InputStream in = DicReader.class.getResourceAsStream("/" + name); 49 | return in; 50 | } 51 | } 52 | 
-------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.lou 6 | simhasher 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | simhasher 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 16 | 17 | 18 | 19 | org.apache.lucene 20 | lucene-core 21 | 3.6.1 22 | 23 | 24 | 25 | com.janeluo 26 | ikanalyzer 27 | 2012_u6 28 | 29 | 30 | commons-io 31 | commons-io 32 | 2.4 33 | 34 | 35 | org.slf4j 36 | jcl-over-slf4j 37 | 1.6.4 38 | 39 | 40 | org.slf4j 41 | log4j-over-slf4j 42 | 1.6.3 43 | 44 | 45 | junit 46 | junit 47 | 4.11 48 | test 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /src/main/java/com/lou/simhasher/seg/WordsSegment.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2011-2020 Panguso, Inc. 3 | * All rights reserved. 4 | * 5 | * This software is the confidential and proprietary information of Panguso, 6 | * Inc. ("Confidential Information"). You shall not 7 | * disclose such Confidential Information and shall use it only in 8 | * accordance with the terms of the license agreement you entered into with Panguso. 
/**
 * 64-bit FNV-1 / FNV-1a hashing over the UTF-16 code units of a string,
 * plus a positional distance helper for equal-length bit strings.
 *
 * <p>Hashes are returned as non-negative {@link BigInteger} values in the
 * range {@code [0, 2^64)}.
 *
 * @author louxuezheng@hotmail.com
 */
public final class FNVHash {

    /** Width of the produced hash in bits. */
    public static final int HASH_BITS = 64;
    /** FNV 64-bit offset basis. */
    public static final BigInteger FNV_64_INIT = new BigInteger("14695981039346656037");
    /** FNV 64-bit prime. */
    public static final BigInteger FNV_64_PRIME = new BigInteger("1099511628211");
    /** Mask with the low {@link #HASH_BITS} bits set; reduces products modulo 2^64. */
    public static final BigInteger MASK_64 = BigInteger.ONE.shiftLeft(HASH_BITS).subtract(BigInteger.ONE);

    private FNVHash() {
    }

    /**
     * Computes the 64-bit FNV-1 hash (multiply, then xor) of a string.
     *
     * @param str input text; each {@code char} is folded in as one unit
     * @return the hash as a non-negative value below 2^64
     * @throws NullPointerException if {@code str} is null
     */
    public static BigInteger fnv1Hash64(String str) {
        BigInteger hash = FNV_64_INIT;
        for (int i = 0, len = str.length(); i < len; i++) {
            // Reduce after every multiplication: the low 64 bits are unaffected, but the
            // intermediate value no longer grows by ~40 bits per character.
            hash = hash.multiply(FNV_64_PRIME).and(MASK_64);
            hash = hash.xor(BigInteger.valueOf(str.charAt(i)));
        }
        return hash;
    }

    /**
     * Computes the 64-bit FNV-1a hash (xor, then multiply) of a string.
     *
     * @param str input text; each {@code char} is folded in as one unit
     * @return the hash as a non-negative value below 2^64
     * @throws NullPointerException if {@code str} is null
     */
    public static BigInteger fnv1aHash64(String str) {
        BigInteger hash = FNV_64_INIT;
        for (int i = 0, len = str.length(); i < len; i++) {
            hash = hash.xor(BigInteger.valueOf(str.charAt(i)));
            // Same per-round reduction as in fnv1Hash64.
            hash = hash.multiply(FNV_64_PRIME).and(MASK_64);
        }
        return hash;
    }

    /**
     * Counts the positions at which two equal-length strings differ.
     *
     * @param str1 first string (typically a binary digit string)
     * @param str2 second string
     * @return the number of differing positions, or {@code -1} when the
     *         lengths are not equal
     * @throws NullPointerException if either argument is null
     */
    public static int getDistance(String str1, String str2) {
        int len = str1.length();
        if (len != str2.length()) {
            return -1;
        }
        int distance = 0;
        for (int i = 0; i < len; i++) {
            if (str1.charAt(i) != str2.charAt(i)) {
                distance++;
            }
        }
        return distance;
    }
}
4,对所有特征向量加权,累加(目前仅进行非加权累加) 16 | * 5,对累加结果,大于零置一,小于零置零 17 | * 6,得到文本指纹(fingerprint) 18 | * 19 | * @author louxuezheng@hotmail.com 20 | */ 21 | public class SimHasher { 22 | private String hash; 23 | private BigInteger signature; 24 | private KeywordExtractor wordExtractor = KeywordExtractor.getInstance(); 25 | 26 | /** 27 | * 构造函数 28 | * 29 | * @param content 字符串 30 | */ 31 | public SimHasher(String content) { 32 | this.analysis(content); 33 | } 34 | 35 | private void analysis(String content) { 36 | Map wordInfos = wordExtractor.extract(content); 37 | double[] featureVector = new double[FNVHash.HASH_BITS]; 38 | Set words = wordInfos.keySet(); 39 | // System.out.println(words); 40 | for (String word : words) { 41 | BigInteger wordhash = FNVHash.fnv1aHash64(word); 42 | for (int i = 0; i < FNVHash.HASH_BITS; i++) { 43 | BigInteger bitmask = BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1); 44 | if (wordhash.and(bitmask).signum() != 0) { 45 | featureVector[i] += wordInfos.get(word); 46 | } else { 47 | featureVector[i] -= wordInfos.get(word); 48 | } 49 | } 50 | } 51 | 52 | BigInteger signature = BigInteger.ZERO; 53 | StringBuffer hashBuffer = new StringBuffer(); 54 | for (int i = 0; i < FNVHash.HASH_BITS; i++) { 55 | if (featureVector[i] >= 0) { 56 | signature = signature.add(BigInteger.ONE.shiftLeft(FNVHash.HASH_BITS - i - 1)); 57 | hashBuffer.append("1"); 58 | } else { 59 | hashBuffer.append("0"); 60 | } 61 | } 62 | this.hash = hashBuffer.toString(); 63 | this.signature = signature; 64 | } 65 | 66 | /** 67 | * 汉明距离 68 | * 69 | * @param targetSignature 比较签名 70 | * @return 71 | */ 72 | public int getHammingDistance(BigInteger targetSignature) { 73 | BigInteger x = this.getSignature().xor(targetSignature); 74 | int tot = 0; 75 | 76 | // 统计x中二进制位数为1的个数 77 | // 我们想想,一个二进制数减去1,那么,从最后那个1(包括那个1)后面的数字全都反了, 78 | // 对吧,然后,n&(n-1)就相当于把后面的数字清0, 79 | // 我们看n能做多少次这样的操作就OK了。 80 | 81 | while (x.signum() != 0) { 82 | tot += 1; 83 | x = x.and(x.subtract(new BigInteger("1"))); 84 | } 
85 | 86 | return tot; 87 | } 88 | 89 | /** 90 | * hash距离。二进制比较 91 | * 92 | * @param targetHash 比较目标 93 | * @return 94 | */ 95 | public int getHashDistance(String targetHash) { 96 | int distance; 97 | if (this.getHash().length() != targetHash.length()) { 98 | distance = -1; 99 | } else { 100 | distance = 0; 101 | for (int i = 0; i < this.getHash().length(); i++) { 102 | if (this.getHash().charAt(i) != targetHash.charAt(i)) { 103 | distance++; 104 | } 105 | } 106 | } 107 | return distance; 108 | } 109 | 110 | public String getHash() { 111 | return this.hash; 112 | } 113 | 114 | public BigInteger getSignature() { 115 | return this.signature; 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/java/com/lou/simhasher/KeywordExtractor.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2011-2020 Panguso, Inc. 3 | * All rights reserved. 4 | * 5 | * This software is the confidential and proprietary information of Panguso, 6 | * Inc. ("Confidential Information"). You shall not 7 | * disclose such Confidential Information and shall use it only in 8 | * accordance with the terms of the license agreement you entered into with Panguso. 
9 | */ 10 | package com.lou.simhasher; 11 | 12 | import java.io.BufferedReader; 13 | import java.io.IOException; 14 | import java.util.HashMap; 15 | import java.util.HashSet; 16 | import java.util.Iterator; 17 | import java.util.List; 18 | import java.util.Map; 19 | import java.util.Map.Entry; 20 | import java.util.Set; 21 | 22 | import org.slf4j.Logger; 23 | import org.slf4j.LoggerFactory; 24 | 25 | import com.lou.simhasher.seg.WordsSegment; 26 | import com.lou.simhasher.util.DicReader; 27 | 28 | /** 29 | * 关键词抽取类。消除停用词,并对词语进行tfidf权重计算 30 | * 31 | * @author louxuezheng@hotmail.com 32 | */ 33 | public final class KeywordExtractor { 34 | private static final Logger logger = LoggerFactory.getLogger(KeywordExtractor.class); 35 | private Map idfMap = new HashMap(); 36 | private Set stopWords = new HashSet(); 37 | private double idfAverage; 38 | private static KeywordExtractor instance = new KeywordExtractor(); 39 | 40 | public static KeywordExtractor getInstance() { 41 | return instance; 42 | } 43 | 44 | /** 45 | * 构造函数 46 | * 47 | */ 48 | private KeywordExtractor() { 49 | String stopwordPath = "dict/stop_words.utf8"; 50 | String idfPath = "dict/idf.utf8"; 51 | loadIdfDict(idfPath); 52 | loadStopWordDict(stopwordPath); 53 | } 54 | 55 | /** 56 | * 抽取词,消除停用词,并对词语进行tfidf权重计算 57 | * 58 | * @param str str 59 | * @return 60 | */ 61 | public Map extract(String str) { 62 | List words = WordsSegment.getCutWords(str); 63 | 64 | // 计算词频tf 65 | Map wordmap = new HashMap(); 66 | for (String word : words) { 67 | if (!wordmap.containsKey(word)) { 68 | wordmap.put(word, 1.0); 69 | }else{ 70 | wordmap.put(word, wordmap.get(word) + 1); 71 | } 72 | } 73 | 74 | 75 | // 删除停用词并计算权重 76 | Iterator> it = wordmap.entrySet().iterator(); 77 | while (it.hasNext()) { 78 | Entry item = (Entry) it.next(); 79 | String word = item.getKey(); 80 | if (stopWords.contains(word)) { 81 | it.remove(); 82 | continue; 83 | } 84 | 85 | // 计算权重tdf 86 | if (idfMap.containsKey(word)) { 87 | double idf = 
wordmap.get(word) * idfMap.get(word); 88 | wordmap.put(word, idf); 89 | } else { 90 | double idf = wordmap.get(word) * idfAverage; 91 | wordmap.put(word, idf); 92 | } 93 | } 94 | 95 | for(String key:wordmap.keySet()){ 96 | System.out.println(key+" : "+wordmap.get(key)); 97 | } 98 | 99 | return wordmap; 100 | } 101 | 102 | /** 103 | * 加载idf语料词典 104 | * 105 | * @param idfPath 106 | */ 107 | private void loadIdfDict(String idfPath) { 108 | BufferedReader bf = DicReader.getReader(idfPath); 109 | double idf = 0.0; 110 | double idfSum = 0.0; 111 | int lineno = 0; 112 | String[] arrStrings = null; 113 | String line = null; 114 | try { 115 | while ((line = bf.readLine()) != null) { 116 | if (line.isEmpty()) { 117 | continue; 118 | } 119 | arrStrings = line.split(" "); 120 | if (arrStrings.length != 2) { 121 | continue; 122 | } 123 | idf = Double.valueOf(arrStrings[1]); 124 | idfMap.put(arrStrings[0], idf); 125 | idfSum += idf; 126 | lineno++; 127 | } 128 | } catch (NumberFormatException e) { 129 | logger.error("数据格式错误:" + e.getMessage()); 130 | } catch (IOException e) { 131 | logger.error("IO错误:" + e.getMessage()); 132 | } 133 | 134 | // assert (lineno > 0); 135 | idfAverage = idfSum / lineno; 136 | // assert (idfAverage > 0.0); 137 | } 138 | 139 | /** 140 | * 加载停用词 141 | * 142 | * @param filePath 143 | */ 144 | private void loadStopWordDict(String filePath) { 145 | BufferedReader bf = DicReader.getReader(filePath); 146 | String line = null; 147 | try { 148 | while ((line = bf.readLine()) != null) { 149 | stopWords.add(line); 150 | } 151 | } catch (IOException e) { 152 | logger.error("IO错误:" + e.getMessage()); 153 | } 154 | } 155 | 156 | } 157 | -------------------------------------------------------------------------------- /src/main/resources/dict/stop_words.utf8: -------------------------------------------------------------------------------- 1 | " 2 | . 3 | 。 4 | , 5 | 、 6 | ! 7 | ? 
8 | : 9 | ; 10 | ` 11 | ﹑ 12 | • 13 | " 14 | ^ 15 | … 16 | ‘ 17 | ’ 18 | “ 19 | ” 20 | 〝 21 | 〞 22 | ~ 23 | \ 24 | ∕ 25 | | 26 | ¦ 27 | ‖ 28 | —  29 | ( 30 | ) 31 | 〈 32 | 〉 33 | ﹞ 34 | ﹝ 35 | 「 36 | 」 37 | ‹ 38 | › 39 | 〖 40 | 〗 41 | 】 42 | 【 43 | » 44 | « 45 | 』 46 | 『 47 | 〕 48 | 〔 49 | 》 50 | 《 51 | } 52 | { 53 | ] 54 | [ 55 | ﹐ 56 | ¸ 57 | ﹕ 58 | ︰ 59 | ﹔ 60 | ; 61 | ! 62 | ¡ 63 | ? 64 | ¿ 65 | ﹖ 66 | ﹌ 67 | ﹏ 68 | ﹋ 69 | ' 70 | ´ 71 | ˊ 72 | ˋ 73 | - 74 | ― 75 | ﹫ 76 | @ 77 | ︳ 78 | ︴ 79 | _ 80 | ¯ 81 | _ 82 |  ̄ 83 | ﹢ 84 | + 85 | ﹦ 86 | = 87 | ﹤ 88 | ‐ 89 | < 90 | ­ 91 | ˜ 92 | ~ 93 | ﹟ 94 | # 95 | ﹩ 96 | $ 97 | ﹠ 98 | & 99 | ﹪ 100 | % 101 | ﹡ 102 | * 103 | ﹨ 104 | \ 105 | ﹍ 106 | ﹉ 107 | ﹎ 108 | ﹊ 109 | ˇ 110 | ︵ 111 | ︶ 112 | ︷ 113 | ︸ 114 | ︹ 115 | ︿ 116 | ﹀ 117 | ︺ 118 | ︽ 119 | ︾ 120 | _ 121 | ˉ 122 | ﹁ 123 | ﹂ 124 | ﹃ 125 | ﹄ 126 | ︻ 127 | ︼ 128 | 的 129 | 了 130 | the 131 | a 132 | an 133 | that 134 | those 135 | this 136 | that 137 | $ 138 | 0 139 | 1 140 | 2 141 | 3 142 | 4 143 | 5 144 | 6 145 | 7 146 | 8 147 | 9 148 | ? 
149 | _ 150 | “ 151 | ” 152 | 、 153 | 。 154 | 《 155 | 》 156 | 一 157 | 一些 158 | 一何 159 | 一切 160 | 一则 161 | 一方面 162 | 一旦 163 | 一来 164 | 一样 165 | 一般 166 | 一转眼 167 | 万一 168 | 上 169 | 上下 170 | 下 171 | 不 172 | 不仅 173 | 不但 174 | 不光 175 | 不单 176 | 不只 177 | 不外乎 178 | 不如 179 | 不妨 180 | 不尽 181 | 不尽然 182 | 不得 183 | 不怕 184 | 不惟 185 | 不成 186 | 不拘 187 | 不料 188 | 不是 189 | 不比 190 | 不然 191 | 不特 192 | 不独 193 | 不管 194 | 不至于 195 | 不若 196 | 不论 197 | 不过 198 | 不问 199 | 与 200 | 与其 201 | 与其说 202 | 与否 203 | 与此同时 204 | 且 205 | 且不说 206 | 且说 207 | 两者 208 | 个 209 | 个别 210 | 临 211 | 为 212 | 为了 213 | 为什么 214 | 为何 215 | 为止 216 | 为此 217 | 为着 218 | 乃 219 | 乃至 220 | 乃至于 221 | 么 222 | 之 223 | 之一 224 | 之所以 225 | 之类 226 | 乌乎 227 | 乎 228 | 乘 229 | 也 230 | 也好 231 | 也罢 232 | 了 233 | 二来 234 | 于 235 | 于是 236 | 于是乎 237 | 云云 238 | 云尔 239 | 些 240 | 亦 241 | 人 242 | 人们 243 | 人家 244 | 什么 245 | 什么样 246 | 今 247 | 介于 248 | 仍 249 | 仍旧 250 | 从 251 | 从此 252 | 从而 253 | 他 254 | 他人 255 | 他们 256 | 以 257 | 以上 258 | 以为 259 | 以便 260 | 以免 261 | 以及 262 | 以故 263 | 以期 264 | 以来 265 | 以至 266 | 以至于 267 | 以致 268 | 们 269 | 任 270 | 任何 271 | 任凭 272 | 似的 273 | 但 274 | 但凡 275 | 但是 276 | 何 277 | 何以 278 | 何况 279 | 何处 280 | 何时 281 | 余外 282 | 作为 283 | 你 284 | 你们 285 | 使 286 | 使得 287 | 例如 288 | 依 289 | 依据 290 | 依照 291 | 便于 292 | 俺 293 | 俺们 294 | 倘 295 | 倘使 296 | 倘或 297 | 倘然 298 | 倘若 299 | 借 300 | 假使 301 | 假如 302 | 假若 303 | 傥然 304 | 像 305 | 儿 306 | 先不先 307 | 光是 308 | 全体 309 | 全部 310 | 兮 311 | 关于 312 | 其 313 | 其一 314 | 其中 315 | 其二 316 | 其他 317 | 其余 318 | 其它 319 | 其次 320 | 具体地说 321 | 具体说来 322 | 兼之 323 | 内 324 | 再 325 | 再其次 326 | 再则 327 | 再有 328 | 再者 329 | 再者说 330 | 再说 331 | 冒 332 | 冲 333 | 况且 334 | 几 335 | 几时 336 | 凡 337 | 凡是 338 | 凭 339 | 凭借 340 | 出于 341 | 出来 342 | 分别 343 | 则 344 | 则甚 345 | 别 346 | 别人 347 | 别处 348 | 别是 349 | 别的 350 | 别管 351 | 别说 352 | 到 353 | 前后 354 | 前此 355 | 前者 356 | 加之 357 | 加以 358 | 即 359 | 即令 360 | 即使 361 | 即便 362 | 即如 363 | 即或 364 | 即若 365 | 却 366 | 去 367 | 又 368 | 又及 369 | 及 370 | 及其 371 | 及至 372 | 反之 373 | 反而 374 | 反过来 375 
| 反过来说 376 | 受到 377 | 另 378 | 另一方面 379 | 另外 380 | 另悉 381 | 只 382 | 只当 383 | 只怕 384 | 只是 385 | 只有 386 | 只消 387 | 只要 388 | 只限 389 | 叫 390 | 叮咚 391 | 可 392 | 可以 393 | 可是 394 | 可见 395 | 各 396 | 各个 397 | 各位 398 | 各种 399 | 各自 400 | 同 401 | 同时 402 | 后 403 | 后者 404 | 向 405 | 向使 406 | 向着 407 | 吓 408 | 吗 409 | 否则 410 | 吧 411 | 吧哒 412 | 吱 413 | 呀 414 | 呃 415 | 呕 416 | 呗 417 | 呜 418 | 呜呼 419 | 呢 420 | 呵 421 | 呵呵 422 | 呸 423 | 呼哧 424 | 咋 425 | 和 426 | 咚 427 | 咦 428 | 咧 429 | 咱 430 | 咱们 431 | 咳 432 | 哇 433 | 哈 434 | 哈哈 435 | 哉 436 | 哎 437 | 哎呀 438 | 哎哟 439 | 哗 440 | 哟 441 | 哦 442 | 哩 443 | 哪 444 | 哪个 445 | 哪些 446 | 哪儿 447 | 哪天 448 | 哪年 449 | 哪怕 450 | 哪样 451 | 哪边 452 | 哪里 453 | 哼 454 | 哼唷 455 | 唉 456 | 唯有 457 | 啊 458 | 啐 459 | 啥 460 | 啦 461 | 啪达 462 | 啷当 463 | 喂 464 | 喏 465 | 喔唷 466 | 喽 467 | 嗡 468 | 嗡嗡 469 | 嗬 470 | 嗯 471 | 嗳 472 | 嘎 473 | 嘎登 474 | 嘘 475 | 嘛 476 | 嘻 477 | 嘿 478 | 嘿嘿 479 | 因 480 | 因为 481 | 因了 482 | 因此 483 | 因着 484 | 因而 485 | 固然 486 | 在 487 | 在下 488 | 在于 489 | 地 490 | 基于 491 | 处在 492 | 多 493 | 多么 494 | 多少 495 | 大 496 | 大家 497 | 她 498 | 她们 499 | 好 500 | 如 501 | 如上 502 | 如上所述 503 | 如下 504 | 如何 505 | 如其 506 | 如同 507 | 如是 508 | 如果 509 | 如此 510 | 如若 511 | 始而 512 | 孰料 513 | 孰知 514 | 宁 515 | 宁可 516 | 宁愿 517 | 宁肯 518 | 它 519 | 它们 520 | 对 521 | 对于 522 | 对待 523 | 对方 524 | 对比 525 | 将 526 | 小 527 | 尔 528 | 尔后 529 | 尔尔 530 | 尚且 531 | 就 532 | 就是 533 | 就是了 534 | 就是说 535 | 就算 536 | 就要 537 | 尽 538 | 尽管 539 | 尽管如此 540 | 岂但 541 | 己 542 | 已 543 | 已矣 544 | 巴 545 | 巴巴 546 | 并 547 | 并且 548 | 并非 549 | 庶乎 550 | 庶几 551 | 开外 552 | 开始 553 | 归 554 | 归齐 555 | 当 556 | 当地 557 | 当然 558 | 当着 559 | 彼 560 | 彼时 561 | 彼此 562 | 往 563 | 待 564 | 很 565 | 得 566 | 得了 567 | 怎 568 | 怎么 569 | 怎么办 570 | 怎么样 571 | 怎奈 572 | 怎样 573 | 总之 574 | 总的来看 575 | 总的来说 576 | 总的说来 577 | 总而言之 578 | 恰恰相反 579 | 您 580 | 惟其 581 | 慢说 582 | 我 583 | 我们 584 | 或 585 | 或则 586 | 或是 587 | 或曰 588 | 或者 589 | 截至 590 | 所 591 | 所以 592 | 所在 593 | 所幸 594 | 所有 595 | 才 596 | 才能 597 | 打 598 | 打从 599 | 把 600 | 抑或 601 | 拿 602 | 按 603 | 按照 604 | 换句话说 
605 | 换言之 606 | 据 607 | 据此 608 | 接着 609 | 故 610 | 故此 611 | 故而 612 | 旁人 613 | 无 614 | 无宁 615 | 无论 616 | 既 617 | 既往 618 | 既是 619 | 既然 620 | 时候 621 | 是 622 | 是以 623 | 是的 624 | 曾 625 | 替 626 | 替代 627 | 最 628 | 有 629 | 有些 630 | 有关 631 | 有及 632 | 有时 633 | 有的 634 | 望 635 | 朝 636 | 朝着 637 | 本 638 | 本人 639 | 本地 640 | 本着 641 | 本身 642 | 来 643 | 来着 644 | 来自 645 | 来说 646 | 极了 647 | 果然 648 | 果真 649 | 某 650 | 某个 651 | 某些 652 | 某某 653 | 根据 654 | 欤 655 | 正值 656 | 正如 657 | 正巧 658 | 正是 659 | 此 660 | 此地 661 | 此处 662 | 此外 663 | 此时 664 | 此次 665 | 此间 666 | 毋宁 667 | 每 668 | 每当 669 | 比 670 | 比及 671 | 比如 672 | 比方 673 | 没奈何 674 | 沿 675 | 沿着 676 | 漫说 677 | 焉 678 | 然则 679 | 然后 680 | 然而 681 | 照 682 | 照着 683 | 犹且 684 | 犹自 685 | 甚且 686 | 甚么 687 | 甚或 688 | 甚而 689 | 甚至 690 | 甚至于 691 | 用 692 | 用来 693 | 由 694 | 由于 695 | 由是 696 | 由此 697 | 由此可见 698 | 的 699 | 的确 700 | 的话 701 | 直到 702 | 相对而言 703 | 省得 704 | 看 705 | 眨眼 706 | 着 707 | 着呢 708 | 矣 709 | 矣乎 710 | 矣哉 711 | 离 712 | 竟而 713 | 第 714 | 等 715 | 等到 716 | 等等 717 | 简言之 718 | 管 719 | 类如 720 | 紧接着 721 | 纵 722 | 纵令 723 | 纵使 724 | 纵然 725 | 经 726 | 经过 727 | 结果 728 | 给 729 | 继之 730 | 继后 731 | 继而 732 | 综上所述 733 | 罢了 734 | 者 735 | 而 736 | 而且 737 | 而况 738 | 而后 739 | 而外 740 | 而已 741 | 而是 742 | 而言 743 | 能 744 | 能否 745 | 腾 746 | 自 747 | 自个儿 748 | 自从 749 | 自各儿 750 | 自后 751 | 自家 752 | 自己 753 | 自打 754 | 自身 755 | 至 756 | 至于 757 | 至今 758 | 至若 759 | 致 760 | 般的 761 | 若 762 | 若夫 763 | 若是 764 | 若果 765 | 若非 766 | 莫不然 767 | 莫如 768 | 莫若 769 | 虽 770 | 虽则 771 | 虽然 772 | 虽说 773 | 被 774 | 要 775 | 要不 776 | 要不是 777 | 要不然 778 | 要么 779 | 要是 780 | 譬喻 781 | 譬如 782 | 让 783 | 许多 784 | 论 785 | 设使 786 | 设或 787 | 设若 788 | 诚如 789 | 诚然 790 | 该 791 | 说来 792 | 诸 793 | 诸位 794 | 诸如 795 | 谁 796 | 谁人 797 | 谁料 798 | 谁知 799 | 贼死 800 | 赖以 801 | 赶 802 | 起 803 | 起见 804 | 趁 805 | 趁着 806 | 越是 807 | 距 808 | 跟 809 | 较 810 | 较之 811 | 边 812 | 过 813 | 还 814 | 还是 815 | 还有 816 | 还要 817 | 这 818 | 这一来 819 | 这个 820 | 这么 821 | 这么些 822 | 这么样 823 | 这么点儿 824 | 这些 825 | 这会儿 826 | 这儿 827 | 这就是说 828 | 这时 829 | 这样 830 | 这次 
831 | 这般 832 | 这边 833 | 这里 834 | 进而 835 | 连 836 | 连同 837 | 逐步 838 | 通过 839 | 遵循 840 | 遵照 841 | 那 842 | 那个 843 | 那么 844 | 那么些 845 | 那么样 846 | 那些 847 | 那会儿 848 | 那儿 849 | 那时 850 | 那样 851 | 那般 852 | 那边 853 | 那里 854 | 都 855 | 鄙人 856 | 鉴于 857 | 针对 858 | 阿 859 | 除 860 | 除了 861 | 除外 862 | 除开 863 | 除此之外 864 | 除非 865 | 随 866 | 随后 867 | 随时 868 | 随着 869 | 难道说 870 | 非但 871 | 非徒 872 | 非特 873 | 非独 874 | 靠 875 | 顺 876 | 顺着 877 | 首先 878 | ! 879 | , 880 | : 881 | ; 882 | ? 883 | to 884 | can 885 | could 886 | dare 887 | do 888 | did 889 | does 890 | may 891 | might 892 | would 893 | should 894 | must 895 | will 896 | ought 897 | shall 898 | need 899 | is 900 | a 901 | am 902 | are 903 | about 904 | according 905 | after 906 | against 907 | all 908 | almost 909 | also 910 | although 911 | among 912 | an 913 | and 914 | another 915 | any 916 | anything 917 | approximately 918 | as 919 | asked 920 | at 921 | back 922 | because 923 | before 924 | besides 925 | between 926 | both 927 | but 928 | by 929 | call 930 | called 931 | currently 932 | despite 933 | did 934 | do 935 | dr 936 | during 937 | each 938 | earlier 939 | eight 940 | even 941 | eventually 942 | every 943 | everything 944 | five 945 | for 946 | four 947 | from 948 | he 949 | her 950 | here 951 | his 952 | how 953 | however 954 | i 955 | if 956 | in 957 | indeed 958 | instead 959 | it 960 | its 961 | just 962 | last 963 | like 964 | major 965 | many 966 | may 967 | maybe 968 | meanwhile 969 | more 970 | moreover 971 | most 972 | mr 973 | mrs 974 | ms 975 | much 976 | my 977 | neither 978 | net 979 | never 980 | nevertheless 981 | nine 982 | no 983 | none 984 | not 985 | nothing 986 | now 987 | of 988 | on 989 | once 990 | one 991 | only 992 | or 993 | other 994 | our 995 | over 996 | partly 997 | perhaps 998 | prior 999 | regarding 1000 | separately 1001 | seven 1002 | several 1003 | she 1004 | should 1005 | similarly 1006 | since 1007 | six 1008 | so 1009 | some 1010 | somehow 1011 | still 1012 | such 1013 | ten 1014 | 
that 1015 | the 1016 | their 1017 | then 1018 | there 1019 | therefore 1020 | these 1021 | they 1022 | this 1023 | those 1024 | though 1025 | three 1026 | to 1027 | two 1028 | under 1029 | unless 1030 | unlike 1031 | until 1032 | volume 1033 | we 1034 | what 1035 | whatever 1036 | whats 1037 | when 1038 | where 1039 | which 1040 | while 1041 | why 1042 | with 1043 | without 1044 | yesterday 1045 | yet 1046 | you 1047 | your 1048 | aboard 1049 | about 1050 | above 1051 | according to 1052 | across 1053 | afore 1054 | after 1055 | against 1056 | agin 1057 | along 1058 | alongside 1059 | amid 1060 | amidst 1061 | among 1062 | amongst 1063 | anent 1064 | around 1065 | as 1066 | aslant 1067 | astride 1068 | at 1069 | athwart 1070 | bar 1071 | because of 1072 | before 1073 | behind 1074 | below 1075 | beneath 1076 | beside 1077 | besides 1078 | between 1079 | betwixt 1080 | beyond 1081 | but 1082 | by 1083 | circa 1084 | despite 1085 | down 1086 | during 1087 | due to 1088 | ere 1089 | except 1090 | for 1091 | from 1092 | in 1093 | inside 1094 | into 1095 | less 1096 | like 1097 | mid 1098 | midst 1099 | minus 1100 | near 1101 | next 1102 | nigh 1103 | nigher 1104 | nighest 1105 | notwithstanding 1106 | of 1107 | off 1108 | on 1109 | on to 1110 | onto 1111 | out 1112 | out of 1113 | outside 1114 | over 1115 | past 1116 | pending 1117 | per 1118 | plus 1119 | qua 1120 | re 1121 | round 1122 | sans 1123 | save 1124 | since 1125 | through 1126 | throughout 1127 | thru 1128 | till 1129 | to 1130 | toward 1131 | towards 1132 | under 1133 | underneath 1134 | unlike 1135 | until 1136 | unto 1137 | up 1138 | upon 1139 | versus 1140 | via 1141 | vice 1142 | with 1143 | within 1144 | without 1145 | he 1146 | her 1147 | herself 1148 | hers 1149 | him 1150 | himself 1151 | his 1152 | I 1153 | it 1154 | its 1155 | itself 1156 | me 1157 | mine 1158 | my 1159 | myself 1160 | ours 1161 | she 1162 | their 1163 | theirs 1164 | them 1165 | themselves 1166 | they 1167 | us 1168 | we 1169 | 
our 1170 | ourselves 1171 | you 1172 | your 1173 | yours 1174 | yourselves 1175 | yourself 1176 | this 1177 | that 1178 | these 1179 | those 1180 | " 1181 | ' 1182 | '' 1183 | ( 1184 | ) 1185 | *LRB* 1186 | *RRB* 1187 | 1188 | 1189 | 1190 | 1191 | 1192 | @ 1193 | & 1194 | [ 1195 | ] 1196 | ` 1197 | `` 1198 | e.g., 1199 | { 1200 | } 1201 | " 1202 | “ 1203 | ” 1204 | -RRB- 1205 | -LRB- 1206 | -- 1207 | a 1208 | about 1209 | above 1210 | across 1211 | after 1212 | afterwards 1213 | again 1214 | against 1215 | all 1216 | almost 1217 | alone 1218 | along 1219 | already 1220 | also 1221 | although 1222 | always 1223 | am 1224 | among 1225 | amongst 1226 | amoungst 1227 | amount 1228 | an 1229 | and 1230 | another 1231 | any 1232 | anyhow 1233 | anyone 1234 | anything 1235 | anyway 1236 | anywhere 1237 | are 1238 | around 1239 | as 1240 | at 1241 | back 1242 | be 1243 | became 1244 | because 1245 | become 1246 | becomes 1247 | becoming 1248 | been 1249 | before 1250 | beforehand 1251 | behind 1252 | being 1253 | below 1254 | beside 1255 | besides 1256 | between 1257 | beyond 1258 | bill 1259 | both 1260 | bottom 1261 | but 1262 | by 1263 | call 1264 | can 1265 | cannot 1266 | cant 1267 | co 1268 | computer 1269 | con 1270 | could 1271 | couldnt 1272 | cry 1273 | de 1274 | describe 1275 | detail 1276 | do 1277 | done 1278 | down 1279 | due 1280 | during 1281 | each 1282 | eg 1283 | eight 1284 | either 1285 | eleven 1286 | else 1287 | elsewhere 1288 | empty 1289 | enough 1290 | etc 1291 | even 1292 | ever 1293 | every 1294 | everyone 1295 | everything 1296 | everywhere 1297 | except 1298 | few 1299 | fifteen 1300 | fify 1301 | fill 1302 | find 1303 | fire 1304 | first 1305 | five 1306 | for 1307 | former 1308 | formerly 1309 | forty 1310 | found 1311 | four 1312 | from 1313 | front 1314 | full 1315 | further 1316 | get 1317 | give 1318 | go 1319 | had 1320 | has 1321 | hasnt 1322 | have 1323 | he 1324 | hence 1325 | her 1326 | here 1327 | hereafter 1328 | hereby 1329 | 
herein 1330 | hereupon 1331 | hers 1332 | herself 1333 | him 1334 | himself 1335 | his 1336 | how 1337 | however 1338 | hundred 1339 | i 1340 | ie 1341 | if 1342 | in 1343 | inc 1344 | indeed 1345 | interest 1346 | into 1347 | is 1348 | it 1349 | its 1350 | itself 1351 | keep 1352 | last 1353 | latter 1354 | latterly 1355 | least 1356 | less 1357 | ltd 1358 | made 1359 | many 1360 | may 1361 | me 1362 | meanwhile 1363 | might 1364 | mill 1365 | mine 1366 | more 1367 | moreover 1368 | most 1369 | mostly 1370 | move 1371 | much 1372 | must 1373 | my 1374 | myself 1375 | name 1376 | namely 1377 | neither 1378 | never 1379 | nevertheless 1380 | next 1381 | nine 1382 | no 1383 | nobody 1384 | none 1385 | noone 1386 | nor 1387 | not 1388 | nothing 1389 | now 1390 | nowhere 1391 | of 1392 | off 1393 | often 1394 | on 1395 | once 1396 | one 1397 | only 1398 | onto 1399 | or 1400 | other 1401 | others 1402 | otherwise 1403 | our 1404 | ours 1405 | ourselves 1406 | out 1407 | over 1408 | own 1409 | p 1410 | part 1411 | per 1412 | perhaps 1413 | please 1414 | put 1415 | rather 1416 | re 1417 | same 1418 | see 1419 | seem 1420 | seemed 1421 | seeming 1422 | seems 1423 | serious 1424 | several 1425 | she 1426 | should 1427 | show 1428 | side 1429 | since 1430 | sincere 1431 | six 1432 | sixty 1433 | so 1434 | some 1435 | somehow 1436 | someone 1437 | something 1438 | sometime 1439 | sometimes 1440 | somewhere 1441 | still 1442 | such 1443 | system 1444 | take 1445 | ten 1446 | than 1447 | that 1448 | the 1449 | their 1450 | them 1451 | themselves 1452 | then 1453 | thence 1454 | there 1455 | thereafter 1456 | thereby 1457 | therefore 1458 | therein 1459 | thereupon 1460 | these 1461 | they 1462 | thick 1463 | thin 1464 | third 1465 | this 1466 | those 1467 | though 1468 | three 1469 | through 1470 | throughout 1471 | thru 1472 | thus 1473 | to 1474 | together 1475 | too 1476 | top 1477 | toward 1478 | towards 1479 | twelve 1480 | twenty 1481 | two 1482 | un 1483 | under 1484 | 
until 1485 | up 1486 | upon 1487 | us 1488 | very 1489 | via 1490 | was 1491 | we 1492 | well 1493 | were 1494 | what 1495 | whatever 1496 | when 1497 | whence 1498 | whenever 1499 | where 1500 | whereafter 1501 | whereas 1502 | whereby 1503 | wherein 1504 | whereupon 1505 | wherever 1506 | whether 1507 | which 1508 | while 1509 | whither 1510 | who 1511 | whoever 1512 | whole 1513 | whom 1514 | whose 1515 | why 1516 | will 1517 | with 1518 | within 1519 | without 1520 | would 1521 | yet 1522 | you 1523 | your 1524 | yours 1525 | yourself 1526 | yourselves 1527 | 1528 | 1529 | : 1530 | / 1531 | ( 1532 | > 1533 | ) 1534 | < 1535 | ! 1536 | --------------------------------------------------------------------------------